+
+
+
\ No newline at end of file
diff --git a/docs/data/0.json b/docs/data/0.json
new file mode 100644
index 0000000..9a117a0
--- /dev/null
+++ b/docs/data/0.json
@@ -0,0 +1,542 @@
+{
+ "0": {
+ "file_id": 0,
+ "content": "/README.md",
+ "type": "filepath"
+ },
+ "1": {
+ "file_id": 0,
+ "content": "This code repository contains tools for generating and analyzing neuron explanations in language models, including public datasets in JSON format and data sources for related neurons and tokens. It also notes the additional GPT-2 Small datasets and discloses a bug in the GELU implementation that was used for inference.",
+ "type": "summary"
+ },
+ "2": {
+ "file_id": 0,
+ "content": "# Automated interpretability\n## Code and tools\nThis repository contains code and tools associated with the [Language models can explain neurons in\nlanguage models](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html) paper, specifically:\n* Code for automatically generating, simulating, and scoring explanations of neuron behavior using\nthe methodology described in the paper. See the\n[neuron-explainer README](neuron-explainer/README.md) for more information.\nNote: if you run into errors of the form \"Error: Could not find any credentials that grant access to storage account: 'openaipublic' and container: 'neuron-explainer'\".\" you might be able to fix this by signing up for an azure account and specifying the credentials as described in the error message. \n* A tool for viewing neuron activations and explanations, accessible\n[here](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). See\nthe [neuron-viewer README](neuron-viewer/README.md) for more information.",
+ "type": "code",
+ "location": "/README.md:1-16"
+ },
+ "3": {
+ "file_id": 0,
+ "content": "This repository contains code and tools for the Language models can explain neurons in language models paper. It includes a tool for generating, simulating, and scoring explanations of neuron behavior using the methodology described in the paper. Additionally, there's a tool for viewing neuron activations and explanations accessible online.",
+ "type": "comment"
+ },
+ "4": {
+ "file_id": 0,
+ "content": "## Public datasets\nTogether with this code, we're also releasing public datasets of GPT-2 XL neurons and explanations.\nHere's an overview of those datasets. \n* Neuron activations: `az://openaipublic/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json`\n - Tokenized text sequences and their activations for the neuron. We\n provide multiple sets of tokens and activations: top-activating ones, random\n samples from several quantiles; and a completely random sample. We also provide\n some basic statistics for the activations.\n - Each file contains a JSON-formatted\n [`NeuronRecord`](neuron-explainer/neuron_explainer/activations/activations.py#L89) dataclass.\n* Neuron explanations: `az://openaipublic/neuron-explainer/data/explanations/{layer_index}/{neuron_index}.jsonl`\n - Scored model-generated explanations of the behavior of the neuron, including simulation results.\n - Each file contains a JSON-formatted\n [`NeuronSimulationResults`](neuron-explainer/neuron_explainer/explanations/explanations.py#L146)",
+ "type": "code",
+ "location": "/README.md:18-33"
+ },
+ "5": {
+ "file_id": 0,
+ "content": "This code provides the location and overview of public datasets for GPT-2 XL neurons and explanations. The datasets include neuron activations and explanations in JSON format, with different sets of tokens and activations provided.",
+ "type": "comment"
+ },
+ "6": {
+ "file_id": 0,
+ "content": " dataclass.\n* Related neurons: `az://openaipublic/neuron-explainer/data/related-neurons/weight-based/{layer_index}/{neuron_index}.json`\n - Lists of the upstream and downstream neurons with the most positive and negative connections (see below for definition).\n - Each file contains a JSON-formatted dataclass whose definition is not included in this repo.\n* Tokens with high average activations:\n`az://openaipublic/neuron-explainer/data/related-tokens/activation-based/{layer_index}/{neuron_index}.json`\n - Lists of tokens with the highest average activations for individual neurons, and their average activations.\n - Each file contains a JSON-formatted [`TokenLookupTableSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L36)\n dataclass.\n* Tokens with large inbound and outbound weights:\n`az://openaipublic/neuron-explainer/data/related-tokens/weight-based/{layer_index}/{neuron_index}.json`\n - List of the most-positive and most-negative input and output tokens for individual neurons,",
+ "type": "code",
+ "location": "/README.md:34-45"
+ },
+ "7": {
+ "file_id": 0,
+ "content": "This section describes data sources for related neurons and tokens in a model, stored in Azure Blob Storage. They cover upstream and downstream neurons with the most positive and negative connections, tokens with high average activations, and tokens with large inbound and outbound weights. Each file contains a JSON-formatted dataclass; only the definition of the related-neurons dataclass is not included in this repository.",
+ "type": "comment"
+ },
+ "8": {
+ "file_id": 0,
+ "content": " as well as the associated weight (see below for definition). \n - Each file contains a JSON-formatted [`WeightBasedSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L17)\n dataclass.\nUpdate (July 5, 2023):\nWe also released a set of explanations for GPT-2 Small. The methodology is slightly different from the methodology used for GPT-2 XL so the results aren't directly comparable.\n* Neuron activations: `az://openaipublic/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json`\n* Neuron explanations: `az://openaipublic/neuron-explainer/gpt2_small_data/explanations/{layer_index}/{neuron_index}.jsonl`\nUpdate (August 30, 2023): We recently discovered a bug in how we performed inference on the GPT-2 series models used for the paper and for these datasets. Specifically, we used an optimized GELU implementation rather than the original GELU implementation associated with GPT-2. While the model’s behavior is very similar across ",
+ "type": "code",
+ "location": "/README.md:46-55"
+ },
+ "9": {
+ "file_id": 0,
+ "content": "This section describes the availability of neuron activations and explanations for GPT-2 Small in addition to GPT-2 XL. It also reports a bug discovered in how inference was performed: an optimized GELU implementation was used instead of the original GELU implementation associated with GPT-2.",
+ "type": "comment"
+ },
+ "10": {
+ "file_id": 0,
+ "content": "these two configurations, the post-MLP activation values we used to generate and simulate explanations differ from the correct values by the following amounts for GPT-2 small:\n- Median: 0.0090\n- 90th percentile: 0.0252\n- 99th percentile: 0.0839\n- 99.9th percentile: 0.1736\n### Definition of connection weights\nRefer to [GPT-2 model code](https://github.com/openai/gpt-2/blob/master/src/model.py) for\nunderstanding of model weight conventions.\n*Neuron-neuron*: For two neurons `(l1, n1)` and `(l2, n2)` with `l1 < l2`, the connection strength is defined as\n`h{l1}.mlp.c_proj.w[:, n1, :] @ diag(h{l2}.ln_2.g) @ h{l2}.mlp.c_fc.w[:, :, n2]`.\n*Neuron-token*: For token `t` and neuron `(l, n)`, the input weight is computed as\n`wte[t, :] @ diag(h{l}.ln_2.g) @ h{l}.mlp.c_fc.w[:, :, n]`\nand the output weight is computed as\n`h{l}.mlp.c_proj.w[:, n, :] @ diag(ln_f.g) @ wte[t, :]`.\n### Misc Lists of Interesting Neurons\nLists of neurons we thought were interesting according to different criteria, with some preliminary descriptions.",
+ "type": "code",
+ "location": "/README.md:55-76"
+ },
+ "11": {
+ "file_id": 0,
+ "content": "This code is explaining the difference in activation values between two configurations for GPT-2 small. It also provides a link to understand the model weight conventions and defines connection weights between neurons and tokens. Additionally, it mentions lists of interesting neurons with some preliminary descriptions.",
+ "type": "comment"
+ },
+ "12": {
+ "file_id": 0,
+ "content": "* [Interesting Neurons (external)](https://docs.google.com/spreadsheets/d/1p7fYs31NU8sJoeKyUx4Mn2laGx8xXfHg_KcIvYiKPpg/edit#gid=0)\n* [Neurons that score high on random, possibly monosemantic? (external)](https://docs.google.com/spreadsheets/d/1TqKFcz-84jyIHLU7VRoTc8BoFBMpbgac-iNBnxVurQ8/edit?usp=sharing)\n* [Clusters of neurons well explained by activation explanation but not by tokens](https://docs.google.com/document/d/1lWhKowpKDdwTMALD_K541cdwgGoQx8DFUSuEe1U2AGE/edit?usp=sharing)\n* [Neurons sensitive to truncation](https://docs.google.com/document/d/1x89TWBvuHcyC2t01EDbJZJ5LQYHozlcS-VUmr5shf_A/edit?usp=sharing)",
+ "type": "code",
+ "location": "/README.md:77-80"
+ },
+ "13": {
+ "file_id": 0,
+ "content": "These are links to external spreadsheets and documents containing neurons with specific characteristics, such as interesting neurons, high-scoring neurons on random tests, clusters well explained by activation explanation but not by tokens, and neurons sensitive to truncation.",
+ "type": "comment"
+ },
+ "14": {
+ "file_id": 1,
+ "content": "/neuron-explainer/README.md",
+ "type": "filepath"
+ },
+ "15": {
+ "file_id": 1,
+ "content": "This directory contains a version of the code for generating, simulating, and scoring explanations of neuron behavior.",
+ "type": "summary"
+ },
+ "16": {
+ "file_id": 1,
+ "content": "# Neuron explainer\nThis directory contains a version of our code for generating, simulating and scoring explanations of\nneuron behavior.\n# Setup\n```\npip install -e .\n```\n# Usage\nFor example usage, see the `demos` folder:\n* [Generating and scoring activation-based explanations](demos/generate_and_score_explanation.ipynb)\n* [Generating and scoring explanations based on tokens with high average activations](demos/generate_and_score_token_look_up_table_explanation.ipynb)\n* [Generating explanations for human-written neuron puzzles](demos/explain_puzzles.ipynb)",
+ "type": "code",
+ "location": "/neuron-explainer/README.md:1-18"
+ },
+ "17": {
+ "file_id": 1,
+ "content": "This directory contains a version of the code for generating, simulating, and scoring explanations of neuron behavior, with setup instructions and links to demo notebooks.",
+ "type": "comment"
+ },
+ "18": {
+ "file_id": 2,
+ "content": "/neuron-explainer/demos/explain_puzzles.py",
+ "type": "filepath"
+ },
+ "19": {
+ "file_id": 2,
+ "content": "The code imports libraries, sets up the OpenAI API key, initializes the explainer model, and loops through each puzzle to generate explanations. It generates one explanation for a given input, checks if there's only 1 explanation, assigns it to 'model_generated_explanation', and prints both the explanation and expected answer for the puzzle.",
+ "type": "summary"
+ },
+ "20": {
+ "file_id": 2,
+ "content": "#!/usr/bin/env python\n# coding: utf-8\n# In[ ]:\nget_ipython().run_line_magic('load_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\n# In[ ]:\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\nfrom neuron_explainer.activations.activation_records import calculate_max_activation\nfrom neuron_explainer.explanations.explainer import TokenActivationPairExplainer\nfrom neuron_explainer.explanations.prompt_builder import PromptFormat\nfrom neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME\nEXPLAINER_MODEL_NAME = \"gpt-4\"\nexplainer = TokenActivationPairExplainer(\n model_name=EXPLAINER_MODEL_NAME,\n prompt_format=PromptFormat.HARMONY_V4,\n max_concurrent=1,\n)\nfor puzzle_name, puzzle in PUZZLES_BY_NAME.items():\n print(f\"{puzzle_name=}\")\n puzzle_answer = puzzle.explanation\n # Generate an explanation for the puzzle.\n explanations = await explainer.generate_explanations(\n all_activation_records=puzzle.activation_records,\n max_activation=calculate_max_activation(puzzle.activation_records),",
+ "type": "code",
+ "location": "/neuron-explainer/demos/explain_puzzles.py:1-38"
+ },
+ "21": {
+ "file_id": 2,
+ "content": "Code imports necessary libraries, sets up OpenAI API key, initializes the explainer model, and begins looping through each puzzle in PUZZLES_BY_NAME to generate explanations.",
+ "type": "comment"
+ },
+ "22": {
+ "file_id": 2,
+ "content": " num_samples=1,\n )\n assert len(explanations) == 1\n model_generated_explanation = explanations[0]\n print(f\"{model_generated_explanation=}\")\n print(f\"{puzzle_answer=}\\n\")",
+ "type": "code",
+ "location": "/neuron-explainer/demos/explain_puzzles.py:39-44"
+ },
+ "23": {
+ "file_id": 2,
+ "content": "This code generates one explanation for a given input and asserts that the number of explanations is equal to 1. It then assigns the generated explanation to 'model_generated_explanation' and prints it along with the expected answer for the puzzle.",
+ "type": "comment"
+ },
+ "24": {
+ "file_id": 3,
+ "content": "/neuron-explainer/demos/generate_and_score_explanation.py",
+ "type": "filepath"
+ },
+ "25": {
+ "file_id": 3,
+ "content": "The code loads neuron data, generates an explanation using an explainer model, sets up a simulator for that explanation, then simulates and scores it against the validation activation records and prints the preferred score.",
+ "type": "summary"
+ },
+ "26": {
+ "file_id": 3,
+ "content": "#!/usr/bin/env python\n# coding: utf-8\n# In[ ]:\nget_ipython().run_line_magic('load_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\n# In[ ]:\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\nfrom neuron_explainer.activations.activation_records import calculate_max_activation\nfrom neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\nfrom neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\nfrom neuron_explainer.explanations.explainer import TokenActivationPairExplainer\nfrom neuron_explainer.explanations.prompt_builder import PromptFormat\nfrom neuron_explainer.explanations.scoring import simulate_and_score\nfrom neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\nEXPLAINER_MODEL_NAME = \"gpt-4\"\nSIMULATOR_MODEL_NAME = \"text-davinci-003\"\n# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n# Load a neuron record.",
+ "type": "code",
+ "location": "/neuron-explainer/demos/generate_and_score_explanation.py:1-33"
+ },
+ "27": {
+ "file_id": 3,
+ "content": "This code sets the OpenAI API key, imports necessary modules, defines constants for explainer and simulator models, and loads a neuron record.",
+ "type": "comment"
+ },
+ "28": {
+ "file_id": 3,
+ "content": "neuron_record = load_neuron(9, 6236)\n# Grab the activation records we'll need.\nslice_params = ActivationRecordSliceParams(n_examples_per_split=5)\ntrain_activation_records = neuron_record.train_activation_records(\n activation_record_slice_params=slice_params\n)\nvalid_activation_records = neuron_record.valid_activation_records(\n activation_record_slice_params=slice_params\n)\n# Generate an explanation for the neuron.\nexplainer = TokenActivationPairExplainer(\n model_name=EXPLAINER_MODEL_NAME,\n prompt_format=PromptFormat.HARMONY_V4,\n max_concurrent=1,\n)\nexplanations = await explainer.generate_explanations(\n all_activation_records=train_activation_records,\n max_activation=calculate_max_activation(train_activation_records),\n num_samples=1,\n)\nassert len(explanations) == 1\nexplanation = explanations[0]\nprint(f\"{explanation=}\")\n# Simulate and score the explanation.\nsimulator = UncalibratedNeuronSimulator(\n ExplanationNeuronSimulator(\n SIMULATOR_MODEL_NAME,\n explanation,\n max_concurrent=1,",
+ "type": "code",
+ "location": "/neuron-explainer/demos/generate_and_score_explanation.py:34-65"
+ },
+ "29": {
+ "file_id": 3,
+ "content": "Loading the neuron record for layer 9, neuron 6236.\nCreating the activation record slice parameters and loading the train and validation activation records.\nGenerating an explanation for the selected neuron using a specified explainer model.\nRetrieving the generated explanation and storing it in the variable \"explanation\".\nSetting up a simulator that will be used to simulate and score the explanation.",
+ "type": "comment"
+ },
+ "30": {
+ "file_id": 3,
+ "content": " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n )\n)\nscored_simulation = await simulate_and_score(simulator, valid_activation_records)\nprint(f\"score={scored_simulation.get_preferred_score():.2f}\")",
+ "type": "code",
+ "location": "/neuron-explainer/demos/generate_and_score_explanation.py:66-70"
+ },
+ "31": {
+ "file_id": 3,
+ "content": "Performs simulation with given activation records and prints preferred score.",
+ "type": "comment"
+ },
+ "32": {
+ "file_id": 4,
+ "content": "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py",
+ "type": "filepath"
+ },
+ "33": {
+ "file_id": 4,
+ "content": "The code prepares the environment, imports modules, and configures API keys for an explanation model. It loads data, generates explanations, and simulates them using a specific format. The preferred score is then printed with two decimal places.",
+ "type": "summary"
+ },
+ "34": {
+ "file_id": 4,
+ "content": "#!/usr/bin/env python\n# coding: utf-8\n# In[ ]:\nget_ipython().run_line_magic('load_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\n# In[ ]:\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\nfrom neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\nfrom neuron_explainer.activations.token_connections import load_token_lookup_table_connections_of_neuron\nfrom neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\nfrom neuron_explainer.explanations.explainer import TokenSpaceRepresentationExplainer\nfrom neuron_explainer.explanations.prompt_builder import PromptFormat\nfrom neuron_explainer.explanations.scoring import simulate_and_score\nfrom neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\nEXPLAINER_MODEL_NAME = \"gpt-4\"\nSIMULATOR_MODEL_NAME = \"text-davinci-003\"\n# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n# print(\"Response:\", test_response[\"choices\"][0][\"text\"])",
+ "type": "code",
+ "location": "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py:1-31"
+ },
+ "35": {
+ "file_id": 4,
+ "content": "This code is setting up the environment and importing necessary modules for running an explanation model and simulator. It also sets the OpenAI API key, explanation model name, and simulator model name.",
+ "type": "comment"
+ },
+ "36": {
+ "file_id": 4,
+ "content": "layer_index = 9\nneuron_index = 6236\n# Load a token lookup table.\ntoken_lookup_table = load_token_lookup_table_connections_of_neuron(layer_index, neuron_index)\n# Load a neuron record.\nneuron_record = load_neuron(layer_index, neuron_index)\n# Grab the activation records we'll need.\nslice_params = ActivationRecordSliceParams(n_examples_per_split=5)\nvalid_activation_records = neuron_record.valid_activation_records(\n activation_record_slice_params=slice_params\n)\n# Generate an explanation for the neuron.\nexplainer = TokenSpaceRepresentationExplainer(\n model_name=EXPLAINER_MODEL_NAME,\n prompt_format=PromptFormat.HARMONY_V4,\n max_concurrent=1,\n)\nexplanations = await explainer.generate_explanations(\n tokens=token_lookup_table.tokens,\n num_samples=1,\n)\nassert len(explanations) == 1\nexplanation = explanations[0]\nprint(f\"{explanation=}\")\n# Simulate and score the explanation.\nsimulator = UncalibratedNeuronSimulator(\n ExplanationNeuronSimulator(\n SIMULATOR_MODEL_NAME,\n explanation,\n max_concurrent=1,",
+ "type": "code",
+ "location": "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py:33-67"
+ },
+ "37": {
+ "file_id": 4,
+ "content": "Loading token lookup table and neuron record for a specific layer and index.\nGenerating an explanation using the provided token look up table.\nSimulating and scoring the generated explanation.",
+ "type": "comment"
+ },
+ "38": {
+ "file_id": 4,
+ "content": " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n )\n)\nscored_simulation = await simulate_and_score(simulator, valid_activation_records)\nprint(f\"score={scored_simulation.get_preferred_score():.2f}\")",
+ "type": "code",
+ "location": "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py:68-72"
+ },
+ "39": {
+ "file_id": 4,
+ "content": "Setting prompt format to \"INSTRUCTION_FOLLOWING\" and calling a function to simulate and score the activation records. Then, printing the preferred score with two decimal places.",
+ "type": "comment"
+ },
+ "40": {
+ "file_id": 5,
+ "content": "/neuron-explainer/neuron_explainer/activations/activation_records.py",
+ "type": "filepath"
+ },
+ "41": {
+ "file_id": 5,
+ "content": "The code provides utilities for activation records: computing the maximum activation, normalizing and formatting neuron activations for use in prompts, and marking activations as unknown when requested by the caller. It also calculates the ratio of non-zero activations to total activations across all records.",
+ "type": "summary"
+ },
+ "42": {
+ "file_id": 5,
+ "content": "\"\"\"Utilities for formatting activation records into prompts.\"\"\"\nimport math\nfrom typing import Optional, Sequence\nfrom neuron_explainer.activations.activations import ActivationRecord\nUNKNOWN_ACTIVATION_STRING = \"unknown\"\ndef relu(x: float) -> float:\n return max(0.0, x)\ndef calculate_max_activation(activation_records: Sequence[ActivationRecord]) -> float:\n \"\"\"Return the maximum activation value of the neuron across all the activation records.\"\"\"\n flattened = [\n # Relu is used to assume any values less than 0 are indicating the neuron is in the resting\n # state. This is a simplifying assumption that works with relu/gelu.\n max(relu(x) for x in activation_record.activations)\n for activation_record in activation_records\n ]\n return max(flattened)\ndef normalize_activations(activation_record: list[float], max_activation: float) -> list[int]:\n \"\"\"Convert raw neuron activations to integers on the range [0, 10].\"\"\"\n if max_activation <= 0:\n return [0 for x in activation_record]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activation_records.py:1-29"
+ },
+ "43": {
+ "file_id": 5,
+ "content": "This code defines functions to handle activation records, including calculating the maximum activation value and normalizing neuron activations. It also includes a ReLU helper, used to treat activation values less than 0 as indicating that the neuron is in its resting state.",
+ "type": "comment"
+ },
+ "44": {
+ "file_id": 5,
+ "content": " # Relu is used to assume any values less than 0 are indicating the neuron is in the resting\n # state. This is a simplifying assumption that works with relu/gelu.\n return [min(10, math.floor(10 * relu(x) / max_activation)) for x in activation_record]\ndef _format_activation_record(\n activation_record: ActivationRecord,\n max_activation: float,\n omit_zeros: bool,\n hide_activations: bool = False,\n start_index: int = 0,\n) -> str:\n \"\"\"Format neuron activations into a string, suitable for use in prompts.\"\"\"\n tokens = activation_record.tokens\n normalized_activations = normalize_activations(activation_record.activations, max_activation)\n if omit_zeros:\n assert (not hide_activations) and start_index == 0, \"Can't hide activations and omit zeros\"\n tokens = [\n token for token, activation in zip(tokens, normalized_activations) if activation > 0\n ]\n normalized_activations = [x for x in normalized_activations if x > 0]\n entries = []\n assert len(tokens) == len(normalized_activations)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activation_records.py:30-53"
+ },
+ "45": {
+ "file_id": 5,
+ "content": "The code snippet is responsible for formatting neuron activations into a string. It first normalizes the activations, then optionally drops tokens whose normalized activation is zero, and can hide activations based on the caller's arguments. The resulting string contains tokens and their corresponding normalized or hidden activations, suitable for use in prompts.",
+ "type": "comment"
+ },
+ "46": {
+ "file_id": 5,
+ "content": " for index, token, activation in zip(range(len(tokens)), tokens, normalized_activations):\n activation_string = str(int(activation))\n if hide_activations or index < start_index:\n activation_string = UNKNOWN_ACTIVATION_STRING\n entries.append(f\"{token}\\t{activation_string}\")\n return \"\\n\".join(entries)\ndef format_activation_records(\n activation_records: Sequence[ActivationRecord],\n max_activation: float,\n *,\n omit_zeros: bool = False,\n start_indices: Optional[list[int]] = None,\n hide_activations: bool = False,\n) -> str:\n \"\"\"Format a list of activation records into a string.\"\"\"\n return (\n \"\\n\\n\"\n + \"\\n\\n\\n\".join(\n [\n _format_activation_record(\n activation_record,\n max_activation,\n omit_zeros=omit_zeros,\n hide_activations=hide_activations,\n start_index=0 if start_indices is None else start_indices[i],\n )",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activation_records.py:54-81"
+ },
+ "47": {
+ "file_id": 5,
+ "content": "The code formats a list of activation records into a string. For each record it iterates through the token and normalized-activation pairs; if hide_activations is set or the index is less than start_index, it replaces the activation with UNKNOWN_ACTIVATION_STRING. Each record's entries are joined with newline characters, and the formatted records are joined with separator strings and wrapped in leading and trailing delimiters.",
+ "type": "comment"
+ },
+ "48": {
+ "file_id": 5,
+ "content": " for i, activation_record in enumerate(activation_records)\n ]\n )\n + \"\\n\\n\"\n )\ndef _format_tokens_for_simulation(tokens: Sequence[str]) -> str:\n \"\"\"\n Format tokens into a string with each token marked as having an \"unknown\" activation, suitable\n for use in prompts.\n \"\"\"\n entries = []\n for token in tokens:\n entries.append(f\"{token}\\t{UNKNOWN_ACTIVATION_STRING}\")\n return \"\\n\".join(entries)\ndef format_sequences_for_simulation(\n all_tokens: Sequence[Sequence[str]],\n) -> str:\n \"\"\"\n Format a list of lists of tokens into a string with each token marked as having an \"unknown\"\n activation, suitable for use in prompts.\n \"\"\"\n return (\n \"\\n\\n\"\n + \"\\n\\n\\n\".join(\n [_format_tokens_for_simulation(tokens) for tokens in all_tokens]\n )\n + \"\\n\\n\"\n )\ndef non_zero_activation_proportion(\n activation_records: Sequence[ActivationRecord], max_activation: float\n) -> float:\n \"\"\"Return the proportion of activation values that aren't zero.\"\"\"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activation_records.py:82-119"
+ },
+ "49": {
+ "file_id": 5,
+ "content": "This code contains several functions to format and manipulate activation records and tokens for simulation purposes. The `_format_tokens_for_simulation` function formats a sequence of strings into a string with each token marked as having an \"unknown\" activation, suitable for use in prompts. The `format_sequences_for_simulation` function extends this to format a list of lists of tokens into a string with each token marked as having an \"unknown\" activation, also suitable for use in prompts. Finally, the `non_zero_activation_proportion` function calculates the proportion of non-zero activation values among a sequence of ActivationRecord objects.",
+ "type": "comment"
+ },
+ "50": {
+ "file_id": 5,
+ "content": " total_activations_count = sum(\n [len(activation_record.activations) for activation_record in activation_records]\n )\n normalized_activations = [\n normalize_activations(activation_record.activations, max_activation)\n for activation_record in activation_records\n ]\n non_zero_activations_count = sum(\n [len([x for x in activations if x != 0]) for activations in normalized_activations]\n )\n return non_zero_activations_count / total_activations_count",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activation_records.py:120-130"
+ },
+ "51": {
+ "file_id": 5,
+ "content": "Calculating the ratio of non-zero activations to total activations across all activation records.",
+ "type": "comment"
+ },
+ "52": {
+ "file_id": 6,
+ "content": "/neuron-explainer/neuron_explainer/activations/activations.py",
+ "type": "filepath"
+ },
+ "53": {
+ "file_id": 6,
+ "content": "The code involves dataclasses, enums for slicing activation records, ensures disjoint and covering slices, obtains interleaved subsets for training, validation, explanation evaluations, checks neuron existence, fetches neuron data from a file, ensures compatibility with NeuronRecord dataclass, provides options for synchronous/asynchronous processing, retrieves fold names in numeric order from the \"neurons\" directory.",
+ "type": "summary"
+ },
+ "54": {
+ "file_id": 6,
+ "content": "# Dataclasses and enums for storing neuron-indexed information about activations. Also, related\n# helper functions.\nimport math\nfrom dataclasses import dataclass, field\nfrom typing import List, Optional, Union\nimport urllib.request\nimport blobfile as bf\nimport boostedblob as bbb\nfrom neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass\nfrom neuron_explainer.azure import standardize_azure_url\n@register_dataclass\n@dataclass\nclass ActivationRecord(FastDataclass):\n \"\"\"Collated lists of tokens and their activations for a single neuron.\"\"\"\n tokens: List[str]\n \"\"\"Tokens in the text sequence, represented as strings.\"\"\"\n activations: List[float]\n \"\"\"Raw activation values for the neuron on each token in the text sequence.\"\"\"\n@register_dataclass\n@dataclass\nclass NeuronId(FastDataclass):\n \"\"\"Identifier for a neuron in an artificial neural network.\"\"\"\n layer_index: int\n \"\"\"The index of layer the neuron is in. The first layer used during inference has index 0.\"\"\"\n neuron_index: int",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:1-33"
+ },
+ "55": {
+ "file_id": 6,
+ "content": "Defines dataclasses and enums for storing information about neuron-indexed activations, along with related helper functions.",
+ "type": "comment"
+ },
+ "56": {
+ "file_id": 6,
+ "content": " \"\"\"The neuron's index within in its layer. Indices start from 0 in each layer.\"\"\"\ndef _check_slices(\n slices_by_split: dict[str, slice],\n expected_num_values: int,\n) -> None:\n \"\"\"Assert that the slices are disjoint and fully cover the intended range.\"\"\"\n indices = set()\n sum_of_slice_lengths = 0\n n_splits = len(slices_by_split.keys())\n for s in slices_by_split.values():\n subrange = range(expected_num_values)[s]\n sum_of_slice_lengths += len(subrange)\n indices |= set(subrange)\n assert (\n sum_of_slice_lengths == expected_num_values\n ), f\"{sum_of_slice_lengths=} != {expected_num_values=}\"\n stride = n_splits\n expected_indices = set.union(\n *[set(range(start_index, expected_num_values, stride)) for start_index in range(n_splits)]\n )\n assert indices == expected_indices, f\"{indices=} != {expected_indices=}\"\ndef get_slices_for_splits(\n splits: list[str],\n num_activation_records_per_split: int,\n) -> dict[str, slice]:\n \"\"\"\n Get equal-sized interleaved subsets for each of a list of splits, given the number of elements",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:34-64"
+ },
+ "57": {
+ "file_id": 6,
+ "content": "This code defines two functions: `_check_slices` and `get_slices_for_splits`.\n- `_check_slices` checks if slices are disjoint and fully cover the intended range.\n- `get_slices_for_splits` gets equal-sized interleaved subsets for a list of splits.",
+ "type": "comment"
+ },
+ "58": {
+ "file_id": 6,
+ "content": " to include in each split.\n \"\"\"\n stride = len(splits)\n num_activation_records_for_even_splits = num_activation_records_per_split * stride\n slices_by_split = {\n split: slice(split_index, num_activation_records_for_even_splits, stride)\n for split_index, split in enumerate(splits)\n }\n _check_slices(\n slices_by_split=slices_by_split,\n expected_num_values=num_activation_records_for_even_splits,\n )\n return slices_by_split\n@dataclass\nclass ActivationRecordSliceParams:\n \"\"\"How to select splits (train, valid, etc.) of activation records.\"\"\"\n n_examples_per_split: Optional[int]\n \"\"\"The number of examples to include in each split.\"\"\"\n@register_dataclass\n@dataclass\nclass NeuronRecord(FastDataclass):\n \"\"\"Neuron-indexed activation data, including summary stats and notable activation records.\"\"\"\n neuron_id: NeuronId\n \"\"\"Identifier for the neuron.\"\"\"\n random_sample: list[ActivationRecord] = field(default_factory=list)\n \"\"\"\n Random activation records for this neuron. The random sample is independent from those used for",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:65-99"
+ },
+ "59": {
+ "file_id": 6,
+ "content": "This code defines a class for ActivationRecordSliceParams, which specifies how to slice activation records based on the number of examples per split. It also includes a dataclass NeuronRecord that stores neuron-indexed activation data with summary stats and notable activation records.",
+ "type": "comment"
+ },
+ "60": {
+ "file_id": 6,
+ "content": " other neurons.\n \"\"\"\n random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None\n \"\"\"\n Random samples of activation records in each of the specified quantiles. None if quantile\n tracking is disabled.\n \"\"\"\n quantile_boundaries: Optional[list[float]] = None\n \"\"\"Boundaries of the quantiles used to generate the random_sample_by_quantile field.\"\"\"\n # Moments of activations\n mean: Optional[float] = math.nan\n variance: Optional[float] = math.nan\n skewness: Optional[float] = math.nan\n kurtosis: Optional[float] = math.nan\n most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)\n \"\"\"\n Activation records with the most positive figure of merit value for this neuron over all dataset\n examples.\n \"\"\"\n @property\n def max_activation(self) -> float:\n \"\"\"Return the maximum activation value over all top-activating activation records.\"\"\"\n return max([max(ar.activations) for ar in self.most_positive_activation_records])",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:100-125"
+ },
+ "61": {
+ "file_id": 6,
+ "content": "This code represents a class for neuron activation records. It has attributes for random samples at specific quantiles, quantile boundaries, and moments of the activation values (mean, variance, skewness, kurtosis). Additionally, it includes a list of most positive activation records and a property to return the maximum activation value across all top-activating activation records.",
+ "type": "comment"
+ },
+ "62": {
+ "file_id": 6,
+ "content": " def _get_top_activation_slices(\n self, activation_record_slice_params: ActivationRecordSliceParams\n ) -> dict[str, slice]:\n splits = [\"train\", \"calibration\", \"valid\", \"test\"]\n n_examples_per_split = activation_record_slice_params.n_examples_per_split\n if n_examples_per_split is None:\n n_examples_per_split = len(self.most_positive_activation_records) // len(splits)\n assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits)\n return get_slices_for_splits(splits, n_examples_per_split)\n def _get_random_activation_slices(\n self, activation_record_slice_params: ActivationRecordSliceParams\n ) -> dict[str, slice]:\n splits = [\"calibration\", \"valid\", \"test\"]\n n_examples_per_split = activation_record_slice_params.n_examples_per_split\n if n_examples_per_split is None:\n n_examples_per_split = len(self.random_sample) // len(splits)\n # NOTE: this assert could trigger on some ol",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:127-144"
+ },
+ "63": {
+ "file_id": 6,
+ "content": "Code defines two methods, _get_top_activation_slices and _get_random_activation_slices, which return slices for activation records based on specified parameters. These slices are used to select a subset of the activation records for further processing.",
+ "type": "comment"
+ },
+ "64": {
+ "file_id": 6,
+ "content": "d datasets with only 10 random samples, in which case you may have to remove \"test\" from the set of splits\n assert len(self.random_sample) >= n_examples_per_split * len(splits)\n return get_slices_for_splits(splits, n_examples_per_split)\n def train_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Train split, typically used for generating explanations. Consists exclusively of\n top-activating records since context window limitations make it difficult to include\n random records.\n \"\"\"\n return self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"train\"]\n ]\n def calibration_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Calibration split, typically used for calibrating neuron simulations. See",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:144-166"
+ },
+ "65": {
+ "file_id": 6,
+ "content": "This chunk contains the end of \"_get_random_activation_slices\" and the methods \"train_activation_records\" and \"calibration_activation_records\".\nThe end of \"_get_random_activation_slices\" asserts that enough random samples exist and returns the slices produced by \"get_slices_for_splits\" for the given number of examples per split.\n\"train_activation_records\" retrieves activation records from the \"most_positive_activation_records\" list for the training split.\n\"calibration_activation_records\" retrieves activation records for the calibration split (only its signature and the start of its docstring appear in this chunk).",
+ "type": "comment"
+ },
+ "66": {
+ "file_id": 6,
+ "content": " http://go/neuron_explanation_methodology for an explanation of calibration. Consists of\n top-activating records and random records in a 1:1 ratio.\n \"\"\"\n return (\n self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"calibration\"]\n ]\n + self.random_sample[\n self._get_random_activation_slices(activation_record_slice_params)[\"calibration\"]\n ]\n )\n def valid_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Validation split, typically used for evaluating explanations, either automatically with\n simulation + correlation coefficient scoring, or manually by humans. Consists of\n top-activating records and random records in a 1:1 ratio.\n \"\"\"\n return (\n self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"valid\"]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:167-190"
+ },
+ "67": {
+ "file_id": 6,
+ "content": "This code contains the bodies of two methods: \"calibration_activation_records\" and \"valid_activation_records\". Both methods return a combination of top-activating records and random records in a 1:1 ratio; the calibration split is used for calibrating neuron simulations, and the validation split is used for evaluating explanations.",
+ "type": "comment"
+ },
+ "68": {
+ "file_id": 6,
+ "content": " ]\n + self.random_sample[\n self._get_random_activation_slices(activation_record_slice_params)[\"valid\"]\n ]\n )\n def test_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Test split, typically used for explanation evaluations that can't use the validation split.\n Consists of top-activating records and random records in a 1:1 ratio.\n \"\"\"\n return (\n self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"test\"]\n ]\n + self.random_sample[\n self._get_random_activation_slices(activation_record_slice_params)[\"test\"]\n ]\n )\ndef neuron_exists(\n dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]\n) -> bool:\n \"\"\"Return whether the specified neuron exists.\"\"\"\n file = bf.join(dataset_path, \"neurons\", str(layer_index), f\"{neuron_index}.json\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:191-219"
+ },
+ "69": {
+ "file_id": 6,
+ "content": "The code contains three parts:\n1. The end of \"valid_activation_records\": appends the random-sample records selected by the \"valid\" slice to the top-activating records.\n2. \"test_activation_records\": returns a list of activation records used for explanation evaluations that can't use the validation split, containing top-activating records and random records in a 1:1 ratio.\n3. \"neuron_exists\": checks if a specified neuron exists based on given dataset path, layer index, and neuron index.",
+ "type": "comment"
+ },
+ "70": {
+ "file_id": 6,
+ "content": " return bf.exists(file)\ndef load_neuron(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations\",\n) -> NeuronRecord:\n \"\"\"Load the NeuronRecord for the specified neuron.\"\"\"\n url = \"/\".join([dataset_path, str(layer_index), f\"{neuron_index}.json\"])\n url = standardize_azure_url(url)\n with urllib.request.urlopen(url) as f:\n neuron_record = loads(f.read())\n if not isinstance(neuron_record, NeuronRecord):\n raise ValueError(\n f\"Stored data incompatible with current version of NeuronRecord dataclass.\"\n )\n return neuron_record\n@bbb.ensure_session\nasync def load_neuron_async(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"az://openaipublic/neuron-explainer/data/collated-activations\",\n) -> NeuronRecord:\n \"\"\"Async version of load_neuron.\"\"\"\n file = bf.join(dataset_path, str(layer_index), f\"{neuron_index}.json\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:220-247"
+ },
+ "71": {
+ "file_id": 6,
+ "content": "This code contains two functions, `load_neuron` and an asynchronous version `load_neuron_async`, that load the NeuronRecord for a specified neuron. `load_neuron` constructs the URL from the dataset path, layer index and neuron index, standardizes it with `standardize_azure_url`, then opens the URL and deserializes the JSON data. If the loaded data is not of type NeuronRecord, it raises a ValueError. The asynchronous version is decorated with `ensure_session` from boostedblob (imported as `bbb`) and builds the file path with `bf.join`.",
+ "type": "comment"
+ },
+ "72": {
+ "file_id": 6,
+ "content": " return await read_neuron_file(file)\n@bbb.ensure_session\nasync def read_neuron_file(neuron_filename: str) -> NeuronRecord:\n \"\"\"Like load_neuron_async, but takes a raw neuron filename.\"\"\"\n raw_contents = await bbb.read.read_single(neuron_filename)\n neuron_record = loads(raw_contents.decode(\"utf-8\"))\n if not isinstance(neuron_record, NeuronRecord):\n raise ValueError(\n f\"Stored data incompatible with current version of NeuronRecord dataclass.\"\n )\n return neuron_record\ndef get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:\n \"\"\"Returns the indices of all neurons in this layer, in ascending order.\"\"\"\n layer_dir = bf.join(dataset_path, \"neurons\", str(layer_index))\n return sorted(\n [int(f.split(\".\")[0]) for f in bf.listdir(layer_dir) if f.split(\".\")[0].isnumeric()]\n )\ndef get_sorted_layers(dataset_path: str) -> List[str]:\n \"\"\"\n Return the indices of all layers in this dataset, in ascending numerical order, as strings.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:248-273"
+ },
+ "73": {
+ "file_id": 6,
+ "content": "This code reads neuron data from a file, checks its compatibility with the NeuronRecord dataclass, and provides functions to get the sorted neuron indices of a layer and the sorted layer indices of a dataset.",
+ "type": "comment"
+ },
+ "74": {
+ "file_id": 6,
+ "content": " \"\"\"\n return [\n str(x)\n for x in sorted(\n [int(x) for x in bf.listdir(bf.join(dataset_path, \"neurons\")) if x.isnumeric()]\n )\n ]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/activations.py:274-280"
+ },
+ "75": {
+ "file_id": 6,
+ "content": "Lists the numerically named entries of the \"neurons\" directory, sorts them in ascending numerical order, and returns them as strings.",
+ "type": "comment"
+ },
+ "76": {
+ "file_id": 7,
+ "content": "/neuron-explainer/neuron_explainer/activations/token_connections.py",
+ "type": "filepath"
+ },
+ "77": {
+ "file_id": 7,
+ "content": "The code defines dataclasses and loader functions for neuron-to-token connection data hosted on Azure: a weight-based summary and a lookup table of tokens with their average activations for a specified neuron. Each loader builds a standardized URL, reads the file, and returns the deserialized result.",
+ "type": "summary"
+ },
+ "78": {
+ "file_id": 7,
+ "content": "from dataclasses import dataclass\nfrom typing import List, Union\nimport blobfile as bf\nfrom neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass\nfrom neuron_explainer.azure import standardize_azure_url\nimport urllib.request\n@register_dataclass\n@dataclass\nclass TokensAndWeights(FastDataclass):\n tokens: List[str]\n strengths: List[float]\n@register_dataclass\n@dataclass\nclass WeightBasedSummaryOfNeuron(FastDataclass):\n input_positive: TokensAndWeights\n input_negative: TokensAndWeights\n output_positive: TokensAndWeights\n output_negative: TokensAndWeights\ndef load_token_weight_connections_of_neuron(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based\",\n) -> WeightBasedSummaryOfNeuron:\n \"\"\"Load the TokenLookupTableSummaryOfNeuron for the specified neuron.\"\"\"\n url = \"/\".join([dataset_path, str(layer_index), f\"{neuron_index}.json\"])\n url = standardize_azure_url(url)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/token_connections.py:1-33"
+ },
+ "79": {
+ "file_id": 7,
+ "content": "Loading token-weight connections of a neuron from an Azure dataset. The function builds the URL for the specified layer index and neuron index and passes it through `standardize_azure_url`. Its return annotation is WeightBasedSummaryOfNeuron; the docstring's mention of TokenLookupTableSummaryOfNeuron appears to be a copy-paste leftover.",
+ "type": "comment"
+ },
+ "80": {
+ "file_id": 7,
+ "content": " with urllib.request.urlopen(url) as f:\n return loads(f.read(), backwards_compatible=False)\n@register_dataclass\n@dataclass\nclass TokenLookupTableSummaryOfNeuron(FastDataclass):\n \"\"\"List of tokens and the average activations of a given neuron in response to each\n respective token. These are selected from among the tokens in the vocabulary with the\n highest average activations across an internet text dataset, with the highest activations\n first.\"\"\"\n tokens: List[str]\n average_activations: List[float]\ndef load_token_lookup_table_connections_of_neuron(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based\",\n) -> TokenLookupTableSummaryOfNeuron:\n \"\"\"Load the TokenLookupTableSummaryOfNeuron for the specified neuron.\"\"\"\n url = \"/\".join([dataset_path, str(layer_index), f\"{neuron_index}.json\"])\n url = standardize_azure_url(url)\n with urllib.request.urlopen(url) as f:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/token_connections.py:34-58"
+ },
+ "81": {
+ "file_id": 7,
+ "content": "This code loads a lookup table containing tokens and their average activations for a given neuron. The table is generated from the highest average activations across an internet text dataset, and the data is retrieved from an Azure URL.",
+ "type": "comment"
+ },
+ "82": {
+ "file_id": 7,
+ "content": " return loads(f.read(), backwards_compatible=False)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/activations/token_connections.py:59-59"
+ },
+ "83": {
+ "file_id": 7,
+ "content": "Reads the body of the opened URL and deserializes it with `loads(..., backwards_compatible=False)`, returning the result.",
+ "type": "comment"
+ },
+ "84": {
+ "file_id": 8,
+ "content": "/neuron-explainer/neuron_explainer/api_client.py",
+ "type": "filepath"
+ },
+ "85": {
+ "file_id": 8,
+ "content": "The code initializes an API client with error handling, response caching for OpenAI requests, and implements exponential backoff for retry mechanisms. It uses HTTPX to make asynchronous requests and starts an event loop in the main function.",
+ "type": "summary"
+ },
+ "86": {
+ "file_id": 8,
+ "content": "import asyncio\nimport contextlib\nimport os\nimport random\nimport traceback\nfrom asyncio import Semaphore\nfrom functools import wraps\nfrom typing import Any, Callable, Optional\nimport httpx\nimport orjson\ndef is_api_error(err: Exception) -> bool:\n if isinstance(err, httpx.HTTPStatusError):\n response = err.response\n error_data = response.json().get(\"error\", {})\n error_message = error_data.get(\"message\")\n if response.status_code in [400, 404, 415]:\n if error_data.get(\"type\") == \"idempotency_error\":\n print(f\"Retrying after idempotency error: {error_message} ({response.url})\")\n return True\n else:\n # Invalid request\n return False\n else:\n print(f\"Retrying after API error: {error_message} ({response.url})\")\n return True\n elif isinstance(err, httpx.ConnectError):\n print(f\"Retrying after connection error... ({err.request.url})\")\n return True\n elif isinstance(err, httpx.TimeoutException):",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/api_client.py:1-34"
+ },
+ "87": {
+ "file_id": 8,
+ "content": "This function decides whether a failed request should be retried. For an HTTP status error with status code 400, 404, or 415, it retries only when the error type is \"idempotency_error\" and otherwise treats the request as invalid and does not retry; HTTP status errors with any other status code are retried. Connection errors are also retried.",
+ "type": "comment"
+ },
+ "88": {
+ "file_id": 8,
+ "content": " print(f\"Retrying after a timeout error... ({err.request.url})\")\n return True\n elif isinstance(err, httpx.ReadError):\n print(f\"Retrying after a read error... ({err.request.url})\")\n return True\n print(f\"Retrying after an unexpected error: {repr(err)}\")\n traceback.print_tb(err.__traceback__)\n return True\ndef exponential_backoff(\n retry_on: Callable[[Exception], bool] = lambda err: True\n) -> Callable[[Callable], Callable]:\n \"\"\"\n Returns a decorator which retries the wrapped function as long as the specified retry_on\n function returns True for the exception, applying exponential backoff with jitter after\n failures, up to a retry limit.\n \"\"\"\n init_delay_s = 1.0\n max_delay_s = 10.0\n # Roughly 30 minutes before we give up.\n max_tries = 200\n backoff_multiplier = 2.0\n jitter = 0.2\n def decorate(f: Callable) -> Callable:\n assert asyncio.iscoroutinefunction(f)\n @wraps(f)\n async def f_retry(*args: Any, **kwargs: Any) -> None:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/api_client.py:35-66"
+ },
+ "89": {
+ "file_id": 8,
+ "content": "The chunk first finishes `is_api_error`: timeout and read errors are retried, and any other unexpected error is printed with its traceback and also retried. It then defines `exponential_backoff`, which returns a decorator that retries a wrapped asynchronous function with exponential backoff and jitter after failures. The retry attempts continue until the specified `retry_on` condition returns False or the maximum number of tries is reached.",
+ "type": "comment"
+ },
+ "90": {
+ "file_id": 8,
+ "content": " delay_s = init_delay_s\n for i in range(max_tries):\n try:\n return await f(*args, **kwargs)\n except Exception as err:\n if not retry_on(err) or i == max_tries - 1:\n raise\n jittered_delay = random.uniform(delay_s * (1 - jitter), delay_s * (1 + jitter))\n await asyncio.sleep(jittered_delay)\n delay_s = min(delay_s * backoff_multiplier, max_delay_s)\n return f_retry\n return decorate\nAPI_KEY = os.getenv(\"OPENAI_API_KEY\")\nassert API_KEY, \"Please set the OPENAI_API_KEY environment variable\"\nAPI_HTTP_HEADERS = {\n \"Content-Type\": \"application/json\",\n \"Authorization\": \"Bearer \" + API_KEY,\n}\nBASE_API_URL = \"https://api.openai.com/v1\"\nclass ApiClient:\n \"\"\"Performs inference using the OpenAI API. Supports response caching and concurrency limits.\"\"\"\n def __init__(\n self,\n model_name: str,\n # If set, no more than this number of HTTP requests will be made concurrently.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/api_client.py:67-98"
+ },
+ "91": {
+ "file_id": 8,
+ "content": "Code snippet handles API requests with retry mechanism and error handling. It sets the OpenAI API key, HTTP headers, and base API URL for making requests. The ApiClient class is initialized with a model_name parameter and supports response caching and concurrency limits.",
+ "type": "comment"
+ },
+ "92": {
+ "file_id": 8,
+ "content": " max_concurrent: Optional[int] = None,\n # Whether to cache request/response pairs in memory to avoid duplicating requests.\n cache: bool = False,\n ):\n self.model_name = model_name\n if max_concurrent is not None:\n self._concurrency_check: Optional[Semaphore] = Semaphore(max_concurrent)\n else:\n self._concurrency_check = None\n if cache:\n self._cache: Optional[dict[str, Any]] = {}\n else:\n self._cache = None\n @exponential_backoff(retry_on=is_api_error)\n async def make_request(\n self, timeout_seconds: Optional[int] = None, **kwargs: Any\n ) -> dict[str, Any]:\n if self._cache is not None:\n key = orjson.dumps(kwargs)\n if key in self._cache:\n return self._cache[key]\n async with contextlib.AsyncExitStack() as stack:\n if self._concurrency_check is not None:\n await stack.enter_async_context(self._concurrency_check)\n http_client = await stack.enter_async_context(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/api_client.py:99-126"
+ },
+ "93": {
+ "file_id": 8,
+ "content": "The code is initializing an API client with optional parameters for maximum concurrent requests, and whether to cache request/response pairs. It also has a method `make_request` which uses exponential backoff retry mechanism when making HTTP calls. If caching is enabled, it checks if the request has been cached before executing the call.",
+ "type": "comment"
+ },
+ "94": {
+ "file_id": 8,
+ "content": " httpx.AsyncClient(timeout=timeout_seconds)\n )\n # If the request has a \"messages\" key, it should be sent to the /chat/completions\n # endpoint. Otherwise, it should be sent to the /completions endpoint.\n url = BASE_API_URL + (\"/chat/completions\" if \"messages\" in kwargs else \"/completions\")\n kwargs[\"model\"] = self.model_name\n response = await http_client.post(url, headers=API_HTTP_HEADERS, json=kwargs)\n # The response json has useful information but the exception doesn't include it, so print it\n # out then reraise.\n try:\n response.raise_for_status()\n except Exception as e:\n print(response.json())\n raise e\n if self._cache is not None:\n self._cache[key] = response.json()\n return response.json()\nif __name__ == \"__main__\":\n async def main() -> None:\n client = ApiClient(model_name=\"gpt-3.5-turbo\", max_concurrent=1)\n print(await client.make_request(prompt=\"Why did the chicken cross the road?\", max_tokens=9))",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/api_client.py:127-150"
+ },
+ "95": {
+ "file_id": 8,
+ "content": "This code is creating an instance of `ApiClient` class and making a request to OpenAI API using the `make_request` method. The request URL depends on whether the input has \"messages\" key or not, and it uses HTTPX client for asynchronous requests. If there's an error in the response, it prints the JSON data then re-raises the exception. If a cache is set, the response JSON will be cached under the specified key.",
+ "type": "comment"
+ },
+ "96": {
+ "file_id": 8,
+ "content": " asyncio.run(main())",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/api_client.py:152-152"
+ },
+ "97": {
+ "file_id": 8,
+ "content": "This code starts an asynchronous event loop and runs the main function.",
+ "type": "comment"
+ },
+ "98": {
+ "file_id": 9,
+ "content": "/neuron-explainer/neuron_explainer/azure.py",
+ "type": "filepath"
+ },
+ "99": {
+ "file_id": 9,
+ "content": "This function converts an Azure path starting with \"az://openaipublic/\" into the equivalent \"https://openaipublic.blob.core.windows.net/\" URL; any other URL is returned unchanged.",
+ "type": "summary"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/1.json b/docs/data/1.json
new file mode 100644
index 0000000..d7cae3c
--- /dev/null
+++ b/docs/data/1.json
@@ -0,0 +1,547 @@
+{
+ "100": {
+ "file_id": 9,
+ "content": "def standardize_azure_url(url):\n \"\"\"Make sure url is converted to url format, not an azure path\"\"\"\n if url.startswith(\"az://openaipublic/\"):\n url = url.replace(\"az://openaipublic/\", \"https://openaipublic.blob.core.windows.net/\")\n return url",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/azure.py:1-5"
+ },
+ "101": {
+ "file_id": 9,
+ "content": "This function converts the input URL to Azure format if it starts with \"az://openaipublic/\".",
+ "type": "comment"
+ },
+ "102": {
+ "file_id": 10,
+ "content": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py",
+ "type": "filepath"
+ },
+ "103": {
+ "file_id": 10,
+ "content": "The CalibratedNeuronSimulator improves NeuronSimulator with calibration methods, while the LinearCalibratedNeuronSimulator uses flattened activations and true activations for calibration, and PercentileMatchingCalibratedNeuronSimulator ensures distribution matching on the calibration set.",
+ "type": "summary"
+ },
+ "104": {
+ "file_id": 10,
+ "content": "\"\"\"\nCode for calibrating simulations of neuron behavior. Calibration refers to a process of mapping from\na space of predicted activation values (e.g. [0, 10]) to the real activation distribution for a\nneuron.\nSee http://go/neuron_explanation_methodology for description of calibration step. Necessary for\nsimulating neurons in the context of ablate-to-simulation, but can be skipped when using correlation\nscoring. (Calibration may still improve quality for scoring, at least for non-linear calibration\nmethods.)\n\"\"\"\nfrom __future__ import annotations\nimport asyncio\nfrom abc import abstractmethod\nfrom typing import Optional, Sequence\nimport numpy as np\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.explanations.explanations import ActivationScale\nfrom neuron_explainer.explanations.simulator import NeuronSimulator, SequenceSimulation\nfrom sklearn import linear_model\nclass CalibratedNeuronSimulator(NeuronSimulator):\n \"\"\"\n Wrap a NeuronSimulator and calibrate it to map from the predicted activation space to the",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:1-27"
+ },
+ "105": {
+ "file_id": 10,
+ "content": "CalibratedNeuronSimulator class inherits from NeuronSimulator and provides calibration for mapping predicted activation values to real neuron activations.",
+ "type": "comment"
+ },
+ "106": {
+ "file_id": 10,
+ "content": " actual neuron activation space.\n \"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n self.uncalibrated_simulator = uncalibrated_simulator\n @classmethod\n async def create(\n cls,\n uncalibrated_simulator: NeuronSimulator,\n calibration_activation_records: Sequence[ActivationRecord],\n ) -> CalibratedNeuronSimulator:\n \"\"\"\n Create and calibrate a calibrated simulator (so initialization and calibration can be done\n in one call).\n \"\"\"\n calibrated_simulator = cls(uncalibrated_simulator)\n await calibrated_simulator.calibrate(calibration_activation_records)\n return calibrated_simulator\n async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:\n \"\"\"\n Determine parameters to map from the predicted activation space to the real neuron\n activation space, based on a calibration set.\n Use when simulated sequences haven't already been produced on the calibration set.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:28-53"
+ },
+ "107": {
+ "file_id": 10,
+ "content": "This code defines a class method `create()` and a method `calibrate()` for the `CalibratedNeuronSimulator` class. The `create()` method creates and calibrates a simulator in one call, while the `calibrate()` method determines parameters to map from predicted activation space to real neuron activation space based on a calibration set.",
+ "type": "comment"
+ },
+ "108": {
+ "file_id": 10,
+ "content": " \"\"\"\n simulations = await asyncio.gather(\n *[\n self.uncalibrated_simulator.simulate(activations.tokens)\n for activations in calibration_activation_records\n ]\n )\n self.calibrate_from_simulations(calibration_activation_records, simulations)\n def calibrate_from_simulations(\n self,\n calibration_activation_records: Sequence[ActivationRecord],\n simulations: Sequence[SequenceSimulation],\n ) -> None:\n \"\"\"\n Determine parameters to map from the predicted activation space to the real neuron\n activation space, based on a calibration set.\n Use when simulated sequences have already been produced on the calibration set.\n \"\"\"\n flattened_activations = []\n flattened_simulated_activations: list[float] = []\n for activations, simulation in zip(calibration_activation_records, simulations):\n flattened_activations.extend(activations.activations)\n flattened_simulated_activations.extend(simulation.expected_activations)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:54-78"
+ },
+ "109": {
+ "file_id": 10,
+ "content": "Calibrating simulator by generating flattened activation sequences for both calibration_activation_records and simulations.",
+ "type": "comment"
+ },
+ "110": {
+ "file_id": 10,
+ "content": " self._calibrate_from_flattened_activations(\n np.array(flattened_activations), np.array(flattened_simulated_activations)\n )\n @abstractmethod\n def _calibrate_from_flattened_activations(\n self,\n true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n \"\"\"\n Determine parameters to map from the predicted activation space to the real neuron\n activation space, based on a calibration set.\n Take numpy arrays of all true activations and all uncalibrated activations on the\n calibration set over all sequences.\n \"\"\"\n @abstractmethod\n def apply_calibration(self, values: Sequence[float]) -> list[float]:\n \"\"\"Apply the learned calibration to a sequence of values.\"\"\"\n async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:\n uncalibrated_seq_simulation = await self.uncalibrated_simulator.simulate(tokens)\n calibrated_activations = self.apply_calibration(\n uncalibrated_seq_simulation.expected_activations",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:79-104"
+ },
+ "111": {
+ "file_id": 10,
+ "content": "This code defines a calibrated simulator that can be used to map the predicted activation space of a model to the actual neuron activation space. It contains methods for calibration and applying calibration to sequences of values. The simulate method is also defined, which uses an uncalibrated simulator to obtain expected activations and applies the calibration to obtain the final calibrated activations.",
+ "type": "comment"
+ },
+ "112": {
+ "file_id": 10,
+ "content": " )\n calibrated_distribution_values = [\n self.apply_calibration(dv) for dv in uncalibrated_seq_simulation.distribution_values\n ]\n return SequenceSimulation(\n tokens=uncalibrated_seq_simulation.tokens,\n expected_activations=calibrated_activations,\n activation_scale=ActivationScale.NEURON_ACTIVATIONS,\n distribution_values=calibrated_distribution_values,\n distribution_probabilities=uncalibrated_seq_simulation.distribution_probabilities,\n uncalibrated_simulation=uncalibrated_seq_simulation,\n )\nclass UncalibratedNeuronSimulator(CalibratedNeuronSimulator):\n \"\"\"Pass through the activations without trying to calibrate.\"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n super().__init__(uncalibrated_simulator)\n async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:\n pass\n def _calibrate_from_flattened_activations(\n self,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:105-129"
+ },
+ "113": {
+ "file_id": 10,
+ "content": "CalibratedNeuronSimulator applies calibration to uncalibrated sequence simulation. UncalibratedNeuronSimulator passes through activations without calibration.",
+ "type": "comment"
+ },
+ "114": {
+ "file_id": 10,
+ "content": " true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n pass\n def apply_calibration(self, values: Sequence[float]) -> list[float]:\n return values if isinstance(values, list) else list(values)\nclass LinearCalibratedNeuronSimulator(CalibratedNeuronSimulator):\n \"\"\"Find a linear mapping from uncalibrated activations to true activations.\n Should not change ev_correlation_score because it is invariant to linear transformations.\n \"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n super().__init__(uncalibrated_simulator)\n self._regression: Optional[linear_model.LinearRegression] = None\n def _calibrate_from_flattened_activations(\n self,\n true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n self._regression = linear_model.LinearRegression()\n self._regression.fit(uncalibrated_activations.reshape(-1, 1), true_activations)\n def apply_calibration(self, values: Sequence[float]) -> list[float]:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:130-157"
+ },
+ "115": {
+ "file_id": 10,
+ "content": "This code defines a class `LinearCalibratedNeuronSimulator` that inherits from `CalibratedNeuratorSimulator`. It initializes an optional linear regression model and provides two methods. The method `_calibrate_from_flattened_activations` fits the linear regression model with flattened uncalibrated activations and true activations, and the method `apply_calibration` applies the calibration to a given sequence of values if they are a list.",
+ "type": "comment"
+ },
+ "116": {
+ "file_id": 10,
+ "content": " if self._regression is None:\n raise ValueError(\"Must call calibrate() before apply_calibration\")\n if len(values) == 0:\n return []\n return self._regression.predict(np.reshape(np.array(values), (-1, 1))).tolist()\nclass PercentileMatchingCalibratedNeuronSimulator(CalibratedNeuronSimulator):\n \"\"\"\n Map the nth percentile of the uncalibrated activations to the nth percentile of the true\n activations for all n.\n This will match the distribution of true activations on the calibration set, but will be\n overconfident outside of the calibration set.\n \"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n super().__init__(uncalibrated_simulator)\n self._uncalibrated_activations: Optional[np.ndarray] = None\n self._true_activations: Optional[np.ndarray] = None\n def _calibrate_from_flattened_activations(\n self,\n true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n self._uncalibrated_activations = np.sort(uncalibrated_activations)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:158-184"
+ },
+ "117": {
+ "file_id": 10,
+ "content": "This code defines a `PercentileMatchingCalibratedNeuronSimulator` class that calibrates a neuron simulator by mapping the nth percentile of uncalibrated activations to the nth percentile of true activations for all n. This will match the distribution of true activations on the calibration set but will be overconfident outside of it. The `__init__` method initializes an instance with an optional `uncalibrated_simulator`, and the `_calibrate_from_flattened_activations` method performs the actual calibration using true activations and uncalibrated activations as inputs.",
+ "type": "comment"
+ },
+ "118": {
+ "file_id": 10,
+ "content": " self._true_activations = np.sort(true_activations)\n def apply_calibration(self, values: Sequence[float]) -> list[float]:\n if self._true_activations is None or self._uncalibrated_activations is None:\n raise ValueError(\"Must call calibrate() before apply_calibration\")\n if len(values) == 0:\n return []\n return np.interp(\n np.array(values), self._uncalibrated_activations, self._true_activations\n ).tolist()",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:185-194"
+ },
+ "119": {
+ "file_id": 10,
+ "content": "Sorting true_activations for calibration and raising ValueError if calibrate() not called before apply_calibration.",
+ "type": "comment"
+ },
+ "120": {
+ "file_id": 11,
+ "content": "/neuron-explainer/neuron_explainer/explanations/explainer.py",
+ "type": "filepath"
+ },
+ "121": {
+ "file_id": 11,
+ "content": "The code includes an AI model for generating explanations using API calls and prompts, along with helper functions, constants, and a base class NeuronExplainer. It also handles long prompts and extracts explanations from completion lists while removing extra spaces.",
+ "type": "summary"
+ },
+ "122": {
+ "file_id": 11,
+ "content": "\"\"\"Uses API calls to generate explanations of neuron behavior.\"\"\"\nfrom __future__ import annotations\nimport logging\nimport re\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import Any, Optional, Sequence, Union\nfrom neuron_explainer.activations.activation_records import (\n calculate_max_activation,\n format_activation_records,\n non_zero_activation_proportion,\n)\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.api_client import ApiClient\nfrom neuron_explainer.explanations.few_shot_examples import FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import (\n HarmonyMessage,\n PromptBuilder,\n PromptFormat,\n Role,\n)\nfrom neuron_explainer.explanations.token_space_few_shot_examples import (\n TokenSpaceFewShotExampleSet,\n)\nlogger = logging.getLogger(__name__)\n# TODO(williamrs): This prefix may not work well for some things, like predicting the next token.\n# Try other options like \"this neuron activates for\".\nEXPLANATION_PREFIX = \"the main thing this neuron does is find\"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:1-34"
+ },
+ "123": {
+ "file_id": 11,
+ "content": "This code imports necessary modules and defines a few classes for generating explanations of neuron behavior using API calls. It also sets a prefix to be used when generating explanations.",
+ "type": "comment"
+ },
+ "124": {
+ "file_id": 11,
+ "content": "def _split_numbered_list(text: str) -> list[str]:\n \"\"\"Split a numbered list into a list of strings.\"\"\"\n lines = re.split(r\"\\n\\d+\\.\", text)\n # Strip the leading whitespace from each line.\n return [line.lstrip() for line in lines]\ndef _remove_final_period(text: str) -> str:\n \"\"\"Strip a final period or period-space from a string.\"\"\"\n if text.endswith(\".\"):\n return text[:-1]\n elif text.endswith(\". \"):\n return text[:-2]\n return text\nclass ContextSize(int, Enum):\n TWO_K = 2049\n FOUR_K = 4097\n @classmethod\n def from_int(cls, i: int) -> ContextSize:\n for context_size in cls:\n if context_size.value == i:\n return context_size\n raise ValueError(f\"{i} is not a valid ContextSize\")\nHARMONY_V4_MODELS = [\"gpt-3.5-turbo\", \"gpt-4\"]\nclass NeuronExplainer(ABC):\n \"\"\"\n Abstract base class for Explainer classes that generate explanations from subclass-specific\n input data.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:37-77"
+ },
+ "125": {
+ "file_id": 11,
+ "content": "This code defines a class called NeuronExplainer, which is an abstract base class for generating explanations from subclass-specific input data. It also includes helper functions for splitting numbered lists and removing final periods or period-spaces from strings. The code also defines two constants: HARMONY_V4_MODELS (a list of supported model names) and ContextSize (an enumeration representing different context sizes).",
+ "type": "comment"
+ },
+ "126": {
+ "file_id": 11,
+ "content": " # This parameter lets us adjust the length of the prompt when we're generating explanations\n # using older models with shorter context windows. In the future we can use it to experiment\n # with longer context windows.\n context_size: ContextSize = ContextSize.FOUR_K,\n max_concurrent: Optional[int] = 10,\n cache: bool = False,\n ):\n if prompt_format == PromptFormat.HARMONY_V4:\n assert model_name in HARMONY_V4_MODELS\n elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n assert model_name not in HARMONY_V4_MODELS\n else:\n raise ValueError(f\"Unhandled prompt format {prompt_format}\")\n self.model_name = model_name\n self.prompt_format = prompt_format\n self.context_size = context_size\n self.client = ApiClient(model_name=model_name, max_concurrent=max_concurrent, cache=cache)\n async def generate_explanations(\n self,\n *,\n num_samples: int = 5,\n max_tokens: int = 60,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:78-101"
+ },
+ "127": {
+ "file_id": 11,
+ "content": "This code is defining a class with an initializer and a method for generating explanations. It takes in parameters such as model name, prompt format, context size, max concurrent requests, and cache settings. It also asserts that the model name is appropriate for the prompt format provided, preventing incorrect usage.",
+ "type": "comment"
+ },
+ "128": {
+ "file_id": 11,
+ "content": " temperature: float = 1.0,\n top_p: float = 1.0,\n **prompt_kwargs: Any,\n ) -> list[Any]:\n \"\"\"Generate explanations based on subclass-specific input data.\"\"\"\n prompt = self.make_explanation_prompt(max_tokens_for_completion=max_tokens, **prompt_kwargs)\n generate_kwargs: dict[str, Any] = {\n \"n\": num_samples,\n \"max_tokens\": max_tokens,\n \"temperature\": temperature,\n \"top_p\": top_p,\n }\n if self.prompt_format == PromptFormat.HARMONY_V4:\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n generate_kwargs[\"messages\"] = prompt\n else:\n assert isinstance(prompt, str)\n generate_kwargs[\"prompt\"] = prompt\n response = await self.client.make_request(**generate_kwargs)\n logger.debug(\"response in generate_explanations is %s\", response)\n if self.prompt_format == PromptFormat.HARMONY_V4:\n explanations = [x[\"message\"][\"content\"] for x in response[\"choices\"]]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:102-128"
+ },
+ "129": {
+ "file_id": 11,
+ "content": "The code is generating explanations based on subclass-specific input data. It first creates a prompt and then passes the prompt along with other parameters to a language model for completion. If the format is HarmonyV4, it expects a list of dictionaries (HarmonyMessage), otherwise a string prompt is passed. The response from the language model is then processed to extract explanations.",
+ "type": "comment"
+ },
+ "130": {
+ "file_id": 11,
+ "content": " elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n explanations = [x[\"text\"] for x in response[\"choices\"]]\n else:\n raise ValueError(f\"Unhandled prompt format {self.prompt_format}\")\n return self.postprocess_explanations(explanations, prompt_kwargs)\n @abstractmethod\n def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n \"\"\"\n Create a prompt to send to the API to generate one or more explanations.\n A prompt can be a simple string, or a list of HarmonyMessages, depending on the PromptFormat\n used by this instance.\n \"\"\"\n ...\n def postprocess_explanations(\n self, completions: list[str], prompt_kwargs: dict[str, Any]\n ) -> list[Any]:\n \"\"\"Postprocess the completions returned by the API into a list of explanations.\"\"\"\n return completions # no-op by default\n def _prompt_is_too_long(\n self, prompt_builder: PromptBuilder, max_tokens_for_completion: int",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:129-153"
+ },
+ "131": {
+ "file_id": 11,
+ "content": "This code defines a class for generating explanations using a prompt and an API. The `make_explanation_prompt` method is used to create a prompt to send to the API, which can be a string or a list of HarmonyMessages depending on the PromptFormat. The `postprocess_explanations` method post-processes the completions returned by the API into a list of explanations (by default it returns the completions as is). If the prompt format is unhandled, a ValueError is raised.",
+ "type": "comment"
+ },
+ "132": {
+ "file_id": 11,
+ "content": " ) -> bool:\n # We'll get a context size error if the prompt itself plus the maximum number of tokens for\n # the completion is longer than the context size.\n prompt_length = prompt_builder.prompt_length_in_tokens(self.prompt_format)\n if prompt_length + max_tokens_for_completion > self.context_size.value:\n print(\n f\"Prompt is too long: {prompt_length} + {max_tokens_for_completion} > \"\n f\"{self.context_size.value}\"\n )\n return True\n return False\nclass TokenActivationPairExplainer(NeuronExplainer):\n \"\"\"\n Generate explanations of neuron behavior using a prompt with lists of token/activation pairs.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,\n # This parameter lets us adjust the length of the prompt when we're generating explanations\n # using older models with shorter context windows. In the future we can use it to experiment",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:154-177"
+ },
+ "133": {
+ "file_id": 11,
+ "content": "This code checks if the prompt length combined with the maximum tokens for completion exceeds the context size. If so, it prints an error and returns True; otherwise, it returns False. The class TokenActivationPairExplainer generates explanations using token/activation pairs and prompts.",
+ "type": "comment"
+ },
+ "134": {
+ "file_id": 11,
+ "content": " # with 8k+ context windows.\n context_size: ContextSize = ContextSize.FOUR_K,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,\n repeat_non_zero_activations: bool = True,\n max_concurrent: Optional[int] = 10,\n cache: bool = False,\n ):\n super().__init__(\n model_name=model_name,\n prompt_format=prompt_format,\n max_concurrent=max_concurrent,\n cache=cache,\n )\n self.context_size = context_size\n self.few_shot_example_set = few_shot_example_set\n self.repeat_non_zero_activations = repeat_non_zero_activations\n def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n original_kwargs = kwargs.copy()\n all_activation_records: Sequence[ActivationRecord] = kwargs.pop(\"all_activation_records\")\n max_activation: float = kwargs.pop(\"max_activation\")\n kwargs.setdefault(\"numbered_list_of_n_explanations\", None)\n numbered_list_of_n_explanations: Optional[int] = kwargs.pop(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:178-200"
+ },
+ "135": {
+ "file_id": 11,
+ "content": "Creates an instance of the class with specified parameters like model name, prompt format, context size, few-shot example set, repeating non-zero activations, maximum concurrent processes, and cache settings. Overrides superclass initializer to set these parameters. Defines a method make_explanation_prompt which takes all_activation_records, max_activation, numbered_list_of_n_explanations as input and returns explanation prompt as output.",
+ "type": "comment"
+ },
+ "136": {
+ "file_id": 11,
+ "content": " \"numbered_list_of_n_explanations\"\n )\n if numbered_list_of_n_explanations is not None:\n assert numbered_list_of_n_explanations > 0, numbered_list_of_n_explanations\n # This parameter lets us dynamically shrink the prompt if our initial attempt to create it\n # results in something that's too long. It's only implemented for the 4k context size.\n kwargs.setdefault(\"omit_n_activation_records\", 0)\n omit_n_activation_records: int = kwargs.pop(\"omit_n_activation_records\")\n max_tokens_for_completion: int = kwargs.pop(\"max_tokens_for_completion\")\n assert not kwargs, f\"Unexpected kwargs: {kwargs}\"\n prompt_builder = PromptBuilder()\n prompt_builder.add_message(\n Role.SYSTEM,\n \"We're studying neurons in a neural network. Each neuron looks for some particular \"\n \"thing in a short document. Look at the parts of the document the neuron activates for \"\n \"and summarize in a single sentence what the neuron is looking for. Don't list \"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:201-217"
+ },
+ "137": {
+ "file_id": 11,
+ "content": "This code is setting up parameters for the prompt builder, such as number of explanations and optional omit activation records. It ensures no unexpected kwargs are present and adds a message to the prompt builder explaining the neuron's function in analyzing short documents.",
+ "type": "comment"
+ },
+ "138": {
+ "file_id": 11,
+ "content": " \"examples of words.\\n\\nThe activation format is tokenactivation. Activation \"\n \"values range from 0 to 10. A neuron finding what it's looking for is represented by a \"\n \"non-zero activation value. The higher the activation value, the stronger the match.\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n num_omitted_activation_records = 0\n for i, few_shot_example in enumerate(few_shot_examples):\n few_shot_activation_records = few_shot_example.activation_records\n if self.context_size == ContextSize.TWO_K:\n # If we're using a 2k context window, we only have room for one activation record\n # per few-shot example. (Two few-shot examples with one activation record each seems\n # to work better than one few-shot example with two activation records, in local\n # testing.)\n few_shot_activation_records = few_shot_activation_records[:1]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:218-231"
+ },
+ "139": {
+ "file_id": 11,
+ "content": "Explains the activation format and its meaning, then selects one activation record from each few-shot example when using a 2k context window.",
+ "type": "comment"
+ },
+ "140": {
+ "file_id": 11,
+ "content": " elif (\n self.context_size == ContextSize.FOUR_K\n and num_omitted_activation_records < omit_n_activation_records\n ):\n # Drop the last activation record for this few-shot example to save tokens, assuming\n # there are at least two activation records.\n if len(few_shot_activation_records) > 1:\n print(f\"Warning: omitting activation record from few-shot example {i}\")\n few_shot_activation_records = few_shot_activation_records[:-1]\n num_omitted_activation_records += 1\n self._add_per_neuron_explanation_prompt(\n prompt_builder,\n few_shot_activation_records,\n i,\n calculate_max_activation(few_shot_example.activation_records),\n numbered_list_of_n_explanations=numbered_list_of_n_explanations,\n explanation=few_shot_example.explanation,\n )\n self._add_per_neuron_explanation_prompt(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:232-250"
+ },
+ "141": {
+ "file_id": 11,
+ "content": "If context size is 4K and there are fewer activation records omitted than needed, drop the last one for the few-shot example if there are more than one activation record, then add the per-neuron explanation prompt.",
+ "type": "comment"
+ },
+ "142": {
+ "file_id": 11,
+ "content": " prompt_builder,\n # If we're using a 2k context window, we only have room for two of the activation\n # records.\n all_activation_records[:2]\n if self.context_size == ContextSize.TWO_K\n else all_activation_records,\n len(few_shot_examples),\n max_activation,\n numbered_list_of_n_explanations=numbered_list_of_n_explanations,\n explanation=None,\n )\n # If the prompt is too long *and* we omitted the specified number of activation records, try\n # again, omitting one more. (If we didn't make the specified number of omissions, we're out\n # of opportunities to omit records, so we just return the prompt as-is.)\n if (\n self._prompt_is_too_long(prompt_builder, max_tokens_for_completion)\n and num_omitted_activation_records == omit_n_activation_records\n ):\n original_kwargs[\"omit_n_activation_records\"] = omit_n_activation_records + 1\n return self.make_explanation_prompt(**original_kwargs)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:251-270"
+ },
+ "143": {
+ "file_id": 11,
+ "content": "Code snippet is part of a function that generates an explanation prompt for a model. It includes the activation records, context size, number of few-shot examples, maximum activation value, and a boolean to indicate if an explanation is provided or not. If the prompt exceeds the specified token limit due to the inclusion of activation records, it tries again by omitting one more record until the desired number of omit activation records is reached or the prompt is too long with no opportunity for further omissions.",
+ "type": "comment"
+ },
+ "144": {
+ "file_id": 11,
+ "content": " return prompt_builder.build(self.prompt_format)\n def _add_per_neuron_explanation_prompt(\n self,\n prompt_builder: PromptBuilder,\n activation_records: Sequence[ActivationRecord],\n index: int,\n max_activation: float,\n # When set, this indicates that the prompt should solicit a numbered list of the given\n # number of explanations, rather than a single explanation.\n numbered_list_of_n_explanations: Optional[int],\n explanation: Optional[str], # None means this is the end of the full prompt.\n ) -> None:\n max_activation = calculate_max_activation(activation_records)\n user_message = f\"\"\"\nNeuron {index + 1}\nActivations:{format_activation_records(activation_records, max_activation, omit_zeros=False)}\"\"\"\n # We repeat the non-zero activations only if it was requested and if the proportion of\n # non-zero activations isn't too high.\n if (\n self.repeat_non_zero_activations\n and non_zero_activation_proportion(activation_records, max_activation) < 0.2",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:271-293"
+ },
+ "145": {
+ "file_id": 11,
+ "content": "Function that adds per-neuron explanations to the prompt based on activation records and optional parameters.",
+ "type": "comment"
+ },
+ "146": {
+ "file_id": 11,
+ "content": " ):\n user_message += (\n f\"\\nSame activations, but with all zeros filtered out:\"\n f\"{format_activation_records(activation_records, max_activation, omit_zeros=True)}\"\n )\n if numbered_list_of_n_explanations is None:\n user_message += f\"\\nExplanation of neuron {index + 1} behavior:\"\n assistant_message = \"\"\n # For the IF format, we want <|endofprompt|> to come before the explanation prefix.\n if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:\n assistant_message += f\" {EXPLANATION_PREFIX}\"\n else:\n user_message += f\" {EXPLANATION_PREFIX}\"\n prompt_builder.add_message(Role.USER, user_message)\n if explanation is not None:\n assistant_message += f\" {explanation}.\"\n if assistant_message:\n prompt_builder.add_message(Role.ASSISTANT, assistant_message)\n else:\n if explanation is None:\n # For the final neuron, we solicit a numbered list of explanations.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:294-316"
+ },
+ "147": {
+ "file_id": 11,
+ "content": "This code seems to be a part of an explainable AI model. It generates user and assistant messages based on neuron activations, and either provides the explanation for a specific neuron or solicits a numbered list of explanations for all neurons.",
+ "type": "comment"
+ },
+ "148": {
+ "file_id": 11,
+ "content": " prompt_builder.add_message(\n Role.USER,\n f\"\"\"\\nHere are {numbered_list_of_n_explanations} possible explanations for neuron {index + 1} behavior, each beginning with \"{EXPLANATION_PREFIX}\":\\n1. {EXPLANATION_PREFIX}\"\"\",\n )\n else:\n # For the few-shot examples, we only present one explanation, but we present it as a\n # numbered list.\n prompt_builder.add_message(\n Role.USER,\n f\"\"\"\\nHere is 1 possible explanation for neuron {index + 1} behavior, beginning with \"{EXPLANATION_PREFIX}\":\\n1. {EXPLANATION_PREFIX}\"\"\",\n )\n prompt_builder.add_message(Role.ASSISTANT, f\" {explanation}.\")\n def postprocess_explanations(\n self, completions: list[str], prompt_kwargs: dict[str, Any]\n ) -> list[Any]:\n \"\"\"Postprocess the explanations returned by the API\"\"\"\n numbered_list_of_n_explanations = prompt_kwargs.get(\"numbered_list_of_n_explanations\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:317-334"
+ },
+ "149": {
+ "file_id": 11,
+ "content": "Numbered-list branch of the per-neuron prompt. When no explanation is supplied (the final neuron), it asks for the requested number of explanations as a numbered list, each beginning with EXPLANATION_PREFIX. For few-shot examples it presents the single known explanation as a one-item numbered list and adds it as an assistant message. The snippet ends with the start of postprocess_explanations, which postprocesses the explanations returned by the API.",
+ "type": "comment"
+ },
+ "150": {
+ "file_id": 11,
+ "content": " if numbered_list_of_n_explanations is None:\n return completions\n else:\n all_explanations = []\n for completion in completions:\n for explanation in _split_numbered_list(completion):\n if explanation.startswith(EXPLANATION_PREFIX):\n explanation = explanation[len(EXPLANATION_PREFIX) :]\n all_explanations.append(explanation.strip())\n return all_explanations\nclass TokenSpaceRepresentationExplainer(NeuronExplainer):\n \"\"\"\n Generate explanations of arbitrary lists of tokens which disproportionately activate a\n particular neuron. These lists of tokens can be generated in various ways. As an example, in one\n set of experiments, we compute the average activation for each neuron conditional on each token\n that appears in an internet text corpus. We then sort the tokens by their average activation,\n and show 50 of the top 100 tokens. Other techniques that could be used include taking the top",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:335-353"
+ },
+ "151": {
+ "file_id": 11,
+ "content": "Code block checks if the \"numbered_list_of_n_explanations\" is None and returns the \"completions\". If it's not None, it iterates through each completion and explanation in a nested loop. For each explanation that starts with EXPLANATION_PREFIX, it removes the prefix and appends the trimmed explanation to all_explanations list. Finally, it returns the list of all explanations.",
+ "type": "comment"
+ },
+ "152": {
+ "file_id": 11,
+ "content": " tokens in the logit lens or tuned lens representations of a neuron.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,\n context_size: ContextSize = ContextSize.FOUR_K,\n few_shot_example_set: TokenSpaceFewShotExampleSet = TokenSpaceFewShotExampleSet.ORIGINAL,\n use_few_shot: bool = False,\n output_numbered_list: bool = False,\n max_concurrent: Optional[int] = 10,\n cache: bool = False,\n ):\n super().__init__(\n model_name=model_name,\n prompt_format=prompt_format,\n context_size=context_size,\n max_concurrent=max_concurrent,\n cache=cache,\n )\n self.use_few_shot = use_few_shot\n self.output_numbered_list = output_numbered_list\n if self.use_few_shot:\n assert few_shot_example_set is not None\n self.few_shot_examples: Optional[TokenSpaceFewShotExampleSet] = few_shot_example_set\n else:\n self.few_shot_examples = None",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:354-381"
+ },
+ "153": {
+ "file_id": 11,
+ "content": "Constructor of TokenSpaceRepresentationExplainer. It takes the model name, prompt format, context size, few-shot example set, use_few_shot flag, output_numbered_list flag, max_concurrent, and cache, and forwards the model name, prompt format, context size, max_concurrent, and cache settings to the parent constructor. If use_few_shot is True, it asserts that few_shot_example_set is not None and stores it in self.few_shot_examples; otherwise self.few_shot_examples is set to None.",
+ "type": "comment"
+ },
+ "154": {
+ "file_id": 11,
+ "content": " self.prompt_prefix = (\n \"We're studying neurons in a neural network. Each neuron looks for some particular \"\n \"kind of token (which can be a word, or part of a word). Look at the tokens the neuron \"\n \"activates for (listed below) and summarize in a single sentence what the neuron is \"\n \"looking for. Don't list examples of words.\"\n )\n def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n tokens: list[str] = kwargs.pop(\"tokens\")\n max_tokens_for_completion = kwargs.pop(\"max_tokens_for_completion\")\n assert not kwargs, f\"Unexpected kwargs: {kwargs}\"\n # Note that this does not preserve the precise tokens, as e.g.\n # f\" {token_with_no_leading_space}\" may be tokenized as \"f{token_with_leading_space}\".\n # TODO(dan): Try out other variants, including \"\\n\".join(...) and \",\".join(...)\n stringified_tokens = \", \".join([f\"'{t}'\" for t in tokens])\n prompt_builder = PromptBuilder()",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:382-398"
+ },
+ "155": {
+ "file_id": 11,
+ "content": "Code snippet:\n```python\ndef make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n tokens: list[str] = kwargs.pop(\"tokens\")\n max_tokens_for_completion = kwargs.pop(\"max_tokens_for_completion\")\n assert not kwargs, f\"Unexpected kwargs: {kwargs}\"\n stringified_tokens = \", \".join([f\"'{t}'\" for t in tokens])\n prompt_builder = PromptBuilder()\n```\nComment: This function constructs a prompt to ask about the neuron's activation tokens. It takes the \"tokens\" and \"max_tokens_for_completion\" as input arguments, and uses PromptBuilder to build the final prompt.",
+ "type": "comment"
+ },
+ "156": {
+ "file_id": 11,
+ "content": " prompt_builder.add_message(Role.SYSTEM, self.prompt_prefix)\n if self.use_few_shot:\n self._add_few_shot_examples(prompt_builder)\n self._add_neuron_specific_prompt(prompt_builder, stringified_tokens, explanation=None)\n if self._prompt_is_too_long(prompt_builder, max_tokens_for_completion):\n raise ValueError(f\"Prompt too long: {prompt_builder.build(self.prompt_format)}\")\n else:\n return prompt_builder.build(self.prompt_format)\n def _add_few_shot_examples(self, prompt_builder: PromptBuilder) -> None:\n \"\"\"\n Append few-shot examples to the prompt. Each one consists of a comma-delimited list of\n tokens and corresponding explanations, as saved in\n alignment/neuron_explainer/weight_explainer/token_space_few_shot_examples.py.\n \"\"\"\n assert self.few_shot_examples is not None\n few_shot_example_list = self.few_shot_examples.get_examples()\n if self.output_numbered_list:\n raise NotImplementedError(\"Numbered list output not supported for few-shot examples\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:399-418"
+ },
+ "157": {
+ "file_id": 11,
+ "content": "This code finishes building the explanation prompt: it adds the system message, the few-shot examples (if enabled), and the neuron-specific prompt, then raises a ValueError if the prompt is too long and otherwise returns the built prompt. _add_few_shot_examples raises NotImplementedError when numbered-list output is requested, because that combination is not supported.",
+ "type": "comment"
+ },
+ "158": {
+ "file_id": 11,
+ "content": " else:\n for few_shot_example in few_shot_example_list:\n self._add_neuron_specific_prompt(\n prompt_builder,\n \", \".join([f\"'{t}'\" for t in few_shot_example.tokens]),\n explanation=few_shot_example.explanation,\n )\n def _add_neuron_specific_prompt(\n self,\n prompt_builder: PromptBuilder,\n stringified_tokens: str,\n explanation: Optional[str],\n ) -> None:\n \"\"\"\n Append a neuron-specific prompt to the prompt builder. The prompt consists of a list of\n tokens followed by either an explanation (if one is passed, for few shot examples) or by\n the beginning of a completion, to be completed by the model with an explanation.\n \"\"\"\n user_message = f\"\\n\\n\\n\\nTokens:\\n{stringified_tokens}\\n\\nExplanation:\\n\"\n assistant_message = \"\"\n looking_for = \"This neuron is looking for\"\n if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:419-441"
+ },
+ "159": {
+ "file_id": 11,
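+ "content": "When numbered-list output is not requested, each few-shot example is added to the prompt builder as a neuron-specific prompt containing its tokens and its explanation. _add_neuron_specific_prompt then starts building a prompt that lists the tokens, followed by either a given explanation (for few-shot examples) or the beginning of a completion for the model to finish. The instruction-following prompt format is handled as a special case.",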
+ "type": "comment"
+ },
+ "160": {
+ "file_id": 11,
+ "content": " # We want <|endofprompt|> to come before \"This neuron is looking for\" in the IF format.\n assistant_message += looking_for\n else:\n user_message += looking_for\n if self.output_numbered_list:\n start_of_list = \"\\n1.\"\n if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:\n assistant_message += start_of_list\n else:\n user_message += start_of_list\n if explanation is not None:\n assistant_message += f\"{explanation}.\"\n prompt_builder.add_message(Role.USER, user_message)\n if assistant_message:\n prompt_builder.add_message(Role.ASSISTANT, assistant_message)\n def postprocess_explanations(\n self, completions: list[str], prompt_kwargs: dict[str, Any]\n ) -> list[str]:\n if self.output_numbered_list:\n # Each list in the top-level list will have multiple explanations (multiple strings).\n all_explanations = []\n for completion in completions:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:442-464"
+ },
+ "161": {
+ "file_id": 11,
+ "content": "This code adds user and assistant messages to a prompt builder based on the prompt format, output numbered list preference, and explanation presence. The postprocess_explanations function then handles multiple explanations in a list format for completions.",
+ "type": "comment"
+ },
+ "162": {
+ "file_id": 11,
+ "content": " for explanation in _split_numbered_list(completion):\n if explanation.startswith(EXPLANATION_PREFIX):\n explanation = explanation[len(EXPLANATION_PREFIX) :]\n all_explanations.append(explanation.strip())\n return all_explanations\n else:\n # Each element in the top-level list will be an explanation as a string.\n return [_remove_final_period(explanation) for explanation in completions]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explainer.py:465-472"
+ },
+ "163": {
+ "file_id": 11,
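+ "content": "For numbered-list output, this code splits each completion into individual explanations, removes the EXPLANATION_PREFIX where present, and strips surrounding whitespace. Otherwise, each completion is treated as a single explanation and its final period is removed.",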
+ "type": "comment"
+ },
+ "164": {
+ "file_id": 12,
+ "content": "/neuron-explainer/neuron_explainer/explanations/explanations.py",
+ "type": "filepath"
+ },
+ "165": {
+ "file_id": 12,
+ "content": "The code defines dataclasses for neuron explanations, scores, and simulation results, along with synchronous and asynchronous helpers for loading them from .jsonl files. It also provides helpers that list the numbered layer subdirectories and return the sorted neuron indices of a layer by joining the explanation path with the layer number, listing the files, keeping those with numeric names, converting the names to integers, and sorting the list.",
+ "type": "summary"
+ },
+ "166": {
+ "file_id": 12,
+ "content": "# Dataclasses and enums for storing neuron explanations, their scores, and related data. Also,\n# related helper functions.\nfrom __future__ import annotations\nimport json\nfrom dataclasses import dataclass\nfrom enum import Enum\nfrom typing import List, Optional, Union\nimport blobfile as bf\nimport boostedblob as bbb\nfrom neuron_explainer.activations.activations import NeuronId\nfrom neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass\nclass ActivationScale(str, Enum):\n \"\"\"Which \"units\" are stored in the expected_activations/distribution_values fields of a\n SequenceSimulation.\n This enum identifies whether the values represent real activations of the neuron or something\n else. Different scales are not necessarily related by a linear transformation.\n \"\"\"\n NEURON_ACTIVATIONS = \"neuron_activations\"\n \"\"\"Values represent real activations of the neuron.\"\"\"\n SIMULATED_NORMALIZED_ACTIVATIONS = \"simulated_normalized_activations\"\n \"\"\"\n Values represent simulated activations of the neuron, normalized to the range [0, 10]. This",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:1-29"
+ },
+ "167": {
+ "file_id": 12,
+ "content": "This code defines dataclasses and enums for storing neuron explanations, scores, and related data. It also includes helper functions and handles different activation scales for neurons.",
+ "type": "comment"
+ },
+ "168": {
+ "file_id": 12,
+ "content": " scale is arbitrary and should not be interpreted as a neuron activation.\n \"\"\"\n@register_dataclass\n@dataclass\nclass SequenceSimulation(FastDataclass):\n \"\"\"The result of a simulation of neuron activations on one text sequence.\"\"\"\n tokens: list[str]\n \"\"\"The sequence of tokens that was simulated.\"\"\"\n expected_activations: list[float]\n \"\"\"Expected value of the possibly-normalized activation for each token in the sequence.\"\"\"\n activation_scale: ActivationScale\n \"\"\"What scale is used for values in the expected_activations field.\"\"\"\n distribution_values: list[list[float]]\n \"\"\"\n For each token in the sequence, a list of values from the discrete distribution of activations\n produced from simulation. Tokens will be included here if and only if they are in the top K=15\n tokens predicted by the simulator, and excluded otherwise.\n May be transformed to another unit by calibration. When we simulate a neuron, we produce a\n discrete distribution with values in the arbitrary discretized space of the neuron, e.g. 10%",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:30-52"
+ },
+ "169": {
+ "file_id": 12,
+ "content": "This code defines a dataclass for storing the results of simulating neuron activations on a text sequence. It includes the sequence of tokens, expected activation values, scale, and distribution values from the simulation, excluding non-significant tokens.",
+ "type": "comment"
+ },
+ "170": {
+ "file_id": 12,
+ "content": " chance of 0, 70% chance of 1, 20% chance of 2. Which we store as distribution_values =\n [0, 1, 2], distribution_probabilities = [0.1, 0.7, 0.2]. When we transform the distribution to\n the real activation units, we can correspondingly transform the values of this distribution\n to get a distribution in the units of the neuron. e.g. if the mapping from the discretized space\n to the real activation unit of the neuron is f(x) = x/2, then the distribution becomes 10%\n chance of 0, 70% chance of 0.5, 20% chance of 1. Which we store as distribution_values =\n [0, 0.5, 1], distribution_probabilities = [0.1, 0.7, 0.2].\n \"\"\"\n distribution_probabilities: list[list[float]]\n \"\"\"\n For each token in the sequence, the probability of the corresponding value in\n distribution_values.\n \"\"\"\n uncalibrated_simulation: Optional[\"SequenceSimulation\"] = None\n \"\"\"The result of the simulation before calibration.\"\"\"\n@register_dataclass\n@dataclass\nclass ScoredSequenceSimulation(FastDataclass):",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:53-73"
+ },
+ "171": {
+ "file_id": 12,
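+ "content": "This code finishes the SequenceSimulation dataclass: the end of the distribution_values documentation (with a worked example of transforming a distribution into real activation units), the distribution_probabilities field giving the probability of each corresponding value, and the optional uncalibrated_simulation field holding the simulation result before calibration. It then begins the ScoredSequenceSimulation dataclass.",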
+ "type": "comment"
+ },
+ "172": {
+ "file_id": 12,
+ "content": " \"\"\"\n SequenceSimulation result with a score (for that sequence only) and ground truth activations.\n \"\"\"\n simulation: SequenceSimulation\n \"\"\"The result of a simulation of neuron activations.\"\"\"\n true_activations: List[float]\n \"\"\"Ground truth activations on the sequence (not normalized)\"\"\"\n ev_correlation_score: float\n \"\"\"\n Correlation coefficient between the expected values of the normalized activations from the\n simulation and the unnormalized true activations of the neuron on the text sequence.\n \"\"\"\n rsquared_score: Optional[float] = None\n \"\"\"R^2 of the simulated activations.\"\"\"\n absolute_dev_explained_score: Optional[float] = None\n \"\"\"\n Score based on absolute difference between real and simulated activations.\n absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real))\n \"\"\"\n@register_dataclass\n@dataclass\nclass ScoredSimulation(FastDataclass):\n \"\"\"Result of scoring a neuron simulation on multiple sequences.\"\"\"\n scored_sequence_simulations: List[ScoredSequenceSimulation]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:74-101"
+ },
+ "173": {
+ "file_id": 12,
+ "content": "This code completes the ScoredSequenceSimulation dataclass, which pairs a SequenceSimulation with the ground-truth activations and per-sequence scores: ev_correlation_score plus the optional rsquared_score and absolute_dev_explained_score. It then begins the ScoredSimulation dataclass, which holds the result of scoring a neuron simulation on multiple sequences as a list of ScoredSequenceSimulation objects.",
+ "type": "comment"
+ },
+ "174": {
+ "file_id": 12,
+ "content": " \"\"\"ScoredSequenceSimulation for each sequence\"\"\"\n ev_correlation_score: Optional[float] = None\n \"\"\"\n Correlation coefficient between the expected values of the normalized activations from the\n simulation and the unnormalized true activations on a dataset created from all score_results.\n (Note that this is not equivalent to averaging across sequences.)\n \"\"\"\n rsquared_score: Optional[float] = None\n \"\"\"R^2 of the simulated activations.\"\"\"\n absolute_dev_explained_score: Optional[float] = None\n \"\"\"\n Score based on absolute difference between real and simulated activations.\n absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real)).\n \"\"\"\n def get_preferred_score(self) -> Optional[float]:\n \"\"\"\n This method may return None in cases where the score is undefined, for example if the\n normalized activations were all zero, yielding a correlation coefficient of NaN.\n \"\"\"\n return self.ev_correlation_score\n@register_dataclass",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:102-125"
+ },
+ "175": {
+ "file_id": 12,
+ "content": "This code defines a class with three score metrics (ev_correlation_score, rsquared_score, absolute_dev_explained_score) for evaluated sequences and provides a get_preferred_score method to return the preferred score.",
+ "type": "comment"
+ },
+ "176": {
+ "file_id": 12,
+ "content": "@dataclass\nclass ScoredExplanation(FastDataclass):\n \"\"\"Simulator parameters and the results of scoring it on multiple sequences\"\"\"\n explanation: str\n \"\"\"The explanation used for simulation.\"\"\"\n scored_simulation: ScoredSimulation\n \"\"\"Result of scoring the neuron simulator on multiple sequences.\"\"\"\n def get_preferred_score(self) -> Optional[float]:\n \"\"\"\n This method may return None in cases where the score is undefined, for example if the\n normalized activations were all zero, yielding a correlation coefficient of NaN.\n \"\"\"\n return self.scored_simulation.get_preferred_score()\n@register_dataclass\n@dataclass\nclass NeuronSimulationResults(FastDataclass):\n \"\"\"Simulation results and scores for a neuron.\"\"\"\n neuron_id: NeuronId\n scored_explanations: list[ScoredExplanation]\ndef load_neuron_explanations(\n explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]\n) -> Optional[NeuronSimulationResults]:\n \"\"\"Load scored explanations for the specified neuron.\"\"\"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:126-156"
+ },
+ "177": {
+ "file_id": 12,
+ "content": "Class representing simulator parameters and scoring results for multiple sequences.\nFunction returns preferred score or None if undefined (e.g., normalized activations all zero).\nClass represents simulation results and scores for a specific neuron.\nFunction loads scored explanations for the specified neuron from given path.",
+ "type": "comment"
+ },
+ "178": {
+ "file_id": 12,
+ "content": " file = bf.join(explanations_path, str(layer_index), f\"{neuron_index}.jsonl\")\n if not bf.exists(file):\n return None\n with bf.BlobFile(file) as f:\n for line in f:\n return loads(line)\n return None\n@bbb.ensure_session\nasync def load_neuron_explanations_async(\n explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]\n) -> Optional[NeuronSimulationResults]:\n \"\"\"Load scored explanations for the specified neuron, asynchronously.\"\"\"\n return await read_explanation_file(\n bf.join(explanations_path, str(layer_index), f\"{neuron_index}.jsonl\")\n )\n@bbb.ensure_session\nasync def read_file(filename: str) -> Optional[str]:\n \"\"\"Read the contents of the given file as a string, asynchronously.\"\"\"\n try:\n raw_contents = await bbb.read.read_single(filename)\n except FileNotFoundError:\n print(f\"Could not read {filename}\")\n return None\n lines = []\n for line in raw_contents.decode(\"utf-8\").split(\"\\n\"):\n if len(line) > 0:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:157-186"
+ },
+ "179": {
+ "file_id": 12,
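+ "content": "1. load_neuron_explanations returns None if the neuron's .jsonl file does not exist; otherwise it returns the parsed first line of the file.\n2. load_neuron_explanations_async loads the same file asynchronously via read_explanation_file.\n3. read_file reads a file asynchronously, returns None (after printing a message) if the file is not found, and starts collecting the non-empty lines.",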
+ "type": "comment"
+ },
+ "180": {
+ "file_id": 12,
+ "content": " lines.append(line)\n assert len(lines) == 1, filename\n return lines[0]\n@bbb.ensure_session\nasync def read_explanation_file(explanation_filename: str) -> Optional[NeuronSimulationResults]:\n \"\"\"Load scored explanations from the given filename, asynchronously.\"\"\"\n line = await read_file(explanation_filename)\n return loads(line) if line is not None else None\n@bbb.ensure_session\nasync def read_json_file(filename: str) -> Optional[dict]:\n \"\"\"Read the contents of the given file as a JSON object, asynchronously.\"\"\"\n line = await read_file(filename)\n return json.loads(line) if line is not None else None\ndef get_numerical_subdirs(dataset_path: str) -> list[str]:\n \"\"\"Return the names of all numbered subdirectories in the specified directory.\n Used to get all layer directories in an explanation directory.\n \"\"\"\n return [\n str(x)\n for x in sorted(\n [\n int(x)\n for x in bf.listdir(dataset_path)\n if bf.isdir(bf.join(dataset_path, x)) and x.isnumeric()",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:187-217"
+ },
+ "181": {
+ "file_id": 12,
+ "content": "- reads explanation file from filename\n- loads scored explanations asynchronously\n- reads the contents of a file as JSON object asynchronously\n- returns names of numbered subdirectories in specified directory",
+ "type": "comment"
+ },
+ "182": {
+ "file_id": 12,
+ "content": " ]\n )\n ]\ndef get_sorted_neuron_indices_from_explanations(\n explanations_path: str, layer: Union[str, int]\n) -> list[int]:\n \"\"\"Return the indices of all neurons in this layer, in ascending order.\"\"\"\n layer_dir = bf.join(explanations_path, str(layer))\n return sorted(\n [int(f.split(\".\")[0]) for f in bf.listdir(layer_dir) if f.split(\".\")[0].isnumeric()]\n )",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/explanations.py:218-230"
+ },
+ "183": {
+ "file_id": 12,
+ "content": "This function retrieves the sorted neuron indices from explanations for a given layer. It does this by joining the explanation path with the layer number, listing all files in that directory, filtering numeric filenames, converting them to integers, and finally sorting the resulting list.",
+ "type": "comment"
+ },
+ "184": {
+ "file_id": 13,
+ "content": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py",
+ "type": "filepath"
+ },
+ "185": {
+ "file_id": 13,
+ "content": "This code includes a `PromptFormat` class for formatting methods, a `HarmonyMessage` dictionary for roles and content, and a `PromptBuilder` class to create prompts with token counting using GPT-4 encoding. It checks roles, creates copies of messages, and handles system messages. For the instruction-following format, it appends \"<|endofprompt|>\" to the last user message. It then returns either the list of messages (HARMONY_V4) or their contents concatenated into a single string, and raises a ValueError for unknown prompt formats.",
+ "type": "summary"
+ },
+ "186": {
+ "file_id": 13,
+ "content": "from __future__ import annotations\nfrom enum import Enum\nfrom typing import TypedDict, Union\nimport tiktoken\nHarmonyMessage = TypedDict(\n \"HarmonyMessage\",\n {\n \"role\": str,\n \"content\": str,\n },\n)\nclass PromptFormat(str, Enum):\n \"\"\"\n Different ways of formatting the components of a prompt into the format accepted by the relevant\n API server endpoint.\n \"\"\"\n NONE = \"none\"\n \"\"\"Suitable for use with models that don't use special tokens for instructions.\"\"\"\n INSTRUCTION_FOLLOWING = \"instruction_following\"\n \"\"\"Suitable for IF models that use <|endofprompt|>.\"\"\"\n HARMONY_V4 = \"harmony_v4\"\n \"\"\"\n Suitable for Harmony models that use a structured turn-taking role+content format. Generates a\n list of HarmonyMessage dicts that can be sent to the /chat/completions endpoint.\n \"\"\"\n @classmethod\n def from_string(cls, s: str) -> PromptFormat:\n for prompt_format in cls:\n if prompt_format.value == s:\n return prompt_format\n raise ValueError(f\"{s} is not a valid PromptFormat\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:1-38"
+ },
+ "187": {
+ "file_id": 13,
+ "content": "The code defines a class `PromptFormat` which is an enumeration of different prompt formatting methods. The `HarmonyMessage` is a typed dictionary defining the role and content of each message in the prompt. There's also a method `from_string` that returns the corresponding `PromptFormat` from a string input.",
+ "type": "comment"
+ },
+ "188": {
+ "file_id": 13,
+ "content": "class Role(str, Enum):\n \"\"\"See https://platform.openai.com/docs/guides/chat\"\"\"\n SYSTEM = \"system\"\n USER = \"user\"\n ASSISTANT = \"assistant\"\nclass PromptBuilder:\n \"\"\"Class for accumulating components of a prompt and then formatting them into an output.\"\"\"\n def __init__(self) -> None:\n self._messages: list[HarmonyMessage] = []\n def add_message(self, role: Role, message: str) -> None:\n self._messages.append(HarmonyMessage(role=role, content=message))\n def prompt_length_in_tokens(self, prompt_format: PromptFormat) -> int:\n # TODO(sbills): Make the model/encoding configurable. This implementation assumes GPT-4.\n encoding = tiktoken.get_encoding(\"cl100k_base\")\n if prompt_format == PromptFormat.HARMONY_V4:\n # Approximately-correct implementation adapted from this documentation:\n # https://platform.openai.com/docs/guides/chat/introduction\n num_tokens = 0\n for message in self._messages:\n num_tokens += (",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:41-66"
+ },
+ "189": {
+ "file_id": 13,
+ "content": "This code defines a PromptBuilder class for creating prompts. It initializes an empty list of HarmonyMessages and has methods to add messages and calculate the prompt's length in tokens using GPT-4 encoding.",
+ "type": "comment"
+ },
+ "190": {
+ "file_id": 13,
+ "content": " 4 # every message follows <|im_start|>{role/name}\\n{content}<|im_end|>\\n\n )\n num_tokens += len(encoding.encode(message[\"content\"], allowed_special=\"all\"))\n num_tokens += 2 # every reply is primed with <|im_start|>assistant\n return num_tokens\n else:\n prompt_str = self.build(prompt_format)\n assert isinstance(prompt_str, str)\n return len(encoding.encode(prompt_str, allowed_special=\"all\"))\n def build(\n self, prompt_format: PromptFormat, *, allow_extra_system_messages: bool = False\n ) -> Union[str, list[HarmonyMessage]]:\n \"\"\"\n Validates the messages added so far (reasonable alternation of assistant vs. user, etc.)\n and returns either a regular string (maybe with <|endofprompt|> tokens) or a list of\n HarmonyMessages suitable for use with the /chat/completions endpoint.\n The `allow_extra_system_messages` parameter allows the caller to specify that the prompt",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:67-85"
+ },
+ "191": {
+ "file_id": 13,
+ "content": "For the HARMONY_V4 format, this code counts tokens per message by adding a fixed overhead of 4 tokens to the encoded length of each message's content, plus 2 tokens for priming the reply. For other formats, it builds the prompt as a single string and returns its encoded length. The build method validates the messages added so far and returns either a string or a list of HarmonyMessages.",
+ "type": "comment"
+ },
+ "192": {
+ "file_id": 13,
+ "content": " should be allowed to contain system messages after the very first one.\n \"\"\"\n # Create a deep copy of the messages so we can modify it and so that the caller can't\n # modify the internal state of this object.\n messages = [message.copy() for message in self._messages]\n expected_next_role = Role.SYSTEM\n for message in messages:\n role = message[\"role\"]\n assert role == expected_next_role or (\n allow_extra_system_messages and role == Role.SYSTEM\n ), f\"Expected message from {expected_next_role} but got message from {role}\"\n if role == Role.SYSTEM:\n expected_next_role = Role.USER\n elif role == Role.USER:\n expected_next_role = Role.ASSISTANT\n elif role == Role.ASSISTANT:\n expected_next_role = Role.USER\n if prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:\n last_user_message = None\n for message in messages:\n if message[\"role\"] == Role.USER:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:86-108"
+ },
+ "193": {
+ "file_id": 13,
+ "content": "The code creates a deep copy of the messages to prevent any external modification. It then checks if the next message is from the expected role and allows extra system messages if specified. Finally, it prepares for prompt formatting if necessary.",
+ "type": "comment"
+ },
+ "194": {
+ "file_id": 13,
+ "content": " last_user_message = message\n assert last_user_message is not None\n last_user_message[\"content\"] += \"<|endofprompt|>\"\n if prompt_format == PromptFormat.HARMONY_V4:\n return messages\n elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n return \"\".join(message[\"content\"] for message in messages)\n else:\n raise ValueError(f\"Unknown prompt format: {prompt_format}\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:109-118"
+ },
+ "195": {
+ "file_id": 13,
+ "content": "For the instruction-following format, this code appends \"<|endofprompt|>\" to the content of the last user message. For HARMONY_V4 it returns the list of messages; for the NONE and INSTRUCTION_FOLLOWING formats it concatenates the message contents into a single string. If an unknown prompt format is encountered, it raises a ValueError.",
+ "type": "comment"
+ },
+ "196": {
+ "file_id": 14,
+ "content": "/neuron-explainer/neuron_explainer/explanations/puzzles.py",
+ "type": "filepath"
+ },
+ "197": {
+ "file_id": 14,
+ "content": "The code creates a Puzzle class for ground truth and false explanations, tokenizes sentences and JSON representations, preprocesses input data, and assigns puzzles to the name in PUZZLES_BY_NAME dictionary using convert_puzzle_dict_to_puzzle function.",
+ "type": "summary"
+ },
+ "198": {
+ "file_id": 14,
+ "content": "import json\nimport os\nfrom dataclasses import dataclass\nfrom neuron_explainer.activations.activations import ActivationRecord\n@dataclass(frozen=True)\nclass Puzzle:\n \"\"\"A puzzle is a ground truth explanation, a collection of sentences (stored as ActivationRecords) with activations\n according to that explanation, and a collection of false explanations\"\"\"\n name: str\n explanation: str\n activation_records: list[ActivationRecord]\n false_explanations: list[str]\ndef convert_puzzle_to_tokenized_sentences(puzzle: Puzzle) -> list[list[str]]:\n \"\"\"Converts a puzzle to a list of tokenized sentences.\"\"\"\n return [record.tokens for record in puzzle.activation_records]\ndef convert_puzzle_dict_to_puzzle(puzzle_dict: dict) -> Puzzle:\n \"\"\"Converts a json dictionary representation of a puzzle to the Puzzle class.\"\"\"\n puzzle_activation_records = []\n for sentence in puzzle_dict[\"sentences\"]:\n # Token-activation pairs are listed as either a string or a list of a string and a float. If it is a list, the float is the activation.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/puzzles.py:1-28"
+ },
+ "199": {
+ "file_id": 14,
+ "content": "The code defines a Puzzle class representing ground truth explanations and false explanations. It also includes functions to convert a puzzle to a list of tokenized sentences and to create a Puzzle object from a JSON dictionary representation.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/2.json b/docs/data/2.json
new file mode 100644
index 0000000..ea8b840
--- /dev/null
+++ b/docs/data/2.json
@@ -0,0 +1,549 @@
+{
+ "200": {
+ "file_id": 14,
+ "content": " # If it is only a string, the activation is assumed to be 0. This is useful for readability and reducing redundancy in the data.\n tokens = [t[0] if type(t) is list else t for t in sentence]\n assert all([type(t) is str for t in tokens]), \"All tokens must be strings\"\n activations = [float(t[1]) if type(t) is list else 0.0 for t in sentence]\n assert all([type(t) is float for t in activations]), \"All activations must be floats\"\n puzzle_activation_records.append(ActivationRecord(tokens=tokens, activations=activations))\n return Puzzle(\n name=puzzle_dict[\"name\"],\n explanation=puzzle_dict[\"explanation\"],\n activation_records=puzzle_activation_records,\n false_explanations=puzzle_dict[\"false_explanations\"],\n )\nPUZZLES_BY_NAME: dict[str, Puzzle] = dict()\nscript_dir = os.path.dirname(os.path.abspath(__file__))\nwith open(os.path.join(script_dir, \"puzzles.json\"), \"r\") as f:\n puzzle_dicts = json.loads(f.read())\n for name in puzzle_dicts.keys():",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/puzzles.py:29-49"
+ },
+ "201": {
+ "file_id": 14,
+ "content": "The code preprocesses input data for puzzle explanations. It checks if all tokens are strings and all activations are floats, then creates a Puzzle object with name, explanation, activation records, and false explanations. The code reads puzzle data from \"puzzles.json\" file in the same directory.",
+ "type": "comment"
+ },
+ "202": {
+ "file_id": 14,
+ "content": " PUZZLES_BY_NAME[name] = convert_puzzle_dict_to_puzzle(puzzle_dicts[name])",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/puzzles.py:50-50"
+ },
+ "203": {
+ "file_id": 14,
+ "content": "Assigning puzzle to the name in PUZZLES_BY_NAME dictionary using convert_puzzle_dict_to_puzzle function.",
+ "type": "comment"
+ },
+ "204": {
+ "file_id": 15,
+ "content": "/neuron-explainer/neuron_explainer/explanations/scoring.py",
+ "type": "filepath"
+ },
+ "205": {
+ "file_id": 15,
+ "content": "The code generates a scoring function and explanation simulator for assessing neuron evaluations based on correlation and R-squared, using an asynchronous approach. The `make_simulator_and_score` function is created to generate the simulator and score the activation records, returning the scored simulations.",
+ "type": "summary"
+ },
+ "206": {
+ "file_id": 15,
+ "content": "from __future__ import annotations\nimport asyncio\nimport logging\nfrom typing import Any, Callable, Coroutine, Sequence\nimport numpy as np\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.explanations.calibrated_simulator import (\n CalibratedNeuronSimulator,\n LinearCalibratedNeuronSimulator,\n)\nfrom neuron_explainer.explanations.explanations import (\n ScoredSequenceSimulation,\n ScoredSimulation,\n SequenceSimulation,\n)\nfrom neuron_explainer.explanations.simulator import ExplanationNeuronSimulator, NeuronSimulator\ndef flatten_list(list_of_lists: Sequence[Sequence[Any]]) -> list[Any]:\n return [item for sublist in list_of_lists for item in sublist]\ndef correlation_score(\n real_activations: Sequence[float] | np.ndarray,\n predicted_activations: Sequence[float] | np.ndarray,\n) -> float:\n return np.corrcoef(real_activations, predicted_activations)[0, 1]\ndef score_from_simulation(\n real_activations: ActivationRecord,\n simulation: SequenceSimulation,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/scoring.py:1-34"
+ },
+ "207": {
+ "file_id": 15,
+ "content": "Code imports necessary modules and defines two functions, then begins a third:\n1. flatten_list(): Converts a list of lists into a single flat list.\n2. correlation_score(): Computes the correlation coefficient between two sequences of real and predicted activations.\n3. score_from_simulation(): Scores a sequence simulation against real activations; only the start of its signature appears in this chunk.\n\nThis code is used for scoring simulations of neuron activations in neuron explanations.",
+ "type": "comment"
+ },
+ "208": {
+ "file_id": 15,
+ "content": " score_function: Callable[[Sequence[float] | np.ndarray, Sequence[float] | np.ndarray], float],\n) -> float:\n return score_function(real_activations.activations, simulation.expected_activations)\ndef rsquared_score_from_sequences(\n real_activations: Sequence[float] | np.ndarray,\n predicted_activations: Sequence[float] | np.ndarray,\n) -> float:\n return float(\n 1\n - np.mean(np.square(np.array(real_activations) - np.array(predicted_activations)))\n / np.mean(np.square(np.array(real_activations)))\n )\ndef absolute_dev_explained_score_from_sequences(\n real_activations: Sequence[float] | np.ndarray,\n predicted_activations: Sequence[float] | np.ndarray,\n) -> float:\n return float(\n 1\n - np.mean(np.abs(np.array(real_activations) - np.array(predicted_activations)))\n / np.mean(np.abs(np.array(real_activations)))\n )\nasync def make_explanation_simulator(\n explanation: str,\n calibration_activation_records: Sequence[ActivationRecord],\n model_name: str,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/scoring.py:35-65"
+ },
+ "209": {
+ "file_id": 15,
+ "content": "This chunk finishes score_from_simulation, which applies a caller-supplied scoring function to the real activations and the simulation's expected activations. It then defines two scoring functions, rsquared_score_from_sequences and absolute_dev_explained_score_from_sequences, which calculate the R-squared and absolute deviation explained scores respectively. Finally, it begins the signature of make_explanation_simulator, an async function that takes an explanation, calibration activation records, and a model name.",
+ "type": "comment"
+ },
+ "210": {
+ "file_id": 15,
+ "content": " calibrated_simulator_class: type[CalibratedNeuronSimulator] = LinearCalibratedNeuronSimulator,\n) -> CalibratedNeuronSimulator:\n \"\"\"\n Make a simulator that uses an explanation to predict activations and calibrates it on the given\n activation records.\n \"\"\"\n simulator = ExplanationNeuronSimulator(model_name, explanation)\n calibrated_simulator = calibrated_simulator_class(simulator)\n await calibrated_simulator.calibrate(calibration_activation_records)\n return calibrated_simulator\nasync def _simulate_and_score_sequence(\n simulator: NeuronSimulator, activations: ActivationRecord\n) -> ScoredSequenceSimulation:\n \"\"\"Score an explanation of a neuron by how well it predicts activations on a sentence.\"\"\"\n simulation = await simulator.simulate(activations.tokens)\n logging.debug(simulation)\n rsquared_score = score_from_simulation(activations, simulation, rsquared_score_from_sequences)\n absolute_dev_explained_score = score_from_simulation(\n activations, simulation, absolute_dev_explained_score_from_sequences",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/scoring.py:66-86"
+ },
+ "211": {
+ "file_id": 15,
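+ "content": "This chunk finishes make_explanation_simulator, which builds an ExplanationNeuronSimulator from the model name and explanation, wraps it in a calibrated simulator class, and calibrates it on the given activation records. It then begins _simulate_and_score_sequence, which simulates activations for one record's tokens with the supplied simulator and computes R-squared and absolute deviation explained scores.",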
+ "type": "comment"
+ },
+ "212": {
+ "file_id": 15,
+ "content": " )\n scored_sequence_simulation = ScoredSequenceSimulation(\n simulation=simulation,\n true_activations=activations.activations,\n ev_correlation_score=score_from_simulation(activations, simulation, correlation_score),\n rsquared_score=rsquared_score,\n absolute_dev_explained_score=absolute_dev_explained_score,\n )\n return scored_sequence_simulation\ndef aggregate_scored_sequence_simulations(\n scored_sequence_simulations: list[ScoredSequenceSimulation],\n) -> ScoredSimulation:\n \"\"\"\n Aggregate a list of scored sequence simulations. The logic for doing this is non-trivial for EV\n scores, since we want to calculate the correlation over all activations from all sequences at\n once rather than simply averaging per-sequence correlations.\n \"\"\"\n all_true_activations: list[float] = []\n all_expected_values: list[float] = []\n for scored_sequence_simulation in scored_sequence_simulations:\n all_true_activations.extend(scored_sequence_simulation.true_activations or [])",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/scoring.py:87-109"
+ },
+ "213": {
+ "file_id": 15,
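+ "content": "This chunk finishes _simulate_and_score_sequence by building a ScoredSequenceSimulation with correlation, R-squared, and absolute deviation explained scores.\nIt then begins aggregate_scored_sequence_simulations, which pools true activations and expected values from all sequences so the correlation score is calculated over all of them at once rather than averaged per sequence.",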
+ "type": "comment"
+ },
+ "214": {
+ "file_id": 15,
+ "content": " all_expected_values.extend(scored_sequence_simulation.simulation.expected_activations)\n ev_correlation_score = (\n correlation_score(all_true_activations, all_expected_values)\n if len(all_true_activations) > 0\n else None\n )\n rsquared_score = rsquared_score_from_sequences(all_true_activations, all_expected_values)\n absolute_dev_explained_score = absolute_dev_explained_score_from_sequences(\n all_true_activations, all_expected_values\n )\n return ScoredSimulation(\n scored_sequence_simulations=scored_sequence_simulations,\n ev_correlation_score=ev_correlation_score,\n rsquared_score=rsquared_score,\n absolute_dev_explained_score=absolute_dev_explained_score,\n )\nasync def simulate_and_score(\n simulator: NeuronSimulator,\n activation_records: Sequence[ActivationRecord],\n) -> ScoredSimulation:\n \"\"\"\n Score an explanation of a neuron by how well it predicts activations on the given text\n sequences.\n \"\"\"\n scored_sequence_simulations = await asyncio.gather(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/scoring.py:110-137"
+ },
+ "215": {
+ "file_id": 15,
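+ "content": "Code performs the following:\n1. Extends the list of all_expected_values with each simulation's expected activation values.\n2. Calculates the EV correlation score (None when there are no true activations), R-squared score, and absolute deviation explained score over all sequences combined.\n3. Returns a ScoredSimulation object with the scores and the per-sequence simulations.\n4. Begins simulate_and_score, which scores the given activation records concurrently using asyncio.gather.",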
+ "type": "comment"
+ },
+ "216": {
+ "file_id": 15,
+ "content": " *[\n _simulate_and_score_sequence(\n simulator,\n activation_record,\n )\n for activation_record in activation_records\n ]\n )\n return aggregate_scored_sequence_simulations(scored_sequence_simulations)\nasync def make_simulator_and_score(\n make_simulator: Coroutine[None, None, NeuronSimulator],\n activation_records: Sequence[ActivationRecord],\n) -> ScoredSimulation:\n \"\"\"Chain together creating the simulator and using it to score activation records.\"\"\"\n simulator = await make_simulator\n return await simulate_and_score(simulator, activation_records)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/scoring.py:138-155"
+ },
+ "217": {
+ "file_id": 15,
+ "content": "This code defines a function called `make_simulator_and_score` that takes in a coroutine for creating a simulator and a sequence of activation records. It then creates the simulator and uses it to score the activation records, returning the scored simulations. The code is asynchronous and uses awaitable operations.",
+ "type": "comment"
+ },
+ "218": {
+ "file_id": 16,
+ "content": "/neuron-explainer/neuron_explainer/explanations/simulator.py",
+ "type": "filepath"
+ },
+ "219": {
+ "file_id": 16,
+ "content": "This module uses API calls to simulate neuron activations based on an explanation. It defines the simulation types, helpers that turn top logprobs into expected activation values, handling for byte-encoded and split tokens, parsing of simulation API responses, and the NeuronSimulator base class together with explanation-based simulators that build few-shot prompts.",
+ "type": "summary"
+ },
+ "220": {
+ "file_id": 16,
+ "content": "\"\"\"Uses API calls to simulate neuron activations based on an explanation.\"\"\"\nfrom __future__ import annotations\nimport asyncio\nimport logging\nfrom abc import ABC, abstractmethod\nfrom collections import OrderedDict\nfrom enum import Enum\nfrom typing import Any, Optional, Sequence, Union\nimport numpy as np\nfrom neuron_explainer.activations.activation_records import (\n calculate_max_activation,\n format_activation_records,\n format_sequences_for_simulation,\n normalize_activations,\n)\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.api_client import ApiClient\nfrom neuron_explainer.explanations.explainer import EXPLANATION_PREFIX\nfrom neuron_explainer.explanations.explanations import ActivationScale, SequenceSimulation\nfrom neuron_explainer.explanations.few_shot_examples import FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import (\n HarmonyMessage,\n PromptBuilder,\n PromptFormat,\n Role,\n)\nlogger = logging.getLogger(__name__)\n# Our prompts use normalized activation values, which map any range of positive activations to the",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:1-33"
+ },
+ "221": {
+ "file_id": 16,
+ "content": "This code uses API calls to simulate neuron activations based on an explanation. It includes classes for activation records, activation scaling, and sequence simulations, as well as functions for formatting activation records, normalizing activations, and building prompts.",
+ "type": "comment"
+ },
+ "222": {
+ "file_id": 16,
+ "content": "# integers from 0 to 10.\nMAX_NORMALIZED_ACTIVATION = 10\nVALID_ACTIVATION_TOKENS_ORDERED = list(str(i) for i in range(MAX_NORMALIZED_ACTIVATION + 1))\nVALID_ACTIVATION_TOKENS = set(VALID_ACTIVATION_TOKENS_ORDERED)\nclass SimulationType(str, Enum):\n \"\"\"How to simulate neuron activations. Values correspond to subclasses of NeuronSimulator.\"\"\"\n ALL_AT_ONCE = \"all_at_once\"\n \"\"\"\n Use a single prompt with tokens; calculate EVs using logprobs.\n Implemented by ExplanationNeuronSimulator.\n \"\"\"\n ONE_AT_A_TIME = \"one_at_a_time\"\n \"\"\"\n Use a separate prompt for each token being simulated; calculate EVs using logprobs.\n Implemented by ExplanationTokenByTokenSimulator.\n \"\"\"\n @classmethod\n def from_string(cls, s: str) -> SimulationType:\n for simulation_type in SimulationType:\n if simulation_type.value == s:\n return simulation_type\n raise ValueError(f\"Invalid simulation type: {s}\")\ndef compute_expected_value(\n norm_probabilities_by_distribution_value: OrderedDict[int, float]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:34-66"
+ },
+ "223": {
+ "file_id": 16,
+ "content": "This code defines a SimulationType enum with three simulation types: ALL_AT_ONCE, ONE_AT_A_TIME. It also has a function to compute expected values given normed probabilities by distribution value.",
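+ "content": "This code defines a SimulationType enum with two simulation types: ALL_AT_ONCE and ONE_AT_A_TIME. It also begins a function to compute expected values given normalized probabilities by distribution value.",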
+ },
+ "224": {
+ "file_id": 16,
+ "content": ") -> float:\n \"\"\"\n Given a map from distribution values (integers on the range [0, 10]) to normalized\n probabilities, return an expected value for the distribution.\n \"\"\"\n return np.dot(\n np.array(list(norm_probabilities_by_distribution_value.keys())),\n np.array(list(norm_probabilities_by_distribution_value.values())),\n )\ndef parse_top_logprobs(top_logprobs: dict[str, float]) -> OrderedDict[int, float]:\n \"\"\"\n Given a map from tokens to logprobs, return a map from distribution values (integers on the\n range [0, 10]) to unnormalized probabilities (in the sense that they may not sum to 1).\n \"\"\"\n probabilities_by_distribution_value = OrderedDict()\n for token, logprob in top_logprobs.items():\n if token in VALID_ACTIVATION_TOKENS:\n token_as_int = int(token)\n probabilities_by_distribution_value[token_as_int] = np.exp(logprob)\n return probabilities_by_distribution_value\ndef compute_predicted_activation_stats_for_token(\n top_logprobs: dict[str, float],",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:67-92"
+ },
+ "225": {
+ "file_id": 16,
+ "content": "Code chunk 1 (lines 67-92):\n\nThis code calculates the expected value for a distribution given normalized probabilities. It also includes a function to parse top logprobs into a distribution of unnormalized probabilities and begins a function to compute predicted activation statistics for a token. The code uses numpy arrays for efficient computations and ordered dictionaries for mapping distribution values to their respective probabilities.",
+ "type": "comment"
+ },
+ "226": {
+ "file_id": 16,
+ "content": ") -> tuple[OrderedDict[int, float], float]:\n probabilities_by_distribution_value = parse_top_logprobs(top_logprobs)\n total_p_of_distribution_values = sum(probabilities_by_distribution_value.values())\n norm_probabilities_by_distribution_value = OrderedDict(\n {\n distribution_value: p / total_p_of_distribution_values\n for distribution_value, p in probabilities_by_distribution_value.items()\n }\n )\n expected_value = compute_expected_value(norm_probabilities_by_distribution_value)\n return (\n norm_probabilities_by_distribution_value,\n expected_value,\n )\n# Adapted from tether/tether/core/encoder.py.\ndef convert_to_byte_array(s: str) -> bytearray:\n byte_array = bytearray()\n assert s.startswith(\"bytes:\"), s\n s = s[6:]\n while len(s) > 0:\n if s[0] == \"\\\\\":\n # Hex encoding.\n assert s[1] == \"x\"\n assert len(s) >= 4\n byte_array.append(int(s[2:4], 16))\n s = s[4:]\n else:\n # Regular ascii encoding.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:93-122"
+ },
+ "227": {
+ "file_id": 16,
+ "content": "This function takes the top log probabilities, normalizes them to probabilities, computes the expected value based on these normalized probabilities, and returns both as a tuple. The chunk also includes a helper function that converts a \"bytes:\"-prefixed string into a byte array, decoding \\xNN hex escapes and plain ASCII characters.",
+ "type": "comment"
+ },
+ "228": {
+ "file_id": 16,
+ "content": " byte_array.append(ord(s[0]))\n s = s[1:]\n return byte_array\ndef handle_byte_encoding(\n response_tokens: Sequence[str], merged_response_index: int\n) -> tuple[str, int]:\n \"\"\"\n Handle the case where the current token is a sequence of bytes. This may involve merging\n multiple response tokens into a single token.\n \"\"\"\n response_token = response_tokens[merged_response_index]\n if response_token.startswith(\"bytes:\"):\n byte_array = bytearray()\n while True:\n byte_array = convert_to_byte_array(response_token) + byte_array\n try:\n # If we can decode the byte array as utf-8, then we're done.\n response_token = byte_array.decode(\"utf-8\")\n break\n except UnicodeDecodeError:\n # If not, then we need to merge the previous response token into the byte\n # array.\n merged_response_index -= 1\n response_token = response_tokens[merged_response_index]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:123-148"
+ },
+ "229": {
+ "file_id": 16,
+ "content": "This code handles the case where a response token is composed of a sequence of bytes. It merges multiple response tokens into a single token until it can be decoded as UTF-8. If a UnicodeDecodeError occurs, it continues to merge previous response tokens into the byte array.",
+ "type": "comment"
+ },
+ "230": {
+ "file_id": 16,
+ "content": " return response_token, merged_response_index\ndef was_token_split(current_token: str, response_tokens: Sequence[str], start_index: int) -> bool:\n \"\"\"\n Return whether current_token (a token from the subject model) was split into multiple tokens by\n the simulator model (as represented by the tokens in response_tokens). start_index is the index\n in response_tokens at which to begin looking backward to form a complete token. It is usually\n the first token *before* the delimiter that separates the token from the normalized activation,\n barring some unusual cases.\n This mainly happens if the subject model uses a different tokenizer than the simulator model.\n But it can also happen in cases where Unicode characters are split. This function handles both\n cases.\n \"\"\"\n merged_response_tokens = \"\"\n merged_response_index = start_index\n while len(merged_response_tokens) < len(current_token):\n response_token = response_tokens[merged_response_index]\n response_token, merged_response_index = handle_byte_encoding(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:149-168"
+ },
+ "231": {
+ "file_id": 16,
+ "content": "This function checks if a token from the subject model was split into multiple tokens by the simulator model. It handles cases where different tokenizers are used or Unicode characters are split.",
+ "type": "comment"
+ },
+ "232": {
+ "file_id": 16,
+ "content": " response_tokens, merged_response_index\n )\n merged_response_tokens = response_token + merged_response_tokens\n merged_response_index -= 1\n # It's possible that merged_response_tokens is longer than current_token at this point,\n # since the between-lines delimiter may have been merged into the original token. But it\n # should always be the case that merged_response_tokens ends with current_token.\n assert merged_response_tokens.endswith(current_token)\n num_merged_tokens = start_index - merged_response_index\n token_was_split = num_merged_tokens > 1\n if token_was_split:\n logger.debug(\n \"Warning: token from the subject model was split into 2+ tokens by the simulator model.\"\n )\n return token_was_split\ndef parse_simulation_response(\n response: dict[str, Any],\n prompt_format: PromptFormat,\n tokens: Sequence[str],\n) -> SequenceSimulation:\n \"\"\"\n Parse an API response to a simulation prompt.\n Args:\n response: response from the API",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:169-195"
+ },
+ "233": {
+ "file_id": 16,
+ "content": "The code is checking if a token from the subject model was split into two or more tokens by the simulator model. It asserts that merged_response_tokens ends with current_token, calculates the number of merged tokens, and logs a warning if the token was split.",
+ "type": "comment"
+ },
+ "234": {
+ "file_id": 16,
+ "content": " prompt_format: how the prompt was formatted\n tokens: list of tokens as strings in the sequence where the neuron is being simulated\n \"\"\"\n choice = response[\"choices\"][0]\n if prompt_format == PromptFormat.HARMONY_V4:\n text = choice[\"message\"][\"content\"]\n elif prompt_format in [\n PromptFormat.NONE,\n PromptFormat.INSTRUCTION_FOLLOWING,\n ]:\n text = choice[\"text\"]\n else:\n raise ValueError(f\"Unhandled prompt format {prompt_format}\")\n response_tokens = choice[\"logprobs\"][\"tokens\"]\n choice[\"logprobs\"][\"token_logprobs\"]\n top_logprobs = choice[\"logprobs\"][\"top_logprobs\"]\n token_text_offset = choice[\"logprobs\"][\"text_offset\"]\n # This only works because the sequence \"\" tokenizes into multiple tokens if it appears in\n # a text sequence in the prompt.\n scoring_start = text.rfind(\"\")\n expected_values = []\n original_sequence_tokens: list[str] = []\n distribution_values: list[list[float]] = []\n distribution_probabilities: list[list[float]] = []",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:196-219"
+ },
+ "235": {
+ "file_id": 16,
+ "content": "This function retrieves the text and token data from the response, handling different prompt formats. It then extracts the starting position of the \"\" token in the text, setting up lists for further calculations.",
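+ "content": "This function retrieves the text and token data from the response, handling different prompt formats. It then finds the position of the last start-of-sequence marker in the text (presumably \"<start>\"; the tag appears to have been stripped from this extract) and sets up lists for further calculations.",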
+ },
+ "236": {
+ "file_id": 16,
+ "content": " for i in range(2, len(response_tokens)):\n if len(original_sequence_tokens) == len(tokens):\n # Make sure we haven't hit some sort of off-by-one error.\n # TODO(sbills): Generalize this to handle different tokenizers.\n reached_end = response_tokens[i + 1] == \"<\" and response_tokens[i + 2] == \"end\"\n assert reached_end, f\"{response_tokens[i-3:i+3]}\"\n break\n if token_text_offset[i] >= scoring_start:\n # We're looking for the first token after a tab. This token should be the text\n # \"unknown\" if hide_activations=True or a normalized activation (0-10) otherwise.\n # If it isn't, that means that the tab is not appearing as a delimiter, but rather\n # as a token, in which case we should move on to the next response token.\n if response_tokens[i - 1] == \"\\t\":\n if response_tokens[i] != \"unknown\":\n logger.debug(\"Ignoring tab token that is not followed by an 'unknown' token.\")",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:220-234"
+ },
+ "237": {
+ "file_id": 16,
+ "content": "Checking if the response tokens have reached the end and if the tab token is followed by an \"unknown\" token.",
+ "type": "comment"
+ },
+ "238": {
+ "file_id": 16,
+ "content": " continue\n # j represents the index of the token in a \"tokenactivation\" line, barring\n # one of the unusual cases handled below.\n j = i - 2\n current_token = tokens[len(original_sequence_tokens)]\n if current_token == response_tokens[j] or was_token_split(\n current_token, response_tokens, j\n ):\n # We're in the normal case where the tokenization didn't throw off the\n # formatting or in the token-was-split case, which we handle the usual way.\n current_top_logprobs = top_logprobs[i]\n (\n norm_probabilities_by_distribution_value,\n expected_value,\n ) = compute_predicted_activation_stats_for_token(\n current_top_logprobs,\n )\n current_distribution_values = list(\n norm_probabilities_by_distribution_value.keys()",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:235-256"
+ },
+ "239": {
+ "file_id": 16,
+ "content": "Identifying correct token and computing predicted activation stats for the identified token.",
+ "type": "comment"
+ },
+ "240": {
+ "file_id": 16,
+ "content": " )\n current_distribution_probabilities = list(\n norm_probabilities_by_distribution_value.values()\n )\n else:\n # We're in a case where the tokenization resulted in a newline being folded into\n # the token. We can't do our usual prediction of activation stats for the token,\n # since the model did not observe the original token. Instead, we use dummy\n # values. See the TODO elsewhere in this file about coming up with a better\n # prompt format that avoids this situation.\n newline_folded_into_token = \"\\n\" in response_tokens[j]\n assert (\n newline_folded_into_token\n ), f\"`{current_token=}` {response_tokens[j-3:j+3]=}\"\n logger.debug(\n \"Warning: newline before a tokenactivation line was folded into the token\"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:257-272"
+ },
+ "241": {
+ "file_id": 16,
+ "content": "If tokenization resulted in a newline being folded into the token, use dummy values for activation prediction. This is due to the model not observing the original token and a better prompt format should be used to avoid this situation.",
+ "type": "comment"
+ },
+ "242": {
+ "file_id": 16,
+ "content": " )\n current_distribution_values = []\n current_distribution_probabilities = []\n expected_value = 0.0\n original_sequence_tokens.append(current_token)\n distribution_values.append([float(v) for v in current_distribution_values])\n distribution_probabilities.append(current_distribution_probabilities)\n expected_values.append(expected_value)\n return SequenceSimulation(\n tokens=original_sequence_tokens,\n expected_activations=expected_values,\n activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,\n distribution_values=distribution_values,\n distribution_probabilities=distribution_probabilities,\n )\nclass NeuronSimulator(ABC):\n \"\"\"Abstract base class for simulating neuron behavior.\"\"\"\n @abstractmethod\n async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:\n \"\"\"Simulate the behavior of a neuron based on an explanation.\"\"\"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:273-297"
+ },
+ "243": {
+ "file_id": 16,
+ "content": "The code is defining a NeuronSimulator class with an abstract method \"simulate\" that takes in a sequence of tokens and returns a SequenceSimulation object. The SequenceSimulation object contains the original token sequence, expected activations, activation scale, distribution values, and distribution probabilities.",
+ "type": "comment"
+ },
+ "244": {
+ "file_id": 16,
+ "content": " ...\nclass ExplanationNeuronSimulator(NeuronSimulator):\n \"\"\"\n Simulate neuron behavior based on an explanation.\n This class uses a few-shot prompt with examples of other explanations and activations. This\n prompt allows us to score all of the tokens at once using a nifty trick involving logprobs.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n explanation: str,\n max_concurrent: Optional[int] = 10,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,\n prompt_format: PromptFormat = PromptFormat.INSTRUCTION_FOLLOWING,\n cache: bool = False,\n ):\n self.api_client = ApiClient(\n model_name=model_name, max_concurrent=max_concurrent, cache=cache\n )\n self.explanation = explanation\n self.few_shot_example_set = few_shot_example_set\n self.prompt_format = prompt_format\n async def simulate(\n self,\n tokens: Sequence[str],\n ) -> SequenceSimulation:\n prompt = self.make_simulation_prompt(tokens)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:298-329"
+ },
+ "245": {
+ "file_id": 16,
+ "content": "This code defines a class called \"ExplanationNeuronSimulator\" that simulates neuron behavior based on an explanation. It uses a few-shot prompt with examples of other explanations and activations, allowing for scoring all tokens at once using logprobs. The constructor takes in parameters like model name, explanation, maximum concurrent tasks, example set type, prompt format, and cache settings. It also initializes an \"ApiClient\" object. The class has a method called \"simulate\" that takes a sequence of tokens as input and returns a SequenceSimulation.",
+ "type": "comment"
+ },
+ "246": {
+ "file_id": 16,
+ "content": " generate_kwargs: dict[str, Any] = {\n \"max_tokens\": 0,\n \"echo\": True,\n \"logprobs\": 15,\n }\n if self.prompt_format == PromptFormat.HARMONY_V4:\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n generate_kwargs[\"messages\"] = prompt\n else:\n assert isinstance(prompt, str)\n generate_kwargs[\"prompt\"] = prompt\n response = await self.api_client.make_request(**generate_kwargs)\n logger.debug(\"response in score_explanation_by_activations is %s\", response)\n result = parse_simulation_response(response, self.prompt_format, tokens)\n logger.debug(\"result in score_explanation_by_activations is %s\", result)\n return result\n # TODO(sbills): The current tokenactivation format can result in improper tokenization.\n # In particular, if the token is itself a tab, we may get a single \"\\t\\t\" token rather than two\n # \"\\t\" tokens. Consider using a separator that does not appear in any multi-character tokens.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:331-352"
+ },
+ "247": {
+ "file_id": 16,
+ "content": "This code is making an API request to generate a response based on the provided prompt or message, depending on the prompt format. It then parses the response and returns the result. The code includes assertions for validating the input and a TODO comment indicating potential issues with the tokenization format.",
+ "type": "comment"
+ },
+ "248": {
+ "file_id": 16,
+ "content": " def make_simulation_prompt(self, tokens: Sequence[str]) -> Union[str, list[HarmonyMessage]]:\n \"\"\"Create a few-shot prompt for predicting neuron activations for the given tokens.\"\"\"\n # TODO(sbills): The prompts in this file are subtly different from the ones in explainer.py.\n # Consider reconciling them.\n prompt_builder = PromptBuilder()\n prompt_builder.add_message(\n Role.SYSTEM,\n \"\"\"We're studying neurons in a neural network.\nEach neuron looks for some particular thing in a short document.\nLook at summary of what the neuron does, and try to predict how it will fire on each token.\nThe activation format is tokenactivation, activations go from 0 to 10, \"unknown\" indicates an unknown activation. Most activations will be 0.\n\"\"\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n for i, example in enumerate(few_shot_examples):\n prompt_builder.add_message(\n Role.USER,\n f\"\\n\\nNeuron {i + 1}\\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} \"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:353-373"
+ },
+ "249": {
+ "file_id": 16,
+ "content": "This code creates a prompt for predicting neuron activations using a few-shot example set. It adds a system message with instructions on how to analyze the neurons in a neural network and then appends user messages for each example in the set, including the example itself along with an explanation of the neuron's behavior.",
+ "type": "comment"
+ },
+ "250": {
+ "file_id": 16,
+ "content": " f\"{example.explanation}\",\n )\n formatted_activation_records = format_activation_records(\n example.activation_records,\n calculate_max_activation(example.activation_records),\n start_indices=example.first_revealed_activation_indices,\n )\n prompt_builder.add_message(\n Role.ASSISTANT, f\"\\nActivations: {formatted_activation_records}\\n\"\n )\n prompt_builder.add_message(\n Role.USER,\n f\"\\n\\nNeuron {len(few_shot_examples) + 1}\\nExplanation of neuron \"\n f\"{len(few_shot_examples) + 1} behavior: {EXPLANATION_PREFIX} \"\n f\"{self.explanation.strip()}\",\n )\n prompt_builder.add_message(\n Role.ASSISTANT, f\"\\nActivations: {format_sequences_for_simulation([tokens])}\"\n )\n return prompt_builder.build(self.prompt_format)\nclass ExplanationTokenByTokenSimulator(NeuronSimulator):\n \"\"\"\n Simulate neuron behavior based on an explanation.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:374-399"
+ },
+ "251": {
+ "file_id": 16,
+ "content": "This code snippet is part of a Neuron Simulator that simulates neuron behavior based on an explanation. It adds formatted activation records and messages to a prompt builder, including explanations of neuron behavior for few-shot examples.",
+ "type": "comment"
+ },
+ "252": {
+ "file_id": 16,
+ "content": " Unlike ExplanationNeuronSimulator, this class uses one few-shot prompt per token to calculate\n expected activations. This is slower. This class gets a one-token completion and calculates an\n expected value from that token's logprobs.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n explanation: str,\n max_concurrent: Optional[int] = 10,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.NEWER,\n prompt_format: PromptFormat = PromptFormat.INSTRUCTION_FOLLOWING,\n cache: bool = False,\n ):\n assert (\n few_shot_example_set != FewShotExampleSet.ORIGINAL\n ), \"This simulator doesn't support the ORIGINAL few-shot example set.\"\n self.api_client = ApiClient(\n model_name=model_name, max_concurrent=max_concurrent, cache=cache\n )\n self.explanation = explanation\n self.few_shot_example_set = few_shot_example_set\n self.prompt_format = prompt_format\n async def simulate(\n self,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:401-426"
+ },
+ "253": {
+ "file_id": 16,
+ "content": "This class initializes an API client and takes inputs like model name, explanation, max concurrent requests, example set, prompt format, and cache. It asserts that the few-shot example set is not ORIGINAL since this simulator doesn't support it. Unlike ExplanationNeuronSimulator, it uses one few-shot prompt per token, requesting a one-token completion and calculating an expected value from that token's logprobs, which is slower.",
+ "type": "comment"
+ },
+ "254": {
+ "file_id": 16,
+ "content": " tokens: Sequence[str],\n ) -> SequenceSimulation:\n responses_by_token = await asyncio.gather(\n *[\n self._get_activation_stats_for_single_token(tokens, self.explanation, token_index)\n for token_index in range(len(tokens))\n ]\n )\n expected_values, distribution_values, distribution_probabilities = [], [], []\n for response in responses_by_token:\n activation_logprobs = response[\"choices\"][0][\"logprobs\"][\"top_logprobs\"][0]\n (\n norm_probabilities_by_distribution_value,\n expected_value,\n ) = compute_predicted_activation_stats_for_token(\n activation_logprobs,\n )\n distribution_values.append(\n [float(v) for v in norm_probabilities_by_distribution_value.keys()]\n )\n distribution_probabilities.append(\n list(norm_probabilities_by_distribution_value.values())\n )\n expected_values.append(expected_value)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:427-450"
+ },
+ "255": {
+ "file_id": 16,
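+ "content": "This code concurrently requests activation statistics for each token in the input sequence. For each response, it computes the normalized probabilities by distribution value and the expected activation from the top logprobs, then appends the distribution values, distribution probabilities, and expected values to their respective lists.",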
+ "type": "comment"
+ },
+ "256": {
+ "file_id": 16,
+ "content": " result = SequenceSimulation(\n tokens=list(tokens), # SequenceSimulation expects List type\n expected_activations=expected_values,\n activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,\n distribution_values=distribution_values,\n distribution_probabilities=distribution_probabilities,\n )\n logger.debug(\"result in score_explanation_by_activations is %s\", result)\n return result\n async def _get_activation_stats_for_single_token(\n self,\n tokens: Sequence[str],\n explanation: str,\n token_index_to_score: int,\n ) -> dict:\n prompt = self.make_single_token_simulation_prompt(\n tokens,\n explanation,\n token_index_to_score=token_index_to_score,\n )\n return await self.api_client.make_request(\n prompt=prompt, max_tokens=1, echo=False, logprobs=15\n )\n def _add_single_token_simulation_subprompt(\n self,\n prompt_builder: PromptBuilder,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:452-479"
+ },
+ "257": {
+ "file_id": 16,
+ "content": "This code is creating a SequenceSimulation object and logging its result. It also defines an asynchronous function that retrieves activation statistics for a single token using API client, and adds a subprompt to a prompt builder.",
+ "type": "comment"
+ },
+ "258": {
+ "file_id": 16,
+ "content": " activation_record: ActivationRecord,\n neuron_index: int,\n explanation: str,\n token_index_to_score: int,\n end_of_prompt: bool,\n ) -> None:\n trimmed_activation_record = ActivationRecord(\n tokens=activation_record.tokens[: token_index_to_score + 1],\n activations=activation_record.activations[: token_index_to_score + 1],\n )\n prompt_builder.add_message(\n Role.USER,\n f\"\"\"\nNeuron {neuron_index}\nExplanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} {explanation.strip()}\nText:\n{\"\".join(trimmed_activation_record.tokens)}\nLast token in the text:\n{trimmed_activation_record.tokens[-1]}\nLast token activation, considering the token in the context in which it appeared in the text:\n\"\"\",\n )\n if not end_of_prompt:\n normalized_activations = normalize_activations(\n trimmed_activation_record.activations, calculate_max_activation([activation_record])\n )\n prompt_builder.add_message(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:480-508"
+ },
+ "259": {
+ "file_id": 16,
+ "content": "Creating trimmed activation record and adding messages to the prompt builder.",
+ "type": "comment"
+ },
+ "260": {
+ "file_id": 16,
+ "content": " Role.ASSISTANT, str(normalized_activations[-1]) + (\"\" if end_of_prompt else \"\\n\\n\")\n )\n def make_single_token_simulation_prompt(\n self,\n tokens: Sequence[str],\n explanation: str,\n token_index_to_score: int,\n ) -> Union[str, list[HarmonyMessage]]:\n \"\"\"Make a few-shot prompt for predicting the neuron's activation on a single token.\"\"\"\n assert explanation != \"\"\n prompt_builder = PromptBuilder()\n prompt_builder.add_message(\n Role.SYSTEM,\n \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.\nThe activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\n\"\"\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n for i, example in enumerate(few_shot_examples):",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:509-531"
+ },
+ "261": {
+ "file_id": 16,
+ "content": "This function generates a prompt for predicting the neuron's activation on a single token. It involves adding a system message explaining the task and providing few-shot examples.",
+ "type": "comment"
+ },
+ "262": {
+ "file_id": 16,
+ "content": " prompt_builder.add_message(\n Role.USER,\n f\"Neuron {i + 1}\\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} \"\n f\"{example.explanation}\\n\",\n )\n formatted_activation_records = format_activation_records(\n example.activation_records,\n calculate_max_activation(example.activation_records),\n start_indices=None,\n )\n prompt_builder.add_message(\n Role.ASSISTANT,\n f\"Activations: {formatted_activation_records}\\n\\n\",\n )\n prompt_builder.add_message(\n Role.SYSTEM,\n \"Now, we're going predict the activation of a new neuron on a single token, \"\n \"following the same rules as the examples above. Activations still range from 0 to 10.\",\n )\n single_token_example = self.few_shot_example_set.get_single_token_prediction_example()\n assert single_token_example.token_index_to_score is not None",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:532-553"
+ },
+ "263": {
+ "file_id": 16,
+ "content": "Generating a prompt to explain neuron behavior and visualize activation records for an example, then adding a message asking to predict the activation of a new neuron on a single token following the same rules.",
+ "type": "comment"
+ },
+ "264": {
+ "file_id": 16,
+ "content": " self._add_single_token_simulation_subprompt(\n prompt_builder,\n single_token_example.activation_records[0],\n len(few_shot_examples) + 1,\n explanation,\n token_index_to_score=single_token_example.token_index_to_score,\n end_of_prompt=False,\n )\n activation_record = ActivationRecord(\n tokens=list(tokens[: token_index_to_score + 1]), # ActivationRecord expects List type.\n activations=[0.0] * len(tokens),\n )\n self._add_single_token_simulation_subprompt(\n prompt_builder,\n activation_record,\n len(few_shot_examples) + 2,\n explanation,\n token_index_to_score,\n end_of_prompt=True,\n )\n return prompt_builder.build(self.prompt_format, allow_extra_system_messages=True)\ndef _format_record_for_logprob_free_simulation(\n activation_record: ActivationRecord,\n include_activations: bool = False,\n max_activation: Optional[float] = None,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:554-581"
+ },
+ "265": {
+ "file_id": 16,
+ "content": "This code adds two subprompts to a prompt builder, one for a single token example and another for an activation record. It then returns the final formatted prompt.",
+ "type": "comment"
+ },
+ "266": {
+ "file_id": 16,
+ "content": ") -> str:\n response = \"\"\n if include_activations:\n assert max_activation is not None\n assert len(activation_record.tokens) == len(\n activation_record.activations\n ), f\"{len(activation_record.tokens)=}, {len(activation_record.activations)=}\"\n normalized_activations = normalize_activations(\n activation_record.activations, max_activation=max_activation\n )\n for i, token in enumerate(activation_record.tokens):\n # We use a weird unicode character here to make it easier to parse the response (can split on \"༗\\n\").\n if include_activations:\n response += f\"{token}\\t{normalized_activations[i]}༗\\n\"\n else:\n response += f\"{token}\\t༗\\n\"\n return response\ndef _parse_no_logprobs_completion(\n completion: str,\n tokens: Sequence[str],\n) -> Sequence[int]:\n \"\"\"\n Parse a completion into a list of simulated activations. If the model did not faithfully\n reproduce the token sequence, return a list of 0s. If the model's activation for a token",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:582-607"
+ },
+ "267": {
+ "file_id": 16,
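+ "content": "This code formats an activation record as one tab-separated line per token, optionally including activations normalized by max_activation, with each line ending in a special unicode delimiter to make the response easier to parse. It then begins a function that parses a completion into a list of simulated activations, returning a list of 0s if the model did not faithfully reproduce the token sequence.",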
+ "type": "comment"
+ },
+ "268": {
+ "file_id": 16,
+ "content": " is not an integer betwee 0 and 10, substitute 0.\n Args:\n completion: completion from the API\n tokens: list of tokens as strings in the sequence where the neuron is being simulated\n \"\"\"\n zero_prediction = [0] * len(tokens)\n token_lines = completion.strip(\"\\n\").split(\"༗\\n\")\n start_line_index = None\n for i, token_line in enumerate(token_lines):\n if token_line.startswith(f\"{tokens[0]}\\t\"):\n start_line_index = i\n break\n # If we didn't find the first token, or if the number of lines in the completion doesn't match\n # the number of tokens, return a list of 0s.\n if start_line_index is None or len(token_lines) - start_line_index != len(tokens):\n return zero_prediction\n predicted_activations = []\n for i, token_line in enumerate(token_lines[start_line_index:]):\n if not token_line.startswith(f\"{tokens[i]}\\t\"):\n return zero_prediction\n predicted_activation = token_line.split(\"\\t\")[1]\n if predicted_activation not in VALID_ACTIVATION_TOKENS:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:608-631"
+ },
+ "269": {
+ "file_id": 16,
+ "content": "This code checks if the first token is present in the completion and if the number of lines matches the number of tokens. If not, it returns a list of 0s. It then extracts the predicted activations for each token from the completion.",
+ "type": "comment"
+ },
+ "270": {
+ "file_id": 16,
+ "content": " predicted_activations.append(0)\n else:\n predicted_activations.append(int(predicted_activation))\n return predicted_activations\nclass LogprobFreeExplanationTokenSimulator(NeuronSimulator):\n \"\"\"\n Simulate neuron behavior based on an explanation.\n Unlike ExplanationNeuronSimulator and ExplanationTokenByTokenSimulator, this class does not rely on\n logprobs to calculate expected activations. Instead, it uses a few-shot prompt that displays all of the\n tokens at once, and request that the model repeat the tokens with the activations appended. Sampling\n is with temperature = 0. Thus, the activations are deterministic. Also, each activation for a token\n is a function of all the activations that came previously and all of the tokens in the sequence, not\n just the current and previous tokens. In the case where the model does not faithfully reproduce the\n token sequence, the simulator will return a response where every predicted activation is 0. Example prompt as follows:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:632-648"
+ },
+ "271": {
+ "file_id": 16,
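+ "content": "This code finishes parsing the predicted activations, substituting 0 for any value that is not a valid activation token. It then begins the LogprobFreeExplanationTokenSimulator class, which does not rely on logprobs: it uses a few-shot prompt that asks the model to repeat all tokens with activations appended, sampling with temperature 0 so the activations are deterministic.",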
+ "type": "comment"
+ },
+ "272": {
+ "file_id": 16,
+ "content": " Explanation: Explanation 1\n Sequence 1 Tokens Without Activations:\n A\\t_\n B\\t_\n C\\t_\n Sequence 1 Tokens With Activations:\n A\\t4_\n B\\t10_\n C\\t0_\n Sequence 2 Tokens Without Activations:\n D\\t_\n E\\t_\n F\\t_\n Sequence 2 Tokens With Activations:\n D\\t3_\n E\\t6_\n F\\t9_\n Explanation: Explanation 2\n Sequence 1 Tokens Without Activations:\n G\\t_\n H\\t_\n I\\t_\n Sequence 1 Tokens With Activations:\n \n G\\t2_\n H\\t0_\n I\\t3_\n \"\"\"\n def __init__(\n self,\n model_name: str,\n explanation: str,\n max_concurrent: Optional[int] = 10,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.NEWER,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,\n cache: bool = False,\n ):\n assert (\n few_shot_example_set != FewShotExampleSet.ORIGINAL\n ), \"This simulator doesn't support the ORIGINAL few-shot example set.\"\n self.api_client = ApiClient(\n model_name=model_name, max_concurrent=max_concurrent, cache=cache",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:650-706"
+ },
+ "273": {
+ "file_id": 16,
+ "content": "This code is initializing an instance of a simulator. It takes the model name, explanation, maximum concurrent samples, few-shot example set (not ORIGINAL), prompt format, and cache settings as parameters. The assert statement ensures that the few-shot example set is not ORIGINAL because this simulator doesn't support it. It then initializes an instance of ApiClient with the given model name, maximum concurrent samples, and cache settings.",
+ "type": "comment"
+ },
+ "274": {
+ "file_id": 16,
+ "content": " )\n self.explanation = explanation\n self.few_shot_example_set = few_shot_example_set\n self.prompt_format = prompt_format\n async def simulate(\n self,\n tokens: Sequence[str],\n ) -> SequenceSimulation:\n prompt = self._make_simulation_prompt(\n tokens,\n self.explanation,\n )\n response = await self.api_client.make_request(\n prompt=prompt, echo=False, max_tokens=1000\n )\n assert len(response[\"choices\"]) == 1\n choice = response[\"choices\"][0]\n if self.prompt_format == PromptFormat.HARMONY_V4:\n completion = choice[\"message\"][\"content\"]\n elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n completion = choice[\"text\"]\n else:\n raise ValueError(f\"Unhandled prompt format {self.prompt_format}\")\n predicted_activations = _parse_no_logprobs_completion(completion, tokens)\n result = SequenceSimulation(\n activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:707-736"
+ },
+ "275": {
+ "file_id": 16,
+ "content": "Code creates a simulation prompt, sends it to API client for processing, and stores the result.",
+ "type": "comment"
+ },
+ "276": {
+ "file_id": 16,
+ "content": " expected_activations=predicted_activations,\n # Since the predicted activation is just a sampled token, we don't have a distribution.\n distribution_values=None,\n distribution_probabilities=None,\n tokens=list(tokens), # SequenceSimulation expects List type\n )\n logger.debug(\"result in score_explanation_by_activations is %s\", result)\n return result\n def _make_simulation_prompt(\n self,\n tokens: Sequence[str],\n explanation: str,\n ) -> Union[str, list[HarmonyMessage]]:\n \"\"\"Make a few-shot prompt for predicting the neuron's activations on a sequence.\"\"\"\n assert explanation != \"\"\n prompt_builder = PromptBuilder(allow_extra_system_messages=True)\n prompt_builder.add_message(\n Role.SYSTEM,\n \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:737-756"
+ },
+ "277": {
+ "file_id": 16,
+ "content": "Creating a SequenceSimulation object with the predicted activations as expected activations, and None for the distribution values and probabilities.\n\nFunction to build a simulation prompt using PromptBuilder and add a system message about studying neurons in neural networks.",
+ "type": "comment"
+ },
+ "278": {
+ "file_id": 16,
+ "content": "The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\nFor each sequence, you will see the tokens in the sequence where the activations are left blank. You will print the exact same tokens verbatim, but with the activations filled in according to the explanation.\n\"\"\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n for i, example in enumerate(few_shot_examples):\n few_shot_example_max_activation = calculate_max_activation(example.activation_records)\n prompt_builder.add_message(\n Role.USER,\n f\"Neuron {i + 1}\\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} \"\n f\"{example.explanation}\\n\\n\"\n f\"Sequence 1 Tokens without Activations:\\n{_format_record_for_logprob_free_simulation(example.activation_records[0], include_activations=False)}\\n\\n\"\n f\"Sequence 1 Tokens with Activations:\\n\",\n )\n prompt_builder.add_message(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:758-774"
+ },
+ "279": {
+ "file_id": 16,
+ "content": "This code generates explanations for neuron behavior in a sequence, and for each neuron, it shows the tokens with and without activations. Activation records are used to determine the max activation for that neuron. The output includes an explanation prefix, tokens without and with activations for Sequence 1, and is added to a prompt builder.",
+ "type": "comment"
+ },
+ "280": {
+ "file_id": 16,
+ "content": " Role.ASSISTANT,\n f\"{_format_record_for_logprob_free_simulation(example.activation_records[0], include_activations=True, max_activation=few_shot_example_max_activation)}\\n\\n\",\n )\n for record_index, record in enumerate(example.activation_records[1:]):\n prompt_builder.add_message(\n Role.USER,\n f\"Sequence {record_index + 2} Tokens without Activations:\\n{_format_record_for_logprob_free_simulation(record, include_activations=False)}\\n\\n\"\n f\"Sequence {record_index + 2} Tokens with Activations:\\n\",\n )\n prompt_builder.add_message(\n Role.ASSISTANT,\n f\"{_format_record_for_logprob_free_simulation(record, include_activations=True, max_activation=few_shot_example_max_activation)}\\n\\n\",\n )\n neuron_index = len(few_shot_examples) + 1\n prompt_builder.add_message(\n Role.USER,\n f\"Neuron {neuron_index}\\nExplanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} \"",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:775-793"
+ },
+ "281": {
+ "file_id": 16,
+ "content": "This code is building a prompt for an AI model by adding messages to the prompt_builder. It iterates through activation records of an example, adding information about tokens with and without activations for each record. Finally, it adds a message for the next neuron index with its explanation.",
+ "type": "comment"
+ },
+ "282": {
+ "file_id": 16,
+ "content": " f\"{explanation}\\n\\n\"\n f\"Sequence 1 Tokens without Activations:\\n{_format_record_for_logprob_free_simulation(ActivationRecord(tokens=tokens, activations=[]), include_activations=False)}\\n\\n\"\n f\"Sequence 1 Tokens with Activations:\\n\",\n )\n return prompt_builder.build(self.prompt_format)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/simulator.py:794-798"
+ },
+ "283": {
+ "file_id": 16,
+ "content": "This code generates a formatted explanation for sequence 1 tokens without and with activations, and returns it in a prompt format.",
+ "type": "comment"
+ },
+ "284": {
+ "file_id": 17,
+ "content": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py",
+ "type": "filepath"
+ },
+ "285": {
+ "file_id": 17,
+ "content": "The code sets up an event loop and tests that TokenActivationPairExplainer and TokenSpaceRepresentationExplainer generate the expected explanation prompts in both the instruction-following and Harmony V4 prompt formats.",
+ "type": "summary"
+ },
+ "286": {
+ "file_id": 17,
+ "content": "import asyncio\nfrom typing import Any\nfrom neuron_explainer.explanations.explainer import (\n TokenActivationPairExplainer,\n TokenSpaceRepresentationExplainer,\n)\nfrom neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role\nfrom neuron_explainer.explanations.token_space_few_shot_examples import (\n TokenSpaceFewShotExampleSet,\n)\ndef setup_module(unused_module: Any) -> None:\n # Make sure we have an event loop, since the attempt to create the Semaphore in\n # ResearchApiClient will fail without it.\n loop = asyncio.new_event_loop()\n asyncio.set_event_loop(loop)\ndef test_if_formatting() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:1-23"
+ },
+ "287": {
+ "file_id": 17,
+ "content": "Setting up the event loop for async operations.",
+ "type": "comment"
+ },
+ "288": {
+ "file_id": 17,
+ "content": "The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.\nNeuron 1\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels.\nNeuron 2\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nExplanation of neuron 2 behavior:<|endofprompt|> the main thing this neuron does is find\"\"\"\n explainer = TokenActivationPairExplainer(\n model_name=\"text-davinci-003\",\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n few_shot_example_set=FewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n all_activation_records=TEST_EXAMPLES[0].activation_records,\n max_activation=1.0,\n max_tokens_for_completion=20,\n )\n assert prompt == expected_prompt\ndef test_harmony_format() -> None:\n expected_prompt = [\n HarmonyMessage(",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:25-73"
+ },
+ "289": {
+ "file_id": 17,
+ "content": "This code initializes an explainer object with specific parameters and then generates a test prompt using the provided activation records. The generated prompt is then asserted to be equal to the expected prompt. The main purpose of this code is to test whether the explanation format matches the expected output for a given set of activation records.",
+ "type": "comment"
+ },
+ "290": {
+ "file_id": 17,
+ "content": " role=Role.SYSTEM,\n content=\"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\nThe activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 1\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nExplanation of neuron 1 behavior: the main thing this neuron does is find\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\" vowels.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 2\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:74-114"
+ },
+ "291": {
+ "file_id": 17,
+ "content": "Code explains the neuron's behavior in a neural network, showing activation values for tokens and summarizing what each neuron is looking for.",
+ "type": "comment"
+ },
+ "292": {
+ "file_id": 17,
+ "content": "e\t10\nf\t0\n\nExplanation of neuron 2 behavior: the main thing this neuron does is find\"\"\",\n ),\n ]\n explainer = TokenActivationPairExplainer(\n model_name=\"gpt-4\",\n prompt_format=PromptFormat.HARMONY_V4,\n few_shot_example_set=FewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n all_activation_records=TEST_EXAMPLES[0].activation_records,\n max_activation=1.0,\n max_tokens_for_completion=20,\n )\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]\n assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt\ndef test_token_space_explainer_if_formatting() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a w",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:115-143"
+ },
+ "293": {
+ "file_id": 17,
+ "content": "This code initializes an explainer object, sets the model name to \"gpt-4\", prompt format to Harmony_v4, and few shot example set to TEST. Then it creates a list of prompts for explanation by calling `make_explanation_prompt` function with a list of activation records, max activation, and max tokens for completion. The code asserts that the resulting prompt is a list and each item in the list is a dictionary (HarmonyMessage) and compares it with the expected_prompt. Finally, it tests if the prompt matches the expected_prompt by comparing their contents.",
+ "type": "comment"
+ },
+ "294": {
+ "file_id": 17,
+ "content": "ord, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\nTokens:\n'these', ' are', ' tokens'\nExplanation:\nThis neuron is looking for this is a test explanation.\nTokens:\n'foo', 'bar', 'baz'\nExplanation:\n<|endofprompt|>This neuron is looking for\"\"\"\n explainer = TokenSpaceRepresentationExplainer(\n model_name=\"text-davinci-002\",\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n use_few_shot=True,\n few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n tokens=[\"foo\", \"bar\", \"baz\"],\n max_tokens_for_completion=20,\n )\n assert prompt == expected_prompt\ndef test_token_space_explainer_harmony_formatting() -> None:\n expected_prompt = [\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"We're studying neurons in a neural network. Each neuron looks for some particular k",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:143-179"
+ },
+ "295": {
+ "file_id": 17,
+ "content": "This code initializes a TokenSpaceRepresentationExplainer with specific parameters and then uses it to generate an explanation prompt given a set of tokens. The expected output is compared to the generated prompt in the test case.",
+ "type": "comment"
+ },
+ "296": {
+ "file_id": 17,
+ "content": "ind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nTokens:\n'these', ' are', ' tokens'\nExplanation:\nThis neuron is looking for\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\" this is a test explanation.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nTokens:\n'foo', 'bar', 'baz'\nExplanation:\nThis neuron is looking for\"\"\",\n ),\n ]\n explainer = TokenSpaceRepresentationExplainer(\n model_name=\"gpt-4\",\n prompt_format=PromptFormat.HARMONY_V4,\n use_few_shot=True,\n few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n tokens=[\"foo\", \"bar\", \"baz\"],\n max_tokens_for_completion=20,\n )\n assert isinstance(prompt, list)",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:179-222"
+ },
+ "297": {
+ "file_id": 17,
+ "content": "The code initializes a TokenSpaceRepresentationExplainer with GPT-4 model and Harmony V4 prompt format. It also uses few-shot learning with the test example set and generates an explanation prompt for the tokens 'foo', 'bar', and 'baz', with max_tokens_for_completion set to 20. It then asserts that the resulting prompt is a list.",
+ "type": "comment"
+ },
+ "298": {
+ "file_id": 17,
+ "content": " assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]\n assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:223-227"
+ },
+ "299": {
+ "file_id": 17,
+ "content": "Checking if the prompt is a list of HarmonyMessages and if each message's role and content match the expected values.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/3.json b/docs/data/3.json
new file mode 100644
index 0000000..b316565
--- /dev/null
+++ b/docs/data/3.json
@@ -0,0 +1,538 @@
+{
+ "300": {
+ "file_id": 18,
+ "content": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py",
+ "type": "filepath"
+ },
+ "301": {
+ "file_id": 18,
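+ "content": "Unit tests for the simulator prompt builders. They check that ExplanationNeuronSimulator and ExplanationTokenByTokenSimulator produce the expected simulation prompts in both the instruction-following format (with text-davinci-003) and the Harmony V4 message format (with gpt-4).",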
+ "type": "summary"
+ },
+ "302": {
+ "file_id": 18,
+ "content": "from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role\nfrom neuron_explainer.explanations.simulator import (\n ExplanationNeuronSimulator,\n ExplanationTokenByTokenSimulator,\n)\ndef test_make_explanation_simulation_prompt_if_format() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network.\nEach neuron looks for some particular thing in a short document.\nLook at summary of what the neuron does, and try to predict how it will fire on each token.\nThe activation format is tokenactivation, activations go from 0 to 10, \"unknown\" indicates an unknown activation. Most activations will be 0.\nNeuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\nActivations: \n\na\t10\nb\t0\nc\t0\n\n\nd\tunknown\ne\t10\nf\t0\n\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION<|endofprompt|>\nActivations: \n",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:1-36"
+ },
+ "303": {
+ "file_id": 18,
+ "content": "Code snippet defines a test function to check the generation of explanation simulation prompt with a specific format.\nThe expected prompt format includes neuron behavior summaries, activation values for each token, and an \"unknown\" indication when necessary.",
+ "type": "comment"
+ },
+ "304": {
+ "file_id": 18,
+ "content": "0\tunknown\n1\tunknown\n2\tunknown\n\n\"\"\"\n prompt = ExplanationNeuronSimulator(\n model_name=\"text-davinci-003\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n ).make_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n )\n assert prompt == expected_prompt\ndef test_make_explanation_simulation_prompt_harmony_format() -> None:\n expected_prompt = [\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"\"\"We're studying neurons in a neural network.\nEach neuron looks for some particular thing in a short document.\nLook at summary of what the neuron does, and try to predict how it will fire on each token.\nThe activation format is tokenactivation, activations go from 0 to 10, \"unknown\" indicates an unknown activation. Most activations will be 0.\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\"\"\",",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:37-69"
+ },
+ "305": {
+ "file_id": 18,
+ "content": "ExplanationNeuronSimulator is being used to generate a simulation prompt for the text-davinci-003 model. The prompt will include information about neurons in a neural network, their roles, and how they analyze short documents. Each token will have an activation level from 0 to 10 or \"unknown\".",
+ "type": "comment"
+ },
+ "306": {
+ "file_id": 18,
+ "content": " ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"\nActivations: \n\na\t10\nb\t0\nc\t0\n\n\nd\tunknown\ne\t10\nf\t0\n\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"\nActivations: \n\n0\tunknown\n1\tunknown\n2\tunknown\n\n\"\"\",\n ),\n ]\n prompt = ExplanationNeuronSimulator(\n model_name=\"gpt-4\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.HARMONY_V4,\n ).make_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n )\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:70-119"
+ },
+ "307": {
+ "file_id": 18,
+ "content": "This code is defining a test simulation prompt using the ExplanationNeuronSimulator class, with given input parameters such as model_name, explanation, few_shot_example_set, and prompt_format. The simulation prompts are created in HarmonyMessage format, and assertions are used to check if the created prompts match the expected format and structure.",
+ "type": "comment"
+ },
+ "308": {
+ "file_id": 18,
+ "content": " assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt\ndef test_make_token_by_token_simulation_prompt_if_format() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.\nThe activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\nNeuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\nActivations: \n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nNow, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else\nText:\nghi\nLast token in the text:\ni\nLast token activation, considering the token in the context in which it appeared in the text:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:120-153"
+ },
+ "309": {
+ "file_id": 18,
+ "content": "The code is asserting that the actual message content matches the expected message, and that the prompt matches the expected prompt. This test checks if the simulation prompt and its format are as expected.",
+ "type": "comment"
+ },
+ "310": {
+ "file_id": 18,
+ "content": "10\nNeuron 3\nExplanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else\nText:\n01\nLast token in the text:\n1\nLast token activation, considering the token in the context in which it appeared in the text:\n<|endofprompt|>\"\"\"\n prompt = ExplanationTokenByTokenSimulator(\n model_name=\"text-davinci-003\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n ).make_single_token_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n explanation=\"numbers and nothing else\",\n token_index_to_score=1,\n )\n assert prompt == expected_prompt\ndef test_make_token_by_token_simulation_prompt_harmony_format() -> None:\n expected_prompt = [\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:154-184"
+ },
+ "311": {
+ "file_id": 18,
+ "content": "Test function that checks if a prompt generated for explaining the behavior of a neuron in a neural network is correct. It uses an explanation and token index to generate the prompt, which is then compared with the expected prompt.",
+ "type": "comment"
+ },
+ "312": {
+ "file_id": 18,
+ "content": "The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"Neuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"Activations: \n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else\nText:\nghi\nLast token in the text:\ni\nLast token activation, considering the token in the context in which it appeared in the text:\n\"\"\",",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:186-229"
+ },
+ "313": {
+ "file_id": 18,
+ "content": "Code is defining and testing a neuron simulator to analyze the behavior of different neurons based on their activations when processing text tokens. Activations are represented in the format \"tokenactivation\" and range from 0 to 10, with most being 0. The simulation considers single tokens in context and predicts the activation for new neurons following similar rules as previous examples.",
+ "type": "comment"
+ },
+ "314": {
+ "file_id": 18,
+ "content": " ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"10\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 3\nExplanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else\nText:\n01\nLast token in the text:\n1\nLast token activation, considering the token in the context in which it appeared in the text:\n\"\"\",\n ),\n ]\n prompt = ExplanationTokenByTokenSimulator(\n model_name=\"gpt-4\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.HARMONY_V4,\n ).make_single_token_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n explanation=\"numbers and nothing else\",\n token_index_to_score=1,\n )\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:230-267"
+ },
+ "315": {
+ "file_id": 18,
+ "content": "The code is generating a simulation prompt for an AI model (in this case, \"gpt-4\") to interpret the behavior of neuron 3. The prompt includes information about the neuron's function and the context it operates in. It checks that the output is a list of HarmonyMessage objects and that each message's role matches the expected roles.",
+ "type": "comment"
+ },
+ "316": {
+ "file_id": 18,
+ "content": " assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:268-269"
+ },
+ "317": {
+ "file_id": 18,
+ "content": "Asserting that the content of actual_message matches expected_message and prompt matches expected_prompt.",
+ "type": "comment"
+ },
+ "318": {
+ "file_id": 19,
+ "content": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py",
+ "type": "filepath"
+ },
+ "319": {
+ "file_id": 19,
+ "content": "The code defines an Example dataclass (a list of tokens plus an explanation string) and the TokenSpaceFewShotExampleSet enum, whose get_examples method returns either ORIGINAL_EXAMPLES or TEST_EXAMPLES. ORIGINAL_EXAMPLES holds three token lists, explained as adjectives related to being real or to physical properties, words related to physical medical conditions, and nouns related to time and dates. TEST_EXAMPLES holds a single three-token example with a test explanation.",
+ "type": "summary"
+ },
+ "320": {
+ "file_id": 19,
+ "content": "from dataclasses import dataclass\nfrom enum import Enum\nfrom typing import List\nfrom neuron_explainer.fast_dataclasses import FastDataclass\n@dataclass\nclass Example(FastDataclass):\n \"\"\"\n An example list of tokens as strings corresponding to top token space inputs of a neuron, with a\n string explanation of the neuron's behavior on these tokens.\n \"\"\"\n tokens: List[str]\n explanation: str\nclass TokenSpaceFewShotExampleSet(Enum):\n \"\"\"Determines which few-shot examples to use when sampling explanations.\"\"\"\n ORIGINAL = \"original\"\n TEST = \"test\"\n def get_examples(self) -> list[Example]:\n \"\"\"Returns regular examples for use in a few-shot prompt.\"\"\"\n if self is TokenSpaceFewShotExampleSet.ORIGINAL:\n return ORIGINAL_EXAMPLES\n elif self is TokenSpaceFewShotExampleSet.TEST:\n return TEST_EXAMPLES\n else:\n raise ValueError(f\"Unhandled example set: {self}\")\nORIGINAL_EXAMPLES = [\n Example(\n tokens=[\n \"actual\",\n \" literal\",",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:1-39"
+ },
+ "321": {
+ "file_id": 19,
+ "content": "This code defines a class for token-based few shot examples and their sets. It also contains methods to get the examples based on the example set specified.",
+ "type": "comment"
+ },
+ "322": {
+ "file_id": 19,
+ "content": " \" actual\",\n \" hyper\",\n \" real\",\n \" EX\",\n \" Real\",\n \"^\",\n \"Full\",\n \" full\",\n \" optical\",\n \" style\",\n \"any\",\n \"ALL\",\n \"extreme\",\n \" miniature\",\n \" Optical\",\n \" faint\",\n \"~\",\n \" Physical\",\n \" REAL\",\n \"*\",\n \"virtual\",\n \"TYPE\",\n \" technical\",\n \"otally\",\n \" physic\",\n \"Type\",\n \"<\",\n \"images\",\n \"atic\",\n \" sheer\",\n \" Style\",\n \" partial\",\n \" natural\",\n \"Hyper\",\n \" Any\",\n \" theoretical\",\n \"|\",\n \" ultimate\",\n \"oing\",\n \" constant\",\n \"ANY\",\n \"antically\",\n \"ishly\",\n \" ex\",\n \" visual\",\n \"special\",\n \"omorphic\",\n \"visual\",\n ],",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:40-88"
+ },
+ "323": {
+ "file_id": 19,
+ "content": "Continuation of the token list for the first entry in ORIGINAL_EXAMPLES. The explanation that follows this list describes the tokens as adjectives related to being real, or to physical properties and evidence.",
+ "type": "comment"
+ },
+ "324": {
+ "file_id": 19,
+ "content": " explanation=\" adjectives related to being real, or to physical properties and evidence\",\n ),\n Example(\n tokens=[\n \"cephal\",\n \"aeus\",\n \" coma\",\n \"bered\",\n \"abetes\",\n \"inflamm\",\n \"rugged\",\n \"alysed\",\n \"azine\",\n \"hered\",\n \"cells\",\n \"aneously\",\n \"fml\",\n \"igm\",\n \"culosis\",\n \"iani\",\n \"CTV\",\n \"disabled\",\n \"heric\",\n \"ulo\",\n \"geoning\",\n \"awi\",\n \"translation\",\n \"iral\",\n \"govtrack\",\n \"mson\",\n \"cloth\",\n \"nesota\",\n \" Dise\",\n \" Lyme\",\n \" dementia\",\n \"agn\",\n \" reversible\",\n \" susceptibility\",\n \"esthesia\",\n \"orf\",\n \" inflamm\",\n \" Obesity\",\n \" tox\",\n \" Disorders\",\n \"uberty\",\n \"blind\",",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:89-134"
+ },
+ "325": {
+ "file_id": 19,
+ "content": "This code defines a list of examples for token-based few-shot learning in the context of neuron explainers. The examples consist of various tokens related to medical and scientific terms.",
+ "type": "comment"
+ },
+ "326": {
+ "file_id": 19,
+ "content": " \"ALTH\",\n \"avier\",\n \" Immunity\",\n \" Hurt\",\n \"ulet\",\n \"ueless\",\n \" sluggish\",\n \"rosis\",\n ],\n explanation=\" words related to physical medical conditions\",\n ),\n Example(\n tokens=[\n \" January\",\n \"terday\",\n \"cember\",\n \" April\",\n \" July\",\n \"September\",\n \"December\",\n \"Thursday\",\n \"quished\",\n \"November\",\n \"Tuesday\",\n \"uesday\",\n \" Sept\",\n \"ruary\",\n \" March\",\n \";;;;;;;;;;;;\",\n \" Monday\",\n \"Wednesday\",\n \" Saturday\",\n \" Wednesday\",\n \"Reloaded\",\n \"aturday\",\n \" August\",\n \"Feb\",\n \"Sunday\",\n \"Reviewed\",\n \"uggest\",\n \" Dhabi\",\n \"ACTED\",\n \"tten\",\n \"Year\",\n \"August\",\n \"alogue\",\n \"MX\",",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:135-181"
+ },
+ "327": {
+ "file_id": 19,
+ "content": "This code ends the token list for the example explained as words related to physical medical conditions and begins the token list for the next ORIGINAL_EXAMPLES entry, which consists mostly of month and weekday tokens.",
+ "type": "comment"
+ },
+ "328": {
+ "file_id": 19,
+ "content": " \" Janeiro\",\n \"yss\",\n \" Leilan\",\n \" Fiscal\",\n \" referen\",\n \"semb\",\n \"eele\",\n \"wcs\",\n \"detail\",\n \"ertation\",\n \" Reborn\",\n \" Sunday\",\n \"itially\",\n \"aturdays\",\n \" Dise\",\n \"essage\",\n ],\n explanation=\" nouns related to time and dates\",\n ),\n]\nTEST_EXAMPLES = [\n Example(\n tokens=[\n \"these\",\n \" are\",\n \" tokens\",\n ],\n explanation=\" this is a test explanation\",\n ),\n]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:182-212"
+ },
+ "329": {
+ "file_id": 19,
+ "content": "This code finishes the token list and explanation for the time-and-dates example, closing ORIGINAL_EXAMPLES, and then defines TEST_EXAMPLES, which holds a single example with three tokens and a test explanation.",
+ "type": "comment"
+ },
+ "330": {
+ "file_id": 20,
+ "content": "/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py",
+ "type": "filepath"
+ },
+ "331": {
+ "file_id": 20,
+ "content": "Imports FastDataclass and related functions from the fast_dataclasses module, and sets __all__ to include them.",
+ "type": "summary"
+ },
+ "332": {
+ "file_id": 20,
+ "content": "from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass\n__all__ = [\"FastDataclass\", \"dumps\", \"loads\", \"register_dataclass\"]",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py:1-3"
+ },
+ "333": {
+ "file_id": 20,
+ "content": "Imports FastDataclass and related functions from the fast_dataclasses module, and sets __all__ to include them.",
+ "type": "comment"
+ },
+ "334": {
+ "file_id": 21,
+ "content": "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py",
+ "type": "filepath"
+ },
+ "335": {
+ "file_id": 21,
+ "content": "The fast dataclass utility offers fast serialization and deserialization with limited data validation. dumps serializes with orjson using the OPT_SERIALIZE_NUMPY option, register_dataclass registers a dataclass by class name and by field names, and loads uses json.loads with a recursive object hook that rebuilds registered dataclasses inside lists and dictionaries.",
+ "type": "summary"
+ },
+ "336": {
+ "file_id": 21,
+ "content": "# Utilities for dataclasses that are very fast to serialize and deserialize, with limited data\n# validation. Fields must not be tuples, since they get serialized and then deserialized as lists.\n#\n# The unit tests for this library show how to use it.\nimport json\nfrom dataclasses import dataclass, field, fields, is_dataclass\nfrom functools import partial\nfrom typing import Any, Union\nimport orjson\ndataclasses_by_name = {}\ndataclasses_by_fieldnames = {}\n@dataclass\nclass FastDataclass:\n dataclass_name: str = field(init=False)\n def __post_init__(self) -> None:\n self.dataclass_name = self.__class__.__name__\ndef register_dataclass(cls): # type: ignore\n assert is_dataclass(cls), \"Only dataclasses can be registered.\"\n dataclasses_by_name[cls.__name__] = cls\n name_set = frozenset(f.name for f in fields(cls) if f.name != \"dataclass_name\")\n dataclasses_by_fieldnames[name_set] = cls\n return cls\ndef dumps(obj: Any) -> bytes:\n return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY)\ndef _object_hook(d: Any, backwards_compatible: bool = True) -> Any:",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:1-37"
+ },
+ "337": {
+ "file_id": 21,
+ "content": "The code defines a fast dataclass utility that provides efficient serialization and deserialization with limited data validation. Fields must not be tuples, because they are serialized and then deserialized as lists. The header comment points to the library's unit tests for usage examples. dumps serializes with orjson using the OPT_SERIALIZE_NUMPY option, and register_dataclass registers a dataclass under both its class name and its set of field names.",
+ "type": "comment"
+ },
+ "338": {
+ "file_id": 21,
+ "content": " # If d is a list, recurse.\n if isinstance(d, list):\n return [_object_hook(x, backwards_compatible=backwards_compatible) for x in d]\n # If d is not a dict, return it as is.\n if not isinstance(d, dict):\n return d\n cls = None\n if \"dataclass_name\" in d:\n if d[\"dataclass_name\"] in dataclasses_by_name:\n cls = dataclasses_by_name[d[\"dataclass_name\"]]\n else:\n assert backwards_compatible, (\n f\"Dataclass {d['dataclass_name']} not found, set backwards_compatible=True if you \"\n f\"are okay with that.\"\n )\n # Load objects created without dataclass_name set.\n else:\n # Try our best to find a dataclass if backwards_compatible is True.\n if backwards_compatible:\n d_fields = frozenset(d.keys())\n if d_fields in dataclasses_by_fieldnames:\n cls = dataclasses_by_fieldnames[d_fields]\n elif len(d_fields) > 0:\n # Check if the fields are a subset of a dataclass (if the dataclass had extra fields",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:38-61"
+ },
+ "339": {
+ "file_id": 21,
+ "content": "Checks if the input is a list, if so it recursively applies the object hook to each element. If not a list or dict, returns as is. If a dict, tries to find the corresponding dataclass based on either \"dataclass_name\" key or fieldnames, falling back if backwards_compatible is set to True.",
+ "type": "comment"
+ },
+ "340": {
+ "file_id": 21,
+ "content": " # added since the data was created). Note that this will fail if fields were removed\n # from the dataclass.\n for key, possible_cls in dataclasses_by_fieldnames.items():\n if d_fields.issubset(key):\n cls = possible_cls\n break\n else:\n print(f\"Could not find dataclass for {d_fields} {cls}\")\n new_d = {\n k: _object_hook(v, backwards_compatible=backwards_compatible)\n for k, v in d.items()\n if k != \"dataclass_name\"\n }\n if cls is not None:\n return cls(**new_d)\n else:\n return new_d\ndef loads(s: Union[str, bytes], backwards_compatible: bool = True) -> Any:\n return json.loads(\n s,\n object_hook=partial(_object_hook, backwards_compatible=backwards_compatible),\n )",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:62-85"
+ },
+ "341": {
+ "file_id": 21,
+ "content": "The code aims to load and parse JSON data that uses dataclasses. It checks for the compatibility of the loaded data with existing dataclass definitions, then creates a new dataclass instance or a dictionary based on the input.",
+ "type": "comment"
+ },
+ "342": {
+ "file_id": 22,
+ "content": "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py",
+ "type": "filepath"
+ },
+ "343": {
+ "file_id": 22,
+ "content": "The code defines five dataclasses (DataclassA, DataclassB, DataclassC, DataclassC_ext, and DataclassD), registers them with register_dataclass, and tests round-trip serialization with dumps and loads. It also tests deserialization of hand-written JSON with and without backwards_compatible, asserting the resulting type and that a TypeError is raised when fields are unexpected or missing.",
+ "type": "summary"
+ },
+ "344": {
+ "file_id": 22,
+ "content": "from dataclasses import dataclass\nimport pytest\nfrom .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass\n# Inheritance is a bit tricky with our setup. dataclass_name must be set for instances of these\n# classes to serialize and deserialize correctly, but if it's given a default value, then subclasses\n# can't have any fields that don't have default values, because of how constructors are generated\n# for dataclasses (fields with no default value can't follow those with default values). To work\n# around this, we set dataclass_name in __post_init__ on the base class, which is called after the\n# constructor. The implementation does the right thing for both the base class and the subclass.\n@register_dataclass\n@dataclass\nclass DataclassC(FastDataclass):\n ints: list[int]\n@register_dataclass\n@dataclass\nclass DataclassC_ext(DataclassC):\n s: str\n@register_dataclass\n@dataclass\nclass DataclassB(FastDataclass):\n str_to_c: dict[str, DataclassC]\n cs: list[DataclassC]\n@register_dataclass\n@dataclass",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:1-34"
+ },
+ "345": {
+ "file_id": 22,
+ "content": "This code defines three classes, DataclassC, DataclassC_ext, and DataclassB, which inherit from FastDataclass. These classes have fields of different types and are registered using the register_dataclass decorator. The purpose is to enable serialization and deserialization for instances of these classes.",
+ "type": "comment"
+ },
+ "346": {
+ "file_id": 22,
+ "content": "class DataclassA(FastDataclass):\n floats: list[float]\n strings: list[str]\n bs: list[DataclassB]\n@register_dataclass\n@dataclass\nclass DataclassD(FastDataclass):\n s1: str\n s2: str = \"default\"\ndef test_dataclasses() -> None:\n a = DataclassA(\n floats=[1.0, 2.0],\n strings=[\"a\", \"b\"],\n bs=[\n DataclassB(\n str_to_c={\"a\": DataclassC(ints=[1, 2]), \"b\": DataclassC(ints=[3, 4])},\n cs=[DataclassC(ints=[5, 6]), DataclassC_ext(ints=[7, 8], s=\"s\")],\n ),\n DataclassB(\n str_to_c={\"c\": DataclassC_ext(ints=[9, 10], s=\"t\"), \"d\": DataclassC(ints=[11, 12])},\n cs=[DataclassC(ints=[13, 14]), DataclassC(ints=[15, 16])],\n ),\n ],\n )\n assert loads(dumps(a)) == a\ndef test_c_and_c_ext() -> None:\n c_ext = DataclassC_ext(ints=[3, 4], s=\"s\")\n assert loads(dumps(c_ext)) == c_ext\n c = DataclassC(ints=[1, 2])\n assert loads(dumps(c)) == c\ndef test_bad_serialized_data() -> None:\n assert type(loads(dumps(DataclassC(ints=[3, 4])))) == DataclassC",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:35-75"
+ },
+ "347": {
+ "file_id": 22,
+ "content": "- Instantiate a DataclassA object with specified floats, strings, and nested DataclassB objects.\n- Assert that the serialized and deserialized versions of the DataclassA object are equal.\n- Test serialization and deserialization for DataclassC and DataclassC_ext.\n- Test handling of bad serialized data.",
+ "type": "comment"
+ },
+ "348": {
+ "file_id": 22,
+ "content": " assert type(loads('{\"ints\": [3, 4]}', backwards_compatible=False)) == dict\n assert type(loads('{\"ints\": [3, 4], \"dataclass_name\": \"DataclassC\"}')) == DataclassC\n with pytest.raises(TypeError):\n loads('{\"ints\": [3, 4], \"bogus_extra_field\": \"foo\", \"dataclass_name\": \"DataclassC\"}')\n with pytest.raises(TypeError):\n loads('{\"ints_field_is_missing\": [3, 4], \"dataclass_name\": \"DataclassC\"}')\n assert type(loads('{\"s1\": \"test\"}', backwards_compatible=False)) == dict\n assert type(loads('{\"s1\": \"test\"}', backwards_compatible=True)) == DataclassD",
+ "type": "code",
+ "location": "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:76-83"
+ },
+ "349": {
+ "file_id": 22,
+ "content": "The code tests different scenarios for deserializing data using the `loads` function. It asserts that it correctly identifies the data type and raises a TypeError when unexpected fields are present.",
+ "type": "comment"
+ },
+ "350": {
+ "file_id": 23,
+ "content": "/neuron-explainer/setup.py",
+ "type": "filepath"
+ },
+ "351": {
+ "file_id": 23,
+ "content": "This code is a setup script for the \"neuron_explainer\" package. It specifies package name, dependencies, and required Python version.",
+ "type": "summary"
+ },
+ "352": {
+ "file_id": 23,
+ "content": "from setuptools import setup, find_packages\nsetup(\n name=\"neuron_explainer\",\n packages=find_packages(),\n version=\"0.0.1\",\n author=\"OpenAI\",\n install_requires=[\n \"httpx>=0.22\",\n \"scikit-learn\",\n \"boostedblob>=0.13.0\",\n \"tiktoken\",\n \"blobfile\",\n \"numpy\",\n \"pytest\",\n \"orjson\",\n ],\n url=\"\",\n description=\"\",\n python_requires='>=3.9',\n)",
+ "type": "code",
+ "location": "/neuron-explainer/setup.py:1-21"
+ },
+ "353": {
+ "file_id": 23,
+ "content": "This code is a setup script for the \"neuron_explainer\" package. It specifies package name, dependencies, and required Python version.",
+ "type": "comment"
+ },
+ "354": {
+ "file_id": 24,
+ "content": "/neuron-viewer/README.md",
+ "type": "filepath"
+ },
+ "355": {
+ "file_id": 24,
+ "content": "This code is for a neuron viewer, which can be accessed through the public website. It provides an implementation of the website and supports local development with instructions to install and run both backend and frontend.",
+ "type": "summary"
+ },
+ "356": {
+ "file_id": 24,
+ "content": "# Neuron viewer\nThe easiest way to view neurons and explanations is using the\n[public website](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html).\nThis directory contains the implementation of that website as well as lightweight servers that make\nit possible to run an alternative version of the website locally.\n## Local development\nInstall:\n```npm install```\nRun the backend:\n```npm run startpy```\nRun the frontend:\n```npm start```",
+ "type": "code",
+ "location": "/neuron-viewer/README.md:1-20"
+ },
+ "357": {
+ "file_id": 24,
+ "content": "This code is for a neuron viewer, which can be accessed through the public website. It provides an implementation of the website and supports local development with instructions to install and run both backend and frontend.",
+ "type": "comment"
+ },
+ "358": {
+ "file_id": 25,
+ "content": "/neuron-viewer/python/server.py",
+ "type": "filepath"
+ },
+ "359": {
+ "file_id": 25,
+ "content": "The code imports libraries, defines functions for loading JSON data and starting the Flask server. The Flask app is configured with logging, CORS, and an after_request function to allow cross-origin requests. It also includes a main function which can be executed if the script is run directly, accepting optional dev, host_name, and port arguments.",
+ "type": "summary"
+ },
+ "360": {
+ "file_id": 25,
+ "content": "# %%\nimport logging\nfrom flask import Flask, request\nfrom flask_cors import CORS\nimport json\nimport urllib.request\ndef load_az_json(url):\n with urllib.request.urlopen(url) as f:\n return json.load(f)\ndef start(\n dev: bool = False,\n host_name: str = \"0.0.0.0\",\n port: int = 80,\n):\n app = Flask(\"interpretability chat\")\n app.logger.setLevel(logging.INFO)\n # app.logger.disabled = True\n CORS(app)\n @app.after_request\n def after_request(response):\n response.headers.add(\"Access-Control-Allow-Origin\", \"*\")\n response.headers.add(\n \"Access-Control-Allow-Headers\", \"Content-Type,Authorization\"\n )\n response.headers.add(\n \"Access-Control-Allow-Methods\", \"GET,PUT,POST,DELETE,OPTIONS\"\n )\n return response\n @app.route(\"/load_az\", methods=[\"GET\", \"POST\"])\n async def load_az():\n args = request.get_json()\n path = args[\"path\"]\n result = load_az_json(path)\n return result\n app.run(debug=dev, host=host_name, port=port, use_reloader=False)",
+ "type": "code",
+ "location": "/neuron-viewer/python/server.py:1-43"
+ },
+ "361": {
+ "file_id": 25,
+ "content": "Imports necessary libraries and defines functions for loading JSON data and starting the Flask server.\nFlask app is configured with logging, CORS, and after_request function to allow cross-origin requests.",
+ "type": "comment"
+ },
+ "362": {
+ "file_id": 25,
+ "content": "def main(dev: bool = True, host_name: str = \"0.0.0.0\", port: int = 8000):\n start(dev=dev, host_name=host_name, port=port)\nif __name__ == \"__main__\":\n main()",
+ "type": "code",
+ "location": "/neuron-viewer/python/server.py:46-51"
+ },
+ "363": {
+ "file_id": 25,
+ "content": "This code defines a main function and executes it if the script is run directly. It accepts optional boolean dev, string host_name, and int port arguments.",
+ "type": "comment"
+ },
+ "364": {
+ "file_id": 26,
+ "content": "/neuron-viewer/src/App.jsx",
+ "type": "filepath"
+ },
+ "365": {
+ "file_id": 26,
+ "content": "Imports CSS and Feed component, sets up React Router for routing between components.",
+ "type": "summary"
+ },
+ "366": {
+ "file_id": 26,
+ "content": "import \"./App.css\"\nimport Feed from \"./feed\"\nimport React from \"react\"\nimport { Routes, Route, HashRouter } from \"react-router-dom\"\nfunction App() {\n return (\n \n \n } />\n } />\n \n \n )\n}\nexport default App",
+ "type": "code",
+ "location": "/neuron-viewer/src/App.jsx:1-17"
+ },
+ "367": {
+ "file_id": 26,
+ "content": "Imports CSS and Feed component, sets up React Router for routing between components.",
+ "type": "comment"
+ },
+ "368": {
+ "file_id": 27,
+ "content": "/neuron-viewer/src/feed.jsx",
+ "type": "filepath"
+ },
+ "369": {
+ "file_id": 27,
+ "content": "The code sets up a functional component, Feed, which displays either a welcome message or information about the selected neuron depending on whether one is chosen. It checks for the neuron selection in URL parameters and renders additional panes such as explanation, dataset list, top tokens, and similar neurons if an active neuron is present; otherwise, it shows a welcome message.",
+ "type": "summary"
+ },
+ "370": {
+ "file_id": 27,
+ "content": "import * as Panes from \"./panes\"\nimport React, { useEffect } from \"react\"\nimport Welcome from \"./welcome\"\nimport { useState } from \"react\"\nimport { useParams, Link } from \"react-router-dom\"\nexport default function Feed() {\n const params = useParams()\n // If params is missing either index, there's no neuron selected.\n let activeNeuron;\n if (params.layer === undefined || params.neuron === undefined) {\n activeNeuron = null\n } else {\n // Grab the layer and neuron indices from the params, casting them to ints.\n activeNeuron = {\n \"layer\": parseInt(params.layer),\n \"neuron\": parseInt(params.neuron),\n }\n }\n const Pane = ({ children }) => (\n
\n )\n}",
+ "type": "code",
+ "location": "/neuron-viewer/src/feed.jsx:39-64"
+ },
+ "373": {
+ "file_id": 27,
+ "content": "This code defines a functional component that renders a layout for the neuron viewer. If an active neuron is present, it displays explanation, dataset list, top tokens, and similar neurons panes. Otherwise, it shows a welcome message.",
+ "type": "comment"
+ },
+ "374": {
+ "file_id": 28,
+ "content": "/neuron-viewer/src/heatmapGrid.tsx",
+ "type": "filepath"
+ },
+ "375": {
+ "file_id": 28,
+ "content": "This code exports a functional component that takes a 2D array of \"TokenAndActivation\" objects (one inner array per token sequence) and renders a heatmap for each sequence. The sequences are displayed within a block-style div, with each sequence's heatmap displayed inside its own div.",
+ "type": "summary"
+ },
+ "376": {
+ "file_id": 28,
+ "content": "import { TokenAndActivation } from \"./types\"\nimport TokenHeatmap from \"./tokenHeatmap\";\nexport default ({ allTokens }: { allTokens: TokenAndActivation[][]}) => {\n return (\n
\n {allTokens.map((tokens, i) => (\n
\n \n
\n ))}\n
\n );\n};",
+ "type": "code",
+ "location": "/neuron-viewer/src/heatmapGrid.tsx:1-14"
+ },
+ "377": {
+ "file_id": 28,
+ "content": "This code exports a functional component that takes a 2D array of \"TokenAndActivation\" objects (one inner array per token sequence) and renders a heatmap for each sequence. The sequences are displayed within a block-style div, with each sequence's heatmap displayed inside its own div.",
+ "type": "comment"
+ },
+ "378": {
+ "file_id": 29,
+ "content": "/neuron-viewer/src/index.jsx",
+ "type": "filepath"
+ },
+ "379": {
+ "file_id": 29,
+ "content": "This code imports necessary modules and sets up the root element for a React application, which then renders the App component within a strict mode. It also configures performance measurement if desired.",
+ "type": "summary"
+ },
+ "380": {
+ "file_id": 29,
+ "content": "import React from 'react';\nimport ReactDOM from 'react-dom/client';\nimport './index.css';\nimport App from './App';\nimport reportWebVitals from './reportWebVitals';\nconst root = ReactDOM.createRoot(document.getElementById('root'));\nroot.render(\n \n \n \n);\n// If you want to start measuring performance in your app, pass a function\n// to log results (for example: reportWebVitals(console.log))\n// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals\nreportWebVitals();",
+ "type": "code",
+ "location": "/neuron-viewer/src/index.jsx:1-17"
+ },
+ "381": {
+ "file_id": 29,
+ "content": "This code imports necessary modules and sets up the root element for a React application, which then renders the App component within a strict mode. It also configures performance measurement if desired.",
+ "type": "comment"
+ },
+ "382": {
+ "file_id": 30,
+ "content": "/neuron-viewer/src/interpAPI.ts",
+ "type": "filepath"
+ },
+ "383": {
+ "file_id": 30,
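+ "content": "The code defines helpers that load JSON files either directly from Azure Blob Storage or through a local backend, and uses them to fetch explanations, related tokens, neuron records, and the top-connected neurons (as layer-neuron-weight tuples) for a given neuron. A memoization helper is imported but not currently used.",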
+ "type": "summary"
+ },
+ "384": {
+ "file_id": 30,
+ "content": "import {Neuron} from './types';\nimport {memoizeAsync} from \"./utils\"\nexport const load_file_no_cache = async(path: string) => {\n const data = {\n path: path\n }\n const url = new URL(\"/load_az\", window.location.href)\n url.port = '8000';\n return await (\n await fetch(url, {\n method: \"POST\", // or 'PUT'\n headers: {\n \"Content-Type\": \"application/json\",\n },\n body: JSON.stringify(data),\n })\n ).json()\n}\nexport const load_file_az = async(path: string) => {\n const res = (\n await fetch(path, {\n method: \"GET\",\n mode: \"cors\",\n headers: {\n \"Content-Type\": \"application/json\",\n },\n })\n )\n if (!res.ok) {\n console.error(`HTTP error: ${res.status} - ${res.statusText}`);\n return;\n }\n return await res.json()\n}\n// export const load_file = memoizeAsync('load_file', load_file_no_cache)\nexport const load_file = window.location.host.indexOf('localhost:') === -1 ? load_file_az : load_file_no_cache;\n// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation)",
+ "type": "code",
+ "location": "/neuron-viewer/src/interpAPI.ts:1-44"
+ },
+ "385": {
+ "file_id": 30,
+ "content": "This code defines two functions, `load_file_no_cache` and `load_file_az`, for loading data from a file. The first function sends the file path to the `/load_az` endpoint on port 8000 using a POST request with a JSON body. The second function retrieves the file content directly using a GET request in CORS mode, logging an error and returning undefined if the response is not OK. A memoization helper (`memoizeAsync`) is imported, but its only use is commented out. The `load_file` variable is set to `load_file_no_cache` when the page host contains 'localhost:' and to `load_file_az` otherwise.",
+ "type": "comment"
+ },
+ "386": {
+ "file_id": 30,
+ "content": "// const NEURON_RECORDS_PATH = \"az://oaisbills/rcall/oss/migrated_make_crow_datasets/gpt2_xl_n_50000_64_token/neurons\"\nconst NEURON_RECORDS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations\"\n// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation/neurons/explanations/canonical-run-v1)\n// const EXPLANATIONS_PATH = \"az://oaisbills/rcall/oss/migrated_explanation_datasets/canonical_gpt2_xl_all_neurons\"\nconst EXPLANATIONS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/explanations\"\n// weight-based\n// const WHOLE_LAYER_WEIGHT_TOKENS_PATH = \"az://oaidan/rcall/data/interpretability/connections/gpt2-xl/mlp/unnorm_token_representations_uncommon_vanilla\"\n// const WEIGHT_TOKENS_PATH = \"az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/weight-based\"\nconst WEIGHT_TOKENS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based\"",
+ "type": "code",
+ "location": "/neuron-viewer/src/interpAPI.ts:45-55"
+ },
+ "387": {
+ "file_id": 30,
+ "content": "The code defines constants for the paths to neuron records, explanations, and related tokens (weight-based). The previous paths, kept as commented-out lines, were `az://` storage URIs; the active constants now point to the public Azure Blob Storage container at openaipublic.blob.core.windows.net. These paths are used to access the necessary data for interpretation tasks.",
+ "type": "comment"
+ },
+ "388": {
+ "file_id": 30,
+ "content": "// lookup table\n// const WHOLE_LAYER_ACTIVATION_TOKENS_PATH = \"az://oaidan/rcall/data/interpretability/connections/gpt2_xl/mlp/unnorm_token_representations_vanilla_and_common_in_colangv2_unigram\"\n// const ACTIVATION_TOKENS_PATH = \"az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/lookup-table\"\nconst ACTIVATION_TOKENS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based\"\n// const CONNECTIONS_PATH = \"az://oaialignment/datasets/interp/connections/gpt2/neuron_space/incl_attn_False\"\nconst CONNECTIONS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-neurons/weight-based\"\nexport const get_explanations = async (activeNeuron: Neuron) => {\n const result = await load_file(`${EXPLANATIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.jsonl`)\n return result\n}\nexport const get_top_tokens = async (activeNeuron: Neuron, weightType: string) => {\n let TOKENS_PATH;\n if (weightType === 'weight') {\n TOKENS_PATH = WEIGHT_TOKENS_PATH;",
+ "type": "code",
+ "location": "/neuron-viewer/src/interpAPI.ts:56-73"
+ },
+ "389": {
+ "file_id": 30,
+ "content": "This code defines constants for storage locations of lookup table and connection paths, and functions to retrieve explanations and top tokens based on a given neuron and weight type. The code also uses Azure Blob Storage to load JSON files containing explanation data and token representations.",
+ "type": "comment"
+ },
+ "390": {
+ "file_id": 30,
+ "content": " } else if (weightType === 'activation') {\n TOKENS_PATH = ACTIVATION_TOKENS_PATH;\n } else {\n throw new Error(`Invalid weightType: ${weightType}`)\n }\n const result = await load_file(`${TOKENS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)\n return result\n // const result = await load_file_no_cache(`${ORIG_TOKENS_PATH}/${activeNeuron.layer}.json`)\n // return result.neuron_summaries[activeNeuron.neuron]\n}\nexport const get_top_neuron_connections = async (activeNeuron: Neuron) => {\n const result = await load_file(`${CONNECTIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)\n const res: {[key: string]: [number, number]} = {};\n [\"input\", \"output\"].forEach((direction) => {\n const sign = \"positive\" // \"negative\"\n const weight_name: string = {output: \"c_proj\", input: \"c_fc\"}[direction] as string;\n const res_for_dir = result[weight_name];\n if (res_for_dir === null) {\n return\n }\n // let key = 'top_negative_neurons'\n c",
+ "type": "code",
+ "location": "/neuron-viewer/src/interpAPI.ts:74-97"
+ },
+ "391": {
+ "file_id": 30,
+ "content": "Checks the weightType and sets the corresponding TOKENS_PATH for loading neuron data. If an invalid weightType is given, throws an error. Loads and returns the neuron data from the specified file path.",
+ "type": "comment"
+ },
+ "392": {
+ "file_id": 30,
+ "content": "onst top_neuron_strs = res_for_dir[`top_${sign}_neurons`] // {layer}_{neuron} strings for each top-connected neuron\n const top_weights = res_for_dir[`top_${sign}_weights`]\n const top_layer_neuron_tuples = top_neuron_strs.map((neuron_str: string, i: number) => {\n const [layer, neuron] = neuron_str.split(\"_\").map((x: string) => parseInt(x))\n return [layer, neuron, top_weights[i]] as [number, number, number]\n })\n res[direction] = top_layer_neuron_tuples.slice(0, 10)\n })\n return res\n}\nexport const get_neuron_record = async(activeNeuron: Neuron) => {\n const result = await load_file(`${NEURON_RECORDS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)\n return result\n}",
+ "type": "code",
+ "location": "/neuron-viewer/src/interpAPI.ts:97-112"
+ },
+ "393": {
+ "file_id": 30,
+ "content": "This code retrieves the top-connected neurons for a given direction and sign from a result object, maps them to layer, neuron, and weight tuples, and returns the top 10 layer-neuron pairs. It also defines a function `get_neuron_record` that asynchronously loads a JSON file representing a neuron's record based on its layer and neuron ID.",
+ "type": "comment"
+ },
+ "394": {
+ "file_id": 31,
+ "content": "/neuron-viewer/src/panes/datasetList.jsx",
+ "type": "filepath"
+ },
+ "395": {
+ "file_id": 31,
+ "content": "The code imports components, generates sequence lists, fetches and displays data, normalizes sequences of activations, renders them with labels, provides a button for data visibility, and allows users to visualize heatmaps through iterating slices of data.",
+ "type": "summary"
+ },
+ "396": {
+ "file_id": 31,
+ "content": "import HeatmapGrid from \"../heatmapGrid\"\nimport React, { useEffect, useState } from \"react\"\nimport { normalizeTokenActs } from \"../types\"\nimport {get_neuron_record} from \"../interpAPI\"\nfunction zip_sequences(sequences) {\n return sequences.map(({ activations, tokens }) => {\n return tokens.map((token, idx) => ({\n token,\n activation: activations[idx],\n }))\n })\n}\nexport default ({ activeNeuron }) => {\n const [data, setData] = useState(null)\n const [showingMore, setShowingMore] = useState({})\n const [isLoading, setIsLoading] = useState(true)\n useEffect(() => {\n async function fetchData() {\n if (data) {\n return\n }\n const result = await get_neuron_record(activeNeuron)\n console.log(result)\n const all_sequences = []\n all_sequences.push({\n // label: '[0.999, 1] (Top quantile, sorted. 50 of 50000)',\n label: 'Top',\n sequences: zip_sequences(result.most_positive_activation_records),\n default_show: 4,\n })\n all_sequences.push({\n label: 'Quantile range [0.99, 0.999] sample',",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/datasetList.jsx:1-36"
+ },
+ "397": {
+ "file_id": 31,
+ "content": "Importing HeatmapGrid component and React, useState, and useEffect hooks from 'react'. Defining a zip_sequences function that takes in sequences as input. Exporting a default functional component that takes an activeNeuron prop. Inside the component, setting up state variables for data, showingMore, and isLoading using useState hook. Using the useEffect hook to fetch data when the component mounts or if there's a change in the activeNeuron prop. The fetched data is then used to create all_sequences array, which contains objects with label, sequences, and default_show properties.",
+ "type": "comment"
+ },
+ "398": {
+ "file_id": 31,
+ "content": " sequences: zip_sequences(result.random_sample_by_quantile[3]),\n default_show: 1,\n })\n all_sequences.push({\n label: 'Quantile range [0.9, 0.99] sample',\n sequences: zip_sequences(result.random_sample_by_quantile[2]),\n default_show: 1,\n })\n all_sequences.push({\n label: 'Quantile range [0.5, 0.9] sample',\n sequences: zip_sequences(result.random_sample_by_quantile[1]),\n default_show: 1,\n })\n all_sequences.push({\n label: 'Quantile range [0, 0.5] sample',\n sequences: zip_sequences(result.random_sample_by_quantile[0]),\n default_show: 1,\n })\n all_sequences.push({\n // label: '[0, 1] (Random)',\n label: 'Random sample',\n sequences: zip_sequences(result.random_sample),\n default_show: 2,\n })\n // for reference\n // intervals = [(0, 1), (0, 0.5), (0.5, 0.9), (0.9, 0.99), (0.99, 0.999), (0.999, 1)]\n // saved_activations_by_interval = [neuron_record.random_sample] + neuron_record.random_sample_by_decile[:-1] + [neuron_record.top_activations]",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/datasetList.jsx:37-63"
+ },
+ "399": {
+ "file_id": 31,
+ "content": "This code generates a list of sequences for different quantile ranges and a random sample. It adds these sequences to the all_sequences array, which will be used later in the program. The intervals used here are defined as a reference.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/4.json b/docs/data/4.json
new file mode 100644
index 0000000..7c80a0e
--- /dev/null
+++ b/docs/data/4.json
@@ -0,0 +1,541 @@
+{
+ "400": {
+ "file_id": 31,
+ "content": " setData(all_sequences)\n setIsLoading(false)\n }\n fetchData()\n }, [activeNeuron])\n if (isLoading) {\n return (\n
\n >\n )\n}\nexport default ExplanationDisplay",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/explanation.jsx:148-167"
+ },
+ "417": {
+ "file_id": 32,
+ "content": "This code renders two SimulationHeatmap components, one for the top 5 real and simulated sequences, and another for the remaining real and simulated sequences. The toggle value determines if overlay activations should be displayed.",
+ "type": "comment"
+ },
+ "418": {
+ "file_id": 33,
+ "content": "/neuron-viewer/src/panes/index.js",
+ "type": "filepath"
+ },
+ "419": {
+ "file_id": 33,
+ "content": "Importing components from separate files for use in the application.",
+ "type": "summary"
+ },
+ "420": {
+ "file_id": 33,
+ "content": "export { default as TopTokens } from \"./topTokens\"\nexport { default as Explanation } from \"./explanation\"\nexport { default as DatasetList } from \"./datasetList\"\nexport { default as SimilarNeurons } from \"./similarNeurons\"",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/index.js:1-4"
+ },
+ "421": {
+ "file_id": 33,
+ "content": "Importing components from separate files for use in the application.",
+ "type": "comment"
+ },
+ "422": {
+ "file_id": 34,
+ "content": "/neuron-viewer/src/panes/similarNeurons.jsx",
+ "type": "filepath"
+ },
+ "423": {
+ "file_id": 34,
+ "content": "The functional component fetches and displays information about neurons, using state variables and hooks to manage data. It organizes the displayed connections in a visually pleasing format and shows related neurons based on user-selected neuron, fetching similar ones in upstream and downstream sections with a loading animation while data is fetched, showing up to 3 related neurons for each section using NeuronInfo component.",
+ "type": "summary"
+ },
+ "424": {
+ "file_id": 34,
+ "content": "import React, { useEffect, useState } from \"react\"\nimport _ from \"lodash\"\nimport { Link } from \"react-router-dom\"\nimport { get_explanations, get_top_neuron_connections } from \"../interpAPI\"\nfunction NeuronInfo({ neuron, strength }) {\n const [info, setInfo] = useState(null)\n useEffect(() => {\n async function fetchInfo() {\n const result = (await get_explanations({\n layer: neuron.layer,\n neuron: neuron.neuron,\n }))\n setInfo(result)\n }\n if (!info) {\n fetchInfo()\n }\n }, [])\n if (!info) {\n return (\n
\n ",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/similarNeurons.jsx:1-40"
+ },
+ "425": {
+ "file_id": 34,
+ "content": "Importing necessary modules and defining a functional component for displaying neuron information.\nState variable \"info\" is set to null initially using useState hook.\nuseEffect hook is used to fetch the explanation data when the component mounts.\nIf info is not available, a loading indicator is displayed.\nWhen info is available, the neuron information is rendered within a div element.",
+ "type": "comment"
+ },
+ "426": {
+ "file_id": 34,
+ "content": " Neuron {neuron.layer}:{neuron.neuron}\n \n
\n )\n}\nexport default function SimilarNeurons({ activeNeuron: neuron }) {\n const [similarNeurons, setSimilarNeurons] = useState([])\n const [isLoading, setIsLoading] = useState(true)\n useEffect(() => {\n async function fetchSimilarNeurons() {\n const result = await get_top_neuron_connections(neuron)\n setSimilarNeurons(result)\n setIsLoading(false)",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/similarNeurons.jsx:41-72"
+ },
+ "427": {
+ "file_id": 34,
+ "content": "Code snippet displays information about similar neurons, their connection strength and scored explanations for a given activeNeuron. It fetches data using the 'get_top_neuron_connections' function and renders it in a visually formatted way. The state variables 'similarNeurons', 'isLoading' are managed with useState hook, and the useEffect hook is used to fetch similar neurons data when the activeNeuron prop changes.",
+ "type": "comment"
+ },
+ "428": {
+ "file_id": 34,
+ "content": " }\n fetchSimilarNeurons()\n }, [neuron])\n if (isLoading) {\n return (\n
",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/similarNeurons.jsx:73-105"
+ },
+ "429": {
+ "file_id": 34,
+ "content": "Functionality: Displays related neurons based on user-selected neuron\n\nCode explanation:\n- If a user selects a neuron, fetch the similar neurons and display them in two sections - upstream and downstream.\n- Show a loading animation while data is fetched.\n- Display up to 3 related neurons for each section (upstream and downstream).\n- Use NeuronInfo component to represent each displayed neuron.",
+ "type": "comment"
+ },
+ "430": {
+ "file_id": 34,
+ "content": "
\n )\n}",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/similarNeurons.jsx:106-118"
+ },
+ "431": {
+ "file_id": 34,
+ "content": "Rendering a list of downstream neurons for the selected neuron, up to n_show.",
+ "type": "comment"
+ },
+ "432": {
+ "file_id": 35,
+ "content": "/neuron-viewer/src/panes/topTokens.jsx",
+ "type": "filepath"
+ },
+ "433": {
+ "file_id": 35,
+ "content": "The code imports React, fetches token data from an API, renders loading indicator, displays tokens with interactive elements and tooltips, limited to 20 input tokens, and formats the tokens in rounded-full boxes with red text color.",
+ "type": "summary"
+ },
+ "434": {
+ "file_id": 35,
+ "content": "import React, { useState, useEffect } from \"react\"\nimport { get_top_tokens } from \"../interpAPI\"\nconst TokenDisplay = ({ activeNeuron }) => {\n const [isLoading, setIsLoading] = useState(true)\n const [data, setData] = useState(null)\n const loadTokens = async () => {\n setIsLoading(true)\n const weightStrengths = await get_top_tokens(activeNeuron, 'weight')\n const activationStrengths = await get_top_tokens(activeNeuron, 'activation')\n const data = {\n activeNeuron,\n weightStrengths,\n activationStrengths,\n }\n setData(data)\n setIsLoading(false)\n }\n useEffect(() => {\n if (!data) {\n loadTokens()\n }\n }, [])\n return (\n
\n
Related tokens
\n {isLoading ? (\n
\n
loading tokens
\n
\n ) : (\n <>\n
Mean-activation-based
\n
",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/topTokens.jsx:1-41"
+ },
+ "435": {
+ "file_id": 35,
+ "content": "This code imports React along with the useState and useEffect hooks. It defines a TokenDisplay component that fetches related-token data for the activeNeuron prop by calling get_top_tokens from interpAPI twice (once for 'weight' and once for 'activation'). It displays a loading indicator while the data is being fetched and renders mean-activation-based token information when done loading.",
+ "type": "comment"
+ },
+ "436": {
+ "file_id": 35,
+ "content": " {data.activationStrengths.tokens.map((token, idx) => {\n return (\n data.activationStrengths.average_activations[idx] === null ? null :\n \n {token}\n \n )\n })}\n
",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/topTokens.jsx:64-88"
+ },
+ "439": {
+ "file_id": 35,
+ "content": "This code maps over input tokens and displays them with a specific styling. It also handles the strength values of each token for negative inputs, displaying them only if they are not null and showing the title with the respective strength value when hovered over.",
+ "type": "comment"
+ },
+ "440": {
+ "file_id": 35,
+ "content": "
\n )\n}\nexport default TokenDisplay",
+ "type": "code",
+ "location": "/neuron-viewer/src/panes/topTokens.jsx:111-124"
+ },
+ "443": {
+ "file_id": 35,
+ "content": "This code is rendering a div containing tokens in an array. The tokens are displayed within rounded-full text boxes with red text color.",
+ "type": "comment"
+ },
+ "444": {
+ "file_id": 36,
+ "content": "/neuron-viewer/src/reportWebVitals.js",
+ "type": "filepath"
+ },
+ "445": {
+ "file_id": 36,
+ "content": "This code defines a function `reportWebVitals` that, when given a callback function as an argument, uses the `web-vitals` library to measure and report various web vital metrics like CLS, FID, FCP, LCP, TTFB. If no callback or invalid callback is passed, it does nothing.",
+ "type": "summary"
+ },
+ "446": {
+ "file_id": 36,
+ "content": "const reportWebVitals = onPerfEntry => {\n if (onPerfEntry && onPerfEntry instanceof Function) {\n import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {\n getCLS(onPerfEntry);\n getFID(onPerfEntry);\n getFCP(onPerfEntry);\n getLCP(onPerfEntry);\n getTTFB(onPerfEntry);\n });\n }\n };\n export default reportWebVitals;",
+ "type": "code",
+ "location": "/neuron-viewer/src/reportWebVitals.js:1-13"
+ },
+ "447": {
+ "file_id": 36,
+ "content": "This code defines a function `reportWebVitals` that, when given a callback function as an argument, uses the `web-vitals` library to measure and report various web vital metrics like CLS, FID, FCP, LCP, TTFB. If no callback or invalid callback is passed, it does nothing.",
+ "type": "comment"
+ },
+ "448": {
+ "file_id": 37,
+ "content": "/neuron-viewer/src/simulationHeatmap.tsx",
+ "type": "filepath"
+ },
+ "449": {
+ "file_id": 37,
+ "content": "The React component visualizes matching sequences and activations through colored divs or heatmaps, while rendering a div for each simulation in the array.",
+ "type": "summary"
+ },
+ "450": {
+ "file_id": 37,
+ "content": "import React, { useState } from 'react';\nimport { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types'\ntype Props = {\n sequences: TokenAndActivation[][], \n simulated_sequences: TokenAndActivation[][], \n overlay_activations: boolean,\n colors?: Color[], \n boundaries?: number[],\n}\nexport default function SimulationSequences({ sequences, simulated_sequences, overlay_activations, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) {\n return <>\n {\n sequences.map((tokens, i) => {\n let simulated_tokens = simulated_sequences[i];\n if (overlay_activations) {\n return (\n
\n {tokens.map(({ token, activation, normalized_activation }, j) => {\n const { token: simulated_token, activation: simulated_activation, normalized_activation: simulated_normalized_activation } = simulated_tokens[j];",
+ "type": "code",
+ "location": "/neuron-viewer/src/simulationHeatmap.tsx:1-21"
+ },
+ "451": {
+ "file_id": 37,
+ "content": "This code is a React component that takes in two sets of sequences (sequences and simulated_sequences), a required overlay_activations boolean, and optional colors and boundaries props. It maps through each sequence, then each token within the sequence, pairing each token with the corresponding token of the simulated sequence. If overlay_activations is true, it will display both sets of activations in a div element with custom styling.",
+ "type": "comment"
+ },
+ "452": {
+ "file_id": 37,
+ "content": " if (simulated_token !== token) {\n throw new Error('simulated tokens not matching')\n }\n const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);\n const simcolor = getInterpolatedColor(colors, boundaries, simulated_normalized_activation || simulated_activation);\n return
\n )\n}",
+ "type": "code",
+ "location": "/neuron-viewer/src/tokenHeatmap.tsx:24-32"
+ },
+ "465": {
+ "file_id": 38,
+ "content": "Creates a colored span for each token in the input text, using RGB values to determine color intensity.",
+ "type": "comment"
+ },
+ "466": {
+ "file_id": 39,
+ "content": "/neuron-viewer/src/types.ts",
+ "type": "filepath"
+ },
+ "467": {
+ "file_id": 39,
+ "content": "This code defines functions for color interpolation and normalization of activation values. It flattens, scales, and normalizes data using imported types, with a function to interpolate colors between two given colors based on a ratio, and default color and boundary values provided.",
+ "type": "summary"
+ },
+ "468": {
+ "file_id": 39,
+ "content": "import { scaleLinear } from \"d3-scale\"\nimport { min, max, flatten } from \"lodash\"\nexport type Neuron = {\n layer: number;\n neuron: number;\n}\nexport type TokenAndActivation = {\n token: string,\n activation: number\n normalized_activation?: number\n}\nexport type TokenSequence = TokenAndActivation[]\nexport const normalizeTokenActs = (...sequences: TokenSequence[][]) => {\n // console.log('sequences', sequences)\n let flattened: TokenAndActivation[] = flatten(flatten(sequences))\n // Replace all activations less than 0 in data.tokens with 0. This matches the format in the\n // top + random activation records displayed in the main grid.\n flattened = flattened.map(({token, activation}) => {\n return {\n token,\n activation: Math.max(activation, 0)\n }\n })\n const maxActivation = max(flattened.map((ta) => ta.activation)) || 0;\n const neuronScale = scaleLinear()\n // Even though we're only displaying positive activations, we still need to scale in a way that\n // accounts for the existence of negative activations, since our color scale includes them.",
+ "type": "code",
+ "location": "/neuron-viewer/src/types.ts:1-31"
+ },
+ "469": {
+ "file_id": 39,
+ "content": "This code imports necessary functions and defines types for neuron, token and activation data. It then creates a function normalizeTokenActs that takes in multiple sequences of tokens and their activations, flattens them into one array, replaces any negative activations with 0, finds the maximum activation value across all sequences, and scales the data to be between 0 and 1 for colorization purposes.",
+ "type": "comment"
+ },
+ "470": {
+ "file_id": 39,
+ "content": " .domain([0, maxActivation])\n .range([0, 1])\n return sequences.map((seq) => seq.map((tas) => tas.map(({ token, activation }) => ({\n token,\n activation,\n normalized_activation: neuronScale(activation),\n }))))\n}\nexport type Color = {r: number, g: number, b: number};\nexport function interpolateColor(color_l: Color, color_r: Color, value: number) {\n const color = {\n r: Math.round(color_l.r + (color_r.r - color_l.r) * value),\n g: Math.round(color_l.g + (color_r.g - color_l.g) * value),\n b: Math.round(color_l.b + (color_r.b - color_l.b) * value),\n }\n return color\n}\nexport function getInterpolatedColor(colors: Color[], boundaries: number[], value: number) {\n const index = boundaries.findIndex((boundary) => boundary >= value)\n const colorIndex = Math.max(0, index - 1)\n const color_left = colors[colorIndex]\n const color_right = colors[colorIndex + 1]\n const boundary_left = boundaries[colorIndex]\n const boundary_right = boundaries[colorIndex + 1]\n const ratio = (value - boundary_left) / (boundary_right - boundary_left)",
+ "type": "code",
+ "location": "/neuron-viewer/src/types.ts:32-59"
+ },
+ "471": {
+ "file_id": 39,
+ "content": "This code defines functions for color interpolation and normalization of activation values in sequences. It also exports a Color type which represents RGB colors, with each component ranging from 0 to 255. The \"interpolateColor\" function takes three parameters: two colors (left and right) and a value between 0 and 1, representing the position on a gradient between the left and right colors. It calculates the new color based on the interpolation of the RGB components between these two colors. The \"getInterpolatedColor\" function uses color boundaries to determine the appropriate color for a given value by finding the index of the boundary and using it to choose the appropriate color from the predefined colors array.",
+ "type": "comment"
+ },
+ "472": {
+ "file_id": 39,
+ "content": " const color = interpolateColor(color_left, color_right, ratio)\n return color\n}\nexport const DEFAULT_COLORS = [\n // { r: 255, g: 0, b: 105 },\n { r: 255, g: 255, b: 255 },\n { r: 0, g: 255, b: 0 },\n]\nexport const DEFAULT_BOUNDARIES = [\n // 0, 0.5, 1\n 0, 1\n]",
+ "type": "code",
+ "location": "/neuron-viewer/src/types.ts:60-72"
+ },
+ "473": {
+ "file_id": 39,
+ "content": "This code defines a function to interpolate colors between two given colors based on a ratio, and provides default color and boundary values.",
+ "type": "comment"
+ },
+ "474": {
+ "file_id": 40,
+ "content": "/neuron-viewer/src/utils.ts",
+ "type": "filepath"
+ },
+ "475": {
+ "file_id": 40,
+ "content": "1. memoizeAsync: Memoizes asynchronous functions by storing their results in localStorage and returning them if they have already been computed.\n2. getQueryParams: Retrieves URL query parameters from the current window location and returns them as an object.",
+ "type": "summary"
+ },
+ "476": {
+ "file_id": 40,
+ "content": "export const memoizeAsync = (fnname: string, fn: any) => {\n return async (...args: any) => {\n const key = `memoized:${fnname}:${args.map((x: any) => JSON.stringify(x)).join(\"-\")}`\n const val = localStorage.getItem(key);\n if (val === null) {\n const value = await fn(...args)\n localStorage.setItem(key, JSON.stringify(value))\n console.log(`memoized ${fnname}(${args.map((x: any) => JSON.stringify(x)).join(\", \")})`, value)\n return value\n } else {\n // console.log(`parsing`, val)\n return JSON.parse(val)\n }\n }\n}\nexport const getQueryParams = () => {\n const urlParams = new URLSearchParams(window.location.search)\n const params: {[key: string]: any} = {}\n for (const [key, value] of urlParams.entries()) {\n params[key] = value\n }\n return params\n}",
+ "type": "code",
+ "location": "/neuron-viewer/src/utils.ts:1-25"
+ },
+ "477": {
+ "file_id": 40,
+ "content": "1. memoizeAsync: Memoizes asynchronous functions by storing their results in localStorage and returning them if they have already been computed.\n2. getQueryParams: Retrieves URL query parameters from the current window location and returns them as an object.",
+ "type": "comment"
+ },
+ "478": {
+ "file_id": 41,
+ "content": "/neuron-viewer/src/welcome.tsx",
+ "type": "filepath"
+ },
+ "479": {
+ "file_id": 41,
+ "content": "The code provides a NeuronForm component that uses hooks to handle layers and neurons, featuring an array of predefined text classification neurons. The Neuron Viewer tool allows users to view specific details or select neurons randomly.",
+ "type": "summary"
+ },
+ "480": {
+ "file_id": 41,
+ "content": "import { useState, FormEvent } from \"react\"\nimport { useNavigate } from \"react-router-dom\"\nfunction NeuronForm() {\n const [input_layer, setLayer] = useState(0)\n const [input_neuron, setNeuron] = useState(0)\n const navigate = useNavigate()\n const knownGoodNeurons = [\n /**************\n /* well explained + interesting\n ***************/\n {heading: 'Somewhat well explained by GPT-4', layer: 0, neuron: 0, label: ''},\n {layer: 5, neuron: 131, label: \"citations\", description: \"citations, especially biblical and legal\"},\n {layer: 12, neuron: 847, label: \"numbers in fractions\", description: \"numbers in fractions\"}, // \n {layer: 12, neuron: 5820, label: \"short flags\", description: \"single letter command line flags\"}, // \n {layer: 14, neuron: 417, label: \"doing things right\", description: \"words and phrases related to performing actions correctly or properly\"}, // score 0.42\n {layer: 15, neuron: 4538, label: \"leading transitions\", description: \"transition words at the start of documents\"},",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:1-18"
+ },
+ "481": {
+ "file_id": 41,
+ "content": "This code imports React hooks and defines a NeuronForm component that utilizes useState to store the layer and neuron values. It also uses useNavigate from react-router-dom for navigation. The code includes an array of objects representing known good neurons with their respective layers, neurons, labels, and descriptions.",
+ "type": "comment"
+ },
+ "482": {
+ "file_id": 41,
+ "content": " {layer: 17, neuron: 3218, label: \"success\", description: \"expressions of completion or success\"}, // score 0.38\n {layer: 18, neuron: 5302, label: \"X *by*\", description: \"the word 'by' in phrases indicating side by side or sequential events.\"}, // score 0.48\n {layer: 19, neuron: 1377, label: \"similes\", description: \"comparisons and analogies, often using the word 'like'\"}, // score 0.42\n {layer: 21, neuron: 2932, label: \"Canada\", description: \"references to Canadian people, places, and entities\"}, // score 0.78\n {layer: 25, neuron: 2602, label: \"similes\", description: \"descriptive comparisons, especially similes\"}, // score 0.40\n {layer: 25, neuron: 4870, label: \"certainty\", description: \"phrases related to certainty and confidence.\"}, // score 0.37\n {layer: 30, neuron: 28, label: \"times\", description: \"specific times (with hours and minutes)\"}, \n // https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/5/neurons/2326\n {heading: 'Partially explained by GPT-4', layer: 0, neuron: 0, label: ''},",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:19-27"
+ },
+ "483": {
+ "file_id": 41,
+ "content": "This code is a collection of curated neuron entries for the Neuron Viewer welcome page. Each entry in the array includes information about the layer, neuron number, label, and description. The trailing score comments appear to record how well the GPT-4 explanation scored for that neuron, rather than how relevant the neuron is to any particular text.",
+ "type": "comment"
+ },
+ "484": {
+ "file_id": 41,
+ "content": " {layer: 0, neuron: 816, label: \"Marvel comics vibes\", description: \"language and context related to Marvel comics, movies, and characters, as well as other superhero-themed content\"}, // score 0.44\n {layer: 0, neuron: 742, label: \"Second token 'and'\", description: \"'and', 'in', and punctuation at the second token\"},\n {layer: 4, neuron: 4342, label: \"token counter\", description: \"counting repeated occurrences of a token\"},\n {layer: 5, neuron: 2326, label: \"rhymes with 'at'\", description: \"syllables rhyming with 'at', sometimes 'it', 'et', 'ot'\"},\n {layer: 5, neuron: 4492, label: \"leading 'an'\", description: \"sentences that start with 'an'\"}, // score 0.77\n {layer: 6, neuron: 3251, label: \"not all\", description: \"not all\"},\n {layer: 10, neuron: 2851, label: \"leading acronyms\", description: \"acronyms after punctuation or newlines\"},\n {layer: 12, neuron: 2884, label: \"hypothetical had\", description: \"had in hypothetical contexts\"}, // \n {layer: 14, neuron: 3539, label: \"long sequences\", description: \"long sequences of stuff\"},",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:28-36"
+ },
+ "485": {
+ "file_id": 41,
+ "content": "These are individual neuron definitions for various layers in a neural network, each with a specific label and description. The numbers represent unique identifiers for these neurons.",
+ "type": "comment"
+ },
+ "486": {
+ "file_id": 41,
+ "content": " {layer: 14, neuron: 3822, label: \"X by/after *X*\", description: \"noun repetitions separated by 'by' or 'after'\"},\n {layer: 21, neuron: 3982, label: \"any *and* all\", description: \"any/anything *and/&* all/everything\"},\n {layer: 26, neuron: 20, label: \"truth, skin, or sun\", description: \"truth, skin, or sun\"},\n // layer=18&neuron=5302\n /**************\n /* boring\n ***************/\n /**************\n /* poorly explained + interesting\n ***************/\n {heading: 'Poorly explained by GPT-4', layer: 0, neuron: 0, label: ''},\n // Actually activates for negated version “not so much … as” even when not so much is fairly far apart\n // another \"not all\": 13&neuron=1352\n // {layer: 0, neuron: 2823, label: \"Hillary email leak vibes\", description: \"contexts related to Hillary Clinton leaked emails\"}, // score ??\n // {layer: 12, neuron: 3718, label: \"comparative phrases and negations\", description: \"comparative phrases and negations\"}, // score 0.12\n {layer: 13, neuron: 410, label: \"N and N+1\", description: \"a number following its predecessor\"}, // score ??",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:37-52"
+ },
+ "487": {
+ "file_id": 41,
+ "content": "This code represents a collection of layers and neurons with their respective labels and descriptions. The labels and descriptions state what each neuron responds to, such as \"X by/after *X*\", which refers to noun repetitions separated by 'by' or 'after', and \"any *and* all\" for any/anything & all/everything. Banner comments and heading entries group the neurons (for example 'Poorly explained by GPT-4'), and a few entries, such as the one about Hillary Clinton leaked emails, are commented out. The trailing score comments appear to record explanation scores; several are still marked '??'.",
+ "type": "comment"
+ },
+ "488": {
+ "file_id": 41,
+ "content": " {layer: 13, neuron: 979, label: \"subtle plurals\", description: \"subtle/nonobvious plurals\"}, // score ??\n // slash after number 12&neuron=847\n // numbers predicting slash: 14&neuron=92\n // 0&neuron=2823\n {layer: 14, neuron: 1251, label: \"subjunctive verbs\", description: \"verbs in subjunctive mood\"}, // score ??\n {layer: 16, neuron: 518, label: \"pattern breaks\", description: \"tokens that break an established pattern in an ongoing list\"}, // score 0.2 with totally wrong explanation\n {layer: 17, neuron: 821, label: \"idioms\", description: \"idioms\"},\n {layer: 18, neuron: 3481, label: \"post-typo\", description: \"first token following a typo\"}, // score ??\n {layer: 18, neuron: 3552, label: \"repeated text\", description: \"repeated text\"}, // score ??\n // another shared last names: https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/20/neurons/3164\n {layer: 19, neuron: 1763, label: \"shared last names\", description: \"last names when two different people sharing last name are mentioned\"}, // score 0.36",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:53-63"
+ },
+ "489": {
+ "file_id": 41,
+ "content": "Code represents a list of neurons in the Neuron Viewer tool, each with a layer, neuron ID, label, description, and possibly a score comment. The labels name the linguistic patterns or features each neuron appears to respond to (the headings in this list refer to explanations produced by GPT-4). The descriptions provide context on what these neurons represent.",
+ "type": "comment"
+ },
+ "490": {
+ "file_id": 41,
+ "content": " {layer: 20, neuron: 4334, label: \"previous break\", description: \"tokens that previously preceded a linebreak\"}, // score ??\n {layer: 27, neuron: 116, label: \"MTG vibes\", description: \"Magic the Gathering contexts\"}, // score ??\n {layer: 35, neuron: 1523, label: \"NBA name predictor\", description: \"NBA person/player name predictor\"}, // score ??\n // {layer: 36, neuron: 2275, label: \"she predictor\", description: \"prediction of the token 'she'\"}, // score ??\n // {layer: 36, neuron: 5107, label: \"Mormon vibes\", description: \"Mormon related context\"}, // score ??\n // ] predictor 40&neuron=4505\n {layer: 46, neuron: 2181, label: \"C predictor\", description: \"prediction of the token 'C'\"}, // score ??\n ]\n const handleSubmit = (e: FormEvent) => {\n e.preventDefault()\n navigate(`/layers/${input_layer}/neurons/${input_neuron}`)\n return false\n }\n const handleNeuronClick = (layer: number, neuron: number) => {\n navigate(`/layers/${layer}/neurons/${neuron}`)\n }\n const feelingLuckySubmit = () => {",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:64-83"
+ },
+ "491": {
+ "file_id": 41,
+ "content": "These are examples of neurons with their associated labels, descriptions, and potential scores. The handleSubmit function handles form submission to navigate to a specific layer and neuron. The handleNeuronClick function navigates to a specific neuron when clicked.",
+ "type": "comment"
+ },
+ "492": {
+ "file_id": 41,
+ "content": " const layer = Math.floor(Math.random() * 48);\n const neuron = Math.floor(Math.random() * 6400);\n navigate(`/layers/${layer}/neurons/${neuron}`)\n return false\n }\n return (\n
\n )\n}\nexport default NeuronForm",
+ "type": "code",
+ "location": "/neuron-viewer/src/welcome.tsx:145-158"
+ },
+ "497": {
+ "file_id": 41,
+ "content": "This code represents a JSX component called NeuronForm. It displays a set of buttons, each representing a neuron and its associated layer number. The buttons have hover effects for styling and display the description, label, layer, and neuron count for each neuron.",
+ "type": "comment"
+ },
+ "498": {
+ "file_id": 42,
+ "content": "/neuron-viewer/tailwind.config.js",
+ "type": "filepath"
+ },
+ "499": {
+ "file_id": 42,
+ "content": "Configuring Tailwind CSS with content from \"./src/**/*.{html,js,jsx}\" and empty extend and plugins.",
+ "type": "summary"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/5.json b/docs/data/5.json
new file mode 100644
index 0000000..73bdc3a
--- /dev/null
+++ b/docs/data/5.json
@@ -0,0 +1,13 @@
+{
+ "500": {
+ "file_id": 42,
+ "content": "/** @type {import('tailwindcss').Config} */\nmodule.exports = {\n content: [\"./src/**/*.{html,js,jsx}\"],\n theme: {\n extend: {},\n },\n plugins: [],\n}",
+ "type": "code",
+ "location": "/neuron-viewer/tailwind.config.js:1-8"
+ },
+ "501": {
+ "file_id": 42,
+ "content": "Configuring Tailwind CSS with content from \"./src/**/*.{html,js,jsx}\" and empty extend and plugins.",
+ "type": "comment"
+ }
+}
\ No newline at end of file
diff --git a/docs/data/titles/0.json b/docs/data/titles/0.json
new file mode 100644
index 0000000..85fc418
--- /dev/null
+++ b/docs/data/titles/0.json
@@ -0,0 +1,241 @@
+{
+ "/README.md": "Neuron Explanation Tools for Language Models",
+ "/README.md:1-16": "Neuron Explanation Toolkit",
+ "/README.md:18-33": "GPT-2 XL Datasets Overview",
+ "/README.md:34-45": "Azure Blob Storage Data Sources for Neurons and Tokens",
+ "/README.md:46-55": "GPT-2 Neuron Activations Explained",
+ "/README.md:55-76": "GPT-2 Small: Activation Value Differences",
+ "/README.md:77-80": "Curated Neuron Datasets",
+ "/neuron-explainer/README.md": "Neuron Explainer Codebase",
+ "/neuron-explainer/demos/explain_puzzles.py": "Explaining Puzzles with Neuron Explainer",
+ "/neuron-explainer/demos/explain_puzzles.py:1-38": "Explain Puzzles with Neuron Explainer",
+ "/neuron-explainer/demos/explain_puzzles.py:39-44": "One Explanation Generator",
+ "/neuron-explainer/demos/generate_and_score_explanation.py": "Generate and Score Explanations",
+ "/neuron-explainer/demos/generate_and_score_explanation.py:1-33": "Setting Up Neuron Explainer Demo",
+ "/neuron-explainer/demos/generate_and_score_explanation.py:34-65": "Neuron Explanation Generation and Evaluation",
+ "/neuron-explainer/demos/generate_and_score_explanation.py:66-70": "Generate and Score Explanation",
+ "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py": "Generate and Score Token Look-up Table Explanations",
+ "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py:1-31": "Setting Up Explainer Environment",
+ "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py:33-67": "Generate and Score Token Lookup Table Explanation",
+ "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py:68-72": "Generate and Score Token Lookup Table",
+ "/neuron-explainer/neuron_explainer/activations/activation_records.py": "Activation Record Calculator",
+ "/neuron-explainer/neuron_explainer/activations/activation_records.py:1-29": "Activation Record Handling",
+ "/neuron-explainer/neuron_explainer/activations/activation_records.py:120-130": "Non-zero Activation Ratio",
+ "/neuron-explainer/neuron_explainer/activations/activation_records.py:30-53": "Activation Formatter for Neuron Explainer",
+ "/neuron-explainer/neuron_explainer/activations/activation_records.py:54-81": "Activation Record Formatter",
+ "/neuron-explainer/neuron_explainer/activations/activation_records.py:82-119": "Simulation-Ready Activation Record Formatting",
+ "/neuron-explainer/neuron_explainer/activations/activations.py": "Interleaved Activations Dataclasses",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:1-33": "Neuron Activation Dataclasses",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:100-125": "Neuron Activation Class: Random Samples and Properties",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:127-144": "Getting Activation Slices",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:144-166": "Activations Retrieval Methods",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:167-190": "Calibration and Validation Methods",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:191-219": "Explanation Activations Functions",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:220-247": "Loading Neuron Data Functions",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:248-273": "Neuron Data Retriever",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:274-280": "Sorting Numeric Fold Names",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:34-64": "Interleaved Subsets for Activations",
+ "/neuron-explainer/neuron_explainer/activations/activations.py:65-99": "Activation Record Slicing Class",
+ "/neuron-explainer/neuron_explainer/activations/token_connections.py": "Token Activations Lookup",
+ "/neuron-explainer/neuron_explainer/activations/token_connections.py:1-33": "Azure Token-Weight Loader",
+ "/neuron-explainer/neuron_explainer/activations/token_connections.py:34-58": "Token-Based Neuron Activations",
+ "/neuron-explainer/neuron_explainer/activations/token_connections.py:59-59": "Reads and Formats File Contents",
+ "/neuron-explainer/neuron_explainer/api_client.py": "OpenAI API Client with Error Handling and Caching",
+ "/neuron-explainer/neuron_explainer/api_client.py:1-34": "API Error Retry Logic",
+ "/neuron-explainer/neuron_explainer/api_client.py:127-150": "Making OpenAI API Requests with ApiClient",
+ "/neuron-explainer/neuron_explainer/api_client.py:152-152": "Starting Asynchronous Event Loop",
+ "/neuron-explainer/neuron_explainer/api_client.py:35-66": "Exponential Backoff Decorator",
+ "/neuron-explainer/neuron_explainer/api_client.py:67-98": "Retryable OpenAI API Client",
+ "/neuron-explainer/neuron_explainer/api_client.py:99-126": "Async API Client with Caching and Retry",
+ "/neuron-explainer/neuron_explainer/azure.py": "Azure URL Conversion",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py": "Calibrated Neuron Simulators",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:1-27": "Calibrated Neuron Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:105-129": "Calibration for Neuron Simulation",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:130-157": "Linear Regression-Based Calibration",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:158-184": "Calibrated Neuron Simulator through Percentile Matching",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:185-194": "Ensure Calibration before Applying",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:28-53": "Calibrated Neuron Simulator Creation and Calibration",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:54-78": "Calibrated Simulator via Activation Sequences",
+ "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:79-104": "Calibrated Neuron Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py": "AI Model for Explanation Generation",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:1-34": "Neuron Explainer: API-based Explanations",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:102-128": "Subclass-Specific Explanation Generation",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:129-153": "API-Based Neuron Explanation Generator",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:154-177": "Check Prompt Length and Token Limit",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:178-200": "Neuron Explainer Initialization",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:201-217": "Neuron Explainer Setup",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:218-231": "Activation Format and Selection",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:232-250": "Activation Record Optimization",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:251-270": "Activation Record Omitting",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:271-293": "Neuron Explainer Function",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:294-316": "Neuron Explainer Code",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:317-334": "Explanation Listing in Neuron Explainer",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:335-353": "Trimmed Explanations List",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:354-381": "Initializing Explainer Instance",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:37-77": "NeuronExplainer: Explaining Subclass Input Data",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:382-398": "Neuron Explanation Prompt Builder",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:399-418": "Neuron Explainer Prompt Builder",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:419-441": "Neuron Explainer Prompt Addition",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:442-464": "Code for Handling Explanations in Neuron Explainer",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:465-472": "Neuron Explainer: Explanation Parser and Extra Space Remover",
+ "/neuron-explainer/neuron_explainer/explanations/explainer.py:78-101": "Neuron Explainer Code",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py": "Asynchronous Neuron Explanation Loading",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:1-29": "Neuron Explanation Dataclasses",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:102-125": "Evaluation Metrics Class for Neuron Explainer",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:126-156": "Neuron Explainer: Simulation Results and Scoring",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:157-186": "Asynchronous Neuron Explanations",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:187-217": "Asynchronous Explanation Loading",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:218-230": "Sorting Neuron Indices from Explanations",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:30-52": "Neuron Activation Dataclass",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:53-73": "ScoredSequenceSimulation Class",
+ "/neuron-explainer/neuron_explainer/explanations/explanations.py:74-101": "ScoredSimulation Class: Evaluating Neuron Simulations",
+ "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py": "Neuron Explainer's Prompt Builder",
+ "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:1-38": "Prompt Formatting and Message Dictionary",
+ "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:109-118": "Prompt Builder and Formatter",
+ "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:41-66": "PromptBuilder: Creating Custom Prompts",
+ "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:67-85": "Token Counter and Prompt Builder",
+ "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:86-108": "Deep Copy and Role Checking in Prompt Builder",
+ "/neuron-explainer/neuron_explainer/explanations/puzzles.py": "Puzzles Class for Explainers",
+ "/neuron-explainer/neuron_explainer/explanations/puzzles.py:1-28": "Puzzle Class for Ground Truth Explanations",
+ "/neuron-explainer/neuron_explainer/explanations/puzzles.py:29-49": "Puzzle Data Preprocessor",
+ "/neuron-explainer/neuron_explainer/explanations/puzzles.py:50-50": "Assign Puzzles to Names",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py": "Asynchronous Neuron Explainer Scoring",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py:1-34": "Neuron Explanation Scoring Algorithm",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py:110-137": "ScoredSimulation: Evaluating Explanation Prediction Accuracy",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py:138-155": "Asynchronous Scoring Function",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py:35-65": "Scoring Functions for Explanations",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py:66-86": "Calibrated Neuron Simulator Scoring",
+ "/neuron-explainer/neuron_explainer/explanations/scoring.py:87-109": "Correlated Activation Scoring",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py": "Enhanced Neuron Simulation Objects",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:1-33": "Neuron Explainer Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:123-148": "Merging Response Tokens in UTF-8",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:149-168": "Token Split Check Function",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:169-195": "Token Split Warning",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:196-219": "Extracting Prompt Data from Responses",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:220-234": "Checking Response End and Tab Tokens",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:235-256": "Token Activation Stats",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:257-272": "Handling Newline Folded Tokenization in Model",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:273-297": "NeuronSimulator: Abstract Simulation Class",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:298-329": "Neuron Simulator Class for Explanations",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:331-352": "API Request Parser with Assertions",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:34-66": "Simulation Type Enum and Expected Value Function",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:353-373": "Neuron Activation Predictor Code Snippet",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:374-399": "Neuron Simulator Explainer",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:401-426": "Neuron Explainer Simulator: Slow Python Implementation",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:427-450": "Normalizing Token Probabilities",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:452-479": "Sequencing Neuron Activation Logging",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:480-508": "Trimming Activation Records and Prompt Customization",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:509-531": "Neuron Simulator Prompt Generator",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:532-553": "Neuron Behavior Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:554-581": "Enhanced Prompt Builder with Subprompts",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:582-607": "Simulated Activations Parsing",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:608-631": "Token Validation Checker",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:632-648": "Deterministic Activations Appending",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:650-706": "Initialize Simulator Instance",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:67-92": "Compute Expected Value from Normalized Probs",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:707-736": "Simulation Prompt Processing and Storage",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:737-756": "Neuron Simulation with Expected Activations",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:758-774": "Neuron Explanation Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:775-793": "Neuron Explainer Prompt Builder",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:794-798": "Formatted Neuron Explanations Generator",
+ "/neuron-explainer/neuron_explainer/explanations/simulator.py:93-122": "Normalized Log Probabilities Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py": "Neuron Explainer Test with GPT-4 & Harmony V4",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:1-23": "Async Operations Event Loop Setup",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:115-143": "Explainer Prompt Initialization Test",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:143-179": "Neuron Explainer Test Case: Prompt Generation",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:179-222": "GPT-4 Model Token Explanation Generation",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:223-227": "Validation of HarmonyMessage Roles and Contents",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:25-73": "Test Neuron Explainer Format",
+ "/neuron-explainer/neuron_explainer/explanations/test_explainer.py:74-114": "Neuron Behavior in Neural Networks Explained",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py": "Neuron Explainer Test Simulator",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:1-36": "Test Neuron Explainer Prompt Format",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:120-153": "Simulation Prompt Test",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:154-184": "Neuron Explainer Prompt Tester",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:186-229": "Neuron Simulator for Text Activations",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:230-267": "Neuron 3 Simulation for GPT-4",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:268-269": "Message and Prompt Matching",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:37-69": "Neuron Simulator Prompt Generation",
+ "/neuron-explainer/neuron_explainer/explanations/test_simulator.py:70-119": "Test Simulation Prompts in ExplanationNeuronSimulator",
+ "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py": "Token-Based Few-Shot Learning for Neuron Explainers",
+ "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:1-39": "Token-Based Few Shot Examples Class",
+ "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:135-181": "Token-based Few-Shot Learning Examples",
+ "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:182-212": "Token Space Few-Shot Examples",
+ "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:40-88": "Few-Shot Token Space Explanation",
+ "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:89-134": "Few-Shot Token Examples for Neuron Explainers",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py": "Fast Dataclass Import Init",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py": "Efficient Dataclass Serialization with orjson",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:1-37": "Fast, Efficient Dataclass Utility",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:38-61": "FastDataclasses.py: Recursive Object Hook",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:62-85": "Fast Dataclasses JSON Parser",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py": "Testing FastDataclass Serialization and Deserialization",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:1-34": "Fast Dataclasses Serialization Test",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:35-75": "Fast Dataclasses Serialization Test",
+ "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:76-83": "Deserialization Scenarios Testing",
+ "/neuron-explainer/setup.py": "Neuron Explainer Setup",
+ "/neuron-viewer/README.md": "Neuron Viewer: Public Website and Development Guide",
+ "/neuron-viewer/python/server.py": "Flask JSON Server Setup",
+ "/neuron-viewer/python/server.py:1-43": "JSON Loading and Flask Setup",
+ "/neuron-viewer/python/server.py:46-51": "Main Function Executor",
+ "/neuron-viewer/src/App.jsx": "Setting Up Routing in App.jsx",
+ "/neuron-viewer/src/feed.jsx": "Neuron Viewer Feed",
+ "/neuron-viewer/src/feed.jsx:1-38": "Neuron Viewer Feed Setup",
+ "/neuron-viewer/src/feed.jsx:39-64": "Neuron Viewer Layout",
+ "/neuron-viewer/src/heatmapGrid.tsx": "Heatmap Grid Component",
+ "/neuron-viewer/src/index.jsx": "React App Setup",
+ "/neuron-viewer/src/interpAPI.ts": "Top Neuron Retriever",
+ "/neuron-viewer/src/interpAPI.ts:1-44": "File Loading Functions",
+ "/neuron-viewer/src/interpAPI.ts:45-55": "Neuron Records Storage Update",
+ "/neuron-viewer/src/interpAPI.ts:56-73": "Explanation Retrieval Functions",
+ "/neuron-viewer/src/interpAPI.ts:74-97": "Weight-Type Based TOKENS_PATH Loading",
+ "/neuron-viewer/src/interpAPI.ts:97-112": "Top Neuron Connections Extractor",
+ "/neuron-viewer/src/panes/datasetList.jsx": "Neuron Viewer Dataset List",
+ "/neuron-viewer/src/panes/datasetList.jsx:1-36": "Neuron Viewer: Dataset List",
+ "/neuron-viewer/src/panes/datasetList.jsx:37-63": "Generate Sequences for Quantile Ranges",
+ "/neuron-viewer/src/panes/datasetList.jsx:64-94": "Neuron Viewer: Dataset List",
+ "/neuron-viewer/src/panes/datasetList.jsx:95-104": "Heatmap Visualization in Dataset List",
+ "/neuron-viewer/src/panes/explanation.jsx": "Loading Neuron Sequences Viewer",
+ "/neuron-viewer/src/panes/explanation.jsx:1-33": "Neuron Viewer Explanation Setup",
+ "/neuron-viewer/src/panes/explanation.jsx:124-147": "Toggle Overlay Control",
+ "/neuron-viewer/src/panes/explanation.jsx:148-167": "Overlay Simulation Heatmap Comparison",
+ "/neuron-viewer/src/panes/explanation.jsx:34-67": "Neuron Viewer: Explanation Pane",
+ "/neuron-viewer/src/panes/explanation.jsx:69-96": "Explanation Component in React App",
+ "/neuron-viewer/src/panes/explanation.jsx:97-123": "Toggle Switch with Checkbox and Background Color Change",
+ "/neuron-viewer/src/panes/index.js": "Importing Components from Separate Files",
+ "/neuron-viewer/src/panes/similarNeurons.jsx": "Neuron Viewer: Similar Neurons Pane",
+ "/neuron-viewer/src/panes/similarNeurons.jsx:1-40": "Neuron Info Component",
+ "/neuron-viewer/src/panes/similarNeurons.jsx:106-118": "Displaying Neuron Connections",
+ "/neuron-viewer/src/panes/similarNeurons.jsx:41-72": "Similar Neurons Pane",
+ "/neuron-viewer/src/panes/similarNeurons.jsx:73-105": "Display Related Neurons in Upstream and Downstream Sections",
+ "/neuron-viewer/src/panes/topTokens.jsx": "Top Tokens Display",
+ "/neuron-viewer/src/panes/topTokens.jsx:1-41": "TokenDisplay Component: Fetching and Displaying Tokens Data",
+ "/neuron-viewer/src/panes/topTokens.jsx:111-124": "Reduced Top Token Display",
+ "/neuron-viewer/src/panes/topTokens.jsx:42-63": "Interactive Token Strength Display",
+ "/neuron-viewer/src/panes/topTokens.jsx:64-88": "Styled Token Mapper",
+ "/neuron-viewer/src/panes/topTokens.jsx:89-111": "Top 20 Output Positive Tokens",
+ "/neuron-viewer/src/reportWebVitals.js": "Measure and Report Web Vitals in Neuron Viewer",
+ "/neuron-viewer/src/simulationHeatmap.tsx": "Simulation Heatmap Component",
+ "/neuron-viewer/src/simulationHeatmap.tsx:1-21": "Overlaying Activation Heatmap in Neuron Viewer",
+ "/neuron-viewer/src/simulationHeatmap.tsx:22-40": "Simulation Heatmap Matching",
+ "/neuron-viewer/src/simulationHeatmap.tsx:41-64": "Real Activations Heatmap",
+ "/neuron-viewer/src/simulationHeatmap.tsx:65-88": "Simulation Heatmap Visualization",
+ "/neuron-viewer/src/simulationHeatmap.tsx:89-95": "Simulation Heatmap Component",
+ "/neuron-viewer/src/tokenHeatmap.tsx": "TokenHeatmap: React Heatmap for Token Activations",
+ "/neuron-viewer/src/tokenHeatmap.tsx:1-23": "TokenHeatmap: React Component for Token Activation Heatmaps",
+ "/neuron-viewer/src/tokenHeatmap.tsx:24-32": "Token Heatmap Generator",
+ "/neuron-viewer/src/types.ts": "Color Interpolation and Normalization Functions",
+ "/neuron-viewer/src/types.ts:1-31": "Normalize Token-Activation Data",
+ "/neuron-viewer/src/types.ts:32-59": "Color Interpolation Functions",
+ "/neuron-viewer/src/types.ts:60-72": "Color Interpolation Function",
+ "/neuron-viewer/src/utils.ts": "Asynchronous Memoization and Query Params Utility",
+ "/neuron-viewer/src/welcome.tsx": "Neuron Viewer: Intuitive Text Classification",
+ "/neuron-viewer/src/welcome.tsx:1-18": "Neuron Form Component Using React Hooks",
+ "/neuron-viewer/src/welcome.tsx:117-144": "Neuron Viewer App",
+ "/neuron-viewer/src/welcome.tsx:145-158": "Neuron Form Component",
+ "/neuron-viewer/src/welcome.tsx:19-27": "Neuron Explainer Code",
+ "/neuron-viewer/src/welcome.tsx:28-36": "Neuron Viewer: Individual Layer Definitions",
+ "/neuron-viewer/src/welcome.tsx:37-52": "Neuron Viewer Labels",
+ "/neuron-viewer/src/welcome.tsx:53-63": "Neuron Viewer: Decoding GPT-3's Linguistic Patterns",
+ "/neuron-viewer/src/welcome.tsx:64-83": "Neuron Viewer: Navigation and Handling",
+ "/neuron-viewer/src/welcome.tsx:84-116": "Neuron Navigator",
+ "/neuron-viewer/tailwind.config.js": "Tailwind CSS Configuration"
+}
\ No newline at end of file
diff --git a/docs/doc/0298660f-1360-4f0a-a22d-4ae083f5ecaf.json b/docs/doc/0298660f-1360-4f0a-a22d-4ae083f5ecaf.json
new file mode 100644
index 0000000..d5d1919
--- /dev/null
+++ b/docs/doc/0298660f-1360-4f0a-a22d-4ae083f5ecaf.json
@@ -0,0 +1,50 @@
+{
+ "summary": "The code defines dataclasses and an enum for neuron explanations, scores, and simulation results, together with synchronous and asynchronous helpers that load them from JSONL files. The helper get_sorted_neuron_indices_from_explanations retrieves sorted neuron indices by joining the explanation path with the layer number, listing files, filtering numeric filenames, converting to integers, and sorting the list.",
+ "details": [
+ {
+ "comment": "This code defines dataclasses and enums for storing neuron explanations, scores, and related data. It also includes helper functions and handles different activation scales for neurons.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":0-28",
+ "content": "# Dataclasses and enums for storing neuron explanations, their scores, and related data. Also,\n# related helper functions.\nfrom __future__ import annotations\nimport json\nfrom dataclasses import dataclass\nfrom enum import Enum\nfrom typing import List, Optional, Union\nimport blobfile as bf\nimport boostedblob as bbb\nfrom neuron_explainer.activations.activations import NeuronId\nfrom neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass\nclass ActivationScale(str, Enum):\n \"\"\"Which \"units\" are stored in the expected_activations/distribution_values fields of a\n SequenceSimulation.\n This enum identifies whether the values represent real activations of the neuron or something\n else. Different scales are not necessarily related by a linear transformation.\n \"\"\"\n NEURON_ACTIVATIONS = \"neuron_activations\"\n \"\"\"Values represent real activations of the neuron.\"\"\"\n SIMULATED_NORMALIZED_ACTIVATIONS = \"simulated_normalized_activations\"\n \"\"\"\n Values represent simulated activations of the neuron, normalized to the range [0, 10]. This"
+ },
+ {
+ "comment": "This code defines a dataclass for storing the results of simulating neuron activations on a text sequence. It includes the sequence of tokens, expected activation values, scale, and distribution values from the simulation, excluding non-significant tokens.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":29-51",
+ "content": " scale is arbitrary and should not be interpreted as a neuron activation.\n \"\"\"\n@register_dataclass\n@dataclass\nclass SequenceSimulation(FastDataclass):\n \"\"\"The result of a simulation of neuron activations on one text sequence.\"\"\"\n tokens: list[str]\n \"\"\"The sequence of tokens that was simulated.\"\"\"\n expected_activations: list[float]\n \"\"\"Expected value of the possibly-normalized activation for each token in the sequence.\"\"\"\n activation_scale: ActivationScale\n \"\"\"What scale is used for values in the expected_activations field.\"\"\"\n distribution_values: list[list[float]]\n \"\"\"\n For each token in the sequence, a list of values from the discrete distribution of activations\n produced from simulation. Tokens will be included here if and only if they are in the top K=15\n tokens predicted by the simulator, and excluded otherwise.\n May be transformed to another unit by calibration. When we simulate a neuron, we produce a\n discrete distribution with values in the arbitrary discretized space of the neuron, e.g. 10%"
+ },
+ {
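+ "comment": "This code continues the SequenceSimulation dataclass. It finishes the distribution_values docstring with an example of transforming a discrete distribution into real activation units, then declares distribution_probabilities (for each token, the probability of the corresponding value in distribution_values) and the optional uncalibrated_simulation field holding the simulation result before calibration. The chunk ends with the declaration of the ScoredSequenceSimulation dataclass.",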
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":52-72",
+ "content": " chance of 0, 70% chance of 1, 20% chance of 2. Which we store as distribution_values =\n [0, 1, 2], distribution_probabilities = [0.1, 0.7, 0.2]. When we transform the distribution to\n the real activation units, we can correspondingly transform the values of this distribution\n to get a distribution in the units of the neuron. e.g. if the mapping from the discretized space\n to the real activation unit of the neuron is f(x) = x/2, then the distribution becomes 10%\n chance of 0, 70% chance of 0.5, 20% chance of 1. Which we store as distribution_values =\n [0, 0.5, 1], distribution_probabilities = [0.1, 0.7, 0.2].\n \"\"\"\n distribution_probabilities: list[list[float]]\n \"\"\"\n For each token in the sequence, the probability of the corresponding value in\n distribution_values.\n \"\"\"\n uncalibrated_simulation: Optional[\"SequenceSimulation\"] = None\n \"\"\"The result of the simulation before calibration.\"\"\"\n@register_dataclass\n@dataclass\nclass ScoredSequenceSimulation(FastDataclass):"
+ },
+ {
+ "comment": "This code defines the fields of the ScoredSequenceSimulation dataclass, a SequenceSimulation result with a score for that sequence only and ground truth activations. Its fields are simulation, true_activations, ev_correlation_score, and the optional rsquared_score and absolute_dev_explained_score. It then begins the ScoredSimulation dataclass, which holds the result of scoring a neuron simulation on multiple sequences as a list of ScoredSequenceSimulation objects.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":73-100",
+ "content": " \"\"\"\n SequenceSimulation result with a score (for that sequence only) and ground truth activations.\n \"\"\"\n simulation: SequenceSimulation\n \"\"\"The result of a simulation of neuron activations.\"\"\"\n true_activations: List[float]\n \"\"\"Ground truth activations on the sequence (not normalized)\"\"\"\n ev_correlation_score: float\n \"\"\"\n Correlation coefficient between the expected values of the normalized activations from the\n simulation and the unnormalized true activations of the neuron on the text sequence.\n \"\"\"\n rsquared_score: Optional[float] = None\n \"\"\"R^2 of the simulated activations.\"\"\"\n absolute_dev_explained_score: Optional[float] = None\n \"\"\"\n Score based on absolute difference between real and simulated activations.\n absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real))\n \"\"\"\n@register_dataclass\n@dataclass\nclass ScoredSimulation(FastDataclass):\n \"\"\"Result of scoring a neuron simulation on multiple sequences.\"\"\"\n scored_sequence_simulations: List[ScoredSequenceSimulation]"
+ },
+ {
+ "comment": "This code defines a class with three score metrics (ev_correlation_score, rsquared_score, absolute_dev_explained_score) for evaluated sequences and provides a get_preferred_score method to return the preferred score.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":101-124",
+ "content": " \"\"\"ScoredSequenceSimulation for each sequence\"\"\"\n ev_correlation_score: Optional[float] = None\n \"\"\"\n Correlation coefficient between the expected values of the normalized activations from the\n simulation and the unnormalized true activations on a dataset created from all score_results.\n (Note that this is not equivalent to averaging across sequences.)\n \"\"\"\n rsquared_score: Optional[float] = None\n \"\"\"R^2 of the simulated activations.\"\"\"\n absolute_dev_explained_score: Optional[float] = None\n \"\"\"\n Score based on absolute difference between real and simulated activations.\n absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real)).\n \"\"\"\n def get_preferred_score(self) -> Optional[float]:\n \"\"\"\n This method may return None in cases where the score is undefined, for example if the\n normalized activations were all zero, yielding a correlation coefficient of NaN.\n \"\"\"\n return self.ev_correlation_score\n@register_dataclass"
+ },
+ {
+ "comment": "Class representing simulator parameters and scoring results for multiple sequences.\nFunction returns preferred score or None if undefined (e.g., normalized activations all zero).\nClass represents simulation results and scores for a specific neuron.\nFunction loads scored explanations for the specified neuron from given path.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":125-155",
+ "content": "@dataclass\nclass ScoredExplanation(FastDataclass):\n \"\"\"Simulator parameters and the results of scoring it on multiple sequences\"\"\"\n explanation: str\n \"\"\"The explanation used for simulation.\"\"\"\n scored_simulation: ScoredSimulation\n \"\"\"Result of scoring the neuron simulator on multiple sequences.\"\"\"\n def get_preferred_score(self) -> Optional[float]:\n \"\"\"\n This method may return None in cases where the score is undefined, for example if the\n normalized activations were all zero, yielding a correlation coefficient of NaN.\n \"\"\"\n return self.scored_simulation.get_preferred_score()\n@register_dataclass\n@dataclass\nclass NeuronSimulationResults(FastDataclass):\n \"\"\"Simulation results and scores for a neuron.\"\"\"\n neuron_id: NeuronId\n scored_explanations: list[ScoredExplanation]\ndef load_neuron_explanations(\n explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]\n) -> Optional[NeuronSimulationResults]:\n \"\"\"Load scored explanations for the specified neuron.\"\"\""
+ },
+ {
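+ "comment": "1. Finishes the synchronous load_neuron_explanations: returns None if the file does not exist, otherwise parses and returns the first line of the neuron's .jsonl file.\n2. load_neuron_explanations_async loads scored explanations for the specified neuron asynchronously.\n3. read_file reads the contents of the given file asynchronously, printing a message and returning None if the file is not found, and begins collecting the file's non-empty lines.",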
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":156-185",
+ "content": " file = bf.join(explanations_path, str(layer_index), f\"{neuron_index}.jsonl\")\n if not bf.exists(file):\n return None\n with bf.BlobFile(file) as f:\n for line in f:\n return loads(line)\n return None\n@bbb.ensure_session\nasync def load_neuron_explanations_async(\n explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]\n) -> Optional[NeuronSimulationResults]:\n \"\"\"Load scored explanations for the specified neuron, asynchronously.\"\"\"\n return await read_explanation_file(\n bf.join(explanations_path, str(layer_index), f\"{neuron_index}.jsonl\")\n )\n@bbb.ensure_session\nasync def read_file(filename: str) -> Optional[str]:\n \"\"\"Read the contents of the given file as a string, asynchronously.\"\"\"\n try:\n raw_contents = await bbb.read.read_single(filename)\n except FileNotFoundError:\n print(f\"Could not read {filename}\")\n return None\n lines = []\n for line in raw_contents.decode(\"utf-8\").split(\"\\n\"):\n if len(line) > 0:"
+ },
+ {
+ "comment": "- reads explanation file from filename\n- loads scored explanations asynchronously\n- reads the contents of a file as JSON object asynchronously\n- returns names of numbered subdirectories in specified directory",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":186-216",
+ "content": " lines.append(line)\n assert len(lines) == 1, filename\n return lines[0]\n@bbb.ensure_session\nasync def read_explanation_file(explanation_filename: str) -> Optional[NeuronSimulationResults]:\n \"\"\"Load scored explanations from the given filename, asynchronously.\"\"\"\n line = await read_file(explanation_filename)\n return loads(line) if line is not None else None\n@bbb.ensure_session\nasync def read_json_file(filename: str) -> Optional[dict]:\n \"\"\"Read the contents of the given file as a JSON object, asynchronously.\"\"\"\n line = await read_file(filename)\n return json.loads(line) if line is not None else None\ndef get_numerical_subdirs(dataset_path: str) -> list[str]:\n \"\"\"Return the names of all numbered subdirectories in the specified directory.\n Used to get all layer directories in an explanation directory.\n \"\"\"\n return [\n str(x)\n for x in sorted(\n [\n int(x)\n for x in bf.listdir(dataset_path)\n if bf.isdir(bf.join(dataset_path, x)) and x.isnumeric()"
+ },
+ {
+ "comment": "This function retrieves the sorted neuron indices from explanations for a given layer. It does this by joining the explanation path with the layer number, listing all files in that directory, filtering numeric filenames, converting them to integers, and finally sorting the resulting list.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py\":217-229",
+ "content": " ]\n )\n ]\ndef get_sorted_neuron_indices_from_explanations(\n explanations_path: str, layer: Union[str, int]\n) -> list[int]:\n \"\"\"Return the indices of all neurons in this layer, in ascending order.\"\"\"\n layer_dir = bf.join(explanations_path, str(layer))\n return sorted(\n [int(f.split(\".\")[0]) for f in bf.listdir(layer_dir) if f.split(\".\")[0].isnumeric()]\n )"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/05e49fd6-efd9-47c4-8ad3-de010755f1d8.json b/docs/doc/05e49fd6-efd9-47c4-8ad3-de010755f1d8.json
new file mode 100644
index 0000000..c34ce22
--- /dev/null
+++ b/docs/doc/05e49fd6-efd9-47c4-8ad3-de010755f1d8.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This code defines a function `reportWebVitals` that, when given a callback function as an argument, uses the `web-vitals` library to measure and report various web vital metrics like CLS, FID, FCP, LCP, TTFB. If no callback or invalid callback is passed, it does nothing.",
+ "details": [
+ {
+ "comment": "This code defines a function `reportWebVitals` that, when given a callback function as an argument, uses the `web-vitals` library to measure and report various web vital metrics like CLS, FID, FCP, LCP, TTFB. If no callback or invalid callback is passed, it does nothing.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/reportWebVitals.js\":0-12",
+ "content": "const reportWebVitals = onPerfEntry => {\n if (onPerfEntry && onPerfEntry instanceof Function) {\n import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {\n getCLS(onPerfEntry);\n getFID(onPerfEntry);\n getFCP(onPerfEntry);\n getLCP(onPerfEntry);\n getTTFB(onPerfEntry);\n });\n }\n };\n export default reportWebVitals;"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/121605bf-f63c-4008-ae03-7f2ae50d2676.json b/docs/doc/121605bf-f63c-4008-ae03-7f2ae50d2676.json
new file mode 100644
index 0000000..8af7b17
--- /dev/null
+++ b/docs/doc/121605bf-f63c-4008-ae03-7f2ae50d2676.json
@@ -0,0 +1,10 @@
+{
+ "summary": "1. memoizeAsync: Memoizes asynchronous functions by storing their results in localStorage and returning them if they have already been computed.\n2. getQueryParams: Retrieves URL query parameters from the current window location and returns them as an object.",
+ "details": [
+ {
+ "comment": "1. memoizeAsync: Memoizes asynchronous functions by storing their results in localStorage and returning them if they have already been computed.\n2. getQueryParams: Retrieves URL query parameters from the current window location and returns them as an object.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/utils.ts\":0-24",
+ "content": "export const memoizeAsync = (fnname: string, fn: any) => {\n return async (...args: any) => {\n const key = `memoized:${fnname}:${args.map((x: any) => JSON.stringify(x)).join(\"-\")}`\n const val = localStorage.getItem(key);\n if (val === null) {\n const value = await fn(...args)\n localStorage.setItem(key, JSON.stringify(value))\n console.log(`memoized ${fnname}(${args.map((x: any) => JSON.stringify(x)).join(\", \")})`, value)\n return value\n } else {\n // console.log(`parsing`, val)\n return JSON.parse(val)\n }\n }\n}\nexport const getQueryParams = () => {\n const urlParams = new URLSearchParams(window.location.search)\n const params: {[key: string]: any} = {}\n for (const [key, value] of urlParams.entries()) {\n params[key] = value\n }\n return params\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/194382e2-36a3-41c5-a6f7-c76f9b7764ec.json b/docs/doc/194382e2-36a3-41c5-a6f7-c76f9b7764ec.json
new file mode 100644
index 0000000..8e8d874
--- /dev/null
+++ b/docs/doc/194382e2-36a3-41c5-a6f7-c76f9b7764ec.json
@@ -0,0 +1,35 @@
+{
+ "summary": "The code initializes an API client with error handling, response caching for OpenAI requests, and implements exponential backoff for retry mechanisms. It uses HTTPX to make asynchronous requests and starts an event loop in the main function.",
+ "details": [
+ {
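+ "comment": "This code imports the client's dependencies and begins is_api_error, which decides whether a failed request should be retried. For an HTTP status error with status code 400, 404, or 415, it retries only when the error type is idempotency_error; otherwise it treats the request as invalid and does not retry. Any other HTTP status error is retried. Connection errors are also retried, and the chunk ends at the check for timeout errors.",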
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/api_client.py\":0-33",
+ "content": "import asyncio\nimport contextlib\nimport os\nimport random\nimport traceback\nfrom asyncio import Semaphore\nfrom functools import wraps\nfrom typing import Any, Callable, Optional\nimport httpx\nimport orjson\ndef is_api_error(err: Exception) -> bool:\n if isinstance(err, httpx.HTTPStatusError):\n response = err.response\n error_data = response.json().get(\"error\", {})\n error_message = error_data.get(\"message\")\n if response.status_code in [400, 404, 415]:\n if error_data.get(\"type\") == \"idempotency_error\":\n print(f\"Retrying after idempotency error: {error_message} ({response.url})\")\n return True\n else:\n # Invalid request\n return False\n else:\n print(f\"Retrying after API error: {error_message} ({response.url})\")\n return True\n elif isinstance(err, httpx.ConnectError):\n print(f\"Retrying after connection error... ({err.request.url})\")\n return True\n elif isinstance(err, httpx.TimeoutException):"
+ },
+ {
+ "comment": "The code defines a decorator function `exponential_backoff` that retries a wrapped asynchronous function with exponential backoff and jitter after failures. The retry attempts continue until the specified `retry_on` condition returns False or the maximum number of tries is reached. It also prints error messages and stack traces for unexpected errors during the retries.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/api_client.py\":34-65",
+ "content": " print(f\"Retrying after a timeout error... ({err.request.url})\")\n return True\n elif isinstance(err, httpx.ReadError):\n print(f\"Retrying after a read error... ({err.request.url})\")\n return True\n print(f\"Retrying after an unexpected error: {repr(err)}\")\n traceback.print_tb(err.__traceback__)\n return True\ndef exponential_backoff(\n retry_on: Callable[[Exception], bool] = lambda err: True\n) -> Callable[[Callable], Callable]:\n \"\"\"\n Returns a decorator which retries the wrapped function as long as the specified retry_on\n function returns True for the exception, applying exponential backoff with jitter after\n failures, up to a retry limit.\n \"\"\"\n init_delay_s = 1.0\n max_delay_s = 10.0\n # Roughly 30 minutes before we give up.\n max_tries = 200\n backoff_multiplier = 2.0\n jitter = 0.2\n def decorate(f: Callable) -> Callable:\n assert asyncio.iscoroutinefunction(f)\n @wraps(f)\n async def f_retry(*args: Any, **kwargs: Any) -> None:"
+ },
+ {
+ "comment": "Code snippet handles API requests with retry mechanism and error handling. It sets the OpenAI API key, HTTP headers, and base API URL for making requests. The ApiClient class is initialized with a model_name parameter and supports response caching and concurrency limits.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/api_client.py\":66-97",
+ "content": " delay_s = init_delay_s\n for i in range(max_tries):\n try:\n return await f(*args, **kwargs)\n except Exception as err:\n if not retry_on(err) or i == max_tries - 1:\n raise\n jittered_delay = random.uniform(delay_s * (1 - jitter), delay_s * (1 + jitter))\n await asyncio.sleep(jittered_delay)\n delay_s = min(delay_s * backoff_multiplier, max_delay_s)\n return f_retry\n return decorate\nAPI_KEY = os.getenv(\"OPENAI_API_KEY\")\nassert API_KEY, \"Please set the OPENAI_API_KEY environment variable\"\nAPI_HTTP_HEADERS = {\n \"Content-Type\": \"application/json\",\n \"Authorization\": \"Bearer \" + API_KEY,\n}\nBASE_API_URL = \"https://api.openai.com/v1\"\nclass ApiClient:\n \"\"\"Performs inference using the OpenAI API. Supports response caching and concurrency limits.\"\"\"\n def __init__(\n self,\n model_name: str,\n # If set, no more than this number of HTTP requests will be made concurrently."
+ },
+ {
+ "comment": "The code is initializing an API client with optional parameters for maximum concurrent requests, and whether to cache request/response pairs. It also has a method `make_request` which uses exponential backoff retry mechanism when making HTTP calls. If caching is enabled, it checks if the request has been cached before executing the call.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/api_client.py\":98-125",
+ "content": " max_concurrent: Optional[int] = None,\n # Whether to cache request/response pairs in memory to avoid duplicating requests.\n cache: bool = False,\n ):\n self.model_name = model_name\n if max_concurrent is not None:\n self._concurrency_check: Optional[Semaphore] = Semaphore(max_concurrent)\n else:\n self._concurrency_check = None\n if cache:\n self._cache: Optional[dict[str, Any]] = {}\n else:\n self._cache = None\n @exponential_backoff(retry_on=is_api_error)\n async def make_request(\n self, timeout_seconds: Optional[int] = None, **kwargs: Any\n ) -> dict[str, Any]:\n if self._cache is not None:\n key = orjson.dumps(kwargs)\n if key in self._cache:\n return self._cache[key]\n async with contextlib.AsyncExitStack() as stack:\n if self._concurrency_check is not None:\n await stack.enter_async_context(self._concurrency_check)\n http_client = await stack.enter_async_context("
+ },
+ {
+ "comment": "This code is creating an instance of `ApiClient` class and making a request to OpenAI API using the `make_request` method. The request URL depends on whether the input has \"messages\" key or not, and it uses HTTPX client for asynchronous requests. If there's an error in the response, it prints the JSON data then re-raises the exception. If a cache is set, the response JSON will be cached under the specified key.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/api_client.py\":126-149",
+ "content": " httpx.AsyncClient(timeout=timeout_seconds)\n )\n # If the request has a \"messages\" key, it should be sent to the /chat/completions\n # endpoint. Otherwise, it should be sent to the /completions endpoint.\n url = BASE_API_URL + (\"/chat/completions\" if \"messages\" in kwargs else \"/completions\")\n kwargs[\"model\"] = self.model_name\n response = await http_client.post(url, headers=API_HTTP_HEADERS, json=kwargs)\n # The response json has useful information but the exception doesn't include it, so print it\n # out then reraise.\n try:\n response.raise_for_status()\n except Exception as e:\n print(response.json())\n raise e\n if self._cache is not None:\n self._cache[key] = response.json()\n return response.json()\nif __name__ == \"__main__\":\n async def main() -> None:\n client = ApiClient(model_name=\"gpt-3.5-turbo\", max_concurrent=1)\n print(await client.make_request(prompt=\"Why did the chicken cross the road?\", max_tokens=9))"
+ },
+ {
+ "comment": "This code starts an asynchronous event loop and runs the main function.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/api_client.py\":151-151",
+ "content": " asyncio.run(main())"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/1b9f0b08-38ae-4975-b0e3-85674797a4a3.json b/docs/doc/1b9f0b08-38ae-4975-b0e3-85674797a4a3.json
new file mode 100644
index 0000000..21610f3
--- /dev/null
+++ b/docs/doc/1b9f0b08-38ae-4975-b0e3-85674797a4a3.json
@@ -0,0 +1,35 @@
+{
+ "summary": "This code repository contains tools for generating and analyzing neuron explanations in language models, including public datasets in JSON format and data sources for related neurons and tokens. It also addresses GPT-2 model availability and fixes a GELU implementation bug for inference.",
+ "details": [
+ {
+ "comment": "This repository contains code and tools for the Language models can explain neurons in language models paper. It includes a tool for generating, simulating, and scoring explanations of neuron behavior using the methodology described in the paper. Additionally, there's a tool for viewing neuron activations and explanations accessible online.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/README.md\":0-15",
+ "content": "# Automated interpretability\n## Code and tools\nThis repository contains code and tools associated with the [Language models can explain neurons in\nlanguage models](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html) paper, specifically:\n* Code for automatically generating, simulating, and scoring explanations of neuron behavior using\nthe methodology described in the paper. See the\n[neuron-explainer README](neuron-explainer/README.md) for more information.\nNote: if you run into errors of the form \"Error: Could not find any credentials that grant access to storage account: 'openaipublic' and container: 'neuron-explainer'\".\" you might be able to fix this by signing up for an azure account and specifying the credentials as described in the error message. \n* A tool for viewing neuron activations and explanations, accessible\n[here](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). See\nthe [neuron-viewer README](neuron-viewer/README.md) for more information."
+ },
+ {
+ "comment": "This code provides the location and overview of public datasets for GPT-2 XL neurons and explanations. The datasets include neuron activations and explanations in JSON format, with different sets of tokens and activations provided.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/README.md\":17-32",
+ "content": "## Public datasets\nTogether with this code, we're also releasing public datasets of GPT-2 XL neurons and explanations.\nHere's an overview of those datasets. \n* Neuron activations: `az://openaipublic/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json`\n - Tokenized text sequences and their activations for the neuron. We\n provide multiple sets of tokens and activations: top-activating ones, random\n samples from several quantiles; and a completely random sample. We also provide\n some basic statistics for the activations.\n - Each file contains a JSON-formatted\n [`NeuronRecord`](neuron-explainer/neuron_explainer/activations/activations.py#L89) dataclass.\n* Neuron explanations: `az://openaipublic/neuron-explainer/data/explanations/{layer_index}/{neuron_index}.jsonl`\n - Scored model-generated explanations of the behavior of the neuron, including simulation results.\n - Each file contains a JSON-formatted\n [`NeuronSimulationResults`](neuron-explainer/neuron_explainer/explanations/explanations.py#L146)"
+ },
+ {
+ "comment": "This code defines data sources for related neurons and tokens in a model, stored in Azure Blob Storage. The related neurons include upstream and downstream neurons with the most positive and negative connections, as well as tokens with high average activations or large inbound and outbound weights. Each file contains a JSON-formatted dataclass, which is not included in this repository.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/README.md\":33-44",
+ "content": " dataclass.\n* Related neurons: `az://openaipublic/neuron-explainer/data/related-neurons/weight-based/{layer_index}/{neuron_index}.json`\n - Lists of the upstream and downstream neurons with the most positive and negative connections (see below for definition).\n - Each file contains a JSON-formatted dataclass whose definition is not included in this repo.\n* Tokens with high average activations:\n`az://openaipublic/neuron-explainer/data/related-tokens/activation-based/{layer_index}/{neuron_index}.json`\n - Lists of tokens with the highest average activations for individual neurons, and their average activations.\n - Each file contains a JSON-formatted [`TokenLookupTableSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L36)\n dataclass.\n* Tokens with large inbound and outbound weights:\n`az://openaipublic/neuron-explainer/data/related-tokens/weight-based/{layer_index}/{neuron_index}.json`\n - List of the most-positive and most-negative input and output tokens for individual neurons,"
+ },
+ {
+ "comment": "This code provides information about the availability of neuron activations and explanations for GPT-2 models in different sizes. It also mentions updates on the data, including a bug fix related to the GELU implementation used for inference.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/README.md\":45-54",
+ "content": " as well as the associated weight (see below for definition). \n - Each file contains a JSON-formatted [`WeightBasedSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L17)\n dataclass.\nUpdate (July 5, 2023):\nWe also released a set of explanations for GPT-2 Small. The methodology is slightly different from the methodology used for GPT-2 XL so the results aren't directly comparable.\n* Neuron activations: `az://openaipublic/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json`\n* Neuron explanations: `az://openaipublic/neuron-explainer/gpt2_small_data/explanations/{layer_index}/{neuron_index}.jsonl`\nUpdate (August 30, 2023): We recently discovered a bug in how we performed inference on the GPT-2 series models used for the paper and for these datasets. Specifically, we used an optimized GELU implementation rather than the original GELU implementation associated with GPT-2. While the model\u2019s behavior is very similar across "
+ },
+ {
+ "comment": "This code is explaining the difference in activation values between two configurations for GPT-2 small. It also provides a link to understand the model weight conventions and defines connection weights between neurons and tokens. Additionally, it mentions lists of interesting neurons with some preliminary descriptions.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/README.md\":54-75",
+ "content": "these two configurations, the post-MLP activation values we used to generate and simulate explanations differ from the correct values by the following amounts for GPT-2 small:\n- Median: 0.0090\n- 90th percentile: 0.0252\n- 99th percentile: 0.0839\n- 99.9th percentile: 0.1736\n### Definition of connection weights\nRefer to [GPT-2 model code](https://github.com/openai/gpt-2/blob/master/src/model.py) for\nunderstanding of model weight conventions.\n*Neuron-neuron*: For two neurons `(l1, n1)` and `(l2, n2)` with `l1 < l2`, the connection strength is defined as\n`h{l1}.mlp.c_proj.w[:, n1, :] @ diag(h{l2}.ln_2.g) @ h{l2}.mlp.c_fc.w[:, :, n2]`.\n*Neuron-token*: For token `t` and neuron `(l, n)`, the input weight is computed as\n`wte[t, :] @ diag(h{l}.ln_2.g) @ h{l}.mlp.c_fc.w[:, :, n]`\nand the output weight is computed as\n`h{l}.mlp.c_proj.w[:, n, :] @ diag(ln_f.g) @ wte[t, :]`.\n### Misc Lists of Interesting Neurons\nLists of neurons we thought were interesting according to different criteria, with some preliminary descriptions."
+ },
+ {
+ "comment": "These are links to external spreadsheets and documents containing neurons with specific characteristics, such as interesting neurons, high-scoring neurons on random tests, clusters well explained by activation explanation but not by tokens, and neurons sensitive to truncation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/README.md\":76-79",
+ "content": "* [Interesting Neurons (external)](https://docs.google.com/spreadsheets/d/1p7fYs31NU8sJoeKyUx4Mn2laGx8xXfHg_KcIvYiKPpg/edit#gid=0)\n* [Neurons that score high on random, possibly monosemantic? (external)](https://docs.google.com/spreadsheets/d/1TqKFcz-84jyIHLU7VRoTc8BoFBMpbgac-iNBnxVurQ8/edit?usp=sharing)\n* [Clusters of neurons well explained by activation explanation but not by tokens](https://docs.google.com/document/d/1lWhKowpKDdwTMALD_K541cdwgGoQx8DFUSuEe1U2AGE/edit?usp=sharing)\n* [Neurons sensitive to truncation](https://docs.google.com/document/d/1x89TWBvuHcyC2t01EDbJZJ5LQYHozlcS-VUmr5shf_A/edit?usp=sharing)"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/209b69f8-0ea1-4d78-bbce-5dcb2c493c87.json b/docs/doc/209b69f8-0ea1-4d78-bbce-5dcb2c493c87.json
new file mode 100644
index 0000000..19e868b
--- /dev/null
+++ b/docs/doc/209b69f8-0ea1-4d78-bbce-5dcb2c493c87.json
@@ -0,0 +1,45 @@
+{
+ "summary": "The CalibratedNeuronSimulator improves NeuronSimulator with calibration methods, while the LinearCalibratedNeuronSimulator uses flattened activations and true activations for calibration, and PercentileMatchingCalibratedNeuronSimulator ensures distribution matching on the calibration set.",
+ "details": [
+ {
+ "comment": "CalibratedNeuronSimulator class inherits from NeuronSimulator and provides calibration for mapping predicted activation values to real neuron activations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":0-26",
+ "content": "\"\"\"\nCode for calibrating simulations of neuron behavior. Calibration refers to a process of mapping from\na space of predicted activation values (e.g. [0, 10]) to the real activation distribution for a\nneuron.\nSee http://go/neuron_explanation_methodology for description of calibration step. Necessary for\nsimulating neurons in the context of ablate-to-simulation, but can be skipped when using correlation\nscoring. (Calibration may still improve quality for scoring, at least for non-linear calibration\nmethods.)\n\"\"\"\nfrom __future__ import annotations\nimport asyncio\nfrom abc import abstractmethod\nfrom typing import Optional, Sequence\nimport numpy as np\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.explanations.explanations import ActivationScale\nfrom neuron_explainer.explanations.simulator import NeuronSimulator, SequenceSimulation\nfrom sklearn import linear_model\nclass CalibratedNeuronSimulator(NeuronSimulator):\n \"\"\"\n Wrap a NeuronSimulator and calibrate it to map from the predicted activation space to the"
+ },
+ {
+ "comment": "This code defines a class method `create()` and a method `calibrate()` for the `CalibratedNeuronSimulator` class. The `create()` method creates and calibrates a simulator in one call, while the `calibrate()` method determines parameters to map from predicted activation space to real neuron activation space based on a calibration set.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":27-52",
+ "content": " actual neuron activation space.\n \"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n self.uncalibrated_simulator = uncalibrated_simulator\n @classmethod\n async def create(\n cls,\n uncalibrated_simulator: NeuronSimulator,\n calibration_activation_records: Sequence[ActivationRecord],\n ) -> CalibratedNeuronSimulator:\n \"\"\"\n Create and calibrate a calibrated simulator (so initialization and calibration can be done\n in one call).\n \"\"\"\n calibrated_simulator = cls(uncalibrated_simulator)\n await calibrated_simulator.calibrate(calibration_activation_records)\n return calibrated_simulator\n async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:\n \"\"\"\n Determine parameters to map from the predicted activation space to the real neuron\n activation space, based on a calibration set.\n Use when simulated sequences haven't already been produced on the calibration set."
+ },
+ {
+ "comment": "Calibrating simulator by generating flattened activation sequences for both calibration_activation_records and simulations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":53-77",
+ "content": " \"\"\"\n simulations = await asyncio.gather(\n *[\n self.uncalibrated_simulator.simulate(activations.tokens)\n for activations in calibration_activation_records\n ]\n )\n self.calibrate_from_simulations(calibration_activation_records, simulations)\n def calibrate_from_simulations(\n self,\n calibration_activation_records: Sequence[ActivationRecord],\n simulations: Sequence[SequenceSimulation],\n ) -> None:\n \"\"\"\n Determine parameters to map from the predicted activation space to the real neuron\n activation space, based on a calibration set.\n Use when simulated sequences have already been produced on the calibration set.\n \"\"\"\n flattened_activations = []\n flattened_simulated_activations: list[float] = []\n for activations, simulation in zip(calibration_activation_records, simulations):\n flattened_activations.extend(activations.activations)\n flattened_simulated_activations.extend(simulation.expected_activations)"
+ },
+ {
+ "comment": "This code defines a calibrated simulator that can be used to map the predicted activation space of a model to the actual neuron activation space. It contains methods for calibration and applying calibration to sequences of values. The simulate method is also defined, which uses an uncalibrated simulator to obtain expected activations and applies the calibration to obtain the final calibrated activations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":78-103",
+ "content": " self._calibrate_from_flattened_activations(\n np.array(flattened_activations), np.array(flattened_simulated_activations)\n )\n @abstractmethod\n def _calibrate_from_flattened_activations(\n self,\n true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n \"\"\"\n Determine parameters to map from the predicted activation space to the real neuron\n activation space, based on a calibration set.\n Take numpy arrays of all true activations and all uncalibrated activations on the\n calibration set over all sequences.\n \"\"\"\n @abstractmethod\n def apply_calibration(self, values: Sequence[float]) -> list[float]:\n \"\"\"Apply the learned calibration to a sequence of values.\"\"\"\n async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:\n uncalibrated_seq_simulation = await self.uncalibrated_simulator.simulate(tokens)\n calibrated_activations = self.apply_calibration(\n uncalibrated_seq_simulation.expected_activations"
+ },
+ {
+ "comment": "CalibratedNeuronSimulator applies calibration to uncalibrated sequence simulation. UncalibratedNeuronSimulator passes through activations without calibration.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":104-128",
+ "content": " )\n calibrated_distribution_values = [\n self.apply_calibration(dv) for dv in uncalibrated_seq_simulation.distribution_values\n ]\n return SequenceSimulation(\n tokens=uncalibrated_seq_simulation.tokens,\n expected_activations=calibrated_activations,\n activation_scale=ActivationScale.NEURON_ACTIVATIONS,\n distribution_values=calibrated_distribution_values,\n distribution_probabilities=uncalibrated_seq_simulation.distribution_probabilities,\n uncalibrated_simulation=uncalibrated_seq_simulation,\n )\nclass UncalibratedNeuronSimulator(CalibratedNeuronSimulator):\n \"\"\"Pass through the activations without trying to calibrate.\"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n super().__init__(uncalibrated_simulator)\n async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:\n pass\n def _calibrate_from_flattened_activations(\n self,"
+ },
+ {
+ "comment": "This code finishes `UncalibratedNeuronSimulator`, whose `apply_calibration` returns the input values unchanged as a list, and then defines a class `LinearCalibratedNeuronSimulator` that inherits from `CalibratedNeuronSimulator`. It initializes an optional linear regression model (initially `None`) and provides two methods. The method `_calibrate_from_flattened_activations` fits the linear regression model with flattened uncalibrated activations and true activations, and the method `apply_calibration` applies the learned calibration to a given sequence of values.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":129-156",
+ "content": " true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n pass\n def apply_calibration(self, values: Sequence[float]) -> list[float]:\n return values if isinstance(values, list) else list(values)\nclass LinearCalibratedNeuronSimulator(CalibratedNeuronSimulator):\n \"\"\"Find a linear mapping from uncalibrated activations to true activations.\n Should not change ev_correlation_score because it is invariant to linear transformations.\n \"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n super().__init__(uncalibrated_simulator)\n self._regression: Optional[linear_model.LinearRegression] = None\n def _calibrate_from_flattened_activations(\n self,\n true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n self._regression = linear_model.LinearRegression()\n self._regression.fit(uncalibrated_activations.reshape(-1, 1), true_activations)\n def apply_calibration(self, values: Sequence[float]) -> list[float]:"
+ },
+ {
+ "comment": "This code defines a `PercentileMatchingCalibratedNeuronSimulator` class that calibrates a neuron simulator by mapping the nth percentile of uncalibrated activations to the nth percentile of true activations for all n. This will match the distribution of true activations on the calibration set but will be overconfident outside of it. The `__init__` method takes a required `uncalibrated_simulator` and initializes the stored uncalibrated and true activation arrays to `None`, and the `_calibrate_from_flattened_activations` method performs the actual calibration by storing sorted copies of the true activations and uncalibrated activations it receives as inputs.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":157-183",
+ "content": " if self._regression is None:\n raise ValueError(\"Must call calibrate() before apply_calibration\")\n if len(values) == 0:\n return []\n return self._regression.predict(np.reshape(np.array(values), (-1, 1))).tolist()\nclass PercentileMatchingCalibratedNeuronSimulator(CalibratedNeuronSimulator):\n \"\"\"\n Map the nth percentile of the uncalibrated activations to the nth percentile of the true\n activations for all n.\n This will match the distribution of true activations on the calibration set, but will be\n overconfident outside of the calibration set.\n \"\"\"\n def __init__(self, uncalibrated_simulator: NeuronSimulator):\n super().__init__(uncalibrated_simulator)\n self._uncalibrated_activations: Optional[np.ndarray] = None\n self._true_activations: Optional[np.ndarray] = None\n def _calibrate_from_flattened_activations(\n self,\n true_activations: np.ndarray,\n uncalibrated_activations: np.ndarray,\n ) -> None:\n self._uncalibrated_activations = np.sort(uncalibrated_activations)"
+ },
+ {
+ "comment": "Sorting true_activations for calibration and raising ValueError if calibrate() not called before apply_calibration.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py\":184-193",
+ "content": " self._true_activations = np.sort(true_activations)\n def apply_calibration(self, values: Sequence[float]) -> list[float]:\n if self._true_activations is None or self._uncalibrated_activations is None:\n raise ValueError(\"Must call calibrate() before apply_calibration\")\n if len(values) == 0:\n return []\n return np.interp(\n np.array(values), self._uncalibrated_activations, self._true_activations\n ).tolist()"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/274b9e0f-d26d-439e-a0ca-3951909abe03.json b/docs/doc/274b9e0f-d26d-439e-a0ca-3951909abe03.json
new file mode 100644
index 0000000..79fc4a6
--- /dev/null
+++ b/docs/doc/274b9e0f-d26d-439e-a0ca-3951909abe03.json
@@ -0,0 +1,10 @@
+{
+ "summary": "Imports FastDataclass and related functions from the fast_dataclasses module, and sets __all__ to include them.",
+ "details": [
+ {
+ "comment": "Imports FastDataclass and related functions from the fast_dataclasses module, and sets __all__ to include them.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py\":0-2",
+ "content": "from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass\n__all__ = [\"FastDataclass\", \"dumps\", \"loads\", \"register_dataclass\"]"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/2d2ff443-aaa6-4d91-bec6-45c60d479e16.json b/docs/doc/2d2ff443-aaa6-4d91-bec6-45c60d479e16.json
new file mode 100644
index 0000000..483c996
--- /dev/null
+++ b/docs/doc/2d2ff443-aaa6-4d91-bec6-45c60d479e16.json
@@ -0,0 +1,30 @@
+{
+ "summary": "The React component visualizes matching sequences and activations through colored divs or heatmaps, while rendering a div for each simulation in the array.",
+ "details": [
+ {
+ "comment": "This code is a React component that takes in two sets of sequences (sequences and simulated_sequences), along with optional overlay_activations, colors, and boundaries props. It maps through each sequence, then each token within the sequence, comparing the activation values between the original sequence and the simulated one. If overlay_activations is true, it will display both sets of activations in a div element with custom styling.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/simulationHeatmap.tsx\":0-20",
+ "content": "import React, { useState } from 'react';\nimport { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types'\ntype Props = {\n sequences: TokenAndActivation[][], \n simulated_sequences: TokenAndActivation[][], \n overlay_activations: boolean,\n colors?: Color[], \n boundaries?: number[],\n}\nexport default function SimulationSequences({ sequences, simulated_sequences, overlay_activations, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) {\n return <>\n {\n sequences.map((tokens, i) => {\n let simulated_tokens = simulated_sequences[i];\n if (overlay_activations) {\n return (\n
\n {tokens.map(({ token, activation, normalized_activation }, j) => {\n const { token: simulated_token, activation: simulated_activation, normalized_activation: simulated_normalized_activation } = simulated_tokens[j];"
+ },
+ {
+ "comment": "Checking if simulated tokens match and creating colored divs for activation and simulation values.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/simulationHeatmap.tsx\":21-39",
+ "content": " if (simulated_token !== token) {\n throw new Error('simulated tokens not matching')\n }\n const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);\n const simcolor = getInterpolatedColor(colors, boundaries, simulated_normalized_activation || simulated_activation);\n return
"
+ },
+ {
+ "comment": "Renders a div component for each simulation in the simulations array.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/simulationHeatmap.tsx\":88-94",
+ "content": "
\n )\n }\n })\n }\n >\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/3095add8-4383-426b-9b68-ab1a0eb07c51.json b/docs/doc/3095add8-4383-426b-9b68-ab1a0eb07c51.json
new file mode 100644
index 0000000..f78fc0f
--- /dev/null
+++ b/docs/doc/3095add8-4383-426b-9b68-ab1a0eb07c51.json
@@ -0,0 +1,60 @@
+{
+ "summary": "The code defines dataclasses and enums for neuron-indexed activation records, ensures that split slices are disjoint and fully cover the intended range, obtains interleaved subsets for the train, calibration, validation, and test splits, checks neuron existence, fetches neuron data from a file, ensures compatibility with the NeuronRecord dataclass, provides synchronous and asynchronous loading, and retrieves layer folder names in numeric order from the \"neurons\" directory.",
+ "details": [
+ {
+ "comment": "Defines dataclasses and enums for storing information about neuron-indexed activations, along with related helper functions.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":0-32",
+ "content": "# Dataclasses and enums for storing neuron-indexed information about activations. Also, related\n# helper functions.\nimport math\nfrom dataclasses import dataclass, field\nfrom typing import List, Optional, Union\nimport urllib.request\nimport blobfile as bf\nimport boostedblob as bbb\nfrom neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass\nfrom neuron_explainer.azure import standardize_azure_url\n@register_dataclass\n@dataclass\nclass ActivationRecord(FastDataclass):\n \"\"\"Collated lists of tokens and their activations for a single neuron.\"\"\"\n tokens: List[str]\n \"\"\"Tokens in the text sequence, represented as strings.\"\"\"\n activations: List[float]\n \"\"\"Raw activation values for the neuron on each token in the text sequence.\"\"\"\n@register_dataclass\n@dataclass\nclass NeuronId(FastDataclass):\n \"\"\"Identifier for a neuron in an artificial neural network.\"\"\"\n layer_index: int\n \"\"\"The index of layer the neuron is in. The first layer used during inference has index 0.\"\"\"\n neuron_index: int"
+ },
+ {
+ "comment": "This code defines two functions: `_check_slices` and `get_slices_for_splits`.\n- `_check_slices` checks if slices are disjoint and fully cover the intended range.\n- `get_slices_for_splits` gets equal-sized interleaved subsets for a list of splits.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":33-63",
+ "content": " \"\"\"The neuron's index within in its layer. Indices start from 0 in each layer.\"\"\"\ndef _check_slices(\n slices_by_split: dict[str, slice],\n expected_num_values: int,\n) -> None:\n \"\"\"Assert that the slices are disjoint and fully cover the intended range.\"\"\"\n indices = set()\n sum_of_slice_lengths = 0\n n_splits = len(slices_by_split.keys())\n for s in slices_by_split.values():\n subrange = range(expected_num_values)[s]\n sum_of_slice_lengths += len(subrange)\n indices |= set(subrange)\n assert (\n sum_of_slice_lengths == expected_num_values\n ), f\"{sum_of_slice_lengths=} != {expected_num_values=}\"\n stride = n_splits\n expected_indices = set.union(\n *[set(range(start_index, expected_num_values, stride)) for start_index in range(n_splits)]\n )\n assert indices == expected_indices, f\"{indices=} != {expected_indices=}\"\ndef get_slices_for_splits(\n splits: list[str],\n num_activation_records_per_split: int,\n) -> dict[str, slice]:\n \"\"\"\n Get equal-sized interleaved subsets for each of a list of splits, given the number of elements"
+ },
+ {
+ "comment": "This code defines a class for ActivationRecordSliceParams, which specifies how to slice activation records based on the number of examples per split. It also includes a dataclass NeuronRecord that stores neuron-indexed activation data with summary stats and notable activation records.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":64-98",
+ "content": " to include in each split.\n \"\"\"\n stride = len(splits)\n num_activation_records_for_even_splits = num_activation_records_per_split * stride\n slices_by_split = {\n split: slice(split_index, num_activation_records_for_even_splits, stride)\n for split_index, split in enumerate(splits)\n }\n _check_slices(\n slices_by_split=slices_by_split,\n expected_num_values=num_activation_records_for_even_splits,\n )\n return slices_by_split\n@dataclass\nclass ActivationRecordSliceParams:\n \"\"\"How to select splits (train, valid, etc.) of activation records.\"\"\"\n n_examples_per_split: Optional[int]\n \"\"\"The number of examples to include in each split.\"\"\"\n@register_dataclass\n@dataclass\nclass NeuronRecord(FastDataclass):\n \"\"\"Neuron-indexed activation data, including summary stats and notable activation records.\"\"\"\n neuron_id: NeuronId\n \"\"\"Identifier for the neuron.\"\"\"\n random_sample: list[ActivationRecord] = field(default_factory=list)\n \"\"\"\n Random activation records for this neuron. The random sample is independent from those used for"
+ },
+ {
+ "comment": "This code represents a class for neuron activation records. It has attributes for random samples at specific quantiles, quantile boundaries, and moments of the activation values (mean, variance, skewness, kurtosis). Additionally, it includes a list of most positive activation records and a property to return the maximum activation value across all top-activating activation records.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":99-124",
+ "content": " other neurons.\n \"\"\"\n random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None\n \"\"\"\n Random samples of activation records in each of the specified quantiles. None if quantile\n tracking is disabled.\n \"\"\"\n quantile_boundaries: Optional[list[float]] = None\n \"\"\"Boundaries of the quantiles used to generate the random_sample_by_quantile field.\"\"\"\n # Moments of activations\n mean: Optional[float] = math.nan\n variance: Optional[float] = math.nan\n skewness: Optional[float] = math.nan\n kurtosis: Optional[float] = math.nan\n most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)\n \"\"\"\n Activation records with the most positive figure of merit value for this neuron over all dataset\n examples.\n \"\"\"\n @property\n def max_activation(self) -> float:\n \"\"\"Return the maximum activation value over all top-activating activation records.\"\"\"\n return max([max(ar.activations) for ar in self.most_positive_activation_records])"
+ },
+ {
+ "comment": "Code defines two methods, _get_top_activation_slices and _get_random_activation_slices, which return slices for activation records based on specified parameters. These slices are used to select a subset of the activation records for further processing.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":126-143",
+ "content": " def _get_top_activation_slices(\n self, activation_record_slice_params: ActivationRecordSliceParams\n ) -> dict[str, slice]:\n splits = [\"train\", \"calibration\", \"valid\", \"test\"]\n n_examples_per_split = activation_record_slice_params.n_examples_per_split\n if n_examples_per_split is None:\n n_examples_per_split = len(self.most_positive_activation_records) // len(splits)\n assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits)\n return get_slices_for_splits(splits, n_examples_per_split)\n def _get_random_activation_slices(\n self, activation_record_slice_params: ActivationRecordSliceParams\n ) -> dict[str, slice]:\n splits = [\"calibration\", \"valid\", \"test\"]\n n_examples_per_split = activation_record_slice_params.n_examples_per_split\n if n_examples_per_split is None:\n n_examples_per_split = len(self.random_sample) // len(splits)\n # NOTE: this assert could trigger on some ol"
+ },
+ {
+ "comment": "This chunk contains the end of `_get_random_activation_slices`, which asserts that there are enough random samples and then calls `get_slices_for_splits`, followed by two methods.\n\"train_activation_records\" returns the \"train\" slice of the \"most_positive_activation_records\" list.\n\"calibration_activation_records\" begins here; its docstring describes it as the calibration split used for calibrating neuron simulations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":143-165",
+ "content": "d datasets with only 10 random samples, in which case you may have to remove \"test\" from the set of splits\n assert len(self.random_sample) >= n_examples_per_split * len(splits)\n return get_slices_for_splits(splits, n_examples_per_split)\n def train_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Train split, typically used for generating explanations. Consists exclusively of\n top-activating records since context window limitations make it difficult to include\n random records.\n \"\"\"\n return self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"train\"]\n ]\n def calibration_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Calibration split, typically used for calibrating neuron simulations. See"
+ },
+ {
+ "comment": "This code completes \"calibration_activation_records\" and begins \"valid_activation_records\". Both methods return a combination of top-activating records and random records in a 1:1 ratio; the calibration split is used for calibrating neuron simulations and the validation split for evaluating explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":166-189",
+ "content": " http://go/neuron_explanation_methodology for an explanation of calibration. Consists of\n top-activating records and random records in a 1:1 ratio.\n \"\"\"\n return (\n self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"calibration\"]\n ]\n + self.random_sample[\n self._get_random_activation_slices(activation_record_slice_params)[\"calibration\"]\n ]\n )\n def valid_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Validation split, typically used for evaluating explanations, either automatically with\n simulation + correlation coefficient scoring, or manually by humans. Consists of\n top-activating records and random records in a 1:1 ratio.\n \"\"\"\n return (\n self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"valid\"]"
+ },
+ {
+ "comment": "This chunk contains three pieces:\n1. The end of \"valid_activation_records\", which appends the \"valid\" slice of the random sample to the top-activating records.\n2. \"test_activation_records\": returns a list of activation records used for explanation evaluations that can't use the validation split, containing top-activating records and random records in a 1:1 ratio.\n3. \"neuron_exists\": checks if a specified neuron exists based on given dataset path, layer index, and neuron index.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":190-218",
+ "content": " ]\n + self.random_sample[\n self._get_random_activation_slices(activation_record_slice_params)[\"valid\"]\n ]\n )\n def test_activation_records(\n self,\n activation_record_slice_params: ActivationRecordSliceParams,\n ) -> list[ActivationRecord]:\n \"\"\"\n Test split, typically used for explanation evaluations that can't use the validation split.\n Consists of top-activating records and random records in a 1:1 ratio.\n \"\"\"\n return (\n self.most_positive_activation_records[\n self._get_top_activation_slices(activation_record_slice_params)[\"test\"]\n ]\n + self.random_sample[\n self._get_random_activation_slices(activation_record_slice_params)[\"test\"]\n ]\n )\ndef neuron_exists(\n dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]\n) -> bool:\n \"\"\"Return whether the specified neuron exists.\"\"\"\n file = bf.join(dataset_path, \"neurons\", str(layer_index), f\"{neuron_index}.json\")"
+ },
+ {
+ "comment": "This code contains the end of `neuron_exists` and two functions, `load_neuron` and an asynchronous version `load_neuron_async`, that fetch the NeuronRecord for a specified neuron. `load_neuron` constructs the URL from the dataset path, layer index, and neuron index, standardizes it with `standardize_azure_url`, opens the URL, and parses the JSON data. If the parsed data is not a NeuronRecord, it raises a ValueError. The asynchronous version is wrapped in boostedblob's (`bbb`) `ensure_session` decorator and builds the file path with `bf.join`.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":219-246",
+ "content": " return bf.exists(file)\ndef load_neuron(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations\",\n) -> NeuronRecord:\n \"\"\"Load the NeuronRecord for the specified neuron.\"\"\"\n url = \"/\".join([dataset_path, str(layer_index), f\"{neuron_index}.json\"])\n url = standardize_azure_url(url)\n with urllib.request.urlopen(url) as f:\n neuron_record = loads(f.read())\n if not isinstance(neuron_record, NeuronRecord):\n raise ValueError(\n f\"Stored data incompatible with current version of NeuronRecord dataclass.\"\n )\n return neuron_record\n@bbb.ensure_session\nasync def load_neuron_async(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"az://openaipublic/neuron-explainer/data/collated-activations\",\n) -> NeuronRecord:\n \"\"\"Async version of load_neuron.\"\"\"\n file = bf.join(dataset_path, str(layer_index), f\"{neuron_index}.json\")"
+ },
+ {
+ "comment": "This code reads neuron data from a file, checks its compatibility with the NeuronRecord dataclass, and provides functions to get the sorted neuron indices of a layer and the sorted layer indices of a dataset.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":247-272",
+ "content": " return await read_neuron_file(file)\n@bbb.ensure_session\nasync def read_neuron_file(neuron_filename: str) -> NeuronRecord:\n \"\"\"Like load_neuron_async, but takes a raw neuron filename.\"\"\"\n raw_contents = await bbb.read.read_single(neuron_filename)\n neuron_record = loads(raw_contents.decode(\"utf-8\"))\n if not isinstance(neuron_record, NeuronRecord):\n raise ValueError(\n f\"Stored data incompatible with current version of NeuronRecord dataclass.\"\n )\n return neuron_record\ndef get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:\n \"\"\"Returns the indices of all neurons in this layer, in ascending order.\"\"\"\n layer_dir = bf.join(dataset_path, \"neurons\", str(layer_index))\n return sorted(\n [int(f.split(\".\")[0]) for f in bf.listdir(layer_dir) if f.split(\".\")[0].isnumeric()]\n )\ndef get_sorted_layers(dataset_path: str) -> List[str]:\n \"\"\"\n Return the indices of all layers in this dataset, in ascending numerical order, as strings."
+ },
+ {
+ "comment": "Lists the numeric folder names in the \"neurons\" directory, sorts them numerically, and returns them as strings.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activations.py\":273-279",
+ "content": " \"\"\"\n return [\n str(x)\n for x in sorted(\n [int(x) for x in bf.listdir(bf.join(dataset_path, \"neurons\")) if x.isnumeric()]\n )\n ]"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/33daeebe-812e-4081-a997-e168abc9e262.json b/docs/doc/33daeebe-812e-4081-a997-e168abc9e262.json
new file mode 100644
index 0000000..fee2043
--- /dev/null
+++ b/docs/doc/33daeebe-812e-4081-a997-e168abc9e262.json
@@ -0,0 +1,30 @@
+{
+ "summary": "The code introduces a class for few-shot examples and an enum of example sets to assist neuron explainers in token-space few-shot prompting. It defines two lists of examples: the original list, whose three examples cover adjectives related to being real or to physical properties, words related to physical medical conditions, and nouns related to time and dates, and a test list containing a single three-token example. Each example has an associated explanation.",
+ "details": [
+ {
+ "comment": "This code defines a class for token-based few shot examples and their sets. It also contains methods to get the examples based on the example set specified.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py\":0-38",
+ "content": "from dataclasses import dataclass\nfrom enum import Enum\nfrom typing import List\nfrom neuron_explainer.fast_dataclasses import FastDataclass\n@dataclass\nclass Example(FastDataclass):\n \"\"\"\n An example list of tokens as strings corresponding to top token space inputs of a neuron, with a\n string explanation of the neuron's behavior on these tokens.\n \"\"\"\n tokens: List[str]\n explanation: str\nclass TokenSpaceFewShotExampleSet(Enum):\n \"\"\"Determines which few-shot examples to use when sampling explanations.\"\"\"\n ORIGINAL = \"original\"\n TEST = \"test\"\n def get_examples(self) -> list[Example]:\n \"\"\"Returns regular examples for use in a few-shot prompt.\"\"\"\n if self is TokenSpaceFewShotExampleSet.ORIGINAL:\n return ORIGINAL_EXAMPLES\n elif self is TokenSpaceFewShotExampleSet.TEST:\n return TEST_EXAMPLES\n else:\n raise ValueError(f\"Unhandled example set: {self}\")\nORIGINAL_EXAMPLES = [\n Example(\n tokens=[\n \"actual\",\n \" literal\","
+ },
+ {
+ "comment": "This chunk lists the remaining tokens of the first example in ORIGINAL_EXAMPLES, a list of top token-space inputs of a neuron that the example explains as adjectives related to being real, or to physical properties and evidence.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py\":39-87",
+ "content": " \" actual\",\n \" hyper\",\n \" real\",\n \" EX\",\n \" Real\",\n \"^\",\n \"Full\",\n \" full\",\n \" optical\",\n \" style\",\n \"any\",\n \"ALL\",\n \"extreme\",\n \" miniature\",\n \" Optical\",\n \" faint\",\n \"~\",\n \" Physical\",\n \" REAL\",\n \"*\",\n \"virtual\",\n \"TYPE\",\n \" technical\",\n \"otally\",\n \" physic\",\n \"Type\",\n \"<\",\n \"images\",\n \"atic\",\n \" sheer\",\n \" Style\",\n \" partial\",\n \" natural\",\n \"Hyper\",\n \" Any\",\n \" theoretical\",\n \"|\",\n \" ultimate\",\n \"oing\",\n \" constant\",\n \"ANY\",\n \"antically\",\n \"ishly\",\n \" ex\",\n \" visual\",\n \"special\",\n \"omorphic\",\n \"visual\",\n ],"
+ },
+ {
+ "comment": "This code defines a list of examples for token-based few-shot learning in the context of neuron explainers. The examples consist of various tokens related to medical and scientific terms.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py\":88-133",
+ "content": " explanation=\" adjectives related to being real, or to physical properties and evidence\",\n ),\n Example(\n tokens=[\n \"cephal\",\n \"aeus\",\n \" coma\",\n \"bered\",\n \"abetes\",\n \"inflamm\",\n \"rugged\",\n \"alysed\",\n \"azine\",\n \"hered\",\n \"cells\",\n \"aneously\",\n \"fml\",\n \"igm\",\n \"culosis\",\n \"iani\",\n \"CTV\",\n \"disabled\",\n \"heric\",\n \"ulo\",\n \"geoning\",\n \"awi\",\n \"translation\",\n \"iral\",\n \"govtrack\",\n \"mson\",\n \"cloth\",\n \"nesota\",\n \" Dise\",\n \" Lyme\",\n \" dementia\",\n \"agn\",\n \" reversible\",\n \" susceptibility\",\n \"esthesia\",\n \"orf\",\n \" inflamm\",\n \" Obesity\",\n \" tox\",\n \" Disorders\",\n \"uberty\",\n \"blind\","
+ },
+ {
+ "comment": "This code is defining example sentences for few-shot learning in the token space, with tokens related to physical medical conditions and dates.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py\":134-180",
+ "content": " \"ALTH\",\n \"avier\",\n \" Immunity\",\n \" Hurt\",\n \"ulet\",\n \"ueless\",\n \" sluggish\",\n \"rosis\",\n ],\n explanation=\" words related to physical medical conditions\",\n ),\n Example(\n tokens=[\n \" January\",\n \"terday\",\n \"cember\",\n \" April\",\n \" July\",\n \"September\",\n \"December\",\n \"Thursday\",\n \"quished\",\n \"November\",\n \"Tuesday\",\n \"uesday\",\n \" Sept\",\n \"ruary\",\n \" March\",\n \";;;;;;;;;;;;\",\n \" Monday\",\n \"Wednesday\",\n \" Saturday\",\n \" Wednesday\",\n \"Reloaded\",\n \"aturday\",\n \" August\",\n \"Feb\",\n \"Sunday\",\n \"Reviewed\",\n \"uggest\",\n \" Dhabi\",\n \"ACTED\",\n \"tten\",\n \"Year\",\n \"August\",\n \"alogue\",\n \"MX\","
+ },
+ {
+ "comment": "This code defines two lists of token examples for testing the token space few shot explanation function. The first list contains tokens related to time and dates, and the second one is a test example with three tokens. Each example has an associated explanation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py\":181-211",
+ "content": " \" Janeiro\",\n \"yss\",\n \" Leilan\",\n \" Fiscal\",\n \" referen\",\n \"semb\",\n \"eele\",\n \"wcs\",\n \"detail\",\n \"ertation\",\n \" Reborn\",\n \" Sunday\",\n \"itially\",\n \"aturdays\",\n \" Dise\",\n \"essage\",\n ],\n explanation=\" nouns related to time and dates\",\n ),\n]\nTEST_EXAMPLES = [\n Example(\n tokens=[\n \"these\",\n \" are\",\n \" tokens\",\n ],\n explanation=\" this is a test explanation\",\n ),\n]"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/35f00687-c1f7-48e9-a0d5-459971018df8.json b/docs/doc/35f00687-c1f7-48e9-a0d5-459971018df8.json
new file mode 100644
index 0000000..da36d4a
--- /dev/null
+++ b/docs/doc/35f00687-c1f7-48e9-a0d5-459971018df8.json
@@ -0,0 +1,20 @@
+{
+ "summary": "This code defines functions for color interpolation and normalization of activation values. It flattens, scales, and normalizes data using imported types, with a function to interpolate colors between two given colors based on a ratio, and default color and boundary values provided.",
+ "details": [
+ {
+ "comment": "This code imports necessary functions and defines types for neuron, token and activation data. It then creates a function normalizeTokenActs that takes in multiple sequences of tokens and their activations, flattens them into one array, replaces any negative activations with 0, finds the maximum activation value across all sequences, and scales the data to be between 0 and 1 for colorization purposes.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/types.ts\":0-30",
+ "content": "import { scaleLinear } from \"d3-scale\"\nimport { min, max, flatten } from \"lodash\"\nexport type Neuron = {\n layer: number;\n neuron: number;\n}\nexport type TokenAndActivation = {\n token: string,\n activation: number\n normalized_activation?: number\n}\nexport type TokenSequence = TokenAndActivation[]\nexport const normalizeTokenActs = (...sequences: TokenSequence[][]) => {\n // console.log('sequences', sequences)\n let flattened: TokenAndActivation[] = flatten(flatten(sequences))\n // Replace all activations less than 0 in data.tokens with 0. This matches the format in the\n // top + random activation records displayed in the main grid.\n flattened = flattened.map(({token, activation}) => {\n return {\n token,\n activation: Math.max(activation, 0)\n }\n })\n const maxActivation = max(flattened.map((ta) => ta.activation)) || 0;\n const neuronScale = scaleLinear()\n // Even though we're only displaying positive activations, we still need to scale in a way that\n // accounts for the existence of negative activations, since our color scale includes them."
+ },
+ {
+ "comment": "This code defines functions for color interpolation and normalization of activation values in sequences. It also exports a Color type which represents RGB colors, with each component ranging from 0 to 255. The \"interpolateColor\" function takes three parameters: two colors (left and right) and a value between 0 and 1, representing the position on a gradient between the left and right colors. It calculates the new color based on the interpolation of the RGB components between these two colors. The \"getInterpolatedColor\" function uses color boundaries to determine the appropriate color for a given value by finding the index of the boundary and using it to choose the appropriate color from the predefined colors array.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/types.ts\":31-58",
+ "content": " .domain([0, maxActivation])\n .range([0, 1])\n return sequences.map((seq) => seq.map((tas) => tas.map(({ token, activation }) => ({\n token,\n activation,\n normalized_activation: neuronScale(activation),\n }))))\n}\nexport type Color = {r: number, g: number, b: number};\nexport function interpolateColor(color_l: Color, color_r: Color, value: number) {\n const color = {\n r: Math.round(color_l.r + (color_r.r - color_l.r) * value),\n g: Math.round(color_l.g + (color_r.g - color_l.g) * value),\n b: Math.round(color_l.b + (color_r.b - color_l.b) * value),\n }\n return color\n}\nexport function getInterpolatedColor(colors: Color[], boundaries: number[], value: number) {\n const index = boundaries.findIndex((boundary) => boundary >= value)\n const colorIndex = Math.max(0, index - 1)\n const color_left = colors[colorIndex]\n const color_right = colors[colorIndex + 1]\n const boundary_left = boundaries[colorIndex]\n const boundary_right = boundaries[colorIndex + 1]\n const ratio = (value - boundary_left) / (boundary_right - boundary_left)"
+ },
+ {
+ "comment": "This code defines a function to interpolate colors between two given colors based on a ratio, and provides default color and boundary values.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/types.ts\":59-71",
+ "content": " const color = interpolateColor(color_left, color_right, ratio)\n return color\n}\nexport const DEFAULT_COLORS = [\n // { r: 255, g: 0, b: 105 },\n { r: 255, g: 255, b: 255 },\n { r: 0, g: 255, b: 0 },\n]\nexport const DEFAULT_BOUNDARIES = [\n // 0, 0.5, 1\n 0, 1\n]"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/3d736916-db57-49c3-aed5-bd8f4976d772.json b/docs/doc/3d736916-db57-49c3-aed5-bd8f4976d772.json
new file mode 100644
index 0000000..7179d50
--- /dev/null
+++ b/docs/doc/3d736916-db57-49c3-aed5-bd8f4976d772.json
@@ -0,0 +1,15 @@
+{
+ "summary": "The TokenHeatmap React component visualizes token activations as a heatmap, with optional user-specified colors and boundaries. It manages loading state and provides tooltips for activation values.",
+ "details": [
+ {
+ "comment": "This code is a React component called TokenHeatmap that takes in an array of tokens and their activations, and renders them as a heatmap. It uses the DEFAULT_COLORS and DEFAULT_BOUNDARIES if not specified by the user. The component handles loading state and provides tooltips for activation values when not in loading state.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/tokenHeatmap.tsx\":0-22",
+ "content": "import React from \"react\"\nimport { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types'\ntype Props = {\n tokens: TokenAndActivation[], \n loading?: boolean, \n colors?: Color[], \n boundaries?: number[]\n}\nexport default function TokenHeatmap({ tokens, loading, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) {\n //
\n )\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/3e5231a4-ac8e-431a-8bd6-cf3febfc9fc6.json b/docs/doc/3e5231a4-ac8e-431a-8bd6-cf3febfc9fc6.json
new file mode 100644
index 0000000..d355453
--- /dev/null
+++ b/docs/doc/3e5231a4-ac8e-431a-8bd6-cf3febfc9fc6.json
@@ -0,0 +1,165 @@
+{
+ "summary": "This module covers simulation object initialization, API calls for neuron activation simulations, token splitting, and prompt builder functions. The code notes in TODOs that a better prompt format is needed, validates input, predicts activations using few-shot examples, verifies completion validity, and generates explanations for sequence 1 tokens.",
+ "details": [
+ {
+ "comment": "This code uses API calls to simulate neuron activations based on an explanation. It includes classes for activation records, activation scaling, and sequence simulations, as well as functions for formatting activation records, normalizing activations, and building prompts.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":0-32",
+ "content": "\"\"\"Uses API calls to simulate neuron activations based on an explanation.\"\"\"\nfrom __future__ import annotations\nimport asyncio\nimport logging\nfrom abc import ABC, abstractmethod\nfrom collections import OrderedDict\nfrom enum import Enum\nfrom typing import Any, Optional, Sequence, Union\nimport numpy as np\nfrom neuron_explainer.activations.activation_records import (\n calculate_max_activation,\n format_activation_records,\n format_sequences_for_simulation,\n normalize_activations,\n)\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.api_client import ApiClient\nfrom neuron_explainer.explanations.explainer import EXPLANATION_PREFIX\nfrom neuron_explainer.explanations.explanations import ActivationScale, SequenceSimulation\nfrom neuron_explainer.explanations.few_shot_examples import FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import (\n HarmonyMessage,\n PromptBuilder,\n PromptFormat,\n Role,\n)\nlogger = logging.getLogger(__name__)\n# Our prompts use normalized activation values, which map any range of positive activations to the"
+ },
+ {
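+ "comment": "This code defines the valid normalized-activation tokens (the integers 0 to 10 as strings) and a SimulationType enum with two simulation types, ALL_AT_ONCE and ONE_AT_A_TIME, along with a from_string classmethod that raises ValueError for unrecognized values. It also begins a function that computes an expected value given normalized probabilities by distribution value.",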
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":33-65",
+ "content": "# integers from 0 to 10.\nMAX_NORMALIZED_ACTIVATION = 10\nVALID_ACTIVATION_TOKENS_ORDERED = list(str(i) for i in range(MAX_NORMALIZED_ACTIVATION + 1))\nVALID_ACTIVATION_TOKENS = set(VALID_ACTIVATION_TOKENS_ORDERED)\nclass SimulationType(str, Enum):\n \"\"\"How to simulate neuron activations. Values correspond to subclasses of NeuronSimulator.\"\"\"\n ALL_AT_ONCE = \"all_at_once\"\n \"\"\"\n Use a single prompt with tokens; calculate EVs using logprobs.\n Implemented by ExplanationNeuronSimulator.\n \"\"\"\n ONE_AT_A_TIME = \"one_at_a_time\"\n \"\"\"\n Use a separate prompt for each token being simulated; calculate EVs using logprobs.\n Implemented by ExplanationTokenByTokenSimulator.\n \"\"\"\n @classmethod\n def from_string(cls, s: str) -> SimulationType:\n for simulation_type in SimulationType:\n if simulation_type.value == s:\n return simulation_type\n raise ValueError(f\"Invalid simulation type: {s}\")\ndef compute_expected_value(\n norm_probabilities_by_distribution_value: OrderedDict[int, float]"
+ },
+ {
+ "comment": "This code calculates the expected value for a distribution given normalized probabilities. It also includes a function to parse top logprobs into a distribution of unnormalized probabilities and the start of a function to compute predicted activation statistics for a token. The code uses numpy arrays for efficient computations and ordered dictionaries for mapping tokens or distribution values to their respective probabilities or logprobs.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":66-91",
+ "content": ") -> float:\n \"\"\"\n Given a map from distribution values (integers on the range [0, 10]) to normalized\n probabilities, return an expected value for the distribution.\n \"\"\"\n return np.dot(\n np.array(list(norm_probabilities_by_distribution_value.keys())),\n np.array(list(norm_probabilities_by_distribution_value.values())),\n )\ndef parse_top_logprobs(top_logprobs: dict[str, float]) -> OrderedDict[int, float]:\n \"\"\"\n Given a map from tokens to logprobs, return a map from distribution values (integers on the\n range [0, 10]) to unnormalized probabilities (in the sense that they may not sum to 1).\n \"\"\"\n probabilities_by_distribution_value = OrderedDict()\n for token, logprob in top_logprobs.items():\n if token in VALID_ACTIVATION_TOKENS:\n token_as_int = int(token)\n probabilities_by_distribution_value[token_as_int] = np.exp(logprob)\n return probabilities_by_distribution_value\ndef compute_predicted_activation_stats_for_token(\n top_logprobs: dict[str, float],"
+ },
+ {
+ "comment": "This function takes the top log probabilities, normalizes them to probabilities, computes the expected value based on these normalized probabilities, and returns both as a tuple. It also includes a helper function that converts a string prefixed with \"bytes:\" into a byte array, decoding \\xNN hex escapes and treating other characters as regular ASCII.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":92-121",
+ "content": ") -> tuple[OrderedDict[int, float], float]:\n probabilities_by_distribution_value = parse_top_logprobs(top_logprobs)\n total_p_of_distribution_values = sum(probabilities_by_distribution_value.values())\n norm_probabilities_by_distribution_value = OrderedDict(\n {\n distribution_value: p / total_p_of_distribution_values\n for distribution_value, p in probabilities_by_distribution_value.items()\n }\n )\n expected_value = compute_expected_value(norm_probabilities_by_distribution_value)\n return (\n norm_probabilities_by_distribution_value,\n expected_value,\n )\n# Adapted from tether/tether/core/encoder.py.\ndef convert_to_byte_array(s: str) -> bytearray:\n byte_array = bytearray()\n assert s.startswith(\"bytes:\"), s\n s = s[6:]\n while len(s) > 0:\n if s[0] == \"\\\\\":\n # Hex encoding.\n assert s[1] == \"x\"\n assert len(s) >= 4\n byte_array.append(int(s[2:4], 16))\n s = s[4:]\n else:\n # Regular ascii encoding."
+ },
+ {
+ "comment": "This code handles the case where a response token is composed of a sequence of bytes. It merges multiple response tokens into a single token until it can be decoded as UTF-8. If a UnicodeDecodeError occurs, it continues to merge previous response tokens into the byte array.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":122-147",
+ "content": " byte_array.append(ord(s[0]))\n s = s[1:]\n return byte_array\ndef handle_byte_encoding(\n response_tokens: Sequence[str], merged_response_index: int\n) -> tuple[str, int]:\n \"\"\"\n Handle the case where the current token is a sequence of bytes. This may involve merging\n multiple response tokens into a single token.\n \"\"\"\n response_token = response_tokens[merged_response_index]\n if response_token.startswith(\"bytes:\"):\n byte_array = bytearray()\n while True:\n byte_array = convert_to_byte_array(response_token) + byte_array\n try:\n # If we can decode the byte array as utf-8, then we're done.\n response_token = byte_array.decode(\"utf-8\")\n break\n except UnicodeDecodeError:\n # If not, then we need to merge the previous response token into the byte\n # array.\n merged_response_index -= 1\n response_token = response_tokens[merged_response_index]"
+ },
+ {
+ "comment": "This function checks if a token from the subject model was split into multiple tokens by the simulator model. It handles cases where different tokenizers are used or Unicode characters are split.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":148-167",
+ "content": " return response_token, merged_response_index\ndef was_token_split(current_token: str, response_tokens: Sequence[str], start_index: int) -> bool:\n \"\"\"\n Return whether current_token (a token from the subject model) was split into multiple tokens by\n the simulator model (as represented by the tokens in response_tokens). start_index is the index\n in response_tokens at which to begin looking backward to form a complete token. It is usually\n the first token *before* the delimiter that separates the token from the normalized activation,\n barring some unusual cases.\n This mainly happens if the subject model uses a different tokenizer than the simulator model.\n But it can also happen in cases where Unicode characters are split. This function handles both\n cases.\n \"\"\"\n merged_response_tokens = \"\"\n merged_response_index = start_index\n while len(merged_response_tokens) < len(current_token):\n response_token = response_tokens[merged_response_index]\n response_token, merged_response_index = handle_byte_encoding("
+ },
+ {
+ "comment": "The code is checking if a token from the subject model was split into two or more tokens by the simulator model. It asserts that merged_response_tokens ends with current_token, calculates the number of merged tokens, and logs a debug-level message if the token was split. It also begins parse_simulation_response, which parses an API response to a simulation prompt.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":168-194",
+ "content": " response_tokens, merged_response_index\n )\n merged_response_tokens = response_token + merged_response_tokens\n merged_response_index -= 1\n # It's possible that merged_response_tokens is longer than current_token at this point,\n # since the between-lines delimiter may have been merged into the original token. But it\n # should always be the case that merged_response_tokens ends with current_token.\n assert merged_response_tokens.endswith(current_token)\n num_merged_tokens = start_index - merged_response_index\n token_was_split = num_merged_tokens > 1\n if token_was_split:\n logger.debug(\n \"Warning: token from the subject model was split into 2+ tokens by the simulator model.\"\n )\n return token_was_split\ndef parse_simulation_response(\n response: dict[str, Any],\n prompt_format: PromptFormat,\n tokens: Sequence[str],\n) -> SequenceSimulation:\n \"\"\"\n Parse an API response to a simulation prompt.\n Args:\n response: response from the API"
+ },
+ {
+ "comment": "This function retrieves the text and token data from the response, handling different prompt formats and raising ValueError for unhandled ones. It then uses text.rfind to locate the position in the text where scoring starts (scoring_start), setting up lists for further calculations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":195-218",
+ "content": " prompt_format: how the prompt was formatted\n tokens: list of tokens as strings in the sequence where the neuron is being simulated\n \"\"\"\n choice = response[\"choices\"][0]\n if prompt_format == PromptFormat.HARMONY_V4:\n text = choice[\"message\"][\"content\"]\n elif prompt_format in [\n PromptFormat.NONE,\n PromptFormat.INSTRUCTION_FOLLOWING,\n ]:\n text = choice[\"text\"]\n else:\n raise ValueError(f\"Unhandled prompt format {prompt_format}\")\n response_tokens = choice[\"logprobs\"][\"tokens\"]\n choice[\"logprobs\"][\"token_logprobs\"]\n top_logprobs = choice[\"logprobs\"][\"top_logprobs\"]\n token_text_offset = choice[\"logprobs\"][\"text_offset\"]\n # This only works because the sequence \"\" tokenizes into multiple tokens if it appears in\n # a text sequence in the prompt.\n scoring_start = text.rfind(\"\")\n expected_values = []\n original_sequence_tokens: list[str] = []\n distribution_values: list[list[float]] = []\n distribution_probabilities: list[list[float]] = []"
+ },
+ {
+ "comment": "Checking if the response tokens have reached the end and if the tab token is followed by an \"unknown\" token.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":219-233",
+ "content": " for i in range(2, len(response_tokens)):\n if len(original_sequence_tokens) == len(tokens):\n # Make sure we haven't hit some sort of off-by-one error.\n # TODO(sbills): Generalize this to handle different tokenizers.\n reached_end = response_tokens[i + 1] == \"<\" and response_tokens[i + 2] == \"end\"\n assert reached_end, f\"{response_tokens[i-3:i+3]}\"\n break\n if token_text_offset[i] >= scoring_start:\n # We're looking for the first token after a tab. This token should be the text\n # \"unknown\" if hide_activations=True or a normalized activation (0-10) otherwise.\n # If it isn't, that means that the tab is not appearing as a delimiter, but rather\n # as a token, in which case we should move on to the next response token.\n if response_tokens[i - 1] == \"\\t\":\n if response_tokens[i] != \"unknown\":\n logger.debug(\"Ignoring tab token that is not followed by an 'unknown' token.\")"
+ },
+ {
+ "comment": "Identifying correct token and computing predicted activation stats for the identified token.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":234-255",
+ "content": " continue\n # j represents the index of the token in a \"tokenactivation\" line, barring\n # one of the unusual cases handled below.\n j = i - 2\n current_token = tokens[len(original_sequence_tokens)]\n if current_token == response_tokens[j] or was_token_split(\n current_token, response_tokens, j\n ):\n # We're in the normal case where the tokenization didn't throw off the\n # formatting or in the token-was-split case, which we handle the usual way.\n current_top_logprobs = top_logprobs[i]\n (\n norm_probabilities_by_distribution_value,\n expected_value,\n ) = compute_predicted_activation_stats_for_token(\n current_top_logprobs,\n )\n current_distribution_values = list(\n norm_probabilities_by_distribution_value.keys()"
+ },
+ {
+ "comment": "If tokenization resulted in a newline being folded into the token, use dummy values for activation prediction. This is due to the model not observing the original token and a better prompt format should be used to avoid this situation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":256-271",
+ "content": " )\n current_distribution_probabilities = list(\n norm_probabilities_by_distribution_value.values()\n )\n else:\n # We're in a case where the tokenization resulted in a newline being folded into\n # the token. We can't do our usual prediction of activation stats for the token,\n # since the model did not observe the original token. Instead, we use dummy\n # values. See the TODO elsewhere in this file about coming up with a better\n # prompt format that avoids this situation.\n newline_folded_into_token = \"\\n\" in response_tokens[j]\n assert (\n newline_folded_into_token\n ), f\"`{current_token=}` {response_tokens[j-3:j+3]=}\"\n logger.debug(\n \"Warning: newline before a tokenactivation line was folded into the token\""
+ },
+ {
+ "comment": "The code is defining a NeuronSimulator class with an abstract method \"simulate\" that takes in a sequence of tokens and returns a SequenceSimulation object. The SequenceSimulation object contains the original token sequence, expected activations, activation scale, distribution values, and distribution probabilities.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":272-296",
+ "content": " )\n current_distribution_values = []\n current_distribution_probabilities = []\n expected_value = 0.0\n original_sequence_tokens.append(current_token)\n distribution_values.append([float(v) for v in current_distribution_values])\n distribution_probabilities.append(current_distribution_probabilities)\n expected_values.append(expected_value)\n return SequenceSimulation(\n tokens=original_sequence_tokens,\n expected_activations=expected_values,\n activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,\n distribution_values=distribution_values,\n distribution_probabilities=distribution_probabilities,\n )\nclass NeuronSimulator(ABC):\n \"\"\"Abstract base class for simulating neuron behavior.\"\"\"\n @abstractmethod\n async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:\n \"\"\"Simulate the behavior of a neuron based on an explanation.\"\"\""
+ },
+ {
+ "comment": "This code defines a class called \"ExplanationNeuronSimulator\" that simulates neuron behavior based on an explanation. It uses a few-shot prompt with examples of other explanations and activations, allowing for scoring all tokens at once using logprobs. The constructor takes in parameters like model name, explanation, maximum concurrent tasks, example set type, prompt format, and cache settings. It also initializes an \"ApiClient\" object. The class has a method called \"simulate\" that takes a sequence of tokens as input and returns a SequenceSimulation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":297-328",
+ "content": " ...\nclass ExplanationNeuronSimulator(NeuronSimulator):\n \"\"\"\n Simulate neuron behavior based on an explanation.\n This class uses a few-shot prompt with examples of other explanations and activations. This\n prompt allows us to score all of the tokens at once using a nifty trick involving logprobs.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n explanation: str,\n max_concurrent: Optional[int] = 10,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,\n prompt_format: PromptFormat = PromptFormat.INSTRUCTION_FOLLOWING,\n cache: bool = False,\n ):\n self.api_client = ApiClient(\n model_name=model_name, max_concurrent=max_concurrent, cache=cache\n )\n self.explanation = explanation\n self.few_shot_example_set = few_shot_example_set\n self.prompt_format = prompt_format\n async def simulate(\n self,\n tokens: Sequence[str],\n ) -> SequenceSimulation:\n prompt = self.make_simulation_prompt(tokens)"
+ },
+ {
+ "comment": "This code is making an API request to generate a response based on the provided prompt or message, depending on the prompt format. It then parses the response and returns the result. The code includes assertions for validating the input and a TODO comment indicating potential issues with the tokenization format.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":330-351",
+ "content": " generate_kwargs: dict[str, Any] = {\n \"max_tokens\": 0,\n \"echo\": True,\n \"logprobs\": 15,\n }\n if self.prompt_format == PromptFormat.HARMONY_V4:\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n generate_kwargs[\"messages\"] = prompt\n else:\n assert isinstance(prompt, str)\n generate_kwargs[\"prompt\"] = prompt\n response = await self.api_client.make_request(**generate_kwargs)\n logger.debug(\"response in score_explanation_by_activations is %s\", response)\n result = parse_simulation_response(response, self.prompt_format, tokens)\n logger.debug(\"result in score_explanation_by_activations is %s\", result)\n return result\n # TODO(sbills): The current tokenactivation format can result in improper tokenization.\n # In particular, if the token is itself a tab, we may get a single \"\\t\\t\" token rather than two\n # \"\\t\" tokens. Consider using a separator that does not appear in any multi-character tokens."
+ },
+ {
+ "comment": "This code creates a prompt for predicting neuron activations using a few-shot example set. It adds a system message with instructions on how to analyze the neurons in a neural network and then appends user messages for each example in the set, including the example itself along with an explanation of the neuron's behavior.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":352-372",
+ "content": " def make_simulation_prompt(self, tokens: Sequence[str]) -> Union[str, list[HarmonyMessage]]:\n \"\"\"Create a few-shot prompt for predicting neuron activations for the given tokens.\"\"\"\n # TODO(sbills): The prompts in this file are subtly different from the ones in explainer.py.\n # Consider reconciling them.\n prompt_builder = PromptBuilder()\n prompt_builder.add_message(\n Role.SYSTEM,\n \"\"\"We're studying neurons in a neural network.\nEach neuron looks for some particular thing in a short document.\nLook at summary of what the neuron does, and try to predict how it will fire on each token.\nThe activation format is tokenactivation, activations go from 0 to 10, \"unknown\" indicates an unknown activation. Most activations will be 0.\n\"\"\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n for i, example in enumerate(few_shot_examples):\n prompt_builder.add_message(\n Role.USER,\n f\"\\n\\nNeuron {i + 1}\\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} \""
+ },
+ {
+ "comment": "This code snippet is part of a Neuron Simulator that simulates neuron behavior based on an explanation. It adds formatted activation records and messages to a prompt builder, including explanations of neuron behavior for few-shot examples.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":373-398",
+ "content": " f\"{example.explanation}\",\n )\n formatted_activation_records = format_activation_records(\n example.activation_records,\n calculate_max_activation(example.activation_records),\n start_indices=example.first_revealed_activation_indices,\n )\n prompt_builder.add_message(\n Role.ASSISTANT, f\"\\nActivations: {formatted_activation_records}\\n\"\n )\n prompt_builder.add_message(\n Role.USER,\n f\"\\n\\nNeuron {len(few_shot_examples) + 1}\\nExplanation of neuron \"\n f\"{len(few_shot_examples) + 1} behavior: {EXPLANATION_PREFIX} \"\n f\"{self.explanation.strip()}\",\n )\n prompt_builder.add_message(\n Role.ASSISTANT, f\"\\nActivations: {format_sequences_for_simulation([tokens])}\"\n )\n return prompt_builder.build(self.prompt_format)\nclass ExplanationTokenByTokenSimulator(NeuronSimulator):\n \"\"\"\n Simulate neuron behavior based on an explanation."
+ },
+ {
+ "comment": "This class initializes an API client and takes inputs like model name, explanation, max concurrent requests, example set, prompt format, and cache. It asserts that the few-shot example set is not ORIGINAL since this simulator doesn't support it. Then, it performs a simulation using one token prompt per token and calculates expected values from log probabilities. This method is slower compared to ExplanationNeuronSimulator.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":400-425",
+ "content": " Unlike ExplanationNeuronSimulator, this class uses one few-shot prompt per token to calculate\n expected activations. This is slower. This class gets a one-token completion and calculates an\n expected value from that token's logprobs.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n explanation: str,\n max_concurrent: Optional[int] = 10,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.NEWER,\n prompt_format: PromptFormat = PromptFormat.INSTRUCTION_FOLLOWING,\n cache: bool = False,\n ):\n assert (\n few_shot_example_set != FewShotExampleSet.ORIGINAL\n ), \"This simulator doesn't support the ORIGINAL few-shot example set.\"\n self.api_client = ApiClient(\n model_name=model_name, max_concurrent=max_concurrent, cache=cache\n )\n self.explanation = explanation\n self.few_shot_example_set = few_shot_example_set\n self.prompt_format = prompt_format\n async def simulate(\n self,"
+ },
+ {
+ "comment": "This method concurrently requests activation statistics for each token in the input sequence (via asyncio.gather). For each response, it computes the normalized probabilities by distribution value and the expected value, and appends them to their respective lists.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":426-449",
+ "content": " tokens: Sequence[str],\n ) -> SequenceSimulation:\n responses_by_token = await asyncio.gather(\n *[\n self._get_activation_stats_for_single_token(tokens, self.explanation, token_index)\n for token_index in range(len(tokens))\n ]\n )\n expected_values, distribution_values, distribution_probabilities = [], [], []\n for response in responses_by_token:\n activation_logprobs = response[\"choices\"][0][\"logprobs\"][\"top_logprobs\"][0]\n (\n norm_probabilities_by_distribution_value,\n expected_value,\n ) = compute_predicted_activation_stats_for_token(\n activation_logprobs,\n )\n distribution_values.append(\n [float(v) for v in norm_probabilities_by_distribution_value.keys()]\n )\n distribution_probabilities.append(\n list(norm_probabilities_by_distribution_value.values())\n )\n expected_values.append(expected_value)"
+ },
+ {
+ "comment": "This code is creating a SequenceSimulation object and logging its result. It also defines an asynchronous function that retrieves activation statistics for a single token using API client, and adds a subprompt to a prompt builder.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":451-478",
+ "content": " result = SequenceSimulation(\n tokens=list(tokens), # SequenceSimulation expects List type\n expected_activations=expected_values,\n activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,\n distribution_values=distribution_values,\n distribution_probabilities=distribution_probabilities,\n )\n logger.debug(\"result in score_explanation_by_activations is %s\", result)\n return result\n async def _get_activation_stats_for_single_token(\n self,\n tokens: Sequence[str],\n explanation: str,\n token_index_to_score: int,\n ) -> dict:\n prompt = self.make_single_token_simulation_prompt(\n tokens,\n explanation,\n token_index_to_score=token_index_to_score,\n )\n return await self.api_client.make_request(\n prompt=prompt, max_tokens=1, echo=False, logprobs=15\n )\n def _add_single_token_simulation_subprompt(\n self,\n prompt_builder: PromptBuilder,"
+ },
+ {
+ "comment": "Creating trimmed activation record and adding messages to the prompt builder.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":479-507",
+ "content": " activation_record: ActivationRecord,\n neuron_index: int,\n explanation: str,\n token_index_to_score: int,\n end_of_prompt: bool,\n ) -> None:\n trimmed_activation_record = ActivationRecord(\n tokens=activation_record.tokens[: token_index_to_score + 1],\n activations=activation_record.activations[: token_index_to_score + 1],\n )\n prompt_builder.add_message(\n Role.USER,\n f\"\"\"\nNeuron {neuron_index}\nExplanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} {explanation.strip()}\nText:\n{\"\".join(trimmed_activation_record.tokens)}\nLast token in the text:\n{trimmed_activation_record.tokens[-1]}\nLast token activation, considering the token in the context in which it appeared in the text:\n\"\"\",\n )\n if not end_of_prompt:\n normalized_activations = normalize_activations(\n trimmed_activation_record.activations, calculate_max_activation([activation_record])\n )\n prompt_builder.add_message("
+ },
+ {
+ "comment": "This function generates a prompt for predicting the neuron's activation on a single token. It involves adding a system message explaining the task and providing few-shot examples.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":508-530",
+ "content": " Role.ASSISTANT, str(normalized_activations[-1]) + (\"\" if end_of_prompt else \"\\n\\n\")\n )\n def make_single_token_simulation_prompt(\n self,\n tokens: Sequence[str],\n explanation: str,\n token_index_to_score: int,\n ) -> Union[str, list[HarmonyMessage]]:\n \"\"\"Make a few-shot prompt for predicting the neuron's activation on a single token.\"\"\"\n assert explanation != \"\"\n prompt_builder = PromptBuilder()\n prompt_builder.add_message(\n Role.SYSTEM,\n \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.\nThe activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\n\"\"\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n for i, example in enumerate(few_shot_examples):"
+ },
+ {
+ "comment": "For each few-shot example, this adds a user message with the neuron's explanation and an assistant message with its formatted activation records. It then adds a system message announcing that the activation of a new neuron will be predicted on a single token, following the same rules as the examples above.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":531-552",
+ "content": " prompt_builder.add_message(\n Role.USER,\n f\"Neuron {i + 1}\\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} \"\n f\"{example.explanation}\\n\",\n )\n formatted_activation_records = format_activation_records(\n example.activation_records,\n calculate_max_activation(example.activation_records),\n start_indices=None,\n )\n prompt_builder.add_message(\n Role.ASSISTANT,\n f\"Activations: {formatted_activation_records}\\n\\n\",\n )\n prompt_builder.add_message(\n Role.SYSTEM,\n \"Now, we're going predict the activation of a new neuron on a single token, \"\n \"following the same rules as the examples above. Activations still range from 0 to 10.\",\n )\n single_token_example = self.few_shot_example_set.get_single_token_prediction_example()\n assert single_token_example.token_index_to_score is not None"
+ },
+ {
+ "comment": "This code adds two subprompts to a prompt builder, one for a single token example and another for an activation record. It then returns the final formatted prompt.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":553-580",
+ "content": " self._add_single_token_simulation_subprompt(\n prompt_builder,\n single_token_example.activation_records[0],\n len(few_shot_examples) + 1,\n explanation,\n token_index_to_score=single_token_example.token_index_to_score,\n end_of_prompt=False,\n )\n activation_record = ActivationRecord(\n tokens=list(tokens[: token_index_to_score + 1]), # ActivationRecord expects List type.\n activations=[0.0] * len(tokens),\n )\n self._add_single_token_simulation_subprompt(\n prompt_builder,\n activation_record,\n len(few_shot_examples) + 2,\n explanation,\n token_index_to_score,\n end_of_prompt=True,\n )\n return prompt_builder.build(self.prompt_format, allow_extra_system_messages=True)\ndef _format_record_for_logprob_free_simulation(\n activation_record: ActivationRecord,\n include_activations: bool = False,\n max_activation: Optional[float] = None,"
+ },
+ {
+ "comment": "This code formats an activation record for logprob-free simulation, emitting one delimiter-terminated entry per token and optionally including activations normalized using the max_activation parameter. It also begins a function that parses a completion into a list of simulated activations, returning a list of 0s if the model did not faithfully reproduce the token sequence.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":581-606",
+ "content": ") -> str:\n response = \"\"\n if include_activations:\n assert max_activation is not None\n assert len(activation_record.tokens) == len(\n activation_record.activations\n ), f\"{len(activation_record.tokens)=}, {len(activation_record.activations)=}\"\n normalized_activations = normalize_activations(\n activation_record.activations, max_activation=max_activation\n )\n for i, token in enumerate(activation_record.tokens):\n # We use a weird unicode character here to make it easier to parse the response (can split on \"\u0f17\\n\").\n if include_activations:\n response += f\"{token}\\t{normalized_activations[i]}\u0f17\\n\"\n else:\n response += f\"{token}\\t\u0f17\\n\"\n return response\ndef _parse_no_logprobs_completion(\n completion: str,\n tokens: Sequence[str],\n) -> Sequence[int]:\n \"\"\"\n Parse a completion into a list of simulated activations. If the model did not faithfully\n reproduce the token sequence, return a list of 0s. If the model's activation for a token"
+ },
+ {
+ "comment": "This code checks if the first token is present in the completion and if the number of lines matches the number of tokens. If not, it returns a list of 0s. It then extracts the predicted activations for each token from the completion.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":607-630",
+ "content": " is not an integer betwee 0 and 10, substitute 0.\n Args:\n completion: completion from the API\n tokens: list of tokens as strings in the sequence where the neuron is being simulated\n \"\"\"\n zero_prediction = [0] * len(tokens)\n token_lines = completion.strip(\"\\n\").split(\"\u0f17\\n\")\n start_line_index = None\n for i, token_line in enumerate(token_lines):\n if token_line.startswith(f\"{tokens[0]}\\t\"):\n start_line_index = i\n break\n # If we didn't find the first token, or if the number of lines in the completion doesn't match\n # the number of tokens, return a list of 0s.\n if start_line_index is None or len(token_lines) - start_line_index != len(tokens):\n return zero_prediction\n predicted_activations = []\n for i, token_line in enumerate(token_lines[start_line_index:]):\n if not token_line.startswith(f\"{tokens[i]}\\t\"):\n return zero_prediction\n predicted_activation = token_line.split(\"\\t\")[1]\n if predicted_activation not in VALID_ACTIVATION_TOKENS:"
+ },
+ {
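+ "comment": "This code finishes parsing the completion, substituting 0 for any predicted activation that is not a valid activation token, and returns the predicted activations. It then begins the LogprobFreeExplanationTokenSimulator class, which simulates neuron behavior without logprobs by asking the model to repeat the tokens with activations appended, sampling with temperature 0 so the activations are deterministic.",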
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":631-647",
+ "content": " predicted_activations.append(0)\n else:\n predicted_activations.append(int(predicted_activation))\n return predicted_activations\nclass LogprobFreeExplanationTokenSimulator(NeuronSimulator):\n \"\"\"\n Simulate neuron behavior based on an explanation.\n Unlike ExplanationNeuronSimulator and ExplanationTokenByTokenSimulator, this class does not rely on\n logprobs to calculate expected activations. Instead, it uses a few-shot prompt that displays all of the\n tokens at once, and request that the model repeat the tokens with the activations appended. Sampling\n is with temperature = 0. Thus, the activations are deterministic. Also, each activation for a token\n is a function of all the activations that came previously and all of the tokens in the sequence, not\n just the current and previous tokens. In the case where the model does not faithfully reproduce the\n token sequence, the simulator will return a response where every predicted activation is 0. Example prompt as follows:"
+ },
+ {
+ "comment": "This code is initializing an instance of a simulator. It takes the model name, explanation, maximum concurrent samples, few-shot example set (not ORIGINAL), prompt format, and cache settings as parameters. The assert statement ensures that the few-shot example set is not ORIGINAL because this simulator doesn't support it. It then initializes an instance of ApiClient with the given model name, maximum concurrent samples, and cache settings.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":649-705",
+ "content": " Explanation: Explanation 1\n Sequence 1 Tokens Without Activations:\n A\\t_\n B\\t_\n C\\t_\n Sequence 1 Tokens With Activations:\n A\\t4_\n B\\t10_\n C\\t0_\n Sequence 2 Tokens Without Activations:\n D\\t_\n E\\t_\n F\\t_\n Sequence 2 Tokens With Activations:\n D\\t3_\n E\\t6_\n F\\t9_\n Explanation: Explanation 2\n Sequence 1 Tokens Without Activations:\n G\\t_\n H\\t_\n I\\t_\n Sequence 1 Tokens With Activations:\n \n G\\t2_\n H\\t0_\n I\\t3_\n \"\"\"\n def __init__(\n self,\n model_name: str,\n explanation: str,\n max_concurrent: Optional[int] = 10,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.NEWER,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,\n cache: bool = False,\n ):\n assert (\n few_shot_example_set != FewShotExampleSet.ORIGINAL\n ), \"This simulator doesn't support the ORIGINAL few-shot example set.\"\n self.api_client = ApiClient(\n model_name=model_name, max_concurrent=max_concurrent, cache=cache"
+ },
+ {
+ "comment": "Code creates a simulation prompt, sends it to API client for processing, and stores the result.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":706-735",
+ "content": " )\n self.explanation = explanation\n self.few_shot_example_set = few_shot_example_set\n self.prompt_format = prompt_format\n async def simulate(\n self,\n tokens: Sequence[str],\n ) -> SequenceSimulation:\n prompt = self._make_simulation_prompt(\n tokens,\n self.explanation,\n )\n response = await self.api_client.make_request(\n prompt=prompt, echo=False, max_tokens=1000\n )\n assert len(response[\"choices\"]) == 1\n choice = response[\"choices\"][0]\n if self.prompt_format == PromptFormat.HARMONY_V4:\n completion = choice[\"message\"][\"content\"]\n elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n completion = choice[\"text\"]\n else:\n raise ValueError(f\"Unhandled prompt format {self.prompt_format}\")\n predicted_activations = _parse_no_logprobs_completion(completion, tokens)\n result = SequenceSimulation(\n activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,"
+ },
+ {
+ "comment": "Creating a SequenceSimulation object with the predicted activations as expected activations, and None distribution values and probabilities.\n\nFunction to build a simulation prompt using PromptBuilder and add a system message about studying neurons in neural networks.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":736-755",
+ "content": " expected_activations=predicted_activations,\n # Since the predicted activation is just a sampled token, we don't have a distribution.\n distribution_values=None,\n distribution_probabilities=None,\n tokens=list(tokens), # SequenceSimulation expects List type\n )\n logger.debug(\"result in score_explanation_by_activations is %s\", result)\n return result\n def _make_simulation_prompt(\n self,\n tokens: Sequence[str],\n explanation: str,\n ) -> Union[str, list[HarmonyMessage]]:\n \"\"\"Make a few-shot prompt for predicting the neuron's activations on a sequence.\"\"\"\n assert explanation != \"\"\n prompt_builder = PromptBuilder(allow_extra_system_messages=True)\n prompt_builder.add_message(\n Role.SYSTEM,\n \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token."
+ },
+ {
+ "comment": "This code generates explanations for neuron behavior in a sequence, and for each neuron, it shows the tokens with and without activations. Activation records are used to determine the max activation for that neuron. The output includes an explanation prefix, tokens without and with activations for Sequence 1, and is added to a prompt builder.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":757-773",
+ "content": "The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\nFor each sequence, you will see the tokens in the sequence where the activations are left blank. You will print the exact same tokens verbatim, but with the activations filled in according to the explanation.\n\"\"\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n for i, example in enumerate(few_shot_examples):\n few_shot_example_max_activation = calculate_max_activation(example.activation_records)\n prompt_builder.add_message(\n Role.USER,\n f\"Neuron {i + 1}\\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} \"\n f\"{example.explanation}\\n\\n\"\n f\"Sequence 1 Tokens without Activations:\\n{_format_record_for_logprob_free_simulation(example.activation_records[0], include_activations=False)}\\n\\n\"\n f\"Sequence 1 Tokens with Activations:\\n\",\n )\n prompt_builder.add_message("
+ },
+ {
+ "comment": "This code is building a prompt for an AI model by adding messages to the prompt_builder. It iterates through activation records of an example, adding information about tokens with and without activations for each record. Finally, it adds a message for the next neuron index with its explanation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":774-792",
+ "content": " Role.ASSISTANT,\n f\"{_format_record_for_logprob_free_simulation(example.activation_records[0], include_activations=True, max_activation=few_shot_example_max_activation)}\\n\\n\",\n )\n for record_index, record in enumerate(example.activation_records[1:]):\n prompt_builder.add_message(\n Role.USER,\n f\"Sequence {record_index + 2} Tokens without Activations:\\n{_format_record_for_logprob_free_simulation(record, include_activations=False)}\\n\\n\"\n f\"Sequence {record_index + 2} Tokens with Activations:\\n\",\n )\n prompt_builder.add_message(\n Role.ASSISTANT,\n f\"{_format_record_for_logprob_free_simulation(record, include_activations=True, max_activation=few_shot_example_max_activation)}\\n\\n\",\n )\n neuron_index = len(few_shot_examples) + 1\n prompt_builder.add_message(\n Role.USER,\n f\"Neuron {neuron_index}\\nExplanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} \""
+ },
+ {
+ "comment": "This code completes the final user message with the explanation and the new sequence's tokens without activations, then builds and returns the prompt in the configured prompt format.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py\":793-797",
+ "content": " f\"{explanation}\\n\\n\"\n f\"Sequence 1 Tokens without Activations:\\n{_format_record_for_logprob_free_simulation(ActivationRecord(tokens=tokens, activations=[]), include_activations=False)}\\n\\n\"\n f\"Sequence 1 Tokens with Activations:\\n\",\n )\n return prompt_builder.build(self.prompt_format)"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/41d57ce6-9b32-42ac-ac73-882837572ad9.json b/docs/doc/41d57ce6-9b32-42ac-ac73-882837572ad9.json
new file mode 100644
index 0000000..e17bf5d
--- /dev/null
+++ b/docs/doc/41d57ce6-9b32-42ac-ac73-882837572ad9.json
@@ -0,0 +1,40 @@
+{
+ "summary": "The code sets up an event loop and tests that TokenActivationPairExplainer and TokenSpaceRepresentationExplainer build the expected explanation prompts in both the instruction-following and Harmony V4 prompt formats.",
+ "details": [
+ {
+ "comment": "Setting up the event loop for async operations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":0-22",
+ "content": "import asyncio\nfrom typing import Any\nfrom neuron_explainer.explanations.explainer import (\n TokenActivationPairExplainer,\n TokenSpaceRepresentationExplainer,\n)\nfrom neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role\nfrom neuron_explainer.explanations.token_space_few_shot_examples import (\n TokenSpaceFewShotExampleSet,\n)\ndef setup_module(unused_module: Any) -> None:\n # Make sure we have an event loop, since the attempt to create the Semaphore in\n # ResearchApiClient will fail without it.\n loop = asyncio.new_event_loop()\n asyncio.set_event_loop(loop)\ndef test_if_formatting() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words."
+ },
+ {
+ "comment": "This code initializes an explainer object with specific parameters and then generates a test prompt using the provided activation records. The generated prompt is then asserted to be equal to the expected prompt. The main purpose of this code is to test whether the explanation format matches the expected output for a given set of activation records.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":24-72",
+ "content": "The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.\nNeuron 1\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels.\nNeuron 2\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nExplanation of neuron 2 behavior:<|endofprompt|> the main thing this neuron does is find\"\"\"\n explainer = TokenActivationPairExplainer(\n model_name=\"text-davinci-003\",\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n few_shot_example_set=FewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n all_activation_records=TEST_EXAMPLES[0].activation_records,\n max_activation=1.0,\n max_tokens_for_completion=20,\n )\n assert prompt == expected_prompt\ndef test_harmony_format() -> None:\n expected_prompt = [\n HarmonyMessage("
+ },
+ {
+ "comment": "Code explains the neuron's behavior in a neural network, showing activation values for tokens and summarizing what each neuron is looking for.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":73-113",
+ "content": " role=Role.SYSTEM,\n content=\"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\nThe activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 1\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nExplanation of neuron 1 behavior: the main thing this neuron does is find\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\" vowels.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 2\nActivations:\n\na\t10\nb\t0\nc\t0\n\n\nd\t0"
+ },
+ {
+ "comment": "This code initializes an explainer object, sets the model name to \"gpt-4\", prompt format to Harmony_v4, and few shot example set to TEST. Then it creates a list of prompts for explanation by calling `make_explanation_prompt` function with a list of activation records, max activation, and max tokens for completion. The code asserts that the resulting prompt is a list and each item in the list is a dictionary (HarmonyMessage) and compares it with the expected_prompt. Finally, it tests if the prompt matches the expected_prompt by comparing their contents.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":114-142",
+ "content": "e\t10\nf\t0\n\nExplanation of neuron 2 behavior: the main thing this neuron does is find\"\"\",\n ),\n ]\n explainer = TokenActivationPairExplainer(\n model_name=\"gpt-4\",\n prompt_format=PromptFormat.HARMONY_V4,\n few_shot_example_set=FewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n all_activation_records=TEST_EXAMPLES[0].activation_records,\n max_activation=1.0,\n max_tokens_for_completion=20,\n )\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]\n assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt\ndef test_token_space_explainer_if_formatting() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a w"
+ },
+ {
+ "comment": "This code initializes a TokenSpaceRepresentationExplainer with specific parameters and then uses it to generate an explanation prompt given a set of tokens. The expected output is compared to the generated prompt in the test case.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":142-178",
+ "content": "ord, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\nTokens:\n'these', ' are', ' tokens'\nExplanation:\nThis neuron is looking for this is a test explanation.\nTokens:\n'foo', 'bar', 'baz'\nExplanation:\n<|endofprompt|>This neuron is looking for\"\"\"\n explainer = TokenSpaceRepresentationExplainer(\n model_name=\"text-davinci-002\",\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n use_few_shot=True,\n few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n tokens=[\"foo\", \"bar\", \"baz\"],\n max_tokens_for_completion=20,\n )\n assert prompt == expected_prompt\ndef test_token_space_explainer_harmony_formatting() -> None:\n expected_prompt = [\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"We're studying neurons in a neural network. Each neuron looks for some particular k"
+ },
+ {
+ "comment": "The code initializes a TokenSpaceRepresentationExplainer with the GPT-4 model and Harmony V4 prompt format. It uses few-shot examples from the test example set and generates an explanation prompt for the tokens 'foo', 'bar', and 'baz', passing max_tokens_for_completion=20. The test then asserts that the resulting prompt is a list.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":178-221",
+ "content": "ind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nTokens:\n'these', ' are', ' tokens'\nExplanation:\nThis neuron is looking for\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\" this is a test explanation.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nTokens:\n'foo', 'bar', 'baz'\nExplanation:\nThis neuron is looking for\"\"\",\n ),\n ]\n explainer = TokenSpaceRepresentationExplainer(\n model_name=\"gpt-4\",\n prompt_format=PromptFormat.HARMONY_V4,\n use_few_shot=True,\n few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,\n )\n prompt = explainer.make_explanation_prompt(\n tokens=[\"foo\", \"bar\", \"baz\"],\n max_tokens_for_completion=20,\n )\n assert isinstance(prompt, list)"
+ },
+ {
+ "comment": "Checking if the prompt is a list of HarmonyMessages and if each message's role and content match the expected values.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py\":222-226",
+ "content": " assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]\n assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/44155aa4-0b33-4cc9-9e1e-6e55355280ef.json b/docs/doc/44155aa4-0b33-4cc9-9e1e-6e55355280ef.json
new file mode 100644
index 0000000..62f0e9a
--- /dev/null
+++ b/docs/doc/44155aa4-0b33-4cc9-9e1e-6e55355280ef.json
@@ -0,0 +1,20 @@
+{
+ "summary": "This Jupyter notebook (nbformat 4) uses the OpenAI API to generate an explanation for each puzzle and prints it alongside the expected answer.",
+ "details": [
+ {
+ "comment": "Setting the OPENAI_API_KEY environment variable, setting up the explainer model with prompt format and max concurrent, and starting to loop over PUZZLES_BY_NAME for puzzle explanation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/explain_puzzles.ipynb\":0-36",
+ "content": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": null,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n \"%load_ext autoreload\\n\",\n \"%autoreload 2\"\n ]\n },\n {\n \"cell_type\": \"code\",\n \"execution_count\": null,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n \"import os\\n\",\n \"\\n\",\n \"os.environ[\\\"OPENAI_API_KEY\\\"] = \\\"put-key-here\\\"\\n\",\n \"\\n\",\n \"from neuron_explainer.activations.activation_records import calculate_max_activation\\n\",\n \"from neuron_explainer.explanations.explainer import TokenActivationPairExplainer\\n\",\n \"from neuron_explainer.explanations.prompt_builder import PromptFormat\\n\",\n \"from neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME\\n\",\n \"\\n\",\n \"\\n\",\n \"EXPLAINER_MODEL_NAME = \\\"gpt-4\\\"\\n\",\n \"\\n\",\n \"explainer = TokenActivationPairExplainer(\\n\",\n \" model_name=EXPLAINER_MODEL_NAME,\\n\",\n \" prompt_format=PromptFormat.HARMONY_V4,\\n\",\n \" max_concurrent=1,\\n\",\n \")\\n\",\n \"\\n\",\n \"for puzzle_name, puzzle in PUZZLES_BY_NAME.items():\\n\","
+ },
+ {
+ "comment": "This code is generating an explanation for a given puzzle using an explainer. It prints the name of the puzzle, the model-generated explanation, and the expected answer for the puzzle.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/explain_puzzles.ipynb\":37-72",
+ "content": " \" print(f\\\"{puzzle_name=}\\\")\\n\",\n \" puzzle_answer = puzzle.explanation\\n\",\n \" # Generate an explanation for the puzzle.\\n\",\n \" explanations = await explainer.generate_explanations(\\n\",\n \" all_activation_records=puzzle.activation_records,\\n\",\n \" max_activation=calculate_max_activation(puzzle.activation_records),\\n\",\n \" num_samples=1,\\n\",\n \" )\\n\",\n \" assert len(explanations) == 1\\n\",\n \" model_generated_explanation = explanations[0]\\n\",\n \" print(f\\\"{model_generated_explanation=}\\\")\\n\",\n \" print(f\\\"{puzzle_answer=}\\\\n\\\")\\n\",\n \"\\n\"\n ]\n }\n ],\n \"metadata\": {\n \"kernelspec\": {\n \"display_name\": \"openai\",\n \"language\": \"python\",\n \"name\": \"openai\"\n },\n \"language_info\": {\n \"codemirror_mode\": {\n \"name\": \"ipython\",\n \"version\": 3\n },\n \"file_extension\": \".py\",\n \"mimetype\": \"text/x-python\",\n \"name\": \"python\",\n \"nbconvert_exporter\": \"python\",\n \"pygments_lexer\": \"ipython3\",\n \"version\": \"3.9.9\"\n },\n \"orig_nbformat\": 4\n },"
+ },
+ {
+ "comment": "These lines specify the Jupyter Notebook format version and minor version.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/explain_puzzles.ipynb\":73-75",
+ "content": " \"nbformat\": 4,\n \"nbformat_minor\": 2\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/44add69d-1a50-4610-a2ad-9ceb60941614.json b/docs/doc/44add69d-1a50-4610-a2ad-9ceb60941614.json
new file mode 100644
index 0000000..dc33553
--- /dev/null
+++ b/docs/doc/44add69d-1a50-4610-a2ad-9ceb60941614.json
@@ -0,0 +1,110 @@
+{
+ "summary": "The code includes an AI model for generating explanations using API calls and prompts, along with helper functions, constants, and a base class NeuronExplainer. It also handles long prompts and extracts explanations from completion lists while removing extra spaces.",
+ "details": [
+ {
+ "comment": "This code imports necessary modules and defines a few classes for generating explanations of neuron behavior using API calls. It also sets a prefix to be used when generating explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":0-33",
+ "content": "\"\"\"Uses API calls to generate explanations of neuron behavior.\"\"\"\nfrom __future__ import annotations\nimport logging\nimport re\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import Any, Optional, Sequence, Union\nfrom neuron_explainer.activations.activation_records import (\n calculate_max_activation,\n format_activation_records,\n non_zero_activation_proportion,\n)\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.api_client import ApiClient\nfrom neuron_explainer.explanations.few_shot_examples import FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import (\n HarmonyMessage,\n PromptBuilder,\n PromptFormat,\n Role,\n)\nfrom neuron_explainer.explanations.token_space_few_shot_examples import (\n TokenSpaceFewShotExampleSet,\n)\nlogger = logging.getLogger(__name__)\n# TODO(williamrs): This prefix may not work well for some things, like predicting the next token.\n# Try other options like \"this neuron activates for\".\nEXPLANATION_PREFIX = \"the main thing this neuron does is find\""
+ },
+ {
+ "comment": "Defines helper functions for splitting a numbered list into its items and for stripping a final period (or period-space) from a string. It also defines ContextSize, an int-backed Enum of context sizes (2049 and 4097) with a `from_int` lookup that raises ValueError for unknown values, and HARMONY_V4_MODELS, a list of model names (gpt-3.5-turbo and gpt-4). Finally, it begins NeuronExplainer, an abstract base class for explainers that generate explanations from subclass-specific input data.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":36-76",
+ "content": "def _split_numbered_list(text: str) -> list[str]:\n \"\"\"Split a numbered list into a list of strings.\"\"\"\n lines = re.split(r\"\\n\\d+\\.\", text)\n # Strip the leading whitespace from each line.\n return [line.lstrip() for line in lines]\ndef _remove_final_period(text: str) -> str:\n \"\"\"Strip a final period or period-space from a string.\"\"\"\n if text.endswith(\".\"):\n return text[:-1]\n elif text.endswith(\". \"):\n return text[:-2]\n return text\nclass ContextSize(int, Enum):\n TWO_K = 2049\n FOUR_K = 4097\n @classmethod\n def from_int(cls, i: int) -> ContextSize:\n for context_size in cls:\n if context_size.value == i:\n return context_size\n raise ValueError(f\"{i} is not a valid ContextSize\")\nHARMONY_V4_MODELS = [\"gpt-3.5-turbo\", \"gpt-4\"]\nclass NeuronExplainer(ABC):\n \"\"\"\n Abstract base class for Explainer classes that generate explanations from subclass-specific\n input data.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,"
+ },
+ {
+ "comment": "This code is defining a class with an initializer and a method for generating explanations. It takes in parameters such as model name, prompt format, context size, max concurrent requests, and cache settings. It also asserts that the model name is appropriate for the prompt format provided, preventing incorrect usage.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":77-100",
+ "content": " # This parameter lets us adjust the length of the prompt when we're generating explanations\n # using older models with shorter context windows. In the future we can use it to experiment\n # with longer context windows.\n context_size: ContextSize = ContextSize.FOUR_K,\n max_concurrent: Optional[int] = 10,\n cache: bool = False,\n ):\n if prompt_format == PromptFormat.HARMONY_V4:\n assert model_name in HARMONY_V4_MODELS\n elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n assert model_name not in HARMONY_V4_MODELS\n else:\n raise ValueError(f\"Unhandled prompt format {prompt_format}\")\n self.model_name = model_name\n self.prompt_format = prompt_format\n self.context_size = context_size\n self.client = ApiClient(model_name=model_name, max_concurrent=max_concurrent, cache=cache)\n async def generate_explanations(\n self,\n *,\n num_samples: int = 5,\n max_tokens: int = 60,"
+ },
+ {
+ "comment": "The code is generating explanations based on subclass-specific input data. It first creates a prompt and then passes the prompt along with other parameters to a language model for completion. If the format is HarmonyV4, it expects a list of dictionaries (HarmonyMessage), otherwise a string prompt is passed. The response from the language model is then processed to extract explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":101-127",
+ "content": " temperature: float = 1.0,\n top_p: float = 1.0,\n **prompt_kwargs: Any,\n ) -> list[Any]:\n \"\"\"Generate explanations based on subclass-specific input data.\"\"\"\n prompt = self.make_explanation_prompt(max_tokens_for_completion=max_tokens, **prompt_kwargs)\n generate_kwargs: dict[str, Any] = {\n \"n\": num_samples,\n \"max_tokens\": max_tokens,\n \"temperature\": temperature,\n \"top_p\": top_p,\n }\n if self.prompt_format == PromptFormat.HARMONY_V4:\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n generate_kwargs[\"messages\"] = prompt\n else:\n assert isinstance(prompt, str)\n generate_kwargs[\"prompt\"] = prompt\n response = await self.client.make_request(**generate_kwargs)\n logger.debug(\"response in generate_explanations is %s\", response)\n if self.prompt_format == PromptFormat.HARMONY_V4:\n explanations = [x[\"message\"][\"content\"] for x in response[\"choices\"]]"
+ },
+ {
+ "comment": "This code defines a class for generating explanations using a prompt and an API. The `make_explanation_prompt` method is used to create a prompt to send to the API, which can be a string or a list of HarmonyMessages depending on the PromptFormat. The `postprocess_explanations` method post-processes the completions returned by the API into a list of explanations (by default it returns the completions as is). If the prompt format is unhandled, a ValueError is raised.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":128-152",
+ "content": " elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n explanations = [x[\"text\"] for x in response[\"choices\"]]\n else:\n raise ValueError(f\"Unhandled prompt format {self.prompt_format}\")\n return self.postprocess_explanations(explanations, prompt_kwargs)\n @abstractmethod\n def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n \"\"\"\n Create a prompt to send to the API to generate one or more explanations.\n A prompt can be a simple string, or a list of HarmonyMessages, depending on the PromptFormat\n used by this instance.\n \"\"\"\n ...\n def postprocess_explanations(\n self, completions: list[str], prompt_kwargs: dict[str, Any]\n ) -> list[Any]:\n \"\"\"Postprocess the completions returned by the API into a list of explanations.\"\"\"\n return completions # no-op by default\n def _prompt_is_too_long(\n self, prompt_builder: PromptBuilder, max_tokens_for_completion: int"
+ },
+ {
+ "comment": "This code checks if the prompt length combined with the maximum tokens for completion exceeds the context size. If so, it prints an error and returns True; otherwise, it returns False. The class TokenActivationPairExplainer generates explanations using token/activation pairs and prompts.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":153-176",
+ "content": " ) -> bool:\n # We'll get a context size error if the prompt itself plus the maximum number of tokens for\n # the completion is longer than the context size.\n prompt_length = prompt_builder.prompt_length_in_tokens(self.prompt_format)\n if prompt_length + max_tokens_for_completion > self.context_size.value:\n print(\n f\"Prompt is too long: {prompt_length} + {max_tokens_for_completion} > \"\n f\"{self.context_size.value}\"\n )\n return True\n return False\nclass TokenActivationPairExplainer(NeuronExplainer):\n \"\"\"\n Generate explanations of neuron behavior using a prompt with lists of token/activation pairs.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,\n # This parameter lets us adjust the length of the prompt when we're generating explanations\n # using older models with shorter context windows. In the future we can use it to experiment"
+ },
+ {
+ "comment": "Creates an instance of the class with specified parameters like model name, prompt format, context size, few-shot example set, repeating non-zero activations, maximum concurrent processes, and cache settings. Overrides superclass initializer to set these parameters. Defines a method make_explanation_prompt which takes all_activation_records, max_activation, numbered_list_of_n_explanations as input and returns explanation prompt as output.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":177-199",
+ "content": " # with 8k+ context windows.\n context_size: ContextSize = ContextSize.FOUR_K,\n few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,\n repeat_non_zero_activations: bool = True,\n max_concurrent: Optional[int] = 10,\n cache: bool = False,\n ):\n super().__init__(\n model_name=model_name,\n prompt_format=prompt_format,\n max_concurrent=max_concurrent,\n cache=cache,\n )\n self.context_size = context_size\n self.few_shot_example_set = few_shot_example_set\n self.repeat_non_zero_activations = repeat_non_zero_activations\n def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n original_kwargs = kwargs.copy()\n all_activation_records: Sequence[ActivationRecord] = kwargs.pop(\"all_activation_records\")\n max_activation: float = kwargs.pop(\"max_activation\")\n kwargs.setdefault(\"numbered_list_of_n_explanations\", None)\n numbered_list_of_n_explanations: Optional[int] = kwargs.pop("
+ },
+ {
+ "comment": "This code is setting up parameters for the prompt builder, such as number of explanations and optional omit activation records. It ensures no unexpected kwargs are present and adds a message to the prompt builder explaining the neuron's function in analyzing short documents.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":200-216",
+ "content": " \"numbered_list_of_n_explanations\"\n )\n if numbered_list_of_n_explanations is not None:\n assert numbered_list_of_n_explanations > 0, numbered_list_of_n_explanations\n # This parameter lets us dynamically shrink the prompt if our initial attempt to create it\n # results in something that's too long. It's only implemented for the 4k context size.\n kwargs.setdefault(\"omit_n_activation_records\", 0)\n omit_n_activation_records: int = kwargs.pop(\"omit_n_activation_records\")\n max_tokens_for_completion: int = kwargs.pop(\"max_tokens_for_completion\")\n assert not kwargs, f\"Unexpected kwargs: {kwargs}\"\n prompt_builder = PromptBuilder()\n prompt_builder.add_message(\n Role.SYSTEM,\n \"We're studying neurons in a neural network. Each neuron looks for some particular \"\n \"thing in a short document. Look at the parts of the document the neuron activates for \"\n \"and summarize in a single sentence what the neuron is looking for. Don't list \""
+ },
+ {
+ "comment": "Explains the activation format and its meaning, then selects one activation record from each few-shot example when using a 2k context window.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":217-230",
+ "content": " \"examples of words.\\n\\nThe activation format is tokenactivation. Activation \"\n \"values range from 0 to 10. A neuron finding what it's looking for is represented by a \"\n \"non-zero activation value. The higher the activation value, the stronger the match.\",\n )\n few_shot_examples = self.few_shot_example_set.get_examples()\n num_omitted_activation_records = 0\n for i, few_shot_example in enumerate(few_shot_examples):\n few_shot_activation_records = few_shot_example.activation_records\n if self.context_size == ContextSize.TWO_K:\n # If we're using a 2k context window, we only have room for one activation record\n # per few-shot example. (Two few-shot examples with one activation record each seems\n # to work better than one few-shot example with two activation records, in local\n # testing.)\n few_shot_activation_records = few_shot_activation_records[:1]"
+ },
+ {
+ "comment": "If context size is 4K and there are fewer activation records omitted than needed, drop the last one for the few-shot example if there are more than one activation record, then add the per-neuron explanation prompt.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":231-249",
+ "content": " elif (\n self.context_size == ContextSize.FOUR_K\n and num_omitted_activation_records < omit_n_activation_records\n ):\n # Drop the last activation record for this few-shot example to save tokens, assuming\n # there are at least two activation records.\n if len(few_shot_activation_records) > 1:\n print(f\"Warning: omitting activation record from few-shot example {i}\")\n few_shot_activation_records = few_shot_activation_records[:-1]\n num_omitted_activation_records += 1\n self._add_per_neuron_explanation_prompt(\n prompt_builder,\n few_shot_activation_records,\n i,\n calculate_max_activation(few_shot_example.activation_records),\n numbered_list_of_n_explanations=numbered_list_of_n_explanations,\n explanation=few_shot_example.explanation,\n )\n self._add_per_neuron_explanation_prompt("
+ },
+ {
+ "comment": "This snippet is part of the function that builds the explanation prompt. It finishes the call that adds the final neuron's prompt, passing the activation records (only the first two when the context size is 2K), the index `len(few_shot_examples)`, the maximum activation, the numbered-list setting, and `explanation=None`. If the prompt is too long and the requested number of activation records was actually omitted, it retries recursively with one more omission; if fewer records were omitted than requested, there is nothing left to omit and the prompt is used as-is.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":250-269",
+ "content": " prompt_builder,\n # If we're using a 2k context window, we only have room for two of the activation\n # records.\n all_activation_records[:2]\n if self.context_size == ContextSize.TWO_K\n else all_activation_records,\n len(few_shot_examples),\n max_activation,\n numbered_list_of_n_explanations=numbered_list_of_n_explanations,\n explanation=None,\n )\n # If the prompt is too long *and* we omitted the specified number of activation records, try\n # again, omitting one more. (If we didn't make the specified number of omissions, we're out\n # of opportunities to omit records, so we just return the prompt as-is.)\n if (\n self._prompt_is_too_long(prompt_builder, max_tokens_for_completion)\n and num_omitted_activation_records == omit_n_activation_records\n ):\n original_kwargs[\"omit_n_activation_records\"] = omit_n_activation_records + 1\n return self.make_explanation_prompt(**original_kwargs)"
+ },
+ {
+ "comment": "Function that adds per-neuron explanations to the prompt based on activation records and optional parameters.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":270-292",
+ "content": " return prompt_builder.build(self.prompt_format)\n def _add_per_neuron_explanation_prompt(\n self,\n prompt_builder: PromptBuilder,\n activation_records: Sequence[ActivationRecord],\n index: int,\n max_activation: float,\n # When set, this indicates that the prompt should solicit a numbered list of the given\n # number of explanations, rather than a single explanation.\n numbered_list_of_n_explanations: Optional[int],\n explanation: Optional[str], # None means this is the end of the full prompt.\n ) -> None:\n max_activation = calculate_max_activation(activation_records)\n user_message = f\"\"\"\nNeuron {index + 1}\nActivations:{format_activation_records(activation_records, max_activation, omit_zeros=False)}\"\"\"\n # We repeat the non-zero activations only if it was requested and if the proportion of\n # non-zero activations isn't too high.\n if (\n self.repeat_non_zero_activations\n and non_zero_activation_proportion(activation_records, max_activation) < 0.2"
+ },
+ {
+ "comment": "This code seems to be a part of an explainable AI model. It generates user and assistant messages based on neuron activations, and either provides the explanation for a specific neuron or solicits a numbered list of explanations for all neurons.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":293-315",
+ "content": " ):\n user_message += (\n f\"\\nSame activations, but with all zeros filtered out:\"\n f\"{format_activation_records(activation_records, max_activation, omit_zeros=True)}\"\n )\n if numbered_list_of_n_explanations is None:\n user_message += f\"\\nExplanation of neuron {index + 1} behavior:\"\n assistant_message = \"\"\n # For the IF format, we want <|endofprompt|> to come before the explanation prefix.\n if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:\n assistant_message += f\" {EXPLANATION_PREFIX}\"\n else:\n user_message += f\" {EXPLANATION_PREFIX}\"\n prompt_builder.add_message(Role.USER, user_message)\n if explanation is not None:\n assistant_message += f\" {explanation}.\"\n if assistant_message:\n prompt_builder.add_message(Role.ASSISTANT, assistant_message)\n else:\n if explanation is None:\n # For the final neuron, we solicit a numbered list of explanations."
+ },
+ {
+ "comment": "Continues the numbered-list branch of the per-neuron prompt. For the final neuron (no explanation supplied), it adds a user message soliciting `numbered_list_of_n_explanations` explanations, each beginning with EXPLANATION_PREFIX. For few-shot examples, it presents a single explanation formatted as a one-item numbered list and adds that explanation as the assistant message. It then begins `postprocess_explanations`, which reads `numbered_list_of_n_explanations` from the prompt kwargs.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":316-333",
+ "content": " prompt_builder.add_message(\n Role.USER,\n f\"\"\"\\nHere are {numbered_list_of_n_explanations} possible explanations for neuron {index + 1} behavior, each beginning with \"{EXPLANATION_PREFIX}\":\\n1. {EXPLANATION_PREFIX}\"\"\",\n )\n else:\n # For the few-shot examples, we only present one explanation, but we present it as a\n # numbered list.\n prompt_builder.add_message(\n Role.USER,\n f\"\"\"\\nHere is 1 possible explanation for neuron {index + 1} behavior, beginning with \"{EXPLANATION_PREFIX}\":\\n1. {EXPLANATION_PREFIX}\"\"\",\n )\n prompt_builder.add_message(Role.ASSISTANT, f\" {explanation}.\")\n def postprocess_explanations(\n self, completions: list[str], prompt_kwargs: dict[str, Any]\n ) -> list[Any]:\n \"\"\"Postprocess the explanations returned by the API\"\"\"\n numbered_list_of_n_explanations = prompt_kwargs.get(\"numbered_list_of_n_explanations\")"
+ },
+ {
+ "comment": "Code block checks if the \"numbered_list_of_n_explanations\" is None and returns the \"completions\". If it's not None, it iterates through each completion and explanation in a nested loop. For each explanation that starts with EXPLANATION_PREFIX, it removes the prefix and appends the trimmed explanation to all_explanations list. Finally, it returns the list of all explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":334-352",
+ "content": " if numbered_list_of_n_explanations is None:\n return completions\n else:\n all_explanations = []\n for completion in completions:\n for explanation in _split_numbered_list(completion):\n if explanation.startswith(EXPLANATION_PREFIX):\n explanation = explanation[len(EXPLANATION_PREFIX) :]\n all_explanations.append(explanation.strip())\n return all_explanations\nclass TokenSpaceRepresentationExplainer(NeuronExplainer):\n \"\"\"\n Generate explanations of arbitrary lists of tokens which disproportionately activate a\n particular neuron. These lists of tokens can be generated in various ways. As an example, in one\n set of experiments, we compute the average activation for each neuron conditional on each token\n that appears in an internet text corpus. We then sort the tokens by their average activation,\n and show 50 of the top 100 tokens. Other techniques that could be used include taking the top"
+ },
+ {
+ "comment": "This function initializes a new instance of the Explainer class. It takes in parameters like model name, prompt format, context size, few-shot example set, use_few_shot flag, output_numbered_list flag, max_concurrent, and cache. If use_few_shot is True, it asserts that few_shot_example_set is not None and sets self.few_shot_examples accordingly.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":353-380",
+ "content": " tokens in the logit lens or tuned lens representations of a neuron.\n \"\"\"\n def __init__(\n self,\n model_name: str,\n prompt_format: PromptFormat = PromptFormat.HARMONY_V4,\n context_size: ContextSize = ContextSize.FOUR_K,\n few_shot_example_set: TokenSpaceFewShotExampleSet = TokenSpaceFewShotExampleSet.ORIGINAL,\n use_few_shot: bool = False,\n output_numbered_list: bool = False,\n max_concurrent: Optional[int] = 10,\n cache: bool = False,\n ):\n super().__init__(\n model_name=model_name,\n prompt_format=prompt_format,\n context_size=context_size,\n max_concurrent=max_concurrent,\n cache=cache,\n )\n self.use_few_shot = use_few_shot\n self.output_numbered_list = output_numbered_list\n if self.use_few_shot:\n assert few_shot_example_set is not None\n self.few_shot_examples: Optional[TokenSpaceFewShotExampleSet] = few_shot_example_set\n else:\n self.few_shot_examples = None"
+ },
+ {
+ "comment": "Code snippet:\n```python\ndef make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n tokens: list[str] = kwargs.pop(\"tokens\")\n max_tokens_for_completion = kwargs.pop(\"max_tokens_for_completion\")\n assert not kwargs, f\"Unexpected kwargs: {kwargs}\"\n stringified_tokens = \", \".join([f\"'{t}'\" for t in tokens])\n prompt_builder = PromptBuilder()\n```\nComment: This function constructs a prompt to ask about the neuron's activation tokens. It takes the \"tokens\" and \"max_tokens_for_completion\" as input arguments, and uses PromptBuilder to build the final prompt.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":381-397",
+ "content": " self.prompt_prefix = (\n \"We're studying neurons in a neural network. Each neuron looks for some particular \"\n \"kind of token (which can be a word, or part of a word). Look at the tokens the neuron \"\n \"activates for (listed below) and summarize in a single sentence what the neuron is \"\n \"looking for. Don't list examples of words.\"\n )\n def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:\n tokens: list[str] = kwargs.pop(\"tokens\")\n max_tokens_for_completion = kwargs.pop(\"max_tokens_for_completion\")\n assert not kwargs, f\"Unexpected kwargs: {kwargs}\"\n # Note that this does not preserve the precise tokens, as e.g.\n # f\" {token_with_no_leading_space}\" may be tokenized as \"f{token_with_leading_space}\".\n # TODO(dan): Try out other variants, including \"\\n\".join(...) and \",\".join(...)\n stringified_tokens = \", \".join([f\"'{t}'\" for t in tokens])\n prompt_builder = PromptBuilder()"
+ },
+ {
+ "comment": "This code adds a prompt to the prompt builder. It starts with a system message, then adds few-shot examples if specified and prompts related to neurons. If the prompt is too long, it raises a ValueError. The code also includes an unimplemented feature for numbered lists in few-shot examples.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":398-417",
+ "content": " prompt_builder.add_message(Role.SYSTEM, self.prompt_prefix)\n if self.use_few_shot:\n self._add_few_shot_examples(prompt_builder)\n self._add_neuron_specific_prompt(prompt_builder, stringified_tokens, explanation=None)\n if self._prompt_is_too_long(prompt_builder, max_tokens_for_completion):\n raise ValueError(f\"Prompt too long: {prompt_builder.build(self.prompt_format)}\")\n else:\n return prompt_builder.build(self.prompt_format)\n def _add_few_shot_examples(self, prompt_builder: PromptBuilder) -> None:\n \"\"\"\n Append few-shot examples to the prompt. Each one consists of a comma-delimited list of\n tokens and corresponding explanations, as saved in\n alignment/neuron_explainer/weight_explainer/token_space_few_shot_examples.py.\n \"\"\"\n assert self.few_shot_examples is not None\n few_shot_example_list = self.few_shot_examples.get_examples()\n if self.output_numbered_list:\n raise NotImplementedError(\"Numbered list output not supported for few-shot examples\")"
+ },
+ {
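+ "comment": "Finishes `_add_few_shot_examples` by adding one neuron-specific prompt per few-shot example, passing its tokens as a comma-separated list of single-quoted strings together with its explanation. Then begins `_add_neuron_specific_prompt`, which builds a user message listing the tokens, followed by either an explanation (for few-shot examples) or the beginning of a completion for the model to finish. The INSTRUCTION_FOLLOWING prompt format is handled as a special case.",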
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":418-440",
+ "content": " else:\n for few_shot_example in few_shot_example_list:\n self._add_neuron_specific_prompt(\n prompt_builder,\n \", \".join([f\"'{t}'\" for t in few_shot_example.tokens]),\n explanation=few_shot_example.explanation,\n )\n def _add_neuron_specific_prompt(\n self,\n prompt_builder: PromptBuilder,\n stringified_tokens: str,\n explanation: Optional[str],\n ) -> None:\n \"\"\"\n Append a neuron-specific prompt to the prompt builder. The prompt consists of a list of\n tokens followed by either an explanation (if one is passed, for few shot examples) or by\n the beginning of a completion, to be completed by the model with an explanation.\n \"\"\"\n user_message = f\"\\n\\n\\n\\nTokens:\\n{stringified_tokens}\\n\\nExplanation:\\n\"\n assistant_message = \"\"\n looking_for = \"This neuron is looking for\"\n if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:"
+ },
+ {
+ "comment": "This code adds user and assistant messages to a prompt builder based on the prompt format, output numbered list preference, and explanation presence. The postprocess_explanations function then handles multiple explanations in a list format for completions.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":441-463",
+ "content": " # We want <|endofprompt|> to come before \"This neuron is looking for\" in the IF format.\n assistant_message += looking_for\n else:\n user_message += looking_for\n if self.output_numbered_list:\n start_of_list = \"\\n1.\"\n if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:\n assistant_message += start_of_list\n else:\n user_message += start_of_list\n if explanation is not None:\n assistant_message += f\"{explanation}.\"\n prompt_builder.add_message(Role.USER, user_message)\n if assistant_message:\n prompt_builder.add_message(Role.ASSISTANT, assistant_message)\n def postprocess_explanations(\n self, completions: list[str], prompt_kwargs: dict[str, Any]\n ) -> list[str]:\n if self.output_numbered_list:\n # Each list in the top-level list will have multiple explanations (multiple strings).\n all_explanations = []\n for completion in completions:"
+ },
+ {
+ "comment": "This code is parsing a completion list, extracting explanations and removing extra spaces.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py\":464-471",
+ "content": " for explanation in _split_numbered_list(completion):\n if explanation.startswith(EXPLANATION_PREFIX):\n explanation = explanation[len(EXPLANATION_PREFIX) :]\n all_explanations.append(explanation.strip())\n return all_explanations\n else:\n # Each element in the top-level list will be an explanation as a string.\n return [_remove_final_period(explanation) for explanation in completions]"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/5aaec5f8-9059-48e1-a939-c9c4a9151b26.json b/docs/doc/5aaec5f8-9059-48e1-a939-c9c4a9151b26.json
new file mode 100644
index 0000000..df578a9
--- /dev/null
+++ b/docs/doc/5aaec5f8-9059-48e1-a939-c9c4a9151b26.json
@@ -0,0 +1,20 @@
+{
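+ "summary": "The code defines dataclasses and two loader functions that fetch per-neuron related-token data from a public Azure dataset: a weight-based summary, and an activation-based lookup table containing tokens and their average activations for the specified neuron. Each loader builds a standardized URL, reads the JSON file, and deserializes its contents with `loads`.",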
+ "details": [
+ {
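+ "comment": "Defines the TokensAndWeights and WeightBasedSummaryOfNeuron dataclasses and the start of load_token_weight_connections_of_neuron, which loads the weight-based token connections of a neuron from an Azure dataset. The function is annotated to return a WeightBasedSummaryOfNeuron for the specified layer index and neuron index (its docstring mentions TokenLookupTableSummaryOfNeuron instead). It builds the file URL and passes it through standardize_azure_url.",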
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/token_connections.py\":0-32",
+ "content": "from dataclasses import dataclass\nfrom typing import List, Union\nimport blobfile as bf\nfrom neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass\nfrom neuron_explainer.azure import standardize_azure_url\nimport urllib.request\n@register_dataclass\n@dataclass\nclass TokensAndWeights(FastDataclass):\n tokens: List[str]\n strengths: List[float]\n@register_dataclass\n@dataclass\nclass WeightBasedSummaryOfNeuron(FastDataclass):\n input_positive: TokensAndWeights\n input_negative: TokensAndWeights\n output_positive: TokensAndWeights\n output_negative: TokensAndWeights\ndef load_token_weight_connections_of_neuron(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based\",\n) -> WeightBasedSummaryOfNeuron:\n \"\"\"Load the TokenLookupTableSummaryOfNeuron for the specified neuron.\"\"\"\n url = \"/\".join([dataset_path, str(layer_index), f\"{neuron_index}.json\"])\n url = standardize_azure_url(url)"
+ },
+ {
+ "comment": "This code loads a lookup table containing tokens and their average activations for a given neuron. The table is generated from the highest average activations across an internet text dataset, and the data is retrieved from an Azure URL.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/token_connections.py\":33-57",
+ "content": " with urllib.request.urlopen(url) as f:\n return loads(f.read(), backwards_compatible=False)\n@register_dataclass\n@dataclass\nclass TokenLookupTableSummaryOfNeuron(FastDataclass):\n \"\"\"List of tokens and the average activations of a given neuron in response to each\n respective token. These are selected from among the tokens in the vocabulary with the\n highest average activations across an internet text dataset, with the highest activations\n first.\"\"\"\n tokens: List[str]\n average_activations: List[float]\ndef load_token_lookup_table_connections_of_neuron(\n layer_index: Union[str, int],\n neuron_index: Union[str, int],\n dataset_path: str = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based\",\n) -> TokenLookupTableSummaryOfNeuron:\n \"\"\"Load the TokenLookupTableSummaryOfNeuron for the specified neuron.\"\"\"\n url = \"/\".join([dataset_path, str(layer_index), f\"{neuron_index}.json\"])\n url = standardize_azure_url(url)\n with urllib.request.urlopen(url) as f:"
+ },
+ {
+ "comment": "Final line of load_token_lookup_table_connections_of_neuron: reads the response body and deserializes it with loads, with backwards_compatible set to False.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/token_connections.py\":58-58",
+ "content": " return loads(f.read(), backwards_compatible=False)"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/5b02d16d-02f8-434e-b987-e98ef54b6ab7.json b/docs/doc/5b02d16d-02f8-434e-b987-e98ef54b6ab7.json
new file mode 100644
index 0000000..32c0557
--- /dev/null
+++ b/docs/doc/5b02d16d-02f8-434e-b987-e98ef54b6ab7.json
@@ -0,0 +1,30 @@
+{
+ "summary": "The code retrieves explanations, top tokens, neuron records, and top-connected neurons (as layer-neuron pairs) for a given neuron, using functions that load files from Azure Blob Storage. memoizeAsync is imported, but the memoized loader is commented out.",
+ "details": [
+ {
+ "comment": "This code defines two functions, `load_file_no_cache` and `load_file_az`, for loading data from a file. The first sends the file path as a JSON body in a POST request to the `/load_az` endpoint on port 8000 and returns the parsed JSON response. The second fetches the path directly with a GET request in CORS mode, logging an error and returning undefined when the response is not OK. `memoizeAsync` is imported but only referenced in a commented-out line. `load_file` is set to `load_file_no_cache` when the host contains 'localhost:' and to `load_file_az` otherwise.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/interpAPI.ts\":0-43",
+ "content": "import {Neuron} from './types';\nimport {memoizeAsync} from \"./utils\"\nexport const load_file_no_cache = async(path: string) => {\n const data = {\n path: path\n }\n const url = new URL(\"/load_az\", window.location.href)\n url.port = '8000';\n return await (\n await fetch(url, {\n method: \"POST\", // or 'PUT'\n headers: {\n \"Content-Type\": \"application/json\",\n },\n body: JSON.stringify(data),\n })\n ).json()\n}\nexport const load_file_az = async(path: string) => {\n const res = (\n await fetch(path, {\n method: \"GET\",\n mode: \"cors\",\n headers: {\n \"Content-Type\": \"application/json\",\n },\n })\n )\n if (!res.ok) {\n console.error(`HTTP error: ${res.status} - ${res.statusText}`);\n return;\n }\n return await res.json()\n}\n// export const load_file = memoizeAsync('load_file', load_file_no_cache)\nexport const load_file = window.location.host.indexOf('localhost:') === -1 ? load_file_az : load_file_no_cache;\n// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation)"
+ },
+ {
+ "comment": "The code defines constants for the paths to neuron records, explanations, and related tokens (weight-based). The commented-out lines keep the earlier `az://` locations for reference; the active constants point to the public `openaipublic` Azure Blob Storage account (blob.core.windows.net). These paths are used to access the necessary data for interpretation tasks.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/interpAPI.ts\":44-54",
+ "content": "// const NEURON_RECORDS_PATH = \"az://oaisbills/rcall/oss/migrated_make_crow_datasets/gpt2_xl_n_50000_64_token/neurons\"\nconst NEURON_RECORDS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations\"\n// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation/neurons/explanations/canonical-run-v1)\n// const EXPLANATIONS_PATH = \"az://oaisbills/rcall/oss/migrated_explanation_datasets/canonical_gpt2_xl_all_neurons\"\nconst EXPLANATIONS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/explanations\"\n// weight-based\n// const WHOLE_LAYER_WEIGHT_TOKENS_PATH = \"az://oaidan/rcall/data/interpretability/connections/gpt2-xl/mlp/unnorm_token_representations_uncommon_vanilla\"\n// const WEIGHT_TOKENS_PATH = \"az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/weight-based\"\nconst WEIGHT_TOKENS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based\""
+ },
+ {
+ "comment": "This code defines constants for storage locations of lookup table and connection paths, and functions to retrieve explanations and top tokens based on a given neuron and weight type. The code also uses Azure Blob Storage to load JSON files containing explanation data and token representations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/interpAPI.ts\":55-72",
+ "content": "// lookup table\n// const WHOLE_LAYER_ACTIVATION_TOKENS_PATH = \"az://oaidan/rcall/data/interpretability/connections/gpt2_xl/mlp/unnorm_token_representations_vanilla_and_common_in_colangv2_unigram\"\n// const ACTIVATION_TOKENS_PATH = \"az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/lookup-table\"\nconst ACTIVATION_TOKENS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based\"\n// const CONNECTIONS_PATH = \"az://oaialignment/datasets/interp/connections/gpt2/neuron_space/incl_attn_False\"\nconst CONNECTIONS_PATH = \"https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-neurons/weight-based\"\nexport const get_explanations = async (activeNeuron: Neuron) => {\n const result = await load_file(`${EXPLANATIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.jsonl`)\n return result\n}\nexport const get_top_tokens = async (activeNeuron: Neuron, weightType: string) => {\n let TOKENS_PATH;\n if (weightType === 'weight') {\n TOKENS_PATH = WEIGHT_TOKENS_PATH;"
+ },
+ {
+ "comment": "Checks the weightType and sets the corresponding TOKENS_PATH for loading neuron data. If an invalid weightType is given, throws an error. Loads and returns the neuron data from the specified file path.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/interpAPI.ts\":73-96",
+ "content": " } else if (weightType === 'activation') {\n TOKENS_PATH = ACTIVATION_TOKENS_PATH;\n } else {\n throw new Error(`Invalid weightType: ${weightType}`)\n }\n const result = await load_file(`${TOKENS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)\n return result\n // const result = await load_file_no_cache(`${ORIG_TOKENS_PATH}/${activeNeuron.layer}.json`)\n // return result.neuron_summaries[activeNeuron.neuron]\n}\nexport const get_top_neuron_connections = async (activeNeuron: Neuron) => {\n const result = await load_file(`${CONNECTIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)\n const res: {[key: string]: [number, number]} = {};\n [\"input\", \"output\"].forEach((direction) => {\n const sign = \"positive\" // \"negative\"\n const weight_name: string = {output: \"c_proj\", input: \"c_fc\"}[direction] as string;\n const res_for_dir = result[weight_name];\n if (res_for_dir === null) {\n return\n }\n // let key = 'top_negative_neurons'\n c"
+ },
+ {
+ "comment": "This code retrieves the top-connected neurons for a given direction and sign from a result object, maps them to layer, neuron, and weight tuples, and returns the top 10 layer-neuron pairs. It also defines a function `get_neuron_record` that asynchronously loads a JSON file representing a neuron's record based on its layer and neuron ID.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/interpAPI.ts\":96-111",
+ "content": "onst top_neuron_strs = res_for_dir[`top_${sign}_neurons`] // {layer}_{neuron} strings for each top-connected neuron\n const top_weights = res_for_dir[`top_${sign}_weights`]\n const top_layer_neuron_tuples = top_neuron_strs.map((neuron_str: string, i: number) => {\n const [layer, neuron] = neuron_str.split(\"_\").map((x: string) => parseInt(x))\n return [layer, neuron, top_weights[i]] as [number, number, number]\n })\n res[direction] = top_layer_neuron_tuples.slice(0, 10)\n })\n return res\n}\nexport const get_neuron_record = async(activeNeuron: Neuron) => {\n const result = await load_file(`${NEURON_RECORDS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)\n return result\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/6d0f1122-d655-4dbf-8076-a3aa9e37a834.json b/docs/doc/6d0f1122-d655-4dbf-8076-a3aa9e37a834.json
new file mode 100644
index 0000000..cd92eb0
--- /dev/null
+++ b/docs/doc/6d0f1122-d655-4dbf-8076-a3aa9e37a834.json
@@ -0,0 +1,50 @@
+{
+ "summary": "The code provides a NeuronForm component that uses hooks to handle layers and neurons, featuring an array of predefined text classification neurons. The Neuron Viewer tool allows users to view specific details or select neurons randomly.",
+ "details": [
+ {
+ "comment": "This code imports React hooks and defines a NeuronForm component that utilizes useState to store the layer and neuron values. It also uses useNavigate from react-router-dom for navigation. The code includes an array of objects representing known good neurons with their respective layers, neurons, labels, and descriptions.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":0-17",
+ "content": "import { useState, FormEvent } from \"react\"\nimport { useNavigate } from \"react-router-dom\"\nfunction NeuronForm() {\n const [input_layer, setLayer] = useState(0)\n const [input_neuron, setNeuron] = useState(0)\n const navigate = useNavigate()\n const knownGoodNeurons = [\n /**************\n /* well explained + interesting\n ***************/\n {heading: 'Somewhat well explained by GPT-4', layer: 0, neuron: 0, label: ''},\n {layer: 5, neuron: 131, label: \"citations\", description: \"citations, especially biblical and legal\"},\n {layer: 12, neuron: 847, label: \"numbers in fractions\", description: \"numbers in fractions\"}, // \n {layer: 12, neuron: 5820, label: \"short flags\", description: \"single letter command line flags\"}, // \n {layer: 14, neuron: 417, label: \"doing things right\", description: \"words and phrases related to performing actions correctly or properly\"}, // score 0.42\n {layer: 15, neuron: 4538, label: \"leading transitions\", description: \"transition words at the start of documents\"},"
+ },
+ {
+ "comment": "This code continues the array of well-explained example neurons. Each entry includes the layer, neuron number, label, and description. The trailing `// score` comments record a numeric score for some entries (presumably the explanation score; the code itself does not say). The span ends with the heading entry for the 'Partially explained by GPT-4' group.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":18-26",
+ "content": " {layer: 17, neuron: 3218, label: \"success\", description: \"expressions of completion or success\"}, // score 0.38\n {layer: 18, neuron: 5302, label: \"X *by*\", description: \"the word 'by' in phrases indicating side by side or sequential events.\"}, // score 0.48\n {layer: 19, neuron: 1377, label: \"similes\", description: \"comparisons and analogies, often using the word 'like'\"}, // score 0.42\n {layer: 21, neuron: 2932, label: \"Canada\", description: \"references to Canadian people, places, and entities\"}, // score 0.78\n {layer: 25, neuron: 2602, label: \"similes\", description: \"descriptive comparisons, especially similes\"}, // score 0.40\n {layer: 25, neuron: 4870, label: \"certainty\", description: \"phrases related to certainty and confidence.\"}, // score 0.37\n {layer: 30, neuron: 28, label: \"times\", description: \"specific times (with hours and minutes)\"}, \n // https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/5/neurons/2326\n {heading: 'Partially explained by GPT-4', layer: 0, neuron: 0, label: ''},"
+ },
+ {
+ "comment": "These are individual neuron definitions for various layers in a neural network, each with a specific label and description. The numbers represent unique identifiers for these neurons.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":27-35",
+ "content": " {layer: 0, neuron: 816, label: \"Marvel comics vibes\", description: \"language and context related to Marvel comics, movies, and characters, as well as other superhero-themed content\"}, // score 0.44\n {layer: 0, neuron: 742, label: \"Second token 'and'\", description: \"'and', 'in', and punctuation at the second token\"},\n {layer: 4, neuron: 4342, label: \"token counter\", description: \"counting repeated occurrences of a token\"},\n {layer: 5, neuron: 2326, label: \"rhymes with 'at'\", description: \"syllables rhyming with 'at', sometimes 'it', 'et', 'ot'\"},\n {layer: 5, neuron: 4492, label: \"leading 'an'\", description: \"sentences that start with 'an'\"}, // score 0.77\n {layer: 6, neuron: 3251, label: \"not all\", description: \"not all\"},\n {layer: 10, neuron: 2851, label: \"leading acronyms\", description: \"acronyms after punctuation or newlines\"},\n {layer: 12, neuron: 2884, label: \"hypothetical had\", description: \"had in hypothetical contexts\"}, // \n {layer: 14, neuron: 3539, label: \"long sequences\", description: \"long sequences of stuff\"},"
+ },
+ {
+ "comment": "This code represents a collection of layers and neurons with their respective labels and descriptions. The comments describe the meaning or purpose behind each entry, such as \"X by/after *X*\", which refers to noun repetitions separated by 'by' or 'after', and \"any *and* all\" for any/anything & all/everything. Some entries are marked as poorly explained, interesting, or have specific contexts like Hillary Clinton leaked emails. The scores represent the relevance of these entries in the context.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":36-51",
+ "content": " {layer: 14, neuron: 3822, label: \"X by/after *X*\", description: \"noun repetitions separated by 'by' or 'after'\"},\n {layer: 21, neuron: 3982, label: \"any *and* all\", description: \"any/anything *and/&* all/everything\"},\n {layer: 26, neuron: 20, label: \"truth, skin, or sun\", description: \"truth, skin, or sun\"},\n // layer=18&neuron=5302\n /**************\n /* boring\n ***************/\n /**************\n /* poorly explained + interesting\n ***************/\n {heading: 'Poorly explained by GPT-4', layer: 0, neuron: 0, label: ''},\n // Actually activates for negated version \u201cnot so much \u2026 as\u201d even when not so much is fairly far apart\n // another \"not all\": 13&neuron=1352\n // {layer: 0, neuron: 2823, label: \"Hillary email leak vibes\", description: \"contexts related to Hillary Clinton leaked emails\"}, // score ??\n // {layer: 12, neuron: 3718, label: \"comparative phrases and negations\", description: \"comparative phrases and negations\"}, // score 0.12\n {layer: 13, neuron: 410, label: \"N and N+1\", description: \"a number following its predecessor\"}, // score ??"
+ },
+ {
+ "comment": "Code represents a list of neurons in the Neuron Viewer tool, each with a layer, neuron ID, label, description, and possibly a score noted in a trailing comment. These entries follow the 'Poorly explained by GPT-4' heading. The labels name the linguistic patterns or features each neuron appears to respond to, and the descriptions give more detail.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":52-62",
+ "content": " {layer: 13, neuron: 979, label: \"subtle plurals\", description: \"subtle/nonobvious plurals\"}, // score ??\n // slash after number 12&neuron=847\n // numbers predicting slash: 14&neuron=92\n // 0&neuron=2823\n {layer: 14, neuron: 1251, label: \"subjunctive verbs\", description: \"verbs in subjunctive mood\"}, // score ??\n {layer: 16, neuron: 518, label: \"pattern breaks\", description: \"tokens that break an established pattern in an ongoing list\"}, // score 0.2 with totally wrong explanation\n {layer: 17, neuron: 821, label: \"idioms\", description: \"idioms\"},\n {layer: 18, neuron: 3481, label: \"post-typo\", description: \"first token following a typo\"}, // score ??\n {layer: 18, neuron: 3552, label: \"repeated text\", description: \"repeated text\"}, // score ??\n // another shared last names: https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/20/neurons/3164\n {layer: 19, neuron: 1763, label: \"shared last names\", description: \"last names when two different people sharing last name are mentioned\"}, // score 0.36"
+ },
+ {
+ "comment": "These are examples of neurons with their associated labels, descriptions, and potential scores. The handleSubmit function handles form submission to navigate to a specific layer and neuron. The handleNeuronClick function navigates to a specific neuron when clicked.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":63-82",
+ "content": " {layer: 20, neuron: 4334, label: \"previous break\", description: \"tokens that previously preceded a linebreak\"}, // score ??\n {layer: 27, neuron: 116, label: \"MTG vibes\", description: \"Magic the Gathering contexts\"}, // score ??\n {layer: 35, neuron: 1523, label: \"NBA name predictor\", description: \"NBA person/player name predictor\"}, // score ??\n // {layer: 36, neuron: 2275, label: \"she predictor\", description: \"prediction of the token 'she'\"}, // score ??\n // {layer: 36, neuron: 5107, label: \"Mormon vibes\", description: \"Mormon related context\"}, // score ??\n // ] predictor 40&neuron=4505\n {layer: 46, neuron: 2181, label: \"C predictor\", description: \"prediction of the token 'C'\"}, // score ??\n ]\n const handleSubmit = (e: FormEvent) => {\n e.preventDefault()\n navigate(`/layers/${input_layer}/neurons/${input_neuron}`)\n return false\n }\n const handleNeuronClick = (layer: number, neuron: number) => {\n navigate(`/layers/${layer}/neurons/${neuron}`)\n }\n const feelingLuckySubmit = () => {"
+ },
+ {
+ "comment": "Picks a random layer (0-47) and neuron (0-6399) and navigates to that neuron's page. The component then returns its rendered markup, which presumably lets the user enter a layer and neuron index (handleSubmit navigates to the entered values), and NeuronForm is exported as the default export.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/welcome.tsx\":83-115",
+ "content": " const layer = Math.floor(Math.random() * 48);\n const neuron = Math.floor(Math.random() * 6400);\n navigate(`/layers/${layer}/neurons/${neuron}`)\n return false\n }\n return (\n
\n )\n}\nexport default NeuronForm"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/73b33b04-9b33-4739-96d4-8b216073a031.json b/docs/doc/73b33b04-9b33-4739-96d4-8b216073a031.json
new file mode 100644
index 0000000..27144b0
--- /dev/null
+++ b/docs/doc/73b33b04-9b33-4739-96d4-8b216073a031.json
@@ -0,0 +1,25 @@
+{
+ "summary": "The code imports components, generates sequence lists, fetches and displays data, normalizes sequences of activations, renders them with labels, provides a button for data visibility, and allows users to visualize heatmaps through iterating slices of data.",
+ "details": [
+ {
+ "comment": "Importing HeatmapGrid component and React, useState, and useEffect hooks from 'react'. Defining a zip_sequences function that takes in sequences as input. Exporting a default functional component that takes an activeNeuron prop. Inside the component, setting up state variables for data, showingMore, and isLoading using useState hook. Using the useEffect hook to fetch data when the component mounts or if there's a change in the activeNeuron prop. The fetched data is then used to create all_sequences array, which contains objects with label, sequences, and default_show properties.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/datasetList.jsx\":0-35",
+ "content": "import HeatmapGrid from \"../heatmapGrid\"\nimport React, { useEffect, useState } from \"react\"\nimport { normalizeTokenActs } from \"../types\"\nimport {get_neuron_record} from \"../interpAPI\"\nfunction zip_sequences(sequences) {\n return sequences.map(({ activations, tokens }) => {\n return tokens.map((token, idx) => ({\n token,\n activation: activations[idx],\n }))\n })\n}\nexport default ({ activeNeuron }) => {\n const [data, setData] = useState(null)\n const [showingMore, setShowingMore] = useState({})\n const [isLoading, setIsLoading] = useState(true)\n useEffect(() => {\n async function fetchData() {\n if (data) {\n return\n }\n const result = await get_neuron_record(activeNeuron)\n console.log(result)\n const all_sequences = []\n all_sequences.push({\n // label: '[0.999, 1] (Top quantile, sorted. 50 of 50000)',\n label: 'Top',\n sequences: zip_sequences(result.most_positive_activation_records),\n default_show: 4,\n })\n all_sequences.push({\n label: 'Quantile range [0.99, 0.999] sample',"
+ },
+ {
+ "comment": "This code generates a list of sequences for different quantile ranges and a random sample. It adds these sequences to the all_sequences array, which will be used later in the program. The intervals used here are defined as a reference.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/datasetList.jsx\":36-62",
+ "content": " sequences: zip_sequences(result.random_sample_by_quantile[3]),\n default_show: 1,\n })\n all_sequences.push({\n label: 'Quantile range [0.9, 0.99] sample',\n sequences: zip_sequences(result.random_sample_by_quantile[2]),\n default_show: 1,\n })\n all_sequences.push({\n label: 'Quantile range [0.5, 0.9] sample',\n sequences: zip_sequences(result.random_sample_by_quantile[1]),\n default_show: 1,\n })\n all_sequences.push({\n label: 'Quantile range [0, 0.5] sample',\n sequences: zip_sequences(result.random_sample_by_quantile[0]),\n default_show: 1,\n })\n all_sequences.push({\n // label: '[0, 1] (Random)',\n label: 'Random sample',\n sequences: zip_sequences(result.random_sample),\n default_show: 2,\n })\n // for reference\n // intervals = [(0, 1), (0, 0.5), (0.5, 0.9), (0.9, 0.99), (0.99, 0.999), (0.999, 1)]\n // saved_activations_by_interval = [neuron_record.random_sample] + neuron_record.random_sample_by_decile[:-1] + [neuron_record.top_activations]"
+ },
+ {
+ "comment": "This code fetches data for a dataset and displays it in a loading state until the data is available. It then normalizes the sequences of activations and renders them with their labels. A button allows users to show more or less of the data depending on the label.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/datasetList.jsx\":63-93",
+ "content": " setData(all_sequences)\n setIsLoading(false)\n }\n fetchData()\n }, [activeNeuron])\n if (isLoading) {\n return (\n
\n )\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/7a510cfd-f907-4821-b9ec-eaca056823c7.json b/docs/doc/7a510cfd-f907-4821-b9ec-eaca056823c7.json
new file mode 100644
index 0000000..34e8cb1
--- /dev/null
+++ b/docs/doc/7a510cfd-f907-4821-b9ec-eaca056823c7.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This function converts an Azure path starting with \"az://openaipublic/\" into the equivalent \"https://openaipublic.blob.core.windows.net/\" URL; any other URL is returned unchanged.",
+ "details": [
+ {
+ "comment": "This function converts an Azure path starting with \"az://openaipublic/\" into the equivalent \"https://openaipublic.blob.core.windows.net/\" URL; any other URL is returned unchanged.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/azure.py\":0-4",
+ "content": "def standardize_azure_url(url):\n \"\"\"Make sure url is converted to url format, not an azure path\"\"\"\n if url.startswith(\"az://openaipublic/\"):\n url = url.replace(\"az://openaipublic/\", \"https://openaipublic.blob.core.windows.net/\")\n return url"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/89719fe2-702d-4e98-aa4f-8e449e89513a.json b/docs/doc/89719fe2-702d-4e98-aa4f-8e449e89513a.json
new file mode 100644
index 0000000..5b37d13
--- /dev/null
+++ b/docs/doc/89719fe2-702d-4e98-aa4f-8e449e89513a.json
@@ -0,0 +1,30 @@
+{
+ "summary": "The code imports React, fetches token data from an API, renders loading indicator, displays tokens with interactive elements and tooltips, limited to 20 input tokens, and formats the tokens in rounded-full boxes with red text color.",
+ "details": [
+ {
+ "comment": "This code imports React along with the useState and useEffect hooks. It defines a TokenDisplay component that fetches related tokens data through get_top_tokens from interpAPI (both weight-based and activation-based) for the activeNeuron prop. It shows a loading indicator until the data is loaded, then renders the mean-activation-based token information.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/topTokens.jsx\":0-40",
+ "content": "import React, { useState, useEffect } from \"react\"\nimport { get_top_tokens } from \"../interpAPI\"\nconst TokenDisplay = ({ activeNeuron }) => {\n const [isLoading, setIsLoading] = useState(true)\n const [data, setData] = useState(null)\n const loadTokens = async () => {\n setIsLoading(true)\n const weightStrengths = await get_top_tokens(activeNeuron, 'weight')\n const activationStrengths = await get_top_tokens(activeNeuron, 'activation')\n const data = {\n activeNeuron,\n weightStrengths,\n activationStrengths,\n }\n setData(data)\n setIsLoading(false)\n }\n useEffect(() => {\n if (!data) {\n loadTokens()\n }\n }, [])\n return (\n
\n
Related tokens
\n {isLoading ? (\n
\n
loading tokens
\n
\n ) : (\n <>\n
Mean-activation-based
\n
"
+ },
+ {
+ "comment": "The code is mapping over tokens based on activation strengths and weight strengths to display them as interactive elements with their respective strength values. It also includes a limit of 20 input tokens for the weight-based section.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/topTokens.jsx\":41-62",
+ "content": " {data.activationStrengths.tokens.map((token, idx) => {\n return (\n data.activationStrengths.average_activations[idx] === null ? null :\n \n {token}\n \n )\n })}\n
"
+ },
+ {
+ "comment": "Output tokens section:\nIterates over top 20 output positive tokens and displays with strength information in tooltip.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/topTokens.jsx\":88-110",
+ "content": "
\n )\n}\nexport default TokenDisplay"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/8a132a79-3eb9-41fe-ba28-e4ac23f71aef.json b/docs/doc/8a132a79-3eb9-41fe-ba28-e4ac23f71aef.json
new file mode 100644
index 0000000..f62fa96
--- /dev/null
+++ b/docs/doc/8a132a79-3eb9-41fe-ba28-e4ac23f71aef.json
@@ -0,0 +1,35 @@
+{
+ "summary": "The code imports React, creates a component with loading and toggle switch, and displays top 5 or remaining sequences based on toggle value in the interface.",
+ "details": [
+ {
+ "comment": "Code imports React and related hooks, defines utility functions for zipping simulated and real sequences, and sets up state variables and a function to load explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/explanation.jsx\":0-32",
+ "content": "import React, { useState, useEffect } from \"react\"\nimport { get_explanations } from \"../interpAPI\"\n// import HeatmapGrid from \"../heatmapGrid\"\nimport SimulationHeatmap from \"../simulationHeatmap\"\nimport { normalizeTokenActs } from \"../types\"\nfunction zip_simulated_sequences(sequences) {\n return sequences.map(({ simulation }) => {\n return simulation.tokens.map((token, idx) => ({\n token,\n activation: simulation.expected_activations[idx],\n }))\n })\n}\nfunction zip_real_sequences(sequences) {\n return sequences.map(({ simulation, true_activations }) => {\n return simulation.tokens.map((token, idx) => ({\n token,\n activation: true_activations[idx],\n }))\n })\n}\nconst ExplanationDisplay = ({ activeNeuron }) => {\n const [isLoading, setIsLoading] = useState(true)\n const [data, setData] = useState(null)\n const [showingScoringDetails, setShowingScoringDetails] = useState(false)\n const [toggle, setToggle] = useState(false);\n const loadExplanation = async () => {\n const result = await get_explanations(activeNeuron);"
+ },
+ {
+ "comment": "This code stores the first scored explanation in state and clears the loading flag, then uses useEffect to load the explanation on mount if no data is present, and defines the toggle handler. When data is available, it zips the simulated and real activation sequences and normalizes their token activations; otherwise both default to empty arrays. It also builds a pre-filled Google Form link, parameterized by the active neuron's layer and neuron index, for suggesting an explanation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/explanation.jsx\":33-66",
+ "content": " setData(result.scored_explanations[0])\n setIsLoading(false)\n }\n useEffect(() => {\n if (!data) {\n loadExplanation()\n }\n }, [])\n const handleToggleChange = () => {\n setToggle(!toggle);\n };\n let sim_sequences;\n if (data) {\n sim_sequences = zip_simulated_sequences(data.scored_simulation.scored_sequence_simulations);\n [sim_sequences] = normalizeTokenActs(sim_sequences)\n } else {\n sim_sequences = []\n }\n let real_sequences;\n if (data) {\n real_sequences = zip_real_sequences(data.scored_simulation.scored_sequence_simulations);\n [real_sequences] = normalizeTokenActs(real_sequences)\n } else {\n real_sequences = []\n }\n const suggest_explanation_link = \"https://docs.google.com/forms/d/e/1FAIpQLSckMyDQedGhdISIqaqn0YGUtd2xqEWgPu7ehoPUTT2pTge_-g/viewform?\"\n + `usp=pp_url&entry.541490611=${activeNeuron.layer}`\n + `&entry.1688855196=${activeNeuron.neuron}`\n + `&entry.495312202=https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html%23/layers/${activeNeuron.layer}/neurons/${activeNeuron.neuron}`;"
+ },
+ {
+ "comment": "This code represents a component in a React application that displays an explanation, along with the option to suggest a better one and toggle scoring details. The explanation is stored in 'data' prop, and a loading state is shown when 'isLoading' is true.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/explanation.jsx\":68-95",
+ "content": " return (\n <>\n
\n >\n )\n}\nexport default ExplanationDisplay"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/8b815b51-4cf7-4970-992c-fdae54a83594.json b/docs/doc/8b815b51-4cf7-4970-992c-fdae54a83594.json
new file mode 100644
index 0000000..51b5139
--- /dev/null
+++ b/docs/doc/8b815b51-4cf7-4970-992c-fdae54a83594.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This directory contains a version of our code for generating, simulating and scoring explanations of neuron behavior.",
+ "details": [
+ {
+ "comment": "This directory contains a version of our code for generating, simulating and scoring explanations of neuron behavior, with setup instructions and links to demo notebooks.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/README.md\":0-17",
+ "content": "# Neuron explainer\nThis directory contains a version of our code for generating, simulating and scoring explanations of\nneuron behavior.\n# Setup\n```\npip install -e .\n```\n# Usage\nFor example usage, see the `demos` folder:\n* [Generating and scoring activation-based explanations](demos/generate_and_score_explanation.ipynb)\n* [Generating and scoring explanations based on tokens with high average activations](demos/generate_and_score_token_look_up_table_explanation.ipynb)\n* [Generating explanations for human-written neuron puzzles](demos/explain_puzzles.ipynb)"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/902abdb1-02cb-401b-ba84-eaa0bdd58895.json b/docs/doc/902abdb1-02cb-401b-ba84-eaa0bdd58895.json
new file mode 100644
index 0000000..32ed8c3
--- /dev/null
+++ b/docs/doc/902abdb1-02cb-401b-ba84-eaa0bdd58895.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This code is a setup script for the \"neuron_explainer\" package. It specifies package name, dependencies, and required Python version.",
+ "details": [
+ {
+ "comment": "This code is a setup script for the \"neuron_explainer\" package. It specifies package name, dependencies, and required Python version.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/setup.py\":0-20",
+ "content": "from setuptools import setup, find_packages\nsetup(\n name=\"neuron_explainer\",\n packages=find_packages(),\n version=\"0.0.1\",\n author=\"OpenAI\",\n install_requires=[\n \"httpx>=0.22\",\n \"scikit-learn\",\n \"boostedblob>=0.13.0\",\n \"tiktoken\",\n \"blobfile\",\n \"numpy\",\n \"pytest\",\n \"orjson\",\n ],\n url=\"\",\n description=\"\",\n python_requires='>=3.9',\n)"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/95e64e04-191f-4295-ba68-12b8cbdda9f2.json b/docs/doc/95e64e04-191f-4295-ba68-12b8cbdda9f2.json
new file mode 100644
index 0000000..07051de
--- /dev/null
+++ b/docs/doc/95e64e04-191f-4295-ba68-12b8cbdda9f2.json
@@ -0,0 +1,20 @@
+{
+ "summary": "The code loads neuron data, generates an explanation using an explainer model, and sets up a simulator to evaluate the impact of explanations on neuron output, then performs simulations with given activation records and prints preferred scores.",
+ "details": [
+ {
+ "comment": "This code sets the OpenAI API key, imports necessary modules, defines constants for explainer and simulator models, and loads a neuron record.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/generate_and_score_explanation.py\":0-32",
+ "content": "#!/usr/bin/env python\n# coding: utf-8\n# In[ ]:\nget_ipython().run_line_magic('load_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\n# In[ ]:\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\nfrom neuron_explainer.activations.activation_records import calculate_max_activation\nfrom neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\nfrom neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\nfrom neuron_explainer.explanations.explainer import TokenActivationPairExplainer\nfrom neuron_explainer.explanations.prompt_builder import PromptFormat\nfrom neuron_explainer.explanations.scoring import simulate_and_score\nfrom neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\nEXPLAINER_MODEL_NAME = \"gpt-4\"\nSIMULATOR_MODEL_NAME = \"text-davinci-003\"\n# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n# Load a neuron record."
+ },
+ {
+ "comment": "Loading the neuron record for layer 9, neuron 6236.\nCreating activation record slice parameters and loading the train and validation activation records.\nGenerating an explanation for the selected neuron using the specified explainer model.\nAsserting that exactly one explanation was returned and storing it in the variable \"explanation\".\nSetting up a simulator, built from the explanation, so the explanation can be simulated and scored.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/generate_and_score_explanation.py\":33-64",
+ "content": "neuron_record = load_neuron(9, 6236)\n# Grab the activation records we'll need.\nslice_params = ActivationRecordSliceParams(n_examples_per_split=5)\ntrain_activation_records = neuron_record.train_activation_records(\n activation_record_slice_params=slice_params\n)\nvalid_activation_records = neuron_record.valid_activation_records(\n activation_record_slice_params=slice_params\n)\n# Generate an explanation for the neuron.\nexplainer = TokenActivationPairExplainer(\n model_name=EXPLAINER_MODEL_NAME,\n prompt_format=PromptFormat.HARMONY_V4,\n max_concurrent=1,\n)\nexplanations = await explainer.generate_explanations(\n all_activation_records=train_activation_records,\n max_activation=calculate_max_activation(train_activation_records),\n num_samples=1,\n)\nassert len(explanations) == 1\nexplanation = explanations[0]\nprint(f\"{explanation=}\")\n# Simulate and score the explanation.\nsimulator = UncalibratedNeuronSimulator(\n ExplanationNeuronSimulator(\n SIMULATOR_MODEL_NAME,\n explanation,\n max_concurrent=1,"
+ },
+ {
+ "comment": "Performs simulation with given activation records and prints preferred score.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/generate_and_score_explanation.py\":65-69",
+ "content": " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n )\n)\nscored_simulation = await simulate_and_score(simulator, valid_activation_records)\nprint(f\"score={scored_simulation.get_preferred_score():.2f}\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/99f23668-8f5c-4437-8406-66d38691ca3e.json b/docs/doc/99f23668-8f5c-4437-8406-66d38691ca3e.json
new file mode 100644
index 0000000..f5ba86a
--- /dev/null
+++ b/docs/doc/99f23668-8f5c-4437-8406-66d38691ca3e.json
@@ -0,0 +1,30 @@
+{
+ "summary": "This code includes a `PromptFormat` class for formatting methods, a `HarmonyMessage` dictionary for roles and content, and a `PromptBuilder` class to create prompts with token counting using GPT-4 encoding. It checks roles, creates deep copies of messages, and handles system messages. The code also checks the last user message and appends \"<|endofprompt|>\" before returning either a list of messages' contents or concatenating them into a single string, while raising a ValueError for unknown prompt formats.",
+ "details": [
+ {
+ "comment": "The code defines a class `PromptFormat` which is an enumeration of different prompt formatting methods. The `HarmonyMessage` is a typed dictionary defining the role and content of each message in the prompt. There's also a method `from_string` that returns the corresponding `PromptFormat` from a string input.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py\":0-37",
+ "content": "from __future__ import annotations\nfrom enum import Enum\nfrom typing import TypedDict, Union\nimport tiktoken\nHarmonyMessage = TypedDict(\n \"HarmonyMessage\",\n {\n \"role\": str,\n \"content\": str,\n },\n)\nclass PromptFormat(str, Enum):\n \"\"\"\n Different ways of formatting the components of a prompt into the format accepted by the relevant\n API server endpoint.\n \"\"\"\n NONE = \"none\"\n \"\"\"Suitable for use with models that don't use special tokens for instructions.\"\"\"\n INSTRUCTION_FOLLOWING = \"instruction_following\"\n \"\"\"Suitable for IF models that use <|endofprompt|>.\"\"\"\n HARMONY_V4 = \"harmony_v4\"\n \"\"\"\n Suitable for Harmony models that use a structured turn-taking role+content format. Generates a\n list of HarmonyMessage dicts that can be sent to the /chat/completions endpoint.\n \"\"\"\n @classmethod\n def from_string(cls, s: str) -> PromptFormat:\n for prompt_format in cls:\n if prompt_format.value == s:\n return prompt_format\n raise ValueError(f\"{s} is not a valid PromptFormat\")"
+ },
+ {
+ "comment": "This code defines a PromptBuilder class for creating prompts. It initializes an empty list of HarmonyMessages and has methods to add messages and calculate the prompt's length in tokens using GPT-4 encoding.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py\":40-65",
+ "content": "class Role(str, Enum):\n \"\"\"See https://platform.openai.com/docs/guides/chat\"\"\"\n SYSTEM = \"system\"\n USER = \"user\"\n ASSISTANT = \"assistant\"\nclass PromptBuilder:\n \"\"\"Class for accumulating components of a prompt and then formatting them into an output.\"\"\"\n def __init__(self) -> None:\n self._messages: list[HarmonyMessage] = []\n def add_message(self, role: Role, message: str) -> None:\n self._messages.append(HarmonyMessage(role=role, content=message))\n def prompt_length_in_tokens(self, prompt_format: PromptFormat) -> int:\n # TODO(sbills): Make the model/encoding configurable. This implementation assumes GPT-4.\n encoding = tiktoken.get_encoding(\"cl100k_base\")\n if prompt_format == PromptFormat.HARMONY_V4:\n # Approximately-correct implementation adapted from this documentation:\n # https://platform.openai.com/docs/guides/chat/introduction\n num_tokens = 0\n for message in self._messages:\n num_tokens += ("
+ },
+ {
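+ "comment": "This code finishes the token count for the HARMONY_V4 format: each message contributes a fixed overhead of 4 tokens plus the encoded length of its content, and 2 tokens are added for priming the assistant reply. For any other prompt format, it builds the prompt as a string and returns its encoded length. The build method, whose definition begins here, validates the messages (e.g. the alternation of assistant and user messages) and returns either a string or a list of HarmonyMessages.",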
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py\":66-84",
+ "content": " 4 # every message follows <|im_start|>{role/name}\\n{content}<|im_end|>\\n\n )\n num_tokens += len(encoding.encode(message[\"content\"], allowed_special=\"all\"))\n num_tokens += 2 # every reply is primed with <|im_start|>assistant\n return num_tokens\n else:\n prompt_str = self.build(prompt_format)\n assert isinstance(prompt_str, str)\n return len(encoding.encode(prompt_str, allowed_special=\"all\"))\n def build(\n self, prompt_format: PromptFormat, *, allow_extra_system_messages: bool = False\n ) -> Union[str, list[HarmonyMessage]]:\n \"\"\"\n Validates the messages added so far (reasonable alternation of assistant vs. user, etc.)\n and returns either a regular string (maybe with <|endofprompt|> tokens) or a list of\n HarmonyMessages suitable for use with the /chat/completions endpoint.\n The `allow_extra_system_messages` parameter allows the caller to specify that the prompt"
+ },
+ {
+ "comment": "The code creates a deep copy of the messages to prevent any external modification. It then checks if the next message is from the expected role and allows extra system messages if specified. Finally, it prepares for prompt formatting if necessary.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py\":85-107",
+ "content": " should be allowed to contain system messages after the very first one.\n \"\"\"\n # Create a deep copy of the messages so we can modify it and so that the caller can't\n # modify the internal state of this object.\n messages = [message.copy() for message in self._messages]\n expected_next_role = Role.SYSTEM\n for message in messages:\n role = message[\"role\"]\n assert role == expected_next_role or (\n allow_extra_system_messages and role == Role.SYSTEM\n ), f\"Expected message from {expected_next_role} but got message from {role}\"\n if role == Role.SYSTEM:\n expected_next_role = Role.USER\n elif role == Role.USER:\n expected_next_role = Role.ASSISTANT\n elif role == Role.ASSISTANT:\n expected_next_role = Role.USER\n if prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:\n last_user_message = None\n for message in messages:\n if message[\"role\"] == Role.USER:"
+ },
+ {
+ "comment": "This code checks the last user message and appends \"<|endofprompt|>\" to its content. Depending on the prompt format, it either returns a list of messages' contents or concatenates them into a single string. If an unknown prompt format is encountered, it raises a ValueError.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py\":108-117",
+ "content": " last_user_message = message\n assert last_user_message is not None\n last_user_message[\"content\"] += \"<|endofprompt|>\"\n if prompt_format == PromptFormat.HARMONY_V4:\n return messages\n elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:\n return \"\".join(message[\"content\"] for message in messages)\n else:\n raise ValueError(f\"Unknown prompt format: {prompt_format}\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/a5438da7-b872-4f31-9077-50e0e715546e.json b/docs/doc/a5438da7-b872-4f31-9077-50e0e715546e.json
new file mode 100644
index 0000000..64e5e1a
--- /dev/null
+++ b/docs/doc/a5438da7-b872-4f31-9077-50e0e715546e.json
@@ -0,0 +1,45 @@
+{
+ "summary": "Unit tests that check that the simulation prompts built by ExplanationNeuronSimulator and ExplanationTokenByTokenSimulator match the expected text, in both the instruction-following format (with text-davinci-003) and the Harmony message format (with gpt-4).",
+ "details": [
+ {
+ "comment": "Code snippet defines a test function to check the generation of explanation simulation prompt with a specific format.\nThe expected prompt format includes neuron behavior summaries, activation values for each token, and an \"unknown\" indication when necessary.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":0-35",
+ "content": "from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet\nfrom neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role\nfrom neuron_explainer.explanations.simulator import (\n ExplanationNeuronSimulator,\n ExplanationTokenByTokenSimulator,\n)\ndef test_make_explanation_simulation_prompt_if_format() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network.\nEach neuron looks for some particular thing in a short document.\nLook at summary of what the neuron does, and try to predict how it will fire on each token.\nThe activation format is tokenactivation, activations go from 0 to 10, \"unknown\" indicates an unknown activation. Most activations will be 0.\nNeuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\nActivations: \n\na\t10\nb\t0\nc\t0\n\n\nd\tunknown\ne\t10\nf\t0\n\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION<|endofprompt|>\nActivations: \n"
+ },
+ {
+ "comment": "ExplanationNeuronSimulator is being used to generate a simulation prompt for the text-davinci-003 model. The prompt will include information about neurons in a neural network, their roles, and how they analyze short documents. Each token will have an activation level from 0 to 10 or \"unknown\".",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":36-68",
+ "content": "0\tunknown\n1\tunknown\n2\tunknown\n\n\"\"\"\n prompt = ExplanationNeuronSimulator(\n model_name=\"text-davinci-003\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n ).make_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n )\n assert prompt == expected_prompt\ndef test_make_explanation_simulation_prompt_harmony_format() -> None:\n expected_prompt = [\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"\"\"We're studying neurons in a neural network.\nEach neuron looks for some particular thing in a short document.\nLook at summary of what the neuron does, and try to predict how it will fire on each token.\nThe activation format is tokenactivation, activations go from 0 to 10, \"unknown\" indicates an unknown activation. Most activations will be 0.\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\"\"\","
+ },
+ {
+ "comment": "This code is defining a test simulation prompt using the ExplanationNeuronSimulator class, with given input parameters such as model_name, explanation, few_shot_example_set, and prompt_format. The simulation prompts are created in HarmonyMessage format, and assertions are used to check if the created prompts match the expected format and structure.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":69-118",
+ "content": " ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"\nActivations: \n\na\t10\nb\t0\nc\t0\n\n\nd\tunknown\ne\t10\nf\t0\n\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"\nActivations: \n\n0\tunknown\n1\tunknown\n2\tunknown\n\n\"\"\",\n ),\n ]\n prompt = ExplanationNeuronSimulator(\n model_name=\"gpt-4\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.HARMONY_V4,\n ).make_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n )\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]"
+ },
+ {
+ "comment": "The code is asserting that the actual message content matches the expected message, and that the prompt matches the expected prompt. This test checks if the simulation prompt and its format are as expected.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":119-152",
+ "content": " assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt\ndef test_make_token_by_token_simulation_prompt_if_format() -> None:\n expected_prompt = \"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.\nThe activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\nNeuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\nActivations: \n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\nNow, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else\nText:\nghi\nLast token in the text:\ni\nLast token activation, considering the token in the context in which it appeared in the text:"
+ },
+ {
+ "comment": "Test function that checks if a prompt generated for explaining the behavior of a neuron in a neural network is correct. It uses an explanation and token index to generate the prompt, which is then compared with the expected prompt.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":153-183",
+ "content": "10\nNeuron 3\nExplanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else\nText:\n01\nLast token in the text:\n1\nLast token activation, considering the token in the context in which it appeared in the text:\n<|endofprompt|>\"\"\"\n prompt = ExplanationTokenByTokenSimulator(\n model_name=\"text-davinci-003\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n ).make_single_token_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n explanation=\"numbers and nothing else\",\n token_index_to_score=1,\n )\n assert prompt == expected_prompt\ndef test_make_token_by_token_simulation_prompt_harmony_format() -> None:\n expected_prompt = [\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"\"\"We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token."
+ },
+ {
+ "comment": "Code is defining and testing a neuron simulator to analyze the behavior of different neurons based on their activations when processing text tokens. Activations are represented in the format \"tokenactivation\" and range from 0 to 10, with most being 0. The simulation considers single tokens in context and predicts the activation for new neurons following similar rules as previous examples.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":185-228",
+ "content": "The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"Neuron 1\nExplanation of neuron 1 behavior: the main thing this neuron does is find vowels\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"Activations: \n\na\t10\nb\t0\nc\t0\n\n\nd\t0\ne\t10\nf\t0\n\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.SYSTEM,\n content=\"Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 2\nExplanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else\nText:\nghi\nLast token in the text:\ni\nLast token activation, considering the token in the context in which it appeared in the text:\n\"\"\","
+ },
+ {
+ "comment": "The code is generating a simulation prompt for an AI model (in this case, \"gpt-4\") to interpret the behavior of neuron 3. The prompt includes information about the neuron's function and the context it operates in. It checks that the output is a list of HarmonyMessage objects and that each message's role matches the expected roles.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":229-266",
+ "content": " ),\n HarmonyMessage(\n role=Role.ASSISTANT,\n content=\"\"\"10\n\"\"\",\n ),\n HarmonyMessage(\n role=Role.USER,\n content=\"\"\"\nNeuron 3\nExplanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else\nText:\n01\nLast token in the text:\n1\nLast token activation, considering the token in the context in which it appeared in the text:\n\"\"\",\n ),\n ]\n prompt = ExplanationTokenByTokenSimulator(\n model_name=\"gpt-4\",\n explanation=\"EXPLANATION\",\n few_shot_example_set=FewShotExampleSet.TEST,\n prompt_format=PromptFormat.HARMONY_V4,\n ).make_single_token_simulation_prompt(\n tokens=[str(x) for x in range(3)],\n explanation=\"numbers and nothing else\",\n token_index_to_score=1,\n )\n assert isinstance(prompt, list)\n assert isinstance(prompt[0], dict) # Really a HarmonyMessage\n for actual_message, expected_message in zip(prompt, expected_prompt):\n assert actual_message[\"role\"] == expected_message[\"role\"]"
+ },
+ {
+ "comment": "Asserting that the content of actual_message matches expected_message and prompt matches expected_prompt.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py\":267-268",
+ "content": " assert actual_message[\"content\"] == expected_message[\"content\"]\n assert prompt == expected_prompt"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/aafda453-c8bd-487b-9e5e-72e4df80c558.json b/docs/doc/aafda453-c8bd-487b-9e5e-72e4df80c558.json
new file mode 100644
index 0000000..affa232
--- /dev/null
+++ b/docs/doc/aafda453-c8bd-487b-9e5e-72e4df80c558.json
@@ -0,0 +1,15 @@
+{
+ "summary": "The code imports libraries, defines functions for loading JSON data and starting the Flask server. The Flask app is configured with logging, CORS, and an after_request function to allow cross-origin requests. It also includes a main function which can be executed if the script is run directly, accepting optional dev, host_name, and port arguments.",
+ "details": [
+ {
+ "comment": "Imports necessary libraries and defines functions for loading JSON data and starting the Flask server.\nFlask app is configured with logging, CORS, and after_request function to allow cross-origin requests.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/python/server.py\":0-42",
+ "content": "# %%\nimport logging\nfrom flask import Flask, request\nfrom flask_cors import CORS\nimport json\nimport urllib.request\ndef load_az_json(url):\n with urllib.request.urlopen(url) as f:\n return json.load(f)\ndef start(\n dev: bool = False,\n host_name: str = \"0.0.0.0\",\n port: int = 80,\n):\n app = Flask(\"interpretability chat\")\n app.logger.setLevel(logging.INFO)\n # app.logger.disabled = True\n CORS(app)\n @app.after_request\n def after_request(response):\n response.headers.add(\"Access-Control-Allow-Origin\", \"*\")\n response.headers.add(\n \"Access-Control-Allow-Headers\", \"Content-Type,Authorization\"\n )\n response.headers.add(\n \"Access-Control-Allow-Methods\", \"GET,PUT,POST,DELETE,OPTIONS\"\n )\n return response\n @app.route(\"/load_az\", methods=[\"GET\", \"POST\"])\n async def load_az():\n args = request.get_json()\n path = args[\"path\"]\n result = load_az_json(path)\n return result\n app.run(debug=dev, host=host_name, port=port, use_reloader=False)"
+ },
+ {
+ "comment": "This code defines a main function and executes it if the script is run directly. It accepts optional boolean dev, string host_name, and int port arguments.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/python/server.py\":45-50",
+ "content": "def main(dev: bool = True, host_name: str = \"0.0.0.0\", port: int = 8000):\n start(dev=dev, host_name=host_name, port=port)\nif __name__ == \"__main__\":\n main()"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/ab0e5fe5-5f7c-4c84-a606-cd46721858a2.json b/docs/doc/ab0e5fe5-5f7c-4c84-a606-cd46721858a2.json
new file mode 100644
index 0000000..92462fd
--- /dev/null
+++ b/docs/doc/ab0e5fe5-5f7c-4c84-a606-cd46721858a2.json
@@ -0,0 +1,15 @@
+{
+ "summary": "The code imports libraries, sets up the OpenAI API key, initializes the explainer model, and loops through each puzzle to generate explanations. It generates one explanation for a given input, checks if there's only 1 explanation, assigns it to 'model_generated_explanation', and prints both the explanation and expected answer for the puzzle.",
+ "details": [
+ {
+ "comment": "Code imports necessary libraries, sets up OpenAI API key, initializes the explainer model, and begins looping through each puzzle in PUZZLES_BY_NAME to generate explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/explain_puzzles.py\":0-37",
+ "content": "#!/usr/bin/env python\n# coding: utf-8\n# In[ ]:\nget_ipython().run_line_magic('load_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\n# In[ ]:\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\nfrom neuron_explainer.activations.activation_records import calculate_max_activation\nfrom neuron_explainer.explanations.explainer import TokenActivationPairExplainer\nfrom neuron_explainer.explanations.prompt_builder import PromptFormat\nfrom neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME\nEXPLAINER_MODEL_NAME = \"gpt-4\"\nexplainer = TokenActivationPairExplainer(\n model_name=EXPLAINER_MODEL_NAME,\n prompt_format=PromptFormat.HARMONY_V4,\n max_concurrent=1,\n)\nfor puzzle_name, puzzle in PUZZLES_BY_NAME.items():\n print(f\"{puzzle_name=}\")\n puzzle_answer = puzzle.explanation\n # Generate an explanation for the puzzle.\n explanations = await explainer.generate_explanations(\n all_activation_records=puzzle.activation_records,\n max_activation=calculate_max_activation(puzzle.activation_records),"
+ },
+ {
+ "comment": "This code generates one explanation for a given input and asserts that the number of explanations is equal to 1. It then assigns the generated explanation to 'model_generated_explanation' and prints it along with the expected answer for the puzzle.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/explain_puzzles.py\":38-43",
+ "content": " num_samples=1,\n )\n assert len(explanations) == 1\n model_generated_explanation = explanations[0]\n print(f\"{model_generated_explanation=}\")\n print(f\"{puzzle_answer=}\\n\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/b535ebdc-67d1-48b5-892a-9ce7d22ca0ad.json b/docs/doc/b535ebdc-67d1-48b5-892a-9ce7d22ca0ad.json
new file mode 100644
index 0000000..8187720
--- /dev/null
+++ b/docs/doc/b535ebdc-67d1-48b5-892a-9ce7d22ca0ad.json
@@ -0,0 +1,10 @@
+{
+ "summary": "Configuring Tailwind CSS with content from \"./src/**/*.{html,js,jsx}\" and empty extend and plugins.",
+ "details": [
+ {
+ "comment": "Configuring Tailwind CSS with content from \"./src/**/*.{html,js,jsx}\" and empty extend and plugins.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/tailwind.config.js\":0-7",
+ "content": "/** @type {import('tailwindcss').Config} */\nmodule.exports = {\n content: [\"./src/**/*.{html,js,jsx}\"],\n theme: {\n extend: {},\n },\n plugins: [],\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/b86a19d5-d9f3-4dfc-939f-b640b0fdb466.json b/docs/doc/b86a19d5-d9f3-4dfc-939f-b640b0fdb466.json
new file mode 100644
index 0000000..780362d
--- /dev/null
+++ b/docs/doc/b86a19d5-d9f3-4dfc-939f-b640b0fdb466.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This code exports a functional component that takes a 2D array (an array of arrays) of \"TokenAndActivation\" objects and renders a heatmap for each token. The tokens are displayed within a block-style div, with each token's heatmap displayed inside its respective div.",
+ "details": [
+ {
+ "comment": "This code exports a functional component that takes a 2D array (an array of arrays) of \"TokenAndActivation\" objects and renders a heatmap for each token. The tokens are displayed within a block-style div, with each token's heatmap displayed inside its respective div.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/heatmapGrid.tsx\":0-13",
+ "content": "import { TokenAndActivation } from \"./types\"\nimport TokenHeatmap from \"./tokenHeatmap\";\nexport default ({ allTokens }: { allTokens: TokenAndActivation[][]}) => {\n return (\n
\n {allTokens.map((tokens, i) => (\n
\n \n
\n ))}\n
\n );\n};"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/b96d6cd3-6a41-47fe-a922-9a118b887674.json b/docs/doc/b96d6cd3-6a41-47fe-a922-9a118b887674.json
new file mode 100644
index 0000000..d4da58b
--- /dev/null
+++ b/docs/doc/b96d6cd3-6a41-47fe-a922-9a118b887674.json
@@ -0,0 +1,35 @@
+{
+ "summary": "The code generates a scoring function and explanation simulator for assessing neuron evaluations based on correlation and R-squared, using an asynchronous approach. The `make_simulator_and_score` function is created to generate the simulator and score the activation records, returning the scored simulations.",
+ "details": [
+ {
+ "comment": "Code imports necessary modules and defines three functions:\n1. flatten_list(): Converts a list of lists into a single flat list.\n2. correlation_score(): Computes the correlation coefficient between two sequences of real and predicted activations.\n3. score_from_simulation(): Calculates the correlation score for a given sequence simulation.\n\nThis code is used for scoring simulations based on activation correlations in neuron explanations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py\":0-33",
+ "content": "from __future__ import annotations\nimport asyncio\nimport logging\nfrom typing import Any, Callable, Coroutine, Sequence\nimport numpy as np\nfrom neuron_explainer.activations.activations import ActivationRecord\nfrom neuron_explainer.explanations.calibrated_simulator import (\n CalibratedNeuronSimulator,\n LinearCalibratedNeuronSimulator,\n)\nfrom neuron_explainer.explanations.explanations import (\n ScoredSequenceSimulation,\n ScoredSimulation,\n SequenceSimulation,\n)\nfrom neuron_explainer.explanations.simulator import ExplanationNeuronSimulator, NeuronSimulator\ndef flatten_list(list_of_lists: Sequence[Sequence[Any]]) -> list[Any]:\n return [item for sublist in list_of_lists for item in sublist]\ndef correlation_score(\n real_activations: Sequence[float] | np.ndarray,\n predicted_activations: Sequence[float] | np.ndarray,\n) -> float:\n return np.corrcoef(real_activations, predicted_activations)[0, 1]\ndef score_from_simulation(\n real_activations: ActivationRecord,\n simulation: SequenceSimulation,"
+ },
+ {
+ "comment": "This code finishes score_from_simulation, which applies a supplied scoring function to a record's real activations and the simulation's expected activations. It then defines two scoring functions that take two sequences of floats (or numpy arrays) and return a float: rsquared_score_from_sequences and absolute_dev_explained_score_from_sequences, which calculate the R-squared and absolute deviation explained scores respectively. Finally, it begins make_explanation_simulator, an async function that takes an explanation, calibration activation records, and a model name.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py\":34-64",
+ "content": " score_function: Callable[[Sequence[float] | np.ndarray, Sequence[float] | np.ndarray], float],\n) -> float:\n return score_function(real_activations.activations, simulation.expected_activations)\ndef rsquared_score_from_sequences(\n real_activations: Sequence[float] | np.ndarray,\n predicted_activations: Sequence[float] | np.ndarray,\n) -> float:\n return float(\n 1\n - np.mean(np.square(np.array(real_activations) - np.array(predicted_activations)))\n / np.mean(np.square(np.array(real_activations)))\n )\ndef absolute_dev_explained_score_from_sequences(\n real_activations: Sequence[float] | np.ndarray,\n predicted_activations: Sequence[float] | np.ndarray,\n) -> float:\n return float(\n 1\n - np.mean(np.abs(np.array(real_activations) - np.array(predicted_activations)))\n / np.mean(np.abs(np.array(real_activations)))\n )\nasync def make_explanation_simulator(\n explanation: str,\n calibration_activation_records: Sequence[ActivationRecord],\n model_name: str,"
+ },
+ {
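+ "comment": "This code completes make_explanation_simulator, which builds an ExplanationNeuronSimulator from the model name and explanation, wraps it in a calibrated simulator class (LinearCalibratedNeuronSimulator by default), and calibrates it on the given activation records. It then begins _simulate_and_score_sequence, which simulates the tokens of an activation record and computes R-squared and absolute deviation explained scores for that sequence.",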
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py\":65-85",
+ "content": " calibrated_simulator_class: type[CalibratedNeuronSimulator] = LinearCalibratedNeuronSimulator,\n) -> CalibratedNeuronSimulator:\n \"\"\"\n Make a simulator that uses an explanation to predict activations and calibrates it on the given\n activation records.\n \"\"\"\n simulator = ExplanationNeuronSimulator(model_name, explanation)\n calibrated_simulator = calibrated_simulator_class(simulator)\n await calibrated_simulator.calibrate(calibration_activation_records)\n return calibrated_simulator\nasync def _simulate_and_score_sequence(\n simulator: NeuronSimulator, activations: ActivationRecord\n) -> ScoredSequenceSimulation:\n \"\"\"Score an explanation of a neuron by how well it predicts activations on a sentence.\"\"\"\n simulation = await simulator.simulate(activations.tokens)\n logging.debug(simulation)\n rsquared_score = score_from_simulation(activations, simulation, rsquared_score_from_sequences)\n absolute_dev_explained_score = score_from_simulation(\n activations, simulation, absolute_dev_explained_score_from_sequences"
+ },
+ {
+ "comment": "Code calculates and aggregates scored sequence simulations for each activation in a list of activations.\nIt combines true activations and expected values from all sequences to calculate the correlation score.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py\":86-108",
+ "content": " )\n scored_sequence_simulation = ScoredSequenceSimulation(\n simulation=simulation,\n true_activations=activations.activations,\n ev_correlation_score=score_from_simulation(activations, simulation, correlation_score),\n rsquared_score=rsquared_score,\n absolute_dev_explained_score=absolute_dev_explained_score,\n )\n return scored_sequence_simulation\ndef aggregate_scored_sequence_simulations(\n scored_sequence_simulations: list[ScoredSequenceSimulation],\n) -> ScoredSimulation:\n \"\"\"\n Aggregate a list of scored sequence simulations. The logic for doing this is non-trivial for EV\n scores, since we want to calculate the correlation over all activations from all sequences at\n once rather than simply averaging per-sequence correlations.\n \"\"\"\n all_true_activations: list[float] = []\n all_expected_values: list[float] = []\n for scored_sequence_simulation in scored_sequence_simulations:\n all_true_activations.extend(scored_sequence_simulation.true_activations or [])"
+ },
+ {
+ "comment": "Code performs the following:\n1. Extends the list of all_expected_values with simulation's expected activation values.\n2. Calculates Ev correlation score, R squared score, and absolute dev explained score for explanation prediction accuracy.\n3. Returns a ScoredSimulation object with scores and simulations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py\":109-136",
+ "content": " all_expected_values.extend(scored_sequence_simulation.simulation.expected_activations)\n ev_correlation_score = (\n correlation_score(all_true_activations, all_expected_values)\n if len(all_true_activations) > 0\n else None\n )\n rsquared_score = rsquared_score_from_sequences(all_true_activations, all_expected_values)\n absolute_dev_explained_score = absolute_dev_explained_score_from_sequences(\n all_true_activations, all_expected_values\n )\n return ScoredSimulation(\n scored_sequence_simulations=scored_sequence_simulations,\n ev_correlation_score=ev_correlation_score,\n rsquared_score=rsquared_score,\n absolute_dev_explained_score=absolute_dev_explained_score,\n )\nasync def simulate_and_score(\n simulator: NeuronSimulator,\n activation_records: Sequence[ActivationRecord],\n) -> ScoredSimulation:\n \"\"\"\n Score an explanation of a neuron by how well it predicts activations on the given text\n sequences.\n \"\"\"\n scored_sequence_simulations = await asyncio.gather("
+ },
+ {
+ "comment": "This code defines a function called `make_simulator_and_score` that takes in a coroutine for creating a simulator and a sequence of activation records. It then creates the simulator and uses it to score the activation records, returning the scored simulations. The code is asynchronous and uses awaitable operations.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py\":137-154",
+ "content": " *[\n _simulate_and_score_sequence(\n simulator,\n activation_record,\n )\n for activation_record in activation_records\n ]\n )\n return aggregate_scored_sequence_simulations(scored_sequence_simulations)\nasync def make_simulator_and_score(\n make_simulator: Coroutine[None, None, NeuronSimulator],\n activation_records: Sequence[ActivationRecord],\n) -> ScoredSimulation:\n \"\"\"Chain together creating the simulator and using it to score activation records.\"\"\"\n simulator = await make_simulator\n return await simulate_and_score(simulator, activation_records)"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/ba86dff1-0546-4876-b136-4532bcb3be09.json b/docs/doc/ba86dff1-0546-4876-b136-4532bcb3be09.json
new file mode 100644
index 0000000..c2acff3
--- /dev/null
+++ b/docs/doc/ba86dff1-0546-4876-b136-4532bcb3be09.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This code is for a neuron viewer, which can be accessed through the public website. It provides an implementation of the website and supports local development with instructions to install and run both backend and frontend.",
+ "details": [
+ {
+ "comment": "This code is for a neuron viewer, which can be accessed through the public website. It provides an implementation of the website and supports local development with instructions to install and run both backend and frontend.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/README.md\":0-19",
+ "content": "# Neuron viewer\nThe easiest way to view neurons and explanations is using the\n[public website](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html).\nThis directory contains the implementation of that website as well as lightweight servers that make\nit possible to run an alternative version of the website locally.\n## Local development\nInstall:\n```npm install```\nRun the backend:\n```npm run startpy```\nRun the frontend:\n```npm start```"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/baa427f6-818e-427a-858e-1f065c275f5e.json b/docs/doc/baa427f6-818e-427a-858e-1f065c275f5e.json
new file mode 100644
index 0000000..4fc412b
--- /dev/null
+++ b/docs/doc/baa427f6-818e-427a-858e-1f065c275f5e.json
@@ -0,0 +1,15 @@
+{
+ "summary": "The code sets up a functional component, Feed, which displays either a welcome message or information about the selected neuron depending on whether one is chosen. It checks for the neuron selection in URL parameters and renders additional panes such as explanation, dataset list, top tokens, and similar neurons if an active neuron is present; otherwise, it shows a welcome message.",
+ "details": [
+ {
+ "comment": "The code imports necessary components and sets up the Feed component, which displays a welcome message or information about the selected neuron. It checks whether a neuron is selected based on the parameters passed in the URL: if either the layer or the neuron parameter is missing, activeNeuron is set to null. Otherwise, the layer and neuron indices are parsed from the parameters as integers.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/feed.jsx\":0-37",
+ "content": "import * as Panes from \"./panes\"\nimport React, { useEffect } from \"react\"\nimport Welcome from \"./welcome\"\nimport { useState } from \"react\"\nimport { useParams, Link } from \"react-router-dom\"\nexport default function Feed() {\n const params = useParams()\n // If params is missing either index, there's no neuron selected.\n let activeNeuron;\n if (params.layer === undefined || params.neuron === undefined) {\n activeNeuron = null\n } else {\n // Grab the layer and neuron indices from the params, casting them to ints.\n activeNeuron = {\n \"layer\": parseInt(params.layer),\n \"neuron\": parseInt(params.neuron),\n }\n }\n const Pane = ({ children }) => (\n
\n )\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/cd1af330-5001-4221-bda2-bf07841569fb.json b/docs/doc/cd1af330-5001-4221-bda2-bf07841569fb.json
new file mode 100644
index 0000000..f7a9610
--- /dev/null
+++ b/docs/doc/cd1af330-5001-4221-bda2-bf07841569fb.json
@@ -0,0 +1,25 @@
+{
+ "summary": "The functional component fetches and displays information about neurons, using state variables and hooks to manage data. It organizes the displayed connections in a visually pleasing format and shows related neurons based on user-selected neuron, fetching similar ones in upstream and downstream sections with a loading animation while data is fetched, showing up to 3 related neurons for each section using NeuronInfo component.",
+ "details": [
+ {
+ "comment": "Importing necessary modules and defining a functional component for displaying neuron information.\nState variable \"info\" is set to null initially using useState hook.\nuseEffect hook is used to fetch the explanation data when the component mounts.\nIf info is not available, a loading indicator is displayed.\nWhen info is available, the neuron information is rendered within a div element.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/similarNeurons.jsx\":0-39",
+ "content": "import React, { useEffect, useState } from \"react\"\nimport _ from \"lodash\"\nimport { Link } from \"react-router-dom\"\nimport { get_explanations, get_top_neuron_connections } from \"../interpAPI\"\nfunction NeuronInfo({ neuron, strength }) {\n const [info, setInfo] = useState(null)\n useEffect(() => {\n async function fetchInfo() {\n const result = (await get_explanations({\n layer: neuron.layer,\n neuron: neuron.neuron,\n }))\n setInfo(result)\n }\n if (!info) {\n fetchInfo()\n }\n }, [])\n if (!info) {\n return (\n
\n "
+ },
+ {
+ "comment": "Code snippet displays information about similar neurons, their connection strength and scored explanations for a given activeNeuron. It fetches data using the 'get_top_neuron_connections' function and renders it in a visually formatted way. The state variables 'similarNeurons', 'isLoading' are managed with useState hook, and the useEffect hook is used to fetch similar neurons data when the activeNeuron prop changes.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/similarNeurons.jsx\":40-71",
+ "content": " Neuron {neuron.layer}:{neuron.neuron}\n \n
\n )\n}\nexport default function SimilarNeurons({ activeNeuron: neuron }) {\n const [similarNeurons, setSimilarNeurons] = useState([])\n const [isLoading, setIsLoading] = useState(true)\n useEffect(() => {\n async function fetchSimilarNeurons() {\n const result = await get_top_neuron_connections(neuron)\n setSimilarNeurons(result)\n setIsLoading(false)"
+ },
+ {
+ "comment": "Functionality: Displays related neurons based on user-selected neuron\n\nCode explanation:\n- If a user selects a neuron, fetch the similar neurons and display them in two sections - upstream and downstream.\n- Show a loading animation while data is fetched.\n- Display up to 3 related neurons for each section (upstream and downstream).\n- Use NeuronInfo component to represent each displayed neuron.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/similarNeurons.jsx\":72-104",
+ "content": " }\n fetchSimilarNeurons()\n }, [neuron])\n if (isLoading) {\n return (\n
"
+ },
+ {
+ "comment": "Rendering a list of downstream neurons for the selected neuron, up to n_show.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/panes/similarNeurons.jsx\":105-117",
+ "content": "
\n )\n}"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/da8bd2c6-b957-4f60-9d97-d4bcdfdd354d.json b/docs/doc/da8bd2c6-b957-4f60-9d97-d4bcdfdd354d.json
new file mode 100644
index 0000000..b3cc0e0
--- /dev/null
+++ b/docs/doc/da8bd2c6-b957-4f60-9d97-d4bcdfdd354d.json
@@ -0,0 +1,20 @@
+{
+ "summary": "The fast dataclass utility offers efficient serialization and deserialization with limited data validation, using orjson for numpy objects. It includes unit tests, a function to register new dataclasses, and a recursive object hook for handling lists and dictionaries.",
+ "details": [
+ {
+ "comment": "The code defines a fast dataclass utility that provides efficient serialization and deserialization with limited data validation. Fields must not be tuples, because tuples are serialized and then deserialized as lists; this is a requirement stated in the source header, and the code shown does not enforce it. The unit tests for the library show how to use it. It serializes with orjson (with numpy serialization enabled) and provides a function to register new dataclasses with the utility.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py\":0-36",
+ "content": "# Utilities for dataclasses that are very fast to serialize and deserialize, with limited data\n# validation. Fields must not be tuples, since they get serialized and then deserialized as lists.\n#\n# The unit tests for this library show how to use it.\nimport json\nfrom dataclasses import dataclass, field, fields, is_dataclass\nfrom functools import partial\nfrom typing import Any, Union\nimport orjson\ndataclasses_by_name = {}\ndataclasses_by_fieldnames = {}\n@dataclass\nclass FastDataclass:\n dataclass_name: str = field(init=False)\n def __post_init__(self) -> None:\n self.dataclass_name = self.__class__.__name__\ndef register_dataclass(cls): # type: ignore\n assert is_dataclass(cls), \"Only dataclasses can be registered.\"\n dataclasses_by_name[cls.__name__] = cls\n name_set = frozenset(f.name for f in fields(cls) if f.name != \"dataclass_name\")\n dataclasses_by_fieldnames[name_set] = cls\n return cls\ndef dumps(obj: Any) -> bytes:\n return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY)\ndef _object_hook(d: Any, backwards_compatible: bool = True) -> Any:"
+ },
+ {
+ "comment": "Checks if the input is a list, if so it recursively applies the object hook to each element. If not a list or dict, returns as is. If a dict, tries to find the corresponding dataclass based on either \"dataclass_name\" key or fieldnames, falling back if backwards_compatible is set to True.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py\":37-60",
+ "content": " # If d is a list, recurse.\n if isinstance(d, list):\n return [_object_hook(x, backwards_compatible=backwards_compatible) for x in d]\n # If d is not a dict, return it as is.\n if not isinstance(d, dict):\n return d\n cls = None\n if \"dataclass_name\" in d:\n if d[\"dataclass_name\"] in dataclasses_by_name:\n cls = dataclasses_by_name[d[\"dataclass_name\"]]\n else:\n assert backwards_compatible, (\n f\"Dataclass {d['dataclass_name']} not found, set backwards_compatible=True if you \"\n f\"are okay with that.\"\n )\n # Load objects created without dataclass_name set.\n else:\n # Try our best to find a dataclass if backwards_compatible is True.\n if backwards_compatible:\n d_fields = frozenset(d.keys())\n if d_fields in dataclasses_by_fieldnames:\n cls = dataclasses_by_fieldnames[d_fields]\n elif len(d_fields) > 0:\n # Check if the fields are a subset of a dataclass (if the dataclass had extra fields"
+ },
+ {
+ "comment": "The code aims to load and parse JSON data that uses dataclasses. It checks for the compatibility of the loaded data with existing dataclass definitions, then creates a new dataclass instance or a dictionary based on the input.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py\":61-84",
+ "content": " # added since the data was created). Note that this will fail if fields were removed\n # from the dataclass.\n for key, possible_cls in dataclasses_by_fieldnames.items():\n if d_fields.issubset(key):\n cls = possible_cls\n break\n else:\n print(f\"Could not find dataclass for {d_fields} {cls}\")\n new_d = {\n k: _object_hook(v, backwards_compatible=backwards_compatible)\n for k, v in d.items()\n if k != \"dataclass_name\"\n }\n if cls is not None:\n return cls(**new_d)\n else:\n return new_d\ndef loads(s: Union[str, bytes], backwards_compatible: bool = True) -> Any:\n return json.loads(\n s,\n object_hook=partial(_object_hook, backwards_compatible=backwards_compatible),\n )"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/db3765af-c6d5-416a-983a-923f5529cd5e.json b/docs/doc/db3765af-c6d5-416a-983a-923f5529cd5e.json
new file mode 100644
index 0000000..b8cd9d7
--- /dev/null
+++ b/docs/doc/db3765af-c6d5-416a-983a-923f5529cd5e.json
@@ -0,0 +1,20 @@
+{
+ "summary": "The code creates a Puzzle class for ground truth and false explanations, tokenizes sentences and JSON representations, preprocesses input data, and assigns puzzles to the name in PUZZLES_BY_NAME dictionary using convert_puzzle_dict_to_puzzle function.",
+ "details": [
+ {
+ "comment": "The code defines a Puzzle class representing ground truth explanations and false explanations. It also includes functions to convert a puzzle to a list of tokenized sentences and to create a Puzzle object from a JSON dictionary representation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/puzzles.py\":0-27",
+ "content": "import json\nimport os\nfrom dataclasses import dataclass\nfrom neuron_explainer.activations.activations import ActivationRecord\n@dataclass(frozen=True)\nclass Puzzle:\n \"\"\"A puzzle is a ground truth explanation, a collection of sentences (stored as ActivationRecords) with activations\n according to that explanation, and a collection of false explanations\"\"\"\n name: str\n explanation: str\n activation_records: list[ActivationRecord]\n false_explanations: list[str]\ndef convert_puzzle_to_tokenized_sentences(puzzle: Puzzle) -> list[list[str]]:\n \"\"\"Converts a puzzle to a list of tokenized sentences.\"\"\"\n return [record.tokens for record in puzzle.activation_records]\ndef convert_puzzle_dict_to_puzzle(puzzle_dict: dict) -> Puzzle:\n \"\"\"Converts a json dictionary representation of a puzzle to the Puzzle class.\"\"\"\n puzzle_activation_records = []\n for sentence in puzzle_dict[\"sentences\"]:\n # Token-activation pairs are listed as either a string or a list of a string and a float. If it is a list, the float is the activation."
+ },
+ {
+ "comment": "The code preprocesses input data for puzzle explanations. It checks if all tokens are strings and all activations are floats, then creates a Puzzle object with name, explanation, activation records, and false explanations. The code reads puzzle data from \"puzzles.json\" file in the same directory.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/puzzles.py\":28-48",
+ "content": " # If it is only a string, the activation is assumed to be 0. This is useful for readability and reducing redundancy in the data.\n tokens = [t[0] if type(t) is list else t for t in sentence]\n assert all([type(t) is str for t in tokens]), \"All tokens must be strings\"\n activations = [float(t[1]) if type(t) is list else 0.0 for t in sentence]\n assert all([type(t) is float for t in activations]), \"All activations must be floats\"\n puzzle_activation_records.append(ActivationRecord(tokens=tokens, activations=activations))\n return Puzzle(\n name=puzzle_dict[\"name\"],\n explanation=puzzle_dict[\"explanation\"],\n activation_records=puzzle_activation_records,\n false_explanations=puzzle_dict[\"false_explanations\"],\n )\nPUZZLES_BY_NAME: dict[str, Puzzle] = dict()\nscript_dir = os.path.dirname(os.path.abspath(__file__))\nwith open(os.path.join(script_dir, \"puzzles.json\"), \"r\") as f:\n puzzle_dicts = json.loads(f.read())\n for name in puzzle_dicts.keys():"
+ },
+ {
+ "comment": "Assigning puzzle to the name in PUZZLES_BY_NAME dictionary using convert_puzzle_dict_to_puzzle function.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/explanations/puzzles.py\":49-49",
+ "content": " PUZZLES_BY_NAME[name] = convert_puzzle_dict_to_puzzle(puzzle_dicts[name])"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/e33993e0-448f-43ef-aa47-f69d38a72306.json b/docs/doc/e33993e0-448f-43ef-aa47-f69d38a72306.json
new file mode 100644
index 0000000..362608a
--- /dev/null
+++ b/docs/doc/e33993e0-448f-43ef-aa47-f69d38a72306.json
@@ -0,0 +1,30 @@
+{
+ "summary": "The code provides utilities for working with activation records: computing the maximum activation, normalizing and formatting neuron activations for use in prompts, and optionally marking activations as unknown. It also calculates the ratio of non-zero activations to total activations across all records.",
+ "details": [
+ {
+ "comment": "This code defines functions to handle activation records, including calculating the maximum activation value and normalizing neuron activations. It also includes a relu function that clamps activation values below 0 to 0, treating them as the neuron's resting state.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py\":0-28",
+ "content": "\"\"\"Utilities for formatting activation records into prompts.\"\"\"\nimport math\nfrom typing import Optional, Sequence\nfrom neuron_explainer.activations.activations import ActivationRecord\nUNKNOWN_ACTIVATION_STRING = \"unknown\"\ndef relu(x: float) -> float:\n return max(0.0, x)\ndef calculate_max_activation(activation_records: Sequence[ActivationRecord]) -> float:\n \"\"\"Return the maximum activation value of the neuron across all the activation records.\"\"\"\n flattened = [\n # Relu is used to assume any values less than 0 are indicating the neuron is in the resting\n # state. This is a simplifying assumption that works with relu/gelu.\n max(relu(x) for x in activation_record.activations)\n for activation_record in activation_records\n ]\n return max(flattened)\ndef normalize_activations(activation_record: list[float], max_activation: float) -> list[int]:\n \"\"\"Convert raw neuron activations to integers on the range [0, 10].\"\"\"\n if max_activation <= 0:\n return [0 for x in activation_record]"
+ },
+ {
+ "comment": "The code snippet is responsible for formatting neuron activations into a string. It first applies an optional normalization to the activations, then optionally removes zeros and hides activations based on user inputs. The resulting string contains tokens and their corresponding normalized or hidden activations, suitable for use in prompts.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py\":29-52",
+ "content": " # Relu is used to assume any values less than 0 are indicating the neuron is in the resting\n # state. This is a simplifying assumption that works with relu/gelu.\n return [min(10, math.floor(10 * relu(x) / max_activation)) for x in activation_record]\ndef _format_activation_record(\n activation_record: ActivationRecord,\n max_activation: float,\n omit_zeros: bool,\n hide_activations: bool = False,\n start_index: int = 0,\n) -> str:\n \"\"\"Format neuron activations into a string, suitable for use in prompts.\"\"\"\n tokens = activation_record.tokens\n normalized_activations = normalize_activations(activation_record.activations, max_activation)\n if omit_zeros:\n assert (not hide_activations) and start_index == 0, \"Can't hide activations and omit zeros\"\n tokens = [\n token for token, activation in zip(tokens, normalized_activations) if activation > 0\n ]\n normalized_activations = [x for x in normalized_activations if x > 0]\n entries = []\n assert len(tokens) == len(normalized_activations)"
+ },
+ {
+ "comment": "The code formats a list of activation records into a string. It iterates through each token and normalized activation pair; if hide_activations is set or the index is less than start_index, it replaces the activation with UNKNOWN_ACTIVATION_STRING. The final formatted string joins entries with newline characters and includes start and end markers.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py\":53-80",
+ "content": " for index, token, activation in zip(range(len(tokens)), tokens, normalized_activations):\n activation_string = str(int(activation))\n if hide_activations or index < start_index:\n activation_string = UNKNOWN_ACTIVATION_STRING\n entries.append(f\"{token}\\t{activation_string}\")\n return \"\\n\".join(entries)\ndef format_activation_records(\n activation_records: Sequence[ActivationRecord],\n max_activation: float,\n *,\n omit_zeros: bool = False,\n start_indices: Optional[list[int]] = None,\n hide_activations: bool = False,\n) -> str:\n \"\"\"Format a list of activation records into a string.\"\"\"\n return (\n \"\\n\\n\"\n + \"\\n\\n\\n\".join(\n [\n _format_activation_record(\n activation_record,\n max_activation,\n omit_zeros=omit_zeros,\n hide_activations=hide_activations,\n start_index=0 if start_indices is None else start_indices[i],\n )"
+ },
+ {
+ "comment": "This code contains several functions to format and manipulate activation records and tokens for simulation purposes. The `_format_tokens_for_simulation` function formats a sequence of strings into a string with each token marked as having an \"unknown\" activation, suitable for use in prompts. The `format_sequences_for_simulation` function extends this to format a list of lists of tokens into a string with each token marked as having an \"unknown\" activation, also suitable for use in prompts. Finally, the `non_zero_activation_proportion` function calculates the proportion of non-zero activation values among a sequence of ActivationRecord objects.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py\":81-118",
+ "content": " for i, activation_record in enumerate(activation_records)\n ]\n )\n + \"\\n\\n\"\n )\ndef _format_tokens_for_simulation(tokens: Sequence[str]) -> str:\n \"\"\"\n Format tokens into a string with each token marked as having an \"unknown\" activation, suitable\n for use in prompts.\n \"\"\"\n entries = []\n for token in tokens:\n entries.append(f\"{token}\\t{UNKNOWN_ACTIVATION_STRING}\")\n return \"\\n\".join(entries)\ndef format_sequences_for_simulation(\n all_tokens: Sequence[Sequence[str]],\n) -> str:\n \"\"\"\n Format a list of lists of tokens into a string with each token marked as having an \"unknown\"\n activation, suitable for use in prompts.\n \"\"\"\n return (\n \"\\n\\n\"\n + \"\\n\\n\\n\".join(\n [_format_tokens_for_simulation(tokens) for tokens in all_tokens]\n )\n + \"\\n\\n\"\n )\ndef non_zero_activation_proportion(\n activation_records: Sequence[ActivationRecord], max_activation: float\n) -> float:\n \"\"\"Return the proportion of activation values that aren't zero.\"\"\""
+ },
+ {
+ "comment": "Calculating the ratio of non-zero activations to total activations across all activation records.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py\":119-129",
+ "content": " total_activations_count = sum(\n [len(activation_record.activations) for activation_record in activation_records]\n )\n normalized_activations = [\n normalize_activations(activation_record.activations, max_activation)\n for activation_record in activation_records\n ]\n non_zero_activations_count = sum(\n [len([x for x in activations if x != 0]) for activations in normalized_activations]\n )\n return non_zero_activations_count / total_activations_count"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/e46fd235-9fc9-4937-b8cd-1871880acda0.json b/docs/doc/e46fd235-9fc9-4937-b8cd-1871880acda0.json
new file mode 100644
index 0000000..16158ac
--- /dev/null
+++ b/docs/doc/e46fd235-9fc9-4937-b8cd-1871880acda0.json
@@ -0,0 +1,10 @@
+{
+ "summary": "This code imports necessary modules and sets up the root element for a React application, which then renders the App component within a strict mode. It also configures performance measurement if desired.",
+ "details": [
+ {
+ "comment": "This code imports necessary modules and sets up the root element for a React application, which then renders the App component within a strict mode. It also configures performance measurement if desired.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/index.jsx\":0-16",
+ "content": "import React from 'react';\nimport ReactDOM from 'react-dom/client';\nimport './index.css';\nimport App from './App';\nimport reportWebVitals from './reportWebVitals';\nconst root = ReactDOM.createRoot(document.getElementById('root'));\nroot.render(\n \n \n \n);\n// If you want to start measuring performance in your app, pass a function\n// to log results (for example: reportWebVitals(console.log))\n// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals\nreportWebVitals();"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/e4ede89b-d0c0-4d24-8ca5-c7ec798c1ac5.json b/docs/doc/e4ede89b-d0c0-4d24-8ca5-c7ec798c1ac5.json
new file mode 100644
index 0000000..167d9c6
--- /dev/null
+++ b/docs/doc/e4ede89b-d0c0-4d24-8ca5-c7ec798c1ac5.json
@@ -0,0 +1,10 @@
+{
+ "summary": "Imports CSS and Feed component, sets up React Router for routing between components.",
+ "details": [
+ {
+ "comment": "Imports CSS and Feed component, sets up React Router for routing between components.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-viewer/src/App.jsx\":0-16",
+ "content": "import \"./App.css\"\nimport Feed from \"./feed\"\nimport React from \"react\"\nimport { Routes, Route, HashRouter } from \"react-router-dom\"\nfunction App() {\n return (\n \n \n } />\n } />\n \n \n )\n}\nexport default App"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/e862421d-6f38-4494-ae03-9d558eb35102.json b/docs/doc/e862421d-6f38-4494-ae03-9d558eb35102.json
new file mode 100644
index 0000000..1ddbb65
--- /dev/null
+++ b/docs/doc/e862421d-6f38-4494-ae03-9d558eb35102.json
@@ -0,0 +1,20 @@
+{
+ "summary": "The code defines five test dataclasses that inherit from FastDataclass and registers them for serialization and deserialization. It then tests round-trip serialization, handling of bad serialized data, deserialization with the `loads` function with and without backwards compatibility, correct data type identification, and that a TypeError is raised when unexpected fields are present or required fields are missing.",
+ "details": [
+ {
+ "comment": "This code defines three classes, DataclassC, DataclassC_ext, and DataclassB, which inherit from FastDataclass. These classes have fields of different types and are registered using the register_dataclass decorator. The purpose is to enable serialization and deserialization for instances of these classes.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py\":0-33",
+ "content": "from dataclasses import dataclass\nimport pytest\nfrom .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass\n# Inheritance is a bit tricky with our setup. dataclass_name must be set for instances of these\n# classes to serialize and deserialize correctly, but if it's given a default value, then subclasses\n# can't have any fields that don't have default values, because of how constructors are generated\n# for dataclasses (fields with no default value can't follow those with default values). To work\n# around this, we set dataclass_name in __post_init__ on the base class, which is called after the\n# constructor. The implementation does the right thing for both the base class and the subclass.\n@register_dataclass\n@dataclass\nclass DataclassC(FastDataclass):\n ints: list[int]\n@register_dataclass\n@dataclass\nclass DataclassC_ext(DataclassC):\n s: str\n@register_dataclass\n@dataclass\nclass DataclassB(FastDataclass):\n str_to_c: dict[str, DataclassC]\n cs: list[DataclassC]\n@register_dataclass\n@dataclass"
+ },
+ {
+ "comment": "- Instantiate a DataclassA object with specified floats, strings, and nested DataclassB objects.\n- Assert that the serialized and deserialized versions of the DataclassA object are equal.\n- Test serialization and deserialization for DataclassC and DataclassC_ext.\n- Test handling of bad serialized data.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py\":34-74",
+ "content": "class DataclassA(FastDataclass):\n floats: list[float]\n strings: list[str]\n bs: list[DataclassB]\n@register_dataclass\n@dataclass\nclass DataclassD(FastDataclass):\n s1: str\n s2: str = \"default\"\ndef test_dataclasses() -> None:\n a = DataclassA(\n floats=[1.0, 2.0],\n strings=[\"a\", \"b\"],\n bs=[\n DataclassB(\n str_to_c={\"a\": DataclassC(ints=[1, 2]), \"b\": DataclassC(ints=[3, 4])},\n cs=[DataclassC(ints=[5, 6]), DataclassC_ext(ints=[7, 8], s=\"s\")],\n ),\n DataclassB(\n str_to_c={\"c\": DataclassC_ext(ints=[9, 10], s=\"t\"), \"d\": DataclassC(ints=[11, 12])},\n cs=[DataclassC(ints=[13, 14]), DataclassC(ints=[15, 16])],\n ),\n ],\n )\n assert loads(dumps(a)) == a\ndef test_c_and_c_ext() -> None:\n c_ext = DataclassC_ext(ints=[3, 4], s=\"s\")\n assert loads(dumps(c_ext)) == c_ext\n c = DataclassC(ints=[1, 2])\n assert loads(dumps(c)) == c\ndef test_bad_serialized_data() -> None:\n assert type(loads(dumps(DataclassC(ints=[3, 4])))) == DataclassC"
+ },
+ {
+ "comment": "The code tests different scenarios for deserializing data using the `loads` function. It asserts that it correctly identifies the data type and raises a TypeError when unexpected fields are present.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py\":75-82",
+ "content": " assert type(loads('{\"ints\": [3, 4]}', backwards_compatible=False)) == dict\n assert type(loads('{\"ints\": [3, 4], \"dataclass_name\": \"DataclassC\"}')) == DataclassC\n with pytest.raises(TypeError):\n loads('{\"ints\": [3, 4], \"bogus_extra_field\": \"foo\", \"dataclass_name\": \"DataclassC\"}')\n with pytest.raises(TypeError):\n loads('{\"ints_field_is_missing\": [3, 4], \"dataclass_name\": \"DataclassC\"}')\n assert type(loads('{\"s1\": \"test\"}', backwards_compatible=False)) == dict\n assert type(loads('{\"s1\": \"test\"}', backwards_compatible=True)) == DataclassD"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/ebb523b1-92bd-4491-bead-46e4fa454ff1.json b/docs/doc/ebb523b1-92bd-4491-bead-46e4fa454ff1.json
new file mode 100644
index 0000000..80f918a
--- /dev/null
+++ b/docs/doc/ebb523b1-92bd-4491-bead-46e4fa454ff1.json
@@ -0,0 +1,10 @@
+{
+ "summary": "Re-exports the TopTokens, Explanation, DatasetList, and SimilarNeurons pane components from their separate files for use in the application.",
+ "details": [
+ {
+ "comment": "Importing components from separate files for use in the application.",
+ "comment": "Re-exports the TopTokens, Explanation, DatasetList, and SimilarNeurons pane components from their separate files for use in the application.",
+ "content": "export { default as TopTokens } from \"./topTokens\"\nexport { default as Explanation } from \"./explanation\"\nexport { default as DatasetList } from \"./datasetList\"\nexport { default as SimilarNeurons } from \"./similarNeurons\""
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/doc/f00fe84a-3109-4f06-9d96-45aba25795d7.json b/docs/doc/f00fe84a-3109-4f06-9d96-45aba25795d7.json
new file mode 100644
index 0000000..f2be3ca
--- /dev/null
+++ b/docs/doc/f00fe84a-3109-4f06-9d96-45aba25795d7.json
@@ -0,0 +1,20 @@
+{
+ "summary": "The code prepares the environment, imports modules, and configures API keys for an explanation model. It loads data, generates explanations, and simulates them using a specific format. The preferred score is then printed with two decimal places.",
+ "details": [
+ {
+ "comment": "This code is setting up the environment and importing necessary modules for running an explanation model and simulator. It also sets the OpenAI API key, explanation model name, and simulator model name.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py\":0-30",
+ "content": "#!/usr/bin/env python\n# coding: utf-8\n# In[ ]:\nget_ipython().run_line_magic('load_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\n# In[ ]:\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\nfrom neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\nfrom neuron_explainer.activations.token_connections import load_token_lookup_table_connections_of_neuron\nfrom neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\nfrom neuron_explainer.explanations.explainer import TokenSpaceRepresentationExplainer\nfrom neuron_explainer.explanations.prompt_builder import PromptFormat\nfrom neuron_explainer.explanations.scoring import simulate_and_score\nfrom neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\nEXPLAINER_MODEL_NAME = \"gpt-4\"\nSIMULATOR_MODEL_NAME = \"text-davinci-003\"\n# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n# print(\"Response:\", test_response[\"choices\"][0][\"text\"])"
+ },
+ {
+ "comment": "Loading token lookup table and neuron record for a specific layer and index.\nGenerating an explanation using the provided token look up table.\nSimulating and scoring the generated explanation.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py\":32-66",
+ "content": "layer_index = 9\nneuron_index = 6236\n# Load a token lookup table.\ntoken_lookup_table = load_token_lookup_table_connections_of_neuron(layer_index, neuron_index)\n# Load a neuron record.\nneuron_record = load_neuron(layer_index, neuron_index)\n# Grab the activation records we'll need.\nslice_params = ActivationRecordSliceParams(n_examples_per_split=5)\nvalid_activation_records = neuron_record.valid_activation_records(\n activation_record_slice_params=slice_params\n)\n# Generate an explanation for the neuron.\nexplainer = TokenSpaceRepresentationExplainer(\n model_name=EXPLAINER_MODEL_NAME,\n prompt_format=PromptFormat.HARMONY_V4,\n max_concurrent=1,\n)\nexplanations = await explainer.generate_explanations(\n tokens=token_lookup_table.tokens,\n num_samples=1,\n)\nassert len(explanations) == 1\nexplanation = explanations[0]\nprint(f\"{explanation=}\")\n# Simulate and score the explanation.\nsimulator = UncalibratedNeuronSimulator(\n ExplanationNeuronSimulator(\n SIMULATOR_MODEL_NAME,\n explanation,\n max_concurrent=1,"
+ },
+ {
+ "comment": "Setting prompt format to \"INSTRUCTION_FOLLOWING\" and calling a function to simulate and score the activation records. Then, printing the preferred score with two decimal places.",
+ "location": "\"/media/root/Toshiba XG3/works/automated-interpretability/docs/src/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py\":67-71",
+ "content": " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n )\n)\nscored_simulation = await simulate_and_score(simulator, valid_activation_records)\nprint(f\"score={scored_simulation.get_preferred_score():.2f}\")"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/github-markdown.css b/docs/github-markdown.css
new file mode 100644
index 0000000..96a4f29
--- /dev/null
+++ b/docs/github-markdown.css
@@ -0,0 +1,1197 @@
+@media (prefers-color-scheme: dark) {
+
+ .markdown-body,
+ [data-theme="dark"] {
+ /*dark*/
+ color-scheme: dark;
+ --color-prettylights-syntax-comment: #8b949e;
+ --color-prettylights-syntax-constant: #79c0ff;
+ --color-prettylights-syntax-entity: #d2a8ff;
+ --color-prettylights-syntax-storage-modifier-import: #c9d1d9;
+ --color-prettylights-syntax-entity-tag: #7ee787;
+ --color-prettylights-syntax-keyword: #ff7b72;
+ --color-prettylights-syntax-string: #a5d6ff;
+ --color-prettylights-syntax-variable: #ffa657;
+ --color-prettylights-syntax-brackethighlighter-unmatched: #f85149;
+ --color-prettylights-syntax-invalid-illegal-text: #f0f6fc;
+ --color-prettylights-syntax-invalid-illegal-bg: #8e1519;
+ --color-prettylights-syntax-carriage-return-text: #f0f6fc;
+ --color-prettylights-syntax-carriage-return-bg: #b62324;
+ --color-prettylights-syntax-string-regexp: #7ee787;
+ --color-prettylights-syntax-markup-list: #f2cc60;
+ --color-prettylights-syntax-markup-heading: #1f6feb;
+ --color-prettylights-syntax-markup-italic: #c9d1d9;
+ --color-prettylights-syntax-markup-bold: #c9d1d9;
+ --color-prettylights-syntax-markup-deleted-text: #ffdcd7;
+ --color-prettylights-syntax-markup-deleted-bg: #67060c;
+ --color-prettylights-syntax-markup-inserted-text: #aff5b4;
+ --color-prettylights-syntax-markup-inserted-bg: #033a16;
+ --color-prettylights-syntax-markup-changed-text: #ffdfb6;
+ --color-prettylights-syntax-markup-changed-bg: #5a1e02;
+ --color-prettylights-syntax-markup-ignored-text: #c9d1d9;
+ --color-prettylights-syntax-markup-ignored-bg: #1158c7;
+ --color-prettylights-syntax-meta-diff-range: #d2a8ff;
+ --color-prettylights-syntax-brackethighlighter-angle: #8b949e;
+ --color-prettylights-syntax-sublimelinter-gutter-mark: #484f58;
+ --color-prettylights-syntax-constant-other-reference-link: #a5d6ff;
+ --color-fg-default: #e6edf3;
+ --color-fg-muted: #848d97;
+ --color-fg-subtle: #6e7681;
+ --color-canvas-default: #0d1117;
+ --color-canvas-subtle: #161b22;
+ --color-border-default: #30363d;
+ --color-border-muted: #21262d;
+ --color-neutral-muted: rgba(110, 118, 129, 0.4);
+ --color-accent-fg: #2f81f7;
+ --color-accent-emphasis: #1f6feb;
+ --color-success-fg: #3fb950;
+ --color-success-emphasis: #238636;
+ --color-attention-fg: #d29922;
+ --color-attention-emphasis: #9e6a03;
+ --color-attention-subtle: rgba(187, 128, 9, 0.15);
+ --color-danger-fg: #f85149;
+ --color-danger-emphasis: #da3633;
+ --color-done-fg: #a371f7;
+ --color-done-emphasis: #8957e5;
+ }
+}
+
+@media (prefers-color-scheme: light) {
+
+ .markdown-body,
+ [data-theme="light"] {
+ /*light*/
+ color-scheme: light;
+ --color-prettylights-syntax-comment: #57606a;
+ --color-prettylights-syntax-constant: #0550ae;
+ --color-prettylights-syntax-entity: #6639ba;
+ --color-prettylights-syntax-storage-modifier-import: #24292f;
+ --color-prettylights-syntax-entity-tag: #116329;
+ --color-prettylights-syntax-keyword: #cf222e;
+ --color-prettylights-syntax-string: #0a3069;
+ --color-prettylights-syntax-variable: #953800;
+ --color-prettylights-syntax-brackethighlighter-unmatched: #82071e;
+ --color-prettylights-syntax-invalid-illegal-text: #f6f8fa;
+ --color-prettylights-syntax-invalid-illegal-bg: #82071e;
+ --color-prettylights-syntax-carriage-return-text: #f6f8fa;
+ --color-prettylights-syntax-carriage-return-bg: #cf222e;
+ --color-prettylights-syntax-string-regexp: #116329;
+ --color-prettylights-syntax-markup-list: #3b2300;
+ --color-prettylights-syntax-markup-heading: #0550ae;
+ --color-prettylights-syntax-markup-italic: #24292f;
+ --color-prettylights-syntax-markup-bold: #24292f;
+ --color-prettylights-syntax-markup-deleted-text: #82071e;
+ --color-prettylights-syntax-markup-deleted-bg: #ffebe9;
+ --color-prettylights-syntax-markup-inserted-text: #116329;
+ --color-prettylights-syntax-markup-inserted-bg: #dafbe1;
+ --color-prettylights-syntax-markup-changed-text: #953800;
+ --color-prettylights-syntax-markup-changed-bg: #ffd8b5;
+ --color-prettylights-syntax-markup-ignored-text: #eaeef2;
+ --color-prettylights-syntax-markup-ignored-bg: #0550ae;
+ --color-prettylights-syntax-meta-diff-range: #8250df;
+ --color-prettylights-syntax-brackethighlighter-angle: #57606a;
+ --color-prettylights-syntax-sublimelinter-gutter-mark: #8c959f;
+ --color-prettylights-syntax-constant-other-reference-link: #0a3069;
+ --color-fg-default: #1F2328;
+ --color-fg-muted: #656d76;
+ --color-fg-subtle: #6e7781;
+ --color-canvas-default: #ffffff;
+ --color-canvas-subtle: #f6f8fa;
+ --color-border-default: #d0d7de;
+ --color-border-muted: hsla(210, 18%, 87%, 1);
+ --color-neutral-muted: rgba(175, 184, 193, 0.2);
+ --color-accent-fg: #0969da;
+ --color-accent-emphasis: #0969da;
+ --color-success-fg: #1a7f37;
+ --color-success-emphasis: #1f883d;
+ --color-attention-fg: #9a6700;
+ --color-attention-emphasis: #9a6700;
+ --color-attention-subtle: #fff8c5;
+ --color-danger-fg: #d1242f;
+ --color-danger-emphasis: #cf222e;
+ --color-done-fg: #8250df;
+ --color-done-emphasis: #8250df;
+ }
+}
+
+.markdown-body {
+ -ms-text-size-adjust: 100%;
+ -webkit-text-size-adjust: 100%;
+ margin: 0;
+ color: var(--color-fg-default);
+ background-color: var(--color-canvas-default);
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji";
+ font-size: 16px;
+ line-height: 1.5;
+ word-wrap: break-word;
+}
+
+.markdown-body .octicon {
+ display: inline-block;
+ fill: currentColor;
+ vertical-align: text-bottom;
+}
+
+.markdown-body h1:hover .anchor .octicon-link:before,
+.markdown-body h2:hover .anchor .octicon-link:before,
+.markdown-body h3:hover .anchor .octicon-link:before,
+.markdown-body h4:hover .anchor .octicon-link:before,
+.markdown-body h5:hover .anchor .octicon-link:before,
+.markdown-body h6:hover .anchor .octicon-link:before {
+ width: 16px;
+ height: 16px;
+ content: ' ';
+ display: inline-block;
+ background-color: currentColor;
+ -webkit-mask-image: url("data:image/svg+xml,");
+ mask-image: url("data:image/svg+xml,");
+}
+
+.markdown-body details,
+.markdown-body figcaption,
+.markdown-body figure {
+ display: block;
+}
+
+.markdown-body summary {
+ display: list-item;
+}
+
+.markdown-body [hidden] {
+ display: none !important;
+}
+
+.markdown-body a {
+ background-color: transparent;
+ color: var(--color-accent-fg);
+ text-decoration: none;
+}
+
+.markdown-body abbr[title] {
+ border-bottom: none;
+ -webkit-text-decoration: underline dotted;
+ text-decoration: underline dotted;
+}
+
+.markdown-body b,
+.markdown-body strong {
+ font-weight: var(--base-text-weight-semibold, 600);
+}
+
+.markdown-body dfn {
+ font-style: italic;
+}
+
+.markdown-body h1 {
+ margin: .67em 0;
+ font-weight: var(--base-text-weight-semibold, 600);
+ padding-bottom: .3em;
+ font-size: 2em;
+ border-bottom: 1px solid var(--color-border-muted);
+}
+
+.markdown-body mark {
+ background-color: var(--color-attention-subtle);
+ color: var(--color-fg-default);
+}
+
+.markdown-body small {
+ font-size: 90%;
+}
+
+.markdown-body sub,
+.markdown-body sup {
+ font-size: 75%;
+ line-height: 0;
+ position: relative;
+ vertical-align: baseline;
+}
+
+.markdown-body sub {
+ bottom: -0.25em;
+}
+
+.markdown-body sup {
+ top: -0.5em;
+}
+
+.markdown-body img {
+ border-style: none;
+ max-width: 100%;
+ box-sizing: content-box;
+ background-color: var(--color-canvas-default);
+}
+
+.markdown-body code,
+.markdown-body kbd,
+.markdown-body pre,
+.markdown-body samp {
+ font-family: monospace;
+ font-size: 1em;
+}
+
+.markdown-body figure {
+ margin: 1em 40px;
+}
+
+.markdown-body hr {
+ box-sizing: content-box;
+ overflow: hidden;
+ background: transparent;
+ border-bottom: 1px solid var(--color-border-muted);
+ height: .25em;
+ padding: 0;
+ margin: 24px 0;
+ background-color: var(--color-border-default);
+ border: 0;
+}
+
+.markdown-body input {
+ font: inherit;
+ margin: 0;
+ overflow: visible;
+ font-family: inherit;
+ font-size: inherit;
+ line-height: inherit;
+}
+
+.markdown-body [type=button],
+.markdown-body [type=reset],
+.markdown-body [type=submit] {
+ -webkit-appearance: button;
+ appearance: button;
+}
+
+.markdown-body [type=checkbox],
+.markdown-body [type=radio] {
+ box-sizing: border-box;
+ padding: 0;
+}
+
+.markdown-body [type=number]::-webkit-inner-spin-button,
+.markdown-body [type=number]::-webkit-outer-spin-button {
+ height: auto;
+}
+
+.markdown-body [type=search]::-webkit-search-cancel-button,
+.markdown-body [type=search]::-webkit-search-decoration {
+ -webkit-appearance: none;
+ appearance: none;
+}
+
+.markdown-body ::-webkit-input-placeholder {
+ color: inherit;
+ opacity: .54;
+}
+
+.markdown-body ::-webkit-file-upload-button {
+ -webkit-appearance: button;
+ appearance: button;
+ font: inherit;
+}
+
+.markdown-body a:hover {
+ text-decoration: underline;
+}
+
+.markdown-body ::placeholder {
+ color: var(--color-fg-subtle);
+ opacity: 1;
+}
+
+.markdown-body hr::before {
+ display: table;
+ content: "";
+}
+
+.markdown-body hr::after {
+ display: table;
+ clear: both;
+ content: "";
+}
+
+.markdown-body table {
+ border-spacing: 0;
+ border-collapse: collapse;
+ display: block;
+ width: max-content;
+ max-width: 100%;
+ overflow: auto;
+}
+
+.markdown-body td,
+.markdown-body th {
+ padding: 0;
+}
+
+.markdown-body details summary {
+ cursor: pointer;
+}
+
+.markdown-body details:not([open])>*:not(summary) {
+ display: none !important;
+}
+
+.markdown-body a:focus,
+.markdown-body [role=button]:focus,
+.markdown-body input[type=radio]:focus,
+.markdown-body input[type=checkbox]:focus {
+ outline: 2px solid var(--color-accent-fg);
+ outline-offset: -2px;
+ box-shadow: none;
+}
+
+.markdown-body a:focus:not(:focus-visible),
+.markdown-body [role=button]:focus:not(:focus-visible),
+.markdown-body input[type=radio]:focus:not(:focus-visible),
+.markdown-body input[type=checkbox]:focus:not(:focus-visible) {
+ outline: solid 1px transparent;
+}
+
+.markdown-body a:focus-visible,
+.markdown-body [role=button]:focus-visible,
+.markdown-body input[type=radio]:focus-visible,
+.markdown-body input[type=checkbox]:focus-visible {
+ outline: 2px solid var(--color-accent-fg);
+ outline-offset: -2px;
+ box-shadow: none;
+}
+
+.markdown-body a:not([class]):focus,
+.markdown-body a:not([class]):focus-visible,
+.markdown-body input[type=radio]:focus,
+.markdown-body input[type=radio]:focus-visible,
+.markdown-body input[type=checkbox]:focus,
+.markdown-body input[type=checkbox]:focus-visible {
+ outline-offset: 0;
+}
+
+.markdown-body kbd {
+ display: inline-block;
+ padding: 3px 5px;
+ font: 11px ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace;
+ line-height: 10px;
+ color: var(--color-fg-default);
+ vertical-align: middle;
+ background-color: var(--color-canvas-subtle);
+ border: solid 1px var(--color-neutral-muted);
+ border-bottom-color: var(--color-neutral-muted);
+ border-radius: 6px;
+ box-shadow: inset 0 -1px 0 var(--color-neutral-muted);
+}
+
+.markdown-body h1,
+.markdown-body h2,
+.markdown-body h3,
+.markdown-body h4,
+.markdown-body h5,
+.markdown-body h6 {
+ margin-top: 24px;
+ margin-bottom: 16px;
+ font-weight: var(--base-text-weight-semibold, 600);
+ line-height: 1.25;
+}
+
+.markdown-body h2 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ padding-bottom: .3em;
+ font-size: 1.5em;
+ border-bottom: 1px solid var(--color-border-muted);
+}
+
+.markdown-body h3 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: 1.25em;
+}
+
+.markdown-body h4 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: 1em;
+}
+
+.markdown-body h5 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: .875em;
+}
+
+.markdown-body h6 {
+ font-weight: var(--base-text-weight-semibold, 600);
+ font-size: .85em;
+ color: var(--color-fg-muted);
+}
+
+.markdown-body p {
+ margin-top: 0;
+ margin-bottom: 10px;
+}
+
+.markdown-body blockquote {
+ margin: 0;
+ padding: 0 1em;
+ color: var(--color-fg-muted);
+ border-left: .25em solid var(--color-border-default);
+}
+
+.markdown-body ul,
+.markdown-body ol {
+ margin-top: 0;
+ margin-bottom: 0;
+ padding-left: 2em;
+}
+
+.markdown-body ol ol,
+.markdown-body ul ol {
+ list-style-type: lower-roman;
+}
+
+.markdown-body ul ul ol,
+.markdown-body ul ol ol,
+.markdown-body ol ul ol,
+.markdown-body ol ol ol {
+ list-style-type: lower-alpha;
+}
+
+.markdown-body dd {
+ margin-left: 0;
+}
+
+.markdown-body tt,
+.markdown-body code,
+.markdown-body samp {
+ font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace;
+ font-size: 12px;
+}
+
+.markdown-body pre {
+ margin-top: 0;
+ margin-bottom: 0;
+ font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace;
+ font-size: 12px;
+ word-wrap: normal;
+}
+
+.markdown-body .octicon {
+ display: inline-block;
+ overflow: visible !important;
+ vertical-align: text-bottom;
+ fill: currentColor;
+}
+
+.markdown-body input::-webkit-outer-spin-button,
+.markdown-body input::-webkit-inner-spin-button {
+ margin: 0;
+ -webkit-appearance: none;
+ appearance: none;
+}
+
+.markdown-body .mr-2 {
+ margin-right: var(--base-size-8, 8px) !important;
+}
+
+.markdown-body::before {
+ display: table;
+ content: "";
+}
+
+.markdown-body::after {
+ display: table;
+ clear: both;
+ content: "";
+}
+
+.markdown-body>*:first-child {
+ margin-top: 0 !important;
+}
+
+.markdown-body>*:last-child {
+ margin-bottom: 0 !important;
+}
+
+.markdown-body a:not([href]) {
+ color: inherit;
+ text-decoration: none;
+}
+
+.markdown-body .absent {
+ color: var(--color-danger-fg);
+}
+
+.markdown-body .anchor {
+ float: left;
+ padding-right: 4px;
+ margin-left: -20px;
+ line-height: 1;
+}
+
+.markdown-body .anchor:focus {
+ outline: none;
+}
+
+.markdown-body p,
+.markdown-body blockquote,
+.markdown-body ul,
+.markdown-body ol,
+.markdown-body dl,
+.markdown-body table,
+.markdown-body pre,
+.markdown-body details {
+ margin-top: 0;
+ margin-bottom: 16px;
+}
+
+.markdown-body blockquote>:first-child {
+ margin-top: 0;
+}
+
+.markdown-body blockquote>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body h1 .octicon-link,
+.markdown-body h2 .octicon-link,
+.markdown-body h3 .octicon-link,
+.markdown-body h4 .octicon-link,
+.markdown-body h5 .octicon-link,
+.markdown-body h6 .octicon-link {
+ color: var(--color-fg-default);
+ vertical-align: middle;
+ visibility: hidden;
+}
+
+.markdown-body h1:hover .anchor,
+.markdown-body h2:hover .anchor,
+.markdown-body h3:hover .anchor,
+.markdown-body h4:hover .anchor,
+.markdown-body h5:hover .anchor,
+.markdown-body h6:hover .anchor {
+ text-decoration: none;
+}
+
+.markdown-body h1:hover .anchor .octicon-link,
+.markdown-body h2:hover .anchor .octicon-link,
+.markdown-body h3:hover .anchor .octicon-link,
+.markdown-body h4:hover .anchor .octicon-link,
+.markdown-body h5:hover .anchor .octicon-link,
+.markdown-body h6:hover .anchor .octicon-link {
+ visibility: visible;
+}
+
+.markdown-body h1 tt,
+.markdown-body h1 code,
+.markdown-body h2 tt,
+.markdown-body h2 code,
+.markdown-body h3 tt,
+.markdown-body h3 code,
+.markdown-body h4 tt,
+.markdown-body h4 code,
+.markdown-body h5 tt,
+.markdown-body h5 code,
+.markdown-body h6 tt,
+.markdown-body h6 code {
+ padding: 0 .2em;
+ font-size: inherit;
+}
+
+.markdown-body summary h1,
+.markdown-body summary h2,
+.markdown-body summary h3,
+.markdown-body summary h4,
+.markdown-body summary h5,
+.markdown-body summary h6 {
+ display: inline-block;
+}
+
+.markdown-body summary h1 .anchor,
+.markdown-body summary h2 .anchor,
+.markdown-body summary h3 .anchor,
+.markdown-body summary h4 .anchor,
+.markdown-body summary h5 .anchor,
+.markdown-body summary h6 .anchor {
+ margin-left: -40px;
+}
+
+.markdown-body summary h1,
+.markdown-body summary h2 {
+ padding-bottom: 0;
+ border-bottom: 0;
+}
+
+.markdown-body ul.no-list,
+.markdown-body ol.no-list {
+ padding: 0;
+ list-style-type: none;
+}
+
+.markdown-body ol[type="a s"] {
+ list-style-type: lower-alpha;
+}
+
+.markdown-body ol[type="A s"] {
+ list-style-type: upper-alpha;
+}
+
+.markdown-body ol[type="i s"] {
+ list-style-type: lower-roman;
+}
+
+.markdown-body ol[type="I s"] {
+ list-style-type: upper-roman;
+}
+
+.markdown-body ol[type="1"] {
+ list-style-type: decimal;
+}
+
+.markdown-body div>ol:not([type]) {
+ list-style-type: decimal;
+}
+
+.markdown-body ul ul,
+.markdown-body ul ol,
+.markdown-body ol ol,
+.markdown-body ol ul {
+ margin-top: 0;
+ margin-bottom: 0;
+}
+
+.markdown-body li>p {
+ margin-top: 16px;
+}
+
+.markdown-body li+li {
+ margin-top: .25em;
+}
+
+.markdown-body dl {
+ padding: 0;
+}
+
+.markdown-body dl dt {
+ padding: 0;
+ margin-top: 16px;
+ font-size: 1em;
+ font-style: italic;
+ font-weight: var(--base-text-weight-semibold, 600);
+}
+
+.markdown-body dl dd {
+ padding: 0 16px;
+ margin-bottom: 16px;
+}
+
+.markdown-body table th {
+ font-weight: var(--base-text-weight-semibold, 600);
+}
+
+.markdown-body table th,
+.markdown-body table td {
+ padding: 6px 13px;
+ border: 1px solid var(--color-border-default);
+}
+
+.markdown-body table td>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body table tr {
+ background-color: var(--color-canvas-default);
+ border-top: 1px solid var(--color-border-muted);
+}
+
+.markdown-body table tr:nth-child(2n) {
+ background-color: var(--color-canvas-subtle);
+}
+
+.markdown-body table img {
+ background-color: transparent;
+}
+
+.markdown-body img[align=right] {
+ padding-left: 20px;
+}
+
+.markdown-body img[align=left] {
+ padding-right: 20px;
+}
+
+.markdown-body .emoji {
+ max-width: none;
+ vertical-align: text-top;
+ background-color: transparent;
+}
+
+.markdown-body span.frame {
+ display: block;
+ overflow: hidden;
+}
+
+.markdown-body span.frame>span {
+ display: block;
+ float: left;
+ width: auto;
+ padding: 7px;
+ margin: 13px 0 0;
+ overflow: hidden;
+ border: 1px solid var(--color-border-default);
+}
+
+.markdown-body span.frame span img {
+ display: block;
+ float: left;
+}
+
+.markdown-body span.frame span span {
+ display: block;
+ padding: 5px 0 0;
+ clear: both;
+ color: var(--color-fg-default);
+}
+
+.markdown-body span.align-center {
+ display: block;
+ overflow: hidden;
+ clear: both;
+}
+
+.markdown-body span.align-center>span {
+ display: block;
+ margin: 13px auto 0;
+ overflow: hidden;
+ text-align: center;
+}
+
+.markdown-body span.align-center span img {
+ margin: 0 auto;
+ text-align: center;
+}
+
+.markdown-body span.align-right {
+ display: block;
+ overflow: hidden;
+ clear: both;
+}
+
+.markdown-body span.align-right>span {
+ display: block;
+ margin: 13px 0 0;
+ overflow: hidden;
+ text-align: right;
+}
+
+.markdown-body span.align-right span img {
+ margin: 0;
+ text-align: right;
+}
+
+.markdown-body span.float-left {
+ display: block;
+ float: left;
+ margin-right: 13px;
+ overflow: hidden;
+}
+
+.markdown-body span.float-left span {
+ margin: 13px 0 0;
+}
+
+.markdown-body span.float-right {
+ display: block;
+ float: right;
+ margin-left: 13px;
+ overflow: hidden;
+}
+
+.markdown-body span.float-right>span {
+ display: block;
+ margin: 13px auto 0;
+ overflow: hidden;
+ text-align: right;
+}
+
+.markdown-body code,
+.markdown-body tt {
+ padding: .2em .4em;
+ margin: 0;
+ font-size: 85%;
+ white-space: break-spaces;
+ background-color: var(--color-neutral-muted);
+ border-radius: 6px;
+}
+
+.markdown-body code br,
+.markdown-body tt br {
+ display: none;
+}
+
+.markdown-body del code {
+ text-decoration: inherit;
+}
+
+.markdown-body samp {
+ font-size: 85%;
+}
+
+.markdown-body pre code {
+ font-size: 100%;
+}
+
+.markdown-body pre>code {
+ padding: 0;
+ margin: 0;
+ word-break: normal;
+ white-space: pre;
+ background: transparent;
+ border: 0;
+}
+
+.markdown-body .highlight {
+ margin-bottom: 16px;
+}
+
+.markdown-body .highlight pre {
+ margin-bottom: 0;
+ word-break: normal;
+}
+
+.markdown-body .highlight pre,
+.markdown-body pre {
+ padding: 16px;
+ overflow: auto;
+ font-size: 85%;
+ line-height: 1.45;
+ color: var(--color-fg-default);
+ background-color: var(--color-canvas-subtle);
+ border-radius: 6px;
+}
+
+.markdown-body pre code,
+.markdown-body pre tt {
+ display: inline; /* inline code inside <pre> flows with the preformatted text */
+ max-width: none; /* was `auto`, which is not a valid max-width value and was discarded; `none` is the valid "no limit" keyword */
+ padding: 0;
+ margin: 0;
+ overflow: visible;
+ line-height: inherit;
+ word-wrap: normal;
+ background-color: transparent;
+ border: 0;
+}
+
+.markdown-body .csv-data td,
+.markdown-body .csv-data th {
+ padding: 5px;
+ overflow: hidden;
+ font-size: 12px;
+ line-height: 1;
+ text-align: left;
+ white-space: nowrap;
+}
+
+.markdown-body .csv-data .blob-num {
+ padding: 10px 8px 9px;
+ text-align: right;
+ background: var(--color-canvas-default);
+ border: 0;
+}
+
+.markdown-body .csv-data tr {
+ border-top: 0;
+}
+
+.markdown-body .csv-data th {
+ font-weight: var(--base-text-weight-semibold, 600);
+ background: var(--color-canvas-subtle);
+ border-top: 0;
+}
+
+.markdown-body [data-footnote-ref]::before {
+ content: "[";
+}
+
+.markdown-body [data-footnote-ref]::after {
+ content: "]";
+}
+
+.markdown-body .footnotes {
+ font-size: 12px;
+ color: var(--color-fg-muted);
+ border-top: 1px solid var(--color-border-default);
+}
+
+.markdown-body .footnotes ol {
+ padding-left: 16px;
+}
+
+.markdown-body .footnotes ol ul {
+ display: inline-block;
+ padding-left: 16px;
+ margin-top: 16px;
+}
+
+.markdown-body .footnotes li {
+ position: relative;
+}
+
+.markdown-body .footnotes li:target::before {
+ position: absolute;
+ top: -8px;
+ right: -8px;
+ bottom: -8px;
+ left: -24px;
+ pointer-events: none;
+ content: "";
+ border: 2px solid var(--color-accent-emphasis);
+ border-radius: 6px;
+}
+
+.markdown-body .footnotes li:target {
+ color: var(--color-fg-default);
+}
+
+.markdown-body .footnotes .data-footnote-backref g-emoji {
+ font-family: monospace;
+}
+
+.markdown-body .pl-c {
+ color: var(--color-prettylights-syntax-comment);
+}
+
+.markdown-body .pl-c1,
+.markdown-body .pl-s .pl-v {
+ color: var(--color-prettylights-syntax-constant);
+}
+
+.markdown-body .pl-e,
+.markdown-body .pl-en {
+ color: var(--color-prettylights-syntax-entity);
+}
+
+.markdown-body .pl-smi,
+.markdown-body .pl-s .pl-s1 {
+ color: var(--color-prettylights-syntax-storage-modifier-import);
+}
+
+.markdown-body .pl-ent {
+ color: var(--color-prettylights-syntax-entity-tag);
+}
+
+.markdown-body .pl-k {
+ color: var(--color-prettylights-syntax-keyword);
+}
+
+.markdown-body .pl-s,
+.markdown-body .pl-pds,
+.markdown-body .pl-s .pl-pse .pl-s1,
+.markdown-body .pl-sr,
+.markdown-body .pl-sr .pl-cce,
+.markdown-body .pl-sr .pl-sre,
+.markdown-body .pl-sr .pl-sra {
+ color: var(--color-prettylights-syntax-string);
+}
+
+.markdown-body .pl-v,
+.markdown-body .pl-smw {
+ color: var(--color-prettylights-syntax-variable);
+}
+
+.markdown-body .pl-bu {
+ color: var(--color-prettylights-syntax-brackethighlighter-unmatched);
+}
+
+.markdown-body .pl-ii {
+ color: var(--color-prettylights-syntax-invalid-illegal-text);
+ background-color: var(--color-prettylights-syntax-invalid-illegal-bg);
+}
+
+.markdown-body .pl-c2 {
+ color: var(--color-prettylights-syntax-carriage-return-text);
+ background-color: var(--color-prettylights-syntax-carriage-return-bg);
+}
+
+.markdown-body .pl-sr .pl-cce {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-string-regexp);
+}
+
+.markdown-body .pl-ml {
+ color: var(--color-prettylights-syntax-markup-list);
+}
+
+.markdown-body .pl-mh,
+.markdown-body .pl-mh .pl-en,
+.markdown-body .pl-ms {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-markup-heading);
+}
+
+.markdown-body .pl-mi {
+ font-style: italic;
+ color: var(--color-prettylights-syntax-markup-italic);
+}
+
+.markdown-body .pl-mb {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-markup-bold);
+}
+
+.markdown-body .pl-md {
+ color: var(--color-prettylights-syntax-markup-deleted-text);
+ background-color: var(--color-prettylights-syntax-markup-deleted-bg);
+}
+
+.markdown-body .pl-mi1 {
+ color: var(--color-prettylights-syntax-markup-inserted-text);
+ background-color: var(--color-prettylights-syntax-markup-inserted-bg);
+}
+
+.markdown-body .pl-mc {
+ color: var(--color-prettylights-syntax-markup-changed-text);
+ background-color: var(--color-prettylights-syntax-markup-changed-bg);
+}
+
+.markdown-body .pl-mi2 {
+ color: var(--color-prettylights-syntax-markup-ignored-text);
+ background-color: var(--color-prettylights-syntax-markup-ignored-bg);
+}
+
+.markdown-body .pl-mdr {
+ font-weight: bold;
+ color: var(--color-prettylights-syntax-meta-diff-range);
+}
+
+.markdown-body .pl-ba {
+ color: var(--color-prettylights-syntax-brackethighlighter-angle);
+}
+
+.markdown-body .pl-sg {
+ color: var(--color-prettylights-syntax-sublimelinter-gutter-mark);
+}
+
+.markdown-body .pl-corl {
+ text-decoration: underline;
+ color: var(--color-prettylights-syntax-constant-other-reference-link);
+}
+
+.markdown-body g-emoji {
+ display: inline-block;
+ min-width: 1ch;
+ font-family: "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
+ font-size: 1em;
+ font-style: normal !important;
+ font-weight: var(--base-text-weight-normal, 400);
+ line-height: 1;
+ vertical-align: -0.075em;
+}
+
+.markdown-body g-emoji img {
+ width: 1em;
+ height: 1em;
+}
+
+.markdown-body .task-list-item {
+ list-style-type: none;
+}
+
+.markdown-body .task-list-item label {
+ font-weight: var(--base-text-weight-normal, 400);
+}
+
+.markdown-body .task-list-item.enabled label {
+ cursor: pointer;
+}
+
+.markdown-body .task-list-item+.task-list-item {
+ margin-top: 4px;
+}
+
+.markdown-body .task-list-item .handle {
+ display: none;
+}
+
+.markdown-body .task-list-item-checkbox {
+ margin: 0 .2em .25em -1.4em;
+ vertical-align: middle;
+}
+
+.markdown-body .contains-task-list:dir(rtl) .task-list-item-checkbox {
+ margin: 0 -1.6em .25em .2em;
+}
+
+.markdown-body .contains-task-list {
+ position: relative;
+}
+
+.markdown-body .contains-task-list:hover .task-list-item-convert-container,
+.markdown-body .contains-task-list:focus-within .task-list-item-convert-container {
+ display: block;
+ width: auto;
+ height: 24px;
+ overflow: visible;
+ clip: auto;
+}
+
+.markdown-body ::-webkit-calendar-picker-indicator {
+ filter: invert(50%);
+}
+
+.markdown-body .markdown-alert {
+ padding: var(--base-size-8, 8px) var(--base-size-16, 16px); /* fallbacks keep the padding when the size tokens are not defined on the page */
+ margin-bottom: 16px;
+ color: inherit;
+ border-left: .25em solid var(--color-border-default);
+}
+
+.markdown-body .markdown-alert>:first-child {
+ margin-top: 0;
+}
+
+.markdown-body .markdown-alert>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body .markdown-alert .markdown-alert-title {
+ display: flex;
+ font-weight: var(--base-text-weight-medium, 500);
+ align-items: center;
+ line-height: 1;
+}
+
+.markdown-body .markdown-alert.markdown-alert-note {
+ border-left-color: var(--color-accent-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-note .markdown-alert-title {
+ color: var(--color-accent-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-important {
+ border-left-color: var(--color-done-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-important .markdown-alert-title {
+ color: var(--color-done-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-warning {
+ border-left-color: var(--color-attention-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-warning .markdown-alert-title {
+ color: var(--color-attention-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-tip {
+ border-left-color: var(--color-success-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-tip .markdown-alert-title {
+ color: var(--color-success-fg);
+}
+
+.markdown-body .markdown-alert.markdown-alert-caution {
+ border-left-color: var(--color-danger-emphasis);
+}
+
+.markdown-body .markdown-alert.markdown-alert-caution .markdown-alert-title {
+ color: var(--color-danger-fg);
+}
\ No newline at end of file
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 0000000..d1154b4
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1,1250 @@
+
+
+
+
+
+
+
+
+
+ Search Code By Comment
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Document index of:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/metadata.json b/docs/metadata.json
new file mode 100644
index 0000000..ca945fd
--- /dev/null
+++ b/docs/metadata.json
@@ -0,0 +1,225 @@
+{
+ "url": {
+ "full": "https://github.com/openai/automated-interpretability",
+ "partial": "openai/automated-interpretability"
+ },
+ "file_mapping": {
+ "0": {
+ "filepath": "/README.md",
+ "entry_id": 0,
+ "language_id": "plain-text"
+ },
+ "1": {
+ "filepath": "/neuron-explainer/README.md",
+ "entry_id": 14,
+ "language_id": "plain-text"
+ },
+ "2": {
+ "filepath": "/neuron-explainer/demos/explain_puzzles.py",
+ "entry_id": 18,
+ "language_id": "python"
+ },
+ "3": {
+ "filepath": "/neuron-explainer/demos/generate_and_score_explanation.py",
+ "entry_id": 24,
+ "language_id": "python"
+ },
+ "4": {
+ "filepath": "/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py",
+ "entry_id": 32,
+ "language_id": "python"
+ },
+ "5": {
+ "filepath": "/neuron-explainer/neuron_explainer/activations/activation_records.py",
+ "entry_id": 40,
+ "language_id": "python"
+ },
+ "6": {
+ "filepath": "/neuron-explainer/neuron_explainer/activations/activations.py",
+ "entry_id": 52,
+ "language_id": "python"
+ },
+ "7": {
+ "filepath": "/neuron-explainer/neuron_explainer/activations/token_connections.py",
+ "entry_id": 76,
+ "language_id": "python"
+ },
+ "8": {
+ "filepath": "/neuron-explainer/neuron_explainer/api_client.py",
+ "entry_id": 84,
+ "language_id": "python"
+ },
+ "9": {
+ "filepath": "/neuron-explainer/neuron_explainer/azure.py",
+ "entry_id": 98,
+ "language_id": "python"
+ },
+ "10": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py",
+ "entry_id": 102,
+ "language_id": "python"
+ },
+ "11": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/explainer.py",
+ "entry_id": 120,
+ "language_id": "python"
+ },
+ "12": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/explanations.py",
+ "entry_id": 164,
+ "language_id": "python"
+ },
+ "13": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/prompt_builder.py",
+ "entry_id": 184,
+ "language_id": "python"
+ },
+ "14": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/puzzles.py",
+ "entry_id": 196,
+ "language_id": "python"
+ },
+ "15": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/scoring.py",
+ "entry_id": 204,
+ "language_id": "python"
+ },
+ "16": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/simulator.py",
+ "entry_id": 218,
+ "language_id": "python"
+ },
+ "17": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/test_explainer.py",
+ "entry_id": 284,
+ "language_id": "python"
+ },
+ "18": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/test_simulator.py",
+ "entry_id": 300,
+ "language_id": "python"
+ },
+ "19": {
+ "filepath": "/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py",
+ "entry_id": 318,
+ "language_id": "python"
+ },
+ "20": {
+ "filepath": "/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py",
+ "entry_id": 330,
+ "language_id": "python"
+ },
+ "21": {
+ "filepath": "/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py",
+ "entry_id": 334,
+ "language_id": "python"
+ },
+ "22": {
+ "filepath": "/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py",
+ "entry_id": 342,
+ "language_id": "python"
+ },
+ "23": {
+ "filepath": "/neuron-explainer/setup.py",
+ "entry_id": 350,
+ "language_id": "python"
+ },
+ "24": {
+ "filepath": "/neuron-viewer/README.md",
+ "entry_id": 354,
+ "language_id": "markdown"
+ },
+ "25": {
+ "filepath": "/neuron-viewer/python/server.py",
+ "entry_id": 358,
+ "language_id": "python"
+ },
+ "26": {
+ "filepath": "/neuron-viewer/src/App.jsx",
+ "entry_id": 364,
+ "language_id": "jsx"
+ },
+ "27": {
+ "filepath": "/neuron-viewer/src/feed.jsx",
+ "entry_id": 368,
+ "language_id": "jsx"
+ },
+ "28": {
+ "filepath": "/neuron-viewer/src/heatmapGrid.tsx",
+ "entry_id": 374,
+ "language_id": "tsx"
+ },
+ "29": {
+ "filepath": "/neuron-viewer/src/index.jsx",
+ "entry_id": 378,
+ "language_id": "jsx"
+ },
+ "30": {
+ "filepath": "/neuron-viewer/src/interpAPI.ts",
+ "entry_id": 382,
+ "language_id": "ts"
+ },
+ "31": {
+ "filepath": "/neuron-viewer/src/panes/datasetList.jsx",
+ "entry_id": 394,
+ "language_id": "jsx"
+ },
+ "32": {
+ "filepath": "/neuron-viewer/src/panes/explanation.jsx",
+ "entry_id": 404,
+ "language_id": "jsx"
+ },
+ "33": {
+ "filepath": "/neuron-viewer/src/panes/index.js",
+ "entry_id": 418,
+ "language_id": "javascript"
+ },
+ "34": {
+ "filepath": "/neuron-viewer/src/panes/similarNeurons.jsx",
+ "entry_id": 422,
+ "language_id": "jsx"
+ },
+ "35": {
+ "filepath": "/neuron-viewer/src/panes/topTokens.jsx",
+ "entry_id": 432,
+ "language_id": "jsx"
+ },
+ "36": {
+ "filepath": "/neuron-viewer/src/reportWebVitals.js",
+ "entry_id": 444,
+ "language_id": "javascript"
+ },
+ "37": {
+ "filepath": "/neuron-viewer/src/simulationHeatmap.tsx",
+ "entry_id": 448,
+ "language_id": "tsx"
+ },
+ "38": {
+ "filepath": "/neuron-viewer/src/tokenHeatmap.tsx",
+ "entry_id": 460,
+ "language_id": "tsx"
+ },
+ "39": {
+ "filepath": "/neuron-viewer/src/types.ts",
+ "entry_id": 466,
+ "language_id": "ts"
+ },
+ "40": {
+ "filepath": "/neuron-viewer/src/utils.ts",
+ "entry_id": 474,
+ "language_id": "ts"
+ },
+ "41": {
+ "filepath": "/neuron-viewer/src/welcome.tsx",
+ "entry_id": 478,
+ "language_id": "tsx"
+ },
+ "42": {
+ "filepath": "/neuron-viewer/tailwind.config.js",
+ "entry_id": 498,
+ "language_id": "javascript"
+ }
+ },
+ "project_name": "automated-interpretability",
+ "split_count": 6
+}
\ No newline at end of file
diff --git a/docs/metadata_title.json b/docs/metadata_title.json
new file mode 100644
index 0000000..0103dd2
--- /dev/null
+++ b/docs/metadata_title.json
@@ -0,0 +1 @@
+{"split_count": 1}
\ No newline at end of file
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
new file mode 100644
index 0000000..ff909ab
--- /dev/null
+++ b/docs/sitemap.xml
@@ -0,0 +1,271 @@
+
+
+
+
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/README.md
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/README.md
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/demos/explain_puzzles.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/demos/generate_and_score_explanation.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/activations/activation_records.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/activations/activations.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/activations/token_connections.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/api_client.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/azure.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/explainer.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/explanations.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/prompt_builder.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/puzzles.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/scoring.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/simulator.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/test_explainer.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/test_simulator.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-explainer/setup.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/README.md
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/python/server.py
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/App.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/feed.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/heatmapGrid.tsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/index.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/interpAPI.ts
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/panes/datasetList.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/panes/explanation.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/panes/index.js
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/panes/similarNeurons.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/panes/topTokens.jsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/reportWebVitals.js
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/simulationHeatmap.tsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/tokenHeatmap.tsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/types.ts
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/utils.ts
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/src/welcome.tsx
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability?q=/neuron-viewer/tailwind.config.js
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
+ https://james4ever0.github.io/automated-interpretability/tree.html?full=true
+ 2023-12-28T09:21:02+00:00
+ 1.00
+
+
+
\ No newline at end of file
diff --git a/docs/src/README.md b/docs/src/README.md
new file mode 100644
index 0000000..6a305a8
--- /dev/null
+++ b/docs/src/README.md
@@ -0,0 +1,80 @@
+# Automated interpretability
+
+## Code and tools
+
+This repository contains code and tools associated with the [Language models can explain neurons in
+language models](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html) paper, specifically:
+
+* Code for automatically generating, simulating, and scoring explanations of neuron behavior using
+the methodology described in the paper. See the
+[neuron-explainer README](neuron-explainer/README.md) for more information.
+
+Note: if you run into errors of the form "Error: Could not find any credentials that grant access to storage account: 'openaipublic' and container: 'neuron-explainer'", you might be able to fix this by signing up for an Azure account and specifying the credentials as described in the error message.
+
+* A tool for viewing neuron activations and explanations, accessible
+[here](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). See
+the [neuron-viewer README](neuron-viewer/README.md) for more information.
+
+## Public datasets
+
+Together with this code, we're also releasing public datasets of GPT-2 XL neurons and explanations.
+Here's an overview of those datasets.
+
+* Neuron activations: `az://openaipublic/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json`
+ - Tokenized text sequences and their activations for the neuron. We
+ provide multiple sets of tokens and activations: top-activating ones, random
+ samples from several quantiles, and a completely random sample. We also provide
+ some basic statistics for the activations.
+ - Each file contains a JSON-formatted
+ [`NeuronRecord`](neuron-explainer/neuron_explainer/activations/activations.py#L89) dataclass.
+* Neuron explanations: `az://openaipublic/neuron-explainer/data/explanations/{layer_index}/{neuron_index}.jsonl`
+ - Scored model-generated explanations of the behavior of the neuron, including simulation results.
+ - Each file contains a JSON-formatted
+ [`NeuronSimulationResults`](neuron-explainer/neuron_explainer/explanations/explanations.py#L146)
+ dataclass.
+* Related neurons: `az://openaipublic/neuron-explainer/data/related-neurons/weight-based/{layer_index}/{neuron_index}.json`
+ - Lists of the upstream and downstream neurons with the most positive and negative connections (see below for definition).
+ - Each file contains a JSON-formatted dataclass whose definition is not included in this repo.
+* Tokens with high average activations:
+`az://openaipublic/neuron-explainer/data/related-tokens/activation-based/{layer_index}/{neuron_index}.json`
+ - Lists of tokens with the highest average activations for individual neurons, and their average activations.
+ - Each file contains a JSON-formatted [`TokenLookupTableSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L36)
+ dataclass.
+* Tokens with large inbound and outbound weights:
+`az://openaipublic/neuron-explainer/data/related-tokens/weight-based/{layer_index}/{neuron_index}.json`
+ - List of the most-positive and most-negative input and output tokens for individual neurons,
+ as well as the associated weight (see below for definition).
+ - Each file contains a JSON-formatted [`WeightBasedSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L17)
+ dataclass.
+
+Update (July 5, 2023):
+We also released a set of explanations for GPT-2 Small. The methodology is slightly different from the methodology used for GPT-2 XL, so the results aren't directly comparable.
+* Neuron activations: `az://openaipublic/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json`
+* Neuron explanations: `az://openaipublic/neuron-explainer/gpt2_small_data/explanations/{layer_index}/{neuron_index}.jsonl`
+
+Update (August 30, 2023): We recently discovered a bug in how we performed inference on the GPT-2 series models used for the paper and for these datasets. Specifically, we used an optimized GELU implementation rather than the original GELU implementation associated with GPT-2. While the model’s behavior is very similar across these two configurations, the post-MLP activation values we used to generate and simulate explanations differ from the correct values by the following amounts for GPT-2 small:
+
+- Median: 0.0090
+- 90th percentile: 0.0252
+- 99th percentile: 0.0839
+- 99.9th percentile: 0.1736
+
+### Definition of connection weights
+
+Refer to [GPT-2 model code](https://github.com/openai/gpt-2/blob/master/src/model.py) for
+understanding of model weight conventions.
+
+*Neuron-neuron*: For two neurons `(l1, n1)` and `(l2, n2)` with `l1 < l2`, the connection strength is defined as
+`h{l1}.mlp.c_proj.w[:, n1, :] @ diag(h{l2}.ln_2.g) @ h{l2}.mlp.c_fc.w[:, :, n2]`.
+
+*Neuron-token*: For token `t` and neuron `(l, n)`, the input weight is computed as
+`wte[t, :] @ diag(h{l}.ln_2.g) @ h{l}.mlp.c_fc.w[:, :, n]`
+and the output weight is computed as
+`h{l}.mlp.c_proj.w[:, n, :] @ diag(ln_f.g) @ wte[t, :]`.
+
+### Misc Lists of Interesting Neurons
+Lists of neurons we thought were interesting according to different criteria, with some preliminary descriptions.
+* [Interesting Neurons (external)](https://docs.google.com/spreadsheets/d/1p7fYs31NU8sJoeKyUx4Mn2laGx8xXfHg_KcIvYiKPpg/edit#gid=0)
+* [Neurons that score high on random, possibly monosemantic? (external)](https://docs.google.com/spreadsheets/d/1TqKFcz-84jyIHLU7VRoTc8BoFBMpbgac-iNBnxVurQ8/edit?usp=sharing)
+* [Clusters of neurons well explained by activation explanation but not by tokens](https://docs.google.com/document/d/1lWhKowpKDdwTMALD_K541cdwgGoQx8DFUSuEe1U2AGE/edit?usp=sharing)
+* [Neurons sensitive to truncation](https://docs.google.com/document/d/1x89TWBvuHcyC2t01EDbJZJ5LQYHozlcS-VUmr5shf_A/edit?usp=sharing)
diff --git a/docs/src/neuron-explainer/README.md b/docs/src/neuron-explainer/README.md
new file mode 100644
index 0000000..c05b893
--- /dev/null
+++ b/docs/src/neuron-explainer/README.md
@@ -0,0 +1,18 @@
+# Neuron explainer
+
+This directory contains a version of our code for generating, simulating and scoring explanations of
+neuron behavior.
+
+# Setup
+
+```
+pip install -e .
+```
+
+# Usage
+
+For example usage, see the `demos` folder:
+
+* [Generating and scoring activation-based explanations](demos/generate_and_score_explanation.ipynb)
+* [Generating and scoring explanations based on tokens with high average activations](demos/generate_and_score_token_look_up_table_explanation.ipynb)
+* [Generating explanations for human-written neuron puzzles](demos/explain_puzzles.ipynb)
diff --git a/docs/src/neuron-explainer/demos/explain_puzzles.py b/docs/src/neuron-explainer/demos/explain_puzzles.py
new file mode 100644
index 0000000..cc3cdf4
--- /dev/null
+++ b/docs/src/neuron-explainer/demos/explain_puzzles.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+get_ipython().run_line_magic('load_ext', 'autoreload')
+get_ipython().run_line_magic('autoreload', '2')
+
+
+# In[ ]:
+
+
+import os
+
+os.environ["OPENAI_API_KEY"] = "put-key-here"
+
+from neuron_explainer.activations.activation_records import calculate_max_activation
+from neuron_explainer.explanations.explainer import TokenActivationPairExplainer
+from neuron_explainer.explanations.prompt_builder import PromptFormat
+from neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME
+
+
+EXPLAINER_MODEL_NAME = "gpt-4"
+
+explainer = TokenActivationPairExplainer(
+ model_name=EXPLAINER_MODEL_NAME,
+ prompt_format=PromptFormat.HARMONY_V4,
+ max_concurrent=1,
+)
+
+for puzzle_name, puzzle in PUZZLES_BY_NAME.items():
+ print(f"{puzzle_name=}")
+ puzzle_answer = puzzle.explanation
+ # Generate an explanation for the puzzle.
+ explanations = await explainer.generate_explanations(
+ all_activation_records=puzzle.activation_records,
+ max_activation=calculate_max_activation(puzzle.activation_records),
+ num_samples=1,
+ )
+ assert len(explanations) == 1
+ model_generated_explanation = explanations[0]
+ print(f"{model_generated_explanation=}")
+ print(f"{puzzle_answer=}\n")
+
+
diff --git a/docs/src/neuron-explainer/demos/generate_and_score_explanation.py b/docs/src/neuron-explainer/demos/generate_and_score_explanation.py
new file mode 100644
index 0000000..4f5c87a
--- /dev/null
+++ b/docs/src/neuron-explainer/demos/generate_and_score_explanation.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+get_ipython().run_line_magic('load_ext', 'autoreload')
+get_ipython().run_line_magic('autoreload', '2')
+
+
+# In[ ]:
+
+
+import os
+
+os.environ["OPENAI_API_KEY"] = "put-key-here"
+
+from neuron_explainer.activations.activation_records import calculate_max_activation
+from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron
+from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator
+from neuron_explainer.explanations.explainer import TokenActivationPairExplainer
+from neuron_explainer.explanations.prompt_builder import PromptFormat
+from neuron_explainer.explanations.scoring import simulate_and_score
+from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator
+
+EXPLAINER_MODEL_NAME = "gpt-4"
+SIMULATOR_MODEL_NAME = "text-davinci-003"
+
+
+# test_response = await client.make_request(prompt="test 123<|endofprompt|>", max_tokens=2)
+# print("Response:", test_response["choices"][0]["text"])
+
+# Load a neuron record.
+neuron_record = load_neuron(9, 6236)
+
+# Grab the activation records we'll need.
+slice_params = ActivationRecordSliceParams(n_examples_per_split=5)
+train_activation_records = neuron_record.train_activation_records(
+ activation_record_slice_params=slice_params
+)
+valid_activation_records = neuron_record.valid_activation_records(
+ activation_record_slice_params=slice_params
+)
+
+# Generate an explanation for the neuron.
+explainer = TokenActivationPairExplainer(
+ model_name=EXPLAINER_MODEL_NAME,
+ prompt_format=PromptFormat.HARMONY_V4,
+ max_concurrent=1,
+)
+explanations = await explainer.generate_explanations(
+ all_activation_records=train_activation_records,
+ max_activation=calculate_max_activation(train_activation_records),
+ num_samples=1,
+)
+assert len(explanations) == 1
+explanation = explanations[0]
+print(f"{explanation=}")
+
+# Simulate and score the explanation.
+simulator = UncalibratedNeuronSimulator(
+ ExplanationNeuronSimulator(
+ SIMULATOR_MODEL_NAME,
+ explanation,
+ max_concurrent=1,
+ prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
+ )
+)
+scored_simulation = await simulate_and_score(simulator, valid_activation_records)
+print(f"score={scored_simulation.get_preferred_score():.2f}")
+
diff --git a/docs/src/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py b/docs/src/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py
new file mode 100644
index 0000000..f28d3f9
--- /dev/null
+++ b/docs/src/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+
+
+get_ipython().run_line_magic('load_ext', 'autoreload')
+get_ipython().run_line_magic('autoreload', '2')
+
+
+# In[ ]:
+
+
+import os
+
+os.environ["OPENAI_API_KEY"] = "put-key-here"
+
+from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron
+from neuron_explainer.activations.token_connections import load_token_lookup_table_connections_of_neuron
+from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator
+from neuron_explainer.explanations.explainer import TokenSpaceRepresentationExplainer
+from neuron_explainer.explanations.prompt_builder import PromptFormat
+from neuron_explainer.explanations.scoring import simulate_and_score
+from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator
+
+EXPLAINER_MODEL_NAME = "gpt-4"
+SIMULATOR_MODEL_NAME = "text-davinci-003"
+
+
+# test_response = await client.make_request(prompt="test 123<|endofprompt|>", max_tokens=2)
+# print("Response:", test_response["choices"][0]["text"])
+
+layer_index = 9
+neuron_index = 6236
+
+# Load a token lookup table.
+token_lookup_table = load_token_lookup_table_connections_of_neuron(layer_index, neuron_index)
+
+# Load a neuron record.
+neuron_record = load_neuron(layer_index, neuron_index)
+
+# Grab the activation records we'll need.
+slice_params = ActivationRecordSliceParams(n_examples_per_split=5)
+valid_activation_records = neuron_record.valid_activation_records(
+ activation_record_slice_params=slice_params
+)
+
+# Generate an explanation for the neuron.
+explainer = TokenSpaceRepresentationExplainer(
+ model_name=EXPLAINER_MODEL_NAME,
+ prompt_format=PromptFormat.HARMONY_V4,
+ max_concurrent=1,
+)
+explanations = await explainer.generate_explanations(
+ tokens=token_lookup_table.tokens,
+ num_samples=1,
+)
+assert len(explanations) == 1
+explanation = explanations[0]
+print(f"{explanation=}")
+
+# Simulate and score the explanation.
+simulator = UncalibratedNeuronSimulator(
+ ExplanationNeuronSimulator(
+ SIMULATOR_MODEL_NAME,
+ explanation,
+ max_concurrent=1,
+ prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
+ )
+)
+scored_simulation = await simulate_and_score(simulator, valid_activation_records)
+print(f"score={scored_simulation.get_preferred_score():.2f}")
+
diff --git a/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py b/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py
new file mode 100644
index 0000000..95e01ad
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/activations/activation_records.py
@@ -0,0 +1,130 @@
+"""Utilities for formatting activation records into prompts."""
+
+import math
+from typing import Optional, Sequence
+
+from neuron_explainer.activations.activations import ActivationRecord
+
+UNKNOWN_ACTIVATION_STRING = "unknown"
+
+
+def relu(x: float) -> float:
+ return max(0.0, x)
+
+
+def calculate_max_activation(activation_records: Sequence[ActivationRecord]) -> float:
+ """Return the largest activation across all the activation records, clamping negatives to 0."""
+ flattened = [
+ # relu() clamps negative values to 0, i.e. treats them as the neuron being in its resting
+ # state. This is a simplifying assumption that works with relu/gelu.
+ max(relu(x) for x in activation_record.activations)
+ for activation_record in activation_records
+ ]
+ return max(flattened)
+
+
+def normalize_activations(activation_record: list[float], max_activation: float) -> list[int]:
+ """Convert raw neuron activations to integers on the range [0, 10], scaled by max_activation."""
+ if max_activation <= 0:
+ return [0 for x in activation_record]
+ # relu() clamps negative values to 0, i.e. treats them as the neuron being in its resting
+ # state. This is a simplifying assumption that works with relu/gelu.
+ return [min(10, math.floor(10 * relu(x) / max_activation)) for x in activation_record]
+
+
+def _format_activation_record(
+ activation_record: ActivationRecord,
+ max_activation: float,
+ omit_zeros: bool,
+ hide_activations: bool = False,
+ start_index: int = 0,
+) -> str:
+ """Format neuron activations into a string, suitable for use in prompts."""
+ tokens = activation_record.tokens
+ normalized_activations = normalize_activations(activation_record.activations, max_activation)
+ if omit_zeros:
+ assert (not hide_activations) and start_index == 0, "Can't hide activations and omit zeros"
+ tokens = [
+ token for token, activation in zip(tokens, normalized_activations) if activation > 0
+ ]
+ normalized_activations = [x for x in normalized_activations if x > 0]
+
+ entries = []
+ assert len(tokens) == len(normalized_activations)
+ for index, token, activation in zip(range(len(tokens)), tokens, normalized_activations):
+ activation_string = str(int(activation))
+ if hide_activations or index < start_index:
+ activation_string = UNKNOWN_ACTIVATION_STRING
+ entries.append(f"{token}\t{activation_string}")
+ return "\n".join(entries)
+
+
+def format_activation_records(
+ activation_records: Sequence[ActivationRecord],
+ max_activation: float,
+ *,
+ omit_zeros: bool = False,
+ start_indices: Optional[list[int]] = None,
+ hide_activations: bool = False,
+) -> str:
+ """Format a list of activation records into a string."""
+ return (
+ "\n\n"
+ + "\n\n\n".join(
+ [
+ _format_activation_record(
+ activation_record,
+ max_activation,
+ omit_zeros=omit_zeros,
+ hide_activations=hide_activations,
+ start_index=0 if start_indices is None else start_indices[i],
+ )
+ for i, activation_record in enumerate(activation_records)
+ ]
+ )
+ + "\n\n"
+ )
+
+
+def _format_tokens_for_simulation(tokens: Sequence[str]) -> str:
+ """
+ Format tokens into a string with each token marked as having an "unknown" activation, suitable
+ for use in prompts.
+ """
+ entries = []
+ for token in tokens:
+ entries.append(f"{token}\t{UNKNOWN_ACTIVATION_STRING}")
+ return "\n".join(entries)
+
+
+def format_sequences_for_simulation(
+ all_tokens: Sequence[Sequence[str]],
+) -> str:
+ """
+ Format a list of lists of tokens into a string with each token marked as having an "unknown"
+ activation, suitable for use in prompts.
+ """
+ return (
+ "\n\n"
+ + "\n\n\n".join(
+ [_format_tokens_for_simulation(tokens) for tokens in all_tokens]
+ )
+ + "\n\n"
+ )
+
+
+def non_zero_activation_proportion(
+ activation_records: Sequence[ActivationRecord], max_activation: float
+) -> float:
+ """Return the proportion of activation values that are non-zero after normalization."""
+ total_activations_count = sum(
+ [len(activation_record.activations) for activation_record in activation_records]
+ )
+ normalized_activations = [
+ normalize_activations(activation_record.activations, max_activation)
+ for activation_record in activation_records
+ ]
+ non_zero_activations_count = sum(
+ [len([x for x in activations if x != 0]) for activations in normalized_activations]
+ )
+ return non_zero_activations_count / total_activations_count
diff --git a/docs/src/neuron-explainer/neuron_explainer/activations/activations.py b/docs/src/neuron-explainer/neuron_explainer/activations/activations.py
new file mode 100644
index 0000000..50acbfb
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/activations/activations.py
@@ -0,0 +1,280 @@
+# Dataclasses and enums for storing neuron-indexed information about activations. Also, related
+# helper functions.
+
+import math
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+import urllib.request
+import blobfile as bf
+import boostedblob as bbb
+from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
+from neuron_explainer.azure import standardize_azure_url
+
+
+@register_dataclass
+@dataclass
+class ActivationRecord(FastDataclass):
+ """Collated lists of tokens and their activations for a single neuron."""
+
+ tokens: List[str]
+ """Tokens in the text sequence, represented as strings."""
+ activations: List[float]
+ """Raw activation values for the neuron on each token in the text sequence."""
+
+
+@register_dataclass
+@dataclass
+class NeuronId(FastDataclass):
+ """Identifier for a neuron in an artificial neural network."""
+
+ layer_index: int
+ """The index of the layer the neuron is in. The first layer used during inference has index 0."""
+ neuron_index: int
+ """The neuron's index within its layer. Indices start from 0 in each layer."""
+
+
+def _check_slices(
+ slices_by_split: dict[str, slice],
+ expected_num_values: int,
+) -> None:
+ """Assert that the slices are disjoint and fully cover the intended range."""
+ indices = set()
+ sum_of_slice_lengths = 0
+ n_splits = len(slices_by_split.keys())
+ for s in slices_by_split.values():
+ subrange = range(expected_num_values)[s]
+ sum_of_slice_lengths += len(subrange)
+ indices |= set(subrange)
+ assert (
+ sum_of_slice_lengths == expected_num_values
+ ), f"{sum_of_slice_lengths=} != {expected_num_values=}"
+ stride = n_splits
+ expected_indices = set.union(
+ *[set(range(start_index, expected_num_values, stride)) for start_index in range(n_splits)]
+ )
+ assert indices == expected_indices, f"{indices=} != {expected_indices=}"
+
+
+def get_slices_for_splits(
+ splits: list[str],
+ num_activation_records_per_split: int,
+) -> dict[str, slice]:
+ """
+ Get equal-sized interleaved subsets for each of a list of splits, given the number of elements
+ to include in each split.
+ """
+
+ stride = len(splits)
+ num_activation_records_for_even_splits = num_activation_records_per_split * stride
+ slices_by_split = {
+ split: slice(split_index, num_activation_records_for_even_splits, stride)
+ for split_index, split in enumerate(splits)
+ }
+ _check_slices(
+ slices_by_split=slices_by_split,
+ expected_num_values=num_activation_records_for_even_splits,
+ )
+ return slices_by_split
+
+
+@dataclass
+class ActivationRecordSliceParams:
+ """How to select splits (train, valid, etc.) of activation records."""
+
+ n_examples_per_split: Optional[int]
+ """The number of examples to include in each split. If None, NeuronRecord derives it from the record count."""
+
+
+@register_dataclass
+@dataclass
+class NeuronRecord(FastDataclass):
+ """Neuron-indexed activation data, including summary stats and notable activation records."""
+
+ neuron_id: NeuronId
+ """Identifier for the neuron."""
+
+ random_sample: list[ActivationRecord] = field(default_factory=list)
+ """
+ Random activation records for this neuron. The random sample is independent from those used for
+ other neurons.
+ """
+ random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None
+ """
+ Random samples of activation records in each of the specified quantiles. None if quantile
+ tracking is disabled.
+ """
+ quantile_boundaries: Optional[list[float]] = None
+ """Boundaries of the quantiles used to generate the random_sample_by_quantile field."""
+
+ # Moments of activations
+ mean: Optional[float] = math.nan
+ variance: Optional[float] = math.nan
+ skewness: Optional[float] = math.nan
+ kurtosis: Optional[float] = math.nan
+
+ most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)
+ """
+ Activation records with the most positive figure of merit value for this neuron over all dataset
+ examples.
+ """
+
+ @property
+ def max_activation(self) -> float:
+ """Return the maximum activation value over all top-activating activation records."""
+ return max([max(ar.activations) for ar in self.most_positive_activation_records])
+
+ def _get_top_activation_slices(
+ self, activation_record_slice_params: ActivationRecordSliceParams
+ ) -> dict[str, slice]:
+ splits = ["train", "calibration", "valid", "test"]
+ n_examples_per_split = activation_record_slice_params.n_examples_per_split
+ if n_examples_per_split is None:
+ n_examples_per_split = len(self.most_positive_activation_records) // len(splits)
+ assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits)
+ return get_slices_for_splits(splits, n_examples_per_split)
+
+ def _get_random_activation_slices(
+ self, activation_record_slice_params: ActivationRecordSliceParams
+ ) -> dict[str, slice]:
+ splits = ["calibration", "valid", "test"]
+ n_examples_per_split = activation_record_slice_params.n_examples_per_split
+ if n_examples_per_split is None:
+ n_examples_per_split = len(self.random_sample) // len(splits)
+ # NOTE: this assert could trigger on some old datasets with only 10 random samples, in which case you may have to remove "test" from the set of splits
+ assert len(self.random_sample) >= n_examples_per_split * len(splits)
+ return get_slices_for_splits(splits, n_examples_per_split)
+
+ def train_activation_records(
+ self,
+ activation_record_slice_params: ActivationRecordSliceParams,
+ ) -> list[ActivationRecord]:
+ """
+ Train split, typically used for generating explanations. Consists exclusively of
+ top-activating records since context window limitations make it difficult to include
+ random records.
+ """
+ return self.most_positive_activation_records[
+ self._get_top_activation_slices(activation_record_slice_params)["train"]
+ ]
+
+ def calibration_activation_records(
+ self,
+ activation_record_slice_params: ActivationRecordSliceParams,
+ ) -> list[ActivationRecord]:
+ """
+ Calibration split, typically used for calibrating neuron simulations. See
+ http://go/neuron_explanation_methodology for an explanation of calibration. Consists of
+ top-activating records and random records in a 1:1 ratio.
+ """
+ return (
+ self.most_positive_activation_records[
+ self._get_top_activation_slices(activation_record_slice_params)["calibration"]
+ ]
+ + self.random_sample[
+ self._get_random_activation_slices(activation_record_slice_params)["calibration"]
+ ]
+ )
+
+ def valid_activation_records(
+ self,
+ activation_record_slice_params: ActivationRecordSliceParams,
+ ) -> list[ActivationRecord]:
+ """
+ Validation split, typically used for evaluating explanations, either automatically with
+ simulation + correlation coefficient scoring, or manually by humans. Consists of
+ top-activating records and random records in a 1:1 ratio.
+ """
+ return (
+ self.most_positive_activation_records[
+ self._get_top_activation_slices(activation_record_slice_params)["valid"]
+ ]
+ + self.random_sample[
+ self._get_random_activation_slices(activation_record_slice_params)["valid"]
+ ]
+ )
+
+ def test_activation_records(
+ self,
+ activation_record_slice_params: ActivationRecordSliceParams,
+ ) -> list[ActivationRecord]:
+ """
+ Test split, typically used for explanation evaluations that can't use the validation split.
+ Consists of top-activating records and random records in a 1:1 ratio.
+ """
+ return (
+ self.most_positive_activation_records[
+ self._get_top_activation_slices(activation_record_slice_params)["test"]
+ ]
+ + self.random_sample[
+ self._get_random_activation_slices(activation_record_slice_params)["test"]
+ ]
+ )
+
+
+def neuron_exists(
+ dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
+) -> bool:
+ """Return whether <dataset_path>/neurons/<layer_index>/<neuron_index>.json exists."""
+ file = bf.join(dataset_path, "neurons", str(layer_index), f"{neuron_index}.json")
+ return bf.exists(file)
+
+
+def load_neuron(
+ layer_index: Union[str, int],
+ neuron_index: Union[str, int],
+ dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations",
+) -> NeuronRecord:
+ """Load the NeuronRecord for the specified neuron."""
+ url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"])
+ url = standardize_azure_url(url)
+ with urllib.request.urlopen(url) as f:
+ neuron_record = loads(f.read())
+ if not isinstance(neuron_record, NeuronRecord):
+ raise ValueError(
+ f"Stored data incompatible with current version of NeuronRecord dataclass."
+ )
+ return neuron_record
+
+
+@bbb.ensure_session
+async def load_neuron_async(
+ layer_index: Union[str, int],
+ neuron_index: Union[str, int],
+ dataset_path: str = "az://openaipublic/neuron-explainer/data/collated-activations",
+) -> NeuronRecord:
+ """Async version of load_neuron."""
+ file = bf.join(dataset_path, str(layer_index), f"{neuron_index}.json")
+ return await read_neuron_file(file)
+
+
+@bbb.ensure_session
+async def read_neuron_file(neuron_filename: str) -> NeuronRecord:
+ """Like load_neuron_async, but takes a raw neuron filename."""
+ raw_contents = await bbb.read.read_single(neuron_filename)
+ neuron_record = loads(raw_contents.decode("utf-8"))
+ if not isinstance(neuron_record, NeuronRecord):
+ raise ValueError(
+ f"Stored data incompatible with current version of NeuronRecord dataclass."
+ )
+ return neuron_record
+
+
+def get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:
+ """Returns the indices of all neurons in this layer, in ascending order."""
+ layer_dir = bf.join(dataset_path, "neurons", str(layer_index))
+ return sorted(
+ [int(f.split(".")[0]) for f in bf.listdir(layer_dir) if f.split(".")[0].isnumeric()]
+ )
+
+
+def get_sorted_layers(dataset_path: str) -> List[str]:
+ """
+ Return the indices of all layers in this dataset, in ascending numerical order, as strings.
+ """
+ return [
+ str(x)
+ for x in sorted(
+ [int(x) for x in bf.listdir(bf.join(dataset_path, "neurons")) if x.isnumeric()]
+ )
+ ]
diff --git a/docs/src/neuron-explainer/neuron_explainer/activations/token_connections.py b/docs/src/neuron-explainer/neuron_explainer/activations/token_connections.py
new file mode 100644
index 0000000..821fd05
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/activations/token_connections.py
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import blobfile as bf
+from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
+from neuron_explainer.azure import standardize_azure_url
+import urllib.request
+
+
+@register_dataclass
+@dataclass
+class TokensAndWeights(FastDataclass):
+ tokens: List[str]
+ strengths: List[float]
+
+
+@register_dataclass
+@dataclass
+class WeightBasedSummaryOfNeuron(FastDataclass):
+ input_positive: TokensAndWeights
+ input_negative: TokensAndWeights
+ output_positive: TokensAndWeights
+ output_negative: TokensAndWeights
+
+
+def load_token_weight_connections_of_neuron(
+ layer_index: Union[str, int],
+ neuron_index: Union[str, int],
+ dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based",
+) -> WeightBasedSummaryOfNeuron:
+ """Load the WeightBasedSummaryOfNeuron for the specified neuron."""
+ url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"])
+ url = standardize_azure_url(url)
+ with urllib.request.urlopen(url) as f:
+ return loads(f.read(), backwards_compatible=False)
+
+
+@register_dataclass
+@dataclass
+class TokenLookupTableSummaryOfNeuron(FastDataclass):
+ """List of tokens and the average activations of a given neuron in response to each
+ respective token. These are selected from among the tokens in the vocabulary with the
+ highest average activations across an internet text dataset, with the highest activations
+ first."""
+
+ tokens: List[str]
+ average_activations: List[float]
+
+
+def load_token_lookup_table_connections_of_neuron(
+ layer_index: Union[str, int],
+ neuron_index: Union[str, int],
+ dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based",
+) -> TokenLookupTableSummaryOfNeuron:
+ """Load the TokenLookupTableSummaryOfNeuron for the specified neuron."""
+ url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"])
+ url = standardize_azure_url(url)
+ with urllib.request.urlopen(url) as f:
+ return loads(f.read(), backwards_compatible=False)
diff --git a/docs/src/neuron-explainer/neuron_explainer/api_client.py b/docs/src/neuron-explainer/neuron_explainer/api_client.py
new file mode 100644
index 0000000..46b5e96
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/api_client.py
@@ -0,0 +1,152 @@
+import asyncio
+import contextlib
+import os
+import random
+import traceback
+from asyncio import Semaphore
+from functools import wraps
+from typing import Any, Callable, Optional
+
+import httpx
+import orjson
+
+
+def is_api_error(err: Exception) -> bool:
+ if isinstance(err, httpx.HTTPStatusError):
+ response = err.response
+ error_data = response.json().get("error", {})
+ error_message = error_data.get("message")
+ if response.status_code in [400, 404, 415]:
+ if error_data.get("type") == "idempotency_error":
+ print(f"Retrying after idempotency error: {error_message} ({response.url})")
+ return True
+ else:
+ # Invalid request
+ return False
+ else:
+ print(f"Retrying after API error: {error_message} ({response.url})")
+ return True
+
+ elif isinstance(err, httpx.ConnectError):
+ print(f"Retrying after connection error... ({err.request.url})")
+ return True
+
+ elif isinstance(err, httpx.TimeoutException):
+ print(f"Retrying after a timeout error... ({err.request.url})")
+ return True
+
+ elif isinstance(err, httpx.ReadError):
+ print(f"Retrying after a read error... ({err.request.url})")
+ return True
+
+ print(f"Retrying after an unexpected error: {repr(err)}")
+ traceback.print_tb(err.__traceback__)
+ return True
+
+
+def exponential_backoff(
+ retry_on: Callable[[Exception], bool] = lambda err: True
+) -> Callable[[Callable], Callable]:
+ """
+ Returns a decorator which retries the wrapped coroutine function as long as the specified
+ retry_on function returns True for the exception, applying exponential backoff with jitter
+ after failures, up to a retry limit. The last exception is re-raised once retries run out.
+ """
+ init_delay_s = 1.0
+ max_delay_s = 10.0
+ # With delays capped at max_delay_s, this is roughly 30 minutes of waiting before we give up.
+ max_tries = 200
+ backoff_multiplier = 2.0
+ jitter = 0.2
+
+ def decorate(f: Callable) -> Callable:
+ assert asyncio.iscoroutinefunction(f)
+
+ @wraps(f)
+ async def f_retry(*args: Any, **kwargs: Any) -> Any:
+ delay_s = init_delay_s
+ for i in range(max_tries):
+ try:
+ return await f(*args, **kwargs)
+ except Exception as err:
+ if not retry_on(err) or i == max_tries - 1:
+ raise
+ jittered_delay = random.uniform(delay_s * (1 - jitter), delay_s * (1 + jitter))
+ await asyncio.sleep(jittered_delay)
+ delay_s = min(delay_s * backoff_multiplier, max_delay_s)
+
+ return f_retry
+
+ return decorate
+
+
+API_KEY = os.getenv("OPENAI_API_KEY")
+assert API_KEY, "Please set the OPENAI_API_KEY environment variable"
+API_HTTP_HEADERS = {
+ "Content-Type": "application/json",
+ "Authorization": "Bearer " + API_KEY,
+}
+BASE_API_URL = "https://api.openai.com/v1"
+
+
+class ApiClient:
+ """Performs inference using the OpenAI API. Supports response caching and concurrency limits."""
+
+ def __init__(
+ self,
+ model_name: str,
+ # If set, no more than this number of HTTP requests will be made concurrently.
+ max_concurrent: Optional[int] = None,
+ # Whether to cache request/response pairs in memory to avoid duplicating requests.
+ cache: bool = False,
+ ):
+ self.model_name = model_name
+
+ if max_concurrent is not None:
+ self._concurrency_check: Optional[Semaphore] = Semaphore(max_concurrent)
+ else:
+ self._concurrency_check = None
+
+ if cache:
+ self._cache: Optional[dict[bytes, Any]] = {}
+ else:
+ self._cache = None
+
+ @exponential_backoff(retry_on=is_api_error)
+ async def make_request(
+ self, timeout_seconds: Optional[int] = None, **kwargs: Any
+ ) -> dict[str, Any]:
+ if self._cache is not None:
+ key = orjson.dumps(kwargs)
+ if key in self._cache:
+ return self._cache[key]
+ async with contextlib.AsyncExitStack() as stack:
+ if self._concurrency_check is not None:
+ await stack.enter_async_context(self._concurrency_check)
+ http_client = await stack.enter_async_context(
+ httpx.AsyncClient(timeout=timeout_seconds)
+ )
+ # If the request has a "messages" key, it should be sent to the /chat/completions
+ # endpoint. Otherwise, it should be sent to the /completions endpoint.
+ url = BASE_API_URL + ("/chat/completions" if "messages" in kwargs else "/completions")
+ kwargs["model"] = self.model_name
+ response = await http_client.post(url, headers=API_HTTP_HEADERS, json=kwargs)
+ # The exception doesn't include the response json, so print it before re-raising. NOTE(review):
+ # response.json() may itself raise on a non-JSON body and mask the HTTP error; confirm.
+ try:
+ response.raise_for_status()
+ except Exception as e:
+ print(response.json())
+ raise e
+ if self._cache is not None:
+ self._cache[key] = response.json()
+ return response.json()
+
+
+if __name__ == "__main__":
+
+ async def main() -> None:
+ client = ApiClient(model_name="gpt-3.5-turbo", max_concurrent=1)
+ print(await client.make_request(prompt="Why did the chicken cross the road?", max_tokens=9))
+
+ asyncio.run(main())
diff --git a/docs/src/neuron-explainer/neuron_explainer/azure.py b/docs/src/neuron-explainer/neuron_explainer/azure.py
new file mode 100644
index 0000000..a38ab76
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/azure.py
@@ -0,0 +1,5 @@
+def standardize_azure_url(url: str) -> str:
+ """Rewrite an az://openaipublic/ path as an https blob URL; return any other url unchanged."""
+ if url.startswith("az://openaipublic/"):
+ url = url.replace("az://openaipublic/", "https://openaipublic.blob.core.windows.net/")
+ return url
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py b/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py
new file mode 100644
index 0000000..32cfec6
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py
@@ -0,0 +1,194 @@
+"""
+Code for calibrating simulations of neuron behavior. Calibration refers to a process of mapping from
+a space of predicted activation values (e.g. [0, 10]) to the real activation distribution for a
+neuron.
+
+See http://go/neuron_explanation_methodology for a description of the calibration step. Calibration
+is necessary for simulating neurons in the context of ablate-to-simulation, but can be skipped when
+using correlation scoring. (Calibration may still improve quality for scoring, at least for
+non-linear calibration methods.)
+"""
+
+from __future__ import annotations
+
+import asyncio
+from abc import abstractmethod
+from typing import Optional, Sequence
+
+import numpy as np
+from neuron_explainer.activations.activations import ActivationRecord
+from neuron_explainer.explanations.explanations import ActivationScale
+from neuron_explainer.explanations.simulator import NeuronSimulator, SequenceSimulation
+from sklearn import linear_model
+
+
+class CalibratedNeuronSimulator(NeuronSimulator):
+ """
+ Wrap a NeuronSimulator and calibrate it to map from the predicted activation space to the
+ actual neuron activation space.
+ """
+
+ def __init__(self, uncalibrated_simulator: NeuronSimulator):
+ self.uncalibrated_simulator = uncalibrated_simulator
+
+ @classmethod
+ async def create(
+ cls,
+ uncalibrated_simulator: NeuronSimulator,
+ calibration_activation_records: Sequence[ActivationRecord],
+ ) -> CalibratedNeuronSimulator:
+ """
+ Create and calibrate a calibrated simulator (so initialization and calibration can be done
+ in one call).
+ """
+ calibrated_simulator = cls(uncalibrated_simulator)
+ await calibrated_simulator.calibrate(calibration_activation_records)
+ return calibrated_simulator
+
+ async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
+ """
+ Determine parameters to map from the predicted activation space to the real neuron
+ activation space, based on a calibration set.
+
+ Use when simulated sequences haven't already been produced on the calibration set.
+ """
+ simulations = await asyncio.gather(
+ *[
+ self.uncalibrated_simulator.simulate(activations.tokens)
+ for activations in calibration_activation_records
+ ]
+ )
+ self.calibrate_from_simulations(calibration_activation_records, simulations)
+
+ def calibrate_from_simulations(
+ self,
+ calibration_activation_records: Sequence[ActivationRecord],
+ simulations: Sequence[SequenceSimulation],
+ ) -> None:
+ """
+ Determine parameters to map from the predicted activation space to the real neuron
+ activation space, based on a calibration set.
+
+ Use when simulated sequences have already been produced on the calibration set.
+ """
+ flattened_activations = []
+ flattened_simulated_activations: list[float] = []
+ for activations, simulation in zip(calibration_activation_records, simulations):
+ flattened_activations.extend(activations.activations)
+ flattened_simulated_activations.extend(simulation.expected_activations)
+ self._calibrate_from_flattened_activations(
+ np.array(flattened_activations), np.array(flattened_simulated_activations)
+ )
+
+ @abstractmethod
+ def _calibrate_from_flattened_activations(
+ self,
+ true_activations: np.ndarray,
+ uncalibrated_activations: np.ndarray,
+ ) -> None:
+ """
+ Determine parameters to map from the predicted activation space to the real neuron
+ activation space, based on a calibration set.
+
+ Take numpy arrays of all true activations and all uncalibrated activations on the
+ calibration set over all sequences.
+ """
+
+ @abstractmethod
+ def apply_calibration(self, values: Sequence[float]) -> list[float]:
+ """Apply the learned calibration to a sequence of values."""
+
+ async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:
+ uncalibrated_seq_simulation = await self.uncalibrated_simulator.simulate(tokens)
+ calibrated_activations = self.apply_calibration(
+ uncalibrated_seq_simulation.expected_activations
+ )
+ calibrated_distribution_values = [
+ self.apply_calibration(dv) for dv in uncalibrated_seq_simulation.distribution_values
+ ]
+ return SequenceSimulation(
+ tokens=uncalibrated_seq_simulation.tokens,
+ expected_activations=calibrated_activations,
+ activation_scale=ActivationScale.NEURON_ACTIVATIONS,
+ distribution_values=calibrated_distribution_values,
+ distribution_probabilities=uncalibrated_seq_simulation.distribution_probabilities,
+ uncalibrated_simulation=uncalibrated_seq_simulation,
+ )
+
+
+class UncalibratedNeuronSimulator(CalibratedNeuronSimulator):
+ """Pass through the activations without trying to calibrate."""
+
+ def __init__(self, uncalibrated_simulator: NeuronSimulator):
+ super().__init__(uncalibrated_simulator)
+
+ async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
+ pass
+
+ def _calibrate_from_flattened_activations(
+ self,
+ true_activations: np.ndarray,
+ uncalibrated_activations: np.ndarray,
+ ) -> None:
+ pass
+
+ def apply_calibration(self, values: Sequence[float]) -> list[float]:
+ return values if isinstance(values, list) else list(values)
+
+
+class LinearCalibratedNeuronSimulator(CalibratedNeuronSimulator):
+ """Find a linear mapping from uncalibrated activations to true activations.
+
+ Should not change ev_correlation_score (except its sign, if the fitted slope is negative): correlation is invariant to positive-slope linear maps.
+ """
+
+ def __init__(self, uncalibrated_simulator: NeuronSimulator):
+ super().__init__(uncalibrated_simulator)
+ self._regression: Optional[linear_model.LinearRegression] = None
+
+ def _calibrate_from_flattened_activations(
+ self,
+ true_activations: np.ndarray,
+ uncalibrated_activations: np.ndarray,
+ ) -> None:
+ self._regression = linear_model.LinearRegression()
+ self._regression.fit(uncalibrated_activations.reshape(-1, 1), true_activations)
+
+ def apply_calibration(self, values: Sequence[float]) -> list[float]:
+ if self._regression is None:
+ raise ValueError("Must call calibrate() before apply_calibration")
+ if len(values) == 0:
+ return []
+ return self._regression.predict(np.reshape(np.array(values), (-1, 1))).tolist()
+
+
+class PercentileMatchingCalibratedNeuronSimulator(CalibratedNeuronSimulator):
+ """
+ Map the nth percentile of the uncalibrated activations to the nth percentile of the true
+ activations for all n.
+
+ This will match the distribution of true activations on the calibration set, but will be
+ overconfident outside of the calibration set.
+ """
+
+ def __init__(self, uncalibrated_simulator: NeuronSimulator):
+ super().__init__(uncalibrated_simulator)
+ self._uncalibrated_activations: Optional[np.ndarray] = None
+ self._true_activations: Optional[np.ndarray] = None
+
+ def _calibrate_from_flattened_activations(
+ self,
+ true_activations: np.ndarray,
+ uncalibrated_activations: np.ndarray,
+ ) -> None:
+ self._uncalibrated_activations = np.sort(uncalibrated_activations)
+ self._true_activations = np.sort(true_activations)
+
+ def apply_calibration(self, values: Sequence[float]) -> list[float]:
+ if self._true_activations is None or self._uncalibrated_activations is None:
+ raise ValueError("Must call calibrate() before apply_calibration")
+ if len(values) == 0:
+ return []
+ return np.interp(
+ np.array(values), self._uncalibrated_activations, self._true_activations
+ ).tolist()
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py b/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py
new file mode 100644
index 0000000..c60e5b1
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/explainer.py
@@ -0,0 +1,472 @@
+"""Uses API calls to generate explanations of neuron behavior."""
+
+from __future__ import annotations
+
+import logging
+import re
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any, Optional, Sequence, Union
+
+from neuron_explainer.activations.activation_records import (
+ calculate_max_activation,
+ format_activation_records,
+ non_zero_activation_proportion,
+)
+from neuron_explainer.activations.activations import ActivationRecord
+from neuron_explainer.api_client import ApiClient
+from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
+from neuron_explainer.explanations.prompt_builder import (
+ HarmonyMessage,
+ PromptBuilder,
+ PromptFormat,
+ Role,
+)
+from neuron_explainer.explanations.token_space_few_shot_examples import (
+ TokenSpaceFewShotExampleSet,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# TODO(williamrs): This prefix may not work well for some things, like predicting the next token.
+# Try other options like "this neuron activates for".
+EXPLANATION_PREFIX = "the main thing this neuron does is find"
+
+
+def _split_numbered_list(text: str) -> list[str]:
+ """Split a numbered list into a list of strings."""
+ lines = re.split(r"\n\d+\.", text)
+ # Strip the leading whitespace from each line.
+ return [line.lstrip() for line in lines]
+
+
+def _remove_final_period(text: str) -> str:
+ """Strip a final period or period-space from a string."""
+ if text.endswith("."):
+ return text[:-1]
+ elif text.endswith(". "):
+ return text[:-2]
+ return text
+
+
+class ContextSize(int, Enum):
+ TWO_K = 2049
+ FOUR_K = 4097
+
+ @classmethod
+ def from_int(cls, i: int) -> ContextSize:
+ for context_size in cls:
+ if context_size.value == i:
+ return context_size
+ raise ValueError(f"{i} is not a valid ContextSize")
+
+
+HARMONY_V4_MODELS = ["gpt-3.5-turbo", "gpt-4"]
+
+
+class NeuronExplainer(ABC):
+ """
+ Abstract base class for Explainer classes that generate explanations from subclass-specific
+ input data.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
+ # This parameter lets us adjust the length of the prompt when we're generating explanations
+ # using older models with shorter context windows. In the future we can use it to experiment
+ # with longer context windows.
+ context_size: ContextSize = ContextSize.FOUR_K,
+ max_concurrent: Optional[int] = 10,
+ cache: bool = False,
+ ):
+ if prompt_format == PromptFormat.HARMONY_V4:
+ assert model_name in HARMONY_V4_MODELS
+ elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
+ assert model_name not in HARMONY_V4_MODELS
+ else:
+ raise ValueError(f"Unhandled prompt format {prompt_format}")
+
+ self.model_name = model_name
+ self.prompt_format = prompt_format
+ self.context_size = context_size
+ self.client = ApiClient(model_name=model_name, max_concurrent=max_concurrent, cache=cache)
+
+ async def generate_explanations(
+ self,
+ *,
+ num_samples: int = 5,
+ max_tokens: int = 60,
+ temperature: float = 1.0,
+ top_p: float = 1.0,
+ **prompt_kwargs: Any,
+ ) -> list[Any]:
+ """Generate explanations based on subclass-specific input data."""
+ prompt = self.make_explanation_prompt(max_tokens_for_completion=max_tokens, **prompt_kwargs)
+
+ generate_kwargs: dict[str, Any] = {
+ "n": num_samples,
+ "max_tokens": max_tokens,
+ "temperature": temperature,
+ "top_p": top_p,
+ }
+
+ if self.prompt_format == PromptFormat.HARMONY_V4:
+ assert isinstance(prompt, list)
+ assert isinstance(prompt[0], dict) # Really a HarmonyMessage
+ generate_kwargs["messages"] = prompt
+ else:
+ assert isinstance(prompt, str)
+ generate_kwargs["prompt"] = prompt
+
+ response = await self.client.make_request(**generate_kwargs)
+ logger.debug("response in generate_explanations is %s", response)
+
+ if self.prompt_format == PromptFormat.HARMONY_V4:
+ explanations = [x["message"]["content"] for x in response["choices"]]
+ elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
+ explanations = [x["text"] for x in response["choices"]]
+ else:
+ raise ValueError(f"Unhandled prompt format {self.prompt_format}")
+
+ return self.postprocess_explanations(explanations, prompt_kwargs)
+
+ @abstractmethod
+ def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
+ """
+ Create a prompt to send to the API to generate one or more explanations.
+
+ A prompt can be a simple string, or a list of HarmonyMessages, depending on the PromptFormat
+ used by this instance.
+ """
+ ...
+
+ def postprocess_explanations(
+ self, completions: list[str], prompt_kwargs: dict[str, Any]
+ ) -> list[Any]:
+ """Postprocess the completions returned by the API into a list of explanations."""
+ return completions # no-op by default
+
+ def _prompt_is_too_long(
+ self, prompt_builder: PromptBuilder, max_tokens_for_completion: int
+ ) -> bool:
+ # We'll get a context size error if the prompt itself plus the maximum number of tokens for
+ # the completion is longer than the context size.
+ prompt_length = prompt_builder.prompt_length_in_tokens(self.prompt_format)
+ if prompt_length + max_tokens_for_completion > self.context_size.value:
+ print(
+ f"Prompt is too long: {prompt_length} + {max_tokens_for_completion} > "
+ f"{self.context_size.value}"
+ )
+ return True
+ return False
+
+
+class TokenActivationPairExplainer(NeuronExplainer):
+ """
+ Generate explanations of neuron behavior using a prompt with lists of token/activation pairs.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
+ # This parameter lets us adjust the length of the prompt when we're generating explanations
+ # using older models with shorter context windows. In the future we can use it to experiment
+ # with 8k+ context windows.
+ context_size: ContextSize = ContextSize.FOUR_K,
+ few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,
+ repeat_non_zero_activations: bool = True,
+ max_concurrent: Optional[int] = 10,
+ cache: bool = False,
+ ):
+ super().__init__(
+ model_name=model_name,
+ prompt_format=prompt_format,
+ max_concurrent=max_concurrent,
+ cache=cache,
+ )
+ self.context_size = context_size
+ self.few_shot_example_set = few_shot_example_set
+ self.repeat_non_zero_activations = repeat_non_zero_activations
+
+ def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
+ original_kwargs = kwargs.copy()
+ all_activation_records: Sequence[ActivationRecord] = kwargs.pop("all_activation_records")
+ max_activation: float = kwargs.pop("max_activation")
+ kwargs.setdefault("numbered_list_of_n_explanations", None)
+ numbered_list_of_n_explanations: Optional[int] = kwargs.pop(
+ "numbered_list_of_n_explanations"
+ )
+ if numbered_list_of_n_explanations is not None:
+ assert numbered_list_of_n_explanations > 0, numbered_list_of_n_explanations
+ # This parameter lets us dynamically shrink the prompt if our initial attempt to create it
+ # results in something that's too long. It's only implemented for the 4k context size.
+ kwargs.setdefault("omit_n_activation_records", 0)
+ omit_n_activation_records: int = kwargs.pop("omit_n_activation_records")
+ max_tokens_for_completion: int = kwargs.pop("max_tokens_for_completion")
+ assert not kwargs, f"Unexpected kwargs: {kwargs}"
+
+ prompt_builder = PromptBuilder()
+ prompt_builder.add_message(
+ Role.SYSTEM,
+ "We're studying neurons in a neural network. Each neuron looks for some particular "
+ "thing in a short document. Look at the parts of the document the neuron activates for "
+ "and summarize in a single sentence what the neuron is looking for. Don't list "
+ "examples of words.\n\nThe activation format is tokenactivation. Activation "
+ "values range from 0 to 10. A neuron finding what it's looking for is represented by a "
+ "non-zero activation value. The higher the activation value, the stronger the match.",
+ )
+ few_shot_examples = self.few_shot_example_set.get_examples()
+ num_omitted_activation_records = 0
+ for i, few_shot_example in enumerate(few_shot_examples):
+ few_shot_activation_records = few_shot_example.activation_records
+ if self.context_size == ContextSize.TWO_K:
+ # If we're using a 2k context window, we only have room for one activation record
+ # per few-shot example. (Two few-shot examples with one activation record each seems
+ # to work better than one few-shot example with two activation records, in local
+ # testing.)
+ few_shot_activation_records = few_shot_activation_records[:1]
+ elif (
+ self.context_size == ContextSize.FOUR_K
+ and num_omitted_activation_records < omit_n_activation_records
+ ):
+ # Drop the last activation record for this few-shot example to save tokens, assuming
+ # there are at least two activation records.
+ if len(few_shot_activation_records) > 1:
+ print(f"Warning: omitting activation record from few-shot example {i}")
+ few_shot_activation_records = few_shot_activation_records[:-1]
+ num_omitted_activation_records += 1
+ self._add_per_neuron_explanation_prompt(
+ prompt_builder,
+ few_shot_activation_records,
+ i,
+ calculate_max_activation(few_shot_example.activation_records),
+ numbered_list_of_n_explanations=numbered_list_of_n_explanations,
+ explanation=few_shot_example.explanation,
+ )
+ self._add_per_neuron_explanation_prompt(
+ prompt_builder,
+ # If we're using a 2k context window, we only have room for two of the activation
+ # records.
+ all_activation_records[:2]
+ if self.context_size == ContextSize.TWO_K
+ else all_activation_records,
+ len(few_shot_examples),
+ max_activation,
+ numbered_list_of_n_explanations=numbered_list_of_n_explanations,
+ explanation=None,
+ )
+ # If the prompt is too long *and* we omitted the specified number of activation records, try
+ # again, omitting one more. (If we didn't make the specified number of omissions, we're out
+ # of opportunities to omit records, so we just return the prompt as-is.)
+ if (
+ self._prompt_is_too_long(prompt_builder, max_tokens_for_completion)
+ and num_omitted_activation_records == omit_n_activation_records
+ ):
+ original_kwargs["omit_n_activation_records"] = omit_n_activation_records + 1
+ return self.make_explanation_prompt(**original_kwargs)
+ return prompt_builder.build(self.prompt_format)
+
+ def _add_per_neuron_explanation_prompt(
+ self,
+ prompt_builder: PromptBuilder,
+ activation_records: Sequence[ActivationRecord],
+ index: int,
+ max_activation: float,
+ # When set, this indicates that the prompt should solicit a numbered list of the given
+ # number of explanations, rather than a single explanation.
+ numbered_list_of_n_explanations: Optional[int],
+ explanation: Optional[str], # None means this is the end of the full prompt.
+ ) -> None:
+ max_activation = calculate_max_activation(activation_records)
+ user_message = f"""
+
+Neuron {index + 1}
+Activations:{format_activation_records(activation_records, max_activation, omit_zeros=False)}"""
+ # We repeat the non-zero activations only if it was requested and if the proportion of
+ # non-zero activations isn't too high.
+ if (
+ self.repeat_non_zero_activations
+ and non_zero_activation_proportion(activation_records, max_activation) < 0.2
+ ):
+ user_message += (
+ f"\nSame activations, but with all zeros filtered out:"
+ f"{format_activation_records(activation_records, max_activation, omit_zeros=True)}"
+ )
+
+ if numbered_list_of_n_explanations is None:
+ user_message += f"\nExplanation of neuron {index + 1} behavior:"
+ assistant_message = ""
+ # For the IF format, we want <|endofprompt|> to come before the explanation prefix.
+ if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
+ assistant_message += f" {EXPLANATION_PREFIX}"
+ else:
+ user_message += f" {EXPLANATION_PREFIX}"
+ prompt_builder.add_message(Role.USER, user_message)
+
+ if explanation is not None:
+ assistant_message += f" {explanation}."
+ if assistant_message:
+ prompt_builder.add_message(Role.ASSISTANT, assistant_message)
+ else:
+ if explanation is None:
+ # For the final neuron, we solicit a numbered list of explanations.
+ prompt_builder.add_message(
+ Role.USER,
+ f"""\nHere are {numbered_list_of_n_explanations} possible explanations for neuron {index + 1} behavior, each beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}""",
+ )
+ else:
+ # For the few-shot examples, we only present one explanation, but we present it as a
+ # numbered list.
+ prompt_builder.add_message(
+ Role.USER,
+ f"""\nHere is 1 possible explanation for neuron {index + 1} behavior, beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}""",
+ )
+ prompt_builder.add_message(Role.ASSISTANT, f" {explanation}.")
+
+ def postprocess_explanations(
+ self, completions: list[str], prompt_kwargs: dict[str, Any]
+ ) -> list[Any]:
+ """Postprocess the explanations returned by the API"""
+ numbered_list_of_n_explanations = prompt_kwargs.get("numbered_list_of_n_explanations")
+ if numbered_list_of_n_explanations is None:
+ return completions
+ else:
+ all_explanations = []
+ for completion in completions:
+ for explanation in _split_numbered_list(completion):
+ if explanation.startswith(EXPLANATION_PREFIX):
+ explanation = explanation[len(EXPLANATION_PREFIX) :]
+ all_explanations.append(explanation.strip())
+ return all_explanations
+
+
+class TokenSpaceRepresentationExplainer(NeuronExplainer):
+ """
+ Generate explanations of arbitrary lists of tokens which disproportionately activate a
+ particular neuron. These lists of tokens can be generated in various ways. As an example, in one
+ set of experiments, we compute the average activation for each neuron conditional on each token
+ that appears in an internet text corpus. We then sort the tokens by their average activation,
+ and show 50 of the top 100 tokens. Other techniques that could be used include taking the top
+ tokens in the logit lens or tuned lens representations of a neuron.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
+ context_size: ContextSize = ContextSize.FOUR_K,
+ few_shot_example_set: TokenSpaceFewShotExampleSet = TokenSpaceFewShotExampleSet.ORIGINAL,
+ use_few_shot: bool = False,
+ output_numbered_list: bool = False,
+ max_concurrent: Optional[int] = 10,
+ cache: bool = False,
+ ):
+ super().__init__(
+ model_name=model_name,
+ prompt_format=prompt_format,
+ context_size=context_size,
+ max_concurrent=max_concurrent,
+ cache=cache,
+ )
+ self.use_few_shot = use_few_shot
+ self.output_numbered_list = output_numbered_list
+ if self.use_few_shot:
+ assert few_shot_example_set is not None
+ self.few_shot_examples: Optional[TokenSpaceFewShotExampleSet] = few_shot_example_set
+ else:
+ self.few_shot_examples = None
+ self.prompt_prefix = (
+ "We're studying neurons in a neural network. Each neuron looks for some particular "
+ "kind of token (which can be a word, or part of a word). Look at the tokens the neuron "
+ "activates for (listed below) and summarize in a single sentence what the neuron is "
+ "looking for. Don't list examples of words."
+ )
+
+ def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
+ tokens: list[str] = kwargs.pop("tokens")
+ max_tokens_for_completion = kwargs.pop("max_tokens_for_completion")
+ assert not kwargs, f"Unexpected kwargs: {kwargs}"
+ # Note that this does not preserve the precise tokens, as e.g.
+ # f" {token_with_no_leading_space}" may be tokenized as f"{token_with_leading_space}".
+ # TODO(dan): Try out other variants, including "\n".join(...) and ",".join(...)
+ stringified_tokens = ", ".join([f"'{t}'" for t in tokens])
+
+ prompt_builder = PromptBuilder()
+ prompt_builder.add_message(Role.SYSTEM, self.prompt_prefix)
+ if self.use_few_shot:
+ self._add_few_shot_examples(prompt_builder)
+ self._add_neuron_specific_prompt(prompt_builder, stringified_tokens, explanation=None)
+
+ if self._prompt_is_too_long(prompt_builder, max_tokens_for_completion):
+ raise ValueError(f"Prompt too long: {prompt_builder.build(self.prompt_format)}")
+ else:
+ return prompt_builder.build(self.prompt_format)
+
+ def _add_few_shot_examples(self, prompt_builder: PromptBuilder) -> None:
+ """
+ Append few-shot examples to the prompt. Each one consists of a comma-delimited list of
+ tokens and corresponding explanations, as saved in
+ neuron_explainer/explanations/token_space_few_shot_examples.py.
+ """
+ assert self.few_shot_examples is not None
+ few_shot_example_list = self.few_shot_examples.get_examples()
+ if self.output_numbered_list:
+ raise NotImplementedError("Numbered list output not supported for few-shot examples")
+ else:
+ for few_shot_example in few_shot_example_list:
+ self._add_neuron_specific_prompt(
+ prompt_builder,
+ ", ".join([f"'{t}'" for t in few_shot_example.tokens]),
+ explanation=few_shot_example.explanation,
+ )
+
+ def _add_neuron_specific_prompt(
+ self,
+ prompt_builder: PromptBuilder,
+ stringified_tokens: str,
+ explanation: Optional[str],
+ ) -> None:
+ """
+ Append a neuron-specific prompt to the prompt builder. The prompt consists of a list of
+ tokens followed by either an explanation (if one is passed, for few shot examples) or by
+ the beginning of a completion, to be completed by the model with an explanation.
+ """
+ user_message = f"\n\n\n\nTokens:\n{stringified_tokens}\n\nExplanation:\n"
+ assistant_message = ""
+ looking_for = "This neuron is looking for"
+ if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
+ # We want <|endofprompt|> to come before "This neuron is looking for" in the IF format.
+ assistant_message += looking_for
+ else:
+ user_message += looking_for
+ if self.output_numbered_list:
+ start_of_list = "\n1."
+ if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
+ assistant_message += start_of_list
+ else:
+ user_message += start_of_list
+ if explanation is not None:
+ assistant_message += f"{explanation}."
+ prompt_builder.add_message(Role.USER, user_message)
+ if assistant_message:
+ prompt_builder.add_message(Role.ASSISTANT, assistant_message)
+
+ def postprocess_explanations(
+ self, completions: list[str], prompt_kwargs: dict[str, Any]
+ ) -> list[str]:
+ if self.output_numbered_list:
+ # Each completion may hold several numbered explanations; all of them are flattened into one list.
+ all_explanations = []
+ for completion in completions:
+ for explanation in _split_numbered_list(completion):
+ if explanation.startswith(EXPLANATION_PREFIX):
+ explanation = explanation[len(EXPLANATION_PREFIX) :]
+ all_explanations.append(explanation.strip())
+ return all_explanations
+ else:
+ # Each element in the top-level list will be an explanation as a string.
+ return [_remove_final_period(explanation) for explanation in completions]
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py b/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py
new file mode 100644
index 0000000..70daea2
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/explanations.py
@@ -0,0 +1,230 @@
+# Dataclasses and enums for storing neuron explanations, their scores, and related data. Also,
+# related helper functions.
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional, Union
+
+import blobfile as bf
+import boostedblob as bbb
+from neuron_explainer.activations.activations import NeuronId
+from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
+
+
+class ActivationScale(str, Enum):
+ """Which "units" are stored in the expected_activations/distribution_values fields of a
+ SequenceSimulation.
+
+ This enum identifies whether the values represent real activations of the neuron or something
+ else. Different scales are not necessarily related by a linear transformation.
+ """
+
+ NEURON_ACTIVATIONS = "neuron_activations"
+ """Values represent real activations of the neuron."""
+ SIMULATED_NORMALIZED_ACTIVATIONS = "simulated_normalized_activations"
+ """
+ Values represent simulated activations of the neuron, normalized to the range [0, 10]. This
+ scale is arbitrary and should not be interpreted as a neuron activation.
+ """
+
+
+@register_dataclass
+@dataclass
+class SequenceSimulation(FastDataclass):
+ """The result of a simulation of neuron activations on one text sequence."""
+
+ tokens: list[str]
+ """The sequence of tokens that was simulated."""
+ expected_activations: list[float]
+ """Expected value of the possibly-normalized activation for each token in the sequence."""
+ activation_scale: ActivationScale
+ """What scale is used for values in the expected_activations field."""
+ distribution_values: list[list[float]]
+ """
+ For each token in the sequence, a list of values from the discrete distribution of activations
+ produced from simulation. Tokens will be included here if and only if they are in the top K=15
+ tokens predicted by the simulator, and excluded otherwise.
+
+ May be transformed to another unit by calibration. When we simulate a neuron, we produce a
+ discrete distribution with values in the arbitrary discretized space of the neuron, e.g. 10%
+ chance of 0, 70% chance of 1, 20% chance of 2, which we store as distribution_values =
+ [0, 1, 2], distribution_probabilities = [0.1, 0.7, 0.2]. When we transform the distribution to
+ the real activation units, we can correspondingly transform the values of this distribution
+ to get a distribution in the units of the neuron. E.g. if the mapping from the discretized space
+ to the real activation unit of the neuron is f(x) = x/2, then the distribution becomes 10%
+ chance of 0, 70% chance of 0.5, 20% chance of 1, which we store as distribution_values =
+ [0, 0.5, 1], distribution_probabilities = [0.1, 0.7, 0.2].
+ """
+ distribution_probabilities: list[list[float]]
+ """
+ For each token in the sequence, the probability of the corresponding value in
+ distribution_values.
+ """
+
+ uncalibrated_simulation: Optional["SequenceSimulation"] = None
+ """The result of the simulation before calibration."""
+
+
+@register_dataclass
+@dataclass
+class ScoredSequenceSimulation(FastDataclass):
+ """
+ SequenceSimulation result with a score (for that sequence only) and ground truth activations.
+ """
+
+ simulation: SequenceSimulation
+ """The result of a simulation of neuron activations."""
+ true_activations: List[float]
+ """Ground truth activations on the sequence (not normalized)"""
+ ev_correlation_score: float
+ """
+ Correlation coefficient between the expected values of the normalized activations from the
+ simulation and the unnormalized true activations of the neuron on the text sequence.
+ """
+ rsquared_score: Optional[float] = None
+ """R^2 of the simulated activations."""
+ absolute_dev_explained_score: Optional[float] = None
+ """
+ Score based on absolute difference between real and simulated activations.
+ absolute_dev_explained_score = 1 - mean(abs(real - predicted)) / mean(abs(real)).
+ """
+
+
+@register_dataclass
+@dataclass
+class ScoredSimulation(FastDataclass):
+ """Result of scoring a neuron simulation on multiple sequences."""
+
+ scored_sequence_simulations: List[ScoredSequenceSimulation]
+ """ScoredSequenceSimulation for each sequence"""
+ ev_correlation_score: Optional[float] = None
+ """
+ Correlation coefficient between the expected values of the normalized activations from the
+ simulation and the unnormalized true activations on a dataset created from all
+ scored_sequence_simulations. (Note that this is not equivalent to averaging across sequences.)
+ """
+ rsquared_score: Optional[float] = None
+ """R^2 of the simulated activations."""
+ absolute_dev_explained_score: Optional[float] = None
+ """
+ Score based on absolute difference between real and simulated activations.
+ absolute_dev_explained_score = 1 - mean(abs(real - predicted)) / mean(abs(real)).
+ """
+
+ def get_preferred_score(self) -> Optional[float]:
+ """
+ This method may return None in cases where the score is undefined, for example if the
+ normalized activations were all zero, yielding a correlation coefficient of NaN.
+ """
+ return self.ev_correlation_score
+
+
+@register_dataclass
+@dataclass
+class ScoredExplanation(FastDataclass):
+ """An explanation and the results of scoring it on multiple sequences."""
+
+ explanation: str
+ """The explanation used for simulation."""
+
+ scored_simulation: ScoredSimulation
+ """Result of scoring the neuron simulator on multiple sequences."""
+
+ def get_preferred_score(self) -> Optional[float]:
+ """
+ Return the preferred score of scored_simulation. This may be None when the score is
+ undefined, for example if the normalized activations were all zero (a NaN correlation).
+ """
+ return self.scored_simulation.get_preferred_score()  # delegates to ScoredSimulation
+
+
+@register_dataclass
+@dataclass
+class NeuronSimulationResults(FastDataclass):
+ """Simulation results and scores for a neuron."""
+
+ neuron_id: NeuronId  # the neuron these results belong to
+ scored_explanations: list[ScoredExplanation]  # each explanation together with its scored simulation
+
+
+def load_neuron_explanations(
+ explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
+) -> Optional[NeuronSimulationResults]:
+ """Load scored explanations for the specified neuron; None if the file is missing or empty."""
+ file = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")  # explanations_path/layer/neuron.jsonl
+ if not bf.exists(file):
+ return None
+ with bf.BlobFile(file) as f:
+ for line in f:
+ return loads(line)  # only the first line of the file is deserialized
+ return None  # the file exists but has no lines
+
+
+@bbb.ensure_session
+async def load_neuron_explanations_async(
+ explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
+) -> Optional[NeuronSimulationResults]:
+ """Load scored explanations for the specified neuron, asynchronously."""
+ return await read_explanation_file(
+ bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")  # same path layout as load_neuron_explanations
+ )
+
+
+@bbb.ensure_session
+async def read_file(filename: str) -> Optional[str]:
+ """Read the single non-empty line of the given file as a string, asynchronously."""
+ try:
+ raw_contents = await bbb.read.read_single(filename)
+ except FileNotFoundError:
+ print(f"Could not read {filename}")
+ return None  # a missing file is reported and treated as "no data"
+ lines = []
+ for line in raw_contents.decode("utf-8").split("\n"):
+ if len(line) > 0:  # drop empty lines, e.g. the one after a trailing newline
+ lines.append(line)
+ assert len(lines) == 1, filename  # exactly one non-empty line is expected
+ return lines[0]
+
+
+@bbb.ensure_session
+async def read_explanation_file(explanation_filename: str) -> Optional[NeuronSimulationResults]:
+ """Load scored explanations from the given filename, asynchronously."""
+ line = await read_file(explanation_filename)
+ return loads(line) if line is not None else None  # None propagates when read_file found no file
+
+
+@bbb.ensure_session
+async def read_json_file(filename: str) -> Optional[dict]:
+ """Read the contents of the given file as a JSON object, asynchronously."""
+ line = await read_file(filename)
+ return json.loads(line) if line is not None else None  # None propagates when read_file found no file
+
+
+def get_numerical_subdirs(dataset_path: str) -> list[str]:
+ """Return the names of all numbered subdirectories in the specified directory.
+
+ Used to get all layer directories in an explanation directory.
+ """
+ return [
+ str(x)  # back to directory-name strings after sorting
+ for x in sorted(
+ [
+ int(x)  # convert so the sort is numeric rather than lexicographic
+ for x in bf.listdir(dataset_path)
+ if bf.isdir(bf.join(dataset_path, x)) and x.isnumeric()
+ ]
+ )
+ ]
+
+
+def get_sorted_neuron_indices_from_explanations(
+ explanations_path: str, layer: Union[str, int]
+) -> list[int]:
+ """Return the indices of all neurons in this layer, in ascending order."""
+ layer_dir = bf.join(explanations_path, str(layer))
+ return sorted(
+ [int(f.split(".")[0]) for f in bf.listdir(layer_dir) if f.split(".")[0].isnumeric()]  # text before the first "." is the index; non-numeric names are skipped
+ )
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py b/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py
new file mode 100644
index 0000000..3782940
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/prompt_builder.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+from enum import Enum
+from typing import TypedDict, Union
+
+import tiktoken
+
+HarmonyMessage = TypedDict(  # one prompt message: a role plus its text
+ "HarmonyMessage",
+ {
+ "role": str,  # PromptBuilder.add_message stores a Role value here
+ "content": str,  # the message text
+ },
+)
+
+
+class PromptFormat(str, Enum):
+ """
+ Different ways of formatting the components of a prompt into the format accepted by the relevant
+ API server endpoint.
+ """
+
+ NONE = "none"
+ """Suitable for use with models that don't use special tokens for instructions."""
+ INSTRUCTION_FOLLOWING = "instruction_following"
+ """Suitable for IF models that use <|endofprompt|>."""
+ HARMONY_V4 = "harmony_v4"
+ """
+ Suitable for Harmony models that use a structured turn-taking role+content format. Generates a
+ list of HarmonyMessage dicts that can be sent to the /chat/completions endpoint.
+ """
+
+ @classmethod
+ def from_string(cls, s: str) -> PromptFormat:
+ for prompt_format in cls:  # look the member up by its string value
+ if prompt_format.value == s:
+ return prompt_format
+ raise ValueError(f"{s} is not a valid PromptFormat")  # no member has this value
+
+
+class Role(str, Enum):
+ """See https://platform.openai.com/docs/guides/chat"""
+
+ SYSTEM = "system"  # PromptBuilder.build expects the first message to have this role
+ USER = "user"
+ ASSISTANT = "assistant"
+
+
+class PromptBuilder:
+ """Class for accumulating components of a prompt and then formatting them into an output."""
+
+ def __init__(self) -> None:
+ self._messages: list[HarmonyMessage] = []  # messages in the order they were added
+
+ def add_message(self, role: Role, message: str) -> None:
+ self._messages.append(HarmonyMessage(role=role, content=message))  # role order is only checked in build()
+
+ def prompt_length_in_tokens(self, prompt_format: PromptFormat) -> int:  # token count of the prompt in the given format
+ # TODO(sbills): Make the model/encoding configurable. This implementation assumes GPT-4.
+ encoding = tiktoken.get_encoding("cl100k_base")
+ if prompt_format == PromptFormat.HARMONY_V4:
+ # Approximately-correct implementation adapted from this documentation:
+ # https://platform.openai.com/docs/guides/chat/introduction
+ num_tokens = 0
+ for message in self._messages:
+ num_tokens += (
+ 4 # every message follows <|im_start|>{role/name}\n{content}<|im_end|>\n
+ )
+ num_tokens += len(encoding.encode(message["content"], allowed_special="all"))  # only the content is tokenized
+ num_tokens += 2 # every reply is primed with <|im_start|>assistant
+ return num_tokens
+ else:
+ prompt_str = self.build(prompt_format)  # string formats: tokenize the fully built prompt
+ assert isinstance(prompt_str, str)
+ return len(encoding.encode(prompt_str, allowed_special="all"))
+
+ def build(
+ self, prompt_format: PromptFormat, *, allow_extra_system_messages: bool = False
+ ) -> Union[str, list[HarmonyMessage]]:
+ """
+ Validates the messages added so far (reasonable alternation of assistant vs. user, etc.)
+ and returns either a regular string (maybe with <|endofprompt|> tokens) or a list of
+ HarmonyMessages suitable for use with the /chat/completions endpoint.
+
+ The `allow_extra_system_messages` parameter allows the caller to specify that the prompt
+ should be allowed to contain system messages after the very first one.
+ """
+ # Copy each message dict (a shallow copy per message; the values are strings) so we can
+ # modify the copies and so that the caller can't modify the internal state of this object.
+ messages = [message.copy() for message in self._messages]
+
+ expected_next_role = Role.SYSTEM  # the first message must be a system message
+ for message in messages:
+ role = message["role"]
+ assert role == expected_next_role or (
+ allow_extra_system_messages and role == Role.SYSTEM
+ ), f"Expected message from {expected_next_role} but got message from {role}"
+ if role == Role.SYSTEM:
+ expected_next_role = Role.USER
+ elif role == Role.USER:
+ expected_next_role = Role.ASSISTANT
+ elif role == Role.ASSISTANT:
+ expected_next_role = Role.USER
+
+ if prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:  # mark the end of the last user message
+ last_user_message = None
+ for message in messages:
+ if message["role"] == Role.USER:
+ last_user_message = message
+ assert last_user_message is not None
+ last_user_message["content"] += "<|endofprompt|>"
+
+ if prompt_format == PromptFormat.HARMONY_V4:
+ return messages
+ elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
+ return "".join(message["content"] for message in messages)  # contents concatenated with no separator
+ else:
+ raise ValueError(f"Unknown prompt format: {prompt_format}")
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/puzzles.py b/docs/src/neuron-explainer/neuron_explainer/explanations/puzzles.py
new file mode 100644
index 0000000..2270d71
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/puzzles.py
@@ -0,0 +1,50 @@
+import json
+import os
+from dataclasses import dataclass
+
+from neuron_explainer.activations.activations import ActivationRecord
+
+
+@dataclass(frozen=True)
+class Puzzle:
+ """A puzzle is a ground truth explanation, a collection of sentences (stored as ActivationRecords) with activations
+ according to that explanation, and a collection of false explanations"""
+
+ name: str
+ explanation: str  # the ground truth explanation
+ activation_records: list[ActivationRecord]  # one record per sentence
+ false_explanations: list[str]
+
+
+def convert_puzzle_to_tokenized_sentences(puzzle: Puzzle) -> list[list[str]]:
+ """Converts a puzzle to a list of tokenized sentences."""
+ return [record.tokens for record in puzzle.activation_records]  # tokens only; activations are dropped
+
+
+def convert_puzzle_dict_to_puzzle(puzzle_dict: dict) -> Puzzle:
+ """Converts a json dictionary representation of a puzzle to the Puzzle class."""
+ puzzle_activation_records = []
+ for sentence in puzzle_dict["sentences"]:
+ # Token-activation pairs are listed as either a string or a list of a string and a float. If it is a list, the float is the activation.
+ # If it is only a string, the activation is assumed to be 0. This is useful for readability and reducing redundancy in the data.
+ tokens = [t[0] if type(t) is list else t for t in sentence]
+ assert all([type(t) is str for t in tokens]), "All tokens must be strings"
+ activations = [float(t[1]) if type(t) is list else 0.0 for t in sentence]
+ assert all([type(t) is float for t in activations]), "All activations must be floats"  # always holds after float(); kept as a sanity check
+
+ puzzle_activation_records.append(ActivationRecord(tokens=tokens, activations=activations))
+
+ return Puzzle(
+ name=puzzle_dict["name"],
+ explanation=puzzle_dict["explanation"],
+ activation_records=puzzle_activation_records,
+ false_explanations=puzzle_dict["false_explanations"],
+ )
+
+
+PUZZLES_BY_NAME: dict[str, Puzzle] = dict()  # filled in below when this module is imported
+script_dir = os.path.dirname(os.path.abspath(__file__))
+with open(os.path.join(script_dir, "puzzles.json"), "r") as f:  # puzzles.json sits next to this module
+ puzzle_dicts = json.loads(f.read())
+ for name in puzzle_dicts.keys():
+ PUZZLES_BY_NAME[name] = convert_puzzle_dict_to_puzzle(puzzle_dicts[name])  # keyed by the top-level JSON key
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py b/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py
new file mode 100644
index 0000000..f7f263a
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/scoring.py
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any, Callable, Coroutine, Sequence
+
+import numpy as np
+from neuron_explainer.activations.activations import ActivationRecord
+from neuron_explainer.explanations.calibrated_simulator import (
+ CalibratedNeuronSimulator,
+ LinearCalibratedNeuronSimulator,
+)
+from neuron_explainer.explanations.explanations import (
+ ScoredSequenceSimulation,
+ ScoredSimulation,
+ SequenceSimulation,
+)
+from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator, NeuronSimulator
+
+
+def flatten_list(list_of_lists: Sequence[Sequence[Any]]) -> list[Any]:
+ return [item for sublist in list_of_lists for item in sublist]  # flattens exactly one level of nesting
+
+
+def correlation_score(
+ real_activations: Sequence[float] | np.ndarray,
+ predicted_activations: Sequence[float] | np.ndarray,
+) -> float:
+ return np.corrcoef(real_activations, predicted_activations)[0, 1]  # off-diagonal entry: correlation between the two inputs
+
+
+def score_from_simulation(
+ real_activations: ActivationRecord,
+ simulation: SequenceSimulation,
+ score_function: Callable[[Sequence[float] | np.ndarray, Sequence[float] | np.ndarray], float],
+) -> float:
+ return score_function(real_activations.activations, simulation.expected_activations)  # argument order: (real, predicted)
+
+
+def rsquared_score_from_sequences(
+ real_activations: Sequence[float] | np.ndarray,
+ predicted_activations: Sequence[float] | np.ndarray,
+) -> float:  # NOTE(review): the denominator is 0 when every real activation is 0
+ return float(
+ 1
+ - np.mean(np.square(np.array(real_activations) - np.array(predicted_activations)))  # mean squared error
+ / np.mean(np.square(np.array(real_activations)))  # mean of the squared real activations (not centered on their mean)
+ )
+
+
+def absolute_dev_explained_score_from_sequences(
+ real_activations: Sequence[float] | np.ndarray,
+ predicted_activations: Sequence[float] | np.ndarray,
+) -> float:  # NOTE(review): the denominator is 0 when every real activation is 0
+ return float(
+ 1
+ - np.mean(np.abs(np.array(real_activations) - np.array(predicted_activations)))  # mean absolute error
+ / np.mean(np.abs(np.array(real_activations)))  # mean absolute real activation
+ )
+
+
+async def make_explanation_simulator(
+ explanation: str,
+ calibration_activation_records: Sequence[ActivationRecord],
+ model_name: str,
+ calibrated_simulator_class: type[CalibratedNeuronSimulator] = LinearCalibratedNeuronSimulator,
+) -> CalibratedNeuronSimulator:
+ """
+ Make a simulator that uses an explanation to predict activations and calibrates it on the given
+ activation records.
+ """
+ simulator = ExplanationNeuronSimulator(model_name, explanation)  # uncalibrated simulator with default settings
+ calibrated_simulator = calibrated_simulator_class(simulator)  # wrap it in the requested calibration class
+ await calibrated_simulator.calibrate(calibration_activation_records)
+ return calibrated_simulator
+
+
+async def _simulate_and_score_sequence(
+ simulator: NeuronSimulator, activations: ActivationRecord
+) -> ScoredSequenceSimulation:
+ """Score an explanation of a neuron by how well it predicts activations on a sentence."""
+ simulation = await simulator.simulate(activations.tokens)  # the simulator sees only the tokens
+ logging.debug(simulation)
+ rsquared_score = score_from_simulation(activations, simulation, rsquared_score_from_sequences)
+ absolute_dev_explained_score = score_from_simulation(
+ activations, simulation, absolute_dev_explained_score_from_sequences
+ )
+ scored_sequence_simulation = ScoredSequenceSimulation(
+ simulation=simulation,
+ true_activations=activations.activations,  # kept so scores can be re-aggregated across sequences
+ ev_correlation_score=score_from_simulation(activations, simulation, correlation_score),
+ rsquared_score=rsquared_score,
+ absolute_dev_explained_score=absolute_dev_explained_score,
+ )
+ return scored_sequence_simulation
+
+
+def aggregate_scored_sequence_simulations(
+ scored_sequence_simulations: list[ScoredSequenceSimulation],
+) -> ScoredSimulation:
+ """
+ Aggregate a list of scored sequence simulations. Every score is computed over the activations
+ of all sequences pooled together rather than by averaging per-sequence scores. When there are
+ no true activations to score against, all three aggregate scores are None.
+ """
+ all_true_activations: list[float] = []
+ all_expected_values: list[float] = []
+ for scored_sequence_simulation in scored_sequence_simulations:
+ all_true_activations.extend(scored_sequence_simulation.true_activations or [])
+ all_expected_values.extend(scored_sequence_simulation.simulation.expected_activations)
+ # Without true activations the R^2 and absolute-deviation formulas would average over empty
+ # arrays (NaN) or fail on mismatched shapes, so they get the same guard as the correlation.
+ has_true_activations = len(all_true_activations) > 0
+ pooled = (all_true_activations, all_expected_values)
+ ev_correlation_score = correlation_score(*pooled) if has_true_activations else None
+ rsquared_score = rsquared_score_from_sequences(*pooled) if has_true_activations else None
+ absolute_dev_explained_score = (
+ absolute_dev_explained_score_from_sequences(*pooled) if has_true_activations else None
+ )
+
+ return ScoredSimulation(
+ scored_sequence_simulations=scored_sequence_simulations,
+ ev_correlation_score=ev_correlation_score,
+ rsquared_score=rsquared_score,
+ absolute_dev_explained_score=absolute_dev_explained_score,
+ )
+
+
+async def simulate_and_score(
+ simulator: NeuronSimulator,
+ activation_records: Sequence[ActivationRecord],
+) -> ScoredSimulation:
+ """
+ Score an explanation of a neuron by how well it predicts activations on the given text
+ sequences.
+ """
+ scored_sequence_simulations = await asyncio.gather(  # one scoring coroutine per record; results keep input order
+ *[
+ _simulate_and_score_sequence(
+ simulator,
+ activation_record,
+ )
+ for activation_record in activation_records
+ ]
+ )
+ return aggregate_scored_sequence_simulations(scored_sequence_simulations)
+
+
+async def make_simulator_and_score(
+ make_simulator: Coroutine[None, None, NeuronSimulator],
+ activation_records: Sequence[ActivationRecord],
+) -> ScoredSimulation:
+ """Chain together creating the simulator and using it to score activation records."""
+ simulator = await make_simulator  # make_simulator is an un-awaited coroutine, not a factory function
+ return await simulate_and_score(simulator, activation_records)
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py b/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py
new file mode 100644
index 0000000..4111ead
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/simulator.py
@@ -0,0 +1,798 @@
+"""Uses API calls to simulate neuron activations based on an explanation."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from enum import Enum
+from typing import Any, Optional, Sequence, Union
+
+import numpy as np
+from neuron_explainer.activations.activation_records import (
+ calculate_max_activation,
+ format_activation_records,
+ format_sequences_for_simulation,
+ normalize_activations,
+)
+from neuron_explainer.activations.activations import ActivationRecord
+from neuron_explainer.api_client import ApiClient
+from neuron_explainer.explanations.explainer import EXPLANATION_PREFIX
+from neuron_explainer.explanations.explanations import ActivationScale, SequenceSimulation
+from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
+from neuron_explainer.explanations.prompt_builder import (
+ HarmonyMessage,
+ PromptBuilder,
+ PromptFormat,
+ Role,
+)
+
+logger = logging.getLogger(__name__)
+
+# Our prompts use normalized activation values, which map any range of positive activations to the
+# integers from 0 to 10.
+MAX_NORMALIZED_ACTIVATION = 10
+VALID_ACTIVATION_TOKENS_ORDERED = list(str(i) for i in range(MAX_NORMALIZED_ACTIVATION + 1))  # "0" through "10", in order
+VALID_ACTIVATION_TOKENS = set(VALID_ACTIVATION_TOKENS_ORDERED)  # used for the membership test in parse_top_logprobs
+
+
+class SimulationType(str, Enum):
+ """How to simulate neuron activations. Values correspond to subclasses of NeuronSimulator."""
+
+ ALL_AT_ONCE = "all_at_once"
+ """
+ Use a single prompt with tokens; calculate EVs using logprobs.
+
+ Implemented by ExplanationNeuronSimulator.
+ """
+
+ ONE_AT_A_TIME = "one_at_a_time"
+ """
+ Use a separate prompt for each token being simulated; calculate EVs using logprobs.
+
+ Implemented by ExplanationTokenByTokenSimulator.
+ """
+
+ @classmethod
+ def from_string(cls, s: str) -> SimulationType:
+ for simulation_type in SimulationType:  # look the member up by its string value
+ if simulation_type.value == s:
+ return simulation_type
+ raise ValueError(f"Invalid simulation type: {s}")  # no member has this value
+
+
+def compute_expected_value(
+ norm_probabilities_by_distribution_value: OrderedDict[int, float]
+) -> float:
+ """
+ Given a map from distribution values (integers on the range [0, 10]) to normalized
+ probabilities, return an expected value for the distribution.
+ """
+ return np.dot(  # sum over entries of value * probability
+ np.array(list(norm_probabilities_by_distribution_value.keys())),
+ np.array(list(norm_probabilities_by_distribution_value.values())),
+ )
+
+
+def parse_top_logprobs(top_logprobs: dict[str, float]) -> OrderedDict[int, float]:
+ """
+ Given a map from tokens to logprobs, return a map from distribution values (integers on the
+ range [0, 10]) to unnormalized probabilities (in the sense that they may not sum to 1).
+ """
+ probabilities_by_distribution_value = OrderedDict()
+ for token, logprob in top_logprobs.items():
+ if token in VALID_ACTIVATION_TOKENS:  # every other token is ignored
+ token_as_int = int(token)
+ probabilities_by_distribution_value[token_as_int] = np.exp(logprob)  # logprob -> probability
+ return probabilities_by_distribution_value
+
+
+def compute_predicted_activation_stats_for_token(
+ top_logprobs: dict[str, float],
+) -> tuple[OrderedDict[int, float], float]:  # (normalized probabilities by value, expected value)
+ probabilities_by_distribution_value = parse_top_logprobs(top_logprobs)
+ total_p_of_distribution_values = sum(probabilities_by_distribution_value.values())  # probability mass on valid activation tokens
+ norm_probabilities_by_distribution_value = OrderedDict(
+ {
+ distribution_value: p / total_p_of_distribution_values  # renormalize so the kept probabilities sum to 1
+ for distribution_value, p in probabilities_by_distribution_value.items()
+ }
+ )
+ expected_value = compute_expected_value(norm_probabilities_by_distribution_value)
+ return (
+ norm_probabilities_by_distribution_value,
+ expected_value,
+ )
+
+
+# Adapted from tether/tether/core/encoder.py.
+def convert_to_byte_array(s: str) -> bytearray:
+ byte_array = bytearray()
+ assert s.startswith("bytes:"), s
+ s = s[6:]  # drop the "bytes:" prefix
+ while len(s) > 0:
+ if s[0] == "\\":
+ # Hex encoding.
+ assert s[1] == "x"
+ assert len(s) >= 4
+ byte_array.append(int(s[2:4], 16))  # a backslash-x escape with two hex digits is one byte
+ s = s[4:]
+ else:
+ # Regular ascii encoding.
+ byte_array.append(ord(s[0]))
+ s = s[1:]
+ return byte_array
+
+
+def handle_byte_encoding(
+ response_tokens: Sequence[str], merged_response_index: int
+) -> tuple[str, int]:
+ """
+ Handle the case where the current token is a sequence of bytes. This may involve merging
+ multiple response tokens into one. Returns the token and the earliest response index used.
+ """
+ response_token = response_tokens[merged_response_index]
+ if response_token.startswith("bytes:"):
+ byte_array = bytearray()
+ while True:
+ byte_array = convert_to_byte_array(response_token) + byte_array  # prepend, since we walk backward
+ try:
+ # If we can decode the byte array as utf-8, then we're done.
+ response_token = byte_array.decode("utf-8")
+ break
+ except UnicodeDecodeError:
+ # If not, then we need to merge the previous response token into the byte
+ # array.
+ merged_response_index -= 1  # NOTE(review): no lower bound; a negative index would wrap around
+ response_token = response_tokens[merged_response_index]
+ return response_token, merged_response_index
+
+
+def was_token_split(current_token: str, response_tokens: Sequence[str], start_index: int) -> bool:
+ """
+ Return whether current_token (a token from the subject model) was split into multiple tokens by
+ the simulator model (as represented by the tokens in response_tokens). start_index is the index
+ in response_tokens at which to begin looking backward to form a complete token. It is usually
+ the first token *before* the delimiter that separates the token from the normalized activation,
+ barring some unusual cases.
+
+ This mainly happens if the subject model uses a different tokenizer than the simulator model.
+ But it can also happen in cases where Unicode characters are split. This function handles both
+ cases.
+ """
+ merged_response_tokens = ""
+ merged_response_index = start_index
+ while len(merged_response_tokens) < len(current_token):  # merge backward until enough characters are collected
+ response_token = response_tokens[merged_response_index]  # redundant: overwritten by the call on the next line
+ response_token, merged_response_index = handle_byte_encoding(
+ response_tokens, merged_response_index
+ )
+ merged_response_tokens = response_token + merged_response_tokens
+ merged_response_index -= 1
+ # It's possible that merged_response_tokens is longer than current_token at this point,
+ # since the between-lines delimiter may have been merged into the original token. But it
+ # should always be the case that merged_response_tokens ends with current_token.
+ assert merged_response_tokens.endswith(current_token)
+ num_merged_tokens = start_index - merged_response_index  # number of response tokens consumed
+ token_was_split = num_merged_tokens > 1
+ if token_was_split:
+ logger.debug(
+ "Warning: token from the subject model was split into 2+ tokens by the simulator model."
+ )
+ return token_was_split
+
+
+def parse_simulation_response(
+ response: dict[str, Any],
+ prompt_format: PromptFormat,
+ tokens: Sequence[str],
+) -> SequenceSimulation:
+ """
+ Parse an API response to a simulation prompt into a SequenceSimulation.
+
+ Args:
+ response: response from the API
+ prompt_format: how the prompt was formatted
+ tokens: list of tokens as strings in the sequence where the neuron is being simulated
+ """
+ choice = response["choices"][0]
+ if prompt_format == PromptFormat.HARMONY_V4:
+ text = choice["message"]["content"]
+ elif prompt_format in [
+ PromptFormat.NONE,
+ PromptFormat.INSTRUCTION_FOLLOWING,
+ ]:
+ text = choice["text"]
+ else:
+ raise ValueError(f"Unhandled prompt format {prompt_format}")
+ response_tokens = choice["logprobs"]["tokens"]
+ top_logprobs = choice["logprobs"]["top_logprobs"]
+ token_text_offset = choice["logprobs"]["text_offset"]
+ # Only tokens at or after the last "<start>" marker are scored. This only works because the
+ # sequence "<start>" tokenizes into multiple tokens if it appears in a text sequence in the
+ # prompt. NOTE(review): "<start>" is assumed as the counterpart of the "<end>" check below.
+ scoring_start = text.rfind("<start>")
+ expected_values = []
+ original_sequence_tokens: list[str] = []
+ distribution_values: list[list[float]] = []
+ distribution_probabilities: list[list[float]] = []
+ for i in range(2, len(response_tokens)):
+ if len(original_sequence_tokens) == len(tokens):
+ # Make sure we haven't hit some sort of off-by-one error.
+ # TODO(sbills): Generalize this to handle different tokenizers.
+ reached_end = response_tokens[i + 1] == "<" and response_tokens[i + 2] == "end"
+ assert reached_end, f"{response_tokens[i-3:i+3]}"
+ break
+ if token_text_offset[i] >= scoring_start:
+ # We're looking for the first token after a tab. This token should be the text
+ # "unknown" if hide_activations=True or a normalized activation (0-10) otherwise.
+ # If it isn't, that means that the tab is not appearing as a delimiter, but rather
+ # as a token, in which case we should move on to the next response token.
+ if response_tokens[i - 1] == "\t":
+ if response_tokens[i] != "unknown":
+ logger.debug("Ignoring tab token that is not followed by an 'unknown' token.")
+ continue
+
+ # j represents the index of the token in a "token<tab>activation" line, barring
+ # one of the unusual cases handled below.
+ j = i - 2
+
+ current_token = tokens[len(original_sequence_tokens)]
+ if current_token == response_tokens[j] or was_token_split(
+ current_token, response_tokens, j
+ ):
+ # We're in the normal case where the tokenization didn't throw off the
+ # formatting or in the token-was-split case, which we handle the usual way.
+ current_top_logprobs = top_logprobs[i]
+
+ (
+ norm_probabilities_by_distribution_value,
+ expected_value,
+ ) = compute_predicted_activation_stats_for_token(
+ current_top_logprobs,
+ )
+ current_distribution_values = list(
+ norm_probabilities_by_distribution_value.keys()
+ )
+ current_distribution_probabilities = list(
+ norm_probabilities_by_distribution_value.values()
+ )
+ else:
+ # We're in a case where the tokenization resulted in a newline being folded into
+ # the token. We can't do our usual prediction of activation stats for the token,
+ # since the model did not observe the original token. Instead, we use dummy
+ # values. See the TODO elsewhere in this file about coming up with a better
+ # prompt format that avoids this situation.
+ newline_folded_into_token = "\n" in response_tokens[j]
+ assert (
+ newline_folded_into_token
+ ), f"`{current_token=}` {response_tokens[j-3:j+3]=}"
+ logger.debug(
+ "Warning: newline before a tokenactivation line was folded into the token"
+ )
+ current_distribution_values = []
+ current_distribution_probabilities = []
+ expected_value = 0.0
+
+ original_sequence_tokens.append(current_token)
+ distribution_values.append([float(v) for v in current_distribution_values])
+ distribution_probabilities.append(current_distribution_probabilities)
+ expected_values.append(expected_value)
+
+ return SequenceSimulation(
+ tokens=original_sequence_tokens,
+ expected_activations=expected_values,
+ activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,
+ distribution_values=distribution_values,
+ distribution_probabilities=distribution_probabilities,
+ )
+
+
+class NeuronSimulator(ABC):
+ """Abstract base class for simulating neuron behavior."""
+
+ @abstractmethod
+ async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:
+ """Simulate the behavior of a neuron based on an explanation."""
+ ...  # subclasses provide the implementation
+
+
+class ExplanationNeuronSimulator(NeuronSimulator):
+ """
+ Simulate neuron behavior based on an explanation.
+
+ This class uses a few-shot prompt with examples of other explanations and activations. This
+ prompt allows us to score all of the tokens at once using a nifty trick involving logprobs.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ explanation: str,
+ max_concurrent: Optional[int] = 10,
+ few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,
+ prompt_format: PromptFormat = PromptFormat.INSTRUCTION_FOLLOWING,
+ cache: bool = False,
+ ):
+ self.api_client = ApiClient(
+ model_name=model_name, max_concurrent=max_concurrent, cache=cache
+ )
+ self.explanation = explanation
+ self.few_shot_example_set = few_shot_example_set
+ self.prompt_format = prompt_format
+
+ async def simulate(
+ self,
+ tokens: Sequence[str],
+ ) -> SequenceSimulation:
+ prompt = self.make_simulation_prompt(tokens)
+
+ generate_kwargs: dict[str, Any] = {
+ "max_tokens": 0,  # request no newly generated tokens
+ "echo": True,  # presumably returns the prompt tokens with their logprobs; confirm against the API
+ "logprobs": 15,
+ }
+ if self.prompt_format == PromptFormat.HARMONY_V4:
+ assert isinstance(prompt, list)
+ assert isinstance(prompt[0], dict) # Really a HarmonyMessage
+ generate_kwargs["messages"] = prompt
+ else:
+ assert isinstance(prompt, str)
+ generate_kwargs["prompt"] = prompt
+
+ response = await self.api_client.make_request(**generate_kwargs)
+ logger.debug("response in score_explanation_by_activations is %s", response)
+ result = parse_simulation_response(response, self.prompt_format, tokens)
+ logger.debug("result in score_explanation_by_activations is %s", result)
+ return result
+
+ # TODO(sbills): The current tokenactivation format can result in improper tokenization.
+ # In particular, if the token is itself a tab, we may get a single "\t\t" token rather than two
+ # "\t" tokens. Consider using a separator that does not appear in any multi-character tokens.
+ def make_simulation_prompt(self, tokens: Sequence[str]) -> Union[str, list[HarmonyMessage]]:
+ """Create a few-shot prompt for predicting neuron activations for the given tokens."""
+
+ # TODO(sbills): The prompts in this file are subtly different from the ones in explainer.py.
+ # Consider reconciling them.
+ prompt_builder = PromptBuilder()
+ prompt_builder.add_message(  # NOTE(review): "tokenactivation" in the text below may have lost a separator; confirm
+ Role.SYSTEM,
+ """We're studying neurons in a neural network.
+Each neuron looks for some particular thing in a short document.
+Look at summary of what the neuron does, and try to predict how it will fire on each token.
+
+The activation format is tokenactivation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.
+""",
+ )
+
+ few_shot_examples = self.few_shot_example_set.get_examples()
+ for i, example in enumerate(few_shot_examples):  # each example adds one user turn and one assistant turn
+ prompt_builder.add_message(
+ Role.USER,
+ f"\n\nNeuron {i + 1}\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} "
+ f"{example.explanation}",
+ )
+ formatted_activation_records = format_activation_records(
+ example.activation_records,
+ calculate_max_activation(example.activation_records),
+ start_indices=example.first_revealed_activation_indices,
+ )
+ prompt_builder.add_message(
+ Role.ASSISTANT, f"\nActivations: {formatted_activation_records}\n"
+ )
+
+ prompt_builder.add_message(  # the neuron being simulated is numbered after the few-shot examples
+ Role.USER,
+ f"\n\nNeuron {len(few_shot_examples) + 1}\nExplanation of neuron "
+ f"{len(few_shot_examples) + 1} behavior: {EXPLANATION_PREFIX} "
+ f"{self.explanation.strip()}",
+ )
+ prompt_builder.add_message(
+ Role.ASSISTANT, f"\nActivations: {format_sequences_for_simulation([tokens])}"
+ )
+ return prompt_builder.build(self.prompt_format)
+
+
+class ExplanationTokenByTokenSimulator(NeuronSimulator):
+ """
+ Simulate neuron behavior based on an explanation.
+
+ Unlike ExplanationNeuronSimulator, this class uses one few-shot prompt per token to calculate
+ expected activations. This is slower. This class gets a one-token completion and calculates an
+ expected value from that token's logprobs.
+ """
+
+ def __init__(
+ self,
+ model_name: str,
+ explanation: str,
+ max_concurrent: Optional[int] = 10,
+ few_shot_example_set: FewShotExampleSet = FewShotExampleSet.NEWER,
+ prompt_format: PromptFormat = PromptFormat.INSTRUCTION_FOLLOWING,
+ cache: bool = False,
+ ):
+ assert (
+ few_shot_example_set != FewShotExampleSet.ORIGINAL
+ ), "This simulator doesn't support the ORIGINAL few-shot example set."
+ self.api_client = ApiClient(
+ model_name=model_name, max_concurrent=max_concurrent, cache=cache
+ )
+ self.explanation = explanation
+ self.few_shot_example_set = few_shot_example_set  # any example set except ORIGINAL (asserted above)
+ self.prompt_format = prompt_format
+
+ async def simulate(
+ self,
+ tokens: Sequence[str],
+ ) -> SequenceSimulation:
+ responses_by_token = await asyncio.gather(  # one API request per token position; results keep token order
+ *[
+ self._get_activation_stats_for_single_token(tokens, self.explanation, token_index)
+ for token_index in range(len(tokens))
+ ]
+ )
+ expected_values, distribution_values, distribution_probabilities = [], [], []
+ for response in responses_by_token:
+ activation_logprobs = response["choices"][0]["logprobs"]["top_logprobs"][0]  # top logprobs of the single completion token
+ (
+ norm_probabilities_by_distribution_value,
+ expected_value,
+ ) = compute_predicted_activation_stats_for_token(
+ activation_logprobs,
+ )
+ distribution_values.append(
+ [float(v) for v in norm_probabilities_by_distribution_value.keys()]
+ )
+ distribution_probabilities.append(
+ list(norm_probabilities_by_distribution_value.values())
+ )
+ expected_values.append(expected_value)
+
+ result = SequenceSimulation(
+ tokens=list(tokens), # SequenceSimulation expects List type
+ expected_activations=expected_values,
+ activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,
+ distribution_values=distribution_values,
+ distribution_probabilities=distribution_probabilities,
+ )
+ logger.debug("result in score_explanation_by_activations is %s", result)
+ return result
+
+ async def _get_activation_stats_for_single_token(
+ self,
+ tokens: Sequence[str],
+ explanation: str,
+ token_index_to_score: int,
+ ) -> dict:
+ prompt = self.make_single_token_simulation_prompt(
+ tokens,
+ explanation,
+ token_index_to_score=token_index_to_score,
+ )
+ return await self.api_client.make_request(
+ prompt=prompt, max_tokens=1, echo=False, logprobs=15
+ )
+
+ def _add_single_token_simulation_subprompt(
+ self,
+ prompt_builder: PromptBuilder,
+ activation_record: ActivationRecord,
+ neuron_index: int,
+ explanation: str,
+ token_index_to_score: int,
+ end_of_prompt: bool,
+ ) -> None:
+ trimmed_activation_record = ActivationRecord(
+ tokens=activation_record.tokens[: token_index_to_score + 1],
+ activations=activation_record.activations[: token_index_to_score + 1],
+ )
+ prompt_builder.add_message(
+ Role.USER,
+ f"""
+Neuron {neuron_index}
+Explanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} {explanation.strip()}
+Text:
+{"".join(trimmed_activation_record.tokens)}
+
+Last token in the text:
+{trimmed_activation_record.tokens[-1]}
+
+Last token activation, considering the token in the context in which it appeared in the text:
+""",
+ )
+ if not end_of_prompt:
+ normalized_activations = normalize_activations(
+ trimmed_activation_record.activations, calculate_max_activation([activation_record])
+ )
+ prompt_builder.add_message(
+ Role.ASSISTANT, str(normalized_activations[-1]) + ("" if end_of_prompt else "\n\n")
+ )
+
+ def make_single_token_simulation_prompt(
+ self,
+ tokens: Sequence[str],
+ explanation: str,
+ token_index_to_score: int,
+ ) -> Union[str, list[HarmonyMessage]]:
+ """Make a few-shot prompt for predicting the neuron's activation on a single token."""
+ assert explanation != ""
+ prompt_builder = PromptBuilder()
+ prompt_builder.add_message(
+ Role.SYSTEM,
+ """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.
+
+The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.
+
+""",
+ )
+
+ few_shot_examples = self.few_shot_example_set.get_examples()
+ for i, example in enumerate(few_shot_examples):
+ prompt_builder.add_message(
+ Role.USER,
+ f"Neuron {i + 1}\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} "
+ f"{example.explanation}\n",
+ )
+ formatted_activation_records = format_activation_records(
+ example.activation_records,
+ calculate_max_activation(example.activation_records),
+ start_indices=None,
+ )
+ prompt_builder.add_message(
+ Role.ASSISTANT,
+ f"Activations: {formatted_activation_records}\n\n",
+ )
+
+ prompt_builder.add_message(
+ Role.SYSTEM,
+ "Now, we're going predict the activation of a new neuron on a single token, "
+ "following the same rules as the examples above. Activations still range from 0 to 10.",
+ )
+ single_token_example = self.few_shot_example_set.get_single_token_prediction_example()
+ assert single_token_example.token_index_to_score is not None
+ self._add_single_token_simulation_subprompt(
+ prompt_builder,
+ single_token_example.activation_records[0],
+ len(few_shot_examples) + 1,
+ explanation,
+ token_index_to_score=single_token_example.token_index_to_score,
+ end_of_prompt=False,
+ )
+
+ activation_record = ActivationRecord(
+ tokens=list(tokens[: token_index_to_score + 1]), # ActivationRecord expects List type.
+ activations=[0.0] * len(tokens),
+ )
+ self._add_single_token_simulation_subprompt(
+ prompt_builder,
+ activation_record,
+ len(few_shot_examples) + 2,
+ explanation,
+ token_index_to_score,
+ end_of_prompt=True,
+ )
+ return prompt_builder.build(self.prompt_format, allow_extra_system_messages=True)
+
+
+def _format_record_for_logprob_free_simulation(
+    activation_record: ActivationRecord,
+    include_activations: bool = False,
+    max_activation: Optional[float] = None,
+) -> str:
+    """
+    Render an activation record as one "<token>\\t<activation>༗\\n" line per token.
+
+    With include_activations=False the activation field is left empty. With
+    include_activations=True, max_activation must be given and each activation is first passed
+    through normalize_activations.
+    """
+    # We use a weird unicode character here to make it easier to parse the response (can split on "༗\n").
+    if not include_activations:
+        return "".join(f"{token}\t༗\n" for token in activation_record.tokens)
+
+    assert max_activation is not None
+    assert len(activation_record.tokens) == len(
+        activation_record.activations
+    ), f"{len(activation_record.tokens)=}, {len(activation_record.activations)=}"
+    normalized_activations = normalize_activations(
+        activation_record.activations, max_activation=max_activation
+    )
+    rendered_lines = [
+        f"{token}\t{normalized_activations[position]}༗\n"
+        for position, token in enumerate(activation_record.tokens)
+    ]
+    return "".join(rendered_lines)
+
+
+def _parse_no_logprobs_completion(
+    completion: str,
+    tokens: Sequence[str],
+) -> Sequence[int]:
+    """
+    Parse a completion into a list of simulated activations. If the model did not faithfully
+    reproduce the token sequence, return a list of 0s. If the model's activation for a token
+    is not one of VALID_ACTIVATION_TOKENS (presumably the integers between 0 and 10 as strings;
+    TODO confirm against its definition), substitute 0.
+
+    Args:
+        completion: completion from the API
+        tokens: list of tokens as strings in the sequence where the neuron is being simulated
+
+    Returns:
+        One integer per entry of `tokens`; all zeros when the completion cannot be aligned with
+        `tokens`.
+    """
+    zero_prediction = [0] * len(tokens)
+    if not tokens:
+        # Nothing to align against; also avoids indexing tokens[0] below.
+        return zero_prediction
+
+    # Every line of the expected format, including the last one, ends with "༗\n". Once the
+    # surrounding newlines are stripped, the final line still carries a dangling "༗" that
+    # split("༗\n") cannot consume. Drop it so the last token's activation is parsed like the rest
+    # instead of being rejected as invalid.
+    stripped_completion = completion.strip("\n")
+    if stripped_completion.endswith("༗"):
+        stripped_completion = stripped_completion[: -len("༗")]
+    token_lines = stripped_completion.split("༗\n")
+
+    # The model may emit preamble lines; start at the first line that begins with the first token.
+    start_line_index = None
+    for i, token_line in enumerate(token_lines):
+        if token_line.startswith(f"{tokens[0]}\t"):
+            start_line_index = i
+            break
+
+    # If we didn't find the first token, or if the number of lines in the completion doesn't match
+    # the number of tokens, return a list of 0s.
+    if start_line_index is None or len(token_lines) - start_line_index != len(tokens):
+        return zero_prediction
+
+    predicted_activations = []
+    for token, token_line in zip(tokens, token_lines[start_line_index:]):
+        line_prefix = f"{token}\t"
+        if not token_line.startswith(line_prefix):
+            return zero_prediction
+        # Read the field that follows the "<token>\t" prefix. Slicing past the prefix (rather than
+        # splitting the whole line on "\t") keeps this correct for tokens that contain a tab.
+        predicted_activation = token_line[len(line_prefix) :].split("\t")[0]
+        if predicted_activation not in VALID_ACTIVATION_TOKENS:
+            predicted_activations.append(0)
+        else:
+            predicted_activations.append(int(predicted_activation))
+    return predicted_activations
+
+
+class LogprobFreeExplanationTokenSimulator(NeuronSimulator):
+    """
+    Simulate neuron behavior based on an explanation.
+
+    Unlike ExplanationNeuronSimulator and ExplanationTokenByTokenSimulator, this class does not rely on
+    logprobs to calculate expected activations. Instead, it uses a few-shot prompt that displays all of the
+    tokens at once, and requests that the model repeat the tokens with the activations appended. Sampling
+    is with temperature = 0. Thus, the activations are deterministic. Also, each activation for a token
+    is a function of all the activations that came previously and all of the tokens in the sequence, not
+    just the current and previous tokens. In the case where the model does not faithfully reproduce the
+    token sequence, the simulator will return a response where every predicted activation is 0. Example prompt as follows:
+
+    Explanation: Explanation 1
+
+    Sequence 1 Tokens Without Activations:
+
+    A\t_
+    B\t_
+    C\t_
+
+    Sequence 1 Tokens With Activations:
+
+    A\t4_
+    B\t10_
+    C\t0_
+
+    Sequence 2 Tokens Without Activations:
+
+    D\t_
+    E\t_
+    F\t_
+
+    Sequence 2 Tokens With Activations:
+
+    D\t3_
+    E\t6_
+    F\t9_
+
+    Explanation: Explanation 2
+
+    Sequence 1 Tokens Without Activations:
+
+    G\t_
+    H\t_
+    I\t_
+
+    Sequence 1 Tokens With Activations:
+
+    G\t2_
+    H\t0_
+    I\t3_
+
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        explanation: str,
+        max_concurrent: Optional[int] = 10,
+        few_shot_example_set: FewShotExampleSet = FewShotExampleSet.NEWER,
+        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
+        cache: bool = False,
+    ):
+        """
+        Args:
+            model_name: forwarded to ApiClient.
+            explanation: explanation of the neuron whose activations are simulated.
+            max_concurrent: forwarded to ApiClient.
+            few_shot_example_set: source of the few-shot examples; ORIGINAL is rejected.
+            prompt_format: format handed to PromptBuilder.build(); also selects how the
+                completion text is read out of the API response in simulate().
+            cache: forwarded to ApiClient.
+        """
+        assert (
+            few_shot_example_set != FewShotExampleSet.ORIGINAL
+        ), "This simulator doesn't support the ORIGINAL few-shot example set."
+        self.api_client = ApiClient(
+            model_name=model_name, max_concurrent=max_concurrent, cache=cache
+        )
+        self.explanation = explanation
+        self.few_shot_example_set = few_shot_example_set
+        self.prompt_format = prompt_format
+
+    async def simulate(
+        self,
+        tokens: Sequence[str],
+    ) -> SequenceSimulation:
+        """
+        Simulate the neuron on `tokens` with a single request and parse the completion.
+
+        Raises:
+            ValueError: if self.prompt_format is not one of HARMONY_V4, NONE or
+                INSTRUCTION_FOLLOWING.
+        """
+        prompt = self._make_simulation_prompt(
+            tokens,
+            self.explanation,
+        )
+        response = await self.api_client.make_request(
+            prompt=prompt, echo=False, max_tokens=1000
+        )
+        assert len(response["choices"]) == 1
+
+        # The completion text lives in a different place depending on the prompt format.
+        choice = response["choices"][0]
+        if self.prompt_format == PromptFormat.HARMONY_V4:
+            completion = choice["message"]["content"]
+        elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
+            completion = choice["text"]
+        else:
+            raise ValueError(f"Unhandled prompt format {self.prompt_format}")
+
+        predicted_activations = _parse_no_logprobs_completion(completion, tokens)
+
+        result = SequenceSimulation(
+            activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,
+            expected_activations=predicted_activations,
+            # Since the predicted activation is just a sampled token, we don't have a distribution.
+            distribution_values=None,
+            distribution_probabilities=None,
+            tokens=list(tokens),  # SequenceSimulation expects List type
+        )
+        logger.debug("result in score_explanation_by_activations is %s", result)
+        return result
+
+    def _make_simulation_prompt(
+        self,
+        tokens: Sequence[str],
+        explanation: str,
+    ) -> Union[str, list[HarmonyMessage]]:
+        """Make a few-shot prompt for predicting the neuron's activations on a sequence."""
+        assert explanation != ""
+        # Constructed without arguments, matching ExplanationTokenByTokenSimulator, which passes
+        # allow_extra_system_messages to build() instead. Only one system message is added here,
+        # so the flag is not needed at all.
+        prompt_builder = PromptBuilder()
+        prompt_builder.add_message(
+            Role.SYSTEM,
+            """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.
+
+The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.
+For each sequence, you will see the tokens in the sequence where the activations are left blank. You will print the exact same tokens verbatim, but with the activations filled in according to the explanation.
+""",
+        )
+
+        few_shot_examples = self.few_shot_example_set.get_examples()
+        for i, example in enumerate(few_shot_examples):
+            # All records of one example share the same normalization constant.
+            few_shot_example_max_activation = calculate_max_activation(example.activation_records)
+
+            # The first record carries the neuron header and explanation.
+            prompt_builder.add_message(
+                Role.USER,
+                f"Neuron {i + 1}\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} "
+                f"{example.explanation}\n\n"
+                f"Sequence 1 Tokens without Activations:\n{_format_record_for_logprob_free_simulation(example.activation_records[0], include_activations=False)}\n\n"
+                f"Sequence 1 Tokens with Activations:\n",
+            )
+            prompt_builder.add_message(
+                Role.ASSISTANT,
+                f"{_format_record_for_logprob_free_simulation(example.activation_records[0], include_activations=True, max_activation=few_shot_example_max_activation)}\n\n",
+            )
+
+            # Remaining records are numbered from 2 and omit the header.
+            for record_index, record in enumerate(example.activation_records[1:]):
+                prompt_builder.add_message(
+                    Role.USER,
+                    f"Sequence {record_index + 2} Tokens without Activations:\n{_format_record_for_logprob_free_simulation(record, include_activations=False)}\n\n"
+                    f"Sequence {record_index + 2} Tokens with Activations:\n",
+                )
+                prompt_builder.add_message(
+                    Role.ASSISTANT,
+                    f"{_format_record_for_logprob_free_simulation(record, include_activations=True, max_activation=few_shot_example_max_activation)}\n\n",
+                )
+
+        neuron_index = len(few_shot_examples) + 1
+        # ActivationRecord expects List type; activations are unused when include_activations=False.
+        target_record = ActivationRecord(tokens=list(tokens), activations=[])
+        prompt_builder.add_message(
+            Role.USER,
+            f"Neuron {neuron_index}\nExplanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} "
+            f"{explanation}\n\n"
+            f"Sequence 1 Tokens without Activations:\n{_format_record_for_logprob_free_simulation(target_record, include_activations=False)}\n\n"
+            f"Sequence 1 Tokens with Activations:\n",
+        )
+        return prompt_builder.build(self.prompt_format)
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py b/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py
new file mode 100644
index 0000000..5f2e7b8
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/test_explainer.py
@@ -0,0 +1,227 @@
+import asyncio
+from typing import Any
+
+from neuron_explainer.explanations.explainer import (
+ TokenActivationPairExplainer,
+ TokenSpaceRepresentationExplainer,
+)
+from neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet
+from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role
+from neuron_explainer.explanations.token_space_few_shot_examples import (
+ TokenSpaceFewShotExampleSet,
+)
+
+
+def setup_module(unused_module: Any) -> None:
+    """Install a fresh asyncio event loop as the current loop before the tests in this module run.
+
+    Per the original author's note, creating the Semaphore in ResearchApiClient fails when no
+    event loop exists.
+    """
+    asyncio.set_event_loop(asyncio.new_event_loop())
+
+
+def test_if_formatting() -> None:
+ """TokenActivationPairExplainer.make_explanation_prompt with INSTRUCTION_FOLLOWING format must equal the literal prompt string below."""
+ expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.
+
+The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.
+
+Neuron 1
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d 0
+e 10
+f 0
+
+
+Explanation of neuron 1 behavior: the main thing this neuron does is find vowels.
+
+Neuron 2
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d 0
+e 10
+f 0
+
+
+Explanation of neuron 2 behavior:<|endofprompt|> the main thing this neuron does is find"""
+
+ # The TEST few-shot set supplies the "Neuron 1" example; the same records are reused as the query.
+ explainer = TokenActivationPairExplainer(
+ model_name="text-davinci-003",
+ prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
+ few_shot_example_set=FewShotExampleSet.TEST,
+ )
+ prompt = explainer.make_explanation_prompt(
+ all_activation_records=TEST_EXAMPLES[0].activation_records,
+ max_activation=1.0,
+ max_tokens_for_completion=20,
+ )
+
+ assert prompt == expected_prompt
+
+
+def test_harmony_format() -> None:
+ """TokenActivationPairExplainer.make_explanation_prompt with HARMONY_V4 format must equal the message list below."""
+ expected_prompt = [
+ HarmonyMessage(
+ role=Role.SYSTEM,
+ content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.
+
+The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.""",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+
+Neuron 1
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d 0
+e 10
+f 0
+
+
+Explanation of neuron 1 behavior: the main thing this neuron does is find""",
+ ),
+ HarmonyMessage(
+ role=Role.ASSISTANT,
+ content=" vowels.",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+
+Neuron 2
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d 0
+e 10
+f 0
+
+
+Explanation of neuron 2 behavior: the main thing this neuron does is find""",
+ ),
+ ]
+
+ explainer = TokenActivationPairExplainer(
+ model_name="gpt-4",
+ prompt_format=PromptFormat.HARMONY_V4,
+ few_shot_example_set=FewShotExampleSet.TEST,
+ )
+ prompt = explainer.make_explanation_prompt(
+ all_activation_records=TEST_EXAMPLES[0].activation_records,
+ max_activation=1.0,
+ max_tokens_for_completion=20,
+ )
+
+ assert isinstance(prompt, list)
+ assert isinstance(prompt[0], dict) # Really a HarmonyMessage
+ # Compare message by message first so a failure points at the offending role/content;
+ # the final equality check also covers a difference in length, which zip() would hide.
+ for actual_message, expected_message in zip(prompt, expected_prompt):
+ assert actual_message["role"] == expected_message["role"]
+ assert actual_message["content"] == expected_message["content"]
+ assert prompt == expected_prompt
+
+
+def test_token_space_explainer_if_formatting() -> None:
+ """TokenSpaceRepresentationExplainer.make_explanation_prompt with INSTRUCTION_FOLLOWING format must equal the literal prompt string below."""
+ expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.
+
+
+
+Tokens:
+'these', ' are', ' tokens'
+
+Explanation:
+This neuron is looking for this is a test explanation.
+
+
+
+Tokens:
+'foo', 'bar', 'baz'
+
+Explanation:
+<|endofprompt|>This neuron is looking for"""
+
+ # The first "Tokens:" section comes from TokenSpaceFewShotExampleSet.TEST; the second is the query.
+ explainer = TokenSpaceRepresentationExplainer(
+ model_name="text-davinci-002",
+ prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
+ use_few_shot=True,
+ few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,
+ )
+ prompt = explainer.make_explanation_prompt(
+ tokens=["foo", "bar", "baz"],
+ max_tokens_for_completion=20,
+ )
+
+ assert prompt == expected_prompt
+
+
+def test_token_space_explainer_harmony_formatting() -> None:
+ """TokenSpaceRepresentationExplainer.make_explanation_prompt with HARMONY_V4 format must equal the message list below."""
+ expected_prompt = [
+ HarmonyMessage(
+ role=Role.SYSTEM,
+ content="We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+
+
+
+Tokens:
+'these', ' are', ' tokens'
+
+Explanation:
+This neuron is looking for""",
+ ),
+ HarmonyMessage(
+ role=Role.ASSISTANT,
+ content=" this is a test explanation.",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+
+
+
+Tokens:
+'foo', 'bar', 'baz'
+
+Explanation:
+This neuron is looking for""",
+ ),
+ ]
+
+ explainer = TokenSpaceRepresentationExplainer(
+ model_name="gpt-4",
+ prompt_format=PromptFormat.HARMONY_V4,
+ use_few_shot=True,
+ few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,
+ )
+ prompt = explainer.make_explanation_prompt(
+ tokens=["foo", "bar", "baz"],
+ max_tokens_for_completion=20,
+ )
+
+ assert isinstance(prompt, list)
+ assert isinstance(prompt[0], dict) # Really a HarmonyMessage
+ # Per-message comparison gives a readable failure; the final equality also checks the length.
+ for actual_message, expected_message in zip(prompt, expected_prompt):
+ assert actual_message["role"] == expected_message["role"]
+ assert actual_message["content"] == expected_message["content"]
+ assert prompt == expected_prompt
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py b/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py
new file mode 100644
index 0000000..442b798
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/test_simulator.py
@@ -0,0 +1,269 @@
+from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
+from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role
+from neuron_explainer.explanations.simulator import (
+ ExplanationNeuronSimulator,
+ ExplanationTokenByTokenSimulator,
+)
+
+
+def test_make_explanation_simulation_prompt_if_format() -> None:
+ """ExplanationNeuronSimulator.make_simulation_prompt with INSTRUCTION_FOLLOWING format must equal the literal prompt string below."""
+ expected_prompt = """We're studying neurons in a neural network.
+Each neuron looks for some particular thing in a short document.
+Look at summary of what the neuron does, and try to predict how it will fire on each token.
+
+The activation format is tokenactivation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.
+
+
+Neuron 1
+Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d unknown
+e 10
+f 0
+
+
+
+
+Neuron 2
+Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION<|endofprompt|>
+Activations:
+
+0 unknown
+1 unknown
+2 unknown
+
+"""
+ # The query tokens are the strings "0", "1", "2"; every one appears as "unknown" in the prompt.
+ prompt = ExplanationNeuronSimulator(
+ model_name="text-davinci-003",
+ explanation="EXPLANATION",
+ few_shot_example_set=FewShotExampleSet.TEST,
+ prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
+ ).make_simulation_prompt(
+ tokens=[str(x) for x in range(3)],
+ )
+ assert prompt == expected_prompt
+
+
+def test_make_explanation_simulation_prompt_harmony_format() -> None:
+ """ExplanationNeuronSimulator.make_simulation_prompt with HARMONY_V4 format must equal the message list below."""
+ expected_prompt = [
+ HarmonyMessage(
+ role=Role.SYSTEM,
+ content="""We're studying neurons in a neural network.
+Each neuron looks for some particular thing in a short document.
+Look at summary of what the neuron does, and try to predict how it will fire on each token.
+
+The activation format is tokenactivation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.
+""",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+
+Neuron 1
+Explanation of neuron 1 behavior: the main thing this neuron does is find vowels""",
+ ),
+ HarmonyMessage(
+ role=Role.ASSISTANT,
+ content="""
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d unknown
+e 10
+f 0
+
+
+""",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+
+Neuron 2
+Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION""",
+ ),
+ HarmonyMessage(
+ role=Role.ASSISTANT,
+ content="""
+Activations:
+
+0 unknown
+1 unknown
+2 unknown
+
+""",
+ ),
+ ]
+ prompt = ExplanationNeuronSimulator(
+ model_name="gpt-4",
+ explanation="EXPLANATION",
+ few_shot_example_set=FewShotExampleSet.TEST,
+ prompt_format=PromptFormat.HARMONY_V4,
+ ).make_simulation_prompt(
+ tokens=[str(x) for x in range(3)],
+ )
+
+ assert isinstance(prompt, list)
+ assert isinstance(prompt[0], dict) # Really a HarmonyMessage
+ # Per-message comparison gives a readable failure; the final equality also checks the length.
+ for actual_message, expected_message in zip(prompt, expected_prompt):
+ assert actual_message["role"] == expected_message["role"]
+ assert actual_message["content"] == expected_message["content"]
+ assert prompt == expected_prompt
+
+
+def test_make_token_by_token_simulation_prompt_if_format() -> None:
+ """ExplanationTokenByTokenSimulator.make_single_token_simulation_prompt with INSTRUCTION_FOLLOWING format must equal the literal prompt string below."""
+ expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.
+
+The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.
+
+Neuron 1
+Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
+Activations:
+
+a 10
+b 0
+c 0
+
+
+d 0
+e 10
+f 0
+
+
+
+Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.
+Neuron 2
+Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else
+Text:
+ghi
+
+Last token in the text:
+i
+
+Last token activation, considering the token in the context in which it appeared in the text:
+10
+
+
+Neuron 3
+Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else
+Text:
+01
+
+Last token in the text:
+1
+
+Last token activation, considering the token in the context in which it appeared in the text:
+<|endofprompt|>"""
+ # The explanation passed to the method ("numbers and nothing else") is the one that appears in
+ # the prompt; the constructor's "EXPLANATION" is not used by this call.
+ prompt = ExplanationTokenByTokenSimulator(
+ model_name="text-davinci-003",
+ explanation="EXPLANATION",
+ few_shot_example_set=FewShotExampleSet.TEST,
+ prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
+ ).make_single_token_simulation_prompt(
+ tokens=[str(x) for x in range(3)],
+ explanation="numbers and nothing else",
+ token_index_to_score=1,
+ )
+ assert prompt == expected_prompt
+
+
+def test_make_token_by_token_simulation_prompt_harmony_format() -> None:
+ """ExplanationTokenByTokenSimulator.make_single_token_simulation_prompt with HARMONY_V4 format must equal the message list below."""
+ expected_prompt = [
+ HarmonyMessage(
+ role=Role.SYSTEM,
+ content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.
+
+The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.
+
+""",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""Neuron 1
+Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
+""",
+ ),
+ HarmonyMessage(
+ role=Role.ASSISTANT,
+ content="""Activations:
+
+a 10
+b 0
+c 0
+
+
+d 0
+e 10
+f 0
+
+
+
+""",
+ ),
+ HarmonyMessage(
+ role=Role.SYSTEM,
+ content="Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+Neuron 2
+Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else
+Text:
+ghi
+
+Last token in the text:
+i
+
+Last token activation, considering the token in the context in which it appeared in the text:
+""",
+ ),
+ HarmonyMessage(
+ role=Role.ASSISTANT,
+ content="""10
+
+""",
+ ),
+ HarmonyMessage(
+ role=Role.USER,
+ content="""
+Neuron 3
+Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else
+Text:
+01
+
+Last token in the text:
+1
+
+Last token activation, considering the token in the context in which it appeared in the text:
+""",
+ ),
+ ]
+
+ # The explanation passed to the method is the one that appears in the prompt; the
+ # constructor's "EXPLANATION" is not used by this call.
+ prompt = ExplanationTokenByTokenSimulator(
+ model_name="gpt-4",
+ explanation="EXPLANATION",
+ few_shot_example_set=FewShotExampleSet.TEST,
+ prompt_format=PromptFormat.HARMONY_V4,
+ ).make_single_token_simulation_prompt(
+ tokens=[str(x) for x in range(3)],
+ explanation="numbers and nothing else",
+ token_index_to_score=1,
+ )
+
+ assert isinstance(prompt, list)
+ assert isinstance(prompt[0], dict) # Really a HarmonyMessage
+ # Per-message comparison gives a readable failure; the final equality also checks the length.
+ for actual_message, expected_message in zip(prompt, expected_prompt):
+ assert actual_message["role"] == expected_message["role"]
+ assert actual_message["content"] == expected_message["content"]
+ assert prompt == expected_prompt
diff --git a/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py b/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py
new file mode 100644
index 0000000..3fb0418
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py
@@ -0,0 +1,212 @@
+from dataclasses import dataclass
+from enum import Enum
+from typing import List
+
+from neuron_explainer.fast_dataclasses import FastDataclass
+
+
+@dataclass
+class Example(FastDataclass):
+ """
+ An example list of tokens as strings corresponding to top token space inputs of a neuron, with a
+ string explanation of the neuron's behavior on these tokens.
+ """
+
+ # Token strings; the examples in this module keep any leading space that is part of a token.
+ tokens: List[str]
+ # Explanation text; the examples in this module start it with a space and omit a final period.
+ explanation: str
+
+
+class TokenSpaceFewShotExampleSet(Enum):
+    """Selects the set of few-shot examples used when sampling explanations."""
+
+    ORIGINAL = "original"
+    TEST = "test"
+
+    def get_examples(self) -> list[Example]:
+        """Return the module-level example list for this member (the list itself, not a copy).
+
+        Raises:
+            ValueError: if a member has no example list associated with it.
+        """
+        if self is TokenSpaceFewShotExampleSet.TEST:
+            return TEST_EXAMPLES
+        if self is TokenSpaceFewShotExampleSet.ORIGINAL:
+            return ORIGINAL_EXAMPLES
+        raise ValueError(f"Unhandled example set: {self}")
+
+
+# Few-shot examples returned by TokenSpaceFewShotExampleSet.ORIGINAL.get_examples(). Each entry
+# pairs a list of token strings with an explanation. Token strings are significant byte-for-byte
+# (including leading spaces and case), so do not reformat them.
+ORIGINAL_EXAMPLES = [
+ Example(
+ tokens=[
+ "actual",
+ " literal",
+ " actual",
+ " hyper",
+ " real",
+ " EX",
+ " Real",
+ "^",
+ "Full",
+ " full",
+ " optical",
+ " style",
+ "any",
+ "ALL",
+ "extreme",
+ " miniature",
+ " Optical",
+ " faint",
+ "~",
+ " Physical",
+ " REAL",
+ "*",
+ "virtual",
+ "TYPE",
+ " technical",
+ "otally",
+ " physic",
+ "Type",
+ "<",
+ "images",
+ "atic",
+ " sheer",
+ " Style",
+ " partial",
+ " natural",
+ "Hyper",
+ " Any",
+ " theoretical",
+ "|",
+ " ultimate",
+ "oing",
+ " constant",
+ "ANY",
+ "antically",
+ "ishly",
+ " ex",
+ " visual",
+ "special",
+ "omorphic",
+ "visual",
+ ],
+ explanation=" adjectives related to being real, or to physical properties and evidence",
+ ),
+ Example(
+ tokens=[
+ "cephal",
+ "aeus",
+ " coma",
+ "bered",
+ "abetes",
+ "inflamm",
+ "rugged",
+ "alysed",
+ "azine",
+ "hered",
+ "cells",
+ "aneously",
+ "fml",
+ "igm",
+ "culosis",
+ "iani",
+ "CTV",
+ "disabled",
+ "heric",
+ "ulo",
+ "geoning",
+ "awi",
+ "translation",
+ "iral",
+ "govtrack",
+ "mson",
+ "cloth",
+ "nesota",
+ " Dise",
+ " Lyme",
+ " dementia",
+ "agn",
+ " reversible",
+ " susceptibility",
+ "esthesia",
+ "orf",
+ " inflamm",
+ " Obesity",
+ " tox",
+ " Disorders",
+ "uberty",
+ "blind",
+ "ALTH",
+ "avier",
+ " Immunity",
+ " Hurt",
+ "ulet",
+ "ueless",
+ " sluggish",
+ "rosis",
+ ],
+ explanation=" words related to physical medical conditions",
+ ),
+ Example(
+ tokens=[
+ " January",
+ "terday",
+ "cember",
+ " April",
+ " July",
+ "September",
+ "December",
+ "Thursday",
+ "quished",
+ "November",
+ "Tuesday",
+ "uesday",
+ " Sept",
+ "ruary",
+ " March",
+ ";;;;;;;;;;;;",
+ " Monday",
+ "Wednesday",
+ " Saturday",
+ " Wednesday",
+ "Reloaded",
+ "aturday",
+ " August",
+ "Feb",
+ "Sunday",
+ "Reviewed",
+ "uggest",
+ " Dhabi",
+ "ACTED",
+ "tten",
+ "Year",
+ "August",
+ "alogue",
+ "MX",
+ " Janeiro",
+ "yss",
+ " Leilan",
+ " Fiscal",
+ " referen",
+ "semb",
+ "eele",
+ "wcs",
+ "detail",
+ "ertation",
+ " Reborn",
+ " Sunday",
+ "itially",
+ "aturdays",
+ " Dise",
+ "essage",
+ ],
+ explanation=" nouns related to time and dates",
+ ),
+]
+
+# Minimal example set returned by TokenSpaceFewShotExampleSet.TEST.get_examples().
+TEST_EXAMPLES = [
+ Example(
+ tokens=[
+ "these",
+ " are",
+ " tokens",
+ ],
+ explanation=" this is a test explanation",
+ ),
+]
diff --git a/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py b/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py
new file mode 100644
index 0000000..3ee0435
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py
@@ -0,0 +1,3 @@
+from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass
+
+__all__ = ["FastDataclass", "dumps", "loads", "register_dataclass"]
diff --git a/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py b/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py
new file mode 100644
index 0000000..592c1f9
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py
@@ -0,0 +1,85 @@
+# Utilities for dataclasses that are very fast to serialize and deserialize, with limited data
+# validation. Fields must not be tuples, since they get serialized and then deserialized as lists.
+#
+# The unit tests for this library show how to use it.
+
+import json
+from dataclasses import dataclass, field, fields, is_dataclass
+from functools import partial
+from typing import Any, Union
+
+import orjson
+
+# Registry of dataclasses keyed by class name; filled in by register_dataclass.
+dataclasses_by_name = {}
+# Registry keyed by the frozenset of a class's field names (excluding "dataclass_name"); used by
+# _object_hook to identify serialized objects that carry no "dataclass_name" entry.
+dataclasses_by_fieldnames = {}
+
+
+@dataclass
+class FastDataclass:
+ dataclass_name: str = field(init=False)
+
+ def __post_init__(self) -> None:
+ self.dataclass_name = self.__class__.__name__
+
+
+def register_dataclass(cls): # type: ignore
+ assert is_dataclass(cls), "Only dataclasses can be registered."
+ dataclasses_by_name[cls.__name__] = cls
+ name_set = frozenset(f.name for f in fields(cls) if f.name != "dataclass_name")
+ dataclasses_by_fieldnames[name_set] = cls
+ return cls
+
+
+def dumps(obj: Any) -> bytes:
+ return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY)
+
+
+def _object_hook(d: Any, backwards_compatible: bool = True) -> Any:
+ # If d is a list, recurse.
+ if isinstance(d, list):
+ return [_object_hook(x, backwards_compatible=backwards_compatible) for x in d]
+ # If d is not a dict, return it as is.
+ if not isinstance(d, dict):
+ return d
+ cls = None
+ if "dataclass_name" in d:
+ if d["dataclass_name"] in dataclasses_by_name:
+ cls = dataclasses_by_name[d["dataclass_name"]]
+ else:
+ assert backwards_compatible, (
+ f"Dataclass {d['dataclass_name']} not found, set backwards_compatible=True if you "
+ f"are okay with that."
+ )
+ # Load objects created without dataclass_name set.
+ else:
+ # Try our best to find a dataclass if backwards_compatible is True.
+ if backwards_compatible:
+ d_fields = frozenset(d.keys())
+ if d_fields in dataclasses_by_fieldnames:
+ cls = dataclasses_by_fieldnames[d_fields]
+ elif len(d_fields) > 0:
+ # Check if the fields are a subset of a dataclass (if the dataclass had extra fields
+ # added since the data was created). Note that this will fail if fields were removed
+ # from the dataclass.
+ for key, possible_cls in dataclasses_by_fieldnames.items():
+ if d_fields.issubset(key):
+ cls = possible_cls
+ break
+ else:
+ print(f"Could not find dataclass for {d_fields} {cls}")
+ new_d = {
+ k: _object_hook(v, backwards_compatible=backwards_compatible)
+ for k, v in d.items()
+ if k != "dataclass_name"
+ }
+ if cls is not None:
+ return cls(**new_d)
+ else:
+ return new_d
+
+
+def loads(s: Union[str, bytes], backwards_compatible: bool = True) -> Any:
+ return json.loads(
+ s,
+ object_hook=partial(_object_hook, backwards_compatible=backwards_compatible),
+ )
diff --git a/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py b/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py
new file mode 100644
index 0000000..8f7c2b5
--- /dev/null
+++ b/docs/src/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py
@@ -0,0 +1,83 @@
+from dataclasses import dataclass
+
+import pytest
+
+from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass
+
+
+# Inheritance is a bit tricky with our setup. dataclass_name must be set for instances of these
+# classes to serialize and deserialize correctly, but if it's given a default value, then subclasses
+# can't have any fields that don't have default values, because of how constructors are generated
+# for dataclasses (fields with no default value can't follow those with default values). To work
+# around this, we set dataclass_name in __post_init__ on the base class, which is called after the
+# constructor. The implementation does the right thing for both the base class and the subclass.
+# Leaf fixture: a registered dataclass with a single list-of-ints field.
+@register_dataclass
+@dataclass
+class DataclassC(FastDataclass):
+ ints: list[int]
+
+
+# Subclass fixture: DataclassC plus one extra field that has no default value.
+@register_dataclass
+@dataclass
+class DataclassC_ext(DataclassC):
+ s: str
+
+
+# Container fixture: holds DataclassC values both inside a dict and inside a list.
+@register_dataclass
+@dataclass
+class DataclassB(FastDataclass):
+ str_to_c: dict[str, DataclassC]
+ cs: list[DataclassC]
+
+
+# Top-level fixture: primitive lists plus a list of nested DataclassB objects.
+@register_dataclass
+@dataclass
+class DataclassA(FastDataclass):
+ floats: list[float]
+ strings: list[str]
+ bs: list[DataclassB]
+
+
+# Fixture with one required and one defaulted field; test_bad_serialized_data loads it from data
+# that contains only "s1".
+@register_dataclass
+@dataclass
+class DataclassD(FastDataclass):
+ s1: str
+ s2: str = "default"
+
+
+def test_dataclasses() -> None:
+ a = DataclassA(
+ floats=[1.0, 2.0],
+ strings=["a", "b"],
+ bs=[
+ DataclassB(
+ str_to_c={"a": DataclassC(ints=[1, 2]), "b": DataclassC(ints=[3, 4])},
+ cs=[DataclassC(ints=[5, 6]), DataclassC_ext(ints=[7, 8], s="s")],
+ ),
+ DataclassB(
+ str_to_c={"c": DataclassC_ext(ints=[9, 10], s="t"), "d": DataclassC(ints=[11, 12])},
+ cs=[DataclassC(ints=[13, 14]), DataclassC(ints=[15, 16])],
+ ),
+ ],
+ )
+ assert loads(dumps(a)) == a
+
+
+def test_c_and_c_ext() -> None:
+ c_ext = DataclassC_ext(ints=[3, 4], s="s")
+ assert loads(dumps(c_ext)) == c_ext
+
+ c = DataclassC(ints=[1, 2])
+ assert loads(dumps(c)) == c
+
+
+def test_bad_serialized_data() -> None:
+ assert type(loads(dumps(DataclassC(ints=[3, 4])))) == DataclassC
+ assert type(loads('{"ints": [3, 4]}', backwards_compatible=False)) == dict
+ assert type(loads('{"ints": [3, 4], "dataclass_name": "DataclassC"}')) == DataclassC
+ with pytest.raises(TypeError):
+ loads('{"ints": [3, 4], "bogus_extra_field": "foo", "dataclass_name": "DataclassC"}')
+ with pytest.raises(TypeError):
+ loads('{"ints_field_is_missing": [3, 4], "dataclass_name": "DataclassC"}')
+ assert type(loads('{"s1": "test"}', backwards_compatible=False)) == dict
+ assert type(loads('{"s1": "test"}', backwards_compatible=True)) == DataclassD
diff --git a/docs/src/neuron-explainer/setup.py b/docs/src/neuron-explainer/setup.py
new file mode 100644
index 0000000..91e52f3
--- /dev/null
+++ b/docs/src/neuron-explainer/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup, find_packages
+
+setup(
+ name="neuron_explainer",
+ packages=find_packages(),
+ version="0.0.1",
+ author="OpenAI",
+ install_requires=[
+ "httpx>=0.22",
+ "scikit-learn",
+ "boostedblob>=0.13.0",
+ "tiktoken",
+ "blobfile",
+ "numpy",
+ "pytest",
+ "orjson",
+ ],
+ url="",
+ description="",
+ python_requires='>=3.9',
+)
diff --git a/docs/src/neuron-viewer/README.md b/docs/src/neuron-viewer/README.md
new file mode 100644
index 0000000..8381ce0
--- /dev/null
+++ b/docs/src/neuron-viewer/README.md
@@ -0,0 +1,20 @@
+# Neuron viewer
+
+The easiest way to view neurons and explanations is using the
+[public website](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html).
+This directory contains the implementation of that website as well as lightweight servers that make
+it possible to run an alternative version of the website locally.
+
+## Local development
+
+Install:
+
+```npm install```
+
+Run the backend:
+
+```npm run startpy```
+
+Run the frontend:
+
+```npm start```
diff --git a/docs/src/neuron-viewer/python/server.py b/docs/src/neuron-viewer/python/server.py
new file mode 100644
index 0000000..9e9a26b
--- /dev/null
+++ b/docs/src/neuron-viewer/python/server.py
@@ -0,0 +1,51 @@
+# %%
+import logging
+
+from flask import Flask, request
+from flask_cors import CORS
+
+import json
+
+import urllib.request
+
+def load_az_json(url):
+ with urllib.request.urlopen(url) as f:
+ return json.load(f)
+
+def start(
+ dev: bool = False,
+ host_name: str = "0.0.0.0",
+ port: int = 80,
+):
+ app = Flask("interpretability chat")
+ app.logger.setLevel(logging.INFO)
+ # app.logger.disabled = True
+ CORS(app)
+
+ @app.after_request
+ def after_request(response):
+ response.headers.add("Access-Control-Allow-Origin", "*")
+ response.headers.add(
+ "Access-Control-Allow-Headers", "Content-Type,Authorization"
+ )
+ response.headers.add(
+ "Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS"
+ )
+ return response
+
+ @app.route("/load_az", methods=["GET", "POST"])
+ async def load_az():
+ args = request.get_json()
+ path = args["path"]
+ result = load_az_json(path)
+ return result
+
+ app.run(debug=dev, host=host_name, port=port, use_reloader=False)
+
+
+def main(dev: bool = True, host_name: str = "0.0.0.0", port: int = 8000):
+ start(dev=dev, host_name=host_name, port=port)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/src/neuron-viewer/src/App.jsx b/docs/src/neuron-viewer/src/App.jsx
new file mode 100644
index 0000000..97e5303
--- /dev/null
+++ b/docs/src/neuron-viewer/src/App.jsx
@@ -0,0 +1,17 @@
+import "./App.css"
+import Feed from "./feed"
+import React from "react"
+import { Routes, Route, HashRouter } from "react-router-dom"
+
+function App() {
+ return (
+
+
+ } />
+ } />
+
+
+ )
+}
+
+export default App
diff --git a/docs/src/neuron-viewer/src/feed.jsx b/docs/src/neuron-viewer/src/feed.jsx
new file mode 100644
index 0000000..cab824a
--- /dev/null
+++ b/docs/src/neuron-viewer/src/feed.jsx
@@ -0,0 +1,64 @@
+import * as Panes from "./panes"
+import React, { useEffect } from "react"
+import Welcome from "./welcome"
+import { useState } from "react"
+import { useParams, Link } from "react-router-dom"
+
+export default function Feed() {
+ const params = useParams()
+ // If params is missing either index, there's no neuron selected.
+ let activeNeuron;
+ if (params.layer === undefined || params.neuron === undefined) {
+ activeNeuron = null
+ } else {
+ // Grab the layer and neuron indices from the params, casting them to ints.
+ activeNeuron = {
+ "layer": parseInt(params.layer),
+ "neuron": parseInt(params.neuron),
+ }
+ }
+
+ const Pane = ({ children }) => (
+