Instructions to use google/gemma-scope-9b-pt-res with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- SAELens
How to use google/gemma-scope-9b-pt-res with SAELens:
```python
# pip install sae-lens
from sae_lens import SAE

sae, cfg_dict, sparsity = SAE.from_pretrained(
    release = "RELEASE_ID", # e.g., "gpt2-small-res-jb". See other options in https://github.com/jbloomAus/SAELens/blob/main/sae_lens/pretrained_saes.yaml
    sae_id = "SAE_ID", # e.g., "blocks.8.hook_resid_pre". Won't always be a hook point
)
```
- Notebooks
- Google Colab
- Kaggle
Removing SAEs with LR != 7e-5 (#7)
Browse files
- 9B: Add sparsity lambdas for residual stream and clean up the feature splitting suite so that we only have SAEs with learning rate 7e-5 (745af8a812b0a8946168208fe98402b1474225f9)
This view is limited to 50 files because it contains too many changes. See raw diff
- layer_0/width_131k/average_l0_11/hparams.json +1 -0
- layer_0/width_131k/average_l0_15/hparams.json +1 -0
- layer_0/width_131k/average_l0_21/hparams.json +1 -0
- layer_0/width_131k/average_l0_30/hparams.json +1 -0
- layer_0/width_131k/average_l0_41/hparams.json +1 -0
- layer_0/width_131k/average_l0_8/hparams.json +1 -0
- layer_0/width_16k/average_l0_11/hparams.json +1 -0
- layer_0/width_16k/average_l0_129/hparams.json +1 -0
- layer_0/width_16k/average_l0_17/hparams.json +1 -0
- layer_0/width_16k/average_l0_35/hparams.json +1 -0
- layer_0/width_16k/average_l0_68/hparams.json +1 -0
- layer_1/width_131k/average_l0_13/hparams.json +1 -0
- layer_1/width_131k/average_l0_20/hparams.json +1 -0
- layer_1/width_131k/average_l0_33/hparams.json +1 -0
- layer_1/width_131k/average_l0_56/hparams.json +1 -0
- layer_1/width_131k/average_l0_6/hparams.json +1 -0
- layer_1/width_131k/average_l0_9/hparams.json +1 -0
- layer_1/width_16k/average_l0_15/hparams.json +1 -0
- layer_1/width_16k/average_l0_175/hparams.json +1 -0
- layer_1/width_16k/average_l0_31/hparams.json +1 -0
- layer_1/width_16k/average_l0_69/hparams.json +1 -0
- layer_1/width_16k/average_l0_9/hparams.json +1 -0
- layer_10/width_131k/average_l0_15/hparams.json +1 -0
- layer_10/width_131k/average_l0_151/hparams.json +1 -0
- layer_10/width_131k/average_l0_27/hparams.json +1 -0
- layer_10/width_131k/average_l0_47/hparams.json +1 -0
- layer_10/width_131k/average_l0_84/hparams.json +1 -0
- layer_10/width_131k/average_l0_9/hparams.json +1 -0
- layer_10/width_16k/average_l0_10/hparams.json +1 -0
- layer_10/width_16k/average_l0_113/hparams.json +1 -0
- layer_10/width_16k/average_l0_17/hparams.json +1 -0
- layer_10/width_16k/average_l0_243/hparams.json +1 -0
- layer_10/width_16k/average_l0_31/hparams.json +1 -0
- layer_10/width_16k/average_l0_57/hparams.json +1 -0
- layer_11/width_131k/average_l0_16/hparams.json +1 -0
- layer_11/width_131k/average_l0_162/hparams.json +1 -0
- layer_11/width_131k/average_l0_27/hparams.json +1 -0
- layer_11/width_131k/average_l0_49/hparams.json +1 -0
- layer_11/width_131k/average_l0_88/hparams.json +1 -0
- layer_11/width_131k/average_l0_9/hparams.json +1 -0
- layer_11/width_16k/average_l0_10/hparams.json +1 -0
- layer_11/width_16k/average_l0_118/hparams.json +1 -0
- layer_11/width_16k/average_l0_18/hparams.json +1 -0
- layer_11/width_16k/average_l0_255/hparams.json +1 -0
- layer_11/width_16k/average_l0_32/hparams.json +1 -0
- layer_11/width_16k/average_l0_60/hparams.json +1 -0
- layer_12/width_131k/average_l0_10/hparams.json +1 -0
- layer_12/width_131k/average_l0_17/hparams.json +1 -0
- layer_12/width_131k/average_l0_183/hparams.json +1 -0
- layer_12/width_131k/average_l0_29/hparams.json +1 -0
layer_0/width_131k/average_l0_11/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_0/width_131k/average_l0_15/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_0/width_131k/average_l0_21/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_0/width_131k/average_l0_30/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_0/width_131k/average_l0_41/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_0/width_131k/average_l0_8/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_0/width_16k/average_l0_11/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_0/width_16k/average_l0_129/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_0/width_16k/average_l0_17/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_0/width_16k/average_l0_35/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_0/width_16k/average_l0_68/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_1/width_131k/average_l0_13/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_1/width_131k/average_l0_20/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_1/width_131k/average_l0_33/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_1/width_131k/average_l0_56/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_1/width_131k/average_l0_6/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_1/width_131k/average_l0_9/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_1/width_16k/average_l0_15/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_1/width_16k/average_l0_175/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_1/width_16k/average_l0_31/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_1/width_16k/average_l0_69/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_1/width_16k/average_l0_9/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_10/width_131k/average_l0_15/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_10/width_131k/average_l0_151/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_10/width_131k/average_l0_27/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_10/width_131k/average_l0_47/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_10/width_131k/average_l0_84/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_10/width_131k/average_l0_9/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_10/width_16k/average_l0_10/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_10/width_16k/average_l0_113/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_10/width_16k/average_l0_17/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_10/width_16k/average_l0_243/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_10/width_16k/average_l0_31/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_10/width_16k/average_l0_57/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_11/width_131k/average_l0_16/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_11/width_131k/average_l0_162/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_11/width_131k/average_l0_27/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_11/width_131k/average_l0_49/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_11/width_131k/average_l0_88/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_11/width_131k/average_l0_9/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_11/width_16k/average_l0_10/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_11/width_16k/average_l0_118/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.00025}
|
layer_11/width_16k/average_l0_18/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_11/width_16k/average_l0_255/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_11/width_16k/average_l0_32/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|
layer_11/width_16k/average_l0_60/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.0005}
|
layer_12/width_131k/average_l0_10/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.004}
|
layer_12/width_131k/average_l0_17/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.002}
|
layer_12/width_131k/average_l0_183/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.000125}
|
layer_12/width_131k/average_l0_29/hparams.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sparsity_lambda": 0.001}
|