Commit
Merge remote-tracking branch 'origin/main' into generalizable-multi-gpu
# Conflicts:
#	elk/extraction/extraction.py
thejaminator committed May 3, 2023
2 parents 69bbf64 + 8ba18c3 commit 051e2fd
Showing 12 changed files with 290 additions and 136 deletions.
elk/evaluation/evaluate.py (2 changes: 1 addition & 1 deletion)

@@ -40,7 +40,7 @@ def apply_to_layer(
experiment_dir = elk_reporter_dir() / self.source

reporter_path = experiment_dir / "reporters" / f"layer_{layer}.pt"
- reporter: Reporter = torch.load(reporter_path, map_location=device)
+ reporter = Reporter.load(reporter_path, map_location=device)
reporter.eval()

row_bufs = defaultdict(list)
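A note on the change above: after this commit, reporters are plausibly written to disk as plain state dicts (see `CcsReporter.save` in `elk/training/ccs_reporter.py` below), so a raw `torch.load` would return a bare `dict` rather than a usable module. A hedged sketch of the distinction; the body of `Reporter.load` is not shown in this diff:

```python
# Before: the checkpoint was a pickled Reporter module, so torch.load sufficed.
reporter = torch.load(reporter_path, map_location=device)

# After: the checkpoint is (by assumption) a state dict, so the classmethod
# rebuilds the module before loading the weights into it.
reporter = Reporter.load(reporter_path, map_location=device)
reporter.eval()  # inference mode, as in apply_to_layer
```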
elk/extraction/extraction.py (53 changes: 40 additions & 13 deletions)

@@ -2,7 +2,7 @@
import logging
import os
from dataclasses import InitVar, dataclass, replace
- from itertools import islice, zip_longest
+ from itertools import zip_longest
from typing import Any, Iterable, Literal
from warnings import filterwarnings

@@ -30,7 +30,7 @@
Color,
assert_type,
colorize,
- float32_to_int16,
+ float_to_int16,
infer_label_column,
infer_num_classes,
instantiate_tokenizer,
@@ -102,6 +102,11 @@ class Extract(Serializable):
case of encoder-decoder models."""

def __post_init__(self, layer_stride: int):
+ if len(self.datasets) == 0:
+ raise ValueError(
+ "Must specify at least one dataset to extract hiddens from."
+ )

if len(self.max_examples) > 2:
raise ValueError(
"max_examples should be a list of length 0, 1, or 2,"
@@ -177,6 +182,7 @@ def extract_hiddens(
cfg.model, truncation_side="left", verbose=is_verbose
)


is_enc_dec = model.config.is_encoder_decoder
if is_enc_dec and cfg.use_encoder_states:
assert hasattr(model, "get_encoder") and callable(model.get_encoder)
@@ -202,13 +208,25 @@
layer_indices = cfg.layers or tuple(range(model.config.num_hidden_layers + 1))

global_max_examples = cfg.max_examples[0 if split_type == "train" else 1]

# break `max_examples` among the processes roughly equally
max_examples = global_max_examples // world_size
+ max_length = assert_type(int, tokenizer.model_max_length)

+ # Keep track of the number of examples we've yielded so far. We can't do something
+ # clean like `islice` the dataset, because we skip examples that are too long, and
+ # we can't predict how many of those there will be.
+ num_yielded = 0

# the last process gets the remainder (which is usually small)
if rank == world_size - 1:
max_examples += global_max_examples % world_size

- for example in islice(prompt_ds, max_examples):
+ for example in prompt_ds:
+ # Check if we've yielded enough examples
+ if num_yielded >= max_examples:
+ break

num_variants = len(example["prompts"])
num_choices = len(example["prompts"][0])

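The sharding arithmetic and manual counting above are the heart of the multi-GPU generalization: each rank takes an equal share of the global example budget, the last rank absorbs the remainder, and yields are counted by hand because over-length examples get skipped, which rules out `islice`. A self-contained sketch of the same scheme (function names here are illustrative, not from the repo):

```python
from typing import Iterable, Iterator

def shard_quota(global_max: int, rank: int, world_size: int) -> int:
    # Equal share per process; the last rank gets the (usually small) remainder,
    # so the per-rank quotas always sum to global_max.
    quota = global_max // world_size
    if rank == world_size - 1:
        quota += global_max % world_size
    return quota

def take_within_length(texts: Iterable[str], quota: int, max_length: int) -> Iterator[str]:
    # Count yields explicitly: we can't islice the stream, because we don't
    # know in advance how many items will be skipped for being too long.
    num_yielded = 0
    for text in texts:
        if num_yielded >= quota:
            break
        if len(text) > max_length:
            continue
        num_yielded += 1
        yield text
```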
@@ -240,19 +258,14 @@

# Only feed question, not the answer, to the encoder for enc-dec models
target = choice["answer"] if is_enc_dec else None

- # Record the EXACT question we fed to the model
- variant_questions.append(text)
encoding = tokenizer(
text,
# Keep [CLS] and [SEP] for BERT-style models
add_special_tokens=True,
return_tensors="pt",
text_target=target, # type: ignore[arg-type]
- truncation=True,
).to(first_device)
input_ids = assert_type(Tensor, encoding.input_ids)

if is_enc_dec:
answer = assert_type(Tensor, encoding.labels)
else:
@@ -263,11 +276,14 @@
return_tensors="pt",
).to(first_device)
answer = assert_type(Tensor, encoding2.input_ids)

input_ids = torch.cat([input_ids, answer], dim=-1)
- if max_len := tokenizer.model_max_length:
- cur_len = input_ids.shape[-1]
- input_ids = input_ids[..., -min(cur_len, max_len) :]

+ # If this input is too long, skip it
+ if input_ids.shape[-1] > max_length:
+ break
+ else:
+ # Record the EXACT question we fed to the model
+ variant_questions.append(text)

# Make sure we only pass the arguments that the model expects
inputs = dict(input_ids=input_ids.long())
@@ -307,10 +323,20 @@
raise ValueError(f"Invalid token_loc: {cfg.token_loc}")

for layer_idx, hidden in zip(layer_indices, hiddens):
hidden_dict[f"hidden_{layer_idx}"][i, j] = float32_to_int16(hidden)
hidden_dict[f"hidden_{layer_idx}"][i, j] = float_to_int16(hidden)

+ # We skipped a pseudolabel because it was too long; break out of this whole
+ # example and move on to the next one
+ if len(variant_questions) != num_choices:
+ break

+ # Usual case: we have the expected number of pseudolabels
text_questions.append(variant_questions)

+ # We skipped a variant because it was too long; move on to the next example
+ if len(text_questions) != num_variants:
+ continue

out_record: dict[str, Any] = dict(
label=example["label"],
variant_ids=example["template_names"],
@@ -320,6 +346,7 @@ def extract_hiddens(
if has_lm_preds:
out_record["model_logits"] = lm_logits

+ num_yielded += 1
yield out_record


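Elsewhere in this file, `float32_to_int16` becomes `float_to_int16`, a helper for storing hidden states compactly. Its implementation is not part of this diff; a plausible sketch of the underlying trick is to downcast to `float16` and reinterpret the raw bits as `int16`, since Arrow-backed HF datasets cannot store `float16` directly:

```python
import torch

def float_to_int16(x: torch.Tensor) -> torch.Tensor:
    # Downcast any float dtype to float16, then reinterpret the bits as int16
    # (same item size), so values survive formats without float16 support.
    downcast = x.to(torch.float16)
    if not downcast.isfinite().all():
        raise ValueError("Non-finite values after float16 downcast")
    return downcast.view(torch.int16)

def int16_to_float32(x: torch.Tensor) -> torch.Tensor:
    # Inverse: reinterpret the int16 bits as float16, then upcast for compute.
    return x.view(torch.float16).float()
```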
elk/run.py (13 changes: 8 additions & 5 deletions)

@@ -13,6 +13,7 @@
import torch.multiprocessing as mp
import yaml
from simple_parsing.helpers import Serializable, field
+ from simple_parsing.helpers.serialization import save
from torch import Tensor
from tqdm import tqdm

@@ -37,12 +38,14 @@ class Run(ABC, Serializable):
"""Directory to save results to. If None, a directory will be created
automatically."""

- datasets: list[DatasetDictWithName] = field(default_factory=list, init=False)
+ datasets: list[DatasetDictWithName] = field(
+ default_factory=list, init=False, to_dict=False
+ )
"""Datasets containing hidden states and labels for each layer."""

concatenated_layer_offset: int = 0
debug: bool = False
- min_gpu_mem: int | None = None
+ min_gpu_mem: int | None = None  # in bytes
num_gpus: int = -1
out_dir: Path | None = None
disable_cache: bool = field(default=False, to_dict=False)
@@ -78,9 +81,9 @@ def execute(
print(f"Output directory at \033[1m{self.out_dir}\033[0m")
self.out_dir.mkdir(parents=True, exist_ok=True)

- path = self.out_dir / "cfg.yaml"
- with open(path, "w") as f:
- self.dump_yaml(f)
+ # save_dc_types really ought to be the default... We simply can't load
+ # properly without this flag enabled.
+ save(self, self.out_dir / "cfg.yaml", save_dc_types=True)

path = self.out_dir / "fingerprints.yaml"
with open(path, "w") as meta_f:
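The `save_dc_types=True` flag makes `simple_parsing` record each dataclass's concrete type in the YAML, which is what lets a `Run` subclass round-trip instead of deserializing as the base class. A hedged usage sketch; the `load` call is an assumed counterpart, not part of this diff:

```python
from simple_parsing.helpers.serialization import load, save

save(run, out_dir / "cfg.yaml", save_dc_types=True)  # writes type tags into the YAML
restored = load(type(run), out_dir / "cfg.yaml")     # reconstructs the right subclass
```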
elk/training/ccs_reporter.py (19 changes: 17 additions & 2 deletions)

@@ -3,6 +3,7 @@
import math
from copy import deepcopy
from dataclasses import dataclass, field
+ from pathlib import Path
from typing import Literal, Optional, cast

import torch
@@ -59,7 +60,6 @@ class CcsReporterConfig(ReporterConfig):
loss_dict: dict[str, float] = field(default_factory=dict, init=False)
num_layers: int = 1
pre_ln: bool = False
- seed: int = 42
supervised_weight: float = 0.0

lr: float = 1e-2
@@ -68,6 +68,10 @@
optimizer: Literal["adam", "lbfgs"] = "lbfgs"
weight_decay: float = 0.01

+ @classmethod
+ def reporter_class(cls) -> type[Reporter]:
+ return CcsReporter

def __post_init__(self):
self.loss_dict = parse_loss(self.loss)

@@ -94,6 +98,11 @@ def __init__(
):
super().__init__()
self.config = cfg
+ self.in_features = in_features

+ # Learnable Platt scaling parameters
+ self.bias = nn.Parameter(torch.zeros(1, device=device, dtype=dtype))
+ self.scale = nn.Parameter(torch.ones(1, device=device, dtype=dtype))

hidden_size = cfg.hidden_size or 4 * in_features // 3

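The new `scale` and `bias` parameters above are learnable Platt scaling: the probe's raw score f(x) is affinely recalibrated to scale * f(x) + bias before the sigmoid, improving calibration without touching the probe itself. In `raw_forward` below this shows up as `probe(x).mul(scale).add(bias)`. A minimal standalone sketch of the pattern (the class name is illustrative):

```python
import torch
from torch import nn

class PlattScaled(nn.Module):
    """Wrap any scalar-output scorer with two calibration parameters."""

    def __init__(self, scorer: nn.Module):
        super().__init__()
        self.scorer = scorer
        self.bias = nn.Parameter(torch.zeros(1))
        self.scale = nn.Parameter(torch.ones(1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # scale * f(x) + bias; a downstream BCEWithLogitsLoss supplies the sigmoid
        return self.scorer(x).mul(self.scale).add(self.bias)
```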
@@ -239,7 +248,7 @@ def forward(self, x: Tensor) -> Tensor:

def raw_forward(self, x: Tensor) -> Tensor:
"""Apply the probe to the provided input, without normalization."""
- return self.probe(x).squeeze(-1)
+ return self.probe(x).mul(self.scale).add(self.bias).squeeze(-1)

def loss(
self,
@@ -401,3 +410,9 @@ def closure():

optimizer.step(closure)
return float(loss)

+ def save(self, path: Path | str) -> None:
+ """Save the reporter to a file."""
+ state = {k: v.cpu() for k, v in self.state_dict().items()}
+ state.update(in_features=self.in_features)
+ torch.save(state, path)
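`save` writes a CPU state dict plus `in_features`, the one constructor argument needed to rebuild the module. A matching loader on the class might look like the sketch below; the actual `Reporter.load` used in `evaluate.py` above is not shown in this diff:

```python
@classmethod
def load(cls, path: Path | str, map_location: str = "cpu") -> "CcsReporter":
    state = torch.load(path, map_location=map_location)
    in_features = state.pop("in_features")  # stashed alongside the weights by save()
    reporter = cls(CcsReporterConfig(), in_features)
    reporter.load_state_dict(state)
    return reporter
```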
