bug fix - remove attn.bias keys from GPT state dict in 'from_pretrained' #122

Open · wants to merge 1 commit into master
mingpt/model.py: 3 changes (2 additions & 1 deletion)
@@ -187,6 +187,7 @@ def from_pretrained(cls, model_type):
         config.block_size = 1024 # openai's model block_size
         model = GPT(config)
         sd = model.state_dict()
+        keys_sd = [k for k in sd if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

         # init a huggingface/transformers model
         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
@@ -197,7 +197,7 @@ def from_pretrained(cls, model_type):
         transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
         # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
         # this means that we have to transpose these weights when we import them
-        assert len(keys) == len(sd)
+        assert len(keys) == len(keys_sd)
         for k in keys:
             if any(k.endswith(w) for w in transposed):
                 # special treatment for the Conv1D weights we need to transpose
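
A note on why the fix is needed: minGPT registers the causal mask in each attention block as a buffer named bias (via register_buffer), and PyTorch serializes buffers into state_dict() right alongside trainable parameters. The keys list (built from the Hugging Face checkpoint earlier in from_pretrained, outside this hunk) does not include those mask entries, so the original assert len(keys) == len(sd) compared mismatched counts. Below is a minimal, self-contained sketch of the mechanism; the Tiny* module names are made up for illustration.

import torch
import torch.nn as nn

class TinyAttn(nn.Module):
    def __init__(self, block_size=4, n_embd=8):
        super().__init__()
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        self.c_proj = nn.Linear(n_embd, n_embd)
        # the causal mask is registered as a buffer named "bias"; buffers are
        # saved in state_dict() even though they are not trainable parameters
        self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size)))

class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = TinyAttn()

class TinyGPT(nn.Module):
    def __init__(self, n_layer=2):
        super().__init__()
        self.h = nn.ModuleList(TinyBlock() for _ in range(n_layer))

sd = TinyGPT().state_dict()
# sd now contains 'h.0.attn.bias' and 'h.1.attn.bias' next to the real weights
keys_sd = [k for k in sd if not k.endswith('.attn.bias')]

# the suffix match is precise: 'h.0.attn.c_attn.bias' (a genuine Linear bias)
# ends with 'c_attn.bias', not '.attn.bias', so real parameters are kept
assert len(sd) - len(keys_sd) == 2  # exactly one mask buffer dropped per layer

Comparing keys against keys_sd, the parameter-only view of the state dict, restores the one-to-one correspondence the copy loop relies on.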
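
On the Conv1D comment in the second hunk: Hugging Face's GPT-2 uses its own Conv1D module, which computes x @ weight + bias and therefore stores weight as (in_features, out_features), the transpose of nn.Linear's (out_features, in_features) layout. A small illustration of the import step, using a randomly generated stand-in for a checkpoint tensor:

import torch
import torch.nn as nn

# nn.Linear stores weight as (out_features, in_features) ...
lin = nn.Linear(8, 24)
print(lin.weight.shape)       # torch.Size([24, 8])

# ... while a Conv1D-style checkpoint stores the transpose, (in, out)
w_ckpt = torch.randn(8, 24)   # stand-in for a Conv1D weight from the checkpoint

# importing into nn.Linear therefore needs a transpose, as in the loop above
with torch.no_grad():
    lin.weight.copy_(w_ckpt.t())
assert lin.weight.shape == (24, 8)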