diff --git a/model.py b/model.py
index cc7f2db32e..715c606b75 100644
--- a/model.py
+++ b/model.py
@@ -15,17 +15,6 @@
 import torch.nn as nn
 from torch.nn import functional as F
 
-class LayerNorm(nn.Module):
-    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
-
-    def __init__(self, ndim, bias):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(ndim))
-        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
-
-    def forward(self, input):
-        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
-
 class CausalSelfAttention(nn.Module):
 
     def __init__(self, config):
@@ -84,9 +73,9 @@ class Block(nn.Module):
 
     def __init__(self, config):
         super().__init__()
-        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.attn = CausalSelfAttention(config)
-        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)
 
     def forward(self, x):
@@ -118,7 +107,7 @@ def __init__(self, config):
             wpe = nn.Embedding(config.block_size, config.n_embd),
             drop = nn.Dropout(config.dropout),
             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
-            ln_f = LayerNorm(config.n_embd, bias=config.bias),
+            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
         ))
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         # with weight tying when using torch.compile() some warnings get generated:
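
Note: this change relies on nn.LayerNorm accepting bias=False, which newer PyTorch releases (2.1+) support; on older versions the removed custom class is still needed. A minimal sanity check, assuming PyTorch >= 2.1 (the embedding size 8 below is an arbitrary stand-in, not taken from the diff):

import torch
import torch.nn as nn
from torch.nn import functional as F

x = torch.randn(2, 4, 8)

# what the removed custom LayerNorm computed with bias=False
weight = torch.ones(8)
ref = F.layer_norm(x, weight.shape, weight, None, 1e-5)

# the built-in module the diff switches to
ln = nn.LayerNorm(8, bias=False)
print(torch.allclose(ln(x), ref))  # expected: True

Since both the removed class and nn.LayerNorm register their parameters under the names weight and bias, existing checkpoints should load unchanged.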