Upload folder using huggingface_hub

- amplify_te.py +11 -2
- config.json +2 -1
- model.safetensors +2 -2
amplify_te.py CHANGED

@@ -34,6 +34,7 @@ class AMPLIFYConfig(PretrainedConfig):
         layer_norm_after_embedding: bool = False,
         layer_norm_before_last_layer: bool = True,
         vocab_size: int = 27,
+        padded_vocab_size: int = 32,
         ffn_bias: bool = False,
         att_bias: bool = False,
         pad_token_id: int = 0,
@@ -56,6 +57,7 @@ class AMPLIFYConfig(PretrainedConfig):
         layer_norm_after_embedding (bool): Whether to use layer normalization after the embedding.
         layer_norm_before_last_layer (bool): Whether to use layer normalization before the last layer.
         vocab_size (int): The vocabulary size of the model.
+        padded_vocab_size (int): The padded vocabulary size of the model to support fp8.
         ffn_bias (bool): Whether to use bias in the feedforward network.
         att_bias (bool): Whether to use bias in the attention.
         pad_token_id (int): The padding token id.
@@ -77,11 +79,16 @@ class AMPLIFYConfig(PretrainedConfig):
         self.layer_norm_after_embedding = layer_norm_after_embedding
         self.layer_norm_before_last_layer = layer_norm_before_last_layer
         self.vocab_size = vocab_size
+        self.padded_vocab_size = padded_vocab_size
         self.ffn_bias = ffn_bias
         self.att_bias = att_bias
         self.pad_token_id = pad_token_id
         self.max_length = max_length

+        assert self.padded_vocab_size >= self.vocab_size, (
+            "padded_vocab_size must be greater than or equal to vocab_size"
+        )
+

 class AMPLIFYPreTrainedModel(PreTrainedModel):
     """AMPLIFY pre-trained model."""
@@ -114,7 +121,7 @@ class AMPLIFY(AMPLIFYPreTrainedModel):
         self.config = config

         self.encoder = nn.Embedding(
-            config.vocab_size,
+            config.padded_vocab_size,
             config.hidden_size,
             padding_idx=config.pad_token_id,
             dtype=config.torch_dtype,
@@ -245,7 +252,7 @@ class AMPLIFYForMaskedLM(AMPLIFYPreTrainedModel):
         if config.layer_norm_before_last_layer:
             self.decoder = transformer_engine.pytorch.LayerNormLinear(
                 config.hidden_size,
-                config.vocab_size,
+                config.padded_vocab_size,
                 config.norm_eps,
                 params_dtype=config.torch_dtype,
                 normalization="RMSNorm" if config.rms_norm else "LayerNorm",
@@ -292,6 +299,8 @@ class AMPLIFYForMaskedLM(AMPLIFYPreTrainedModel):

         # Classification head with layer norm
         logits = self.decoder(outputs.last_hidden_state)
+        if self.config.padded_vocab_size != self.config.vocab_size:
+            logits = logits[:, :, : self.config.vocab_size]

         if labels is not None:
             loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
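The substantive change above is the padded vocabulary: the token embedding and the decoder head are now sized to padded_vocab_size (32) instead of vocab_size (27), and the forward pass slices the extra logit columns off before the loss. Below is a minimal sketch of that trick in plain PyTorch, with toy sizes and nn.Linear standing in for transformer_engine.pytorch.LayerNormLinear; it is an illustration, not the repository's actual code.

# Minimal sketch: pad the embedding/decoder dimensions so the GEMM sizes are
# fp8-friendly, then slice the padded logit columns off before the loss.
import torch
import torch.nn as nn

vocab_size, padded_vocab_size, hidden_size = 27, 32, 64   # toy sizes

encoder = nn.Embedding(padded_vocab_size, hidden_size, padding_idx=0)
decoder = nn.Linear(hidden_size, padded_vocab_size)        # stand-in for LayerNormLinear

tokens = torch.randint(0, vocab_size, (2, 10))             # real ids never exceed vocab_size
hidden = encoder(tokens)
logits = decoder(hidden)                                   # shape (2, 10, padded_vocab_size)

# Same trick as the new forward(): drop the padding columns so downstream code
# only ever sees vocab_size logits.
if padded_vocab_size != vocab_size:
    logits = logits[:, :, :vocab_size]

labels = torch.randint(0, vocab_size, (2, 10))
loss = nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
print(logits.shape, loss.item())                           # torch.Size([2, 10, 27]) ...

Because token ids and labels never index into the padded columns, the sliced logits and the resulting loss are what an unpadded 27-way head with the same first 27 rows of weights would produce.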
config.json CHANGED

@@ -28,10 +28,11 @@
   "num_hidden_layers": 32,
   "other_special_token_ids": null,
   "pad_token_id": 0,
+  "padded_vocab_size": 32,
   "pre_activation_layer_norm": true,
   "rms_norm": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.53.2",
   "unk_token_id": 1,
   "vocab_path": "conf/tokenizer/amplify_vocab.txt",
   "vocab_size": 27
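For completeness, a quick sanity check of the new config field. Reading "config.json" from the current directory is an assumption, and the multiple-of-16 check reflects the usual Transformer Engine fp8 alignment requirement rather than anything stated in this commit.

# Verify the padded vocabulary settings written to config.json.
import json

with open("config.json") as f:
    cfg = json.load(f)

assert cfg["padded_vocab_size"] >= cfg["vocab_size"]       # mirrors the new assert in AMPLIFYConfig
assert cfg["padded_vocab_size"] % 16 == 0                   # fp8 GEMMs typically want multiples of 16
print(cfg["vocab_size"], "->", cfg["padded_vocab_size"])    # 27 -> 32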
model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:821e89362bf8b393963d70dfd70a4b60f0bb6a83f485785526531f8bb8f26060
+size 1416116824