Fixed example accounting glitch
main.py CHANGED

@@ -33,7 +33,7 @@ def parse_arguments() -> dict:
     """
     parser = argparse.ArgumentParser(description="Parse command-line arguments for this model.")
     parser.add_argument("--batch_size", type=int, default=40, help="Batch size used in training.")
-    parser.add_argument("--checkpoint_every_n_tokens", type=int, default=
+    parser.add_argument("--checkpoint_every_n_tokens", type=int, default=500_000_000, help="Save a checkpoint of the model every n tokens processed.")
     parser.add_argument("--d_model", type=int, default=512, help="Hidden size of the model.")
     parser.add_argument("--dropout", type=float, default=0.1, help="Probability of dropout.")
     parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate for the optimiser.")

@@ -96,7 +96,9 @@ def train(config: OsSoluConfig, model: OsSoluModel, train_dataloader: DataLoader
             optimiser.step()

             wandb.log(dict(train_loss=loss, elapsed=time.time() - start_time), step=examples_seen)
-
+
+            # Number of tokens processed is batch_size * sequence_length.
+            examples_seen += batch.numel()

             # Save a checkpoint of the model.
             if examples_seen % config.checkpoint_every_n_tokens == 0:

@@ -168,11 +170,10 @@ def setup() -> Tuple[OsSoluConfig, OsSoluModel]:
     train_dataset = ds["train"]
     test_dataset = ds["test"]

-    # TODO: tokenise the data before sending it to the model.
     tokeniser = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
     tokeniser.add_special_tokens({"pad_token": "<PAD>"})

-    train_dataset = train_dataset.map(lambda x: tokenise(x, tokeniser), batched=True).with_format("torch")
+    train_dataset = train_dataset.map(lambda x: tokenise(x, tokeniser, 1, config.max_positional_embeddings), batched=True).with_format("torch")
    test_dataset = test_dataset.map(tokenise, batched=True).with_format("torch")

     train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size)

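For context on the new accounting: each training batch is a (batch_size, sequence_length) tensor of token IDs, so batch.numel() is exactly the number of tokens processed that step, and examples_seen now advances in tokens rather than batches. A minimal sketch of the arithmetic, assuming the default --batch_size of 40 and an illustrative sequence length of 512 (the sequence length is an assumption for the example, not read from the config):

import torch

# Illustrative shapes only: 40 matches the --batch_size default; the
# sequence length of 512 is assumed, not taken from the repository.
batch = torch.randint(0, 50_000, (40, 512))  # fake batch of token IDs

# Number of tokens processed is batch_size * sequence_length.
tokens_this_step = batch.numel()
assert tokens_this_step == 40 * 512  # 20_480 tokens per optimiser step

At that rate the default checkpoint_every_n_tokens of 500_000_000 corresponds to roughly 24,400 optimiser steps between checkpoints, assuming those shapes.
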
utils.py CHANGED

@@ -42,7 +42,7 @@ class OsSoluConfig:
        self.self_attention_type = args["self_attention_type"]
        self.vocab_size = args["vocab_size"]

-def tokenise(batch, tokeniser, num_gpus: int
+def tokenise(batch, tokeniser, num_gpus: int, context_length: int):
    """Tokenise a batch of text data. This implementation is idiosyncratic to the Pile dataset, but can be easily modified to work with e.g. C4. Code from Neel.

    Args: