from tokenizers import ByteLevelBPETokenizer
import os

paths = ['train_code.txt', 'train_doc.txt']

# Initialize a byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train on the corpus files; special tokens are added first, so they receive IDs 0-4 in the order listed
tokenizer.train(files=paths, vocab_size=32000, min_frequency=3, special_tokens=[
    "<pad>",
    "<s>",
    "</s>",
    "<unk>",
    "<mask>"
])

# Save the vocabulary and merges files (codet5-vocab.json / codet5-merges.txt) to disk;
# the target directory must exist before save_model is called
os.makedirs("./salesforce", exist_ok=True)
tokenizer.save_model("./salesforce", "codet5")

# Sanity check: encode a string mixing special tokens, plain text, and an emoji
print(
    tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)
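To use the trained tokenizer in a later session, it can be reloaded from the saved files. The snippet below is a minimal sketch that assumes the ./salesforce output directory and the codet5 file prefix from the script above (i.e., codet5-vocab.json and codet5-merges.txt).

# Reload the trained tokenizer from the saved vocab/merges files
# (paths assume the directory and prefix used in the training script above)
from tokenizers import ByteLevelBPETokenizer

loaded = ByteLevelBPETokenizer(
    "./salesforce/codet5-vocab.json",
    "./salesforce/codet5-merges.txt",
)

# Tokenize a small code snippet with the reloaded vocabulary
print(loaded.encode("def add(a, b): return a + b").tokens)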