Transformers

pszemraj committed (verified) · commit a8dfac9 · 0 parent(s)

Super-squash branch 'main' using huggingface_hub
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,30 @@
+ ---
+ license: mit
+ library_name: transformers
+ ---
+
+ # tiktoken `cl100k_base` as an HF MLM tokenizer
+
+ Based on `RobertaTokenizerFast`, with the MLM special tokens (`<s>`, `</s>`, `<unk>`, `<pad>`, `<mask>`) appended after the original cl100k_base vocabulary (IDs 100277-100281).
+
+ ```py
+ from transformers import AutoTokenizer
+
+ repo_id = "BEE-spoke-data/cl100k_base-mlm"
+ tk = AutoTokenizer.from_pretrained(repo_id)
+ len(tk)
+ # 100266
+ ```
+
+ A quick check that it does what it should:
+
+ ```py
+ input_text = "i love memes"
+ tokenized_ids = tk.encode(input_text)
+ decoded_tokens = tk.convert_ids_to_tokens(tokenized_ids)
+
+ print(f"for input '{input_text}' -> {tokenized_ids} -> {decoded_tokens}")
+ # for input 'i love memes' -> [100277, 72, 3021, 62277, 100278] -> ['<s>', 'i', 'Ġlove', 'Ġmemes', '</s>']
+ ```
+
+ ---
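
Since the card presents this as an MLM tokenizer, here is a small usage sketch for the `<mask>` token when building masked-LM inputs. This is illustrative and not part of the commit; the IDs come from `added_tokens.json`, and the expected outputs are hedged.

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base-mlm")

# The MLM special tokens sit at the end of the vocab (see added_tokens.json)
print(tk.mask_token, tk.mask_token_id)  # <mask> 100281
print(tk.pad_token, tk.pad_token_id)    # <pad> 100280

# A manually masked input, as one might build for masked-LM pretraining
ids = tk("i love <mask>")["input_ids"]
print(tk.convert_ids_to_tokens(ids))
# expected: ['<s>', 'i', 'Ġlove', '<mask>', '</s>'] (the mask token has lstrip=True)
```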
added_tokens.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "</s>": 100278,
+ "<mask>": 100281,
+ "<pad>": 100280,
+ "<s>": 100277,
+ "<unk>": 100279
+ }
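
The five MLM specials are appended right after cl100k_base's own reserved tokens (the highest of those, `<|endofprompt|>`, is 100276). A quick, illustrative way to confirm this mapping against the loaded tokenizer:

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base-mlm")

expected = {"</s>": 100278, "<mask>": 100281, "<pad>": 100280, "<s>": 100277, "<unk>": 100279}
for token, idx in expected.items():
    assert tk.convert_tokens_to_ids(token) == idx, (token, idx)
print("added_tokens.json matches the loaded tokenizer")
```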
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
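
The map assigns `<s>` to both `bos_token`/`cls_token` and `</s>` to both `eos_token`/`sep_token`, RoBERTa-style. A short illustrative sketch of what that implies for single and paired inputs (exact ids depend on the repo above):

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base-mlm")

single = tk("first sentence")["input_ids"]
pair = tk("first sentence", "second sentence")["input_ids"]

print(tk.convert_ids_to_tokens(single))  # <s> ... </s>
print(tk.convert_ids_to_tokens(pair))    # RoBERTa-style pair: <s> A </s></s> B </s>
```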
tiktoken-to-hf-masked.ipynb ADDED
@@ -0,0 +1,430 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZypJVeIMFQGQ"
+ },
+ "outputs": [],
+ "source": [
+ "import tiktoken\n",
+ "from transformers.models.roberta.tokenization_roberta import bytes_to_unicode\n",
+ "from typing import Dict, Optional\n",
+ "\n",
+ "byte_encoder = bytes_to_unicode()\n",
+ "\n",
+ "\n",
+ "def token_bytes_to_string(b):\n",
+ " return \"\".join([byte_encoder[ord(char)] for char in b.decode(\"latin-1\")])\n",
+ "\n",
+ "\n",
+ "# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960\n",
+ "def bpe(\n",
+ " mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None\n",
+ ") -> list[bytes]:\n",
+ " parts = [bytes([b]) for b in token]\n",
+ " while True:\n",
+ " min_idx = None\n",
+ " min_rank = None\n",
+ " for i, pair in enumerate(zip(parts[:-1], parts[1:])):\n",
+ " rank = mergeable_ranks.get(pair[0] + pair[1])\n",
+ " if rank is not None and (min_rank is None or rank < min_rank):\n",
+ " min_idx = i\n",
+ " min_rank = rank\n",
+ " if min_rank is None or (max_rank is not None and min_rank >= max_rank):\n",
+ " break\n",
+ " assert min_idx is not None\n",
+ " parts = (\n",
+ " parts[:min_idx]\n",
+ " + [parts[min_idx] + parts[min_idx + 1]]\n",
+ " + parts[min_idx + 2 :]\n",
+ " )\n",
+ " return parts\n",
+ "\n",
+ "\n",
+ "def generate_vocab_and_merges(encoder):\n",
+ " mergeable_ranks = encoder._mergeable_ranks\n",
+ "\n",
+ " merges = []\n",
+ " vocab = {}\n",
+ " for token, rank in mergeable_ranks.items():\n",
+ " vocab[token_bytes_to_string(token)] = rank\n",
+ "\n",
+ " if len(token) == 1:\n",
+ " continue\n",
+ " merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))\n",
+ " assert len(merged) == 2\n",
+ "\n",
+ " merges.append(\" \".join(map(token_bytes_to_string, merged)))\n",
+ "\n",
+ " # Also add special tokens\n",
+ " vocab.update(encoder._special_tokens)\n",
+ "\n",
+ " return vocab, merges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enc = tiktoken.get_encoding(\"cl100k_base\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "O87Zz6Vzhb5C"
+ },
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "from transformers import GPT2TokenizerFast, AutoTokenizer\n",
+ "\n",
+ "\n",
+ "# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n",
+ "\n",
+ "MODEL_INFO = {\n",
+ " # GPT-2 and GPT-3 models (r50k_base)\n",
+ " \"gpt2\": {\n",
+ " \"tokenizer_class\": \"GPT2Tokenizer\",\n",
+ " \"model_max_length\": 1024,\n",
+ " },\n",
+ " \"davinci\": { # (gpt-3)\n",
+ " \"tokenizer_class\": \"GPT3Tokenizer\",\n",
+ " \"model_max_length\": 2048,\n",
+ " },\n",
+ " # GPT-3.5 and GPT-4 models (cl100k_base)\n",
+ " \"gpt-3.5-turbo\": {\n",
+ " \"tokenizer_class\": \"GPT3_5Tokenizer\",\n",
+ " \"model_max_length\": 4096,\n",
+ " },\n",
+ " \"gpt-3.5-turbo-16k\": {\n",
+ " \"tokenizer_class\": \"GPT3_5Tokenizer\",\n",
+ " \"model_max_length\": 16384,\n",
+ " },\n",
+ " \"gpt-4\": {\n",
+ " \"tokenizer_class\": \"GPT4Tokenizer\",\n",
+ " \"model_max_length\": 8192,\n",
+ " },\n",
+ " \"cl100k_base\": {\n",
+ " \"tokenizer_class\": \"RobertaTokenizer\",\n",
+ " \"model_max_length\": 8192,\n",
+ " },\n",
+ " \"text-embedding-ada-002\": {\n",
+ " \"tokenizer_class\": \"GPT4Tokenizer\",\n",
+ " \"model_max_length\": 8192,\n",
+ " },\n",
+ " # Codex models (p50k_base)\n",
+ " \"text-davinci-002\": {\n",
+ " \"tokenizer_class\": \"CodexTokenizer\",\n",
+ " \"model_max_length\": 4096,\n",
+ " },\n",
+ " \"text-davinci-003\": {\n",
+ " \"tokenizer_class\": \"CodexTokenizer\",\n",
+ " \"model_max_length\": 4096,\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def convert_tiktoken(model_name, output_dir=None):\n",
+ " if output_dir is None:\n",
+ " output_dir = model_name\n",
+ "\n",
+ " encoder = tiktoken.get_encoding(\"cl100k_base\")\n",
+ "\n",
+ " vocab, merges = generate_vocab_and_merges(encoder)\n",
+ "\n",
+ " added_tokens = [\n",
+ " {\n",
+ " \"id\": id,\n",
+ " \"content\": content,\n",
+ " \"single_word\": False,\n",
+ " \"lstrip\": False,\n",
+ " \"rstrip\": False,\n",
+ " \"normalized\": False,\n",
+ " \"special\": True,\n",
+ " }\n",
+ " for content, id in encoder._special_tokens.items()\n",
+ " ]\n",
+ " # original_mlm_specials = [\n",
+ " # {\n",
+ " # \"id\": 0,\n",
+ " # \"content\": \"<s>\",\n",
+ " # \"single_word\": False,\n",
+ " # \"lstrip\": False,\n",
+ " # \"rstrip\": False,\n",
+ " # \"normalized\": True,\n",
+ " # \"special\": True,\n",
+ " # },\n",
+ " # {\n",
+ " # \"id\": 1,\n",
+ " # \"content\": \"<pad>\",\n",
+ " # \"single_word\": False,\n",
+ " # \"lstrip\": False,\n",
+ " # \"rstrip\": False,\n",
+ " # \"normalized\": True,\n",
+ " # \"special\": True,\n",
+ " # },\n",
+ " # {\n",
+ " # \"id\": 2,\n",
+ " # \"content\": \"</s>\",\n",
+ " # \"single_word\": False,\n",
+ " # \"lstrip\": False,\n",
+ " # \"rstrip\": False,\n",
+ " # \"normalized\": True,\n",
+ " # \"special\": True,\n",
+ " # },\n",
+ " # {\n",
+ " # \"id\": 3,\n",
+ " # \"content\": \"<unk>\",\n",
+ " # \"single_word\": False,\n",
+ " # \"lstrip\": False,\n",
+ " # \"rstrip\": False,\n",
+ " # \"normalized\": True,\n",
+ " # \"special\": True,\n",
+ " # },\n",
+ " # {\n",
+ " # \"id\": 50264,\n",
+ " # \"content\": \"<mask>\",\n",
+ " # \"single_word\": False,\n",
+ " # \"lstrip\": True,\n",
+ " # \"rstrip\": False,\n",
+ " # \"normalized\": False,\n",
+ " # \"special\": True,\n",
+ " # },\n",
+ " # ]\n",
+ " # added_tokens.extend(original_mlm_specials)\n",
+ "\n",
+ " # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json\n",
+ " tokenizer_template = {\n",
+ " \"version\": \"1.0\",\n",
+ " \"truncation\": None,\n",
+ " \"padding\": None,\n",
+ " \"added_tokens\": added_tokens,\n",
+ " \"normalizer\": None,\n",
+ " \"pre_tokenizer\": {\n",
+ " \"type\": \"ByteLevel\",\n",
+ " \"add_prefix_space\": False,\n",
+ " \"trim_offsets\": True,\n",
+ " \"use_regex\": True,\n",
+ " },\n",
+ " \"post_processor\": {\n",
+ " \"type\": \"RobertaProcessing\",\n",
+ " \"sep\": [\"</s>\", 2], # YOU HAVE TO UPDATE THIS TO THE ACTUAL ID!!\n",
+ " \"cls\": [\"<s>\", 0], # TODO: add auto-fixing for this\n",
+ " \"trim_offsets\": True,\n",
+ " \"add_prefix_space\": False,\n",
+ " },\n",
+ " \"decoder\": {\n",
+ " \"type\": \"ByteLevel\",\n",
+ " \"add_prefix_space\": True,\n",
+ " \"trim_offsets\": True,\n",
+ " \"use_regex\": True,\n",
+ " },\n",
+ " \"model\": {\n",
+ " \"type\": \"BPE\",\n",
+ " \"dropout\": None,\n",
+ " \"unk_token\": None,\n",
+ " \"continuing_subword_prefix\": \"\",\n",
+ " \"end_of_word_suffix\": \"\",\n",
+ " \"fuse_unk\": False,\n",
+ " \"byte_fallback\": False,\n",
+ " \"vocab\": vocab,\n",
+ " \"merges\": merges,\n",
+ " },\n",
+ " }\n",
+ "\n",
+ " # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json\n",
+ "\n",
+ " tokenizer_config_template = {\n",
+ " \"add_prefix_space\": False,\n",
+ " \"bos_token\": \"<s>\",\n",
+ " \"clean_up_tokenization_spaces\": True,\n",
+ " \"cls_token\": \"<s>\",\n",
+ " \"eos_token\": \"</s>\",\n",
+ " \"errors\": \"replace\",\n",
+ " \"mask_token\": \"<mask>\",\n",
+ " \"pad_token\": \"<pad>\",\n",
+ " \"sep_token\": \"</s>\",\n",
+ " \"trim_offsets\": True,\n",
+ " \"unk_token\": \"<unk>\",\n",
+ " }\n",
+ " tokenizer_config_template.update(\n",
+ " MODEL_INFO[model_name]\n",
+ " ) # Adds `model_max_length` and `tokenizer_class`\n",
+ " tokenizer_config_template = dict(\n",
+ " sorted(tokenizer_config_template.items(), key=lambda x: x[0])\n",
+ " )\n",
+ "\n",
+ " # special tokens map\n",
+ " MLM_specials = {\n",
+ " \"bos_token\": \"<s>\",\n",
+ " \"cls_token\": \"<s>\",\n",
+ " \"eos_token\": \"</s>\",\n",
+ " \"mask_token\": {\n",
+ " \"content\": \"<mask>\",\n",
+ " \"lstrip\": True,\n",
+ " \"normalized\": False,\n",
+ " \"rstrip\": False,\n",
+ " \"single_word\": False,\n",
+ " },\n",
+ " \"pad_token\": \"<pad>\",\n",
+ " \"sep_token\": \"</s>\",\n",
+ " \"unk_token\": \"<unk>\",\n",
+ " }\n",
+ " os.makedirs(output_dir, exist_ok=True)\n",
+ "\n",
+ " # Save to files\n",
+ " with open(os.path.join(output_dir, \"vocab.json\"), \"w\", encoding=\"utf-8\") as fp:\n",
+ " json.dump(vocab, fp, indent=2, ensure_ascii=False)\n",
+ "\n",
+ " with open(os.path.join(output_dir, \"tokenizer.json\"), \"w\", encoding=\"utf-8\") as fp:\n",
+ " json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False)\n",
+ "\n",
+ " with open(\n",
+ " os.path.join(output_dir, \"tokenizer_config.json\"), \"w\", encoding=\"utf-8\"\n",
+ " ) as fp:\n",
+ " json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False)\n",
+ "\n",
+ " with open(\n",
+ " os.path.join(output_dir, \"special_tokens_map.json\"), \"w\", encoding=\"utf-8\"\n",
+ " ) as fp:\n",
+ " json.dump(\n",
+ " MLM_specials,\n",
+ " fp,\n",
+ " indent=2,\n",
+ " ensure_ascii=False,\n",
+ " )\n",
+ "\n",
+ " with open(os.path.join(output_dir, \"merges.txt\"), \"w\", encoding=\"utf-8\") as fp:\n",
+ " fp.write(\"#version: 0.2\\n\")\n",
+ " fp.write(\"\\n\".join(merges))\n",
+ "\n",
+ " # load with autotokenizer and rewrite\n",
+ " tk = AutoTokenizer.from_pretrained(output_dir)\n",
+ " tk.save_pretrained(output_dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "\n",
+ "output_dir = \"tiktoken-tokenizers\"\n",
+ "outpath = Path.cwd() / output_dir / \"cl100k_base-as-roberta\"\n",
+ "convert_tiktoken(\"cl100k_base\", outpath)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "100266"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from pathlib import Path\n",
+ "from transformers import GPT2TokenizerFast, AutoTokenizer\n",
+ "\n",
+ "\n",
+ "tk = AutoTokenizer.from_pretrained(outpath)\n",
+ "len(tk)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[100277, 72, 3021, 62277, 100278] ['<s>', 'i', 'Ġlove', 'Ġmemes', '</s>']\n"
+ ]
+ }
+ ],
+ "source": [
+ "input_text = \"i love memes\"\n",
+ "tokenized_ids = tk.encode(input_text)\n",
+ "decoded_tokens = tk.convert_ids_to_tokens(tokenized_ids)\n",
+ "\n",
+ "print(tokenized_ids, decoded_tokens)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "100277"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tk.cls_token_id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
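
The notebook validates the conversion with `len(tk)` and a single encode call. A slightly broader check against tiktoken itself could look like the sketch below (not part of the commit; the local path is the `outpath` used in the notebook). Exact agreement on every string is not guaranteed, since the exported `ByteLevel` pre-tokenizer keeps the GPT-2 split regex rather than cl100k_base's own pattern, so the loop reports any divergences instead of asserting equality.

```py
import tiktoken
from transformers import AutoTokenizer

enc = tiktoken.get_encoding("cl100k_base")
# path written by convert_tiktoken() above
tk = AutoTokenizer.from_pretrained("tiktoken-tokenizers/cl100k_base-as-roberta")

for text in ["i love memes", "hello world", "Numbers 1234567 and CamelCase"]:
    hf_ids = tk.encode(text, add_special_tokens=False)
    tt_ids = enc.encode(text)
    status = "match" if hf_ids == tt_ids else "MISMATCH"
    print(f"{status}: {text!r} -> hf={hf_ids} tiktoken={tt_ids}")
```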
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "100257": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100258": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100259": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100260": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100276": {
+ "content": "<|endofprompt|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100277": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100278": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100279": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100280": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100281": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "errors": "replace",
+ "mask_token": "<mask>",
+ "model_max_length": 8192,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "tokenizer_class": "RobertaTokenizer",
+ "trim_offsets": true,
+ "unk_token": "<unk>"
+ }
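
With `model_max_length` set to 8192 and a dedicated `<pad>` token, batched encoding with padding and truncation works as usual; a brief illustrative sketch (assumes PyTorch is installed for `return_tensors="pt"`):

```py
from transformers import AutoTokenizer

tk = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base-mlm")
print(tk.model_max_length)  # 8192

batch = tk(
    ["a short example", "a slightly longer example sentence"],
    padding=True,       # pads with <pad> (id 100280)
    truncation=True,    # caps sequences at model_max_length unless max_length is set
    return_tensors="pt",
)
print(batch["input_ids"].shape)
```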
vocab.json ADDED
The diff for this file is too large to render. See raw diff