Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

README.md +244 -0
chat_template.jinja +48 -0
config.json +63 -0
generation_config.json +7 -0
model.safetensors +3 -0
special_tokens_map.json +40 -0
tiktoken.model +3 -0
tokenizer_config.json +164 -0

README.md ADDED Viewed

	@@ -0,0 +1,244 @@

+---
+library_name: transformers
+pipeline_tag: text-generation
+inference: true
+widget:
+  - text: Hello!
+    example_title: Hello world
+    group: Python
+base_model:
+- moonshotai/Kimi-Linear-48B-A3B-Instruct
+---
+This tiny model is intended for debugging only. Its weights are randomly initialized, and its configuration is adapted from [moonshotai/Kimi-Linear-48B-A3B-Instruct](https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct).
+### Example usage:
+- vLLM
+```bash
+vllm serve yujiepan/kimi-linear-tiny-random --trust-remote-code
+```
+- Transformers
+```python
+# tested on transformers==4.57.1
+import torch
+import transformers
+model_id = "yujiepan/kimi-linear-tiny-random"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    dtype=torch.bfloat16,
+    device_map="cuda",
+    trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+messages = [
+    {"role": "system", "content": "You are a helpful assistant provided by Moonshot-AI."},
+    {"role": "user", "content": "Is 123 a prime?"}
+]
+input_ids = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    tokenize=True,
+).to(model.device)
+print(input_ids)
+generated_ids = model.generate(inputs=input_ids, max_new_tokens=500)
+response = tokenizer.batch_decode(generated_ids)[0]
+print(response)
+```
+### Code to create this repo:
+```python
+import json
+from pathlib import Path
+import accelerate
+import torch
+from huggingface_hub import file_exists, hf_hub_download
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoProcessor,
+    AutoTokenizer,
+    GenerationConfig,
+    set_seed,
+)
+source_model_id = "moonshotai/Kimi-Linear-48B-A3B-Instruct"  # upstream repo whose tokenizer and config are reused
+save_folder = "/tmp/yujiepan/kimi-linear-tiny-random"  # local output directory for the tiny model
+Path(save_folder).mkdir(parents=True, exist_ok=True)
+tokenizer = AutoTokenizer.from_pretrained(
+    source_model_id, trust_remote_code=True)
+tokenizer.save_pretrained(save_folder)  # write the tokenizer files into the output folder
+with open(hf_hub_download(source_model_id, filename='tokenizer_config.json', repo_type='model'), 'r', encoding='utf-8') as f:
+    tokenizer_config_json = json.load(f)  # start from the upstream tokenizer_config.json
+tokenizer_config_json['auto_map']['AutoTokenizer'][0] = f'{source_model_id}--' + \
+    tokenizer_config_json["auto_map"]["AutoTokenizer"][0]  # prefix the tokenizer class path with "<source repo>--"
+with open(f"{save_folder}/tokenizer_config.json", "w", encoding='utf-8') as f:
+    json.dump(tokenizer_config_json, f, indent=2)  # replaces any tokenizer_config.json already in the output folder
+# hf_hub_download(source_model_id, filename='tiktoken.model', repo_type='model',
+#                 local_dir=save_folder, local_dir_use_symlinks=True, cache_dir='/tmp/')
+with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
+    config_json = json.load(f)
+for k, v in config_json['auto_map'].items():
+    config_json['auto_map'][k] = f'{source_model_id}--{v}'  # same "<source repo>--" prefix for every auto_map entry
+config_json.update({  # override the upstream values with tiny dimensions
+    "head_dim": 32,
+    "hidden_size": 8,
+    "intermediate_size": 32,
+    "linear_attn_config": {
+        "full_attn_layers": [4],
+        "head_dim": 32,
+        "kda_layers": [1, 2, 3],
+        "num_heads": 8,
+        "short_conv_kernel_size": 4,
+    },
+    "num_attention_heads": 8,
+    "num_key_value_heads": 8,
+    "moe_intermediate_size": 32,
+    "num_hidden_layers": 5,
+})
+with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+    json.dump(config_json, f, indent=2)  # write the modified config into the output folder
+config = AutoConfig.from_pretrained(
+    save_folder,
+    trust_remote_code=True,
+)
+print(config)
+torch.set_default_dtype(torch.bfloat16)  # default dtype is bfloat16 while the model is built
+model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+torch.set_default_dtype(torch.float32)  # set the default dtype back to float32
+if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):  # only when the source repo has one
+    model.generation_config = GenerationConfig.from_pretrained(
+        source_model_id, trust_remote_code=True,
+    )
+set_seed(42)  # fixed seed ahead of the random initialization below
+model = model.cpu()
+n_parms = sum(p.numel() for p in model.parameters())  # total parameter count, used for the percentages printed below
+with torch.no_grad():
+    for name, p in sorted(model.named_parameters()):
+        torch.nn.init.normal_(p, 0, 0.1)  # overwrite the parameter in place: normal(mean=0, std=0.1)
+        print(name, p.shape, (p.numel() / n_parms * 100), '%')  # name, shape, share of all parameters
+model.save_pretrained(save_folder)  # save the model into the output folder
+with open(f"{save_folder}/config.json", "r", encoding='utf-8') as f:
+    config_json = json.load(f)
+    config_json['auto_map'] = {k: f'{source_model_id}--' + v.split(
+        '--')[-1] for k, v in config_json['auto_map'].items()}  # drop any existing "--" prefix, then add the source repo prefix
+with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+    json.dump(config_json, f, indent=2)
+for python_file in Path(save_folder).glob('*.py'):
+    python_file.unlink()  # delete every *.py file in the output folder
+```
+### Printing the model:
+```text
+KimiLinearForCausalLM(
+  (model): KimiLinearModel(
+    (embed_tokens): Embedding(163840, 8, padding_idx=163839)
+    (layers): ModuleList(
+      (0): KimiDecoderLayer(
+        (self_attn): KimiDeltaAttention(
+          (q_proj): Linear(in_features=8, out_features=256, bias=False)
+          (k_proj): Linear(in_features=8, out_features=256, bias=False)
+          (v_proj): Linear(in_features=8, out_features=256, bias=False)
+          (q_conv1d): ShortConvolution(256, 256, kernel_size=(4,), stride=(1,), padding=(3,), groups=256, bias=False, activation=silu, backend=triton)
+          (k_conv1d): ShortConvolution(256, 256, kernel_size=(4,), stride=(1,), padding=(3,), groups=256, bias=False, activation=silu, backend=triton)
+          (v_conv1d): ShortConvolution(256, 256, kernel_size=(4,), stride=(1,), padding=(3,), groups=256, bias=False, activation=silu, backend=triton)
+          (f_a_proj): Linear(in_features=8, out_features=32, bias=False)
+          (f_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (b_proj): Linear(in_features=8, out_features=8, bias=False)
+          (g_a_proj): Linear(in_features=8, out_features=32, bias=False)
+          (g_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (o_norm): FusedRMSNormGated(32, eps=1e-05, activation=sigmoid)
+          (o_proj): Linear(in_features=256, out_features=8, bias=False)
+        )
+        (mlp): KimiMLP(
+          (gate_proj): Linear(in_features=8, out_features=32, bias=False)
+          (up_proj): Linear(in_features=8, out_features=32, bias=False)
+          (down_proj): Linear(in_features=32, out_features=8, bias=False)
+          (act_fn): SiLUActivation()
+        )
+        (input_layernorm): KimiRMSNorm()
+        (post_attention_layernorm): KimiRMSNorm()
+      )
+      (1-2): 2 x KimiDecoderLayer(
+        (self_attn): KimiDeltaAttention(
+          (q_proj): Linear(in_features=8, out_features=256, bias=False)
+          (k_proj): Linear(in_features=8, out_features=256, bias=False)
+          (v_proj): Linear(in_features=8, out_features=256, bias=False)
+          (q_conv1d): ShortConvolution(256, 256, kernel_size=(4,), stride=(1,), padding=(3,), groups=256, bias=False, activation=silu, backend=triton)
+          (k_conv1d): ShortConvolution(256, 256, kernel_size=(4,), stride=(1,), padding=(3,), groups=256, bias=False, activation=silu, backend=triton)
+          (v_conv1d): ShortConvolution(256, 256, kernel_size=(4,), stride=(1,), padding=(3,), groups=256, bias=False, activation=silu, backend=triton)
+          (f_a_proj): Linear(in_features=8, out_features=32, bias=False)
+          (f_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (b_proj): Linear(in_features=8, out_features=8, bias=False)
+          (g_a_proj): Linear(in_features=8, out_features=32, bias=False)
+          (g_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (o_norm): FusedRMSNormGated(32, eps=1e-05, activation=sigmoid)
+          (o_proj): Linear(in_features=256, out_features=8, bias=False)
+        )
+        (block_sparse_moe): KimiSparseMoeBlock(
+          (experts): ModuleList(
+            (0-255): 256 x KimiBlockSparseMLP(
+              (w1): Linear(in_features=8, out_features=32, bias=False)
+              (w2): Linear(in_features=32, out_features=8, bias=False)
+              (w3): Linear(in_features=8, out_features=32, bias=False)
+              (act_fn): SiLUActivation()
+            )
+          )
+          (gate): KimiMoEGate()
+          (shared_experts): KimiMLP(
+            (gate_proj): Linear(in_features=8, out_features=32, bias=False)
+            (up_proj): Linear(in_features=8, out_features=32, bias=False)
+            (down_proj): Linear(in_features=32, out_features=8, bias=False)
+            (act_fn): SiLUActivation()
+          )
+        )
+        (input_layernorm): KimiRMSNorm()
+        (post_attention_layernorm): KimiRMSNorm()
+      )
+      (3-4): 2 x KimiDecoderLayer(
+        (self_attn): KimiMLAAttention(
+          (q_proj): Linear(in_features=8, out_features=1536, bias=False)
+          (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
+          (kv_a_layernorm): KimiRMSNorm()
+          (kv_b_proj): Linear(in_features=512, out_features=2048, bias=False)
+          (o_proj): Linear(in_features=1024, out_features=8, bias=False)
+        )
+        (block_sparse_moe): KimiSparseMoeBlock(
+          (experts): ModuleList(
+            (0-255): 256 x KimiBlockSparseMLP(
+              (w1): Linear(in_features=8, out_features=32, bias=False)
+              (w2): Linear(in_features=32, out_features=8, bias=False)
+              (w3): Linear(in_features=8, out_features=32, bias=False)
+              (act_fn): SiLUActivation()
+            )
+          )
+          (gate): KimiMoEGate()
+          (shared_experts): KimiMLP(
+            (gate_proj): Linear(in_features=8, out_features=32, bias=False)
+            (up_proj): Linear(in_features=8, out_features=32, bias=False)
+            (down_proj): Linear(in_features=32, out_features=8, bias=False)
+            (act_fn): SiLUActivation()
+          )
+        )
+        (input_layernorm): KimiRMSNorm()
+        (post_attention_layernorm): KimiRMSNorm()
+      )
+    )
+    (norm): KimiRMSNorm()
+  )
+  (lm_head): Linear(in_features=8, out_features=163840, bias=False)
+)
+```

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,48 @@

+{% macro render_content(msg) -%}
+    {%- set c = msg.get('content') -%}
+    {%- if c is string -%}
+      {{ c }}
+    {%- elif c is not none -%}
+      {% for content in c -%}
+        {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+          <|media_start|>image<|media_content|><|media_pad|><|media_end|>
+        {% else -%}
+          {{ content['text'] }}
+        {%- endif -%}
+      {%- endfor -%}
+    {%- endif -%}
+{%- endmacro %}
+{%- if tools -%}
+  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
+{%- endif -%}
+{% for message in messages %}
+  {%- set role_name =  message.get('name') or  message['role'] -%}
+  {%- if message['role'] == 'user' -%}
+    <|im_user|>{{role_name}}<|im_middle|>
+  {%- elif message['role'] == 'assistant' -%}
+    <|im_assistant|>{{role_name}}<|im_middle|>
+  {%- else -%}
+    <|im_system|>{{role_name}}<|im_middle|>
+  {%- endif -%}
+  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
+    {{render_content(message)}}<|tool_calls_section_begin|>
+    {%- for tool_call in message['tool_calls'] -%}
+        {%- set formatted_id = tool_call['id'] -%}
+      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
+    {%- endfor -%}
+    <|tool_calls_section_end|>
+  {%- elif message['role'] == 'tool' -%}
+    {%- set tool_call_id = message.tool_call_id -%}
+    ## Return of {{ tool_call_id }}
+{{render_content(message)}}
+  {%- elif message['content'] is not none -%}
+    {{render_content(message)}}
+  {%- endif -%}
+  <|im_end|>
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+  <|im_assistant|>assistant<|im_middle|>
+{%- endif -%}

config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "architectures": [
+    "KimiLinearForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "moonshotai/Kimi-Linear-48B-A3B-Instruct--configuration_kimi.KimiLinearConfig",
+    "AutoModel": "moonshotai/Kimi-Linear-48B-A3B-Instruct--modeling_kimi.KimiLinearModel",
+    "AutoModelForCausalLM": "moonshotai/Kimi-Linear-48B-A3B-Instruct--modeling_kimi.KimiLinearForCausalLM"
+  },
+  "bos_token_id": 163584,
+  "dtype": "bfloat16",
+  "eos_token_id": 163586,
+  "first_k_dense_replace": 1,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 8,
+  "initializer_range": 0.02,
+  "intermediate_size": 32,
+  "kv_lora_rank": 512,
+  "linear_attn_config": {
+    "full_attn_layers": [
+      4
+    ],
+    "head_dim": 32,
+    "kda_layers": [
+      1,
+      2,
+      3
+    ],
+    "num_heads": 8,
+    "short_conv_kernel_size": 4
+  },
+  "mla_use_nope": true,
+  "model_max_length": 1048576,
+  "model_type": "kimi_linear",
+  "moe_intermediate_size": 32,
+  "moe_layer_freq": 1,
+  "moe_renormalize": true,
+  "moe_router_activation_func": "sigmoid",
+  "num_attention_heads": 8,
+  "num_expert_group": 1,
+  "num_experts": 256,
+  "num_experts_per_token": 8,
+  "num_hidden_layers": 5,
+  "num_key_value_heads": 8,
+  "num_nextn_predict_layers": 0,
+  "num_shared_experts": 1,
+  "pad_token_id": 163839,
+  "q_lora_rank": null,
+  "qk_nope_head_dim": 128,
+  "qk_rope_head_dim": 64,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "routed_scaling_factor": 2.446,
+  "tie_word_embeddings": false,
+  "topk_group": 1,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "use_grouped_topk": true,
+  "v_head_dim": 128,
+  "vocab_size": 163840
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 163584,
+  "eos_token_id": 163586,
+  "pad_token_id": 163839,
+  "transformers_version": "4.57.1"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3094fee183ef74dbef84a89e274909566dcd5e98fadaa345e74cdc92e271671a
+size 11691928

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_user|>",
+    "<|im_assistant|>",
+    "<|start_header_id|>",
+    "<|end_header_id|>",
+    "[EOT]",
+    "<|im_system|>",
+    "<|im_middle|>"
+  ],
+  "bos_token": {
+    "content": "[BOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[EOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tiktoken.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
+size 2795286

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,164 @@

+{
+  "added_tokens_decoder": {
+    "163584": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163585": {
+      "content": "[EOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163586": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163587": {
+      "content": "<|im_user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163588": {
+      "content": "<|im_assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163590": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163591": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163593": {
+      "content": "[EOT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163594": {
+      "content": "<|im_system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163595": {
+      "content": "<|tool_calls_section_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163596": {
+      "content": "<|tool_calls_section_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163597": {
+      "content": "<|tool_call_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163598": {
+      "content": "<|tool_call_argument_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163599": {
+      "content": "<|tool_call_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163601": {
+      "content": "<|im_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163838": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163839": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_user|>",
+    "<|im_assistant|>",
+    "<|start_header_id|>",
+    "<|end_header_id|>",
+    "[EOT]",
+    "<|im_system|>",
+    "<|im_middle|>"
+  ],
+  "bos_token": "[BOS]",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "[EOS]",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "tokenizer_class": "TikTokenTokenizer",
+  "unk_token": "[UNK]",
+  "auto_map": {
+    "AutoTokenizer": [
+      "moonshotai/Kimi-Linear-48B-A3B-Instruct--tokenization_kimi.TikTokenTokenizer",
+      null
+    ]
+  }
+}