Upload 7 files
Browse files- README.md +82 -3
- adapter_config.json +37 -0
- adapter_model.safetensors +3 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +47 -0
README.md
CHANGED
|
@@ -1,3 +1,82 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- code
|
| 5 |
+
library_name: peft
|
| 6 |
+
tags:
|
| 7 |
+
- llm2vec
|
| 8 |
+
- mntp
|
| 9 |
+
- decoder-only
|
| 10 |
+
- pre-training
|
| 11 |
+
- mistral
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## 📖 Are Decoder-Only Large Language Models the Silver Bullet for Code Search?
|
| 15 |
+
|
| 16 |
+
This model is an official artifact from our research paper: **"[Are Decoder-Only Large Language Models the Silver Bullet for Code Search?](https://arxiv.org/abs/2410.22240)"**.
|
| 17 |
+
|
| 18 |
+
In this work, we conduct a large-scale systematic evaluation of decoder-only Large Language Models for the task of code search and present a set of effective fine-tuning and optimization strategies.
|
| 19 |
+
|
| 20 |
+
For complete details on all our experiments, to reproduce the full training/evaluation pipeline, or to use other models from the paper, please visit our official GitHub repository:
|
| 21 |
+
|
| 22 |
+
➡️ **[GitHub: Georgepitt/DecoderLLMs-CodeSearch](https://github.com/Georgepitt/DecoderLLMs-CodeSearch)**
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## Model Card: CodeMistral-7B-It - MNTP Pre-trained Model
|
| 27 |
+
|
| 28 |
+
### 📜 Model Description
|
| 29 |
+
|
| 30 |
+
This is a PEFT adapter for the **`TheBloke/Mistral-7B-Instruct-v0.2-code-ft-GGUF`** model, pre-trained with the **Masked Next Token Prediction (MNTP)** objective from the [llm2vec](https://github.com/McGill-NLP/llm2vec) framework.
|
| 31 |
+
|
| 32 |
+
**Important Note on its Role**:
|
| 33 |
+
This model is **not intended for direct downstream task evaluation**. Instead, it serves as a crucial **foundational prerequisite** for our supervised fine-tuned (SupCon) models. The MNTP pre-training enables the decoder-only model to learn bidirectional representations, which is an essential step before applying supervised contrastive learning.
|
| 34 |
+
|
| 35 |
+
### 🚀 How to Use
|
| 36 |
+
|
| 37 |
+
#### Standalone Use (for Base Embeddings)
|
| 38 |
+
|
| 39 |
+
You can use this MNTP model by itself to generate base text or code embeddings.
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
|
| 43 |
+
from peft import PeftModel
|
| 44 |
+
from llm2vec import LLM2Vec
|
| 45 |
+
|
| 46 |
+
base_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-code-ft-GGUF"
|
| 47 |
+
mntp_model_id = "SYSUSELab/DCS-CodeMistral-7B-It-MNTP"
|
| 48 |
+
|
| 49 |
+
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
|
| 50 |
+
config = AutoConfig.from_pretrained(base_model_id, trust_remote_code=True)
|
| 51 |
+
model = AutoModel.from_pretrained(base_model_id, trust_remote_code=True, config=config,
|
| 52 |
+
torch_dtype=torch.bfloat16, device_map="auto")
|
| 53 |
+
model = PeftModel.from_pretrained(model, mntp_model_id)
|
| 54 |
+
|
| 55 |
+
l2v = LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=512)
|
| 56 |
+
embeddings = l2v.encode(["def hello_world():\n print('Hello, World!')"])
|
| 57 |
+
print("Embedding from MNTP model:", embeddings.shape)
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### ⚙️ Training Methodology
|
| 61 |
+
|
| 62 |
+
This model was pre-trained using the **MNTP** objective as described in the `llm2vec` paper. If you wish to train your own MNTP model from scratch, please refer to the instructions in the `Fine-tuning/Fine-tuning_method/MNTP/` directory of our GitHub repository.
|
| 63 |
+
|
| 64 |
+
### 📄 Citation
|
| 65 |
+
|
| 66 |
+
If you use this model, please cite both our paper and the foundational work of `llm2vec`.
|
| 67 |
+
|
| 68 |
+
```bibtex
|
| 69 |
+
@article{chen2024decoder,
|
| 70 |
+
title={Are Decoder-Only Large Language Models the Silver Bullet for Code Search?},
|
| 71 |
+
author={Chen, Yuxuan and Liu, Mingwei and Ou, Guangsheng and Li, Anji and Dai, Dekun and Wang, Yanlin and Zheng, Zibin},
|
| 72 |
+
journal={arXiv preprint arXiv:2410.22240},
|
| 73 |
+
year={2024}
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
@article{behnamghader2024llm2vec,
|
| 77 |
+
title={LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders},
|
| 78 |
+
author={BehnamGhader, Parishad and Adlakha, Vaibhav and Mosbach, Marius and Bahdanau, Dzmitry and Chapados, Nicolas and Reddy, Siva},
|
| 79 |
+
journal={arXiv preprint arXiv:2404.05961},
|
| 80 |
+
year={2024}
|
| 81 |
+
}
|
| 82 |
+
```
|
adapter_config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": {
|
| 4 |
+
"base_model_class": "MistralBiModel",
|
| 5 |
+
"parent_library": "llm2vec.models.bidirectional_mistral"
|
| 6 |
+
},
|
| 7 |
+
"base_model_name_or_path": "TheBloke/Mistral-7B-Instruct-v0.2-code-ft-GGUF",
|
| 8 |
+
"bias": "none",
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 32,
|
| 17 |
+
"lora_dropout": 0.05,
|
| 18 |
+
"megatron_config": null,
|
| 19 |
+
"megatron_core": "megatron.core",
|
| 20 |
+
"modules_to_save": null,
|
| 21 |
+
"peft_type": "LORA",
|
| 22 |
+
"r": 16,
|
| 23 |
+
"rank_pattern": {},
|
| 24 |
+
"revision": null,
|
| 25 |
+
"target_modules": [
|
| 26 |
+
"up_proj",
|
| 27 |
+
"k_proj",
|
| 28 |
+
"o_proj",
|
| 29 |
+
"down_proj",
|
| 30 |
+
"v_proj",
|
| 31 |
+
"q_proj",
|
| 32 |
+
"gate_proj"
|
| 33 |
+
],
|
| 34 |
+
"task_type": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_rslora": false
|
| 37 |
+
}
|
adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b486f1b607aeaa0f1b55fdbf60af2ded0130793071d2d14507ddbcd89244c67
|
| 3 |
+
size 83943504
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<unk>",
|
| 4 |
+
"<s>",
|
| 5 |
+
"</s>"
|
| 6 |
+
],
|
| 7 |
+
"bos_token": {
|
| 8 |
+
"content": "<s>",
|
| 9 |
+
"lstrip": false,
|
| 10 |
+
"normalized": false,
|
| 11 |
+
"rstrip": false,
|
| 12 |
+
"single_word": false
|
| 13 |
+
},
|
| 14 |
+
"eos_token": {
|
| 15 |
+
"content": "</s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false
|
| 20 |
+
},
|
| 21 |
+
"mask_token": "_",
|
| 22 |
+
"pad_token": "</s>",
|
| 23 |
+
"unk_token": {
|
| 24 |
+
"content": "<unk>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
}
|
| 30 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
|
| 3 |
+
size 493443
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"0": {
|
| 6 |
+
"content": "<unk>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"1": {
|
| 14 |
+
"content": "<s>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"2": {
|
| 22 |
+
"content": "</s>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"additional_special_tokens": [
|
| 31 |
+
"<unk>",
|
| 32 |
+
"<s>",
|
| 33 |
+
"</s>"
|
| 34 |
+
],
|
| 35 |
+
"bos_token": "<s>",
|
| 36 |
+
"clean_up_tokenization_spaces": false,
|
| 37 |
+
"eos_token": "</s>",
|
| 38 |
+
"legacy": true,
|
| 39 |
+
"mask_token": "_",
|
| 40 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 41 |
+
"pad_token": "</s>",
|
| 42 |
+
"sp_model_kwargs": {},
|
| 43 |
+
"spaces_between_special_tokens": false,
|
| 44 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 45 |
+
"unk_token": "<unk>",
|
| 46 |
+
"use_default_system_prompt": true
|
| 47 |
+
}
|