Upload folder using huggingface_hub
- 1_Dense/config.json +7 -0
- 1_Dense/model.safetensors +3 -0
- README.md +455 -0
- added_tokens.json +4 -0
- config.json +31 -0
- config_sentence_transformers.json +54 -0
- configuration_bert_hash.py +14 -0
- model.safetensors +3 -0
- modeling_bert_hash.py +519 -0
- modules.json +14 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scaler.pt +3 -0
- scheduler.pt +3 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +31 -0
- tokenizer.json +0 -0
- tokenizer_config.json +79 -0
- trainer_state.json +104 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
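
This commit uploads the complete model folder with huggingface_hub. If you want the same files locally, a minimal sketch using `snapshot_download` (the repository id below is a placeholder, not the actual repo name):

```python
from huggingface_hub import snapshot_download

# Downloads every file listed above into a local cache directory and returns its path.
local_dir = snapshot_download(repo_id="pylate_model_id")
print(local_dir)
```
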
1_Dense/config.json
ADDED
{
  "in_features": 128,
  "out_features": 128,
  "bias": false,
  "activation_function": "torch.nn.modules.linear.Identity",
  "use_residual": false
}
1_Dense/model.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:65869626726db1e3eb9cb199a7d5e6627c77ad384504a8902802c0cfc6e6e2f6
size 65624
README.md
ADDED
---
language:
- tr
tags:
- ColBERT
- PyLate
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:910904
- loss:Contrastive
base_model: ozayezerceli/bert-hash-nano-TR
datasets:
- parsak/msmarco-tr
pipeline_tag: sentence-similarity
library_name: PyLate
---

# PyLate model based on ozayezerceli/bert-hash-nano-TR

This is a [PyLate](https://github.com/lightonai/pylate) model finetuned from [ozayezerceli/bert-hash-nano-TR](https://huggingface.co/ozayezerceli/bert-hash-nano-TR) on the [msmarco-tr](https://huggingface.co/datasets/parsak/msmarco-tr) dataset. It maps sentences & paragraphs to sequences of 128-dimensional dense vectors and can be used for semantic textual similarity using the MaxSim operator.

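The MaxSim (late interaction) score sums, over the query tokens, each query token's maximum similarity to any document token. A minimal sketch of that operator in PyTorch (illustrative only; tensor names and shapes are assumptions, PyLate computes this internally):

```python
import torch

def maxsim(query_embeddings: torch.Tensor, document_embeddings: torch.Tensor) -> torch.Tensor:
    """Late-interaction score between one query and one document.

    query_embeddings: (num_query_tokens, 128), document_embeddings: (num_doc_tokens, 128),
    both assumed L2-normalized so the dot product is a cosine similarity.
    """
    similarities = query_embeddings @ document_embeddings.T  # (num_query_tokens, num_doc_tokens)
    # For each query token keep its best-matching document token, then sum over query tokens.
    return similarities.max(dim=1).values.sum()
```
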
## Model Details

### Model Description
- **Model Type:** PyLate model
- **Base model:** [ozayezerceli/bert-hash-nano-TR](https://huggingface.co/ozayezerceli/bert-hash-nano-TR) <!-- at revision dee6adc907398084b08a433d9e49e79690f6ee20 -->
- **Document Length:** 180 tokens
- **Query Length:** 32 tokens
- **Output Dimensionality:** 128 dimensions
- **Similarity Function:** MaxSim
- **Training Dataset:**
    - [msmarco-tr](https://huggingface.co/datasets/parsak/msmarco-tr)
- **Language:** tr
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [PyLate Documentation](https://lightonai.github.io/pylate/)
- **Repository:** [PyLate on GitHub](https://github.com/lightonai/pylate)
- **Hugging Face:** [PyLate models on Hugging Face](https://huggingface.co/models?library=PyLate)

### Full Model Architecture

```
ColBERT(
  (0): Transformer({'max_seq_length': 179, 'do_lower_case': False, 'architecture': 'BertHashModel'})
  (1): Dense({'in_features': 128, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False})
)
```
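
Because this is a late-interaction encoder, `model.encode` returns one embedding per token rather than a single pooled vector, and the trailing Dense module keeps each token embedding at 128 dimensions. A quick shape check (the sentence and printed shape are illustrative):

```python
from pylate import models

model = models.ColBERT(model_name_or_path="pylate_model_id")

# A short Turkish example sentence; one array of token embeddings is returned per input text.
embeddings = model.encode(["örnek bir cümle"], is_query=False)
print(embeddings[0].shape)  # roughly (number_of_tokens, 128)
```
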

## Usage
First install the PyLate library:

```bash
pip install -U pylate
```

### Retrieval

Use this model with PyLate to index and retrieve documents. The index uses [FastPLAID](https://github.com/lightonai/fast-plaid) for efficient similarity search.

#### Indexing documents

Load the ColBERT model and initialize the PLAID index, then encode and index your documents:

```python
from pylate import indexes, models, retrieve

# Step 1: Load the ColBERT model
model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

# Step 2: Initialize the PLAID index
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
    override=True,  # This overwrites the existing index if any
)

# Step 3: Encode the documents
documents_ids = ["1", "2", "3"]
documents = ["document 1 text", "document 2 text", "document 3 text"]

documents_embeddings = model.encode(
    documents,
    batch_size=32,
    is_query=False,  # Ensure that it is set to False to indicate that these are documents, not queries
    show_progress_bar=True,
)

# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
index.add_documents(
    documents_ids=documents_ids,
    documents_embeddings=documents_embeddings,
)
```

Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:

```python
# To load an index, simply instantiate it with the correct folder/name and without overriding it
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
)
```

#### Retrieving top-k documents for queries

Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries.
To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the top matches ids and relevance scores:

```python
# Step 1: Initialize the ColBERT retriever
retriever = retrieve.ColBERT(index=index)

# Step 2: Encode the queries
queries_embeddings = model.encode(
    ["query for document 3", "query for document 1"],
    batch_size=32,
    is_query=True,  # Ensure that it is set to True to indicate that these are queries
    show_progress_bar=True,
)

# Step 3: Retrieve top-k documents
scores = retriever.retrieve(
    queries_embeddings=queries_embeddings,
    k=10,  # Retrieve the top 10 matches for each query
)
```
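
`scores` is expected to hold one result list per query, with each entry pairing a document id with its relevance score; printing it is the quickest way to confirm the exact structure returned by your PyLate version:

```python
# Inspect the retrieved matches for each query.
for query_index, query_matches in enumerate(scores):
    print(f"Query {query_index}: {query_matches}")
```
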

### Reranking
If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the `rank` function and pass the queries and documents to rerank:

```python
from pylate import rank, models

queries = [
    "query A",
    "query B",
]

documents = [
    ["document A", "document B"],
    ["document 1", "document C", "document B"],
]

documents_ids = [
    [1, 2],
    [1, 3, 2],
]

model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

queries_embeddings = model.encode(
    queries,
    is_query=True,
)

documents_embeddings = model.encode(
    documents,
    is_query=False,
)

reranked_documents = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)
```

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### msmarco-tr

* Dataset: [msmarco-tr](https://huggingface.co/datasets/parsak/msmarco-tr) at [ffad30a](https://huggingface.co/datasets/parsak/msmarco-tr/tree/ffad30a7b0648f1c789c639db6c1d4720c22274c)
* Size: 910,904 training samples
* Columns: <code>query</code>, <code>positive</code>, and <code>negative</code>
* Approximate statistics based on the first 1000 samples:
  | | query | positive | negative |
  |:--------|:---------|:---------|:---------|
  | type | string | string | string |
  | details | <ul><li>min: 6 tokens</li><li>mean: 16.58 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 25 tokens</li><li>mean: 31.99 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 20 tokens</li><li>mean: 31.99 tokens</li><li>max: 32 tokens</li></ul> |
* Samples:
  | query | positive | negative |
  |:------|:---------|:---------|
  | <code>sinir dokusundaki miyelin kılıfı nerede</code> | <code>Miyelin, bir tabaka oluşturan akson dielektrik (elektriksel olarak yalıtkan) malzemeyi çevreleyen yağlı bir beyaz maddedir, miyelin kılıfı, genellikle sadece bir nöronun aksonu etrafında bulunur. Sinir sisteminin düzgün çalışması için gereklidir. Bir tür glial hücrenin bir dış büyümesidir. Miyelin kılıfının üretimi miyelinasyon olarak adlandırılır. İnsanlarda, miyelin kılıfı 14'üncü haftada başlar.</code> | <code>İnsanlarda, dört temel doku tipi vardır: epitel dokusu, bağ dokusu, kas dokusu ve sinir dokusu. Her genel doku tipi içinde, belirli doku tipleri vardır. Bunu bir futbol takımı gibi düşünün.Her biri sahada kendi 'iş' olan bireysel oyuncular vardır.n insanlar, dört temel doku tipi vardır: epitel dokusu, bağ dokusu, kas dokusu ve sinir dokusu. Bu genel doku tipinde, her bir genel doku tipinde vardır.</code> |
  | <code>Okulların Makine Mühendisliğini Sundukları Şeyler</code> | <code>Makine Mühendisliği Teknolojisi Dereceleri için Üst Okullar. Pennsylvania Eyalet Üniversitesi - Harrisburg, Purdue Üniversitesi ve Houston Üniversitesi, makine mühendisliği teknolojisi (MET) alanında lisans derecesi sunan üç okuldur. Bu üniversitelerdeki MET programları hakkında daha fazla bilgi edinmek için okumaya devam edin.</code> | <code>Mühendis tanımı, motorların veya makinelerin tasarımında, yapımında ve kullanımında veya çeşitli mühendislik dallarından herhangi birinde eğitimli ve yetenekli bir kişi: bir makine mühendisi; bir inşaat mühendisi. Daha fazla bilgi için bkz.</code> |
  | <code>kim navigatör karıştırma valfleri taşır</code> | <code>BRADLEY THERMOSTATIC MIXING VANAS. Bradley Corporation, armatür ve sıhhi tesisat ürünlerinin üretiminde lider, dört hat üretir. termostatik karıştırma valfleri (TMVs). Bradley Navigator Yüksek Düşük termostatik karıştırma valfleri vardır. Dıştan gelen talebin çok düşük olduğu uygulamalar için idealdir.</code> | <code>Hidrolik Valfler. Eaton valfleri, tüm dünyadaki pazarlarda müşterilerimiz için rekabet avantajı sağlar. Geniş bir seçenek yelpazesinde benzersiz kalite sunan yüksek değerli hidrolik valf ürünlerimiz, gerçek endüstri liderlerinin tüm özelliklerini ve performans seviyelerini içerir. Endüstriyel Valfler.</code> |
* Loss: <code>pylate.losses.contrastive.Contrastive</code>

### Evaluation Dataset

#### msmarco-tr

* Dataset: [msmarco-tr](https://huggingface.co/datasets/parsak/msmarco-tr) at [ffad30a](https://huggingface.co/datasets/parsak/msmarco-tr/tree/ffad30a7b0648f1c789c639db6c1d4720c22274c)
* Size: 9,202 evaluation samples
* Columns: <code>query</code>, <code>positive</code>, and <code>negative</code>
* Approximate statistics based on the first 1000 samples:
  | | query | positive | negative |
  |:--------|:---------|:---------|:---------|
  | type | string | string | string |
  | details | <ul><li>min: 6 tokens</li><li>mean: 16.35 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 25 tokens</li><li>mean: 31.99 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 25 tokens</li><li>mean: 31.99 tokens</li><li>max: 32 tokens</li></ul> |
* Samples:
  | query | positive | negative |
  |:------|:---------|:---------|
  | <code>Ermin hangi hayvandır</code> | <code>1 Aslında ermine kelimesi beyaz kürklü bir hayvanı ifade ederken, sırt üstü kahverengi kürklü ve karnında baş ve beyaz kürklü bireyler için stoat kullanılır.</code> | <code>Dünyada kaç hayvan türü var? İşte kaba bir sayım ve bilim adamlarının sayılara nasıl ulaştıklarına dair kısa bir açıklama. Dünyada kaç hayvan türü var? İşte kaba bir sayım ve bilim adamlarının sayılara nasıl ulaştıklarına dair kısa bir açıklama. Kaç hayvan türü var? https://www.thoughtco.com/how-many-animal-türleri-on-planet-130923 Strauss, Bob.</code> |
  | <code>Abacus nereden çıktı</code> | <code>Abacus: Kısa Bir Tarih. Abacus, kökeni Yunanca abax veya abakon (masa veya tablet anlamına gelir) kelimelerinden gelen ve muhtemelen kum anlamına gelen Semitik abq kelimesinden kaynaklanan Latince bir kelimedir. Abacus, büyük sayıları saymak için kullanılan birçok sayma cihazından biridir.</code> | <code>Hücre apeksinde, bir flagellum için çapa alanı olan bazal gövdedir. Bazal cisimler, dokuz periferik mikrotübül üçlüsü ile centrioles'inkine benzer bir alt yapıya sahiptir (görüntünün alt merkezindeki yapıya bakınız).</code> |
  | <code>Başın arkasında radyasyon tedavisi yüz kızarıklığına neden olur mu</code> | <code>Radyasyon Terapisinin En Yaygın Yan Etkileri. Cilt reaksiyonu: Radyasyon tedavisinin yaygın bir yan etkisi, tedavi edilen vücut bölgesinde cilt tahrişidir. Cilt reaksiyonu, hafif kızarıklık ve kuruluktan (güneş yanığına benzer) bazı hastalarda cildin şiddetli soyulmasına (desquamation) kadar değişebilir.</code> | <code>Bu açıklama amfizemi işaret edebilir. Bu, sigara içme geçmişiniz varsa daha da muhtemeldir. Radyasyon terapisi bilinen nedenlerden biri değildir. Bu konuda daha fazla cevap almak ve semptomlarınızı çözmeye yardımcı olmak için bir pulmonologla takip etmenizi isteyeceğim. Umarım bu, sorgunuzu tamamen ele alır. Sigara içme geçmişiniz varsa, daha da fazla umut eder. Radyasyon terapisi, bu sorunun çözümüne yardımcı olmanızı ve bu sorunun cevabını takip etmenizi isterim.</code> |
* Loss: <code>pylate.losses.contrastive.Contrastive</code>

### Training Hyperparameters
#### Non-Default Hyperparameters

- `learning_rate`: 3e-06
- `num_train_epochs`: 1
- `fp16`: True

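A run with these hyperparameters can be reproduced along the following lines with PyLate's Contrastive loss and the Sentence Transformers trainer. This is a minimal sketch, assuming the dataset's default configuration exposes the train triplets with `query`/`positive`/`negative` columns; everything outside the listed hyperparameters (output directory, dataset split, argument defaults) is an assumption:

```python
from datasets import load_dataset
from pylate import losses, models
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments

# Start from the base model and fine-tune it as a ColBERT encoder.
model = models.ColBERT(model_name_or_path="ozayezerceli/bert-hash-nano-TR")
train_dataset = load_dataset("parsak/msmarco-tr", split="train")

args = SentenceTransformerTrainingArguments(
    output_dir="bert-hash-nano-tr-colbert",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    learning_rate=3e-6,
    fp16=True,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=losses.Contrastive(model=model),
)
trainer.train()
```
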
#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 8
- `per_device_eval_batch_size`: 8
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 3e-06
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: True
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `parallelism_config`: None
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch_fused
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `hub_revision`: None
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`:
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `liger_kernel_config`: None
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: proportional
- `router_mapping`: {}
- `learning_rate_mapping`: {}

</details>

### Training Logs
| Epoch | Step | Training Loss |
|:------:|:----:|:-------------:|
| 0.0000 | 1 | 1.9342 |
| 0.0088 | 1000 | 1.3908 |
| 0.0176 | 2000 | 1.1178 |
| 0.0263 | 3000 | 1.0264 |
| 0.0351 | 4000 | 0.9816 |
| 0.0439 | 5000 | 0.9313 |
| 0.0527 | 6000 | 0.9324 |
| 0.0615 | 7000 | 0.8896 |
| 0.0703 | 8000 | 0.8703 |
| 0.0790 | 9000 | 0.8665 |


### Framework Versions
- Python: 3.12.1
- Sentence Transformers: 5.1.1
- PyLate: 1.3.4
- Transformers: 4.56.2
- PyTorch: 2.8.0+cu128
- Accelerate: 1.10.1
- Datasets: 4.0.0
- Tokenizers: 0.22.1


## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}
```

#### PyLate
```bibtex
@misc{PyLate,
    title={PyLate: Flexible Training and Retrieval for Late Interaction Models},
    author={Chaffin, Antoine and Sourty, Raphaël},
    url={https://github.com/lightonai/pylate},
    year={2024}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
added_tokens.json
ADDED
{
  "[D] ": 30523,
  "[Q] ": 30522
}
config.json
ADDED
{
  "architectures": [
    "BertHashModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_bert_hash.BertHashConfig",
    "AutoModel": "modeling_bert_hash.BertHashModel",
    "AutoModelForMaskedLM": "modeling_bert_hash.BertHashForMaskedLM",
    "AutoModelForSequenceClassification": "modeling_bert_hash.BertHashForSequenceClassification"
  },
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert_hash",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "projections": 16,
  "transformers_version": "4.56.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30524
}
config_sentence_transformers.json
ADDED
{
  "model_type": "SentenceTransformer",
  "__version__": {
    "sentence_transformers": "5.1.1",
    "transformers": "4.56.2",
    "pytorch": "2.8.0+cu128"
  },
  "prompts": {
    "query": "",
    "document": ""
  },
  "default_prompt_name": null,
  "similarity_fn_name": "MaxSim",
  "query_prefix": "[Q] ",
  "document_prefix": "[D] ",
  "query_length": 32,
  "document_length": 180,
  "attend_to_expansion_tokens": false,
  "skiplist_words": [
    "!",
    "\"",
    "#",
    "$",
    "%",
    "&",
    "'",
    "(",
    ")",
    "*",
    "+",
    ",",
    "-",
    ".",
    "/",
    ":",
    ";",
    "<",
    "=",
    ">",
    "?",
    "@",
    "[",
    "\\",
    "]",
    "^",
    "_",
    "`",
    "{",
    "|",
    "}",
    "~"
  ],
  "do_query_expansion": true
}
configuration_bert_hash.py
ADDED
from transformers.models.bert.configuration_bert import BertConfig


class BertHashConfig(BertConfig):
    """
    Extension of Bert configuration to add projections parameter.
    """

    model_type = "bert_hash"

    def __init__(self, projections=5, **kwargs):
        super().__init__(**kwargs)

        self.projections = projections
model.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:bc90b4bd7de9e1f1dada8731ef68a818939fec7b64c40b238f24ea64ce890d35
size 3883056
modeling_bert_hash.py
ADDED
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 6 |
+
|
| 7 |
+
from transformers.cache_utils import Cache
|
| 8 |
+
from transformers.models.bert.modeling_bert import BertEncoder, BertPooler, BertPreTrainedModel, BertOnlyMLMHead
|
| 9 |
+
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
|
| 10 |
+
from transformers.modeling_outputs import (
|
| 11 |
+
BaseModelOutputWithPoolingAndCrossAttentions,
|
| 12 |
+
MaskedLMOutput,
|
| 13 |
+
SequenceClassifierOutput,
|
| 14 |
+
)
|
| 15 |
+
from transformers.utils import auto_docstring, logging
|
| 16 |
+
|
| 17 |
+
from .configuration_bert_hash import BertHashConfig
|
| 18 |
+
|
| 19 |
+
logger = logging.get_logger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class BertHashTokens(nn.Module):
|
| 23 |
+
"""
|
| 24 |
+
Module that embeds token vocabulary to an intermediate embeddings layer then projects those embeddings to the
|
| 25 |
+
hidden size.
|
| 26 |
+
|
| 27 |
+
The number of projections is like a hash. Setting the projections parameter to 5 is like generating a
|
| 28 |
+
160-bit hash (5 x float32) for each token. That hash is then projected to the hidden size.
|
| 29 |
+
|
| 30 |
+
This significantly reduces the number of parameters necessary for token embeddings.
|
| 31 |
+
|
| 32 |
+
For example:
|
| 33 |
+
Standard token embeddings:
|
| 34 |
+
30,522 (vocab size) x 768 (hidden size) = 23,440,896 parameters
|
| 35 |
+
23,440,896 x 4 (float32) = 93,763,584 bytes
|
| 36 |
+
|
| 37 |
+
Hash token embeddings:
|
| 38 |
+
30,522 (vocab size) x 5 (hash buckets) + 5 x 768 (projection matrix)= 156,450 parameters
|
| 39 |
+
156,450 x 4 (float32) = 625,800 bytes
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
def __init__(self, config):
|
| 43 |
+
super().__init__()
|
| 44 |
+
self.config = config
|
| 45 |
+
|
| 46 |
+
# Token embeddings
|
| 47 |
+
self.embeddings = nn.Embedding(config.vocab_size, config.projections, padding_idx=config.pad_token_id)
|
| 48 |
+
|
| 49 |
+
# Token embeddings projections
|
| 50 |
+
self.projections = nn.Linear(config.projections, config.hidden_size)
|
| 51 |
+
|
| 52 |
+
def forward(self, input_ids):
|
| 53 |
+
# Project embeddings to hidden size
|
| 54 |
+
return self.projections(self.embeddings(input_ids))
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class BertHashEmbeddings(nn.Module):
|
| 58 |
+
"""Construct the embeddings from word, position and token_type embeddings."""
|
| 59 |
+
|
| 60 |
+
def __init__(self, config):
|
| 61 |
+
super().__init__()
|
| 62 |
+
self.word_embeddings = BertHashTokens(config)
|
| 63 |
+
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
| 64 |
+
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
| 65 |
+
|
| 66 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
| 67 |
+
# any TensorFlow checkpoint file
|
| 68 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 69 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 70 |
+
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
| 71 |
+
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
| 72 |
+
self.register_buffer(
|
| 73 |
+
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
| 74 |
+
)
|
| 75 |
+
self.register_buffer(
|
| 76 |
+
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
def forward(
|
| 80 |
+
self,
|
| 81 |
+
input_ids: Optional[torch.LongTensor] = None,
|
| 82 |
+
token_type_ids: Optional[torch.LongTensor] = None,
|
| 83 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 84 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 85 |
+
past_key_values_length: int = 0,
|
| 86 |
+
) -> torch.Tensor:
|
| 87 |
+
if input_ids is not None:
|
| 88 |
+
input_shape = input_ids.size()
|
| 89 |
+
else:
|
| 90 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 91 |
+
|
| 92 |
+
seq_length = input_shape[1]
|
| 93 |
+
|
| 94 |
+
if position_ids is None:
|
| 95 |
+
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
|
| 96 |
+
|
| 97 |
+
# Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
|
| 98 |
+
# when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
|
| 99 |
+
# issue #5664
|
| 100 |
+
if token_type_ids is None:
|
| 101 |
+
if hasattr(self, "token_type_ids"):
|
| 102 |
+
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
|
| 103 |
+
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
|
| 104 |
+
token_type_ids = buffered_token_type_ids_expanded
|
| 105 |
+
else:
|
| 106 |
+
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
|
| 107 |
+
|
| 108 |
+
if inputs_embeds is None:
|
| 109 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
| 110 |
+
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
| 111 |
+
|
| 112 |
+
embeddings = inputs_embeds + token_type_embeddings
|
| 113 |
+
if self.position_embedding_type == "absolute":
|
| 114 |
+
position_embeddings = self.position_embeddings(position_ids)
|
| 115 |
+
embeddings += position_embeddings
|
| 116 |
+
embeddings = self.LayerNorm(embeddings)
|
| 117 |
+
embeddings = self.dropout(embeddings)
|
| 118 |
+
return embeddings
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@auto_docstring(
|
| 122 |
+
custom_intro="""
|
| 123 |
+
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
| 124 |
+
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
|
| 125 |
+
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
|
| 126 |
+
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
|
| 127 |
+
|
| 128 |
+
To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
|
| 129 |
+
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
|
| 130 |
+
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
|
| 131 |
+
"""
|
| 132 |
+
)
|
| 133 |
+
class BertHashModel(BertPreTrainedModel):
|
| 134 |
+
config_class = BertHashConfig
|
| 135 |
+
|
| 136 |
+
_no_split_modules = ["BertEmbeddings", "BertLayer"]
|
| 137 |
+
|
| 138 |
+
def __init__(self, config, add_pooling_layer=True):
|
| 139 |
+
r"""
|
| 140 |
+
add_pooling_layer (bool, *optional*, defaults to `True`):
|
| 141 |
+
Whether to add a pooling layer
|
| 142 |
+
"""
|
| 143 |
+
super().__init__(config)
|
| 144 |
+
self.config = config
|
| 145 |
+
|
| 146 |
+
self.embeddings = BertHashEmbeddings(config)
|
| 147 |
+
self.encoder = BertEncoder(config)
|
| 148 |
+
|
| 149 |
+
self.pooler = BertPooler(config) if add_pooling_layer else None
|
| 150 |
+
|
| 151 |
+
self.attn_implementation = config._attn_implementation
|
| 152 |
+
self.position_embedding_type = config.position_embedding_type
|
| 153 |
+
|
| 154 |
+
# Initialize weights and apply final processing
|
| 155 |
+
self.post_init()
|
| 156 |
+
|
| 157 |
+
def get_input_embeddings(self):
|
| 158 |
+
return self.embeddings.word_embeddings.embeddings
|
| 159 |
+
|
| 160 |
+
def set_input_embeddings(self, value):
|
| 161 |
+
self.embeddings.word_embeddings.embeddings = value
|
| 162 |
+
|
| 163 |
+
def _prune_heads(self, heads_to_prune):
|
| 164 |
+
"""
|
| 165 |
+
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
| 166 |
+
class PreTrainedModel
|
| 167 |
+
"""
|
| 168 |
+
for layer, heads in heads_to_prune.items():
|
| 169 |
+
self.encoder.layer[layer].attention.prune_heads(heads)
|
| 170 |
+
|
| 171 |
+
@auto_docstring
|
| 172 |
+
def forward(
|
| 173 |
+
self,
|
| 174 |
+
input_ids: Optional[torch.Tensor] = None,
|
| 175 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 176 |
+
token_type_ids: Optional[torch.Tensor] = None,
|
| 177 |
+
position_ids: Optional[torch.Tensor] = None,
|
| 178 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 179 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
| 180 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
| 181 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
| 182 |
+
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
| 183 |
+
use_cache: Optional[bool] = None,
|
| 184 |
+
output_attentions: Optional[bool] = None,
|
| 185 |
+
output_hidden_states: Optional[bool] = None,
|
| 186 |
+
return_dict: Optional[bool] = None,
|
| 187 |
+
cache_position: Optional[torch.Tensor] = None,
|
| 188 |
+
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
|
| 189 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 190 |
+
output_hidden_states = (
|
| 191 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 192 |
+
)
|
| 193 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 194 |
+
|
| 195 |
+
if self.config.is_decoder:
|
| 196 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 197 |
+
else:
|
| 198 |
+
use_cache = False
|
| 199 |
+
|
| 200 |
+
if input_ids is not None and inputs_embeds is not None:
|
| 201 |
+
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
| 202 |
+
elif input_ids is not None:
|
| 203 |
+
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
|
| 204 |
+
input_shape = input_ids.size()
|
| 205 |
+
elif inputs_embeds is not None:
|
| 206 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 207 |
+
else:
|
| 208 |
+
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
| 209 |
+
|
| 210 |
+
batch_size, seq_length = input_shape
|
| 211 |
+
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
| 212 |
+
|
| 213 |
+
past_key_values_length = 0
|
| 214 |
+
if past_key_values is not None:
|
| 215 |
+
past_key_values_length = (
|
| 216 |
+
past_key_values[0][0].shape[-2]
|
| 217 |
+
if not isinstance(past_key_values, Cache)
|
| 218 |
+
else past_key_values.get_seq_length()
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
if token_type_ids is None:
|
| 222 |
+
if hasattr(self.embeddings, "token_type_ids"):
|
| 223 |
+
buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
|
| 224 |
+
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
|
| 225 |
+
token_type_ids = buffered_token_type_ids_expanded
|
| 226 |
+
else:
|
| 227 |
+
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
| 228 |
+
|
| 229 |
+
embedding_output = self.embeddings(
|
| 230 |
+
input_ids=input_ids,
|
| 231 |
+
position_ids=position_ids,
|
| 232 |
+
token_type_ids=token_type_ids,
|
| 233 |
+
inputs_embeds=inputs_embeds,
|
| 234 |
+
past_key_values_length=past_key_values_length,
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
if attention_mask is None:
|
| 238 |
+
attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
|
| 239 |
+
|
| 240 |
+
use_sdpa_attention_masks = (
|
| 241 |
+
self.attn_implementation == "sdpa"
|
| 242 |
+
and self.position_embedding_type == "absolute"
|
| 243 |
+
and head_mask is None
|
| 244 |
+
and not output_attentions
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
# Expand the attention mask
|
| 248 |
+
if use_sdpa_attention_masks and attention_mask.dim() == 2:
|
| 249 |
+
# Expand the attention mask for SDPA.
|
| 250 |
+
# [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
|
| 251 |
+
if self.config.is_decoder:
|
| 252 |
+
extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
|
| 253 |
+
attention_mask,
|
| 254 |
+
input_shape,
|
| 255 |
+
embedding_output,
|
| 256 |
+
past_key_values_length,
|
| 257 |
+
)
|
| 258 |
+
else:
|
| 259 |
+
extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
|
| 260 |
+
attention_mask, embedding_output.dtype, tgt_len=seq_length
|
| 261 |
+
)
|
| 262 |
+
else:
|
| 263 |
+
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
| 264 |
+
# ourselves in which case we just need to make it broadcastable to all heads.
|
| 265 |
+
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
|
| 266 |
+
|
| 267 |
+
# If a 2D or 3D attention mask is provided for the cross-attention
|
| 268 |
+
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
| 269 |
+
if self.config.is_decoder and encoder_hidden_states is not None:
|
| 270 |
+
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
| 271 |
+
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
| 272 |
+
if encoder_attention_mask is None:
|
| 273 |
+
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
|
| 274 |
+
|
| 275 |
+
if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
|
| 276 |
+
# Expand the attention mask for SDPA.
|
| 277 |
+
# [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
|
| 278 |
+
encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
|
| 279 |
+
encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
|
| 280 |
+
)
|
| 281 |
+
else:
|
| 282 |
+
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
|
| 283 |
+
else:
|
| 284 |
+
encoder_extended_attention_mask = None
|
| 285 |
+
|
| 286 |
+
# Prepare head mask if needed
|
| 287 |
+
# 1.0 in head_mask indicate we keep the head
|
| 288 |
+
# attention_probs has shape bsz x n_heads x N x N
|
| 289 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
| 290 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
| 291 |
+
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
|
| 292 |
+
|
| 293 |
+
encoder_outputs = self.encoder(
|
| 294 |
+
embedding_output,
|
| 295 |
+
attention_mask=extended_attention_mask,
|
| 296 |
+
head_mask=head_mask,
|
| 297 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 298 |
+
encoder_attention_mask=encoder_extended_attention_mask,
|
| 299 |
+
past_key_values=past_key_values,
|
| 300 |
+
use_cache=use_cache,
|
| 301 |
+
output_attentions=output_attentions,
|
| 302 |
+
output_hidden_states=output_hidden_states,
|
| 303 |
+
return_dict=return_dict,
|
| 304 |
+
cache_position=cache_position,
|
| 305 |
+
)
|
| 306 |
+
sequence_output = encoder_outputs[0]
|
| 307 |
+
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
|
| 308 |
+
|
| 309 |
+
if not return_dict:
|
| 310 |
+
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
| 311 |
+
|
| 312 |
+
return BaseModelOutputWithPoolingAndCrossAttentions(
|
| 313 |
+
last_hidden_state=sequence_output,
|
| 314 |
+
pooler_output=pooled_output,
|
| 315 |
+
past_key_values=encoder_outputs.past_key_values,
|
| 316 |
+
hidden_states=encoder_outputs.hidden_states,
|
| 317 |
+
attentions=encoder_outputs.attentions,
|
| 318 |
+
cross_attentions=encoder_outputs.cross_attentions,
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
@auto_docstring
|
| 323 |
+
class BertHashForMaskedLM(BertPreTrainedModel):
|
| 324 |
+
_tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
|
| 325 |
+
config_class = BertHashConfig
|
| 326 |
+
|
| 327 |
+
def __init__(self, config):
|
| 328 |
+
super().__init__(config)
|
| 329 |
+
|
| 330 |
+
if config.is_decoder:
|
| 331 |
+
logger.warning(
|
| 332 |
+
"If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
|
| 333 |
+
"bi-directional self-attention."
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
self.bert = BertHashModel(config, add_pooling_layer=False)
|
| 337 |
+
self.cls = BertOnlyMLMHead(config)
|
| 338 |
+
|
| 339 |
+
# Initialize weights and apply final processing
|
| 340 |
+
self.post_init()
|
| 341 |
+
|
| 342 |
+
@auto_docstring
|
| 343 |
+
def forward(
|
| 344 |
+
self,
|
| 345 |
+
input_ids: Optional[torch.Tensor] = None,
|
| 346 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 347 |
+
token_type_ids: Optional[torch.Tensor] = None,
|
| 348 |
+
position_ids: Optional[torch.Tensor] = None,
|
| 349 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 350 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
| 351 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
| 352 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
| 353 |
+
labels: Optional[torch.Tensor] = None,
|
| 354 |
+
output_attentions: Optional[bool] = None,
|
| 355 |
+
output_hidden_states: Optional[bool] = None,
|
| 356 |
+
return_dict: Optional[bool] = None,
|
| 357 |
+
) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
|
| 358 |
+
r"""
|
| 359 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 360 |
+
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
|
| 361 |
+
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
|
| 362 |
+
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
|
| 363 |
+
"""
|
| 364 |
+
|
| 365 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 366 |
+
|
| 367 |
+
outputs = self.bert(
|
| 368 |
+
input_ids,
|
| 369 |
+
attention_mask=attention_mask,
|
| 370 |
+
token_type_ids=token_type_ids,
|
| 371 |
+
position_ids=position_ids,
|
| 372 |
+
head_mask=head_mask,
|
| 373 |
+
inputs_embeds=inputs_embeds,
|
| 374 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 375 |
+
encoder_attention_mask=encoder_attention_mask,
|
| 376 |
+
output_attentions=output_attentions,
|
| 377 |
+
output_hidden_states=output_hidden_states,
|
| 378 |
+
return_dict=return_dict,
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
sequence_output = outputs[0]
|
| 382 |
+
prediction_scores = self.cls(sequence_output)
|
| 383 |
+
|
| 384 |
+
masked_lm_loss = None
|
| 385 |
+
if labels is not None:
|
| 386 |
+
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
| 387 |
+
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
| 388 |
+
|
| 389 |
+
if not return_dict:
|
| 390 |
+
output = (prediction_scores,) + outputs[2:]
|
| 391 |
+
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
| 392 |
+
|
| 393 |
+
return MaskedLMOutput(
|
| 394 |
+
loss=masked_lm_loss,
|
| 395 |
+
logits=prediction_scores,
|
| 396 |
+
hidden_states=outputs.hidden_states,
|
| 397 |
+
attentions=outputs.attentions,
|
| 398 |
+
)
|
| 399 |
+
|
| 400 |
+
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
|
| 401 |
+
input_shape = input_ids.shape
|
| 402 |
+
effective_batch_size = input_shape[0]
|
| 403 |
+
|
| 404 |
+
# add a dummy token
|
| 405 |
+
if self.config.pad_token_id is None:
|
| 406 |
+
raise ValueError("The PAD token should be defined for generation")
|
| 407 |
+
|
| 408 |
+
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
|
| 409 |
+
dummy_token = torch.full(
|
| 410 |
+
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
|
| 411 |
+
)
|
| 412 |
+
input_ids = torch.cat([input_ids, dummy_token], dim=1)
|
| 413 |
+
|
| 414 |
+
return {"input_ids": input_ids, "attention_mask": attention_mask}
|
| 415 |
+
|
| 416 |
+
@classmethod
|
| 417 |
+
def can_generate(cls) -> bool:
|
| 418 |
+
"""
|
| 419 |
+
Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
|
| 420 |
+
`prepare_inputs_for_generation` method.
|
| 421 |
+
"""
|
| 422 |
+
return False
|
| 423 |
+
|
| 424 |
+
|
+@auto_docstring(
+    custom_intro="""
+    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """
+)
+class BertHashForSequenceClassification(BertPreTrainedModel):
+    config_class = BertHashConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.bert = BertHashModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
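Note: the loss selection above mirrors upstream `BertForSequenceClassification`: MSE for `num_labels == 1`, cross-entropy for integer single-label targets, and BCE-with-logits for float multi-hot targets. A small sketch of the three cases; the import path is assumed, the weights are random, and `BertHashConfig` is assumed to accept `num_labels` like other `transformers` configs:

# Hedged sketch of the problem_type dispatch (not part of the uploaded files).
import torch

from modeling_bert_hash import BertHashConfig, BertHashForSequenceClassification  # assumed import path

input_ids = torch.randint(5, 100, (2, 8))  # toy batch of token ids

# Single-label classification: integer labels -> CrossEntropyLoss
model = BertHashForSequenceClassification(BertHashConfig(num_labels=3))
out = model(input_ids=input_ids, labels=torch.tensor([0, 2]))
print(model.config.problem_type, out.loss)  # "single_label_classification"

# Regression: num_labels == 1 and float labels -> MSELoss
model = BertHashForSequenceClassification(BertHashConfig(num_labels=1))
out = model(input_ids=input_ids, labels=torch.tensor([[0.5], [1.2]]))
print(model.config.problem_type, out.loss)  # "regression"

# Multi-label classification: float multi-hot labels -> BCEWithLogitsLoss
model = BertHashForSequenceClassification(BertHashConfig(num_labels=3))
out = model(input_ids=input_ids, labels=torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]))
print(model.config.problem_type, out.loss)  # "multi_label_classification"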
modules.json
ADDED
@@ -0,0 +1,14 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Dense",
+    "type": "pylate.models.Dense.Dense"
+  }
+]
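Note: `modules.json` wires the checkpoint into a two-stage late-interaction pipeline: the Transformer backbone (module `0`) followed by the PyLate `Dense` projection stored under `1_Dense` (module `1`). A hedged loading sketch using PyLate's public API; `user/bert-hash-colbert` is a placeholder for this repository's actual id:

# Hedged sketch (repository id is a placeholder, not the real one).
from pylate import models

model = models.ColBERT(model_name_or_path="user/bert-hash-colbert")

# Queries and documents are encoded separately; each text becomes a bag of
# per-token vectors, projected to the output dimension by the Dense module.
query_embeddings = model.encode(["what is a hash embedding?"], is_query=True)
doc_embeddings = model.encode(["Hash embeddings share rows of the embedding table."], is_query=False)

print(query_embeddings[0].shape)  # (num_query_tokens, output_dim)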
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97245b9ef2b05399f151f4eeba08819b3a584a079c260e9abf70ab6c9a70bfad
+size 7790987
rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46800e6e3ce4bcc9b3ba403638c28e1036a2fca5005396a4c64f753ed44d765
+size 14645
scaler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:125dd9a754f313ede3918b2838113df93582e51376581e56793338dd7645bd93
+size 1383
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b198d63af5c6563399a525397894e9b3d1958651f2304ed8bae0db0183a0164
+size 1465
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 179,
+  "do_lower_case": false
+}
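Note: this file caps the Transformer module's input length, so texts are truncated to 179 tokens at encoding time, while lowercasing is left to the tokenizer itself. A trivial check, assumed to be run from the checkpoint directory:

# Read the Sentence Transformers module config shipped with the checkpoint.
import json

with open("sentence_bert_config.json") as f:
    cfg = json.load(f)

print(cfg["max_seq_length"])  # 179 -> longer inputs are truncated by the Transformer module
print(cfg["do_lower_case"])   # False -> no extra lowercasing on top of the tokenizer's own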
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "[MASK]",
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
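Note: `pad_token` is deliberately mapped to `[MASK]`, the usual ColBERT-style choice so that padded query positions become mask tokens the encoder can use for query expansion rather than inert `[PAD]` tokens. A quick check, assuming the tokenizer is loaded from this checkpoint directory:

# Hedged check: load the tokenizer shipped in this folder and inspect padding behaviour.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # path to the checkpoint directory

assert tokenizer.pad_token == tokenizer.mask_token == "[MASK]"

batch = tokenizer(["short query"], padding="max_length", max_length=12, return_tensors="pt")
print(batch["input_ids"][0])  # trailing positions are filled with the [MASK] id, not [PAD]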
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,79 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "30522": {
+      "content": "[Q] ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "30523": {
+      "content": "[D] ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 512,
+  "model_max_length": 512,
+  "pad_to_multiple_of": null,
+  "pad_token": "[MASK]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}
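Note: `added_tokens_decoder` extends the base WordPiece vocabulary with two marker tokens, `"[Q] "` (id 30522) and `"[D] "` (id 30523), consistent with PyLate/ColBERT-style prefixes that tag queries and documents so the encoder can specialise its token representations by input role. A hedged illustration, again loading the tokenizer from this checkpoint directory:

# Hedged sketch: how the query/document markers show up after tokenization.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # checkpoint directory

print(tokenizer.convert_tokens_to_ids(["[Q] ", "[D] "]))  # [30522, 30523]

# A prefixed query as PyLate-style encoders build it; prefixing is normally
# handled by the library and is shown here manually only for illustration.
ids = tokenizer("[Q] what is a hash embedding?")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))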
trainer_state.json
ADDED
@@ -0,0 +1,104 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.08343360002810395,
+  "eval_steps": 1000,
+  "global_step": 9500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 8.782484213484626e-06,
+      "grad_norm": NaN,
+      "learning_rate": 3e-06,
+      "loss": 1.9342,
+      "step": 1
+    },
+    {
+      "epoch": 0.008782484213484626,
+      "grad_norm": 11.566916465759277,
+      "learning_rate": 2.9736788948121867e-06,
+      "loss": 1.3908,
+      "step": 1000
+    },
+    {
+      "epoch": 0.017564968426969252,
+      "grad_norm": 13.525375366210938,
+      "learning_rate": 2.9473314421717326e-06,
+      "loss": 1.1178,
+      "step": 2000
+    },
+    {
+      "epoch": 0.026347452640453878,
+      "grad_norm": 13.769821166992188,
+      "learning_rate": 2.920983989531279e-06,
+      "loss": 1.0264,
+      "step": 3000
+    },
+    {
+      "epoch": 0.035129936853938504,
+      "grad_norm": 14.206275939941406,
+      "learning_rate": 2.8946365368908247e-06,
+      "loss": 0.9816,
+      "step": 4000
+    },
+    {
+      "epoch": 0.04391242106742313,
+      "grad_norm": 15.29100513458252,
+      "learning_rate": 2.868289084250371e-06,
+      "loss": 0.9313,
+      "step": 5000
+    },
+    {
+      "epoch": 0.052694905280907756,
+      "grad_norm": 19.36515235900879,
+      "learning_rate": 2.8419416316099173e-06,
+      "loss": 0.9324,
+      "step": 6000
+    },
+    {
+      "epoch": 0.06147738949439238,
+      "grad_norm": 16.350460052490234,
+      "learning_rate": 2.8155941789694636e-06,
+      "loss": 0.8896,
+      "step": 7000
+    },
+    {
+      "epoch": 0.07025987370787701,
+      "grad_norm": 8.30239200592041,
+      "learning_rate": 2.7892467263290094e-06,
+      "loss": 0.8703,
+      "step": 8000
+    },
+    {
+      "epoch": 0.07904235792136163,
+      "grad_norm": 17.16596794128418,
+      "learning_rate": 2.7628992736885557e-06,
+      "loss": 0.8665,
+      "step": 9000
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 113863,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
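Note: `trainer_state.json` carries only a partial run, 9,500 of 113,863 planned optimizer steps (about 0.08 epochs), with the logged training loss falling from roughly 1.93 at step 1 to roughly 0.87 at step 9,000. A small sketch for pulling the loss curve out of the file (run from the checkpoint directory):

# Sketch: extract the logged loss curve from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

print(f"progress: {state['global_step']}/{state['max_steps']} steps "
      f"({state['epoch']:.2%} of one epoch)")

for entry in state["log_history"]:
    print(f"step {entry['step']:>6}  loss {entry['loss']:.4f}  lr {entry['learning_rate']:.2e}")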
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0015d449c32a217387af36a8fd35ed78fb8c69b20b3934f06a042ba284240858
+size 6161
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.