Model save

Browse files

Files changed (6) hide show

README.md +10 -9
adapter_model.safetensors +1 -1
last-safe/adapter_config.json +5 -5
last-safe/adapter_model.safetensors +1 -1
last-safe/training_args.bin +1 -1
trainer_state.json +88 -19

README.md CHANGED Viewed

@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [Qwen/Qwen3-Coder-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.9129
 ## Model description
@@ -38,22 +38,23 @@ More information needed
 ### Training hyperparameters
 The following hyperparameters were used during training:
-- learning_rate: 0.0002
 - train_batch_size: 2
 - eval_batch_size: 2
 - seed: 42
-- gradient_accumulation_steps: 8
-- total_train_batch_size: 16
-- optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.03
-- num_epochs: 1
 ### Training results
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 1.0148        | 1.0   | 1    | 0.9129          |
 ### Framework versions

 This model is a fine-tuned version of [Qwen/Qwen3-Coder-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- Loss: 1.0112
 ## Model description
 ### Training hyperparameters
 The following hyperparameters were used during training:
+- learning_rate: 2e-05
 - train_batch_size: 2
 - eval_batch_size: 2
 - seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.03
+- num_epochs: 2
 ### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.0885        | 0.9231 | 30   | 1.0279          |
+| 1.0246        | 1.8308 | 60   | 1.0112          |
 ### Framework versions

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3fe7ba3952f8c5ee74ed8f0f32f9e43c9592af03a5ab33d96da2ea2edc252879
 size 3380768360

 version https://git-lfs.github.com/spec/v1
+oid sha256:ee4d970622794f08a82fa60ef4d6332b4e8ba1dc7fcb2a46d7c17d4eba1c445e
 size 3380768360

last-safe/adapter_config.json CHANGED Viewed

@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "o_proj",
-    "down_proj",
-    "v_proj",
-    "k_proj",
     "gate_proj",
     "up_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "gate_proj",
+    "k_proj",
+    "down_proj",
     "up_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

last-safe/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:03471e48c3ca4057ce894565aad685348603b48d328390a5787ed0c0dd8f22bd
 size 3380768360

 version https://git-lfs.github.com/spec/v1
+oid sha256:ee4d970622794f08a82fa60ef4d6332b4e8ba1dc7fcb2a46d7c17d4eba1c445e
 size 3380768360

last-safe/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:660b2156752c413c59b9540172b28797294d24a5cda4926de10948729d1f7415
 size 5841

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf4911edfbe62714118ad9ae7d4585201ad61c9eb8139bda45dc5698ba0fa55b
 size 5841

trainer_state.json CHANGED Viewed

@@ -1,31 +1,100 @@
 {
-  "best_global_step": null,
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.8923076923076924,
-  "eval_steps": 250,
-  "global_step": 29,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.7692307692307693,
-      "grad_norm": 0.43906208872795105,
-      "learning_rate": 9.600000000000001e-06,
-      "loss": 1.3588,
-      "step": 25
     }
   ],
-  "logging_steps": 25,
-  "max_steps": 1650,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 50,
-  "save_steps": 250,
   "stateful_callbacks": {
     "EarlyStoppingCallback": {
       "args": {
-        "early_stopping_patience": 4,
         "early_stopping_threshold": 0.0
       },
       "attributes": {
@@ -37,13 +106,13 @@
         "should_epoch_stop": false,
         "should_evaluate": false,
         "should_log": false,
-        "should_save": false,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.526897011851264e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
+  "best_global_step": 60,
+  "best_metric": 1.0112364292144775,
+  "best_model_checkpoint": "j05hr3d/peft-FT-3-Coder-30b-v2/checkpoint-60",
+  "epoch": 2.0,
+  "eval_steps": 30,
+  "global_step": 66,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 0.3513231873512268,
+      "learning_rate": 1.941544065183021e-05,
+      "loss": 1.2806,
+      "step": 10
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 0.9766170382499695,
+      "learning_rate": 1.6715589548470187e-05,
+      "loss": 1.5279,
+      "step": 20
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "grad_norm": 0.22962720692157745,
+      "learning_rate": 1.242980179903264e-05,
+      "loss": 1.0885,
+      "step": 30
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "eval_loss": 1.0279428958892822,
+      "eval_runtime": 397.5727,
+      "eval_samples_per_second": 0.07,
+      "eval_steps_per_second": 0.035,
+      "step": 30
+    },
+    {
+      "epoch": 1.2153846153846155,
+      "grad_norm": 0.21934713423252106,
+      "learning_rate": 7.570198200967363e-06,
+      "loss": 1.1829,
+      "step": 40
+    },
+    {
+      "epoch": 1.523076923076923,
+      "grad_norm": 0.19970707595348358,
+      "learning_rate": 3.284410451529816e-06,
+      "loss": 1.4437,
+      "step": 50
+    },
+    {
+      "epoch": 1.830769230769231,
+      "grad_norm": 0.2555944323539734,
+      "learning_rate": 5.845593481697931e-07,
+      "loss": 1.0246,
+      "step": 60
+    },
+    {
+      "epoch": 1.830769230769231,
+      "eval_loss": 1.0112364292144775,
+      "eval_runtime": 405.21,
+      "eval_samples_per_second": 0.069,
+      "eval_steps_per_second": 0.035,
+      "step": 60
+    },
+    {
+      "epoch": 2.0,
+      "step": 66,
+      "total_flos": 3.9699322308132864e+17,
+      "train_loss": 1.241963545481364,
+      "train_runtime": 14535.1384,
+      "train_samples_per_second": 0.036,
+      "train_steps_per_second": 0.005,
+      "train_tokens_per_second": 293.072
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.0112364292144775,
+      "eval_runtime": 408.8198,
+      "eval_samples_per_second": 0.068,
+      "eval_steps_per_second": 0.034,
+      "step": 66
     }
   ],
+  "logging_steps": 10,
+  "max_steps": 66,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 30,
   "stateful_callbacks": {
     "EarlyStoppingCallback": {
       "args": {
+        "early_stopping_patience": 2,
         "early_stopping_threshold": 0.0
       },
       "attributes": {
         "should_epoch_stop": false,
         "should_evaluate": false,
         "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 3.9699322308132864e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null