Oleg Shulyakov committed
Commit 29e0460 · 1 Parent(s): b287a5f

Update arguments

Files changed (1): hf-quantize.sh (+62 -31)
hf-quantize.sh CHANGED
@@ -4,14 +4,14 @@
 #
 # Usage with long options:
 # ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M
-# ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix true
-# ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix true --output-filename Llama-2-7b-Q4_K_M.gguf
-# ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix true --output-filename Llama-2-7b-Q4_K_M.gguf --split-model true --split-max-tensors 256 --split-max-size 4G
+# ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix
+# ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf
+# ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf --split-model --split-max-tensors 256 --split-max-size 4G
 #
 # ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M
-# ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -i true
-# ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -i true -o Llama-2-7b-Q4_K_M.gguf
-# ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -i true -o Llama-2-7b-Q4_K_M.gguf -s true -t 256 -z 4G
+# ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix
+# ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf
+# ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf -split --split-max-tensors 256 --split-max-size 4G
 #

 # --- Configuration ---
@@ -43,6 +43,15 @@ SPLIT_MAX_TENSORS="256"
 # Optional: Max size per shard if splitting (e.g., 2G) - overrides SPLIT_MAX_TENSORS if set
 SPLIT_MAX_SIZE=""

+# Optional: Quantization type for the token embeddings tensor
+TOKEN_EMBEDDING_TYPE=""
+
+# Optional: Leave the output tensor unquantized
+LEAVE_OUTPUT_TENSOR="false"
+
+# Optional: Quantization type for the output tensor
+OUTPUT_TENSOR_TYPE=""
+
 # --- Parse Named Arguments ---
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -54,39 +63,51 @@ while [[ $# -gt 0 ]]; do
             QUANT_METHOD="$2"
             shift 2
             ;;
-        -i|--use-imatrix)
-            USE_IMATRIX="$2"
-            shift 2
+        -imatrix|--use-imatrix)
+            USE_IMATRIX="true"
+            shift 1
             ;;
         -o|--output-filename)
             OUTPUT_FILENAME="$2"
             shift 2
             ;;
-        -s|--split-model)
-            SPLIT_MODEL="$2"
-            shift 2
+        -split|--split-model)
+            SPLIT_MODEL="true"
+            shift 1
             ;;
-        -t|--split-max-tensors)
+        --split-max-tensors)
             SPLIT_MAX_TENSORS="$2"
             shift 2
             ;;
-        -z|--split-max-size)
+        --split-max-size)
             SPLIT_MAX_SIZE="$2"
             shift 2
             ;;
+        --token-embedding-type)
+            TOKEN_EMBEDDING_TYPE="$2"
+            shift 2
+            ;;
+        --leave-output-tensor)
+            LEAVE_OUTPUT_TENSOR="true"
+            shift 1
+            ;;
+        --output-tensor-type)
+            OUTPUT_TENSOR_TYPE="$2"
+            shift 2
+            ;;
         -h|--help)
             echo "Usage:"
             echo " Long options:"
-            echo " $0 --model <MODEL_ID> --quant-method <QUANT_METHOD> [--use-imatrix <true|false>] [--output-filename <FILENAME>] [--split-model <true|false>] [--split-max-tensors <NUM>] [--split-max-size <SIZE>]"
+            echo " $0 --model <MODEL_ID> --quant-method <QUANT_METHOD> [--use-imatrix] [--output-filename <FILENAME>] [--split-model] [--split-max-tensors <NUM>] [--split-max-size <SIZE>] [--token-embedding-type <QUANT_METHOD>] [--leave-output-tensor] [--output-tensor-type <QUANT_METHOD>]"
             echo ""
             echo " Short options:"
-            echo " $0 -m <MODEL_ID> -q <QUANT_METHOD> [-i <true|false>] [-o <FILENAME>] [-s <true|false>] [-t <NUM>] [-z <SIZE>]"
+            echo " $0 -m <MODEL_ID> -q <QUANT_METHOD> [-imatrix] [-o <FILENAME>] [-split]"
             echo ""
             echo "Examples:"
             echo " $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M"
-            echo " $0 -m meta-llama/Llama-2-7b -q Q4_K_M -i true"
-            echo " $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix true --output-filename Llama-2-7b-Q4_K_M.gguf"
-            echo " $0 -m meta-llama/Llama-2-7b -q Q4_K_M -i true -o Llama-2-7b-Q4_K_M.gguf -s true -t 256 -z 4G"
+            echo " $0 -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix"
+            echo " $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf"
+            echo " $0 -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf -split --split-max-tensors 256 --split-max-size 4G"
             exit 0
             ;;
         *)
@@ -237,26 +258,36 @@ QUANTIZE_CMD=(
     llama-quantize
 )

-# Add optional quantization flags
-# Note: The original script has logic for --leave-output-tensor vs --output-tensor-type
-# and --token-embedding-type. This script omits these for simplicity.
-# You can add them back if needed, but they require more input arguments.
-
 if [ "$USE_IMATRIX" = "true" ] && [ -f "$IMATRIX_FILE_PATH" ]; then
     QUANTIZE_CMD+=(
         --imatrix "$IMATRIX_FILE_PATH"
-        "$FP16_MODEL_PATH"
-        "$QUANTIZED_MODEL_PATH"
-        "$QUANT_METHOD"
     )
-else
+fi
+
+if [ -n "$TOKEN_EMBEDDING_TYPE" ]; then
     QUANTIZE_CMD+=(
-        "$FP16_MODEL_PATH"
-        "$QUANTIZED_MODEL_PATH"
-        "$QUANT_METHOD"
+        --token-embedding-type "$TOKEN_EMBEDDING_TYPE"
     )
 fi

+if [ "$LEAVE_OUTPUT_TENSOR" = "true" ]; then
+    QUANTIZE_CMD+=(
+        --leave-output-tensor
+    )
+else
+    if [ -n "$OUTPUT_TENSOR_TYPE" ]; then
+        QUANTIZE_CMD+=(
+            --output-tensor-type "$OUTPUT_TENSOR_TYPE"
+        )
+    fi
+fi
+
+QUANTIZE_CMD+=(
+    "$FP16_MODEL_PATH"
+    "$QUANTIZED_MODEL_PATH"
+    "$QUANT_METHOD"
+)
+
 echo "Running command: ${QUANTIZE_CMD[*]}"
 "${QUANTIZE_CMD[@]}"
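Note on the updated arguments: --use-imatrix/-imatrix, --split-model/-split, and the new --leave-output-tensor are now plain switches (their presence enables the behavior; they no longer take a true/false value), and the old -t/-z short forms are dropped in favor of the long --split-max-tensors/--split-max-size spellings. A quick sketch of invocations the updated parser accepts; the model ID and the Q8_0/Q6_K type names are illustrative placeholders, not values from the commit:

    # Boolean options are presence-only switches now
    ./hf-quantize.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -split

    # Tensor-type overrides are forwarded to llama-quantize (type names are examples)
    ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M \
        --token-embedding-type Q8_0 --output-tensor-type Q6_K

    # --leave-output-tensor takes precedence over --output-tensor-type
    ./hf-quantize.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --leave-output-tensor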
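For reference, given the new construction order (the imatrix flag first, then the tensor-type flags, then the positional arguments appended once at the end), a run with every option set would assemble a llama-quantize call shaped roughly like this; the variables stand in for the script's FP16_MODEL_PATH and QUANTIZED_MODEL_PATH, and the type names are again placeholders:

    llama-quantize \
        --imatrix "$IMATRIX_FILE_PATH" \
        --token-embedding-type Q8_0 \
        --output-tensor-type Q6_K \
        "$FP16_MODEL_PATH" \
        "$QUANTIZED_MODEL_PATH" \
        Q4_K_M

If --leave-output-tensor is set, the --output-tensor-type pair is replaced by the bare --leave-output-tensor flag, per the if/else above; appending the positional arguments outside the imatrix branch also removes the duplication the old else branch carried.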