Oleg Shulyakov committed
Commit ab4a7f4 · Parent: 9250e33

Add embedding and output tensor quantization

Files changed (1): app.py (+118 -20)
app.py CHANGED
@@ -189,7 +189,19 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
 
     return fp16_model
 
-def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_method: str, use_imatrix: bool, imatrix_q_method: str, imatrix_path: str):
+def quantize_model(
+    outdir: tempfile.TemporaryDirectory,
+    gguf_name: str,
+    fp16: str,
+    q_method: str,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    imatrix_path: str,
+    quant_embedding: bool,
+    embedding_tensor_method: str,
+    quant_output: bool,
+    output_tensor_method: str,
+):
     if use_imatrix:
         if train_data_file:
             train_data_path = train_data_file.name
@@ -206,18 +218,25 @@ def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_
         print("Not using imatrix quantization.")
 
     # Quantize the model
+    quantize_cmd = ["llama-quantize"]
+
+    if quant_embedding:
+        quantize_cmd += ["--token-embedding-type", embedding_tensor_method]
+    if quant_output:
+        quantize_cmd += ["--output-tensor-type", output_tensor_method]
+
+    if use_imatrix:
+        quantize_cmd += ["--imatrix", imatrix_path]
+
     quantized_gguf = str(Path(outdir)/gguf_name)
+    quantize_cmd += [fp16, quantized_gguf]
+
     if use_imatrix:
-        quantize_cmd = [
-            "llama-quantize",
-            "--imatrix", imatrix_path, fp16, quantized_gguf, imatrix_q_method
-        ]
+        quantize_cmd += [imatrix_q_method]
     else:
-        quantize_cmd = [
-            "llama-quantize",
-            fp16, quantized_gguf, q_method
-        ]
+        quantize_cmd += [q_method]
 
+    print(f"Quantizing model with {quantize_cmd}")
     result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
     if result.returncode != 0:
         stderr_str = result.stderr.decode("utf-8")
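For reference, a sketch of the command line the new builder assembles when every option is enabled. The flags (`--token-embedding-type`, `--output-tensor-type`, `--imatrix`) are the llama-quantize options used above; the model and imatrix file names are placeholders, not values from this commit:

```python
# Placeholder inputs -- illustrates the assembly order only.
quantize_cmd = ["llama-quantize"]
quantize_cmd += ["--token-embedding-type", "Q8_0"]       # quant_embedding enabled
quantize_cmd += ["--output-tensor-type", "Q8_0"]         # quant_output enabled
quantize_cmd += ["--imatrix", "imatrix.dat"]             # use_imatrix enabled
quantize_cmd += ["model-f16.gguf", "model-IQ4_NL.gguf"]  # fp16, quantized_gguf
quantize_cmd += ["IQ4_NL"]                               # imatrix_q_method
print(" ".join(quantize_cmd))
# llama-quantize --token-embedding-type Q8_0 --output-tensor-type Q8_0 \
#     --imatrix imatrix.dat model-f16.gguf model-IQ4_NL.gguf IQ4_NL
```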
@@ -227,7 +246,24 @@ def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_
     print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
     return quantized_gguf
 
-def process_model(model_id: str, q_method: str, use_imatrix: bool, imatrix_q_method: str, private_repo: bool, train_data_file, split_model: bool, split_max_tensors, split_max_size: str | None, repo_name: str, gguf_name: str, oauth_token: gr.OAuthToken | None):
+def process_model(
+    model_id: str,
+    q_method: str,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    private_repo: bool,
+    train_data_file,
+    repo_name: str,
+    gguf_name: str,
+    quant_embedding: bool,
+    embedding_tensor_method: str,
+    quant_output: bool,
+    output_tensor_method: str,
+    split_model: bool,
+    split_max_tensors,
+    split_max_size: str | None,
+    oauth_token: gr.OAuthToken | None,
+):
     # validate the oauth token
     if is_valid_token(oauth_token) is False:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
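Gradio fills a handler's parameters positionally from its registered inputs list (the `gr.OAuthToken | None` parameter is injected automatically from the login state), so the submit handler's inputs must mirror this new order; the inputs hunk further down is reordered for exactly that reason. A hypothetical direct call with placeholder values, in signature order:

```python
# Placeholder values; process_model is the function defined above.
process_model(
    "org/model",          # model_id
    "Q4_K_M",             # q_method
    False,                # use_imatrix
    "IQ4_NL",             # imatrix_q_method
    False,                # private_repo
    None,                 # train_data_file
    "model-GGUF",         # repo_name
    "model-Q4_K_M.gguf",  # gguf_name
    True, "Q8_0",         # quant_embedding, embedding_tensor_method
    True, "Q8_0",         # quant_output, output_tensor_method
    False, 256, None,     # split_model, split_max_tensors, split_max_size
    oauth_token,          # oauth_token (normally injected by Gradio)
)
```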
@@ -379,7 +415,7 @@ use_imatrix = gr.Checkbox(
 )
 
 q_method = gr.Dropdown(
-    ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
+    choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
@@ -388,7 +424,7 @@ q_method = gr.Dropdown(
 )
 
 imatrix_q_method = gr.Dropdown(
-    ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
+    choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
     label="Imatrix Quantization Method",
     info="GGML imatrix quants type",
     value="IQ4_NL",
@@ -406,7 +442,7 @@ def update_imatrix_visibility(use_imatrix):
     return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
 
 #####
-# Split model section
+# Advanced Options section
 #####
 split_model = gr.Checkbox(
     value=False,
@@ -430,6 +466,41 @@ split_max_size = gr.Textbox(
 def update_split_visibility(split_model):
     return gr.update(visible=split_model), gr.update(visible=split_model)
 
+quant_embedding = gr.Checkbox(
+    value=False,
+    label="Quant embeddings tensor",
+    info=""
+)
+embedding_tensor_method = gr.Dropdown(
+    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
+    label="Embeddings Quantization Method",
+    info="use a specific quant type for the token embeddings tensor",
+    value="Q8_0",
+    filterable=False,
+    visible=False
+)
+
+quant_output = gr.Checkbox(
+    value=False,
+    label="Quant output tensor",
+    info=""
+)
+output_tensor_method = gr.Dropdown(
+    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
+    label="Output Quantization Method",
+    info="use a specific quant type for the output.weight tensor",
+    value="Q8_0",
+    filterable=False,
+    visible=False
+)
+
+def update_embedding_tensor_visibility(quant_embedding):
+    return gr.update(visible=quant_embedding)
+
+def update_output_tensor_visibility(quant_output):
+    return gr.update(visible=quant_output)
+
+
 #####
 # Output Settings section
 #####
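The two new checkbox/dropdown pairs reuse the show/hide pattern the app already applies to the split options: the checkbox drives a hidden dropdown through `gr.update`. A self-contained sketch of that pattern, with illustrative component names:

```python
import gradio as gr

def update_visibility(enabled: bool):
    # gr.update(visible=...) shows or hides the companion dropdown.
    return gr.update(visible=enabled)

with gr.Blocks() as demo:
    quant_output = gr.Checkbox(value=False, label="Quant output tensor")
    output_tensor_method = gr.Dropdown(
        choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
        value="Q8_0",
        visible=False,  # hidden until the checkbox is ticked
    )
    quant_output.change(
        fn=update_visibility,
        inputs=quant_output,
        outputs=[output_tensor_method],
    )

demo.launch()
```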
@@ -486,6 +557,10 @@ clear_btn = gr.ClearButton(
         imatrix_q_method,
         private_repo,
         train_data_file,
+        quant_embedding,
+        embedding_tensor_method,
+        quant_output,
+        output_tensor_method,
         split_model,
         split_max_tensors,
         split_max_size,
@@ -532,6 +607,12 @@ with gr.Blocks(css=css) as demo:
     train_data_file.render()
 
     gr.Markdown("### Advanced Options")
+
+    quant_embedding.render()
+    embedding_tensor_method.render()
+    quant_output.render()
+    output_tensor_method.render()
+
     split_model.render()
     with gr.Row() as split_options: # Group split options
         split_max_tensors.render()
@@ -544,6 +625,7 @@ with gr.Blocks(css=css) as demo:
     repo_name.render()
     gguf_name.render()
 
+    # Buttons
     with gr.Row() as buttons:
         clear_btn.render()
         submit_btn.render()
@@ -564,11 +646,15 @@ with gr.Blocks(css=css) as demo:
             imatrix_q_method,
             private_repo,
             train_data_file,
-            split_model,
-            split_max_tensors,
-            split_max_size,
             repo_name,
             gguf_name,
+            quant_embedding,
+            embedding_tensor_method,
+            quant_output,
+            output_tensor_method,
+            split_model,
+            split_max_tensors,
+            split_max_size
         ],
         outputs=[
             output_label,
@@ -579,16 +665,28 @@ with gr.Blocks(css=css) as demo:
     #####
     # OnChange handlers
     #####
+    use_imatrix.change(
+        fn=update_imatrix_visibility,
+        inputs=use_imatrix,
+        outputs=[q_method, imatrix_q_method, train_data_file]
+    )
+
     split_model.change(
        fn=update_split_visibility,
        inputs=split_model,
        outputs=[split_max_tensors, split_max_size]
     )
 
-    use_imatrix.change(
-        fn=update_imatrix_visibility,
-        inputs=use_imatrix,
-        outputs=[q_method, imatrix_q_method, train_data_file]
+    quant_embedding.change(
+        fn=update_embedding_tensor_visibility,
+        inputs=quant_embedding,
+        outputs=[embedding_tensor_method]
+    )
+
+    quant_output.change(
+        fn=update_output_tensor_visibility,
+        inputs=quant_output,
+        outputs=[output_tensor_method]
     )
 
     model_id.change(
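Putting the pieces together, a hypothetical invocation of the updated quantize_model with the four new tensor-override arguments; all file names below are placeholders, not values from this commit:

```python
import tempfile

# Placeholder paths; quantize_model is the function defined in app.py above.
with tempfile.TemporaryDirectory() as outdir:
    quantized = quantize_model(
        outdir=outdir,
        gguf_name="model-Q4_K_M.gguf",
        fp16="model-f16.gguf",
        q_method="Q4_K_M",
        use_imatrix=False,
        imatrix_q_method="IQ4_NL",
        imatrix_path="",                 # ignored when use_imatrix is False
        quant_embedding=True,
        embedding_tensor_method="Q8_0",  # keep token embeddings at high precision
        quant_output=True,
        output_tensor_method="Q8_0",     # keep output.weight at high precision
    )
```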
 