Oleg Shulyakov committed · Commit ab4a7f4 · Parent(s): 9250e33
Add embedding and output quantization
app.py CHANGED

@@ -189,7 +189,19 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
 
     return fp16_model
 
-def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_method, use_imatrix, imatrix_q_method, imatrix_path):
+def quantize_model(
+    outdir: tempfile.TemporaryDirectory,
+    gguf_name: str,
+    fp16: str,
+    q_method: str,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    imatrix_path: str,
+    quant_embedding: bool,
+    embedding_tensor_method: str,
+    quant_output: bool,
+    output_tensor_method: str,
+):
     if use_imatrix:
         if train_data_file:
             train_data_path = train_data_file.name
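The four new parameters feed llama-quantize's `--token-embedding-type` and `--output-tensor-type` flags in the next hunk. A hypothetical call under the new signature (all values invented for illustration):

```python
# Hypothetical invocation of the new signature (paths and values invented):
quantize_model(
    outdir=outdir,                       # tempfile.TemporaryDirectory from the caller
    gguf_name="model.Q4_K_M.gguf",
    fp16="/tmp/outdir/model.fp16.gguf",  # converted FP16 GGUF
    q_method="Q4_K_M",
    use_imatrix=False,
    imatrix_q_method="IQ4_NL",           # ignored when use_imatrix is False
    imatrix_path="",
    quant_embedding=True,                # quantize token embeddings separately
    embedding_tensor_method="Q8_0",
    quant_output=False,
    output_tensor_method="Q8_0",
)
```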
@@ -206,18 +218,25 @@ def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_
         print("Not using imatrix quantization.")
 
     # Quantize the model
+    quantize_cmd = ["llama-quantize"]
+
+    if quant_embedding:
+        quantize_cmd += ["--token-embedding-type", embedding_tensor_method]
+    if quant_output:
+        quantize_cmd += ["--output-tensor-type", output_tensor_method]
+
+    if use_imatrix:
+        quantize_cmd += ["--imatrix", imatrix_path]
+
     quantized_gguf = str(Path(outdir)/gguf_name)
+    quantize_cmd += [fp16, quantized_gguf]
+
     if use_imatrix:
-        quantize_cmd = [
-            "llama-quantize",
-            "--imatrix", imatrix_path, fp16, quantized_gguf, imatrix_q_method
-        ]
+        quantize_cmd += [imatrix_q_method]
     else:
-        quantize_cmd = [
-            "llama-quantize",
-            fp16, quantized_gguf, q_method
-        ]
+        quantize_cmd += [q_method]
 
+    print(f"Quantizing model with {quantize_cmd}")
     result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
     if result.returncode != 0:
         stderr_str = result.stderr.decode("utf-8")
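For reference, the incremental assembly above yields argv lists like the following (file names are hypothetical): optional flags first, then the input and output paths, then the quant type.

```python
# Sketch of the assembled command for quant_output=True, use_imatrix=False
# (paths are hypothetical):
quantize_cmd = [
    "llama-quantize",
    "--output-tensor-type", "Q8_0",   # present only when quant_output is set
    "/tmp/outdir/model.fp16.gguf",    # fp16 input
    "/tmp/outdir/model.Q4_K_M.gguf",  # quantized_gguf output
    "Q4_K_M",                         # q_method
]
# subprocess.run(quantize_cmd, shell=False, capture_output=True)
```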
@@ -227,7 +246,24 @@ def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_
     print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
     return quantized_gguf
 
-def process_model(model_id: str, q_method: str, use_imatrix: bool, imatrix_q_method: str, private_repo: bool, train_data_file, split_model: bool, split_max_tensors, split_max_size: str | None, repo_name: str, gguf_name: str, oauth_token: gr.OAuthToken | None):
+def process_model(
+    model_id: str,
+    q_method: str,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    private_repo: bool,
+    train_data_file,
+    repo_name: str,
+    gguf_name: str,
+    quant_embedding: bool,
+    embedding_tensor_method: str,
+    quant_output: bool,
+    output_tensor_method: str,
+    split_model: bool,
+    split_max_tensors,
+    split_max_size: str | None,
+    oauth_token: gr.OAuthToken | None,
+):
    # validate the oauth token
     if is_valid_token(oauth_token) is False:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
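Note that the four new options sit after gguf_name and before the split options, mirroring the updated submit_btn.click inputs list further down; Gradio maps those inputs to these parameters positionally, so the two orderings must stay in sync.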
@@ -379,7 +415,7 @@ use_imatrix = gr.Checkbox(
 )
 
 q_method = gr.Dropdown(
-    ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
+    choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
@@ -388,7 +424,7 @@ q_method = gr.Dropdown(
 )
 
 imatrix_q_method = gr.Dropdown(
-    ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
+    choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
     label="Imatrix Quantization Method",
     info="GGML imatrix quants type",
     value="IQ4_NL",
@@ -406,7 +442,7 @@ def update_imatrix_visibility(use_imatrix):
     return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
 
 #####
-#
+# Advanced Options section
 #####
 split_model = gr.Checkbox(
     value=False,
@@ -430,6 +466,41 @@ split_max_size = gr.Textbox(
 def update_split_visibility(split_model):
     return gr.update(visible=split_model), gr.update(visible=split_model)
 
+quant_embedding = gr.Checkbox(
+    value=False,
+    label="Quant embeddings tensor",
+    info=""
+)
+embedding_tensor_method = gr.Dropdown(
+    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
+    label="Output Quantization Method",
+    info="use a specific quant type for the token embeddings tensor",
+    value="Q8_0",
+    filterable=False,
+    visible=False
+)
+
+quant_output = gr.Checkbox(
+    value=False,
+    label="Quant output tensor",
+    info=""
+)
+output_tensor_method = gr.Dropdown(
+    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
+    label="Output Quantization Method",
+    info="use a specific quant type for the output.weight tensor",
+    value="Q8_0",
+    filterable=False,
+    visible=False
+)
+
+def update_embedding_tensor_visibility(quant_embedding):
+    return gr.update(visible=quant_embedding)
+
+def update_output_tensor_visibility(quant_output):
+    return gr.update(visible=quant_output)
+
+
 #####
 # Output Settings section
 #####
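Both dropdowns start hidden and are revealed by their checkbox's change handler, the same show/hide pattern already used for the imatrix and split controls. A minimal self-contained sketch of that pattern (hypothetical standalone demo, not part of app.py):

```python
import gradio as gr

# Checkbox-driven visibility: the dropdown is created hidden and toggled
# by the checkbox's change event (hypothetical demo).
with gr.Blocks() as demo:
    quant_output = gr.Checkbox(value=False, label="Quant output tensor")
    output_tensor_method = gr.Dropdown(
        choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
        value="Q8_0",
        visible=False,
    )
    quant_output.change(
        fn=lambda checked: gr.update(visible=checked),
        inputs=quant_output,
        outputs=output_tensor_method,
    )

demo.launch()
```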
@@ -486,6 +557,10 @@ clear_btn = gr.ClearButton(
         imatrix_q_method,
         private_repo,
         train_data_file,
+        quant_embedding,
+        embedding_tensor_method,
+        quant_output,
+        output_tensor_method,
         split_model,
         split_max_tensors,
         split_max_size,
@@ -532,6 +607,12 @@ with gr.Blocks(css=css) as demo:
         train_data_file.render()
 
         gr.Markdown("### Advanced Options")
+
+        quant_embedding.render()
+        embedding_tensor_method.render()
+        quant_output.render()
+        output_tensor_method.render()
+
         split_model.render()
         with gr.Row() as split_options: # Group split options
             split_max_tensors.render()
@@ -544,6 +625,7 @@ with gr.Blocks(css=css) as demo:
         repo_name.render()
         gguf_name.render()
 
+        # Buttons
         with gr.Row() as buttons:
             clear_btn.render()
             submit_btn.render()
@@ -564,11 +646,15 @@ with gr.Blocks(css=css) as demo:
             imatrix_q_method,
             private_repo,
             train_data_file,
-            split_model,
-            split_max_tensors,
-            split_max_size,
             repo_name,
             gguf_name,
+            quant_embedding,
+            embedding_tensor_method,
+            quant_output,
+            output_tensor_method,
+            split_model,
+            split_max_tensors,
+            split_max_size
         ],
         outputs=[
             output_label,
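Gradio passes click inputs to the handler positionally, so the reordered list above must mirror process_model's new parameter list exactly; oauth_token is injected by Gradio from its gr.OAuthToken type hint rather than listed as an input.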
@@ -579,16 +665,28 @@ with gr.Blocks(css=css) as demo:
     #####
     # OnChange handlers
     #####
+    use_imatrix.change(
+        fn=update_imatrix_visibility,
+        inputs=use_imatrix,
+        outputs=[q_method, imatrix_q_method, train_data_file]
+    )
+
     split_model.change(
         fn=update_split_visibility,
         inputs=split_model,
         outputs=[split_max_tensors, split_max_size]
     )
 
-    use_imatrix.change(
-        fn=update_imatrix_visibility,
-        inputs=use_imatrix,
-        outputs=[q_method, imatrix_q_method, train_data_file]
+    quant_embedding.change(
+        fn=update_embedding_tensor_visibility,
+        inputs=quant_embedding,
+        outputs=[embedding_tensor_method]
+    )
+
+    quant_output.change(
+        fn=update_output_tensor_visibility,
+        inputs=quant_output,
+        outputs=[output_tensor_method]
     )
 
     model_id.change(