Oleg Shulyakov committed
Commit ab4a7f4 · Parent: 9250e33

Add embedding and output tensor quantization

Files changed (1): app.py (+118 -20)
app.py CHANGED
@@ -189,7 +189,19 @@ def download_base_model(token: str, model_id: str, outdir: tempfile.TemporaryDir
 
     return fp16_model
 
-def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_method: str, use_imatrix: bool, imatrix_q_method: str, imatrix_path: str):
+def quantize_model(
+    outdir: tempfile.TemporaryDirectory,
+    gguf_name: str,
+    fp16: str,
+    q_method: str,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    imatrix_path: str,
+    quant_embedding: bool,
+    embedding_tensor_method: str,
+    quant_output: bool,
+    output_tensor_method: str,
+):
     if use_imatrix:
         if train_data_file:
             train_data_path = train_data_file.name
@@ -206,18 +218,25 @@ def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_
         print("Not using imatrix quantization.")
 
     # Quantize the model
+    quantize_cmd = ["llama-quantize"]
+
+    if quant_embedding:
+        quantize_cmd += ["--token-embedding-type", embedding_tensor_method]
+    if quant_output:
+        quantize_cmd += ["--output-tensor-type", output_tensor_method]
+
+    if use_imatrix:
+        quantize_cmd += ["--imatrix", imatrix_path]
+
     quantized_gguf = str(Path(outdir)/gguf_name)
+    quantize_cmd += [fp16, quantized_gguf]
+
     if use_imatrix:
-        quantize_cmd = [
-            "llama-quantize",
-            "--imatrix", imatrix_path, fp16, quantized_gguf, imatrix_q_method
-        ]
+        quantize_cmd += [imatrix_q_method]
     else:
-        quantize_cmd = [
-            "llama-quantize",
-            fp16, quantized_gguf, q_method
-        ]
+        quantize_cmd += [q_method]
 
+    print(f"Quantizing model with {quantize_cmd}")
     result = subprocess.run(quantize_cmd, shell=False, capture_output=True)
     if result.returncode != 0:
         stderr_str = result.stderr.decode("utf-8")
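For reference, a sketch of the command line the new builder assembles when every option is enabled. The flags (`--token-embedding-type`, `--output-tensor-type`, `--imatrix`) are the llama-quantize options used above; the model and imatrix file names are placeholders, not values from this commit:

```python
# Placeholder inputs -- illustrates the assembly order only.
quantize_cmd = ["llama-quantize"]
quantize_cmd += ["--token-embedding-type", "Q8_0"]       # quant_embedding enabled
quantize_cmd += ["--output-tensor-type", "Q8_0"]         # quant_output enabled
quantize_cmd += ["--imatrix", "imatrix.dat"]             # use_imatrix enabled
quantize_cmd += ["model-f16.gguf", "model-IQ4_NL.gguf"]  # fp16, quantized_gguf
quantize_cmd += ["IQ4_NL"]                               # imatrix_q_method
print(" ".join(quantize_cmd))
# llama-quantize --token-embedding-type Q8_0 --output-tensor-type Q8_0 \
#     --imatrix imatrix.dat model-f16.gguf model-IQ4_NL.gguf IQ4_NL
```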
@@ -227,7 +246,24 @@ def quantize_model(outdir: tempfile.TemporaryDirectory, gguf_name: str, fp16, q_
     print(f"Quantized model path: {os.path.abspath(quantized_gguf)}")
     return quantized_gguf
 
-def process_model(model_id: str, q_method: str, use_imatrix: bool, imatrix_q_method: str, private_repo: bool, train_data_file, split_model: bool, split_max_tensors, split_max_size: str | None, repo_name: str, gguf_name: str, oauth_token: gr.OAuthToken | None):
+def process_model(
+    model_id: str,
+    q_method: str,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    private_repo: bool,
+    train_data_file,
+    repo_name: str,
+    gguf_name: str,
+    quant_embedding: bool,
+    embedding_tensor_method: str,
+    quant_output: bool,
+    output_tensor_method: str,
+    split_model: bool,
+    split_max_tensors,
+    split_max_size: str | None,
+    oauth_token: gr.OAuthToken | None,
+):
     # validate the oauth token
     if is_valid_token(oauth_token) is False:
         raise gr.Error("You must be logged in to use GGUF-my-repo")
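Gradio fills a handler's parameters positionally from its registered inputs list (the `gr.OAuthToken | None` parameter is injected automatically from the login state), so the submit handler's inputs must mirror this new order; the inputs hunk further down is reordered for exactly that reason. A hypothetical direct call with placeholder values, in signature order:

```python
# Placeholder values; process_model is the function defined above.
process_model(
    "org/model",          # model_id
    "Q4_K_M",             # q_method
    False,                # use_imatrix
    "IQ4_NL",             # imatrix_q_method
    False,                # private_repo
    None,                 # train_data_file
    "model-GGUF",         # repo_name
    "model-Q4_K_M.gguf",  # gguf_name
    True, "Q8_0",         # quant_embedding, embedding_tensor_method
    True, "Q8_0",         # quant_output, output_tensor_method
    False, 256, None,     # split_model, split_max_tensors, split_max_size
    oauth_token,          # oauth_token (normally injected by Gradio)
)
```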
@@ -379,7 +415,7 @@ use_imatrix = gr.Checkbox(
 )
 
 q_method = gr.Dropdown(
-    ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
+    choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0", "F16", "BF16"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
@@ -388,7 +424,7 @@ q_method = gr.Dropdown(
 )
 
 imatrix_q_method = gr.Dropdown(
-    ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
+    choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
     label="Imatrix Quantization Method",
     info="GGML imatrix quants type",
     value="IQ4_NL",
@@ -406,7 +442,7 @@ def update_imatrix_visibility(use_imatrix):
     return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
 
 #####
-# Split model section
+# Advanced Options section
 #####
 split_model = gr.Checkbox(
     value=False,
@@ -430,6 +466,41 @@ split_max_size = gr.Textbox(
 def update_split_visibility(split_model):
     return gr.update(visible=split_model), gr.update(visible=split_model)
 
+quant_embedding = gr.Checkbox(
+    value=False,
+    label="Quant embeddings tensor",
+    info=""
+)
+embedding_tensor_method = gr.Dropdown(
+    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
+    label="Embeddings Quantization Method",
+    info="use a specific quant type for the token embeddings tensor",
+    value="Q8_0",
+    filterable=False,
+    visible=False
+)
+
+quant_output = gr.Checkbox(
+    value=False,
+    label="Quant output tensor",
+    info=""
+)
+output_tensor_method = gr.Dropdown(
+    choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
+    label="Output Quantization Method",
+    info="use a specific quant type for the output.weight tensor",
+    value="Q8_0",
+    filterable=False,
+    visible=False
+)
+
+def update_embedding_tensor_visibility(quant_embedding):
+    return gr.update(visible=quant_embedding)
+
+def update_output_tensor_visibility(quant_output):
+    return gr.update(visible=quant_output)
+
+
 #####
 # Output Settings section
 #####
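The two new checkbox/dropdown pairs reuse the show/hide pattern the app already applies to the split options: the checkbox drives a hidden dropdown through `gr.update`. A self-contained sketch of that pattern, with illustrative component names:

```python
import gradio as gr

def update_visibility(enabled: bool):
    # gr.update(visible=...) shows or hides the companion dropdown.
    return gr.update(visible=enabled)

with gr.Blocks() as demo:
    quant_output = gr.Checkbox(value=False, label="Quant output tensor")
    output_tensor_method = gr.Dropdown(
        choices=["Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0"],
        value="Q8_0",
        visible=False,  # hidden until the checkbox is ticked
    )
    quant_output.change(
        fn=update_visibility,
        inputs=quant_output,
        outputs=[output_tensor_method],
    )

demo.launch()
```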
@@ -486,6 +557,10 @@ clear_btn = gr.ClearButton(
         imatrix_q_method,
         private_repo,
         train_data_file,
+        quant_embedding,
+        embedding_tensor_method,
+        quant_output,
+        output_tensor_method,
         split_model,
         split_max_tensors,
         split_max_size,
@@ -532,6 +607,12 @@ with gr.Blocks(css=css) as demo:
     train_data_file.render()
 
     gr.Markdown("### Advanced Options")
+
+    quant_embedding.render()
+    embedding_tensor_method.render()
+    quant_output.render()
+    output_tensor_method.render()
+
     split_model.render()
     with gr.Row() as split_options: # Group split options
         split_max_tensors.render()
@@ -544,6 +625,7 @@ with gr.Blocks(css=css) as demo:
     repo_name.render()
     gguf_name.render()
 
+    # Buttons
     with gr.Row() as buttons:
         clear_btn.render()
         submit_btn.render()
@@ -564,11 +646,15 @@ with gr.Blocks(css=css) as demo:
             imatrix_q_method,
             private_repo,
             train_data_file,
-            split_model,
-            split_max_tensors,
-            split_max_size,
             repo_name,
             gguf_name,
+            quant_embedding,
+            embedding_tensor_method,
+            quant_output,
+            output_tensor_method,
+            split_model,
+            split_max_tensors,
+            split_max_size
         ],
         outputs=[
             output_label,
@@ -579,16 +665,28 @@ with gr.Blocks(css=css) as demo:
     #####
     # OnChange handlers
     #####
+    use_imatrix.change(
+        fn=update_imatrix_visibility,
+        inputs=use_imatrix,
+        outputs=[q_method, imatrix_q_method, train_data_file]
+    )
+
     split_model.change(
        fn=update_split_visibility,
        inputs=split_model,
        outputs=[split_max_tensors, split_max_size]
     )
 
-    use_imatrix.change(
-        fn=update_imatrix_visibility,
-        inputs=use_imatrix,
-        outputs=[q_method, imatrix_q_method, train_data_file]
+    quant_embedding.change(
+        fn=update_embedding_tensor_visibility,
+        inputs=quant_embedding,
+        outputs=[embedding_tensor_method]
+    )
+
+    quant_output.change(
+        fn=update_output_tensor_visibility,
+        inputs=quant_output,
+        outputs=[output_tensor_method]
     )
 
     model_id.change(
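Putting the pieces together, a hypothetical invocation of the updated quantize_model with the four new tensor-override arguments; all file names below are placeholders, not values from this commit:

```python
import tempfile

# Placeholder paths; quantize_model is the function defined in app.py above.
with tempfile.TemporaryDirectory() as outdir:
    quantized = quantize_model(
        outdir=outdir,
        gguf_name="model-Q4_K_M.gguf",
        fp16="model-f16.gguf",
        q_method="Q4_K_M",
        use_imatrix=False,
        imatrix_q_method="IQ4_NL",
        imatrix_path="",                 # ignored when use_imatrix is False
        quant_embedding=True,
        embedding_tensor_method="Q8_0",  # keep token embeddings at high precision
        quant_output=True,
        output_tensor_method="Q8_0",     # keep output.weight at high precision
    )
```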
 