Oleg Shulyakov committed
Commit · a35310e · 1 Parent(s): 1580529
Format
README.md
CHANGED
@@ -2,7 +2,7 @@
 title: GGUF My Repo
 emoji: 🦙
 colorFrom: gray
-colorTo:
+colorTo: gray
 sdk: docker
 hf_oauth: true
 hf_oauth_scopes:
app.py
CHANGED
@@ -60,7 +60,7 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
 
     if oauth_token is None or oauth_token.token is None:
         raise ValueError("You have to be logged in.")
-
+
     split_cmd = [
         "./llama.cpp/llama-gguf-split",
         "--split",
@@ -77,12 +77,12 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
     split_cmd.append(model_path)
     split_cmd.append(model_path_prefix)
 
-    print(f"Split command: {split_cmd}")
-
+    print(f"Split command: {split_cmd}")
+
     result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
-    print(f"Split command stdout: {result.stdout}")
-    print(f"Split command stderr: {result.stderr}")
-
+    print(f"Split command stdout: {result.stdout}")
+    print(f"Split command stderr: {result.stderr}")
+
     if result.returncode != 0:
         stderr_str = result.stderr.decode("utf-8")
         raise Exception(f"Error splitting the model: {stderr_str}")
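These hunks only normalize blank lines around the split step. For reference, here is a minimal standalone sketch of the pattern that step follows: assemble a llama-gguf-split command and run it with subprocess.run. The helper name, its defaults, and the --split-max-tensors / --split-max-size options are illustrative assumptions, not code taken from this commit.

```python
import subprocess

def run_gguf_split(model_path: str, outdir: str, split_max_tensors: int = 256,
                   split_max_size: str | None = None) -> None:
    """Hypothetical wrapper around llama.cpp's llama-gguf-split tool."""
    # The tool writes shards named <prefix>-00001-of-0000N.gguf next to the prefix.
    model_path_prefix = f"{outdir}/{model_path.split('/')[-1].removesuffix('.gguf')}"

    split_cmd = ["./llama.cpp/llama-gguf-split", "--split"]
    if split_max_size:
        split_cmd.extend(["--split-max-size", split_max_size])      # e.g. "2G"
    else:
        split_cmd.extend(["--split-max-tensors", str(split_max_tensors)])
    split_cmd.append(model_path)         # input GGUF
    split_cmd.append(model_path_prefix)  # output shard prefix
    print(f"Split command: {split_cmd}")

    # text=True already decodes stdout/stderr to str, so no extra .decode() is needed.
    result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
    print(f"Split command stdout: {result.stdout}")
    print(f"Split command stderr: {result.stderr}")
    if result.returncode != 0:
        raise Exception(f"Error splitting the model: {result.stderr}")
```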
@@ -93,7 +93,7 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
     os.remove(model_path)
 
     model_file_prefix = model_path_prefix.split('/')[-1]
-    print(f"Model file name prefix: {model_file_prefix}")
+    print(f"Model file name prefix: {model_file_prefix}")
     sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
     if sharded_model_files:
         print(f"Sharded model files: {sharded_model_files}")
@@ -111,7 +111,7 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
                 raise Exception(f"Error uploading file {file_path}: {e}")
     else:
         raise Exception("No sharded files found.")
-
+
     print("Sharded model has been uploaded successfully!")
 
 def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
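The surrounding loop pushes each shard to the Hub before reporting success. A hedged sketch of that loop, assuming the standard huggingface_hub HfApi.upload_file call; the exact arguments the Space passes are not visible in these hunks.

```python
import os
from huggingface_hub import HfApi

def upload_shards(outdir: str, model_file_prefix: str, repo_id: str, token: str) -> None:
    """Upload every shard produced by llama-gguf-split to a Hugging Face model repo."""
    sharded_model_files = [
        f for f in os.listdir(outdir)
        if f.startswith(model_file_prefix) and f.endswith(".gguf")
    ]
    if not sharded_model_files:
        raise Exception("No sharded files found.")

    api = HfApi(token=token)
    for file in sharded_model_files:
        file_path = os.path.join(outdir, file)
        print(f"Uploading file: {file_path}")
        try:
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=file,   # keep the shard name at the repo root
                repo_id=repo_id,
            )
        except Exception as e:
            raise Exception(f"Error uploading file {file_path}: {e}")
    print("Sharded model has been uploaded successfully!")
```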
@@ -184,7 +184,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
     if train_data_file:
         train_data_path = train_data_file.name
     else:
-        train_data_path = "llama.cpp/
+        train_data_path = "llama.cpp/train_data.txt" #fallback calibration dataset
 
     print(f"Training data file path: {train_data_path}")
 
@@ -194,7 +194,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         generate_importance_matrix(fp16, train_data_path, imatrix_path)
     else:
         print("Not using imatrix quantization.")
-
+
     # Quantize the model
     quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
     quantized_gguf_path = str(Path(outdir)/quantized_gguf_name)
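generate_importance_matrix and the quantize step themselves are defined outside these hunks. The sketch below shows one plausible shape for that pipeline using llama.cpp's llama-imatrix and llama-quantize tools; the binary paths, flags, and function signature are assumptions rather than code from this commit.

```python
import subprocess
from pathlib import Path

def quantize_with_imatrix(fp16_path: str, train_data_path: str, outdir: str,
                          model_name: str, imatrix_q_method: str = "IQ4_NL") -> str:
    """Illustrative imatrix + quantize pipeline around llama.cpp's CLI tools."""
    imatrix_path = str(Path(outdir) / "imatrix.dat")

    # 1) Build the importance matrix from a calibration text file.
    subprocess.run(
        ["./llama.cpp/llama-imatrix", "-m", fp16_path, "-f", train_data_path, "-o", imatrix_path],
        check=True,
    )

    # 2) Quantize with the imatrix; the output name mirrors the pattern in the diff.
    quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf"
    quantized_gguf_path = str(Path(outdir) / quantized_gguf_name)
    subprocess.run(
        ["./llama.cpp/llama-quantize", "--imatrix", imatrix_path,
         fp16_path, quantized_gguf_path, imatrix_q_method],
        check=True,
    )
    return quantized_gguf_path
```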
@@ -235,26 +235,26 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
 # {new_repo_id}
 This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
 Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
+
 ## Use with llama.cpp
 Install llama.cpp through brew (works on Mac and Linux)
-
+
 ```bash
 brew install llama.cpp
-
+
 ```
 Invoke the llama.cpp server or the CLI.
-
+
 ### CLI:
 ```bash
 llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
 ```
-
+
 ### Server:
 ```bash
 llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
 ```
-
+
 Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
 
 Step 1: Clone llama.cpp from GitHub.
@@ -271,7 +271,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
 ```
 ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
 ```
-or
+or
 ```
 ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
 ```
@@ -292,7 +292,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         )
     except Exception as e:
         raise Exception(f"Error uploading quantized model: {e}")
-
+
     if os.path.isfile(imatrix_path):
         try:
             print(f"Uploading imatrix.dat: {imatrix_path}")
@@ -343,7 +343,7 @@ imatrix_q_method = gr.Dropdown(
     ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
     label="Imatrix Quantization Method",
     info="GGML imatrix quants type",
-    value="IQ4_NL",
+    value="IQ4_NL",
     filterable=False,
     visible=False
 )
@@ -408,7 +408,7 @@ iface = gr.Interface(
 )
 
 # Create Gradio interface
-with gr.Blocks(css=css) as demo:
+with gr.Blocks(css=css) as demo:
     gr.Markdown("You must be logged in to use GGUF-my-repo.")
     gr.LoginButton(min_width=250)
 
@@ -425,7 +425,7 @@ with gr.Blocks(css=css) as demo:
 
     def update_visibility(use_imatrix):
         return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
-
+
     use_imatrix.change(
         fn=update_visibility,
         inputs=use_imatrix,
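The final hunk only touches a blank line around the visibility toggle, but the Gradio pattern it sits in is worth spelling out: a checkbox's .change event returns one gr.update per controlled component. A minimal, self-contained sketch of that wiring; the component names and the outputs list are assumptions, since they are not shown in this diff.

```python
import gradio as gr

with gr.Blocks() as demo:
    use_imatrix = gr.Checkbox(label="Use imatrix quantization")
    q_method = gr.Dropdown(["Q4_K_M", "Q5_K_M", "Q8_0"], label="Quantization Method", visible=True)
    imatrix_q_method = gr.Dropdown(["IQ4_NL", "IQ4_XS"], label="Imatrix Quantization Method", visible=False)
    train_data_file = gr.File(label="Training Data File", visible=False)

    def update_visibility(use_imatrix):
        # Hide the plain quant dropdown when imatrix is on; show the imatrix controls.
        return (gr.update(visible=not use_imatrix),
                gr.update(visible=use_imatrix),
                gr.update(visible=use_imatrix))

    use_imatrix.change(
        fn=update_visibility,
        inputs=use_imatrix,
        outputs=[q_method, imatrix_q_method, train_data_file],  # assumed wiring
    )

demo.launch()
```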