#!/bin/bash
#
# Shortcut for quantizing HF models using named parameters and short options
#
# Usage with long options:
#   ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M
#   ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix
#   ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf
#   ./ggufy.sh --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf --split-model --split-max-tensors 256 --split-max-size 4G
#
# Usage with short options:
#   ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M
#   ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix
#   ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf
#   ./ggufy.sh -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf -split --split-max-tensors 256 --split-max-size 4G

# --- Configuration ---

# Path to convert_hf_to_gguf.py
CONVERT_SCRIPT_PATH="./llama.cpp/convert_hf_to_gguf.py"

# Path to calibration data file for imatrix
CALIBRATION_FILE_PATH="./calibration_data_v5_rc.txt"

# --- Input Arguments ---

# Required: Hugging Face model ID (e.g., meta-llama/Llama-3.2-1B)
MODEL_ID=""

# Required: Quantization method (e.g., Q4_K_M, Q5_K_M, F16)
QUANT_METHOD=""

# Optional: "true" to use imatrix, anything else or empty for false
USE_IMATRIX="false"

# Optional: Final GGUF filename (default: <model-name>-<quant-method>.gguf)
OUTPUT_FILENAME=""

# Optional: "true" to split the model, anything else or empty for false
SPLIT_MODEL="false"

# Optional: Max tensors per shard if splitting (default: 256)
SPLIT_MAX_TENSORS="256"

# Optional: Max size per shard if splitting (e.g., 2G) - overrides SPLIT_MAX_TENSORS if set
SPLIT_MAX_SIZE=""

# Optional: Quantization type for the token embeddings tensor
TOKEN_EMBEDDING_TYPE=""

# Optional: "true" to leave the output tensor unquantized
LEAVE_OUTPUT_TENSOR="false"

# Optional: Quantization type for the output tensor
OUTPUT_TENSOR_TYPE=""

# --- Parse Named Arguments ---
while [[ $# -gt 0 ]]; do
  case $1 in
    -m|--model)
      MODEL_ID="$2"
      shift 2
      ;;
    -q|--quant-method)
      QUANT_METHOD="$2"
      shift 2
      ;;
    -imatrix|--use-imatrix)
      USE_IMATRIX="true"
      shift 1
      ;;
    -o|--output-filename)
      OUTPUT_FILENAME="$2"
      shift 2
      ;;
    -split|--split-model)
      SPLIT_MODEL="true"
      shift 1
      ;;
    --split-max-tensors)
      SPLIT_MAX_TENSORS="$2"
      shift 2
      ;;
    --split-max-size)
      SPLIT_MAX_SIZE="$2"
      shift 2
      ;;
    --token-embedding-type)
      TOKEN_EMBEDDING_TYPE="$2"
      shift 2
      ;;
    --leave-output-tensor)
      LEAVE_OUTPUT_TENSOR="true"
      shift 1
      ;;
    --output-tensor-type)
      OUTPUT_TENSOR_TYPE="$2"
      shift 2
      ;;
    -h|--help)
      echo "Usage:"
      echo "  Long options:"
      echo "    $0 --model <model-id> --quant-method <method> [--use-imatrix] [--output-filename <filename>] [--split-model] [--split-max-tensors <n>] [--split-max-size <size>] [--token-embedding-type <type>] [--leave-output-tensor] [--output-tensor-type <type>]"
      echo ""
      echo "  Short options:"
      echo "    $0 -m <model-id> -q <method> [-imatrix] [-o <filename>] [-split]"
      echo ""
      echo "Examples:"
      echo "  $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M"
      echo "  $0 -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix"
      echo "  $0 --model meta-llama/Llama-2-7b --quant-method Q4_K_M --use-imatrix --output-filename Llama-2-7b-Q4_K_M.gguf"
      echo "  $0 -m meta-llama/Llama-2-7b -q Q4_K_M -imatrix -o Llama-2-7b-Q4_K_M.gguf -split --split-max-tensors 256 --split-max-size 4G"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      echo "Use --help or -h for usage information."
      exit 1
      ;;
  esac
done

# --- Validation ---
if [ -z "$MODEL_ID" ] || [ -z "$QUANT_METHOD" ]; then
  echo "Error: Both --model (-m) and --quant-method (-q) are required."
  echo
  echo "Use --help or -h for usage information."
  exit 1
fi
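# Best-effort dependency check (a minimal sketch, not exhaustive). The pipeline
# below assumes the llama.cpp tools (llama-quantize, plus llama-imatrix and
# llama-gguf-split when those code paths are taken) and the Hugging Face CLI
# ('hf') are on PATH; adjust the tool list if your llama.cpp build installs
# its binaries elsewhere.
for tool in hf python3 llama-quantize; do
  if ! command -v "$tool" > /dev/null 2>&1; then
    echo "Error: Required tool '$tool' not found on PATH."
    exit 1
  fi
done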
# --- Derived Variables ---

# Extract model name from ID
MODEL_NAME=$(basename "$MODEL_ID")

# Directory to store intermediate and final files
OUTPUT_DIR="./outputs/${MODEL_NAME}"
mkdir -p "$OUTPUT_DIR"

if [ "$USE_IMATRIX" = "true" ]; then
  if [ ! -f "$CALIBRATION_FILE_PATH" ]; then
    echo "Error: Calibration file '$CALIBRATION_FILE_PATH' not found. Please provide it."
    exit 1
  fi
fi

if [ -z "$OUTPUT_FILENAME" ]; then
  OUTPUT_FILENAME="${MODEL_NAME}-${QUANT_METHOD}.gguf"
fi

FP16_MODEL_PATH="$OUTPUT_DIR/${MODEL_NAME}-fp16.gguf"
IMATRIX_FILE_PATH="$OUTPUT_DIR/${MODEL_NAME}-imatrix.gguf"
QUANTIZED_MODEL_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME"

echo "=== Starting GGUF Conversion Pipeline ==="
echo "Model ID: $MODEL_ID"
echo "Model Name: $MODEL_NAME"
echo "Quantization Method: $QUANT_METHOD"
echo "Use Imatrix: $USE_IMATRIX"
if [ "$USE_IMATRIX" = "true" ]; then
  echo "Calibration File: $CALIBRATION_FILE_PATH"
fi
echo "Output Directory: $OUTPUT_DIR"
echo "Final Output File: $QUANTIZED_MODEL_PATH"
echo "Split Model: $SPLIT_MODEL"
if [ "$SPLIT_MODEL" = "true" ]; then
  if [ -n "$SPLIT_MAX_SIZE" ]; then
    echo "Split Max Size: $SPLIT_MAX_SIZE"
  else
    if [ -z "$SPLIT_MAX_TENSORS" ]; then
      SPLIT_MAX_TENSORS=256
    fi
    echo "Split Max Tensors: $SPLIT_MAX_TENSORS"
  fi
fi
echo "----------------------------------------"

if [ -f "$FP16_MODEL_PATH" ]; then
  echo "FP16 model '$FP16_MODEL_PATH' already exists. Skipping conversion."
else
  # --- Step 1: Check Hugging Face Login ---
  echo "Checking Hugging Face login status..."
  if ! hf auth whoami > /dev/null 2>&1; then
    echo "Error: Not logged into Hugging Face. Please run 'hf auth login' first."
    exit 1
  fi
  echo "Logged in successfully."

  # --- Step 2: Download Hugging Face Model ---
  echo "Downloading model '$MODEL_ID'..."
  MODEL_DOWNLOAD_DIR="./downloads/$MODEL_NAME"
  mkdir -p "$MODEL_DOWNLOAD_DIR"

  # Download only the files needed for conversion
  hf download "$MODEL_ID" \
    --revision main \
    --include "*.md" \
    --include "*.json" \
    --include "*.model" \
    --include "*.safetensors" \
    --include "*.bin" \
    --local-dir "$MODEL_DOWNLOAD_DIR"

  if [ $? -ne 0 ]; then
    echo "Error: Failed to download model '$MODEL_ID'."
    rm -rf "$MODEL_DOWNLOAD_DIR"
    exit 1
  fi
  echo "Model downloaded to '$MODEL_DOWNLOAD_DIR'."

  # Check for LoRA adapter (simplified check)
  if [ -f "$MODEL_DOWNLOAD_DIR/adapter_config.json" ] && [ ! -f "$MODEL_DOWNLOAD_DIR/config.json" ]; then
    echo "Error: adapter_config.json found but no config.json. This might be a LoRA adapter. Please use GGUF-my-lora."
    exit 1
  fi

  # --- Step 3: Convert HF Model to FP16 GGUF ---
  echo "Converting Hugging Face model to FP16 GGUF..."
  python3 "$CONVERT_SCRIPT_PATH" "$MODEL_DOWNLOAD_DIR" \
    --outtype f16 \
    --outfile "$FP16_MODEL_PATH"

  if [ $? -ne 0 ]; then
    echo "Error: Failed to convert model to FP16 GGUF."
    rm -f "$FP16_MODEL_PATH"
    exit 1
  fi
  echo "FP16 GGUF model created at '$FP16_MODEL_PATH'."
fi

# --- Step 4: (Optional) Generate Imatrix ---
if [ "$USE_IMATRIX" = "true" ]; then
  if [ -f "$IMATRIX_FILE_PATH" ]; then
    echo "Imatrix file '$IMATRIX_FILE_PATH' already exists. Skipping generation."
  else
    echo "Generating importance matrix (imatrix)..."
    IMATRIX_CMD=(
      llama-imatrix
      -m "$FP16_MODEL_PATH"
      -f "$CALIBRATION_FILE_PATH"
      -ngl 99
      --output-frequency 10
      -o "$IMATRIX_FILE_PATH"
    )
    echo "Running command: ${IMATRIX_CMD[*]}"
    "${IMATRIX_CMD[@]}"
    if [ $? -ne 0 ]; then
      echo "Error: Failed to generate imatrix."
      rm -f "$IMATRIX_FILE_PATH"
      exit 1
    fi
    echo "Imatrix generated at '$IMATRIX_FILE_PATH'."
  fi
fi

# --- Step 5: Quantize the GGUF Model ---
echo "Quantizing GGUF model..."
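# The quantize invocation is assembled as a bash array rather than a string so
# optional flags can be appended safely (no word-splitting or quoting surprises
# when paths contain spaces). Note that --leave-output-tensor takes precedence
# below: --output-tensor-type is only applied when the output tensor is not
# being left unquantized.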
QUANTIZE_CMD=( llama-quantize )

if [ "$USE_IMATRIX" = "true" ] && [ -f "$IMATRIX_FILE_PATH" ]; then
  QUANTIZE_CMD+=( --imatrix "$IMATRIX_FILE_PATH" )
fi

if [ -n "$TOKEN_EMBEDDING_TYPE" ]; then
  QUANTIZE_CMD+=( --token-embedding-type "$TOKEN_EMBEDDING_TYPE" )
fi

if [ "$LEAVE_OUTPUT_TENSOR" = "true" ]; then
  QUANTIZE_CMD+=( --leave-output-tensor )
else
  if [ -n "$OUTPUT_TENSOR_TYPE" ]; then
    QUANTIZE_CMD+=( --output-tensor-type "$OUTPUT_TENSOR_TYPE" )
  fi
fi

QUANTIZE_CMD+=( "$FP16_MODEL_PATH" "$QUANTIZED_MODEL_PATH" "$QUANT_METHOD" )

echo "Running command: ${QUANTIZE_CMD[*]}"
"${QUANTIZE_CMD[@]}"
if [ $? -ne 0 ]; then
  echo "Error: Failed to quantize model."
  rm -f "$QUANTIZED_MODEL_PATH"
  exit 1
fi
echo "Model quantized successfully to '$QUANTIZED_MODEL_PATH'."

# --- Step 6: (Optional) Split the Quantized Model ---
if [ "$SPLIT_MODEL" = "true" ]; then
  echo "Splitting quantized model..."
  SPLIT_CMD=( llama-gguf-split --split )
  if [ -n "$SPLIT_MAX_SIZE" ]; then
    SPLIT_CMD+=( --split-max-size "$SPLIT_MAX_SIZE" )
  else
    SPLIT_CMD+=( --split-max-tensors "$SPLIT_MAX_TENSORS" )
  fi

  # Output prefix (without .gguf extension)
  OUTPUT_PREFIX="${QUANTIZED_MODEL_PATH%.gguf}"
  SPLIT_CMD+=( "$QUANTIZED_MODEL_PATH" "$OUTPUT_PREFIX" )

  echo "Running command: ${SPLIT_CMD[*]}"
  "${SPLIT_CMD[@]}"
  if [ $? -ne 0 ]; then
    echo "Error: Failed to split model."
    exit 1
  fi

  # Remove the original unsplit file
  if [ -f "$QUANTIZED_MODEL_PATH" ]; then
    rm "$QUANTIZED_MODEL_PATH"
    echo "Removed original unsplit file '$QUANTIZED_MODEL_PATH'."
  fi
  echo "Model split successfully. Shards are in '$OUTPUT_DIR' with prefix '$OUTPUT_PREFIX'."
else
  echo "Model splitting skipped."
fi

echo "=== GGUF Conversion Pipeline Completed Successfully ==="
if [ "$SPLIT_MODEL" = "true" ]; then
  echo "Check directory '$OUTPUT_DIR' for split GGUF files."
else
  echo "Final GGUF file is located at: $QUANTIZED_MODEL_PATH"
fi
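# Optional sanity check (not run automatically): a short generation with
# llama-cli from llama.cpp confirms the quantized file loads and produces
# tokens. For a split model, point -m at the first shard; llama.cpp should
# pick up the remaining shards from the shared prefix.
#
#   llama-cli -m "$QUANTIZED_MODEL_PATH" -p "Hello, world" -n 32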