Update models/loader.py
models/loader.py (CHANGED: +6 -3)
@@ -1,9 +1,9 @@
 # models/loader.py
 import torch
-import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from backend.agents import ROLE_PROMPTS
 
+# The following configs are no longer used for CPU, but kept for future GPU use.
 QUANTIZATION_CONFIG = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -30,12 +30,15 @@ def get_model_and_tokenizer(model_name="Qwen/Qwen3-0.6B"):
     _MODEL_CACHE[model_name] = {
         "model": AutoModelForCausalLM.from_pretrained(
             model_name,
-            device_map=
-            quantization_config=
+            device_map=None,
+            quantization_config=None,
             trust_remote_code=True,
         ),
         "tokenizer": AutoTokenizer.from_pretrained(model_name)
     }
+    # Explicitly move the model to the CPU after loading
+    _MODEL_CACHE[model_name]["model"].to("cpu")
+
     return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
 
 def generate_with_model(agent_role, prompt):
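For context, here is a minimal usage sketch of the loader after this change. It assumes torch and transformers are installed, that the Qwen/Qwen3-0.6B checkpoint can be downloaded, and that models.loader is importable from the project root; the call pattern is inferred from the diff above rather than taken from the rest of the repository.

    import torch
    from models.loader import get_model_and_tokenizer

    model, tokenizer = get_model_and_tokenizer("Qwen/Qwen3-0.6B")

    # With device_map=None and quantization_config=None, the model loads in full
    # precision and is then moved to the CPU, so input tensors must stay on the CPU too.
    inputs = tokenizer("Hello, world!", return_tensors="pt")  # tensors default to CPU
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Since the added comment keeps QUANTIZATION_CONFIG around for future GPU use, re-enabling it would roughly look like the snippet below. This is a hypothetical sketch, not part of this commit: device_map="auto" is an assumption (the original value is truncated in the diff view), and it requires a CUDA device plus the bitsandbytes package.

    # Hypothetical GPU path: pass the kept QUANTIZATION_CONFIG back in instead of None.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=QUANTIZATION_CONFIG,
        trust_remote_code=True,
    )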