Patryk Studzinski committed
Commit f740ffc · 1 parent: e0c72ee
fix: Make 8-bit quantization opt-in and gracefully handle missing bitsandbytes
Files changed:
- app/models/huggingface_local.py (+8 -4)
- requirements.txt (+3 -1)
app/models/huggingface_local.py
CHANGED

@@ -44,8 +44,9 @@ class HuggingFaceLocal(BaseLLM):
         self.model = None
         self.use_cache = use_cache
 
-        # Only enable 8-bit if …
-        …
+        # Only enable 8-bit if explicitly requested (opt-in, not by default)
+        # Default to False since bitsandbytes may not be available in all deployment environments
+        requested_8bit = use_8bit or (device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true")
         self.use_8bit = requested_8bit and HAS_BITSANDBYTES
 
         if requested_8bit and not HAS_BITSANDBYTES:

@@ -96,9 +97,12 @@ class HuggingFaceLocal(BaseLLM):
         except Exception as e:
             print(f"[{self.name}] Failed to setup 8-bit quantization: {e}")
             print(f"[{self.name}] Falling back to full precision")
+            self.use_8bit = False
             model_kwargs["torch_dtype"] = self.torch_dtype
-            model_kwargs["device_map"] = …
-            …
+            model_kwargs["device_map"] = "cpu"
+
+        # Standard loading without quantization
+        if not self.use_8bit:
             model_kwargs["torch_dtype"] = self.torch_dtype
             model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"
 
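The new gate relies on a module-level HAS_BITSANDBYTES flag that sits outside the hunks shown above. A minimal sketch of the optional-import pattern such a flag usually comes from, assuming the module detects bitsandbytes at import time (the actual detection code is not part of this diff):

import os

# Assumed detection for the HAS_BITSANDBYTES flag used by the gate above;
# the real module may name or place this differently.
try:
    import bitsandbytes  # noqa: F401
    HAS_BITSANDBYTES = True
except ImportError:
    HAS_BITSANDBYTES = False

# The opt-in gate from the patch: 8-bit runs only when it was requested
# (constructor flag, or USE_8BIT_QUANTIZATION=true on CPU) AND the
# library is actually importable.
use_8bit, device = False, "cpu"  # stand-ins for the constructor arguments
requested_8bit = use_8bit or (
    device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true"
)
effective_8bit = requested_8bit and HAS_BITSANDBYTES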
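In the second hunk, resetting self.use_8bit inside the except branch matters because the "standard loading" block below it then also runs, leaving model_kwargs fully populated for full precision. A hedged sketch of how the branches plausibly combine at load time; the try-block contents are not shown in this diff, so the BitsAndBytesConfig setup and the tiny stand-in model are assumptions:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

effective_8bit = False       # outcome of the gate sketched above
torch_dtype = torch.float32  # stand-in for self.torch_dtype

model_kwargs = {}
if effective_8bit:
    # Assumed quantization setup; an exception raised here is what the
    # patched except branch recovers from (use_8bit -> False, CPU map).
    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
    model_kwargs["device_map"] = "auto"
if not effective_8bit:
    # Mirrors the new "standard loading without quantization" branch.
    model_kwargs["torch_dtype"] = torch_dtype
    model_kwargs["device_map"] = "cpu"

# "sshleifer/tiny-gpt2" is a stand-in; the repo's real model id is not shown here.
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2", **model_kwargs)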
requirements.txt
CHANGED

@@ -3,6 +3,8 @@ uvicorn[standard]==0.24.0
 transformers==4.36.2
 accelerate==0.25.0
 huggingface_hub==0.19.4
-bitsandbytes==0.49.0
 torch>=2.1.0
 pydantic==2.5.0
+# bitsandbytes is optional for 8-bit quantization (CPU optimization)
+# Uncomment below if bitsandbytes is available on your system:
+# bitsandbytes==0.49.0
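With the pin removed, opting back in on a machine where the wheel installs cleanly means running pip install bitsandbytes==0.49.0 (or uncommenting the line above) and setting USE_8BIT_QUANTIZATION=true in the environment. A quick sanity check that both halves of the gate would pass (a hypothetical helper, not part of the repo):

import importlib.util, os

# True only when the library is installed AND the env var opts in,
# matching the requested_8bit / HAS_BITSANDBYTES gate in the patch.
has_bnb = importlib.util.find_spec("bitsandbytes") is not None
opted_in = os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true"
print(f"8-bit quantization would be active: {has_bnb and opted_in}")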