Patryk Studzinski committed
Commit f740ffc · 1 Parent(s): e0c72ee

fix: Make 8-bit quantization opt-in and gracefully handle missing bitsandbytes
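The change below keys on a HAS_BITSANDBYTES flag defined elsewhere in the module. As a minimal sketch only, assuming the module uses a plain try/except import probe (the actual guard is not part of this commit), it would look something like:

# Sketch of an optional-import guard; the module's real guard may differ.
try:
    import bitsandbytes  # noqa: F401  (presence check only)
    HAS_BITSANDBYTES = True
except ImportError:
    HAS_BITSANDBYTES = False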

app/models/huggingface_local.py CHANGED
@@ -44,8 +44,9 @@ class HuggingFaceLocal(BaseLLM):
         self.model = None
         self.use_cache = use_cache
 
-        # Only enable 8-bit if bitsandbytes is available
-        requested_8bit = use_8bit or (device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "true").lower() == "true")
+        # Only enable 8-bit if explicitly requested (opt-in, not by default)
+        # Default to False since bitsandbytes may not be available in all deployment environments
+        requested_8bit = use_8bit or (device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true")
         self.use_8bit = requested_8bit and HAS_BITSANDBYTES
 
         if requested_8bit and not HAS_BITSANDBYTES:
@@ -96,9 +97,12 @@ class HuggingFaceLocal(BaseLLM):
             except Exception as e:
                 print(f"[{self.name}] Failed to setup 8-bit quantization: {e}")
                 print(f"[{self.name}] Falling back to full precision")
+                self.use_8bit = False
                 model_kwargs["torch_dtype"] = self.torch_dtype
-                model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"
-        else:
+                model_kwargs["device_map"] = "cpu"
+
+        # Standard loading without quantization
+        if not self.use_8bit:
             model_kwargs["torch_dtype"] = self.torch_dtype
             model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"
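The "setup 8-bit quantization" step guarded by the try/except above sits outside this hunk. A minimal sketch of how that step is commonly built with transformers, assuming BitsAndBytesConfig is what the class uses and that model_name stands in for whatever identifier it passes to from_pretrained (both assumptions, not shown in this commit):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_8bit_model(model_name: str):
    # Build the 8-bit loading config; requires bitsandbytes at runtime.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    # device_map="auto" lets accelerate place the quantized weights.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
    )

If that call raises (for example because bitsandbytes is missing or fails to initialize), the revised except branch now resets use_8bit to False and pins device_map to "cpu", so the standard full-precision path below runs instead.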
 
requirements.txt CHANGED
@@ -3,6 +3,8 @@ uvicorn[standard]==0.24.0
 transformers==4.36.2
 accelerate==0.25.0
 huggingface_hub==0.19.4
-bitsandbytes==0.49.0
 torch>=2.1.0
 pydantic==2.5.0
+# bitsandbytes is optional for 8-bit quantization (CPU optimization)
+# Uncomment below if bitsandbytes is available on your system:
+# bitsandbytes==0.49.0
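With bitsandbytes commented out, 8-bit loading now stays off unless a deployment opts in explicitly. A hypothetical opt-in, using the USE_8BIT_QUANTIZATION variable named in the diff (the constructor arguments use_8bit and device appear in the diff, but the full signature and import path are assumptions):

import os

from app.models.huggingface_local import HuggingFaceLocal  # import path assumed

# Hypothetical usage; only USE_8BIT_QUANTIZATION, use_8bit and device come
# from the diff, the constructor signature is assumed.
os.environ["USE_8BIT_QUANTIZATION"] = "true"  # opt in on CPU deployments
llm = HuggingFaceLocal(device="cpu")  # 8-bit only if bitsandbytes imports

Either way, self.use_8bit = requested_8bit and HAS_BITSANDBYTES means a missing bitsandbytes falls back to full precision instead of failing at import time.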