Patryk Studzinski committed
Commit f740ffc · 1 parent: e0c72ee
fix: Make 8-bit quantization opt-in and gracefully handle missing bitsandbytes
Files changed:
- app/models/huggingface_local.py (+8 -4)
- requirements.txt (+3 -1)
app/models/huggingface_local.py
CHANGED

@@ -44,8 +44,9 @@ class HuggingFaceLocal(BaseLLM):
         self.model = None
         self.use_cache = use_cache
 
-        # Only enable 8-bit if …
-        …
+        # Only enable 8-bit if explicitly requested (opt-in, not by default)
+        # Default to False since bitsandbytes may not be available in all deployment environments
+        requested_8bit = use_8bit or (device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true")
         self.use_8bit = requested_8bit and HAS_BITSANDBYTES
 
         if requested_8bit and not HAS_BITSANDBYTES:

@@ -96,9 +97,12 @@ class HuggingFaceLocal(BaseLLM):
         except Exception as e:
             print(f"[{self.name}] Failed to setup 8-bit quantization: {e}")
             print(f"[{self.name}] Falling back to full precision")
+            self.use_8bit = False
             model_kwargs["torch_dtype"] = self.torch_dtype
-            model_kwargs["device_map"] = …
-            …
+            model_kwargs["device_map"] = "cpu"
+
+        # Standard loading without quantization
+        if not self.use_8bit:
             model_kwargs["torch_dtype"] = self.torch_dtype
             model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"
 
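The new gate relies on a module-level HAS_BITSANDBYTES flag that sits outside the hunks shown above. A minimal sketch of the optional-import pattern such a flag usually comes from, assuming the module detects bitsandbytes at import time (the actual detection code is not part of this diff):

import os

# Assumed detection for the HAS_BITSANDBYTES flag used by the gate above;
# the real module may name or place this differently.
try:
    import bitsandbytes  # noqa: F401
    HAS_BITSANDBYTES = True
except ImportError:
    HAS_BITSANDBYTES = False

# The opt-in gate from the patch: 8-bit runs only when it was requested
# (constructor flag, or USE_8BIT_QUANTIZATION=true on CPU) AND the
# library is actually importable.
use_8bit, device = False, "cpu"  # stand-ins for the constructor arguments
requested_8bit = use_8bit or (
    device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true"
)
effective_8bit = requested_8bit and HAS_BITSANDBYTES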
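In the second hunk, resetting self.use_8bit inside the except branch matters because the "standard loading" block below it then also runs, leaving model_kwargs fully populated for full precision. A hedged sketch of how the branches plausibly combine at load time; the try-block contents are not shown in this diff, so the BitsAndBytesConfig setup and the tiny stand-in model are assumptions:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

effective_8bit = False       # outcome of the gate sketched above
torch_dtype = torch.float32  # stand-in for self.torch_dtype

model_kwargs = {}
if effective_8bit:
    # Assumed quantization setup; an exception raised here is what the
    # patched except branch recovers from (use_8bit -> False, CPU map).
    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
    model_kwargs["device_map"] = "auto"
if not effective_8bit:
    # Mirrors the new "standard loading without quantization" branch.
    model_kwargs["torch_dtype"] = torch_dtype
    model_kwargs["device_map"] = "cpu"

# "sshleifer/tiny-gpt2" is a stand-in; the repo's real model id is not shown here.
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2", **model_kwargs)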
requirements.txt
CHANGED

@@ -3,6 +3,8 @@ uvicorn[standard]==0.24.0
 transformers==4.36.2
 accelerate==0.25.0
 huggingface_hub==0.19.4
-bitsandbytes==0.49.0
 torch>=2.1.0
 pydantic==2.5.0
+# bitsandbytes is optional for 8-bit quantization (CPU optimization)
+# Uncomment below if bitsandbytes is available on your system:
+# bitsandbytes==0.49.0
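With the pin removed, opting back in on a machine where the wheel installs cleanly means running pip install bitsandbytes==0.49.0 (or uncommenting the line above) and setting USE_8BIT_QUANTIZATION=true in the environment. A quick sanity check that both halves of the gate would pass (a hypothetical helper, not part of the repo):

import importlib.util, os

# True only when the library is installed AND the env var opts in,
# matching the requested_8bit / HAS_BITSANDBYTES gate in the patch.
has_bnb = importlib.util.find_spec("bitsandbytes") is not None
opted_in = os.getenv("USE_8BIT_QUANTIZATION", "false").lower() == "true"
print(f"8-bit quantization would be active: {has_bnb and opted_in}")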