Spaces:

studzinsky
/

bielik_app_service

Sleeping

App Files Files Community

Patryk Studzinski committed 7 days ago

Commit

e0c72ee

1 Parent(s): 04a726c

fix: Add bitsandbytes to requirements and graceful fallback for 8-bit quantization

Browse files

Files changed (2) hide show

app/models/huggingface_local.py +32 -11
requirements.txt +8 -5

app/models/huggingface_local.py CHANGED Viewed

@@ -8,13 +8,21 @@ Optimizations:
 """
 from typing import List, Dict, Any, Optional
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 import asyncio
 import os
 from app.models.base_llm import BaseLLM
 class HuggingFaceLocal(BaseLLM):
     """
@@ -35,7 +43,14 @@ class HuggingFaceLocal(BaseLLM):
         self.tokenizer = None
         self.model = None
         self.use_cache = use_cache
-        self.use_8bit = use_8bit or (device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "true").lower() == "true")
         self.use_flash_attention = os.getenv("USE_FLASH_ATTENTION", "true").lower() == "true"
         # Determine device index and dtype
@@ -68,15 +83,21 @@ class HuggingFaceLocal(BaseLLM):
             }
             # Add 8-bit quantization for CPU (4-6x faster, 50% less memory)
-            if self.use_8bit:
-                print(f"[{self.name}] Using 8-bit quantization for CPU optimization")
-                bnb_config = BitsAndBytesConfig(
-                    load_in_8bit=True,
-                    bnb_8bit_compute_dtype=torch.float16,
-                    bnb_8bit_use_double_quant=True,
-                )
-                model_kwargs["quantization_config"] = bnb_config
-                model_kwargs["device_map"] = "cpu"
             else:
                 model_kwargs["torch_dtype"] = self.torch_dtype
                 model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"

 """
 from typing import List, Dict, Any, Optional
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import torch
 import asyncio
 import os
 from app.models.base_llm import BaseLLM
+# Try to import bitsandbytes, but don't fail if not available
+try:
+    from transformers import BitsAndBytesConfig
+    HAS_BITSANDBYTES = True
+except ImportError:
+    HAS_BITSANDBYTES = False
+    print("[WARNING] bitsandbytes not available - 8-bit quantization disabled")
 class HuggingFaceLocal(BaseLLM):
     """
         self.tokenizer = None
         self.model = None
         self.use_cache = use_cache
+        # Only enable 8-bit if bitsandbytes is available
+        requested_8bit = use_8bit or (device == "cpu" and os.getenv("USE_8BIT_QUANTIZATION", "true").lower() == "true")
+        self.use_8bit = requested_8bit and HAS_BITSANDBYTES
+        if requested_8bit and not HAS_BITSANDBYTES:
+            print(f"[{name}] 8-bit quantization requested but bitsandbytes not installed - falling back to full precision")
         self.use_flash_attention = os.getenv("USE_FLASH_ATTENTION", "true").lower() == "true"
         # Determine device index and dtype
             }
             # Add 8-bit quantization for CPU (4-6x faster, 50% less memory)
+            if self.use_8bit and HAS_BITSANDBYTES:
+                try:
+                    print(f"[{self.name}] Using 8-bit quantization for CPU optimization")
+                    bnb_config = BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        bnb_8bit_compute_dtype=torch.float16,
+                        bnb_8bit_use_double_quant=True,
+                    )
+                    model_kwargs["quantization_config"] = bnb_config
+                    model_kwargs["device_map"] = "cpu"
+                except Exception as e:
+                    print(f"[{self.name}] Failed to setup 8-bit quantization: {e}")
+                    print(f"[{self.name}] Falling back to full precision")
+                    model_kwargs["torch_dtype"] = self.torch_dtype
+                    model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"
             else:
                 model_kwargs["torch_dtype"] = self.torch_dtype
                 model_kwargs["device_map"] = self.device if self.device == "cuda" else "cpu"

requirements.txt CHANGED Viewed

@@ -1,5 +1,8 @@
-fastapi
-uvicorn[standard]
-transformers[torch]
-accelerate
-huggingface_hub

+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+transformers==4.36.2
+accelerate==0.25.0
+huggingface_hub==0.19.4
+bitsandbytes==0.49.0
+torch>=2.1.0
+pydantic==2.5.0