from pathlib import Path

import uvicorn
from huggingface_hub import hf_hub_download
from llama_cpp.server.app import create_app, Settings

# Model info
REPO_ID = "Qwen/Qwen2.5-3B-Instruct-GGUF"
FILENAME = "qwen2.5-3b-instruct-q4_k_m.gguf"  # Q4_K_M quant, roughly 2 GB

MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Download the model on first run; reuse the local copy afterwards.
model_path = MODEL_DIR / FILENAME
if not model_path.exists():
    model_path = Path(
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            local_dir=str(MODEL_DIR),
            local_dir_use_symlinks=False,  # keep a real file in models/ (deprecated/ignored in recent huggingface_hub)
        )
    )

# Configure the llama.cpp OpenAI-compatible server.
settings = Settings(
    model=str(model_path),
    model_alias="qwen2.5-3b-instruct",  # name clients reference in API requests
    n_ctx=4096,    # context window in tokens
    n_threads=4,   # CPU threads used for inference
    n_batch=256,   # prompt tokens processed per batch
)
app = create_app(settings)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
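
# --- Usage sketch ---
# The llama.cpp server exposes an OpenAI-compatible API, so once it is running
# it can be queried with the official `openai` client. A minimal sketch,
# assuming the server is reachable at localhost:7860 and no API key has been
# configured (the client still requires a non-empty placeholder key):
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="qwen2.5-3b-instruct",  # must match model_alias above
#       messages=[{"role": "user", "content": "Say hello in one sentence."}],
#   )
#   print(resp.choices[0].message.content)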