from pathlib import Path

import uvicorn
from huggingface_hub import hf_hub_download
from llama_cpp.server.app import create_app, Settings

# Model info
REPO_ID = "Qwen/Qwen2.5-3B-Instruct-GGUF"
FILENAME = "qwen2.5-3b-instruct-q4_k_m.gguf"  # Q4_K_M quant, roughly 2 GB

MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Download the model on first run; reuse the local copy afterwards.
model_path = MODEL_DIR / FILENAME
if not model_path.exists():
    model_path = Path(
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            local_dir=str(MODEL_DIR),
            local_dir_use_symlinks=False,  # keep a real file in models/ (deprecated/ignored in recent huggingface_hub)
        )
    )

# Configure the llama.cpp OpenAI-compatible server.
settings = Settings(
    model=str(model_path),
    model_alias="qwen2.5-3b-instruct",  # name clients reference in API requests
    n_ctx=4096,    # context window in tokens
    n_threads=4,   # CPU threads used for inference
    n_batch=256,   # prompt tokens processed per batch
)
app = create_app(settings)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
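
# --- Usage sketch ---
# The llama.cpp server exposes an OpenAI-compatible API, so once it is running
# it can be queried with the official `openai` client. A minimal sketch,
# assuming the server is reachable at localhost:7860 and no API key has been
# configured (the client still requires a non-empty placeholder key):
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="qwen2.5-3b-instruct",  # must match model_alias above
#       messages=[{"role": "user", "content": "Say hello in one sentence."}],
#   )
#   print(resp.choices[0].message.content)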