build : add WHISPER_COREML_ALLOW_FALLBACK to make / CMake (#812)
Files changed:
- CMakeLists.txt +21 -16
- Makefile +13 -0
- whisper.cpp +59 -60
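In short: when whisper.cpp is built with Core ML support, whisper_init_state() previously returned nullptr whenever the Core ML encoder model could not be loaded. With the new WHISPER_COREML_ALLOW_FALLBACK option (CMake) / variable (Makefile), the load failure is still logged, but initialization continues with ctx_coreml left null, so whisper_encode_internal() takes its regular ggml encoder branch (the !use_coreml path) instead of aborting.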
CMakeLists.txt
CHANGED

@@ -39,32 +39,33 @@ endif()
 
 # options
 
-option(BUILD_SHARED_LIBS …
+option(BUILD_SHARED_LIBS              "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
 
-option(WHISPER_ALL_WARNINGS …
-option(WHISPER_ALL_WARNINGS_3RD_PARTY …
+option(WHISPER_ALL_WARNINGS           "whisper: enable all compiler warnings" ON)
+option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
 
-option(WHISPER_SANITIZE_THREAD …
-option(WHISPER_SANITIZE_ADDRESS …
-option(WHISPER_SANITIZE_UNDEFINED …
+option(WHISPER_SANITIZE_THREAD        "whisper: enable thread sanitizer" OFF)
+option(WHISPER_SANITIZE_ADDRESS       "whisper: enable address sanitizer" OFF)
+option(WHISPER_SANITIZE_UNDEFINED     "whisper: enable undefined sanitizer" OFF)
 
-option(WHISPER_BUILD_TESTS …
-option(WHISPER_BUILD_EXAMPLES …
+option(WHISPER_BUILD_TESTS            "whisper: build tests" ${WHISPER_STANDALONE})
+option(WHISPER_BUILD_EXAMPLES         "whisper: build examples" ${WHISPER_STANDALONE})
 
-option(WHISPER_SUPPORT_SDL2 …
+option(WHISPER_SUPPORT_SDL2           "whisper: support for libSDL2" OFF)
 
 if (APPLE)
-    option(WHISPER_NO_ACCELERATE …
-    option(WHISPER_NO_AVX …
-    option(WHISPER_NO_AVX2 …
-    option(WHISPER_NO_FMA …
+    option(WHISPER_NO_ACCELERATE         "whisper: disable Accelerate framework" OFF)
+    option(WHISPER_NO_AVX                "whisper: disable AVX" OFF)
+    option(WHISPER_NO_AVX2               "whisper: disable AVX2" OFF)
+    option(WHISPER_NO_FMA                "whisper: disable FMA" OFF)
 
-    option(WHISPER_COREML …
+    option(WHISPER_COREML                "whisper: enable Core ML framework" OFF)
+    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 else()
-    option(WHISPER_SUPPORT_OPENBLAS …
+    option(WHISPER_SUPPORT_OPENBLAS      "whisper: support for OpenBLAS" OFF)
 endif()
 
-option(WHISPER_PERF …
+option(WHISPER_PERF                   "whisper: enable perf timings" OFF)
 
 # sanitizers
 
@@ -119,6 +120,10 @@ if (APPLE)
         else()
             message(WARNING "CoreML framework not found")
         endif()
+
+        if (WHISPER_COREML_ALLOW_FALLBACK)
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML_ALLOW_FALLBACK)
+        endif()
     endif()
 endif()
 
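For reference, a minimal sketch of a CMake configure line that exercises the new option (the build-directory layout and any other flags are assumptions, not part of the commit):

    cmake -DWHISPER_COREML=1 -DWHISPER_COREML_ALLOW_FALLBACK=1 ..
    make -j

Note one quirk visible in the hunks as committed: the CMake path defines WHISPER_USE_COREML_ALLOW_FALLBACK, while whisper.cpp tests #ifndef WHISPER_COREML_ALLOW_FALLBACK; only the Makefile path defines the latter macro directly.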
Makefile
CHANGED

@@ -123,6 +123,7 @@ endif
 ifeq ($(UNAME_M),amd64)
 	CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -133,6 +134,7 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
+
 ifndef WHISPER_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework
 	ifeq ($(UNAME_S),Darwin)
@@ -140,26 +142,36 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
+
 ifdef WHISPER_COREML
 	CXXFLAGS += -DWHISPER_USE_COREML
 	LDFLAGS += -framework Foundation -framework CoreML
+
+ifdef WHISPER_COREML_ALLOW_FALLBACK
+	CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
+endif
 endif
+
 ifdef WHISPER_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif
+
 ifdef WHISPER_GPROF
 	CFLAGS += -pg
 	CXXFLAGS += -pg
 endif
+
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	CFLAGS += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
+
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# 32-bit Raspberry Pi 1, 2, 3
 	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
 endif
+
 ifneq ($(filter armv7%,$(UNAME_M)),)
 	# 32-bit ARM, for example on Armbian or possibly raspbian
 	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
@@ -167,6 +179,7 @@ ifneq ($(filter armv7%,$(UNAME_M)),)
 	# 64-bit ARM, use these (TODO: auto-detect 64-bit)
 	# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
+
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 4
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
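Since the Makefile tests both switches with ifdef, they can be passed straight on the make command line; a minimal sketch (assuming the default build targets):

    WHISPER_COREML=1 WHISPER_COREML_ALLOW_FALLBACK=1 make -j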
whisper.cpp
CHANGED

(In the hunks below, removed lines whose text did not survive extraction are omitted; they were the same expressions at the old indentation.)

@@ -1393,18 +1393,17 @@ static bool whisper_encode_internal(
     const bool use_coreml = wstate.ctx_coreml != nullptr;
 #endif
 
-    if (!use_coreml)
-    {
+    if (!use_coreml) {
         // convolution + gelu
         {
             wstate.use_buf(ctx0, 1);
 
             cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
             cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0,
+                        model.e_conv_1_b,
+                        cur),
+                    cur);
 
             cur = ggml_gelu(ctx0, cur);
 
@@ -1412,10 +1411,10 @@ static bool whisper_encode_internal(
 
             cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
             cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0,
+                        model.e_conv_2_b,
+                        cur),
+                    cur);
 
             cur = ggml_gelu(ctx0, cur);
         }
@@ -1461,10 +1460,10 @@ static bool whisper_encode_internal(
 
             // cur = ln_0_w*cur + ln_0_b
             cur = ggml_add(ctx0,
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+                        cur),
+                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
         }
 
         // self-attention
@@ -1472,39 +1471,39 @@ static bool whisper_encode_internal(
             wstate.use_buf(ctx0, 1);
 
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+                    layer.attn_q_w,
+                    cur);
 
             Qcur = ggml_add(ctx0,
+                    ggml_repeat(ctx0,
+                        layer.attn_q_b,
+                        Qcur),
+                    Qcur);
 
             //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+                    layer.attn_k_w,
+                    cur);
 
             //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                    layer.attn_v_w,
+                    cur);
 
             Vcur = ggml_add(ctx0,
+                    ggml_repeat(ctx0,
+                        layer.attn_v_b,
+                        Vcur),
+                    Vcur);
 
             // ------
 
             wstate.use_buf(ctx0, 0);
 
+#ifdef WHISPER_USE_FLASH_ATTN
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
                     ggml_cpy(ctx0,
@@ -1529,7 +1528,7 @@ static bool whisper_encode_internal(
                         ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
 
             struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
+#else
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
                     ggml_cpy(ctx0,
@@ -1575,14 +1574,14 @@ static bool whisper_encode_internal(
                 );
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+#endif
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
             wstate.use_buf(ctx0, 1);
 
             cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
         }
 
         // projection
@@ -1590,14 +1589,14 @@ static bool whisper_encode_internal(
             wstate.use_buf(ctx0, 0);
 
             cur = ggml_mul_mat(ctx0,
+                    layer.attn_ln_1_w,
+                    cur);
 
             wstate.use_buf(ctx0, 1);
 
             cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+                    cur);
         }
 
         wstate.use_buf(ctx0, 2);
@@ -1619,31 +1618,31 @@ static bool whisper_encode_internal(
 
             // cur = mlp_ln_w*cur + mlp_ln_b
             cur = ggml_add(ctx0,
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, layer.mlp_ln_w, cur),
+                        cur),
+                    ggml_repeat(ctx0, layer.mlp_ln_b, cur));
         }
 
+#ifdef WHISPER_USE_FLASH_FF
         wstate.use_buf(ctx0, 0);
 
         cur = ggml_flash_ff(ctx0,
+                ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
+                layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
+#else
         wstate.use_buf(ctx0, 0);
 
         // fully connected
        cur = ggml_mul_mat(ctx0,
+                layer.mlp_0_w,
+                cur);
 
         wstate.use_buf(ctx0, 1);
 
         cur = ggml_add(ctx0,
+                ggml_repeat(ctx0, layer.mlp_0_b, cur),
+                cur);
 
         wstate.use_buf(ctx0, 0);
 
@@ -1654,15 +1653,15 @@ static bool whisper_encode_internal(
 
         // projection
         cur = ggml_mul_mat(ctx0,
+                layer.mlp_1_w,
+                cur);
 
         wstate.use_buf(ctx0, 0);
 
         cur = ggml_add(ctx0,
+                ggml_repeat(ctx0, layer.mlp_1_b, cur),
+                cur);
+#endif
     }
 
     wstate.use_buf(ctx0, 3);
@@ -1682,10 +1681,10 @@ static bool whisper_encode_internal(
 
         // cur = ln_f_g*cur + ln_f_b
         cur = ggml_add(ctx0,
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.e_ln_w, cur),
+                    cur),
+                ggml_repeat(ctx0, model.e_ln_b, cur));
     }
 
     wstate.use_buf(ctx0, -1);
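The re-indented encoder lines above all use the same ggml idiom: a small bias or layer-norm tensor is first materialized to the operand's shape with ggml_repeat, then combined with ggml_add / ggml_mul. Below is a self-contained sketch of the idiom, written against the ggml API roughly as bundled with whisper.cpp around this commit (signatures such as ggml_build_forward and the cgraph n_threads field have since changed; sizes and values here are arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024, // arena for tensors + graph
            /*.mem_buffer =*/ NULL,
        };
        struct ggml_context * ctx = ggml_init(params);

        // x: a 2-D activation [n_state, n_ctx]; b: a 1-D bias, like e_conv_1_b
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(x, 1.0f);
        ggml_set_f32(b, 0.5f);

        // the idiom from the diff: broadcast b to x's shape, then add
        struct ggml_tensor * y = ggml_add(ctx, ggml_repeat(ctx, b, x), x);

        struct ggml_cgraph gf = ggml_build_forward(y);
        gf.n_threads = 1;
        ggml_graph_compute(ctx, &gf);

        printf("y[0] = %f\n", ggml_get_f32_1d(y, 0)); // expect 1.5
        ggml_free(ctx);
        return 0;
    }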
@@ -2580,11 +2579,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
     if (!state->ctx_coreml) {
         fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+#ifndef WHISPER_COREML_ALLOW_FALLBACK
         return nullptr;
+#endif
     } else {
+        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
     }
 #endif
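Putting the whisper_init_state() hunk in plain terms, here is a condensed restatement of the fallback control flow (the surrounding function is elided; the comments are editorial, not from the commit):

    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
    #ifndef WHISPER_COREML_ALLOW_FALLBACK
        return nullptr; // Core ML was mandatory: abort state initialization
    #endif
        // with the fallback macro defined, execution falls through with
        // ctx_coreml == nullptr, so whisper_encode_internal() later takes
        // the !use_coreml (plain ggml) encoder path
    } else {
        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
    }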
|