whisper : add OpenVINO support (#1037)
* openvino: use OpenVINO encoder inference
* openvino: add python script for OpenVINO model generation
* whisper: Fix 'unused' warnings when OpenVINO isn't enabled in build
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <[email protected]>
* whisper: Fix compilation error
* whisper: revert whisper_get_openvino_path_encoder & whisper_get_openvino_path_cache to non-const func signatures
* cmake: Add openvino-encoder as separate object target
* whisper : minor style fixes
* minor : indentation fixes
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- CMakeLists.txt +28 -0
- examples/main/main.cpp +7 -0
- models/convert-whisper-to-openvino.py +53 -0
- models/openvino-conversion-requirements.txt +2 -0
- openvino/whisper-openvino-encoder.cpp +108 -0
- openvino/whisper-openvino-encoder.h +31 -0
- whisper.cpp +120 -3
- whisper.h +18 -0
CMakeLists.txt
CHANGED

@@ -54,6 +54,8 @@ option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
 option(WHISPER_NO_FMA  "whisper: disable FMA"  OFF)
 option(WHISPER_NO_F16C "whisper: disable F16c" OFF)
 
+option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
+
 if (APPLE)
     option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
     option(WHISPER_COREML        "whisper: enable Core ML framework"     OFF)
@@ -192,6 +194,10 @@ if (WHISPER_CLBLAST)
     endif()
 endif()
 
+if( WHISPER_OPENVINO )
+    find_package(OpenVINO REQUIRED COMPONENTS Runtime)
+endif()
+
 # compiler flags
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -297,6 +303,24 @@ if (WHISPER_COREML)
     )
 endif()
 
+if (WHISPER_OPENVINO)
+    set(TARGET whisper.openvino)
+
+    add_library(${TARGET} OBJECT
+        openvino/whisper-openvino-encoder.h
+        openvino/whisper-openvino-encoder.cpp
+        )
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
+    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO)
+
+    target_link_libraries(${TARGET} PRIVATE openvino::runtime)
+endif()
+
 #
 # whisper - this is the main library of the project
 #
@@ -322,6 +346,10 @@ if (WHISPER_COREML)
     target_link_libraries(${TARGET} PRIVATE whisper.coreml)
 endif()
 
+if (WHISPER_OPENVINO)
+    target_link_libraries(${TARGET} PRIVATE whisper.openvino)
+endif()
+
 if (MSVC)
     target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
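A hypothetical configure invocation (not part of this diff): `cmake -DWHISPER_OPENVINO=1 ..` would build the new `whisper.openvino` object target and define `WHISPER_USE_OPENVINO` for the library, provided `find_package(OpenVINO)` can locate an installed OpenVINO Runtime (e.g. after sourcing OpenVINO's setupvars script).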
examples/main/main.cpp
CHANGED

@@ -95,6 +95,8 @@ struct whisper_params {
     // [TDRZ] speaker turn string
     std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
 
+    std::string openvino_encode_device = "CPU";
+
     std::vector<std::string> fname_inp = {};
     std::vector<std::string> fname_out = {};
 };
@@ -155,6 +157,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (                  arg == "--prompt")      { params.prompt = argv[++i]; }
         else if (arg == "-m"    || arg == "--model")       { params.model = argv[++i]; }
         else if (arg == "-f"    || arg == "--file")        { params.fname_inp.emplace_back(argv[++i]); }
+        else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params);
@@ -207,6 +210,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "  --prompt PROMPT              [%-7s] initial prompt\n", params.prompt.c_str());
     fprintf(stderr, "  -m FNAME, --model FNAME      [%-7s] model path\n", params.model.c_str());
     fprintf(stderr, "  -f FNAME, --file FNAME       [%-7s] input WAV file path\n", "");
+    fprintf(stderr, "  -oved D,  --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
     fprintf(stderr, "\n");
 }
@@ -809,6 +813,9 @@ int main(int argc, char ** argv) {
         return 3;
     }
 
+    // initialize openvino encoder. This has no effect on whisper.cpp builds that don't have OpenVINO configured.
+    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
+
     for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
         const auto fname_inp = params.fname_inp[f];
         const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
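With the new flag, the encode device becomes selectable from the command line; a hypothetical invocation (paths illustrative) would be `./main -m models/ggml-base.en.bin -f samples/jfk.wav -oved GPU`, with "CPU" as the default. In builds without OpenVINO, the `whisper_ctx_init_openvino_encoder` call above is a no-op that returns 0.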
models/convert-whisper-to-openvino.py
ADDED

import argparse
import torch
from whisper import load_model
import os
from openvino.tools import mo
from openvino.runtime import serialize
import shutil

def convert_encoder(hparams, encoder, mname):
    encoder.eval()

    mel = torch.zeros((1, 80, 3000))

    onnx_folder = os.path.join(os.path.dirname(__file__), "onnx_encoder")

    # create a directory to store the onnx model, and other collateral that is saved during onnx export procedure
    if not os.path.isdir(onnx_folder):
        os.makedirs(onnx_folder)

    onnx_path = os.path.join(onnx_folder, "whisper_encoder.onnx")

    torch.onnx.export(
        encoder,
        mel,
        onnx_path,
        input_names=["mel"],
        output_names=["output_features"]
    )

    # use model optimizer to convert onnx to OpenVINO IR format
    encoder_model = mo.convert_model(onnx_path, compress_to_fp16=True)
    serialize(encoder_model, xml_path='ggml-' + mname + '-encoder-openvino.xml')

    # cleanup
    if os.path.isdir(onnx_folder):
        shutil.rmtree(onnx_folder)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
    args = parser.parse_args()

    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
    hparams = whisper.dims

    encoder = whisper.encoder

    # Convert encoder to onnx
    convert_encoder(hparams, encoder, args.model)
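A hypothetical run (model name illustrative): `python models/convert-whisper-to-openvino.py --model base.en` exports the encoder to ONNX in a temporary onnx_encoder folder, converts it to OpenVINO IR, and leaves ggml-base.en-encoder-openvino.xml (plus the weights .bin that `serialize` writes alongside it) in the working directory, ready to sit next to the ggml model.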
models/openvino-conversion-requirements.txt
ADDED

openvino-dev[pytorch,onnx]
openai-whisper
openvino/whisper-openvino-encoder.cpp
ADDED

#include "openvino/whisper-openvino-encoder.h"
#include "ggml.h"
#include <openvino/openvino.hpp>
#include <iostream>

struct whisper_openvino_context {
    ov::InferRequest inferRequest;
};

struct whisper_openvino_context * whisper_openvino_init(const char* path_model,
    const char* device,
    const char* cache_dir)
{
    if (!path_model || !device) {
        fprintf(stderr, "%s: path_model and/or device is null\n", __func__);
        return nullptr;
    }

    fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n",
        __func__, path_model, device, cache_dir ? cache_dir : "(not set)");

    whisper_openvino_context *context = new whisper_openvino_context;
    try {
        ov::Core core;

        if (cache_dir) {
            // enables caching of device-specific 'blobs' during core.compile_model
            // routine. This speeds up calls to compile_model for successive runs.
            core.set_property(ov::cache_dir(cache_dir));
        }

        // read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object
        std::shared_ptr<ov::Model> model = core.read_model(path_model);

        // produce a compiled-model object, given the device ("CPU", "GPU", etc.)
        auto compiledModel = core.compile_model(model, device);

        // from the compiled model object, create an infer request. This is the object that
        // we will use later on to trigger inference execution.
        context->inferRequest = compiledModel.create_infer_request();
    }
    catch (const std::exception& error) {
        std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl;
        delete context;
        context = nullptr;
    }

    return context;
}

void whisper_openvino_free(struct whisper_openvino_context * ctx) {
    if (ctx) {
        delete ctx;
    }
}

int whisper_openvino_encode(
    whisper_openvino_context* ctx,
    ggml_tensor* mel,
    ggml_tensor* out) {

    if (!ctx || !mel || !out) {
        fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__);
        return 0;
    }

    if (mel->n_dims != 2) {
        fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, mel->n_dims);
        return 0;
    }

    if (out->n_dims != 2) {
        fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, out->n_dims);
        return 0;
    }

    try {
        // wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set as input tensor to infer request
        {
            // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
            ov::Shape input_shape = { 1, (unsigned long long)mel->ne[1], (unsigned long long)mel->ne[0] };
            ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] };
            ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides);
            ctx->inferRequest.set_input_tensor(input_tensor);
        }

        // wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set as output tensor to infer request
        {
            // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
            ov::Shape output_shape = { 1, (unsigned long long)out->ne[1], (unsigned long long)out->ne[0] };
            ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] };
            ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides);
            ctx->inferRequest.set_output_tensor(out_tensor);
        }

        // run inference
        ctx->inferRequest.infer();
    }
    catch (const std::exception& error) {
        std::cout << "in openvino encode inference execution routine: exception: " << error.what() << std::endl;
        return 0;
    }

    return 1;
}
openvino/whisper-openvino-encoder.h
ADDED

// Wrapper of the OpenVINO Whisper Encoder model
//

#if __cplusplus
extern "C" {
#endif

struct whisper_openvino_context;

// initialize openvino encoder, given path to model xml, device ("CPU", "GPU", etc.), and
// path to cache_dir. Returns null upon failure.
struct whisper_openvino_context * whisper_openvino_init(const char * path_model,
    const char * device,
    const char * cache_dir);

// clean up a ctx previously returned from whisper_openvino_init()
void whisper_openvino_free(struct whisper_openvino_context * ctx);

struct ggml_tensor;

// Perform encode using OpenVINO.
// Returns 1 on success
// Returns 0 on failure
int whisper_openvino_encode(
    whisper_openvino_context* ctx,
    ggml_tensor* mel,
    ggml_tensor* out);

#if __cplusplus
}
#endif
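For orientation, here is a minimal sketch (not part of this commit) of driving the wrapper above directly; the IR path and the output dimensions (n_state = 512, n_ctx = 1500 for the base model) are assumptions, and a real caller would first fill `mel` with log-mel data:

#include "openvino/whisper-openvino-encoder.h"
#include "ggml.h"
#include <cstdio>

int main() {
    // scratch ggml context holding the two 2-D tensors the wrapper expects
    struct ggml_init_params params = { 16u*1024*1024, nullptr, false };
    struct ggml_context * gctx = ggml_init(params);

    // encoder input: 3000 mel frames x 80 mel bins (ne[0] = 3000, ne[1] = 80)
    struct ggml_tensor * mel = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 3000, 80);

    // encoder output; 512 x 1500 assumes the base model
    struct ggml_tensor * out = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 512, 1500);

    struct whisper_openvino_context * ov = whisper_openvino_init(
        "ggml-base.en-encoder-openvino.xml" /* assumed path */, "CPU", nullptr);

    if (ov && whisper_openvino_encode(ov, mel, out)) {
        printf("openvino encode succeeded\n");
    }

    whisper_openvino_free(ov);
    ggml_free(gctx);
    return 0;
}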
whisper.cpp
CHANGED

@@ -3,6 +3,10 @@
 #include "coreml/whisper-encoder.h"
 #endif
 
+#if WHISPER_USE_OPENVINO
+#include "openvino/whisper-openvino-encoder.h"
+#endif
+
 #include "ggml.h"
 
 #include <algorithm>
@@ -660,6 +664,10 @@ struct whisper_state {
     whisper_coreml_context * ctx_coreml = nullptr;
 #endif
 
+#ifdef WHISPER_USE_OPENVINO
+    whisper_openvino_context * ctx_openvino = nullptr;
+#endif
+
     // [EXPERIMENTAL] token-level timestamps data
     int64_t t_beg = 0;
     int64_t t_last = 0;
@@ -1478,7 +1486,13 @@ static bool whisper_encode_internal(
     const bool use_coreml = wstate.ctx_coreml != nullptr;
 #endif
 
-    if (!use_coreml) {
+#ifndef WHISPER_USE_OPENVINO
+    const bool use_openvino = false;
+#else
+    const bool use_openvino = wstate.ctx_openvino != nullptr;
+#endif
+
+    if (!use_coreml && !use_openvino) {
     // convolution + gelu
     {
         wstate.use_buf(ctx0, 1);
@@ -1777,8 +1791,7 @@ static bool whisper_encode_internal(
         }
     }
 #ifdef WHISPER_USE_COREML
-    else
-    {
+    else if (use_coreml) {
         wstate.use_buf(ctx0, -1);
 
         cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
@@ -1786,6 +1799,17 @@ static bool whisper_encode_internal(
         whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
     }
 #endif
+#ifdef WHISPER_USE_OPENVINO
+    else if (use_openvino) {
+        wstate.use_buf(ctx0, -1);
+
+        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+
+        if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
+            return false;
+        }
+    }
+#endif
 
     // cur
     //{
@@ -2628,6 +2652,31 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
 }
 #endif
 
+#ifdef WHISPER_USE_OPENVINO
+// replace .bin with -encoder-openvino.xml
+static std::string whisper_get_openvino_path_encoder(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += "-encoder-openvino.xml";
+
+    return path_bin;
+}
+
+static std::string whisper_get_openvino_path_cache(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += "-encoder-openvino-cache";
+
+    return path_bin;
+}
+#endif
+
 struct whisper_state * whisper_init_state(whisper_context * ctx) {
     whisper_state * state = new whisper_state;
 
@@ -2694,6 +2743,58 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     return state;
 }
 
+int whisper_ctx_init_openvino_encoder(struct whisper_context* ctx,
+    const char* openvino_model_path,
+    const char* openvino_device,
+    const char* openvino_cache_dir)
+{
+#ifndef WHISPER_USE_OPENVINO
+    (void)(ctx);
+    (void)(openvino_model_path);
+    (void)(openvino_device);
+    (void)(openvino_cache_dir);
+    return 0;
+#else
+    if (!openvino_model_path && ctx->path_model.empty())
+    {
+        fprintf(stderr, "%s: openvino_model_path is nullptr, and ctx has no model_path set.\n", __func__);
+        return 0;
+    }
+
+    std::string path_openvino;
+    if (!openvino_model_path) {
+        // if openvino_model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
+        path_openvino = whisper_get_openvino_path_encoder(ctx->path_model);
+    }
+    else {
+        path_openvino = openvino_model_path;
+    }
+
+    std::string path_openvino_cache_dir;
+    if (!openvino_cache_dir) {
+        // if openvino_cache_dir is not set, set it as a dir residing next to ggml-<model>.bin
+        path_openvino_cache_dir = whisper_get_openvino_path_cache(ctx->path_model);
+    }
+    else {
+        path_openvino_cache_dir = openvino_cache_dir;
+    }
+
+    fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_openvino.c_str());
+    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+
+    ctx->state->ctx_openvino = whisper_openvino_init(path_openvino.c_str(), openvino_device, path_openvino_cache_dir.c_str());
+    if (!ctx->state->ctx_openvino) {
+        fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_openvino.c_str());
+        return 0;
+    }
+    else {
+        fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
+    }
+
+    return 1;
+#endif
+}
+
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
 
     fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
@@ -2848,6 +2949,13 @@ void whisper_free_state(struct whisper_state * state)
         }
 #endif
 
+#ifdef WHISPER_USE_OPENVINO
+        if (state->ctx_openvino != nullptr) {
+            whisper_openvino_free(state->ctx_openvino);
+            state->ctx_openvino = nullptr;
+        }
+#endif
+
         delete state;
     }
 }
@@ -3287,6 +3395,14 @@ static int whisper_has_coreml(void) {
 #endif
 }
 
+static int whisper_has_openvino(void) {
+#ifdef WHISPER_USE_OPENVINO
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * whisper_print_system_info(void) {
     static std::string s;
 
@@ -3304,6 +3420,7 @@ const char * whisper_print_system_info(void) {
     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
+    s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
 
     return s.c_str();
 }
whisper.h
CHANGED

@@ -110,6 +110,24 @@ extern "C" {
 
     WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
 
+    // Given a context, enable use of OpenVINO for encode inference.
+    // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
+    //             the path will be generated from the ggml model path that was passed
+    //             in to whisper_init_from_file. For example, if 'path_model' was
+    //             "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
+    //             assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
+    // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
+    // cache_dir: Optional cache directory that can speed up init time, especially for
+    //            GPU, by caching compiled 'blobs' there.
+    //            Set to nullptr if not used.
+    // Returns 1 on success. If OpenVINO is not enabled in build, this
+    // simply returns 0.
+    WHISPER_API int whisper_ctx_init_openvino_encoder(
+        struct whisper_context * ctx,
+        const char * model_path,
+        const char * device,
+        const char * cache_dir);
+
     // Frees all allocated memory
     WHISPER_API void whisper_free      (struct whisper_context * ctx);
     WHISPER_API void whisper_free_state(struct whisper_state * state);