Spaces:
Sleeping
bindings.java : enable copyLibs task [no ci] (#2949)
Browse files
* bindings.java : enable copyLibs task [no ci]
This commit adds a dependency on the copyLibs task to the sourcesJar and
jar tasks. This ensures that the libwhisper.so file is copied to the
correct location before the jar is built.
It also sets the executable bit on the gradlew file.
* bindings.java : add copyLibs dep for processResources [no ci]
Without this dependency, builds will fail after an initial build has been done.
* bindings.java : pass structs by value to native code
This commit refactors the code to pass the structs by value to the
native code. This is done by creating a ByValue class for each struct
and using it in the Java code.
The motivation for this change is that without it the application crashes
due to what I believe was memory misalignment. When the structs were
passed to the native code they would be at different memory locations.
Passing by value overcomes this issue, and considering that the structs
hold parameters (context and full params) it might be alright to do
this. These changes allow all the tests to pass.
* bindings.java : fix javadoc warnings [no ci]
* bindings.java : fix libwhisper.dylib path in build.gradle [no ci]
This commit fixes the copyLibwhisperDynlib task in the build.gradle file
to copy the correct libwhisper.dylib file from build/src.
- bindings/java/build.gradle +11 -3
- bindings/java/gradlew +0 -0
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperConstants.java +24 -0
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java +18 -23
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java +24 -15
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java +9 -8
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/callbacks/GgmlAbortCallback.java +17 -0
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAhead.java +30 -0
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAheads.java +41 -0
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java +56 -6
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java +44 -12
- bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java +1 -1
|
@@ -25,13 +25,13 @@ sourceSets {
|
|
| 25 |
}
|
| 26 |
|
| 27 |
tasks.register('copyLibwhisperDynlib', Copy) {
|
| 28 |
-
from '../../build'
|
| 29 |
-
include 'libwhisper.
|
| 30 |
into 'build/generated/resources/main/darwin'
|
| 31 |
}
|
| 32 |
|
| 33 |
tasks.register('copyLibwhisperSo', Copy) {
|
| 34 |
-
from '../../build'
|
| 35 |
include 'libwhisper.so'
|
| 36 |
into 'build/generated/resources/main/linux-x86-64'
|
| 37 |
}
|
|
@@ -55,7 +55,12 @@ java {
|
|
| 55 |
withJavadocJar()
|
| 56 |
}
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
jar {
|
|
|
|
| 59 |
exclude '**/whisper_java.exp', '**/whisper_java.lib'
|
| 60 |
}
|
| 61 |
|
|
@@ -67,6 +72,9 @@ tasks.withType(Test) {
|
|
| 67 |
useJUnitPlatform()
|
| 68 |
}
|
| 69 |
|
|
|
|
|
|
|
|
|
|
| 70 |
dependencies {
|
| 71 |
implementation "net.java.dev.jna:jna:5.13.0"
|
| 72 |
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
tasks.register('copyLibwhisperDynlib', Copy) {
|
| 28 |
+
from '../../build/src'
|
| 29 |
+
include 'libwhisper.dylib'
|
| 30 |
into 'build/generated/resources/main/darwin'
|
| 31 |
}
|
| 32 |
|
| 33 |
tasks.register('copyLibwhisperSo', Copy) {
|
| 34 |
+
from '../../build/src'
|
| 35 |
include 'libwhisper.so'
|
| 36 |
into 'build/generated/resources/main/linux-x86-64'
|
| 37 |
}
|
|
|
|
| 55 |
withJavadocJar()
|
| 56 |
}
|
| 57 |
|
| 58 |
+
sourcesJar() {
|
| 59 |
+
dependsOn copyLibs
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
jar {
|
| 63 |
+
dependsOn copyLibs
|
| 64 |
exclude '**/whisper_java.exp', '**/whisper_java.lib'
|
| 65 |
}
|
| 66 |
|
|
|
|
| 72 |
useJUnitPlatform()
|
| 73 |
}
|
| 74 |
|
| 75 |
+
test.dependsOn copyLibs
|
| 76 |
+
processResources.dependsOn copyLibs
|
| 77 |
+
|
| 78 |
dependencies {
|
| 79 |
implementation "net.java.dev.jna:jna:5.13.0"
|
| 80 |
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package io.github.ggerganov.whispercpp;
|
| 2 |
+
|
| 3 |
+
/**
|
| 4 |
+
* Presets for alignment heads in DTW token timestamps
|
| 5 |
+
*/
|
| 6 |
+
public class WhisperConstants {
|
| 7 |
+
// Alignment heads presets
|
| 8 |
+
public static final int WHISPER_AHEADS_NONE = 0;
|
| 9 |
+
public static final int WHISPER_AHEADS_TINY_EN = 1;
|
| 10 |
+
public static final int WHISPER_AHEADS_TINY = 2;
|
| 11 |
+
public static final int WHISPER_AHEADS_BASE_EN = 3;
|
| 12 |
+
public static final int WHISPER_AHEADS_BASE = 4;
|
| 13 |
+
public static final int WHISPER_AHEADS_SMALL_EN = 5;
|
| 14 |
+
public static final int WHISPER_AHEADS_SMALL = 6;
|
| 15 |
+
public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
|
| 16 |
+
public static final int WHISPER_AHEADS_MEDIUM = 8;
|
| 17 |
+
public static final int WHISPER_AHEADS_LARGE_V1 = 9;
|
| 18 |
+
public static final int WHISPER_AHEADS_LARGE_V2 = 10;
|
| 19 |
+
public static final int WHISPER_AHEADS_LARGE_V3 = 11;
|
| 20 |
+
public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
|
| 21 |
+
public static final int WHISPER_AHEADS_CUSTOM = 13;
|
| 22 |
+
public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
|
| 23 |
+
public static final int WHISPER_AHEADS_COUNT = 15;
|
| 24 |
+
}
|
|
@@ -1,7 +1,9 @@
|
|
| 1 |
package io.github.ggerganov.whispercpp;
|
| 2 |
|
|
|
|
| 3 |
import com.sun.jna.Structure;
|
| 4 |
import com.sun.jna.ptr.PointerByReference;
|
|
|
|
| 5 |
import io.github.ggerganov.whispercpp.ggml.GgmlType;
|
| 6 |
import io.github.ggerganov.whispercpp.WhisperModel;
|
| 7 |
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
|
@@ -9,33 +11,26 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
|
| 9 |
import java.util.List;
|
| 10 |
|
| 11 |
public class WhisperContext extends Structure {
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
| 15 |
/** weight type (FP32 / FP16 / QX) */
|
| 16 |
-
GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
| 17 |
/** intermediate type (FP32 or FP16) */
|
| 18 |
-
GgmlType itype = GgmlType.GGML_TYPE_F16;
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
public
|
| 25 |
-
public PointerByReference state;
|
| 26 |
|
| 27 |
/** populated by whisper_init_from_file_with_params() */
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
// }
|
| 36 |
-
//
|
| 37 |
-
// @Override
|
| 38 |
-
// protected List<String> getFieldOrder() {
|
| 39 |
-
// return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
|
| 40 |
-
// }
|
| 41 |
}
|
|
|
|
| 1 |
package io.github.ggerganov.whispercpp;
|
| 2 |
|
| 3 |
+
import com.sun.jna.NativeLong;
|
| 4 |
import com.sun.jna.Structure;
|
| 5 |
import com.sun.jna.ptr.PointerByReference;
|
| 6 |
+
import com.sun.jna.Pointer;
|
| 7 |
import io.github.ggerganov.whispercpp.ggml.GgmlType;
|
| 8 |
import io.github.ggerganov.whispercpp.WhisperModel;
|
| 9 |
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
|
|
|
| 11 |
import java.util.List;
|
| 12 |
|
| 13 |
public class WhisperContext extends Structure {
|
| 14 |
+
public NativeLong t_load_us;
|
| 15 |
+
public NativeLong t_start_us;
|
| 16 |
|
| 17 |
/** weight type (FP32 / FP16 / QX) */
|
| 18 |
+
public GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
| 19 |
/** intermediate type (FP32 or FP16) */
|
| 20 |
+
public GgmlType itype = GgmlType.GGML_TYPE_F16;
|
| 21 |
|
| 22 |
+
public WhisperContextParams.ByValue params;
|
| 23 |
+
|
| 24 |
+
public Pointer model;
|
| 25 |
+
public Pointer vocab;
|
| 26 |
+
public Pointer state;
|
|
|
|
| 27 |
|
| 28 |
/** populated by whisper_init_from_file_with_params() */
|
| 29 |
+
public Pointer path_model;
|
| 30 |
+
|
| 31 |
+
@Override
|
| 32 |
+
protected List<String> getFieldOrder() {
|
| 33 |
+
return List.of("t_load_us", "t_start_us", "wtype", "itype",
|
| 34 |
+
"params", "model", "vocab", "state", "path_model");
|
| 35 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
}
|
|
@@ -43,11 +43,11 @@ public class WhisperCpp implements AutoCloseable {
|
|
| 43 |
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
|
| 44 |
* @param params - params to use when initialising the context
|
| 45 |
*/
|
| 46 |
-
public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
| 47 |
initContextImpl(modelPath, params);
|
| 48 |
}
|
| 49 |
|
| 50 |
-
private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
| 51 |
if (ctx != null) {
|
| 52 |
lib.whisper_free(ctx);
|
| 53 |
}
|
|
@@ -69,15 +69,13 @@ public class WhisperCpp implements AutoCloseable {
|
|
| 69 |
|
| 70 |
/**
|
| 71 |
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
| 72 |
-
*
|
| 73 |
-
* - call `whisper_free_context_params()`
|
| 74 |
-
* - `Native.free(Pointer.nativeValue(pointer));`
|
| 75 |
*/
|
| 76 |
-
public WhisperContextParams getContextDefaultParams() {
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
return
|
| 81 |
}
|
| 82 |
|
| 83 |
/**
|
|
@@ -88,7 +86,7 @@ public class WhisperCpp implements AutoCloseable {
|
|
| 88 |
*
|
| 89 |
* @param strategy - GREEDY
|
| 90 |
*/
|
| 91 |
-
public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
| 92 |
Pointer pointer;
|
| 93 |
|
| 94 |
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
|
|
@@ -104,7 +102,7 @@ public class WhisperCpp implements AutoCloseable {
|
|
| 104 |
pointer = beamParamsPointer;
|
| 105 |
}
|
| 106 |
|
| 107 |
-
WhisperFullParams params = new WhisperFullParams(pointer);
|
| 108 |
params.read();
|
| 109 |
return params;
|
| 110 |
}
|
|
@@ -138,15 +136,21 @@ public class WhisperCpp implements AutoCloseable {
|
|
| 138 |
}
|
| 139 |
|
| 140 |
/**
|
| 141 |
-
* Run the entire model: PCM
|
| 142 |
* Not thread safe for same context
|
| 143 |
* Uses the specified decoding strategy to obtain the text.
|
| 144 |
*/
|
| 145 |
-
public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
| 146 |
if (ctx == null) {
|
| 147 |
throw new IllegalStateException("Model not initialised");
|
| 148 |
}
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
| 151 |
throw new IOException("Failed to process audio");
|
| 152 |
}
|
|
@@ -163,12 +167,17 @@ public class WhisperCpp implements AutoCloseable {
|
|
| 163 |
|
| 164 |
return str.toString().trim();
|
| 165 |
}
|
|
|
|
| 166 |
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
| 167 |
if (ctx == null) {
|
| 168 |
throw new IllegalStateException("Model not initialised");
|
| 169 |
}
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
throw new IOException("Failed to process audio");
|
| 173 |
}
|
| 174 |
|
|
|
|
| 43 |
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
|
| 44 |
* @param params - params to use when initialising the context
|
| 45 |
*/
|
| 46 |
+
public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
|
| 47 |
initContextImpl(modelPath, params);
|
| 48 |
}
|
| 49 |
|
| 50 |
+
private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
|
| 51 |
if (ctx != null) {
|
| 52 |
lib.whisper_free(ctx);
|
| 53 |
}
|
|
|
|
| 69 |
|
| 70 |
/**
|
| 71 |
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
| 72 |
+
* Returns a ByValue instance to ensure proper parameter passing to native code.
|
|
|
|
|
|
|
| 73 |
*/
|
| 74 |
+
public WhisperContextParams.ByValue getContextDefaultParams() {
|
| 75 |
+
WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
|
| 76 |
+
lib.whisper_context_default_params_by_ref());
|
| 77 |
+
valueParams.read();
|
| 78 |
+
return valueParams;
|
| 79 |
}
|
| 80 |
|
| 81 |
/**
|
|
|
|
| 86 |
*
|
| 87 |
* @param strategy - GREEDY
|
| 88 |
*/
|
| 89 |
+
public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
| 90 |
Pointer pointer;
|
| 91 |
|
| 92 |
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
|
|
|
|
| 102 |
pointer = beamParamsPointer;
|
| 103 |
}
|
| 104 |
|
| 105 |
+
WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
|
| 106 |
params.read();
|
| 107 |
return params;
|
| 108 |
}
|
|
|
|
| 136 |
}
|
| 137 |
|
| 138 |
/**
|
| 139 |
+
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
|
| 140 |
* Not thread safe for same context
|
| 141 |
* Uses the specified decoding strategy to obtain the text.
|
| 142 |
*/
|
| 143 |
+
public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
|
| 144 |
if (ctx == null) {
|
| 145 |
throw new IllegalStateException("Model not initialised");
|
| 146 |
}
|
| 147 |
|
| 148 |
+
/*
|
| 149 |
+
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
|
| 150 |
+
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
|
| 151 |
+
valueParams.read();
|
| 152 |
+
*/
|
| 153 |
+
|
| 154 |
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
| 155 |
throw new IOException("Failed to process audio");
|
| 156 |
}
|
|
|
|
| 167 |
|
| 168 |
return str.toString().trim();
|
| 169 |
}
|
| 170 |
+
|
| 171 |
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
| 172 |
if (ctx == null) {
|
| 173 |
throw new IllegalStateException("Model not initialised");
|
| 174 |
}
|
| 175 |
|
| 176 |
+
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
|
| 177 |
+
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
|
| 178 |
+
valueParams.read();
|
| 179 |
+
|
| 180 |
+
if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
|
| 181 |
throw new IOException("Failed to process audio");
|
| 182 |
}
|
| 183 |
|
|
@@ -38,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library {
|
|
| 38 |
* @param params Pointer to whisper_context_params
|
| 39 |
* @return Whisper context on success, null on failure
|
| 40 |
*/
|
| 41 |
-
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
|
| 42 |
|
| 43 |
/**
|
| 44 |
* Allocate (almost) all memory needed for the model by loading from a buffer.
|
|
@@ -180,12 +180,12 @@ public interface WhisperCppJnaLibrary extends Library {
|
|
| 180 |
/**
|
| 181 |
* @return the id of the specified language, returns -1 if not found.
|
| 182 |
* Examples:
|
| 183 |
-
* "de"
|
| 184 |
-
* "german"
|
| 185 |
*/
|
| 186 |
int whisper_lang_id(String lang);
|
| 187 |
|
| 188 |
-
/** @return the short string of the specified language id (e.g. 2
|
| 189 |
String whisper_lang_str(int id);
|
| 190 |
|
| 191 |
/**
|
|
@@ -268,20 +268,21 @@ public interface WhisperCppJnaLibrary extends Library {
|
|
| 268 |
void whisper_free_params(Pointer params);
|
| 269 |
|
| 270 |
/**
|
| 271 |
-
* Run the entire model: PCM
|
| 272 |
* Not thread safe for same context
|
| 273 |
* Uses the specified decoding strategy to obtain the text.
|
| 274 |
*/
|
| 275 |
-
int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
|
| 276 |
|
| 277 |
-
int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params,
|
|
|
|
| 278 |
|
| 279 |
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
| 280 |
// Result is stored in the default state of the context
|
| 281 |
// Not thread safe if executed in parallel on the same context.
|
| 282 |
// It seems this approach can offer some speedup in some cases.
|
| 283 |
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
| 284 |
-
int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
|
| 285 |
|
| 286 |
/**
|
| 287 |
* Number of generated text segments.
|
|
|
|
| 38 |
* @param params Pointer to whisper_context_params
|
| 39 |
* @return Whisper context on success, null on failure
|
| 40 |
*/
|
| 41 |
+
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);
|
| 42 |
|
| 43 |
/**
|
| 44 |
* Allocate (almost) all memory needed for the model by loading from a buffer.
|
|
|
|
| 180 |
/**
|
| 181 |
* @return the id of the specified language, returns -1 if not found.
|
| 182 |
* Examples:
|
| 183 |
+
* "de" -> 2
|
| 184 |
+
* "german" -> 2
|
| 185 |
*/
|
| 186 |
int whisper_lang_id(String lang);
|
| 187 |
|
| 188 |
+
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
|
| 189 |
String whisper_lang_str(int id);
|
| 190 |
|
| 191 |
/**
|
|
|
|
| 268 |
void whisper_free_params(Pointer params);
|
| 269 |
|
| 270 |
/**
|
| 271 |
+
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
| 272 |
* Not thread safe for same context
|
| 273 |
* Uses the specified decoding strategy to obtain the text.
|
| 274 |
*/
|
| 275 |
+
int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);
|
| 276 |
|
| 277 |
+
public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
|
| 278 |
+
//int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
|
| 279 |
|
| 280 |
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
| 281 |
// Result is stored in the default state of the context
|
| 282 |
// Not thread safe if executed in parallel on the same context.
|
| 283 |
// It seems this approach can offer some speedup in some cases.
|
| 284 |
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
| 285 |
+
int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);
|
| 286 |
|
| 287 |
/**
|
| 288 |
* Number of generated text segments.
|
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package io.github.ggerganov.whispercpp.callbacks;
|
| 2 |
+
|
| 3 |
+
import com.sun.jna.Callback;
|
| 4 |
+
|
| 5 |
+
/**
|
| 6 |
+
* Callback for aborting GGML computation
|
| 7 |
+
* Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
|
| 8 |
+
*/
|
| 9 |
+
public interface GgmlAbortCallback extends Callback {
|
| 10 |
+
/**
|
| 11 |
+
* Return true to abort the computation, false to continue
|
| 12 |
+
*
|
| 13 |
+
* @param data User data passed to the callback
|
| 14 |
+
* @return true to abort, false to continue
|
| 15 |
+
*/
|
| 16 |
+
boolean invoke(com.sun.jna.Pointer data);
|
| 17 |
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package io.github.ggerganov.whispercpp.params;
|
| 2 |
+
import com.sun.jna.*;
|
| 3 |
+
import java.util.Arrays;
|
| 4 |
+
import java.util.List;
|
| 5 |
+
|
| 6 |
+
public class WhisperAhead extends Structure {
|
| 7 |
+
|
| 8 |
+
public int n_text_layer;
|
| 9 |
+
|
| 10 |
+
public int n_head;
|
| 11 |
+
|
| 12 |
+
public WhisperAhead() {
|
| 13 |
+
super();
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
public WhisperAhead(int textLayer, int head) {
|
| 17 |
+
super();
|
| 18 |
+
this.n_text_layer = textLayer;
|
| 19 |
+
this.n_head = head;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
@Override
|
| 23 |
+
protected List<String> getFieldOrder() {
|
| 24 |
+
return Arrays.asList("n_text_layer", "n_head");
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
public static class ByReference extends WhisperAhead implements Structure.ByReference {}
|
| 28 |
+
|
| 29 |
+
public static class ByValue extends WhisperAhead implements Structure.ByValue {}
|
| 30 |
+
}
|
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package io.github.ggerganov.whispercpp.params;
|
| 2 |
+
import com.sun.jna.*;
|
| 3 |
+
import java.util.Arrays;
|
| 4 |
+
import java.util.List;
|
| 5 |
+
|
| 6 |
+
public class WhisperAheads extends Structure {
|
| 7 |
+
public NativeLong n_heads;
|
| 8 |
+
|
| 9 |
+
public Pointer heads;
|
| 10 |
+
|
| 11 |
+
public WhisperAheads() {
|
| 12 |
+
super();
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
/**
|
| 16 |
+
* Create alignment heads from an array of WhisperAhead objects
|
| 17 |
+
*/
|
| 18 |
+
public void setHeads(WhisperAhead[] aheadsArray) {
|
| 19 |
+
this.n_heads = new NativeLong(aheadsArray.length);
|
| 20 |
+
|
| 21 |
+
int structSize = aheadsArray[0].size();
|
| 22 |
+
Memory mem = new Memory(structSize * aheadsArray.length);
|
| 23 |
+
|
| 24 |
+
for (int i = 0; i < aheadsArray.length; i++) {
|
| 25 |
+
aheadsArray[i].write();
|
| 26 |
+
byte[] buffer = aheadsArray[i].getPointer().getByteArray(0, structSize);
|
| 27 |
+
mem.write(i * structSize, buffer, 0, buffer.length);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
this.heads = mem;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
@Override
|
| 34 |
+
protected List<String> getFieldOrder() {
|
| 35 |
+
return Arrays.asList("n_heads", "heads");
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
public static class ByReference extends WhisperAheads implements Structure.ByReference {}
|
| 39 |
+
|
| 40 |
+
public static class ByValue extends WhisperAheads implements Structure.ByValue {}
|
| 41 |
+
}
|
|
@@ -1,7 +1,5 @@
|
|
| 1 |
package io.github.ggerganov.whispercpp.params;
|
| 2 |
-
|
| 3 |
import com.sun.jna.*;
|
| 4 |
-
|
| 5 |
import java.util.Arrays;
|
| 6 |
import java.util.List;
|
| 7 |
|
|
@@ -11,21 +9,73 @@ import java.util.List;
|
|
| 11 |
* whisper_context_default_params()
|
| 12 |
*/
|
| 13 |
public class WhisperContextParams extends Structure {
|
| 14 |
-
|
| 15 |
public WhisperContextParams(Pointer p) {
|
| 16 |
super(p);
|
| 17 |
}
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
public CBool use_gpu;
|
| 21 |
|
| 22 |
-
/** Use
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
public void useGpu(boolean enable) {
|
| 24 |
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
|
| 25 |
}
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
@Override
|
| 28 |
protected List<String> getFieldOrder() {
|
| 29 |
-
return Arrays.asList(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
}
|
| 31 |
}
|
|
|
|
| 1 |
package io.github.ggerganov.whispercpp.params;
|
|
|
|
| 2 |
import com.sun.jna.*;
|
|
|
|
| 3 |
import java.util.Arrays;
|
| 4 |
import java.util.List;
|
| 5 |
|
|
|
|
| 9 |
* whisper_context_default_params()
|
| 10 |
*/
|
| 11 |
public class WhisperContextParams extends Structure {
|
|
|
|
| 12 |
public WhisperContextParams(Pointer p) {
|
| 13 |
super(p);
|
| 14 |
}
|
| 15 |
|
| 16 |
+
public WhisperContextParams() {
|
| 17 |
+
super();
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
/** Use GPU for inference (default = true) */
|
| 21 |
public CBool use_gpu;
|
| 22 |
|
| 23 |
+
/** Use flash attention (default = false) */
|
| 24 |
+
public CBool flash_attn;
|
| 25 |
+
|
| 26 |
+
/** CUDA device to use (default = 0) */
|
| 27 |
+
public int gpu_device;
|
| 28 |
+
|
| 29 |
+
/** [EXPERIMENTAL] Enable token-level timestamps with DTW (default = false) */
|
| 30 |
+
public CBool dtw_token_timestamps;
|
| 31 |
+
|
| 32 |
+
/** [EXPERIMENTAL] Alignment heads preset for DTW */
|
| 33 |
+
public int dtw_aheads_preset;
|
| 34 |
+
|
| 35 |
+
/** Number of top layers to use for DTW when using WHISPER_AHEADS_N_TOP_MOST preset */
|
| 36 |
+
public int dtw_n_top;
|
| 37 |
+
|
| 38 |
+
public WhisperAheads.ByValue dtw_aheads;
|
| 39 |
+
|
| 40 |
+
/** DTW memory size (internal use) */
|
| 41 |
+
public NativeLong dtw_mem_size;
|
| 42 |
+
|
| 43 |
+
/** Use GPU for inference */
|
| 44 |
public void useGpu(boolean enable) {
|
| 45 |
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
|
| 46 |
}
|
| 47 |
|
| 48 |
+
/** Use flash attention */
|
| 49 |
+
public void useFlashAttn(boolean enable) {
|
| 50 |
+
flash_attn = enable ? CBool.TRUE : CBool.FALSE;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
/** Enable DTW token-level timestamps */
|
| 54 |
+
public void enableDtwTokenTimestamps(boolean enable) {
|
| 55 |
+
dtw_token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
/** Set DTW alignment heads preset */
|
| 59 |
+
public void setDtwAheadsPreset(int preset) {
|
| 60 |
+
dtw_aheads_preset = preset;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
@Override
|
| 64 |
protected List<String> getFieldOrder() {
|
| 65 |
+
return Arrays.asList(
|
| 66 |
+
"use_gpu",
|
| 67 |
+
"flash_attn",
|
| 68 |
+
"gpu_device",
|
| 69 |
+
"dtw_token_timestamps",
|
| 70 |
+
"dtw_aheads_preset",
|
| 71 |
+
"dtw_n_top",
|
| 72 |
+
"dtw_aheads",
|
| 73 |
+
"dtw_mem_size"
|
| 74 |
+
);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
public static class ByValue extends WhisperContextParams implements Structure.ByValue {
|
| 78 |
+
public ByValue() { super(); }
|
| 79 |
+
public ByValue(Pointer p) { super(p); }
|
| 80 |
}
|
| 81 |
}
|
|
@@ -5,6 +5,7 @@ import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
|
|
| 5 |
import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
|
| 6 |
import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
|
| 7 |
import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
|
|
|
|
| 8 |
|
| 9 |
import java.util.Arrays;
|
| 10 |
import java.util.List;
|
|
@@ -16,10 +17,12 @@ import java.util.List;
|
|
| 16 |
*/
|
| 17 |
public class WhisperFullParams extends Structure {
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
public WhisperFullParams(Pointer p) {
|
| 20 |
super(p);
|
| 21 |
-
// super(p, ALIGN_MSVC);
|
| 22 |
-
// super(p, ALIGN_GNUC);
|
| 23 |
}
|
| 24 |
|
| 25 |
/** Sampling strategy for whisper_full() function. */
|
|
@@ -69,10 +72,10 @@ public class WhisperFullParams extends Structure {
|
|
| 69 |
single_segment = single ? CBool.TRUE : CBool.FALSE;
|
| 70 |
}
|
| 71 |
|
| 72 |
-
/** Flag to print special tokens (e.g., <SOT
|
| 73 |
public CBool print_special;
|
| 74 |
|
| 75 |
-
/** Flag to print special tokens (e.g., <SOT
|
| 76 |
public void printSpecial(boolean enable) {
|
| 77 |
print_special = enable ? CBool.TRUE : CBool.FALSE;
|
| 78 |
}
|
|
@@ -129,6 +132,14 @@ public class WhisperFullParams extends Structure {
|
|
| 129 |
/** Maximum tokens per segment (0, default = no limit) */
|
| 130 |
public int max_tokens;
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
/** Overwrite the audio context size (0 = use default). */
|
| 133 |
public int audio_ctx;
|
| 134 |
|
|
@@ -274,6 +285,16 @@ public class WhisperFullParams extends Structure {
|
|
| 274 |
*/
|
| 275 |
public Pointer encoder_begin_callback_user_data;
|
| 276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
/**
|
| 278 |
* Callback by each decoder to filter obtained logits.
|
| 279 |
* WhisperLogitsFilterCallback
|
|
@@ -310,17 +331,28 @@ public class WhisperFullParams extends Structure {
|
|
| 310 |
|
| 311 |
@Override
|
| 312 |
protected List<String> getFieldOrder() {
|
| 313 |
-
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
|
| 314 |
-
"
|
| 315 |
-
"
|
| 316 |
-
"
|
| 317 |
-
"
|
| 318 |
-
"
|
| 319 |
-
"
|
| 320 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
"progress_callback", "progress_callback_user_data",
|
| 322 |
"encoder_begin_callback", "encoder_begin_callback_user_data",
|
|
|
|
| 323 |
"logits_filter_callback", "logits_filter_callback_user_data",
|
| 324 |
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
|
| 325 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
}
|
|
|
|
| 5 |
import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
|
| 6 |
import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
|
| 7 |
import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
|
| 8 |
+
import io.github.ggerganov.whispercpp.callbacks.GgmlAbortCallback;
|
| 9 |
|
| 10 |
import java.util.Arrays;
|
| 11 |
import java.util.List;
|
|
|
|
| 17 |
*/
|
| 18 |
public class WhisperFullParams extends Structure {
|
| 19 |
|
| 20 |
+
public WhisperFullParams() {
|
| 21 |
+
super();
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
public WhisperFullParams(Pointer p) {
|
| 25 |
super(p);
|
|
|
|
|
|
|
| 26 |
}
|
| 27 |
|
| 28 |
/** Sampling strategy for whisper_full() function. */
|
|
|
|
| 72 |
single_segment = single ? CBool.TRUE : CBool.FALSE;
|
| 73 |
}
|
| 74 |
|
| 75 |
+
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
| 76 |
public CBool print_special;
|
| 77 |
|
| 78 |
+
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
| 79 |
public void printSpecial(boolean enable) {
|
| 80 |
print_special = enable ? CBool.TRUE : CBool.FALSE;
|
| 81 |
}
|
|
|
|
| 132 |
/** Maximum tokens per segment (0, default = no limit) */
|
| 133 |
public int max_tokens;
|
| 134 |
|
| 135 |
+
/** [EXPERIMENTAL] Enable debug mode for extra info */
|
| 136 |
+
public CBool debug_mode;
|
| 137 |
+
|
| 138 |
+
/** Enable debug mode */
|
| 139 |
+
public void enableDebugMode(boolean enable) {
|
| 140 |
+
debug_mode = enable ? CBool.TRUE : CBool.FALSE;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
/** Overwrite the audio context size (0 = use default). */
|
| 144 |
public int audio_ctx;
|
| 145 |
|
|
|
|
| 285 |
*/
|
| 286 |
public Pointer encoder_begin_callback_user_data;
|
| 287 |
|
| 288 |
+
/** Callback used to abort GGML computation */
|
| 289 |
+
public Pointer abort_callback;
|
| 290 |
+
|
| 291 |
+
/** User data for the abort_callback */
|
| 292 |
+
public Pointer abort_callback_user_data;
|
| 293 |
+
|
| 294 |
+
public void setAbortCallback(GgmlAbortCallback callback) {
|
| 295 |
+
abort_callback = CallbackReference.getFunctionPointer(callback);
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
/**
|
| 299 |
* Callback by each decoder to filter obtained logits.
|
| 300 |
* WhisperLogitsFilterCallback
|
|
|
|
| 331 |
|
| 332 |
@Override
|
| 333 |
protected List<String> getFieldOrder() {
|
| 334 |
+
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
|
| 335 |
+
"offset_ms", "duration_ms", "translate", "no_context",
|
| 336 |
+
"no_timestamps", "single_segment", "print_special",
|
| 337 |
+
"print_progress", "print_realtime", "print_timestamps",
|
| 338 |
+
"token_timestamps", "thold_pt", "thold_ptsum", "max_len",
|
| 339 |
+
"split_on_word", "max_tokens", "debug_mode", "audio_ctx",
|
| 340 |
+
"tdrz_enable", "suppress_regex", "initial_prompt",
|
| 341 |
+
"prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
| 342 |
+
"suppress_blank", "suppress_nst", "temperature",
|
| 343 |
+
"max_initial_ts", "length_penalty", "temperature_inc",
|
| 344 |
+
"entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
|
| 345 |
+
"beam_search", "new_segment_callback", "new_segment_callback_user_data",
|
| 346 |
"progress_callback", "progress_callback_user_data",
|
| 347 |
"encoder_begin_callback", "encoder_begin_callback_user_data",
|
| 348 |
+
"abort_callback", "abort_callback_user_data",
|
| 349 |
"logits_filter_callback", "logits_filter_callback_user_data",
|
| 350 |
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
|
| 351 |
}
|
| 352 |
+
|
| 353 |
+
public static class ByValue extends WhisperFullParams implements Structure.ByValue {
|
| 354 |
+
public ByValue() { super(); }
|
| 355 |
+
public ByValue(Pointer p) { super(p); }
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
}
|
|
@@ -76,7 +76,7 @@ class WhisperCppTest {
|
|
| 76 |
float[] floats = new float[b.length / 2];
|
| 77 |
|
| 78 |
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
| 79 |
-
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
| 80 |
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
| 81 |
params.print_progress = CBool.FALSE;
|
| 82 |
//params.initial_prompt = "and so my fellow Americans um, like";
|
|
|
|
| 76 |
float[] floats = new float[b.length / 2];
|
| 77 |
|
| 78 |
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
| 79 |
+
WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
| 80 |
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
| 81 |
params.print_progress = CBool.FALSE;
|
| 82 |
//params.initial_prompt = "and so my fellow Americans um, like";
|