danbev committed on
Commit
fbe8350
·
unverified ·
1 Parent(s): 0a3036a

bindings.java : enable copyLibs task [no ci] (#2949)

Browse files

* bindings.java : enable copyLibs task [no ci]

This commit adds a dependency on the copyLibs task to the sourcesJar and
jar tasks. This ensures that the libwhisper.so file is copied to the
correct location before the jar is built.

It also sets the executable bit on the gradlew file.

* bindings.java : add copyLibs dep for processResources [no ci]

Without this dependency, builds fail after an initial build has been done.

* bindings.java : pass structs by value to native code

This commit refactors the code to pass the structs by value to the
native code. This is done by creating a ByValue class for each struct
and using it in the Java code.

The motivation for this change is that without it the application crashes
due to what I believe was memory misalignment. When the structs were
passed to the native code they would be at different memory locations.
Passing by value overcomes this issue, and considering that the structs
hold parameters (context and full params) it might be alright to do
this. These changes allow all the tests to pass.

* bindings.java : fix javadoc warnings [no ci]

* bindings.java : fix libwhisper.dylib path in build.gradle [no ci]

This commit fixes the copyLibwhisperDynlib task in the build.gradle file
to copy the correct libwhisper.dylib file from build/src.

bindings/java/build.gradle CHANGED
@@ -25,13 +25,13 @@ sourceSets {
25
  }
26
 
27
  tasks.register('copyLibwhisperDynlib', Copy) {
28
- from '../../build'
29
- include 'libwhisper.dynlib'
30
  into 'build/generated/resources/main/darwin'
31
  }
32
 
33
  tasks.register('copyLibwhisperSo', Copy) {
34
- from '../../build'
35
  include 'libwhisper.so'
36
  into 'build/generated/resources/main/linux-x86-64'
37
  }
@@ -55,7 +55,12 @@ java {
55
  withJavadocJar()
56
  }
57
 
 
 
 
 
58
  jar {
 
59
  exclude '**/whisper_java.exp', '**/whisper_java.lib'
60
  }
61
 
@@ -67,6 +72,9 @@ tasks.withType(Test) {
67
  useJUnitPlatform()
68
  }
69
 
 
 
 
70
  dependencies {
71
  implementation "net.java.dev.jna:jna:5.13.0"
72
  testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
 
25
  }
26
 
27
  tasks.register('copyLibwhisperDynlib', Copy) {
28
+ from '../../build/src'
29
+ include 'libwhisper.dylib'
30
  into 'build/generated/resources/main/darwin'
31
  }
32
 
33
  tasks.register('copyLibwhisperSo', Copy) {
34
+ from '../../build/src'
35
  include 'libwhisper.so'
36
  into 'build/generated/resources/main/linux-x86-64'
37
  }
 
55
  withJavadocJar()
56
  }
57
 
58
+ sourcesJar() {
59
+ dependsOn copyLibs
60
+ }
61
+
62
  jar {
63
+ dependsOn copyLibs
64
  exclude '**/whisper_java.exp', '**/whisper_java.lib'
65
  }
66
 
 
72
  useJUnitPlatform()
73
  }
74
 
75
+ test.dependsOn copyLibs
76
+ processResources.dependsOn copyLibs
77
+
78
  dependencies {
79
  implementation "net.java.dev.jna:jna:5.13.0"
80
  testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
bindings/java/gradlew CHANGED
File without changes
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperConstants.java ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package io.github.ggerganov.whispercpp;
2
+
3
+ /**
4
+ * Presets for alignment heads in DTW token timestamps
5
+ */
6
+ public class WhisperConstants {
7
+ // Alignment heads presets
8
+ public static final int WHISPER_AHEADS_NONE = 0;
9
+ public static final int WHISPER_AHEADS_TINY_EN = 1;
10
+ public static final int WHISPER_AHEADS_TINY = 2;
11
+ public static final int WHISPER_AHEADS_BASE_EN = 3;
12
+ public static final int WHISPER_AHEADS_BASE = 4;
13
+ public static final int WHISPER_AHEADS_SMALL_EN = 5;
14
+ public static final int WHISPER_AHEADS_SMALL = 6;
15
+ public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
16
+ public static final int WHISPER_AHEADS_MEDIUM = 8;
17
+ public static final int WHISPER_AHEADS_LARGE_V1 = 9;
18
+ public static final int WHISPER_AHEADS_LARGE_V2 = 10;
19
+ public static final int WHISPER_AHEADS_LARGE_V3 = 11;
20
+ public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
21
+ public static final int WHISPER_AHEADS_CUSTOM = 13;
22
+ public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
23
+ public static final int WHISPER_AHEADS_COUNT = 15;
24
+ }
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java CHANGED
@@ -1,7 +1,9 @@
1
  package io.github.ggerganov.whispercpp;
2
 
 
3
  import com.sun.jna.Structure;
4
  import com.sun.jna.ptr.PointerByReference;
 
5
  import io.github.ggerganov.whispercpp.ggml.GgmlType;
6
  import io.github.ggerganov.whispercpp.WhisperModel;
7
  import io.github.ggerganov.whispercpp.params.WhisperContextParams;
@@ -9,33 +11,26 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
9
  import java.util.List;
10
 
11
  public class WhisperContext extends Structure {
12
- int t_load_us = 0;
13
- int t_start_us = 0;
14
 
15
  /** weight type (FP32 / FP16 / QX) */
16
- GgmlType wtype = GgmlType.GGML_TYPE_F16;
17
  /** intermediate type (FP32 or FP16) */
18
- GgmlType itype = GgmlType.GGML_TYPE_F16;
19
 
20
- // WhisperModel model;
21
- public PointerByReference model;
22
- // whisper_vocab vocab;
23
- // whisper_state * state = nullptr;
24
- public PointerByReference vocab;
25
- public PointerByReference state;
26
 
27
  /** populated by whisper_init_from_file_with_params() */
28
- String path_model;
29
- WhisperContextParams params;
30
-
31
- // public static class ByReference extends WhisperContext implements Structure.ByReference {
32
- // }
33
- //
34
- // public static class ByValue extends WhisperContext implements Structure.ByValue {
35
- // }
36
- //
37
- // @Override
38
- // protected List<String> getFieldOrder() {
39
- // return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
40
- // }
41
  }
 
1
  package io.github.ggerganov.whispercpp;
2
 
3
+ import com.sun.jna.NativeLong;
4
  import com.sun.jna.Structure;
5
  import com.sun.jna.ptr.PointerByReference;
6
+ import com.sun.jna.Pointer;
7
  import io.github.ggerganov.whispercpp.ggml.GgmlType;
8
  import io.github.ggerganov.whispercpp.WhisperModel;
9
  import io.github.ggerganov.whispercpp.params.WhisperContextParams;
 
11
  import java.util.List;
12
 
13
  public class WhisperContext extends Structure {
14
+ public NativeLong t_load_us;
15
+ public NativeLong t_start_us;
16
 
17
  /** weight type (FP32 / FP16 / QX) */
18
+ public GgmlType wtype = GgmlType.GGML_TYPE_F16;
19
  /** intermediate type (FP32 or FP16) */
20
+ public GgmlType itype = GgmlType.GGML_TYPE_F16;
21
 
22
+ public WhisperContextParams.ByValue params;
23
+
24
+ public Pointer model;
25
+ public Pointer vocab;
26
+ public Pointer state;
 
27
 
28
  /** populated by whisper_init_from_file_with_params() */
29
+ public Pointer path_model;
30
+
31
+ @Override
32
+ protected List<String> getFieldOrder() {
33
+ return List.of("t_load_us", "t_start_us", "wtype", "itype",
34
+ "params", "model", "vocab", "state", "path_model");
35
+ }
 
 
 
 
 
 
36
  }
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java CHANGED
@@ -43,11 +43,11 @@ public class WhisperCpp implements AutoCloseable {
43
  * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
44
  * @param params - params to use when initialising the context
45
  */
46
- public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
47
  initContextImpl(modelPath, params);
48
  }
49
 
50
- private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
51
  if (ctx != null) {
52
  lib.whisper_free(ctx);
53
  }
@@ -69,15 +69,13 @@ public class WhisperCpp implements AutoCloseable {
69
 
70
  /**
71
  * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
72
- * Because this function allocates memory for the params, the caller must call either:
73
- * - call `whisper_free_context_params()`
74
- * - `Native.free(Pointer.nativeValue(pointer));`
75
  */
76
- public WhisperContextParams getContextDefaultParams() {
77
- paramsPointer = lib.whisper_context_default_params_by_ref();
78
- WhisperContextParams params = new WhisperContextParams(paramsPointer);
79
- params.read();
80
- return params;
81
  }
82
 
83
  /**
@@ -88,7 +86,7 @@ public class WhisperCpp implements AutoCloseable {
88
  *
89
  * @param strategy - GREEDY
90
  */
91
- public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
92
  Pointer pointer;
93
 
94
  // whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
@@ -104,7 +102,7 @@ public class WhisperCpp implements AutoCloseable {
104
  pointer = beamParamsPointer;
105
  }
106
 
107
- WhisperFullParams params = new WhisperFullParams(pointer);
108
  params.read();
109
  return params;
110
  }
@@ -138,15 +136,21 @@ public class WhisperCpp implements AutoCloseable {
138
  }
139
 
140
  /**
141
- * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
142
  * Not thread safe for same context
143
  * Uses the specified decoding strategy to obtain the text.
144
  */
145
- public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
146
  if (ctx == null) {
147
  throw new IllegalStateException("Model not initialised");
148
  }
149
 
 
 
 
 
 
 
150
  if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
151
  throw new IOException("Failed to process audio");
152
  }
@@ -163,12 +167,17 @@ public class WhisperCpp implements AutoCloseable {
163
 
164
  return str.toString().trim();
165
  }
 
166
  public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
167
  if (ctx == null) {
168
  throw new IllegalStateException("Model not initialised");
169
  }
170
 
171
- if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
 
 
 
 
172
  throw new IOException("Failed to process audio");
173
  }
174
 
 
43
  * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
44
  * @param params - params to use when initialising the context
45
  */
46
+ public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
47
  initContextImpl(modelPath, params);
48
  }
49
 
50
+ private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
51
  if (ctx != null) {
52
  lib.whisper_free(ctx);
53
  }
 
69
 
70
  /**
71
  * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
72
+ * Returns a ByValue instance to ensure proper parameter passing to native code.
 
 
73
  */
74
+ public WhisperContextParams.ByValue getContextDefaultParams() {
75
+ WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
76
+ lib.whisper_context_default_params_by_ref());
77
+ valueParams.read();
78
+ return valueParams;
79
  }
80
 
81
  /**
 
86
  *
87
  * @param strategy - GREEDY
88
  */
89
+ public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
90
  Pointer pointer;
91
 
92
  // whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
 
102
  pointer = beamParamsPointer;
103
  }
104
 
105
+ WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
106
  params.read();
107
  return params;
108
  }
 
136
  }
137
 
138
  /**
139
+ * Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text.
140
  * Not thread safe for same context
141
  * Uses the specified decoding strategy to obtain the text.
142
  */
143
+ public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
144
  if (ctx == null) {
145
  throw new IllegalStateException("Model not initialised");
146
  }
147
 
148
+ /*
149
+ WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
150
+ lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
151
+ valueParams.read();
152
+ */
153
+
154
  if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
155
  throw new IOException("Failed to process audio");
156
  }
 
167
 
168
  return str.toString().trim();
169
  }
170
+
171
  public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
172
  if (ctx == null) {
173
  throw new IllegalStateException("Model not initialised");
174
  }
175
 
176
+ WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
177
+ lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
178
+ valueParams.read();
179
+
180
+ if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
181
  throw new IOException("Failed to process audio");
182
  }
183
 
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java CHANGED
@@ -38,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library {
38
  * @param params Pointer to whisper_context_params
39
  * @return Whisper context on success, null on failure
40
  */
41
- Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
42
 
43
  /**
44
  * Allocate (almost) all memory needed for the model by loading from a buffer.
@@ -180,12 +180,12 @@ public interface WhisperCppJnaLibrary extends Library {
180
  /**
181
  * @return the id of the specified language, returns -1 if not found.
182
  * Examples:
183
- * "de" -> 2
184
- * "german" -> 2
185
  */
186
  int whisper_lang_id(String lang);
187
 
188
- /** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
189
  String whisper_lang_str(int id);
190
 
191
  /**
@@ -268,20 +268,21 @@ public interface WhisperCppJnaLibrary extends Library {
268
  void whisper_free_params(Pointer params);
269
 
270
  /**
271
- * Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
272
  * Not thread safe for same context
273
  * Uses the specified decoding strategy to obtain the text.
274
  */
275
- int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
276
 
277
- int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
 
278
 
279
  // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
280
  // Result is stored in the default state of the context
281
  // Not thread safe if executed in parallel on the same context.
282
  // It seems this approach can offer some speedup in some cases.
283
  // However, the transcription accuracy can be worse at the beginning and end of each chunk.
284
- int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
285
 
286
  /**
287
  * Number of generated text segments.
 
38
  * @param params Pointer to whisper_context_params
39
  * @return Whisper context on success, null on failure
40
  */
41
+ Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);
42
 
43
  /**
44
  * Allocate (almost) all memory needed for the model by loading from a buffer.
 
180
  /**
181
  * @return the id of the specified language, returns -1 if not found.
182
  * Examples:
183
+ * "de" -&gt; 2
184
+ * "german" -&gt; 2
185
  */
186
  int whisper_lang_id(String lang);
187
 
188
+ /** @return the short string of the specified language id (e.g. 2 -&gt; "de"), returns nullptr if not found */
189
  String whisper_lang_str(int id);
190
 
191
  /**
 
268
  void whisper_free_params(Pointer params);
269
 
270
  /**
271
+ * Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text
272
  * Not thread safe for same context
273
  * Uses the specified decoding strategy to obtain the text.
274
  */
275
+ int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);
276
 
277
+ public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
278
+ //int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
279
 
280
  // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
281
  // Result is stored in the default state of the context
282
  // Not thread safe if executed in parallel on the same context.
283
  // It seems this approach can offer some speedup in some cases.
284
  // However, the transcription accuracy can be worse at the beginning and end of each chunk.
285
+ int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);
286
 
287
  /**
288
  * Number of generated text segments.
bindings/java/src/main/java/io/github/ggerganov/whispercpp/callbacks/GgmlAbortCallback.java ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package io.github.ggerganov.whispercpp.callbacks;
2
+
3
+ import com.sun.jna.Callback;
4
+
5
+ /**
6
+ * Callback for aborting GGML computation
7
+ * Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
8
+ */
9
+ public interface GgmlAbortCallback extends Callback {
10
+ /**
11
+ * Return true to abort the computation, false to continue
12
+ *
13
+ * @param data User data passed to the callback
14
+ * @return true to abort, false to continue
15
+ */
16
+ boolean invoke(com.sun.jna.Pointer data);
17
+ }
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAhead.java ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package io.github.ggerganov.whispercpp.params;
2
+ import com.sun.jna.*;
3
+ import java.util.Arrays;
4
+ import java.util.List;
5
+
6
+ public class WhisperAhead extends Structure {
7
+
8
+ public int n_text_layer;
9
+
10
+ public int n_head;
11
+
12
+ public WhisperAhead() {
13
+ super();
14
+ }
15
+
16
+ public WhisperAhead(int textLayer, int head) {
17
+ super();
18
+ this.n_text_layer = textLayer;
19
+ this.n_head = head;
20
+ }
21
+
22
+ @Override
23
+ protected List<String> getFieldOrder() {
24
+ return Arrays.asList("n_text_layer", "n_head");
25
+ }
26
+
27
+ public static class ByReference extends WhisperAhead implements Structure.ByReference {}
28
+
29
+ public static class ByValue extends WhisperAhead implements Structure.ByValue {}
30
+ }
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperAheads.java ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package io.github.ggerganov.whispercpp.params;
2
+ import com.sun.jna.*;
3
+ import java.util.Arrays;
4
+ import java.util.List;
5
+
6
+ public class WhisperAheads extends Structure {
7
+ public NativeLong n_heads;
8
+
9
+ public Pointer heads;
10
+
11
+ public WhisperAheads() {
12
+ super();
13
+ }
14
+
15
+ /**
16
+ * Create alignment heads from an array of WhisperAhead objects
17
+ */
18
+ public void setHeads(WhisperAhead[] aheadsArray) {
19
+ this.n_heads = new NativeLong(aheadsArray.length);
20
+
21
+ int structSize = aheadsArray[0].size();
22
+ Memory mem = new Memory(structSize * aheadsArray.length);
23
+
24
+ for (int i = 0; i < aheadsArray.length; i++) {
25
+ aheadsArray[i].write();
26
+ byte[] buffer = aheadsArray[i].getPointer().getByteArray(0, structSize);
27
+ mem.write(i * structSize, buffer, 0, buffer.length);
28
+ }
29
+
30
+ this.heads = mem;
31
+ }
32
+
33
+ @Override
34
+ protected List<String> getFieldOrder() {
35
+ return Arrays.asList("n_heads", "heads");
36
+ }
37
+
38
+ public static class ByReference extends WhisperAheads implements Structure.ByReference {}
39
+
40
+ public static class ByValue extends WhisperAheads implements Structure.ByValue {}
41
+ }
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java CHANGED
@@ -1,7 +1,5 @@
1
  package io.github.ggerganov.whispercpp.params;
2
-
3
  import com.sun.jna.*;
4
-
5
  import java.util.Arrays;
6
  import java.util.List;
7
 
@@ -11,21 +9,73 @@ import java.util.List;
11
  * whisper_context_default_params()
12
  */
13
  public class WhisperContextParams extends Structure {
14
-
15
  public WhisperContextParams(Pointer p) {
16
  super(p);
17
  }
18
 
19
- /** Use GPU for inference Number (default = true) */
 
 
 
 
20
  public CBool use_gpu;
21
 
22
- /** Use GPU for inference Number (default = true) */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  public void useGpu(boolean enable) {
24
  use_gpu = enable ? CBool.TRUE : CBool.FALSE;
25
  }
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @Override
28
  protected List<String> getFieldOrder() {
29
- return Arrays.asList("use_gpu");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  }
31
  }
 
1
  package io.github.ggerganov.whispercpp.params;
 
2
  import com.sun.jna.*;
 
3
  import java.util.Arrays;
4
  import java.util.List;
5
 
 
9
  * whisper_context_default_params()
10
  */
11
  public class WhisperContextParams extends Structure {
 
12
  public WhisperContextParams(Pointer p) {
13
  super(p);
14
  }
15
 
16
+ public WhisperContextParams() {
17
+ super();
18
+ }
19
+
20
+ /** Use GPU for inference (default = true) */
21
  public CBool use_gpu;
22
 
23
+ /** Use flash attention (default = false) */
24
+ public CBool flash_attn;
25
+
26
+ /** CUDA device to use (default = 0) */
27
+ public int gpu_device;
28
+
29
+ /** [EXPERIMENTAL] Enable token-level timestamps with DTW (default = false) */
30
+ public CBool dtw_token_timestamps;
31
+
32
+ /** [EXPERIMENTAL] Alignment heads preset for DTW */
33
+ public int dtw_aheads_preset;
34
+
35
+ /** Number of top layers to use for DTW when using WHISPER_AHEADS_N_TOP_MOST preset */
36
+ public int dtw_n_top;
37
+
38
+ public WhisperAheads.ByValue dtw_aheads;
39
+
40
+ /** DTW memory size (internal use) */
41
+ public NativeLong dtw_mem_size;
42
+
43
+ /** Use GPU for inference */
44
  public void useGpu(boolean enable) {
45
  use_gpu = enable ? CBool.TRUE : CBool.FALSE;
46
  }
47
 
48
+ /** Use flash attention */
49
+ public void useFlashAttn(boolean enable) {
50
+ flash_attn = enable ? CBool.TRUE : CBool.FALSE;
51
+ }
52
+
53
+ /** Enable DTW token-level timestamps */
54
+ public void enableDtwTokenTimestamps(boolean enable) {
55
+ dtw_token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
56
+ }
57
+
58
+ /** Set DTW alignment heads preset */
59
+ public void setDtwAheadsPreset(int preset) {
60
+ dtw_aheads_preset = preset;
61
+ }
62
+
63
  @Override
64
  protected List<String> getFieldOrder() {
65
+ return Arrays.asList(
66
+ "use_gpu",
67
+ "flash_attn",
68
+ "gpu_device",
69
+ "dtw_token_timestamps",
70
+ "dtw_aheads_preset",
71
+ "dtw_n_top",
72
+ "dtw_aheads",
73
+ "dtw_mem_size"
74
+ );
75
+ }
76
+
77
+ public static class ByValue extends WhisperContextParams implements Structure.ByValue {
78
+ public ByValue() { super(); }
79
+ public ByValue(Pointer p) { super(p); }
80
  }
81
  }
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java CHANGED
@@ -5,6 +5,7 @@ import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
5
  import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
6
  import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
7
  import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
 
8
 
9
  import java.util.Arrays;
10
  import java.util.List;
@@ -16,10 +17,12 @@ import java.util.List;
16
  */
17
  public class WhisperFullParams extends Structure {
18
 
 
 
 
 
19
  public WhisperFullParams(Pointer p) {
20
  super(p);
21
- // super(p, ALIGN_MSVC);
22
- // super(p, ALIGN_GNUC);
23
  }
24
 
25
  /** Sampling strategy for whisper_full() function. */
@@ -69,10 +72,10 @@ public class WhisperFullParams extends Structure {
69
  single_segment = single ? CBool.TRUE : CBool.FALSE;
70
  }
71
 
72
- /** Flag to print special tokens (e.g., &lt;SOT>, &lt;EOT>, &lt;BEG>, etc.). (default = false) */
73
  public CBool print_special;
74
 
75
- /** Flag to print special tokens (e.g., &lt;SOT>, &lt;EOT>, &lt;BEG>, etc.). (default = false) */
76
  public void printSpecial(boolean enable) {
77
  print_special = enable ? CBool.TRUE : CBool.FALSE;
78
  }
@@ -129,6 +132,14 @@ public class WhisperFullParams extends Structure {
129
  /** Maximum tokens per segment (0, default = no limit) */
130
  public int max_tokens;
131
 
 
 
 
 
 
 
 
 
132
  /** Overwrite the audio context size (0 = use default). */
133
  public int audio_ctx;
134
 
@@ -274,6 +285,16 @@ public class WhisperFullParams extends Structure {
274
  */
275
  public Pointer encoder_begin_callback_user_data;
276
 
 
 
 
 
 
 
 
 
 
 
277
  /**
278
  * Callback by each decoder to filter obtained logits.
279
  * WhisperLogitsFilterCallback
@@ -310,17 +331,28 @@ public class WhisperFullParams extends Structure {
310
 
311
  @Override
312
  protected List<String> getFieldOrder() {
313
- return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
314
- "no_context", "single_segment", "no_timestamps",
315
- "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
316
- "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
317
- "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
318
- "suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty",
319
- "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
320
- "new_segment_callback", "new_segment_callback_user_data",
 
 
 
 
321
  "progress_callback", "progress_callback_user_data",
322
  "encoder_begin_callback", "encoder_begin_callback_user_data",
 
323
  "logits_filter_callback", "logits_filter_callback_user_data",
324
  "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
325
  }
 
 
 
 
 
 
326
  }
 
5
  import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
6
  import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
7
  import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
8
+ import io.github.ggerganov.whispercpp.callbacks.GgmlAbortCallback;
9
 
10
  import java.util.Arrays;
11
  import java.util.List;
 
17
  */
18
  public class WhisperFullParams extends Structure {
19
 
20
+ public WhisperFullParams() {
21
+ super();
22
+ }
23
+
24
  public WhisperFullParams(Pointer p) {
25
  super(p);
 
 
26
  }
27
 
28
  /** Sampling strategy for whisper_full() function. */
 
72
  single_segment = single ? CBool.TRUE : CBool.FALSE;
73
  }
74
 
75
+ /** Flag to print special tokens (e.g., &lt;SOT&gt;, &lt;EOT&gt;, &lt;BEG&gt;, etc.). (default = false) */
76
  public CBool print_special;
77
 
78
+ /** Flag to print special tokens (e.g., &lt;SOT&gt;, &lt;EOT&gt;, &lt;BEG&gt;, etc.). (default = false) */
79
  public void printSpecial(boolean enable) {
80
  print_special = enable ? CBool.TRUE : CBool.FALSE;
81
  }
 
132
  /** Maximum tokens per segment (0, default = no limit) */
133
  public int max_tokens;
134
 
135
+ /** [EXPERIMENTAL] Enable debug mode for extra info */
136
+ public CBool debug_mode;
137
+
138
+ /** Enable debug mode */
139
+ public void enableDebugMode(boolean enable) {
140
+ debug_mode = enable ? CBool.TRUE : CBool.FALSE;
141
+ }
142
+
143
  /** Overwrite the audio context size (0 = use default). */
144
  public int audio_ctx;
145
 
 
285
  */
286
  public Pointer encoder_begin_callback_user_data;
287
 
288
+ /** Callback used to abort GGML computation */
289
+ public Pointer abort_callback;
290
+
291
+ /** User data for the abort_callback */
292
+ public Pointer abort_callback_user_data;
293
+
294
+ public void setAbortCallback(GgmlAbortCallback callback) {
295
+ abort_callback = CallbackReference.getFunctionPointer(callback);
296
+ }
297
+
298
  /**
299
  * Callback by each decoder to filter obtained logits.
300
  * WhisperLogitsFilterCallback
 
331
 
332
  @Override
333
  protected List<String> getFieldOrder() {
334
+ return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
335
+ "offset_ms", "duration_ms", "translate", "no_context",
336
+ "no_timestamps", "single_segment", "print_special",
337
+ "print_progress", "print_realtime", "print_timestamps",
338
+ "token_timestamps", "thold_pt", "thold_ptsum", "max_len",
339
+ "split_on_word", "max_tokens", "debug_mode", "audio_ctx",
340
+ "tdrz_enable", "suppress_regex", "initial_prompt",
341
+ "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
342
+ "suppress_blank", "suppress_nst", "temperature",
343
+ "max_initial_ts", "length_penalty", "temperature_inc",
344
+ "entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
345
+ "beam_search", "new_segment_callback", "new_segment_callback_user_data",
346
  "progress_callback", "progress_callback_user_data",
347
  "encoder_begin_callback", "encoder_begin_callback_user_data",
348
+ "abort_callback", "abort_callback_user_data",
349
  "logits_filter_callback", "logits_filter_callback_user_data",
350
  "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
351
  }
352
+
353
+ public static class ByValue extends WhisperFullParams implements Structure.ByValue {
354
+ public ByValue() { super(); }
355
+ public ByValue(Pointer p) { super(p); }
356
+ }
357
+
358
  }
bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java CHANGED
@@ -76,7 +76,7 @@ class WhisperCppTest {
76
  float[] floats = new float[b.length / 2];
77
 
78
  //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
79
- WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
80
  params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
81
  params.print_progress = CBool.FALSE;
82
  //params.initial_prompt = "and so my fellow Americans um, like";
 
76
  float[] floats = new float[b.length / 2];
77
 
78
  //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
79
+ WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
80
  params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
81
  params.print_progress = CBool.FALSE;
82
  //params.initial_prompt = "and so my fellow Americans um, like";