ggerganov committed on
Commit
84ba527
·
1 Parent(s): dbbf84f

parallel : print time of audio boundaries + fix timings

Browse files
Files changed (2) hide show
  1. whisper.cpp +38 -12
  2. whisper.h +3 -0
whisper.cpp CHANGED
@@ -1910,14 +1910,19 @@ whisper_vocab::id whisper_sample_timestamp(
1910
  return probs_id[0].second;
1911
  }
1912
 
1913
- static std::string to_timestamp(int64_t t) {
1914
- int64_t sec = t/100;
1915
- int64_t msec = t - sec*100;
1916
- int64_t min = sec/60;
1917
- sec = sec - min*60;
 
 
 
 
 
1918
 
1919
  char buf[32];
1920
- snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
1921
 
1922
  return std::string(buf);
1923
  }
@@ -2727,24 +2732,45 @@ int whisper_full_parallel(
2727
 
2728
  // combine results into ctx->result_all
2729
  for (int i = 0; i < n_processors - 1; ++i) {
2730
- auto & result_all = ctxs[i].result_all;
2731
 
2732
- for (int j = 0; j < (int) result_all.size(); ++j) {
2733
- result_all[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
2734
- result_all[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
 
2735
 
 
2736
  if (ctx->result_all.size() > 0) {
2737
- result_all[j].t0 = std::max(result_all[j].t0, ctx->result_all.back().t1);
2738
  }
2739
 
2740
- ctx->result_all.push_back(std::move(result_all[j]));
2741
 
2742
  // call the new_segment_callback for each segment
2743
  if (params.new_segment_callback) {
2744
  params.new_segment_callback(ctx, params.new_segment_callback_user_data);
2745
  }
2746
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2747
  }
 
2748
 
2749
  return ret;
2750
  }
 
1910
  return probs_id[0].second;
1911
  }
1912
 
// Format a timestamp expressed in units of 10 ms as "HH:MM:SS.mmm".
//
// t     - time in 10 ms ticks (100 ticks == 1 second)
// comma - when true, use ',' instead of '.' between seconds and milliseconds
//
// Examples:
//    500 -> 00:00:05.000
//   6000 -> 00:01:00.000
//
// Kept `static` (internal linkage): this is a file-local helper and the name is
// generic enough to collide with identically named helpers in other translation units.
static std::string to_timestamp(int64_t t, bool comma = false) {
    // convert 10 ms ticks to milliseconds, then peel off hours, minutes and seconds
    int64_t msec = t * 10;
    const int64_t hr = msec / (1000 * 60 * 60);
    msec -= hr * (1000 * 60 * 60);
    const int64_t min = msec / (1000 * 60);
    msec -= min * (1000 * 60);
    const int64_t sec = msec / 1000;
    msec -= sec * 1000;

    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);

    return std::string(buf);
}
 
2732
 
2733
  // combine results into ctx->result_all
2734
  for (int i = 0; i < n_processors - 1; ++i) {
2735
+ auto & results_i = ctxs[i].result_all;
2736
 
2737
+ for (int j = 0; j < (int) results_i.size(); ++j) {
2738
+ // correct the segment timestamp taking into account the offset
2739
+ results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
2740
+ results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
2741
 
2742
+ // make sure that segments are not overlapping
2743
  if (ctx->result_all.size() > 0) {
2744
+ results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
2745
  }
2746
 
2747
+ ctx->result_all.push_back(std::move(results_i[j]));
2748
 
2749
  // call the new_segment_callback for each segment
2750
  if (params.new_segment_callback) {
2751
  params.new_segment_callback(ctx, params.new_segment_callback_user_data);
2752
  }
2753
  }
2754
+
2755
+ ctx->t_mel_us += ctxs[i].t_mel_us;
2756
+ ctx->t_sample_us += ctxs[i].t_sample_us;
2757
+ ctx->t_encode_us += ctxs[i].t_encode_us;
2758
+ ctx->t_decode_us += ctxs[i].t_decode_us;
2759
+ }
2760
+
2761
+ // average the timings
2762
+ ctx->t_mel_us /= n_processors;
2763
+ ctx->t_sample_us /= n_processors;
2764
+ ctx->t_encode_us /= n_processors;
2765
+ ctx->t_decode_us /= n_processors;
2766
+
2767
+ // print information about the audio boundaries
2768
+ fprintf(stderr, "\n");
2769
+ fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
2770
+ for (int i = 0; i < n_processors - 1; ++i) {
2771
+ fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
2772
  }
2773
+ fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
2774
 
2775
  return ret;
2776
  }
whisper.h CHANGED
@@ -213,6 +213,9 @@ extern "C" {
213
  const float * samples,
214
  int n_samples);
215
 
 
 
 
216
  WHISPER_API int whisper_full_parallel(
217
  struct whisper_context * ctx,
218
  struct whisper_full_params params,
 
213
  const float * samples,
214
  int n_samples);
215
 
216
+ // Split the input audio into chunks and process each chunk separately using whisper_full()
217
+ // It seems this approach can offer some speedup in some cases.
218
+ // However, the transcription accuracy can be worse at the beginning and end of each chunk.
219
  WHISPER_API int whisper_full_parallel(
220
  struct whisper_context * ctx,
221
  struct whisper_full_params params,