akx committed on
Commit
d166741
·
unverified ·
1 Parent(s): b1a3c5a

examples : Implement JSON output for Token-Level data in main (#1358)

Browse files
Files changed (1) hide show
  1. examples/main/main.cpp +49 -13
examples/main/main.cpp CHANGED
@@ -83,6 +83,7 @@ struct whisper_params {
83
  bool output_wts = false;
84
  bool output_csv = false;
85
  bool output_jsn = false;
 
86
  bool output_lrc = false;
87
  bool print_special = false;
88
  bool print_colors = false;
@@ -151,6 +152,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
151
  else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
152
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
153
  else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
 
154
  else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
155
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
156
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
@@ -206,6 +208,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
206
  fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
207
  fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
208
  fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
 
209
  fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
210
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
211
  fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
@@ -511,7 +514,12 @@ bool output_score(struct whisper_context * ctx, const char * fname, const whispe
511
  return true;
512
  }
513
 
514
- bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
 
 
 
 
 
515
  std::ofstream fout(fname);
516
  int indent = 0;
517
 
@@ -528,7 +536,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
528
  auto end_arr = [&](bool end) {
529
  indent--;
530
  doindent();
531
- fout << (end ? "]\n" : "},\n");
532
  };
533
 
534
  auto start_obj = [&](const char *name) {
@@ -569,12 +577,29 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
569
  end_value(end);
570
  };
571
 
 
 
 
 
 
 
572
  auto value_b = [&](const char *name, const bool val, bool end) {
573
  start_value(name);
574
  fout << (val ? "true" : "false");
575
  end_value(end);
576
  };
577
 
 
 
 
 
 
 
 
 
 
 
 
578
  if (!fout.is_open()) {
579
  fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
580
  return false;
@@ -620,15 +645,26 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
620
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
621
 
622
  start_obj(nullptr);
623
- start_obj("timestamps");
624
- value_s("from", to_timestamp(t0, true).c_str(), false);
625
- value_s("to", to_timestamp(t1, true).c_str(), true);
626
- end_obj(false);
627
- start_obj("offsets");
628
- value_i("from", t0 * 10, false);
629
- value_i("to", t1 * 10, true);
630
- end_obj(false);
631
- value_s("text", text, !params.diarize && !params.tinydiarize);
 
 
 
 
 
 
 
 
 
 
 
632
 
633
  if (params.diarize && pcmf32s.size() == 2) {
634
  value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
@@ -912,7 +948,7 @@ int main(int argc, char ** argv) {
912
  wparams.offset_ms = params.offset_t_ms;
913
  wparams.duration_ms = params.duration_ms;
914
 
915
- wparams.token_timestamps = params.output_wts || params.max_len > 0;
916
  wparams.thold_pt = params.word_thold;
917
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
918
  wparams.split_on_word = params.split_on_word;
@@ -1012,7 +1048,7 @@ int main(int argc, char ** argv) {
1012
  // output to JSON file
1013
  if (params.output_jsn) {
1014
  const auto fname_jsn = fname_out + ".json";
1015
- output_json(ctx, fname_jsn.c_str(), params, pcmf32s);
1016
  }
1017
 
1018
  // output to LRC file
 
83
  bool output_wts = false;
84
  bool output_csv = false;
85
  bool output_jsn = false;
86
+ bool output_jsn_full = false;
87
  bool output_lrc = false;
88
  bool print_special = false;
89
  bool print_colors = false;
 
152
  else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
153
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
154
  else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
155
+ else if (arg == "-ojf" || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
156
  else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
157
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
158
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
 
208
  fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
209
  fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
210
  fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
211
+ fprintf(stderr, " -ojf, --output-json-full [%-7s] include more information in the JSON file\n", params.output_jsn_full ? "true" : "false");
212
  fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
213
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
214
  fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
 
514
  return true;
515
  }
516
 
517
+ bool output_json(
518
+ struct whisper_context * ctx,
519
+ const char * fname,
520
+ const whisper_params & params,
521
+ std::vector<std::vector<float>> pcmf32s,
522
+ bool full) {
523
  std::ofstream fout(fname);
524
  int indent = 0;
525
 
 
536
  auto end_arr = [&](bool end) {
537
  indent--;
538
  doindent();
539
+ fout << (end ? "]\n" : "],\n");
540
  };
541
 
542
  auto start_obj = [&](const char *name) {
 
577
  end_value(end);
578
  };
579
 
580
+ auto value_f = [&](const char *name, const float val, bool end) {
581
+ start_value(name);
582
+ fout << val;
583
+ end_value(end);
584
+ };
585
+
586
  auto value_b = [&](const char *name, const bool val, bool end) {
587
  start_value(name);
588
  fout << (val ? "true" : "false");
589
  end_value(end);
590
  };
591
 
592
+ auto times_o = [&](int64_t t0, int64_t t1, bool end) {
593
+ start_obj("timestamps");
594
+ value_s("from", to_timestamp(t0, true).c_str(), false);
595
+ value_s("to", to_timestamp(t1, true).c_str(), true);
596
+ end_obj(false);
597
+ start_obj("offsets");
598
+ value_i("from", t0 * 10, false);
599
+ value_i("to", t1 * 10, true);
600
+ end_obj(end);
601
+ };
602
+
603
  if (!fout.is_open()) {
604
  fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
605
  return false;
 
645
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
646
 
647
  start_obj(nullptr);
648
+ times_o(t0, t1, false);
649
+ value_s("text", text, !params.diarize && !params.tinydiarize && !full);
650
+
651
+ if (full) {
652
+ start_arr("tokens");
653
+ const int n = whisper_full_n_tokens(ctx, i);
654
+ for (int j = 0; j < n; ++j) {
655
+ auto token = whisper_full_get_token_data(ctx, i, j);
656
+ start_obj(nullptr);
657
+ value_s("text", whisper_token_to_str(ctx, token.id), false);
658
+ if(token.t0 > -1 && token.t1 > -1) {
659
+ // If we have per-token timestamps, write them out
660
+ times_o(token.t0, token.t1, false);
661
+ }
662
+ value_i("id", token.id, false);
663
+ value_f("p", token.p, true);
664
+ end_obj(j == (n - 1));
665
+ }
666
+ end_arr(!params.diarize && !params.tinydiarize);
667
+ }
668
 
669
  if (params.diarize && pcmf32s.size() == 2) {
670
  value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
 
948
  wparams.offset_ms = params.offset_t_ms;
949
  wparams.duration_ms = params.duration_ms;
950
 
951
+ wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
952
  wparams.thold_pt = params.word_thold;
953
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
954
  wparams.split_on_word = params.split_on_word;
 
1048
  // output to JSON file
1049
  if (params.output_jsn) {
1050
  const auto fname_jsn = fname_out + ".json";
1051
+ output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
1052
  }
1053
 
1054
  // output to LRC file