Spaces:
Running
Running
examples : Implement JSON output for Token-Level data in main (#1358)
Browse files
- examples/main/main.cpp +49 -13
examples/main/main.cpp
CHANGED
|
@@ -83,6 +83,7 @@ struct whisper_params {
|
|
| 83 |
bool output_wts = false;
|
| 84 |
bool output_csv = false;
|
| 85 |
bool output_jsn = false;
|
|
|
|
| 86 |
bool output_lrc = false;
|
| 87 |
bool print_special = false;
|
| 88 |
bool print_colors = false;
|
|
@@ -151,6 +152,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 151 |
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
| 152 |
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
| 153 |
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
|
|
|
|
| 154 |
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
| 155 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 156 |
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
|
@@ -206,6 +208,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 206 |
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
|
| 207 |
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
| 208 |
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
|
|
|
|
| 209 |
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
| 210 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 211 |
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
@@ -511,7 +514,12 @@ bool output_score(struct whisper_context * ctx, const char * fname, const whispe
|
|
| 511 |
return true;
|
| 512 |
}
|
| 513 |
|
| 514 |
-
bool output_json(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
std::ofstream fout(fname);
|
| 516 |
int indent = 0;
|
| 517 |
|
|
@@ -528,7 +536,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 528 |
auto end_arr = [&](bool end) {
|
| 529 |
indent--;
|
| 530 |
doindent();
|
| 531 |
-
fout << (end ? "]\n" : "
|
| 532 |
};
|
| 533 |
|
| 534 |
auto start_obj = [&](const char *name) {
|
|
@@ -569,12 +577,29 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 569 |
end_value(end);
|
| 570 |
};
|
| 571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
auto value_b = [&](const char *name, const bool val, bool end) {
|
| 573 |
start_value(name);
|
| 574 |
fout << (val ? "true" : "false");
|
| 575 |
end_value(end);
|
| 576 |
};
|
| 577 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
if (!fout.is_open()) {
|
| 579 |
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
| 580 |
return false;
|
|
@@ -620,15 +645,26 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 620 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 621 |
|
| 622 |
start_obj(nullptr);
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
if (params.diarize && pcmf32s.size() == 2) {
|
| 634 |
value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
|
|
@@ -912,7 +948,7 @@ int main(int argc, char ** argv) {
|
|
| 912 |
wparams.offset_ms = params.offset_t_ms;
|
| 913 |
wparams.duration_ms = params.duration_ms;
|
| 914 |
|
| 915 |
-
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
| 916 |
wparams.thold_pt = params.word_thold;
|
| 917 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 918 |
wparams.split_on_word = params.split_on_word;
|
|
@@ -1012,7 +1048,7 @@ int main(int argc, char ** argv) {
|
|
| 1012 |
// output to JSON file
|
| 1013 |
if (params.output_jsn) {
|
| 1014 |
const auto fname_jsn = fname_out + ".json";
|
| 1015 |
-
output_json(ctx, fname_jsn.c_str(), params, pcmf32s);
|
| 1016 |
}
|
| 1017 |
|
| 1018 |
// output to LRC file
|
|
|
|
| 83 |
bool output_wts = false;
|
| 84 |
bool output_csv = false;
|
| 85 |
bool output_jsn = false;
|
| 86 |
+
bool output_jsn_full = false;
|
| 87 |
bool output_lrc = false;
|
| 88 |
bool print_special = false;
|
| 89 |
bool print_colors = false;
|
|
|
|
| 152 |
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
| 153 |
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
| 154 |
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
|
| 155 |
+
else if (arg == "-ojf" || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
|
| 156 |
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
| 157 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 158 |
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
|
|
|
| 208 |
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
|
| 209 |
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
| 210 |
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
|
| 211 |
+
fprintf(stderr, " -ojf, --output-json-full [%-7s] include more information in the JSON file\n", params.output_jsn_full ? "true" : "false");
|
| 212 |
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
| 213 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 214 |
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
| 514 |
return true;
|
| 515 |
}
|
| 516 |
|
| 517 |
+
bool output_json(
|
| 518 |
+
struct whisper_context * ctx,
|
| 519 |
+
const char * fname,
|
| 520 |
+
const whisper_params & params,
|
| 521 |
+
std::vector<std::vector<float>> pcmf32s,
|
| 522 |
+
bool full) {
|
| 523 |
std::ofstream fout(fname);
|
| 524 |
int indent = 0;
|
| 525 |
|
|
|
|
| 536 |
auto end_arr = [&](bool end) {
|
| 537 |
indent--;
|
| 538 |
doindent();
|
| 539 |
+
fout << (end ? "]\n" : "],\n");
|
| 540 |
};
|
| 541 |
|
| 542 |
auto start_obj = [&](const char *name) {
|
|
|
|
| 577 |
end_value(end);
|
| 578 |
};
|
| 579 |
|
| 580 |
+
auto value_f = [&](const char *name, const float val, bool end) {
|
| 581 |
+
start_value(name);
|
| 582 |
+
fout << val;
|
| 583 |
+
end_value(end);
|
| 584 |
+
};
|
| 585 |
+
|
| 586 |
auto value_b = [&](const char *name, const bool val, bool end) {
|
| 587 |
start_value(name);
|
| 588 |
fout << (val ? "true" : "false");
|
| 589 |
end_value(end);
|
| 590 |
};
|
| 591 |
|
| 592 |
+
auto times_o = [&](int64_t t0, int64_t t1, bool end) {
|
| 593 |
+
start_obj("timestamps");
|
| 594 |
+
value_s("from", to_timestamp(t0, true).c_str(), false);
|
| 595 |
+
value_s("to", to_timestamp(t1, true).c_str(), true);
|
| 596 |
+
end_obj(false);
|
| 597 |
+
start_obj("offsets");
|
| 598 |
+
value_i("from", t0 * 10, false);
|
| 599 |
+
value_i("to", t1 * 10, true);
|
| 600 |
+
end_obj(end);
|
| 601 |
+
};
|
| 602 |
+
|
| 603 |
if (!fout.is_open()) {
|
| 604 |
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
| 605 |
return false;
|
|
|
|
| 645 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 646 |
|
| 647 |
start_obj(nullptr);
|
| 648 |
+
times_o(t0, t1, false);
|
| 649 |
+
value_s("text", text, !params.diarize && !params.tinydiarize && !full);
|
| 650 |
+
|
| 651 |
+
if (full) {
|
| 652 |
+
start_arr("tokens");
|
| 653 |
+
const int n = whisper_full_n_tokens(ctx, i);
|
| 654 |
+
for (int j = 0; j < n; ++j) {
|
| 655 |
+
auto token = whisper_full_get_token_data(ctx, i, j);
|
| 656 |
+
start_obj(nullptr);
|
| 657 |
+
value_s("text", whisper_token_to_str(ctx, token.id), false);
|
| 658 |
+
if(token.t0 > -1 && token.t1 > -1) {
|
| 659 |
+
// If we have per-token timestamps, write them out
|
| 660 |
+
times_o(token.t0, token.t1, false);
|
| 661 |
+
}
|
| 662 |
+
value_i("id", token.id, false);
|
| 663 |
+
value_f("p", token.p, true);
|
| 664 |
+
end_obj(j == (n - 1));
|
| 665 |
+
}
|
| 666 |
+
end_arr(!params.diarize && !params.tinydiarize);
|
| 667 |
+
}
|
| 668 |
|
| 669 |
if (params.diarize && pcmf32s.size() == 2) {
|
| 670 |
value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
|
|
|
|
| 948 |
wparams.offset_ms = params.offset_t_ms;
|
| 949 |
wparams.duration_ms = params.duration_ms;
|
| 950 |
|
| 951 |
+
wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
|
| 952 |
wparams.thold_pt = params.word_thold;
|
| 953 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 954 |
wparams.split_on_word = params.split_on_word;
|
|
|
|
| 1048 |
// output to JSON file
|
| 1049 |
if (params.output_jsn) {
|
| 1050 |
const auto fname_jsn = fname_out + ".json";
|
| 1051 |
+
output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
|
| 1052 |
}
|
| 1053 |
|
| 1054 |
// output to LRC file
|