diff --git a/sherpa-onnx/csrc/offline-ctc-decoder.h b/sherpa-onnx/csrc/offline-ctc-decoder.h index 23e8d0bd83..c9d1b36ffa 100644 --- a/sherpa-onnx/csrc/offline-ctc-decoder.h +++ b/sherpa-onnx/csrc/offline-ctc-decoder.h @@ -15,8 +15,16 @@ struct OfflineCtcDecoderResult { /// The decoded token IDs std::vector<int64_t> tokens; + /// The decoded word IDs + /// Note: tokens.size() is usually not equal to words.size() + /// words is empty for greedy search decoding. + /// it is not empty when an HLG graph is used. + std::vector<int32_t> words; + /// timestamps[i] contains the output frame index where tokens[i] is decoded. /// Note: The index is after subsampling + /// + /// tokens.size() == timestamps.size() std::vector<int32_t> timestamps; }; diff --git a/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc b/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc index e54274df4e..6c9df3fd3c 100644 --- a/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc +++ b/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc @@ -108,6 +108,9 @@ static OfflineCtcDecoderResult DecodeOne(kaldi_decoder::FasterDecoder *decoder, // -1 here since the input labels are incremented during graph // construction r.tokens.push_back(arc.ilabel - 1); + if (arc.olabel != 0) { + r.words.push_back(arc.olabel); + } r.timestamps.push_back(t); prev = arc.ilabel; diff --git a/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc b/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc index 0e31bd97ca..cdaf794135 100644 --- a/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc +++ b/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc @@ -64,10 +64,6 @@ OfflineParaformerGreedySearchDecoder::Decode( if (timestamps.size() == results[i].tokens.size()) { results[i].timestamps = std::move(timestamps); - } else { - SHERPA_ONNX_LOGE("time stamp for batch: %d, %d vs %d", i, - static_cast<int32_t>(results[i].tokens.size()), - static_cast<int32_t>(timestamps.size())); } } } diff --git 
a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 2c83dac28e..c64da12af8 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -65,6 +65,8 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, r.timestamps.push_back(time); } + r.words = src.words; + return r; } diff --git a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index 4321a62ddf..6e72a4a1f6 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -339,6 +339,20 @@ std::string OfflineRecognitionResult::AsJsonString() const { } sep = ", "; } + os << "], "; + + sep = ""; + + os << "\"" + << "words" + << "\"" + << ": "; + os << "["; + for (int32_t w : words) { + os << sep << w; + sep = ", "; + } + os << "]"; os << "}"; diff --git a/sherpa-onnx/csrc/offline-stream.h b/sherpa-onnx/csrc/offline-stream.h index 13cc560047..9df46d04ef 100644 --- a/sherpa-onnx/csrc/offline-stream.h +++ b/sherpa-onnx/csrc/offline-stream.h @@ -30,6 +30,8 @@ struct OfflineRecognitionResult { /// timestamps[i] records the time in seconds when tokens[i] is decoded. std::vector<float> timestamps; + std::vector<int32_t> words; + std::string AsJsonString() const; }; diff --git a/sherpa-onnx/csrc/online-ctc-decoder.h b/sherpa-onnx/csrc/online-ctc-decoder.h index 28809e39f0..65305e6aca 100644 --- a/sherpa-onnx/csrc/online-ctc-decoder.h +++ b/sherpa-onnx/csrc/online-ctc-decoder.h @@ -22,8 +22,16 @@ struct OnlineCtcDecoderResult { /// The decoded token IDs std::vector<int64_t> tokens; + /// The decoded word IDs + /// Note: tokens.size() is usually not equal to words.size() + /// words is empty for greedy search decoding. + /// it is not empty when an HLG graph is used. + std::vector<int32_t> words; + /// timestamps[i] contains the output frame index where tokens[i] is decoded. 
/// Note: The index is after subsampling + /// + /// tokens.size() == timestamps.size() std::vector<int32_t> timestamps; int32_t num_trailing_blanks = 0; diff --git a/sherpa-onnx/csrc/online-ctc-fst-decoder.cc b/sherpa-onnx/csrc/online-ctc-fst-decoder.cc index 93e4c103b5..f505788336 100644 --- a/sherpa-onnx/csrc/online-ctc-fst-decoder.cc +++ b/sherpa-onnx/csrc/online-ctc-fst-decoder.cc @@ -51,9 +51,9 @@ static void DecodeOne(const float *log_probs, int32_t num_rows, bool ok = decoder->GetBestPath(&fst_out); if (ok) { std::vector<int32_t> isymbols_out; - std::vector<int32_t> osymbols_out_unused; - ok = fst::GetLinearSymbolSequence(fst_out, &isymbols_out, - &osymbols_out_unused, nullptr); + std::vector<int32_t> osymbols_out; + ok = fst::GetLinearSymbolSequence(fst_out, &isymbols_out, &osymbols_out, + nullptr); std::vector<int64_t> tokens; tokens.reserve(isymbols_out.size()); @@ -83,6 +83,7 @@ static void DecodeOne(const float *log_probs, int32_t num_rows, } result->tokens = std::move(tokens); + result->words = std::move(osymbols_out); result->timestamps = std::move(timestamps); // no need to set frame_offset } diff --git a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h index 7b85ceefd8..4d8ce29611 100644 --- a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h @@ -59,6 +59,7 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, } r.segment = segment; + r.words = src.words; r.start_time = frames_since_start * frame_shift_ms / 1000.; return r; diff --git a/sherpa-onnx/csrc/online-recognizer.cc b/sherpa-onnx/csrc/online-recognizer.cc index 9004d3fbf7..fcb9169ef2 100644 --- a/sherpa-onnx/csrc/online-recognizer.cc +++ b/sherpa-onnx/csrc/online-recognizer.cc @@ -22,14 +22,16 @@ namespace sherpa_onnx { template <typename T> std::string VecToString(const std::vector<T> &vec, int32_t precision = 6) { std::ostringstream oss; - oss << std::fixed << 
std::setprecision(precision); - oss << "[ "; + if (precision != 0) { + oss << std::fixed << std::setprecision(precision); + } + oss << "["; std::string sep = ""; for (const auto &item : vec) { oss << sep << item; sep = ", "; } - oss << " ]"; + oss << "]"; return oss.str(); } @@ -38,26 +40,29 @@ template <> // explicit specialization for T = std::string std::string VecToString<std::string>(const std::vector<std::string> &vec, int32_t) { // ignore 2nd arg std::ostringstream oss; - oss << "[ "; + oss << "["; std::string sep = ""; for (const auto &item : vec) { oss << sep << "\"" << item << "\""; sep = ", "; } - oss << " ]"; + oss << "]"; return oss.str(); } std::string OnlineRecognizerResult::AsJsonString() const { std::ostringstream os; os << "{ "; - os << "\"text\": " << "\"" << text << "\"" << ", "; + os << "\"text\": " + << "\"" << text << "\"" + << ", "; os << "\"tokens\": " << VecToString(tokens) << ", "; os << "\"timestamps\": " << VecToString(timestamps, 2) << ", "; os << "\"ys_probs\": " << VecToString(ys_probs, 6) << ", "; os << "\"lm_probs\": " << VecToString(lm_probs, 6) << ", "; os << "\"context_scores\": " << VecToString(context_scores, 6) << ", "; os << "\"segment\": " << segment << ", "; + os << "\"words\": " << VecToString(words, 0) << ", "; os << "\"start_time\": " << std::fixed << std::setprecision(2) << start_time << ", "; os << "\"is_final\": " << (is_final ? 
"true" : "false"); diff --git a/sherpa-onnx/csrc/online-recognizer.h b/sherpa-onnx/csrc/online-recognizer.h index c04122ea0b..f7fcf2f216 100644 --- a/sherpa-onnx/csrc/online-recognizer.h +++ b/sherpa-onnx/csrc/online-recognizer.h @@ -47,6 +47,8 @@ struct OnlineRecognizerResult { /// log-domain scores from "hot-phrase" contextual boosting std::vector<float> context_scores; + std::vector<int32_t> words; + /// ID of this segment /// When an endpoint is detected, it is incremented int32_t segment = 0; diff --git a/sherpa-onnx/python/csrc/offline-stream.cc b/sherpa-onnx/python/csrc/offline-stream.cc index 5679eca7b0..3c1cf3486e 100644 --- a/sherpa-onnx/python/csrc/offline-stream.cc +++ b/sherpa-onnx/python/csrc/offline-stream.cc @@ -34,6 +34,8 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT }) .def_property_readonly("tokens", [](const PyClass &self) { return self.tokens; }) + .def_property_readonly("words", + [](const PyClass &self) { return self.words; }) .def_property_readonly( "timestamps", [](const PyClass &self) { return self.timestamps; }); } diff --git a/sherpa-onnx/python/csrc/online-recognizer.cc b/sherpa-onnx/python/csrc/online-recognizer.cc index c402163fee..148f73ee5f 100644 --- a/sherpa-onnx/python/csrc/online-recognizer.cc +++ b/sherpa-onnx/python/csrc/online-recognizer.cc @@ -40,6 +40,9 @@ static void PybindOnlineRecognizerResult(py::module *m) { }) .def_property_readonly( "segment", [](PyClass &self) -> int32_t { return self.segment; }) + .def_property_readonly( + "words", + [](PyClass &self) -> std::vector<int32_t> { return self.words; }) .def_property_readonly( "is_final", [](PyClass &self) -> bool { return self.is_final; }) .def("__str__", &PyClass::AsJsonString,