Skip to content

Commit

Permalink
Support printing input text and words after splitting (#376)
Browse files: browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Oct 20, 2023
1 parent 2a932ac commit 3ba9a49
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 3 deletions.
33 changes: 32 additions & 1 deletion sherpa-onnx/csrc/lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ static std::vector<int32_t> ConvertTokensToIds(
}

Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
const std::string &punctuations, const std::string &language) {
const std::string &punctuations, const std::string &language,
bool debug /*= false*/)
: debug_(debug) {
InitLanguage(language);
InitTokens(tokens);
InitLexicon(lexicon);
Expand All @@ -102,6 +104,20 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
const std::string &text) const {
std::vector<std::string> words = SplitUtf8(text);

if (debug_) {
fprintf(stderr, "Input text in string: %s\n", text.c_str());
fprintf(stderr, "Input text in bytes:");
for (uint8_t c : text) {
fprintf(stderr, " %02x", c);
}
fprintf(stderr, "\n");
fprintf(stderr, "After splitting to words:");
for (const auto &w : words) {
fprintf(stderr, " %s", w.c_str());
}
fprintf(stderr, "\n");
}

std::vector<int64_t> ans;

auto sil = token2id_.at("sil");
Expand Down Expand Up @@ -134,6 +150,21 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
ToLowerCase(&text);

std::vector<std::string> words = SplitUtf8(text);

if (debug_) {
fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str());
fprintf(stderr, "Input text in bytes:");
for (uint8_t c : text) {
fprintf(stderr, " %02x", c);
}
fprintf(stderr, "\n");
fprintf(stderr, "After splitting to words:");
for (const auto &w : words) {
fprintf(stderr, " %s", w.c_str());
}
fprintf(stderr, "\n");
}

int32_t blank = token2id_.at(" ");

std::vector<int64_t> ans;
Expand Down
4 changes: 3 additions & 1 deletion sherpa-onnx/csrc/lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ namespace sherpa_onnx {
class Lexicon {
public:
Lexicon(const std::string &lexicon, const std::string &tokens,
const std::string &punctuations, const std::string &language);
const std::string &punctuations, const std::string &language,
bool debug = false);

std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const;

Expand Down Expand Up @@ -45,6 +46,7 @@ class Lexicon {
std::unordered_set<std::string> punctuations_;
std::unordered_map<std::string, int32_t> token2id_;
Language language_;
bool debug_;
//
};

Expand Down
3 changes: 2 additions & 1 deletion sherpa-onnx/csrc/offline-tts-vits-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config)
: model_(std::make_unique<OfflineTtsVitsModel>(config.model)),
lexicon_(config.model.vits.lexicon, config.model.vits.tokens,
model_->Punctuations(), model_->Language()) {}
model_->Punctuations(), model_->Language(),
config.model.debug) {}

GeneratedAudio Generate(const std::string &text,
int64_t sid = 0) const override {
Expand Down

0 comments on commit 3ba9a49

Please sign in to comment.