diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp index bc0ff1d5a..22cf8ae2e 100644 --- a/src/tokenize_util.cpp +++ b/src/tokenize_util.cpp @@ -919,15 +919,21 @@ std::vector token_split(const std::string& text) { // `\s*[\r\n]+|\s+(?!\S)|\s+` if (is_space(cp)) { - std::string token = codepoint_to_utf8(cp); - ++i; + std::string token; + bool saw_new_line = false; while (i < cps.size() && is_space(cps[i])) { token += codepoint_to_utf8(cps[i]); - ++i; + if (cps[i] == U'\r' || cps[i] == U'\n') { - break; + saw_new_line = true; + } else { + if (saw_new_line) { + break; + } } + + ++i; } tokens.push_back(token);