diff --git a/README.md b/README.md index 8d43cc472..3de5aefe1 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,11 @@ mllm ## Latest News -- [2026 Jun 08] `pymllm` now covers Qwen3, Qwen3-VL, and Qwen3.5 on Jetson Orin with W4A16 / W8A8 serving; Qwen3-VL-2B W8A8 reaches up to 3.12x prefill speedup on AGX Orin 32GB, while decode throughput stays broadly close to llama.cpp. -- [2026 Apr 30] `pymllm` adds Jetson-oriented Qwen3 / Qwen3-VL BF16, W4A16, and W8A8 serving support, including compressed-tensors AWQ and W8A8 INT8 paths. -- [2026 Mar 18] 🔥🔥🔥 `pymllm` now supports CUDA on Jetson Orin and Jetson Thor devices (experimental; still under active development). -- [2026 Feb 03] 🔥🔥🔥 MLLM Qnn AOT Support for Full Graph Execution on NPU! [Quick Start](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [Technical Report](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support-en/) +- [2026 Jun 08] 🔥🔥🔥 `pymllm` now covers Qwen3, Qwen3-VL, and Qwen3.5 on Jetson Orin with W4A16 / W8A8 serving; Qwen3-VL-2B W8A8 reaches up to 3.12x prefill speedup on AGX Orin 32GB, while decode throughput stays broadly close to llama.cpp. +- [2026 May 02] 🔥🔥🔥 MLLM now supports the Ascend NPU backend, with ATB graph execution and Qwen3 W8A8 inference on Ascend devices. +- [2026 Apr 30] 🔥🔥🔥 `pymllm` adds Jetson-oriented Qwen3 / Qwen3-VL BF16, W4A16, and W8A8 serving support, including compressed-tensors AWQ and W8A8 INT8 paths. +- [2026 Mar 18] `pymllm` now supports CUDA on Jetson Orin and Jetson Thor devices (experimental; still under active development). +- [2026 Feb 03] MLLM Qnn AOT Support for Full Graph Execution on NPU! [Quick Start](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [Technical Report](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support-en/) - [2025 Nov 27] Android Demo Update: Enabled stable Qwen3 and DeepSeek-OCR streaming on Android via a novel In-App Go Server Architecture. 
- [2025 Nov 23] MLLM v2 released! - [2025 Aug 28] Support for MLLM V1 is ending soon. Before its retirement, V1 will integrate the following features: GPT-OSS. MLLM will then transition to V2, which can be viewed on the V2 branch. V2 will include brand-new capabilities: @@ -100,7 +101,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec | Model(v2) | CPU | Hexagon NPU
INT8 | Ascend NPU | |-----------------------------------------------------------------------------|------|-----------------------|------------| -| [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | ✔️ W8A8 | +| [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | [✔️ W8A8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-W8A8-Ascend) | | [Qwen3-1.7B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-1.7B-w4a8-i8mm-kai) | [W4A16-SM8650](https://modelscope.cn/models/mllmTeam/Qwen3-1.7B-Qnn-AOT-SM8650/) | | | [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | | | [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | | diff --git a/examples/qwen_ascend/main.cpp b/examples/qwen_ascend/main.cpp index 9099d18b3..d5abd622e 100644 --- a/examples/qwen_ascend/main.cpp +++ b/examples/qwen_ascend/main.cpp @@ -3,13 +3,39 @@ #include #include +#include +#include +#include #include +#include #include #include #include using mllm::Argparse;
+namespace {
+
+// Moves the longest valid UTF-8 prefix out of pending_text and returns it.
+// A trailing invalid run shorter than 4 bytes may be a code point split across
+// tokens, so it stays buffered. A longer run is malformed rather than
+// truncated; its bytes pass through so one bad token cannot stall the stream.
+std::string takeValidUtf8Prefix(std::string& pending_text) {
+  constexpr std::string::size_type kMaxUtf8Bytes = 4;  // longest UTF-8 sequence
+  std::string ready_text;
+  while (!pending_text.empty()) {
+    auto invalid = utf8::find_invalid(pending_text.begin(), pending_text.end());
+    ready_text.append(pending_text.begin(), invalid);
+    pending_text.erase(pending_text.begin(), invalid);
+    if (pending_text.size() < kMaxUtf8Bytes) { break; }  // empty or maybe incomplete
+    ready_text.push_back(pending_text.front());  // malformed byte: emit as-is
+    pending_text.erase(pending_text.begin());
+  }
+  return ready_text;
+}
+
+} // namespace
+
 MLLM_MAIN({ auto& help = Argparse::add("-h|--help").help("Show help message"); auto& model_path = 
Argparse::add("-m|--model_path").help("Model path").required(true); @@ -194,17 +220,27 @@ MLLM_MAIN({ msg.prompt = prompt_text; auto inputs = tokenizer.convertMessage(msg); - // Clear KV cache before generation + // Run a prefill warmup outside ARGeneration timing so first-use Ascend + // graph/runtime setup is not counted as the measured prefill time. model.clearCache(); + fmt::print("\nWarming up prefill path...\n"); + (void)model.forward(inputs, {}); + // Keep RoPE cache warmed, but reset KV state for the measured generation. + model.kvCache().clearCache(); fmt::print("\nAnswer:\n"); auto chat_start = std::chrono::high_resolution_clock::now(); std::vector generated_ids; - // Use streaming generation with the ARGeneration chat interface + std::string pending_text; for (auto& step : model.chat(inputs)) { generated_ids.push_back(step.cur_token_id); - std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush; + pending_text += tokenizer.decode({step.cur_token_id}); + auto ready_text = takeValidUtf8Prefix(pending_text); + if (!ready_text.empty()) { + fmt::print("{}", ready_text); + std::fflush(stdout); + } // Stop if we've reached max_new_tokens if (static_cast(generated_ids.size()) >= gen_max_new_tokens) { if (step.current_step > 0) { @@ -213,7 +249,10 @@ MLLM_MAIN({ break; } } - std::wcout << std::endl; + if (!pending_text.empty()) { + fmt::print("{}", pending_text); + } + fmt::print("\n"); auto chat_end = std::chrono::high_resolution_clock::now(); auto chat_ms = std::chrono::duration_cast(chat_end - chat_start).count(); diff --git a/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp b/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp index e9306002c..eb7bc2f3c 100644 --- a/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp +++ b/mllm/models/qwen_ascend/tokenization_qwen_ascend.hpp @@ -22,7 +22,7 @@ struct QwenAscendMessage { class QwenAscendTokenizer final : public mllm::preprocessor::AutoTokenizer { public: explicit QwenAscendTokenizer(const 
std::string& file_path) { - preprocessor::initLocal(); + preprocessor::initLocal("C.UTF-8"); preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_); for (auto& kv : bytes_2_unicode_dict_) { bytes_2_unicode_dict_inverse_.insert({kv.second, kv.first}); } bpe_.initFromSentencePieceJson(file_path);