|
18 | 18 | prompt = """ |
19 | 19 | Explain quantum computing in simple terms. |
20 | 20 | """ |
21 | | -config = AutoConfig.from_pretrained(model_id, num_hidden_layers=2) |
22 | | -tokenizer = AutoTokenizer.from_pretrained(model_id, num_hidden_layers=2) |
| 21 | +config = AutoConfig.from_pretrained(model_id) |
| 22 | +tokenizer = AutoTokenizer.from_pretrained(model_id) |
23 | 23 | PREFILL_SEQ_LEN = 128 |
24 | 24 | CTX_LEN = 128 * 3 |
25 | 25 |
|
26 | | -qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) |
27 | | -breakpoint() |
| 26 | +qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) |
28 | 27 | decode_qpc_path = qeff_model.compile( |
29 | 28 | prefill_seq_len=1, |
30 | 29 | ctx_len=CTX_LEN, |
31 | 30 | num_cores=16, |
32 | 31 | mxfp6_matmul=True, |
33 | 32 | mxint8_kv_cache=True, |
34 | | - num_devices=2, |
35 | | - split_retained_state_io=True, |
| 33 | + num_devices=1, |
36 | 34 | mos=1, |
37 | 35 | aic_enable_depth_first=True, |
38 | 36 | num_speculative_tokens=None, |
39 | 37 | offload_pt_weights=False, # Need the weights in memory for prefill-model export/compilation in the next step |
40 | 38 | retain_full_kv=True, |
41 | 39 | ) |
42 | | -breakpoint() |
43 | 40 |
|
44 | 41 | # The following command errors out by default; the user is expected to run the printed command and provide the generated qpc path as prefill_qpc_path, commenting out lines 55-68 |
45 | 42 | # prefill_qpc_path = "/home/dipankar/.cache/qeff_models/Qwen3MoeForCausalLM/Qwen3MoeForCausalLM-2fff95dd3d8e1907/qpc-0d9874dc75da1555/qpc" |
|
60 | 57 | # use_onnx_subfunctions=True, |
61 | 58 | ) |
62 | 59 |
|
63 | | -breakpoint() |
| 60 | + |
64 | 61 | inputs = tokenizer(prompt, return_tensors="np", padding=True) |
65 | 62 | position_ids = inputs["attention_mask"].sum(1, keepdims=True) |
66 | 63 | generation_len = CTX_LEN - position_ids.max() |
|
74 | 71 | inputs.pop("past_key_values", None) |
75 | 72 | inputs = {k: v.detach().numpy() for k, v in inputs.items()} |
76 | 73 |
|
77 | | -breakpoint() |
78 | 74 |
|
79 | 75 | prefill_session = QAICInferenceSession(prefill_qpc_path) |
| 76 | +decode_session = QAICInferenceSession(decode_qpc_path) |
80 | 77 |
|
81 | 78 | all_outputs = [] |
82 | 79 | for i in range(num_chunks): |
|
86 | 83 | ins = time.time() |
87 | 84 | qpc_out = prefill_session.run(chunk_inputs) |
88 | 85 | print(f"time for this run={time.time() - ins}") |
89 | | - breakpoint() |
90 | 86 | for i in range(config.num_hidden_layers): |
91 | 87 | inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"] |
92 | 88 | inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"] |
93 | 89 |
|
94 | 90 | all_outputs.append(np.argmax(qpc_out["logits"])) |
95 | | -prefill_session.deactivate() |
96 | | -decode_session = QAICInferenceSession(decode_qpc_path) |
97 | | -breakpoint() |
98 | | -# decode_session.activate() |
| 91 | + |
99 | 92 | decode_inputs = { |
100 | 93 | "input_ids": np.argmax(qpc_out["logits"]).reshape(1, 1), |
101 | 94 | "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, |
|
0 commit comments