Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,17 +210,6 @@ def _validate_split_kv_size(value: int) -> int:
"FD_XPU_MOE_FFN_QUANT_TYPE_MAP": lambda: os.getenv("FD_XPU_MOE_FFN_QUANT_TYPE_MAP", ""),
# Whether to enable low latency in mixed scenario
"FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))),
# Whether to use phi FP8 quantization,if 1,use paddle default.
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
# Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc,
# intended for training alignment. Defaults to 0 (disabled).
"FD_USE_PHI_MOE_TOPK": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_TOPK", "0"))),
# Whether to use phi MOE permute,if 1,use paddle op.
"FD_USE_PHI_MOE_PERMUTE": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_PERMUTE", "0"))),
# Whether to use phi rms_norm,if 1,use paddle op.
"FD_USE_PHI_RMSNORM": lambda: bool(int(os.getenv("FD_USE_PHI_RMSNORM", "0"))),
# Control class SiluAndMul to use swiglu or fusid_bias_act operator in the forward_cuda function
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
# Reserve output blocks for decoding requests when schedule new prefill requests
"FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: int(
os.getenv("FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "16")
Expand Down Expand Up @@ -268,8 +257,22 @@ def _validate_split_kv_size(value: int) -> int:
"FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST": lambda: bool(
int(os.getenv("FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST", "1"))
),
# train-infer consistency, used in RL
# Whether to align RoPE and moe gate precision with training
"FD_ENABLE_RL": lambda: int(os.getenv("FD_ENABLE_RL", "0")),

This comment was marked as outdated.

# Whether to use phi FP8 quantization; if 1, use the Paddle default.
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
# Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc,
# intended for training alignment. Defaults to 0 (disabled).
"FD_USE_PHI_MOE_TOPK": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_TOPK", "0"))),
# Whether to use the phi MoE permute; if 1, use the Paddle op.
"FD_USE_PHI_MOE_PERMUTE": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_PERMUTE", "0"))),
# Whether to use phi rms_norm; if 1, use the Paddle op.
"FD_USE_PHI_RMSNORM": lambda: bool(int(os.getenv("FD_USE_PHI_RMSNORM", "0"))),
# Controls whether the SiluAndMul class uses the swiglu or fused_bias_act operator in its forward_cuda function.
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
# Whether to enable FP8 quantization with pow2scale.
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
else:
ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
ffn_out,
using_pow2_scale=not disable_ue8m0_cast,
using_pow2_scale=not disable_ue8m0_cast or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
using_ue8m0_scale=not disable_ue8m0_cast,
)
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]]
Expand Down Expand Up @@ -355,7 +355,7 @@ def apply_ep_prefill(
else:
x_fp8, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
x,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
output_scale_transpose=self.quant_config.deepgemm_scale_ue8m0,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
Expand Down Expand Up @@ -581,7 +581,8 @@ def apply_ep_prefill(
else:
ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
ffn_out,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0
or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]]
Expand Down Expand Up @@ -773,7 +774,7 @@ def apply_tp(
else:
recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
x,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
output_scale_transpose=self.quant_config.deepgemm_scale_ue8m0,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1247,7 +1247,7 @@ def python_op_fused_moe_kernel_paddle(
x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, quant_config.weight_block_size[0], False)
else:
x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
x, using_pow2_scale=False, output_scale_transpose=False
x, using_pow2_scale=fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, output_scale_transpose=False

This comment was marked as outdated.

This comment was marked as outdated.

)
x_scale = x_scale[: x.shape[0]]

Expand Down Expand Up @@ -1305,7 +1305,9 @@ def python_op_fused_moe_kernel_paddle(
)
else:
x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False
intermediate_cache2,
using_pow2_scale=fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
output_scale_transpose=False,
)
x_scale = x_scale[: x_q.shape[0]]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def apply(self, layer, x):
else:
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
x,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
output_scale_transpose=True,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
Expand Down
Loading