diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index c9869943d1d..902fa20f81e 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -212,17 +212,6 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_XPU_MOE_FFN_QUANT_TYPE_MAP": lambda: os.getenv("FD_XPU_MOE_FFN_QUANT_TYPE_MAP", ""),
     # Whether to enable low latency in mixed scenario
     "FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))),
-    # Whether to use phi FP8 quantization,if 1,use paddle default.
-    "FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
-    # Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc,
-    # intended for training alignment. Defaults to 0 (disabled).
-    "FD_USE_PHI_MOE_TOPK": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_TOPK", "0"))),
-    # Whether to use phi MOE permute,if 1,use paddle op.
-    "FD_USE_PHI_MOE_PERMUTE": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_PERMUTE", "0"))),
-    # Whether to use phi rms_norm,if 1,use paddle op.
-    "FD_USE_PHI_RMSNORM": lambda: bool(int(os.getenv("FD_USE_PHI_RMSNORM", "0"))),
-    # Control class SiluAndMul to use swiglu or fusid_bias_act operator in the forward_cuda function
-    "FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
     # Reserve output blocks for decoding requests when schedule new prefill requests
     "FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: int(
         os.getenv("FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "16")
@@ -270,8 +259,22 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST": lambda: bool(
         int(os.getenv("FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST", "1"))
     ),
+    # Train-infer consistency, used in RL
     # Whether to align RoPE and moe gate precision with training
     "FD_ENABLE_RL": lambda: int(os.getenv("FD_ENABLE_RL", "0")),
+    # Whether to use phi FP8 quantization; if 1, use the Paddle default.
+    "FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
+    # Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc,
+    # intended for training alignment. Defaults to 0 (disabled).
+    "FD_USE_PHI_MOE_TOPK": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_TOPK", "0"))),
+    # Whether to use the phi MoE permute; if 1, use the Paddle op.
+    "FD_USE_PHI_MOE_PERMUTE": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_PERMUTE", "0"))),
+    # Whether to use phi rms_norm; if 1, use the Paddle op.
+    "FD_USE_PHI_RMSNORM": lambda: bool(int(os.getenv("FD_USE_PHI_RMSNORM", "0"))),
+    # Controls whether SiluAndMul uses the swiglu or fused_bias_act operator in forward_cuda
+    "FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
+    # Whether to force power-of-two (pow2) scales in FP8 blockwise quantization. Defaults to 0 (disabled).
+    "FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
 }
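
The new flag follows the existing envs.py pattern: the value is resolved through the registered lambda, and call sites read it as fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE. Below is a minimal sketch of the parsing behavior, mirroring the registered lambda; the helper name is hypothetical and only illustrates how the string is interpreted:

    import os

    def read_fp8_pow2_flag() -> bool:
        # Mirrors the registered lambda: bool(int(...)) with a "0" default,
        # so any non-zero integer string enables the flag.
        return bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0")))

    os.environ["FD_FP8_QUANT_WITH_POW2SCALE"] = "1"
    assert read_fp8_pow2_flag() is True

    del os.environ["FD_FP8_QUANT_WITH_POW2SCALE"]
    assert read_fp8_pow2_flag() is False

Note that bool(int(...)) rejects non-integer strings: a value such as "true" raises ValueError instead of enabling the flag.
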
+ "FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))), } diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 53247e29126..a16e5ccbe9c 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -188,7 +188,7 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op( else: ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( ffn_out, - using_pow2_scale=not disable_ue8m0_cast, + using_pow2_scale=not disable_ue8m0_cast or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, using_ue8m0_scale=not disable_ue8m0_cast, ) ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] @@ -355,7 +355,7 @@ def apply_ep_prefill( else: x_fp8, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( x, - using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0, + using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, output_scale_transpose=self.quant_config.deepgemm_scale_ue8m0, using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0, ) @@ -581,7 +581,8 @@ def apply_ep_prefill( else: ffn_in_x, ffn_in_x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise( ffn_out, - using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0, + using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 + or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0, ) ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.T[: ffn_in_x.shape[0]] @@ -773,7 +774,7 @@ def apply_tp( else: recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( x, - using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0, + using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, output_scale_transpose=self.quant_config.deepgemm_scale_ue8m0, using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0, ) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index 010b61e0496..56f6e6dd42c 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -1248,7 +1248,7 @@ def python_op_fused_moe_kernel_paddle( x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, quant_config.weight_block_size[0], False) else: x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( - x, using_pow2_scale=False, output_scale_transpose=False + x, using_pow2_scale=fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, output_scale_transpose=False ) x_scale = x_scale[: x.shape[0]] @@ -1306,7 +1306,9 @@ def python_op_fused_moe_kernel_paddle( ) else: x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise( - intermediate_cache2, using_pow2_scale=False, output_scale_transpose=False + intermediate_cache2, + using_pow2_scale=fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE, + output_scale_transpose=False, ) x_scale = x_scale[: x_q.shape[0]] diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index b277efbd73d..709a5e69613 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py 
diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index b277efbd73d..709a5e69613 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -353,7 +353,7 @@ def apply(self, layer, x):
         else:
             x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
                 x,
-                using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
+                using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0 or fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,
                 output_scale_transpose=True,
                 using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
             )
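
For completeness, a usage sketch for the new switch. The fp8_quant_blockwise keyword arguments are taken from the call sites above; everything else is an assumption: it presumes a GPU build of Paddle with the blockwise FP8 kernel available, and the tensor shape is an arbitrary example chosen to be divisible by the usual 128-wide quantization blocks. Since envs.py entries appear to be resolved lazily through the registered lambdas, exporting the variable before the process starts is the safe choice:

    import os

    os.environ["FD_FP8_QUANT_WITH_POW2SCALE"] = "1"

    import paddle
    import fastdeploy.envs

    x = paddle.randn([128, 7168]).astype("bfloat16")
    x_q, x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
        x,
        using_pow2_scale=fastdeploy.envs.FD_FP8_QUANT_WITH_POW2SCALE,  # True here
        output_scale_transpose=False,
    )
    print(x_q.dtype, x_scale.shape)
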