diff --git a/src/prime_rl/templates/_launch_rank.sh.j2 b/src/prime_rl/templates/_launch_rank.sh.j2
index f9d35fa935..f4c2a0be40 100644
--- a/src/prime_rl/templates/_launch_rank.sh.j2
+++ b/src/prime_rl/templates/_launch_rank.sh.j2
@@ -15,14 +15,18 @@ rank_log() {
 # launch_inference_rank
 launch_inference_rank() {
     local port="$1" gpus="$2" dp="$3" rpc="$4" extra="$5" nixl="$6" log="$7"
+    local log_name="${log##*/}"
+    local -a tee_args=(-a "$log")
+    # node_*.log already gets a truncating header; rank-specific logs do not.
+    [[ "$log_name" == *_rank*.log ]] && tee_args=("$log")
     local -a cmd=(uv run inference @ "$CONFIG_PATH" --server.host 0.0.0.0 --server.port "$port"
         --parallel.dp "$dp" --data-parallel-size-local 1 --api-server-count 1)
     [ -n "$rpc" ] && cmd+=(--data-parallel-rpc-port "$rpc")
     [ -n "$extra" ] && cmd+=(--vllm-extra "$extra")
     if [ -n "$nixl" ]; then
-        CUDA_VISIBLE_DEVICES="$gpus" VLLM_NIXL_SIDE_CHANNEL_PORT="$nixl" "${cmd[@]}" 2>&1 | tee -a "$log" &
+        CUDA_VISIBLE_DEVICES="$gpus" VLLM_NIXL_SIDE_CHANNEL_PORT="$nixl" "${cmd[@]}" 2>&1 | tee "${tee_args[@]}" &
     else
-        CUDA_VISIBLE_DEVICES="$gpus" "${cmd[@]}" 2>&1 | tee -a "$log" &
+        CUDA_VISIBLE_DEVICES="$gpus" "${cmd[@]}" 2>&1 | tee "${tee_args[@]}" &
     fi
 }
 
diff --git a/src/prime_rl/templates/inference.sbatch.j2 b/src/prime_rl/templates/inference.sbatch.j2
index e7d7f784bc..8814ca12f1 100755
--- a/src/prime_rl/templates/inference.sbatch.j2
+++ b/src/prime_rl/templates/inference.sbatch.j2
@@ -51,7 +51,6 @@ export PROJECT_DIR={{ project_dir }}
 export CONFIG_PATH={{ config_path }}
 export OUTPUT_DIR={{ output_dir }}
 mkdir -p $OUTPUT_DIR/logs/inference
-rm -f $OUTPUT_DIR/logs/inference/*.log
 ln -sfn inference/node_0.log $OUTPUT_DIR/logs/inference.log
 
 # General
diff --git a/src/prime_rl/templates/multi_node_rl.sbatch.j2 b/src/prime_rl/templates/multi_node_rl.sbatch.j2
index c462610778..8c8d373b29 100755
--- a/src/prime_rl/templates/multi_node_rl.sbatch.j2
+++ b/src/prime_rl/templates/multi_node_rl.sbatch.j2
@@ -57,7 +57,6 @@ export CONFIG_DIR={{ config_dir }}
 export OUTPUT_DIR={{ output_dir }}
 export ORCHESTRATOR_OUTPUT_DIR={{ orchestrator_output_dir }}
 mkdir -p $OUTPUT_DIR/logs/trainer $OUTPUT_DIR/logs/inference
-rm -f $OUTPUT_DIR/logs/inference/*.log
 ln -sfn trainer/node_0.log $OUTPUT_DIR/logs/trainer.log
 ln -sfn inference/node_0.log $OUTPUT_DIR/logs/inference.log
 