diff --git a/scripts/install-tuolumne.sh b/scripts/install-tuolumne.sh index d8f5da1..62760ca 100644 --- a/scripts/install-tuolumne.sh +++ b/scripts/install-tuolumne.sh @@ -22,4 +22,5 @@ for f in *.so*; do if patchelf --print-needed "$f" 2>/dev/null | grep -Fxq "$OLD"; then echo "STILL NEEDS $OLD -> $f" fi -done \ No newline at end of file +done +cd - diff --git a/scripts/scaffold-tuolumne-torchpypi.job b/scripts/scaffold-tuolumne-torchpypi.job index 3b25274..cc9b10e 100644 --- a/scripts/scaffold-tuolumne-torchpypi.job +++ b/scripts/scaffold-tuolumne-torchpypi.job @@ -13,7 +13,6 @@ ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi # Use ccl plugin that we manually built with install-rccl.sh export NCCL_NET_PLUGIN=../aws-ofi-nccl.git/install/lib/librccl-net.so -export NCCL_NET="AWS Libfabric" torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml diff --git a/scripts/scaffold-tuolumne.job b/scripts/scaffold-tuolumne.job index 79604f7..ce50b46 100644 --- a/scripts/scaffold-tuolumne.job +++ b/scripts/scaffold-tuolumne.job @@ -15,9 +15,6 @@ ml cce/21.0.0 cray-mpich/9.1.0 rocm/7.1.0 rccl/fast-env-slows-mpi # (2) Removing libmpi may cause segfault on mpi4py import export LD_PRELOAD="/opt/rocm-7.1.0/llvm/lib/libomp.so /opt/cray/pe/mpich/9.1.0/ofi/gnu/11.2/lib/libmpi_gnu.so.12" -# Ensure using libfabric. NCCL_NET_PLUGIN should be unecessary to set for WCI wheel. -export NCCL_NET="AWS Libfabric" - torchrun-hpc -N 1 -n 1 $(which scaffold) generate_fractals -c $(pwd)/ScaFFold/configs/benchmark_default.yml # Uncomment if you want torch profiling