diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama405b_64gpu_ocdbt.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama405b_64gpu_ocdbt.yaml
new file mode 100644
index 000000000..0a35e9dee
--- /dev/null
+++ b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama405b_64gpu_ocdbt.yaml
@@ -0,0 +1,18 @@
+suite_name: 'Llama 3.1 405B Orbax OCDBT 64-GPU'
+num_repeats: 20
+
+mesh_config:  # axis sizes multiply to 64 (1 x 64 x 1), matching the suite name
+  mesh_axes: [data, fsdp, tensor]
+  ici_parallelism: {data: 1, fsdp: 64, tensor: 1}
+
+checkpoint_config:
+  path: 'gs://orbax-benchmarks/checkpoints/llama-3.1-405B-checkpoints/0/items'
+
+benchmarks:
+  - generator: 'orbax.checkpoint._src.testing.benchmarks.v1.benchmark.Benchmark'
+    options:
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama70b_16gpu_ocdbt.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama70b_16gpu_ocdbt.yaml
new file mode 100644
index 000000000..b8559abee
--- /dev/null
+++ b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama70b_16gpu_ocdbt.yaml
@@ -0,0 +1,19 @@
+suite_name: 'Llama 3.1 70B Orbax OCDBT 16-GPU'
+num_repeats: 20
+
+mesh_config:  # axis sizes multiply to 16 (1 x 16 x 1), matching the suite name
+  mesh_axes: [data, fsdp, tensor]
+  ici_parallelism: {data: 1, fsdp: 16, tensor: 1}
+
+checkpoint_config:
+  path: 'gs://orbax-benchmarks/checkpoints/llama-3.1-70B-checkpoints/0/items'
+
+benchmarks:
+  - generator: 'orbax.checkpoint._src.testing.benchmarks.v1.benchmark.Benchmark'
+    options:
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      enable_trace: true  # not set in the 405B config earlier in this patch
diff --git
a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama8b_4gpu_ocdbt.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama8b_4gpu_ocdbt.yaml
new file mode 100644
index 000000000..541470999
--- /dev/null
+++ b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/orbax_save_restore/orbax_llama8b_4gpu_ocdbt.yaml
@@ -0,0 +1,18 @@
+suite_name: "Llama 3.1 8B Orbax OCDBT 4-GPU"
+num_repeats: 20
+
+mesh_config:  # axis sizes multiply to 4 (1 x 4 x 1), matching the suite name
+  mesh_axes: ["data", "fsdp", "tensor"]
+  ici_parallelism: {"data": 1, "fsdp": 4, "tensor": 1}
+
+checkpoint_config:  # NOTE(review): path lacks the "/items" suffix the 405B/70B configs use - confirm
+  path: "gs://orbax-benchmarks/checkpoints/llama-3.1-8B-checkpoints/0"
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.benchmark.Benchmark"
+    options:
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama405b_benchmark.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama405b_benchmark.yaml
new file mode 100644
index 000000000..aba489f7d
--- /dev/null
+++ b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama405b_benchmark.yaml
@@ -0,0 +1,13 @@
+suite_name: "PyTorch Llama 3.1 405B GCS Benchmark"
+num_repeats: 20
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.pytorch_checkpoint_benchmark.PyTorchCheckpointBenchmark"
+    options:  # NOTE(review): flags below are one-element lists - presumably sweep axes; confirm
+      reference_checkpoint_path: "/mnt/gcs_bucket/llama3.1-405B-checkpoints/"
+      metric_tracemalloc_enabled: [false]
+      save_thread_count: [16]
+      save_per_thread_copy_ahead_mb: [100]
+      enable_async_save: [true]
+      cache_staged_state_dict: [false]
+      enable_gcs_connector: [false]
diff --git
a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama70b_benchmark.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama70b_benchmark.yaml
new file mode 100644
index 000000000..2a8e02fac
--- /dev/null
+++ b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama70b_benchmark.yaml
@@ -0,0 +1,14 @@
+suite_name: "PyTorch Llama 3.1 70B FSSpec Benchmark"  # NOTE(review): flags match the 405B "GCS" suite - confirm label
+num_repeats: 20
+
+benchmarks:
+
+  - generator: "orbax.checkpoint._src.testing.benchmarks.pytorch_checkpoint_benchmark.PyTorchCheckpointBenchmark"
+    options:  # NOTE(review): flags below are one-element lists - presumably sweep axes; confirm
+      reference_checkpoint_path: "/mnt/gcs_bucket/llama3.1-70B-checkpoints/"
+      metric_tracemalloc_enabled: [false]
+      save_thread_count: [16]
+      save_per_thread_copy_ahead_mb: [100]
+      enable_async_save: [true]
+      cache_staged_state_dict: [false]
+      enable_gcs_connector: [false]
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama8b_benchmark.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama8b_benchmark.yaml
new file mode 100644
index 000000000..2ce0e9beb
--- /dev/null
+++ b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/pytorch_save_restore/pytorch_llama8b_benchmark.yaml
@@ -0,0 +1,13 @@
+suite_name: "PyTorch Llama 3.1 8B Optimized GCS Benchmark"  # NOTE(review): enable_gcs_connector is false - confirm label
+num_repeats: 20
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.pytorch_checkpoint_benchmark.PyTorchCheckpointBenchmark"
+    options:  # NOTE(review): path has an extra "checkpoints/" segment vs. the 405B/70B configs - confirm
+      reference_checkpoint_path: "/mnt/gcs_bucket/checkpoints/llama3.1-8B-checkpoints/"
+      metric_tracemalloc_enabled: [false]
+      save_thread_count: [16]
+      save_per_thread_copy_ahead_mb: [100]
+      enable_async_save: [true]
+      cache_staged_state_dict: [false]
+      enable_gcs_connector: [false]