diff --git a/src/dstack/_internal/cli/commands/offer.py b/src/dstack/_internal/cli/commands/offer.py
index 0e4be1d5c2..dd81231536 100644
--- a/src/dstack/_internal/cli/commands/offer.py
+++ b/src/dstack/_internal/cli/commands/offer.py
@@ -74,7 +74,10 @@ def _register(self):
     def _command(self, args: argparse.Namespace):
         super()._command(args)
 
-        conf = TaskConfiguration(commands=[":"])
+        # Set image and user so that the server (a) does not default gpu.vendor
+        # to nvidia — `dstack offer` should show all vendors, and (b) does not
+        # attempt to pull the image config from the Docker registry.
+        conf = TaskConfiguration(commands=[":"], image="scratch", user="root")
         configurator = OfferConfigurator(api_client=self.api)
         configurator.apply_args(conf, args)
diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py
index 33ba4e10d2..c4c4d3488a 100644
--- a/src/dstack/_internal/cli/services/configurators/run.py
+++ b/src/dstack/_internal/cli/services/configurators/run.py
@@ -383,7 +383,13 @@ def interpolate_env(self, conf: RunConfigurationT):
 
     def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
         """
-        Infers and sets `resources.gpu.vendor` if not set, requires `image` if the vendor is AMD.
+        Infers the GPU vendor if it is not set. Defaults to Nvidia when the
+        default CUDA image is used. Requires an explicit `image` if the vendor
+        is AMD or Tenstorrent.
+
+        NOTE: The vendor inferred from the default image is not written to
+        gpu_spec, for compatibility with older servers; the server applies the
+        same default in set_gpu_vendor_default(). That inferred vendor is used
+        here only for validation and display (see _infer_gpu_vendor).
         """
         gpu_spec = conf.resources.gpu
         if gpu_spec is None:
@@ -425,12 +431,18 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
                 # CUDA image, not a big deal.
                 has_amd_gpu = gpuhunt.AcceleratorVendor.AMD in vendors
                 has_tt_gpu = gpuhunt.AcceleratorVendor.TENSTORRENT in vendors
+                # Set the vendor inferred from the GPU name on the spec
+                # (the server needs it for filtering).
+                gpu_spec.vendor = vendor
             else:
-                # If neither gpu.vendor nor gpu.name is set, assume Nvidia.
-                vendor = gpuhunt.AcceleratorVendor.NVIDIA
+                # Neither vendor nor name is specified. Default to Nvidia if
+                # the default CUDA image is used, since it is only compatible
+                # with Nvidia GPUs. The inferred vendor is not written to the
+                # spec; the server applies the same default in
+                # set_gpu_vendor_default(), for compatibility with older
+                # servers that don't handle vendor + count.min=0 correctly.
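+                # With a custom image or docker=True, any GPU vendor may be
+                # used, so the vendor is left unset here.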
+                if conf.image is None and conf.docker is not True:
+                    vendor = gpuhunt.AcceleratorVendor.NVIDIA
                 has_amd_gpu = False
                 has_tt_gpu = False
-            gpu_spec.vendor = vendor
         else:
             has_amd_gpu = vendor == gpuhunt.AcceleratorVendor.AMD
             has_tt_gpu = vendor == gpuhunt.AcceleratorVendor.TENSTORRENT
diff --git a/src/dstack/_internal/core/models/resources.py b/src/dstack/_internal/core/models/resources.py
index 20b4f3aa55..02cbbdc9b8 100644
--- a/src/dstack/_internal/core/models/resources.py
+++ b/src/dstack/_internal/core/models/resources.py
@@ -319,6 +319,9 @@ def _vendor_from_string(cls, v: str) -> gpuhunt.AcceleratorVendor:
         return gpuhunt.AcceleratorVendor.cast(v)
 
 
+DEFAULT_GPU_SPEC = GPUSpec(count=Range[int](min=0, max=None))
+
+
 class DiskSpecConfig(CoreConfig):
     @staticmethod
     def schema_extra(schema: Dict[str, Any]):
@@ -387,7 +390,8 @@ class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)):
             "you may need to configure this"
         ),
     ] = None
-    gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
+    # The type remains Optional for backward compatibility
+    gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = DEFAULT_GPU_SPEC
     disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
 
     def pretty_format(self) -> str:
@@ -397,6 +401,7 @@ def pretty_format(self) -> str:
         if self.gpu:
             gpu = self.gpu
             resources.update(
+                gpu_vendor=gpu.vendor,
                 gpu_name=",".join(gpu.name) if gpu.name else None,
                 gpu_count=gpu.count,
                 gpu_memory=gpu.memory,
diff --git a/src/dstack/_internal/server/services/resources.py b/src/dstack/_internal/server/services/resources.py
index 17cd80a662..aab47de21c 100644
--- a/src/dstack/_internal/server/services/resources.py
+++ b/src/dstack/_internal/server/services/resources.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import gpuhunt
 from pydantic import parse_obj_as
 
@@ -19,3 +21,24 @@ def set_resources_defaults(resources: ResourcesSpec) -> None:
         else:
             cpu.arch = gpuhunt.CPUArchitecture.X86
         resources.cpu = cpu
+
+
+def set_gpu_vendor_default(
+    resources: ResourcesSpec,
+    image: Optional[str],
+    docker: Optional[bool],
+) -> None:
+    """Default the GPU vendor to Nvidia when the default CUDA image is used,
+    since that image is only compatible with Nvidia GPUs.
+    Mirrors the client-side logic in validate_gpu_vendor_and_image().
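+
+    Illustrative sketch, not code from this module (assumes the
+    DEFAULT_GPU_SPEC default, i.e. vendor=None, name=None, count=0..):
+
+        spec = ResourcesSpec()
+        set_gpu_vendor_default(spec, image=None, docker=None)
+        assert spec.gpu.vendor == gpuhunt.AcceleratorVendor.NVIDIA
+
+        spec = ResourcesSpec()  # a custom image leaves the vendor unset
+        set_gpu_vendor_default(spec, image="my-custom-image", docker=None)
+        assert spec.gpu.vendor is None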
+    Should only be called for runs, not fleets, since fleets have no image context.
+    """
+    gpu = resources.gpu
+    if (
+        gpu is not None
+        and gpu.vendor is None
+        and gpu.name is None
+        and gpu.count.max != 0
+        and image is None
+        and docker is not True
+    ):
+        gpu.vendor = gpuhunt.AcceleratorVendor.NVIDIA
diff --git a/src/dstack/_internal/server/services/runs/__init__.py b/src/dstack/_internal/server/services/runs/__init__.py
index 5ae19b348f..73966916b5 100644
--- a/src/dstack/_internal/server/services/runs/__init__.py
+++ b/src/dstack/_internal/server/services/runs/__init__.py
@@ -65,7 +65,10 @@
 from dstack._internal.server.services.plugins import apply_plugin_policies
 from dstack._internal.server.services.probes import is_probe_ready
 from dstack._internal.server.services.projects import list_user_project_models
-from dstack._internal.server.services.resources import set_resources_defaults
+from dstack._internal.server.services.resources import (
+    set_gpu_vendor_default,
+    set_resources_defaults,
+)
 from dstack._internal.server.services.runs.plan import get_job_plans
 from dstack._internal.server.services.runs.spec import (
     can_update_run_spec,
@@ -343,8 +346,8 @@ async def get_plan(
         )
         if current_resource is not None:
             # For backward compatibility (current_resource may has been submitted before
-            # some fields, e.g., CPUSpec.arch, were added)
-            set_resources_defaults(current_resource.run_spec.configuration.resources)
+            # some fields, e.g., CPUSpec.arch and gpu.vendor, were added)
+            _set_run_resources_defaults(current_resource.run_spec)
             if not current_resource.status.is_finished() and can_update_run_spec(
                 current_resource.run_spec, effective_run_spec
             ):
@@ -354,7 +357,7 @@
         session=session,
         project=project,
         profile=profile,
-        run_spec=run_spec,
+        run_spec=effective_run_spec,
         max_offers=max_offers,
     )
     run_plan = RunPlan(
@@ -410,8 +413,8 @@ async def apply_plan(
     current_resource = run_model_to_run(current_resource_model, return_in_api=True)
     # For backward compatibility (current_resource may has been submitted before
-    # some fields, e.g., CPUSpec.arch, were added)
-    set_resources_defaults(current_resource.run_spec.configuration.resources)
+    # some fields, e.g., CPUSpec.arch and gpu.vendor, were added)
+    _set_run_resources_defaults(current_resource.run_spec)
     try:
         spec_diff = check_can_update_run_spec(current_resource.run_spec, run_spec)
     except ServerClientError:
@@ -421,7 +424,7 @@
         raise
     if not force:
         if plan.current_resource is not None:
-            set_resources_defaults(plan.current_resource.run_spec.configuration.resources)
+            _set_run_resources_defaults(plan.current_resource.run_spec)
         if (
             plan.current_resource is None
             or plan.current_resource.id != current_resource.id
@@ -782,6 +785,16 @@ def run_model_to_run(
     return run
 
 
+def _set_run_resources_defaults(run_spec: RunSpec) -> None:
+    """Apply resource defaults to a run spec, including GPU vendor inference."""
+    set_resources_defaults(run_spec.configuration.resources)
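+    # getattr: `docker` is not necessarily defined on all configuration types.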
+    set_gpu_vendor_default(
+        run_spec.configuration.resources,
+        image=run_spec.configuration.image,
+        docker=getattr(run_spec.configuration, "docker", None),
+    )
+
+
 def _get_run_jobs_with_submissions(
     run_model: RunModel,
     job_submissions_limit: Optional[int],
diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py
index f478d187bb..a18f151ce1 100644
--- a/src/dstack/_internal/server/services/runs/spec.py
+++ b/src/dstack/_internal/server/services/runs/spec.py
@@ -8,7 +8,10 @@
 from dstack._internal.server import settings
 from dstack._internal.server.models import UserModel
 from dstack._internal.server.services.docker import is_valid_docker_volume_target
-from dstack._internal.server.services.resources import set_resources_defaults
+from dstack._internal.server.services.resources import (
+    set_gpu_vendor_default,
+    set_resources_defaults,
+)
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -108,6 +111,11 @@ def validate_run_spec_and_set_defaults(
     if run_spec.configuration.priority is None:
         run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
     set_resources_defaults(run_spec.configuration.resources)
+    set_gpu_vendor_default(
+        run_spec.configuration.resources,
+        image=run_spec.configuration.image,
+        docker=getattr(run_spec.configuration, "docker", None),
+    )
     if run_spec.ssh_key_pub is None:
         if user.ssh_public_key:
             run_spec.ssh_key_pub = user.ssh_public_key
diff --git a/src/dstack/_internal/utils/common.py b/src/dstack/_internal/utils/common.py
index 28becc936f..ba139c6bfc 100644
--- a/src/dstack/_internal/utils/common.py
+++ b/src/dstack/_internal/utils/common.py
@@ -91,28 +91,14 @@ def pretty_resources(
     cpus: Optional[Any] = None,
     memory: Optional[Any] = None,
     gpu_count: Optional[Any] = None,
+    gpu_vendor: Optional[Any] = None,
     gpu_name: Optional[Any] = None,
     gpu_memory: Optional[Any] = None,
     total_gpu_memory: Optional[Any] = None,
    compute_capability: Optional[Any] = None,
     disk_size: Optional[Any] = None,
 ) -> str:
-    """
-    >>> pretty_resources(cpus=4, memory="16GB")
-    '4xCPU, 16GB'
-    >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1)
-    '4xCPU, 16GB, 1xGPU'
-    >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100')
-    '4xCPU, 16GB, 1xA100'
-    >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100', gpu_memory="40GB")
-    '4xCPU, 16GB, 1xA100 (40GB)'
-    >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, total_gpu_memory="80GB")
-    '4xCPU, 16GB, 1xGPU (total 80GB)'
-    >>> pretty_resources(cpus=4, memory="16GB", gpu_count=2, gpu_name='A100', gpu_memory="40GB", total_gpu_memory="80GB")
-    '4xCPU, 16GB, 2xA100 (40GB, total 80GB)'
-    >>> pretty_resources(gpu_count=1, compute_capability="8.0")
-    '1xGPU (8.0)'
-    """
+    """Format resource requirements as a human-readable string."""
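+    # Examples of the output format, mirroring TestPrettyResources added in
+    # src/tests/_internal/utils/test_common.py:
+    #   pretty_resources(cpus=4, memory="16GB", gpu_count=2, gpu_name="A100",
+    #                    gpu_memory="40GB", total_gpu_memory="80GB")
+    #   -> 'cpu=4 mem=16GB gpu=A100:40GB:2:80GB'
+    #   pretty_resources(cpus=2, memory="8GB", disk_size="100GB",
+    #                    gpu_count="0..", gpu_vendor="nvidia")
+    #   -> 'cpu=2 mem=8GB disk=100GB gpu=nvidia:0..'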
     parts = []
     if cpus is not None:
         cpu_arch_lower: Optional[str] = None
@@ -131,7 +117,6 @@
         parts.append(f"disk={disk_size}")
     if gpu_count:
         gpu_parts = []
-        gpu_parts.append(f"{gpu_name or 'gpu'}")
         if gpu_memory is not None:
             gpu_parts.append(f"{gpu_memory}")
         if gpu_count is not None:
@@ -141,8 +126,13 @@
         if compute_capability is not None:
             gpu_parts.append(f"{compute_capability}")
 
-        gpu = ":".join(gpu_parts)
-        parts.append(gpu)
+        if gpu_name:
+            parts.append("gpu=" + ":".join([f"{gpu_name}"] + gpu_parts))
+        elif gpu_vendor:
+            vendor_str = gpu_vendor.value if isinstance(gpu_vendor, enum.Enum) else str(gpu_vendor)
+            parts.append("gpu=" + ":".join([vendor_str] + gpu_parts))
+        else:
+            parts.append("gpu=" + ":".join(gpu_parts))
 
     return " ".join(parts)
diff --git a/src/tests/_internal/cli/services/configurators/test_run.py b/src/tests/_internal/cli/services/configurators/test_run.py
index eb5027671a..6238bcb025 100644
--- a/src/tests/_internal/cli/services/configurators/test_run.py
+++ b/src/tests/_internal/cli/services/configurators/test_run.py
@@ -132,13 +132,34 @@ def validate(self, conf: BaseRunConfiguration) -> None:
     def test_no_gpu(self):
         conf = self.prepare_conf()
         self.validate(conf)
-        assert conf.resources.gpu is None
+        assert conf.resources.gpu is not None
+        # The vendor is not written to the spec for compatibility with older
+        # servers. The server defaults it to nvidia in set_gpu_vendor_default().
+        assert conf.resources.gpu.vendor is None
+        assert conf.resources.gpu.name is None
+        assert conf.resources.gpu.count.min == 0
 
     def test_zero_gpu(self):
         conf = self.prepare_conf(gpu_spec="0")
         self.validate(conf)
         assert conf.resources.gpu.vendor is None
 
+    def test_gpu_no_vendor_no_image_defaults_to_nvidia(self):
+        """The vendor is inferred as nvidia for validation but NOT written to the spec."""
+        conf = self.prepare_conf(gpu_spec="1")
+        self.validate(conf)
+        assert conf.resources.gpu.vendor is None
+
+    def test_gpu_no_vendor_with_image_no_default(self):
+        conf = self.prepare_conf(gpu_spec="1", image="my-custom-image")
+        self.validate(conf)
+        assert conf.resources.gpu.vendor is None
+
+    def test_gpu_no_vendor_docker_true_no_default(self):
+        conf = self.prepare_conf(gpu_spec="1", docker=True)
+        self.validate(conf)
+        assert conf.resources.gpu.vendor is None
+
     @pytest.mark.parametrize(
         ["gpu_spec", "expected_vendor"],
         [
diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py
index b00d6ccf57..fef712acae 100644
--- a/src/tests/_internal/server/routers/test_fleets.py
+++ b/src/tests/_internal/server/routers/test_fleets.py
@@ -344,7 +344,14 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async
                         "cpu": {"min": 2, "max": None},
                         "memory": {"min": 8.0, "max": None},
                         "shm_size": None,
-                        "gpu": None,
+                        "gpu": {
+                            "vendor": None,
+                            "name": None,
+                            "count": {"min": 0, "max": None},
+                            "memory": None,
+                            "total_memory": None,
+                            "compute_capability": None,
+                        },
                         "disk": {"size": {"min": 100.0, "max": None}},
                     },
                     "backends": None,
@@ -467,7 +474,14 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A
                         "cpu": {"min": 2, "max": None},
                         "memory": {"min": 8.0, "max": None},
                         "shm_size": None,
-                        "gpu": None,
+                        "gpu": {
+                            "vendor": None,
+                            "name": None,
+                            "count": {"min": 0, "max": None},
+                            "memory": None,
+                            "total_memory": None,
+                            "compute_capability": None,
+                        },
                         "disk": {"size": {"min": 100.0, "max": None}},
                     },
                     "backends": None,
@@ -639,7 +653,14 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A
                         "cpu": {"min": 2, "max": None},
                         "memory": {"min": 8.0, "max": None},
                         "shm_size": None,
-                        "gpu": None,
+                        "gpu": {
+                            "vendor": None,
+                            "name": None,
+                            "count": {"min": 0, "max": None},
+                            "memory": None,
+                            "total_memory": None,
+                            "compute_capability": None,
+                        },
                         "disk": {"size": {"min": 100.0, "max": None}},
                     },
                     "backends": None,
diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py
index ad8ad878d1..878b4419bd 100644
--- a/src/tests/_internal/server/routers/test_runs.py
+++ b/src/tests/_internal/server/routers/test_runs.py
@@ -47,6 +47,10 @@
 from dstack._internal.server.models import JobModel, RunModel
 from dstack._internal.server.schemas.runs import ApplyRunPlanRequest
 from dstack._internal.server.services.projects import add_project_member
+from dstack._internal.server.services.resources import (
+    set_gpu_vendor_default,
+    set_resources_defaults,
+)
 from dstack._internal.server.services.runs import run_model_to_run
 from dstack._internal.server.services.runs.spec import validate_run_spec_and_set_defaults
 from dstack._internal.server.testing.common import (
@@ -1534,6 +1538,13 @@ async def test_returns_update_or_create_action_on_conf_change(
             run_spec=run_spec,
         )
         run = run_model_to_run(run_model)
+        # Apply the same defaults the server applies to current_resource
+        set_resources_defaults(run.run_spec.configuration.resources)
+        set_gpu_vendor_default(
+            run.run_spec.configuration.resources,
+            image=run.run_spec.configuration.image,
+            docker=getattr(run.run_spec.configuration, "docker", None),
+        )
         run_spec.configuration = new_conf
         response = await client.post(
             f"/api/project/{project.name}/runs/get_plan",
diff --git a/src/tests/_internal/utils/test_common.py b/src/tests/_internal/utils/test_common.py
index 140627580f..70d12c8f39 100644
--- a/src/tests/_internal/utils/test_common.py
+++ b/src/tests/_internal/utils/test_common.py
@@ -13,6 +13,7 @@
     make_proxy_url,
     parse_memory,
     pretty_date,
+    pretty_resources,
     sizeof_fmt,
 )
 
@@ -239,6 +240,80 @@ def test_make_proxy_url(server_url, proxy_url, expected_url):
         assert make_proxy_url(server_url, proxy_url) == expected_url
 
 
+class TestPrettyResources:
+    def test_cpu_and_memory(self):
+        assert pretty_resources(cpus=4, memory="16GB") == "cpu=4 mem=16GB"
+
+    def test_gpu_count_without_name(self):
+        assert pretty_resources(cpus=4, memory="16GB", gpu_count=1) == "cpu=4 mem=16GB gpu=1"
+
+    def test_gpu_count_with_vendor(self):
+        assert (
+            pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_vendor="nvidia")
+            == "cpu=4 mem=16GB gpu=nvidia:1"
+        )
+
+    def test_gpu_count_with_name(self):
+        assert (
+            pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name="A100")
+            == "cpu=4 mem=16GB gpu=A100:1"
+        )
+
+    def test_gpu_with_name_and_memory(self):
+        assert (
+            pretty_resources(
+                cpus=4, memory="16GB", gpu_count=1, gpu_name="A100", gpu_memory="40GB"
+            )
+            == "cpu=4 mem=16GB gpu=A100:40GB:1"
+        )
+
+    def test_gpu_with_total_memory_without_name(self):
+        assert (
+            pretty_resources(cpus=4, memory="16GB", gpu_count=1, total_gpu_memory="80GB")
+            == "cpu=4 mem=16GB gpu=1:80GB"
+        )
+
+    def test_gpu_with_name_memory_and_total_memory(self):
+        assert (
+            pretty_resources(
+                cpus=4,
+                memory="16GB",
+                gpu_count=2,
+                gpu_name="A100",
+                gpu_memory="40GB",
+                total_gpu_memory="80GB",
+            )
+            == "cpu=4 mem=16GB gpu=A100:40GB:2:80GB"
+        )
+
+    def test_gpu_with_compute_capability(self):
+        assert pretty_resources(gpu_count=1, compute_capability="8.0") == "gpu=1:8.0"
+
+    def test_disk(self):
+        assert (
+            pretty_resources(cpus=2, memory="8GB", disk_size="100GB") == "cpu=2 mem=8GB disk=100GB"
+        )
+
+    def test_no_gpu(self):
+        assert pretty_resources(cpus=2, memory="8GB") == "cpu=2 mem=8GB"
+
+    def test_gpu_zero_count_range(self):
+        """The default GPU spec (0..) should display as gpu=0.."""
+        assert (
+            pretty_resources(cpus=2, memory="8GB", disk_size="100GB", gpu_count="0..")
+            == "cpu=2 mem=8GB disk=100GB gpu=0.."
+        )
+
+    def test_gpu_zero_count_range_with_vendor(self):
+        """The default GPU spec with the nvidia vendor should display as gpu=nvidia:0.."""
+        assert (
+            pretty_resources(
+                cpus=2, memory="8GB", disk_size="100GB", gpu_count="0..", gpu_vendor="nvidia"
+            )
+            == "cpu=2 mem=8GB disk=100GB gpu=nvidia:0.."
+        )
+
+
 class TestSizeofFmt:
     @pytest.mark.parametrize(
         ("num", "suffix", "expected"),