Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/dstack/_internal/cli/commands/offer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ def _register(self):

def _command(self, args: argparse.Namespace):
super()._command(args)
conf = TaskConfiguration(commands=[":"])
# Set image and user so that the server (a) does not default gpu.vendor
# to nvidia — `dstack offer` should show all vendors, and (b) does not
# attempt to pull image config from the Docker registry.
conf = TaskConfiguration(commands=[":"], image="scratch", user="root")

configurator = OfferConfigurator(api_client=self.api)
configurator.apply_args(conf, args)
Expand Down
20 changes: 16 additions & 4 deletions src/dstack/_internal/cli/services/configurators/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,13 @@ def interpolate_env(self, conf: RunConfigurationT):

def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
"""
Infers and sets `resources.gpu.vendor` if not set, requires `image` if the vendor is AMD.
Infers GPU vendor if not set. Defaults to Nvidia when using the default
CUDA image. Requires explicit `image` if the vendor is AMD or Tenstorrent.

NOTE: We don't set the inferred vendor on gpu_spec for compatibility with
older servers. Servers set the vendor using the same logic in
set_resources_defaults(). The inferred vendor is used here only for
validation and display (see _infer_gpu_vendor).
"""
gpu_spec = conf.resources.gpu
if gpu_spec is None:
Expand Down Expand Up @@ -425,12 +431,18 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
# CUDA image, not a big deal.
has_amd_gpu = gpuhunt.AcceleratorVendor.AMD in vendors
has_tt_gpu = gpuhunt.AcceleratorVendor.TENSTORRENT in vendors
# Set vendor inferred from name on the spec (server needs it for filtering).
gpu_spec.vendor = vendor
else:
# If neither gpu.vendor nor gpu.name is set, assume Nvidia.
vendor = gpuhunt.AcceleratorVendor.NVIDIA
# No vendor or name specified. Default to Nvidia if using the default
# CUDA image, since it's only compatible with Nvidia GPUs.
# We don't set the inferred vendor on the spec — the server does the
# same inference in set_resources_defaults() for compatibility with
# older servers that don't handle vendor + count.min=0 correctly.
if conf.image is None and conf.docker is not True:
vendor = gpuhunt.AcceleratorVendor.NVIDIA
has_amd_gpu = False
has_tt_gpu = False
gpu_spec.vendor = vendor
else:
has_amd_gpu = vendor == gpuhunt.AcceleratorVendor.AMD
has_tt_gpu = vendor == gpuhunt.AcceleratorVendor.TENSTORRENT
Expand Down
7 changes: 6 additions & 1 deletion src/dstack/_internal/core/models/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,9 @@ def _vendor_from_string(cls, v: str) -> gpuhunt.AcceleratorVendor:
return gpuhunt.AcceleratorVendor.cast(v)


DEFAULT_GPU_SPEC = GPUSpec(count=Range[int](min=0, max=None))


class DiskSpecConfig(CoreConfig):
@staticmethod
def schema_extra(schema: Dict[str, Any]):
Expand Down Expand Up @@ -387,7 +390,8 @@ class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)):
"you may need to configure this"
),
] = None
gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
# Optional for backward compatibility
gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = DEFAULT_GPU_SPEC
disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK

def pretty_format(self) -> str:
Expand All @@ -397,6 +401,7 @@ def pretty_format(self) -> str:
if self.gpu:
gpu = self.gpu
resources.update(
gpu_vendor=gpu.vendor,
gpu_name=",".join(gpu.name) if gpu.name else None,
gpu_count=gpu.count,
gpu_memory=gpu.memory,
Expand Down
23 changes: 23 additions & 0 deletions src/dstack/_internal/server/services/resources.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional

import gpuhunt
from pydantic import parse_obj_as

Expand All @@ -19,3 +21,24 @@ def set_resources_defaults(resources: ResourcesSpec) -> None:
else:
cpu.arch = gpuhunt.CPUArchitecture.X86
resources.cpu = cpu


def set_gpu_vendor_default(
resources: ResourcesSpec,
image: Optional[str],
docker: Optional[bool],
) -> None:
"""Default GPU vendor to Nvidia when using the default CUDA image,
since it's only compatible with Nvidia GPUs.
Mirrors the client-side logic in validate_gpu_vendor_and_image().
Should only be called for runs (not fleets) since fleets don't have image context."""
gpu = resources.gpu
if (
gpu is not None
and gpu.vendor is None
and gpu.name is None
and gpu.count.max != 0
and image is None
and docker is not True
):
gpu.vendor = gpuhunt.AcceleratorVendor.NVIDIA
27 changes: 20 additions & 7 deletions src/dstack/_internal/server/services/runs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@
from dstack._internal.server.services.plugins import apply_plugin_policies
from dstack._internal.server.services.probes import is_probe_ready
from dstack._internal.server.services.projects import list_user_project_models
from dstack._internal.server.services.resources import set_resources_defaults
from dstack._internal.server.services.resources import (
set_gpu_vendor_default,
set_resources_defaults,
)
from dstack._internal.server.services.runs.plan import get_job_plans
from dstack._internal.server.services.runs.spec import (
can_update_run_spec,
Expand Down Expand Up @@ -343,8 +346,8 @@ async def get_plan(
)
if current_resource is not None:
# For backward compatibility (current_resource may have been submitted before
# some fields, e.g., CPUSpec.arch, were added)
set_resources_defaults(current_resource.run_spec.configuration.resources)
# some fields, e.g., CPUSpec.arch, gpu.vendor were added)
_set_run_resources_defaults(current_resource.run_spec)
if not current_resource.status.is_finished() and can_update_run_spec(
current_resource.run_spec, effective_run_spec
):
Expand All @@ -354,7 +357,7 @@ async def get_plan(
session=session,
project=project,
profile=profile,
run_spec=run_spec,
run_spec=effective_run_spec,
max_offers=max_offers,
)
run_plan = RunPlan(
Expand Down Expand Up @@ -410,8 +413,8 @@ async def apply_plan(
current_resource = run_model_to_run(current_resource_model, return_in_api=True)

# For backward compatibility (current_resource may have been submitted before
# some fields, e.g., CPUSpec.arch, were added)
set_resources_defaults(current_resource.run_spec.configuration.resources)
# some fields, e.g., CPUSpec.arch, gpu.vendor were added)
_set_run_resources_defaults(current_resource.run_spec)
try:
spec_diff = check_can_update_run_spec(current_resource.run_spec, run_spec)
except ServerClientError:
Expand All @@ -421,7 +424,7 @@ async def apply_plan(
raise
if not force:
if plan.current_resource is not None:
set_resources_defaults(plan.current_resource.run_spec.configuration.resources)
_set_run_resources_defaults(plan.current_resource.run_spec)
if (
plan.current_resource is None
or plan.current_resource.id != current_resource.id
Expand Down Expand Up @@ -782,6 +785,16 @@ def run_model_to_run(
return run


def _set_run_resources_defaults(run_spec: RunSpec) -> None:
    """Apply resource defaults to a run spec, including GPU vendor inference."""
    conf = run_spec.configuration
    set_resources_defaults(conf.resources)
    # `docker` may be absent on some configuration types, so getattr with a
    # None fallback keeps this safe across all of them.
    set_gpu_vendor_default(
        conf.resources,
        image=conf.image,
        docker=getattr(conf, "docker", None),
    )


def _get_run_jobs_with_submissions(
run_model: RunModel,
job_submissions_limit: Optional[int],
Expand Down
10 changes: 9 additions & 1 deletion src/dstack/_internal/server/services/runs/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
from dstack._internal.server import settings
from dstack._internal.server.models import UserModel
from dstack._internal.server.services.docker import is_valid_docker_volume_target
from dstack._internal.server.services.resources import set_resources_defaults
from dstack._internal.server.services.resources import (
set_gpu_vendor_default,
set_resources_defaults,
)
from dstack._internal.utils.logging import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -108,6 +111,11 @@ def validate_run_spec_and_set_defaults(
if run_spec.configuration.priority is None:
run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
set_resources_defaults(run_spec.configuration.resources)
set_gpu_vendor_default(
run_spec.configuration.resources,
image=run_spec.configuration.image,
docker=getattr(run_spec.configuration, "docker", None),
)
if run_spec.ssh_key_pub is None:
if user.ssh_public_key:
run_spec.ssh_key_pub = user.ssh_public_key
Expand Down
28 changes: 9 additions & 19 deletions src/dstack/_internal/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,28 +91,14 @@ def pretty_resources(
cpus: Optional[Any] = None,
memory: Optional[Any] = None,
gpu_count: Optional[Any] = None,
gpu_vendor: Optional[Any] = None,
gpu_name: Optional[Any] = None,
gpu_memory: Optional[Any] = None,
total_gpu_memory: Optional[Any] = None,
compute_capability: Optional[Any] = None,
disk_size: Optional[Any] = None,
) -> str:
"""
>>> pretty_resources(cpus=4, memory="16GB")
'4xCPU, 16GB'
>>> pretty_resources(cpus=4, memory="16GB", gpu_count=1)
'4xCPU, 16GB, 1xGPU'
>>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100')
'4xCPU, 16GB, 1xA100'
>>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100', gpu_memory="40GB")
'4xCPU, 16GB, 1xA100 (40GB)'
>>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, total_gpu_memory="80GB")
'4xCPU, 16GB, 1xGPU (total 80GB)'
>>> pretty_resources(cpus=4, memory="16GB", gpu_count=2, gpu_name='A100', gpu_memory="40GB", total_gpu_memory="80GB")
'4xCPU, 16GB, 2xA100 (40GB, total 80GB)'
>>> pretty_resources(gpu_count=1, compute_capability="8.0")
'1xGPU (8.0)'
"""
"""Format resource requirements as a human-readable string."""
parts = []
if cpus is not None:
cpu_arch_lower: Optional[str] = None
Expand All @@ -131,7 +117,6 @@ def pretty_resources(
parts.append(f"disk={disk_size}")
if gpu_count:
gpu_parts = []
gpu_parts.append(f"{gpu_name or 'gpu'}")
if gpu_memory is not None:
gpu_parts.append(f"{gpu_memory}")
if gpu_count is not None:
Expand All @@ -141,8 +126,13 @@ def pretty_resources(
if compute_capability is not None:
gpu_parts.append(f"{compute_capability}")

gpu = ":".join(gpu_parts)
parts.append(gpu)
if gpu_name:
parts.append("gpu=" + ":".join([f"{gpu_name}"] + gpu_parts))
elif gpu_vendor:
vendor_str = gpu_vendor.value if isinstance(gpu_vendor, enum.Enum) else str(gpu_vendor)
parts.append("gpu=" + ":".join([vendor_str] + gpu_parts))
else:
parts.append("gpu=" + ":".join(gpu_parts))
return " ".join(parts)


Expand Down
23 changes: 22 additions & 1 deletion src/tests/_internal/cli/services/configurators/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,34 @@ def validate(self, conf: BaseRunConfiguration) -> None:
def test_no_gpu(self):
conf = self.prepare_conf()
self.validate(conf)
assert conf.resources.gpu is None
assert conf.resources.gpu is not None
# Vendor is not written to spec for compatibility with older servers.
# The server infers nvidia in set_resources_defaults().
assert conf.resources.gpu.vendor is None
assert conf.resources.gpu.name is None
assert conf.resources.gpu.count.min == 0

def test_zero_gpu(self):
    """gpu: 0 (GPUs explicitly disabled) must not get a default vendor."""
    conf = self.prepare_conf(gpu_spec="0")
    self.validate(conf)
    assert conf.resources.gpu.vendor is None

def test_gpu_no_vendor_no_image_defaults_to_nvidia(self):
    """Vendor is inferred as nvidia for validation but NOT written to spec."""
    conf = self.prepare_conf(gpu_spec="1")
    self.validate(conf)
    # The spec stays vendor-less for compatibility with older servers;
    # the server applies the nvidia default itself.
    assert conf.resources.gpu.vendor is None

def test_gpu_no_vendor_with_image_no_default(self):
    """A custom image opts out of the nvidia default; vendor stays unset."""
    conf = self.prepare_conf(gpu_spec="1", image="my-custom-image")
    self.validate(conf)
    assert conf.resources.gpu.vendor is None

def test_gpu_no_vendor_docker_true_no_default(self):
    """docker: true opts out of the nvidia default; vendor stays unset."""
    conf = self.prepare_conf(gpu_spec="1", docker=True)
    self.validate(conf)
    assert conf.resources.gpu.vendor is None

@pytest.mark.parametrize(
["gpu_spec", "expected_vendor"],
[
Expand Down
27 changes: 24 additions & 3 deletions src/tests/_internal/server/routers/test_fleets.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,14 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async
"cpu": {"min": 2, "max": None},
"memory": {"min": 8.0, "max": None},
"shm_size": None,
"gpu": None,
"gpu": {
"vendor": None,
"name": None,
"count": {"min": 0, "max": None},
"memory": None,
"total_memory": None,
"compute_capability": None,
},
"disk": {"size": {"min": 100.0, "max": None}},
},
"backends": None,
Expand Down Expand Up @@ -467,7 +474,14 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A
"cpu": {"min": 2, "max": None},
"memory": {"min": 8.0, "max": None},
"shm_size": None,
"gpu": None,
"gpu": {
"vendor": None,
"name": None,
"count": {"min": 0, "max": None},
"memory": None,
"total_memory": None,
"compute_capability": None,
},
"disk": {"size": {"min": 100.0, "max": None}},
},
"backends": None,
Expand Down Expand Up @@ -639,7 +653,14 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A
"cpu": {"min": 2, "max": None},
"memory": {"min": 8.0, "max": None},
"shm_size": None,
"gpu": None,
"gpu": {
"vendor": None,
"name": None,
"count": {"min": 0, "max": None},
"memory": None,
"total_memory": None,
"compute_capability": None,
},
"disk": {"size": {"min": 100.0, "max": None}},
},
"backends": None,
Expand Down
11 changes: 11 additions & 0 deletions src/tests/_internal/server/routers/test_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
from dstack._internal.server.models import JobModel, RunModel
from dstack._internal.server.schemas.runs import ApplyRunPlanRequest
from dstack._internal.server.services.projects import add_project_member
from dstack._internal.server.services.resources import (
set_gpu_vendor_default,
set_resources_defaults,
)
from dstack._internal.server.services.runs import run_model_to_run
from dstack._internal.server.services.runs.spec import validate_run_spec_and_set_defaults
from dstack._internal.server.testing.common import (
Expand Down Expand Up @@ -1534,6 +1538,13 @@ async def test_returns_update_or_create_action_on_conf_change(
run_spec=run_spec,
)
run = run_model_to_run(run_model)
# Apply the same defaults the server applies to current_resource
set_resources_defaults(run.run_spec.configuration.resources)
set_gpu_vendor_default(
run.run_spec.configuration.resources,
image=run.run_spec.configuration.image,
docker=getattr(run.run_spec.configuration, "docker", None),
)
run_spec.configuration = new_conf
response = await client.post(
f"/api/project/{project.name}/runs/get_plan",
Expand Down
Loading