Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 40 additions & 25 deletions runner_manager/backend/scaleway.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)
from scaleway.instance.v1.custom_api import InstanceUtilsV1API
from scaleway.marketplace.v2 import MarketplaceV2API
from scaleway_core.api import ScalewayException

from runner_manager.backend.base import BaseBackend
from runner_manager.models.backend import (
Expand Down Expand Up @@ -464,37 +465,51 @@ def delete(self, runner: Runner) -> int:
for volume_id in volume_ids:
try:
# First, try to delete using Block Storage API (for sbs_volume)
try:
self.block_client.delete_volume(
zone=self.config.zone,
volume_id=volume_id,
)
log.info(
f"Block storage volume {volume_id} deleted successfully"
)
except Exception as block_error:
block_error_msg = str(block_error)
# If volume not found in Block API, try Instance API (for l_ssd volumes)
if (
"404" in block_error_msg
or "not_found" in block_error_msg.lower()
):
log.debug(
f"Volume {volume_id} not found in Block API, trying Instance API"
)
self.client.delete_volume(
# Retry with exponential backoff because SBS volumes can remain in
# "in_use" status briefly after server deletion while Scaleway detaches them.
max_attempts = 5
deleted = False
for attempt in range(max_attempts):
try:
self.block_client.delete_volume(
zone=self.config.zone,
volume_id=volume_id,
)
log.info(
f"Instance volume {volume_id} deleted successfully"
f"Block storage volume {volume_id} deleted successfully"
)
else:
raise block_error
except Exception as vol_error:
error_msg = str(vol_error)
deleted = True
break
except ScalewayException as block_error:
if block_error.status_code == 404:
# Not a block storage volume, fall through to Instance API
break
if (
block_error.status_code == 412
and attempt + 1 < max_attempts
):
wait = 2**attempt
log.debug(
f"Volume {volume_id} still in_use, retrying in {wait}s "
f"(attempt {attempt + 1}/{max_attempts})"
)
time.sleep(wait)
else:
raise block_error

if not deleted:
# Volume not found in Block API, try Instance API (for l_ssd volumes)
log.debug(
f"Volume {volume_id} not found in Block API, trying Instance API"
)
self.client.delete_volume(
zone=self.config.zone,
volume_id=volume_id,
)
log.info(f"Instance volume {volume_id} deleted successfully")
except ScalewayException as vol_error:
# Volume may already be deleted automatically (especially l_ssd volumes)
if "404" in error_msg or "not_found" in error_msg.lower():
if vol_error.status_code == 404:
log.info(
f"Volume {volume_id} not found - may have been auto-deleted with server or already cleaned up"
)
Expand Down
127 changes: 116 additions & 11 deletions tests/unit/backend/test_scaleway.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest
from redis_om import NotFoundError
from scaleway.instance.v1 import ServerState
from scaleway_core.api import ScalewayException

from runner_manager.backend.scaleway import ScalewayBackend
from runner_manager.models.backend import (
Expand All @@ -15,6 +16,14 @@
from runner_manager.models.runner_group import RunnerGroup


def make_scaleway_error(status_code: int, text: str = "") -> ScalewayException:
"""Create a ScalewayException with a given HTTP status code for use in tests."""
response = MagicMock()
response.status_code = status_code
response.text = text
return ScalewayException(response=response)


@pytest.fixture()
def scaleway_group(settings) -> RunnerGroup:
"""Create a runner group with Scaleway backend configuration."""
Expand Down Expand Up @@ -78,8 +87,8 @@ def fake_scaleway_group(scaleway_group, monkeypatch):
# By default, simulate that volumes are not found in Block API (like l_ssd volumes)
# This will trigger fallback to Instance API, maintaining compatibility with existing tests
mock_block_client = MagicMock()
mock_block_client.delete_volume.side_effect = Exception(
"404 not_found: Volume not in Block API"
mock_block_client.delete_volume.side_effect = make_scaleway_error(
404, "not_found: Volume not in Block API"
)

# Patch the client property
Expand Down Expand Up @@ -406,7 +415,9 @@ def test_delete_with_volume_error(

mock_client = backend.client
mock_client.get_server.return_value = MagicMock(server=mock_server)
mock_client.delete_volume.side_effect = Exception("Volume deletion failed")
mock_client.delete_volume.side_effect = make_scaleway_error(
500, "Volume deletion failed"
)

# Restore wait_for_server_state mock
def mock_wait(self, server_id, target_state, timeout=300):
Expand Down Expand Up @@ -439,7 +450,7 @@ def test_delete_with_volume_not_found_404(

mock_client = backend.client
mock_client.get_server.return_value = MagicMock(server=mock_server)
mock_client.delete_volume.side_effect = Exception("Error 404: Volume not found")
mock_client.delete_volume.side_effect = make_scaleway_error(404, "Volume not found")

# Restore wait_for_server_state mock
def mock_wait(self, server_id, target_state, timeout=300):
Expand Down Expand Up @@ -573,8 +584,8 @@ def test_delete_with_volume_fallback_to_instance_api(

# Mock Block Storage API client to return 404 (volume not found in Block API)
mock_block_client = MagicMock()
mock_block_client.delete_volume.side_effect = Exception(
"404 not_found: Volume not found in Block API"
mock_block_client.delete_volume.side_effect = make_scaleway_error(
404, "not_found: Volume not found in Block API"
)

# Patch both clients
Expand Down Expand Up @@ -974,12 +985,14 @@ def test_delete_with_block_api_non_404_error(
mock_client.delete_server.return_value = None
mock_client.server_action.return_value = None
# Instance API also fails with permission error
mock_client.delete_volume.side_effect = Exception("Permission denied")
mock_client.delete_volume.side_effect = make_scaleway_error(
403, "Permission denied"
)

# Mock Block Storage API client - fails with permission error (not 404)
mock_block_client = MagicMock()
mock_block_client.delete_volume.side_effect = Exception(
"Permission denied: insufficient permissions"
mock_block_client.delete_volume.side_effect = make_scaleway_error(
403, "Permission denied: insufficient permissions"
)

# Patch both clients
Expand Down Expand Up @@ -1027,8 +1040,8 @@ def test_delete_with_block_api_no_fallback_on_permission_error(

# Mock Block Storage API client - fails with permission error (not 404)
mock_block_client = MagicMock()
mock_block_client.delete_volume.side_effect = Exception(
"403 Permission denied: insufficient permissions"
mock_block_client.delete_volume.side_effect = make_scaleway_error(
403, "Permission denied: insufficient permissions"
)

# Patch both clients
Expand All @@ -1053,3 +1066,95 @@ def mock_wait(self, server_id, target_state, timeout=300):
assert "Failed to delete volume test-volume-id" in caplog.text
assert "Permission denied" in caplog.text
assert result == 1


def test_delete_volume_in_use_retry_succeeds(
scaleway_runner, fake_scaleway_group, caplog, monkeypatch
):
"""Test that volume deletion retries when volume is in_use, and succeeds eventually."""
backend = fake_scaleway_group.backend
scaleway_runner.instance_id = "test-server-id"
scaleway_runner.save()

mock_volume = MagicMock()
mock_volume.id = "test-block-volume-id"
mock_server = MagicMock()
mock_server.id = "test-server-id"
mock_server.state = ServerState.STOPPED
mock_server.volumes = {"0": mock_volume}

mock_client = MagicMock()
mock_client.get_server.return_value = MagicMock(server=mock_server)
mock_client.delete_server.return_value = None
mock_client.server_action.return_value = None

# Fail twice with 412 (in_use), then succeed on 3rd attempt
in_use_error = make_scaleway_error(
412, "precondition is not respected: protected_resource"
)
mock_block_client = MagicMock()
mock_block_client.delete_volume.side_effect = [in_use_error, in_use_error, None]

monkeypatch.setattr(ScalewayBackend, "client", property(lambda self: mock_client))
monkeypatch.setattr(
ScalewayBackend, "block_client", property(lambda self: mock_block_client)
)

def mock_wait(self, server_id, target_state, timeout=300):
return mock_server

monkeypatch.setattr(ScalewayBackend, "wait_for_server_state", mock_wait)
monkeypatch.setattr("runner_manager.backend.scaleway.time.sleep", lambda _: None)

result = backend.delete(scaleway_runner)

assert mock_block_client.delete_volume.call_count == 3
assert (
"Block storage volume test-block-volume-id deleted successfully" in caplog.text
)
assert "Failed to delete volume" not in caplog.text
assert result == 1


def test_delete_volume_in_use_retry_exhausted(
scaleway_runner, fake_scaleway_group, caplog, monkeypatch
):
"""Test that a warning is logged when volume stays in_use after all retries."""
backend = fake_scaleway_group.backend
scaleway_runner.instance_id = "test-server-id"
scaleway_runner.save()

mock_volume = MagicMock()
mock_volume.id = "test-block-volume-id"
mock_server = MagicMock()
mock_server.id = "test-server-id"
mock_server.state = ServerState.STOPPED
mock_server.volumes = {"0": mock_volume}

mock_client = MagicMock()
mock_client.get_server.return_value = MagicMock(server=mock_server)
mock_client.delete_server.return_value = None
mock_client.server_action.return_value = None

in_use_error = make_scaleway_error(
412, "precondition is not respected: protected_resource"
)
mock_block_client = MagicMock()
mock_block_client.delete_volume.side_effect = in_use_error

monkeypatch.setattr(ScalewayBackend, "client", property(lambda self: mock_client))
monkeypatch.setattr(
ScalewayBackend, "block_client", property(lambda self: mock_block_client)
)

def mock_wait(self, server_id, target_state, timeout=300):
return mock_server

monkeypatch.setattr(ScalewayBackend, "wait_for_server_state", mock_wait)
monkeypatch.setattr("runner_manager.backend.scaleway.time.sleep", lambda _: None)

result = backend.delete(scaleway_runner)

assert mock_block_client.delete_volume.call_count == 5 # max_attempts
assert "Failed to delete volume test-block-volume-id" in caplog.text
assert result == 1
Loading