diff --git a/.gitignore b/.gitignore index e43b0f988..fe1fdc20d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ .DS_Store +.agents/ +.config/agents/ +agents.md diff --git a/gateway/.envs/example/minio.env b/gateway/.envs/example/minio.env deleted file mode 100644 index 965d69133..000000000 --- a/gateway/.envs/example/minio.env +++ /dev/null @@ -1,13 +0,0 @@ -# ------------------------------------------------------- -# ====================== LOCAL ENV ====================== -# MINIO Config -MINIO_ROOT_USER=minioadmin -MINIO_ROOT_PASSWORD= -MINIO_ENDPOINT_URL=minio:9000 -MINIO_STORAGE_USE_HTTPS=false # prod: true - -# AWS S3 Config -AWS_ACCESS_KEY_ID=minioadmin -AWS_SECRET_ACCESS_KEY= -AWS_STORAGE_BUCKET_NAME=spectrumx -AWS_S3_ENDPOINT_URL=http://minio:9000 diff --git a/gateway/.envs/example/storage.env b/gateway/.envs/example/storage.env new file mode 100644 index 000000000..dcabfbf95 --- /dev/null +++ b/gateway/.envs/example/storage.env @@ -0,0 +1,24 @@ +# ====================== STORAGE ENV ====================== +# PRIMARY (RustFS) — S3-compatible storage, default for local/CI +# SECONDARY (SeaweedFS) — S3-compatible object store for local/dev + +# PRIMARY (RustFS) credentials +PRIMARY_ACCESS_KEY_ID=admin +PRIMARY_ENDPOINT_URL=sds-gateway-local-rustfs:9000 +PRIMARY_S3_ENDPOINT_URL=http://sds-gateway-local-rustfs:9000 +PRIMARY_SECRET_ACCESS_KEY=admin +PRIMARY_STORAGE_BUCKET_NAME=spectrumx +PRIMARY_STORAGE_USE_HTTPS=false + +# SECONDARY (SeaweedFS) credentials +SECONDARY_ACCESS_KEY_ID=admin +SECONDARY_SECRET_ACCESS_KEY=admin +SECONDARY_ENDPOINT_URL=sds-gateway-local-sfs-s3:8333 +SECONDARY_S3_ENDPOINT_URL=http://sds-gateway-local-sfs-s3:8333 +SECONDARY_STORAGE_BUCKET_NAME=spectrumx +SECONDARY_STORAGE_USE_HTTPS=false + +# Transition controls +OBJECT_STORE_DUAL_WRITE_STRICT=false +OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED=false +OBJECT_STORE_WRITE_BOTH_ENABLED=false diff --git a/gateway/.envs/example/storage.prod.env b/gateway/.envs/example/storage.prod.env new file mode 100644 index 000000000..426d967a7 --- /dev/null +++ b/gateway/.envs/example/storage.prod.env @@ -0,0 +1,26 @@ +# ====================== STORAGE ENV (PRODUCTION) ====================== +# SeaweedFS config — see seaweedfs/compose.production.yaml +# RustFS config — see gateway/compose..yaml + +# PRIMARY credentials (RustFS in local and ci, SeaweedFS in prod) +PRIMARY_ACCESS_KEY_ID=admin +PRIMARY_ENDPOINT_URL=sds-gateway-prod-sfs-s3:8333 +PRIMARY_S3_ENDPOINT_URL=http://sds-gateway-prod-sfs-s3:8333 +PRIMARY_SECRET_ACCESS_KEY=admin +PRIMARY_STORAGE_BUCKET_NAME=spectrumx +PRIMARY_STORAGE_USE_HTTPS=false + +# SECONDARY credentials (usually RustFS in prod; absent in local and ci) +SECONDARY_ACCESS_KEY_ID=minioadmin +SECONDARY_ENDPOINT_URL=prod-secondary-rustfs:9000 +SECONDARY_ROOT_PASSWORD= +SECONDARY_ROOT_USER=minioadmin +SECONDARY_S3_ENDPOINT_URL=http://prod-secondary-rustfs:9000 +SECONDARY_SECRET_ACCESS_KEY= +SECONDARY_STORAGE_BUCKET_NAME=spectrumx +SECONDARY_STORAGE_USE_HTTPS=false + +# Transition controls +OBJECT_STORE_DUAL_WRITE_STRICT=false +OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED=false +OBJECT_STORE_WRITE_BOTH_ENABLED=false diff --git a/gateway/.github/workflows/ci.yml b/gateway/.github/workflows/ci.yml deleted file mode 100644 index d490ab4e2..000000000 --- a/gateway/.github/workflows/ci.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: CI - -# Enable Buildkit and let compose use it to speed up image building -env: - DOCKER_BUILDKIT: 1 - COMPOSE_DOCKER_CLI_BUILD: 1 - -on: - workflow_dispatch: - # To manually 
trigger the workflow - # https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#workflow_dispatch - - pull_request: - types: ["ready_for_review", "synchronize"] - branches: ["master", "main"] - paths-ignore: ["docs/**"] - - push: - branches: ["master", "main"] - paths-ignore: ["docs/**"] - -concurrency: - group: ${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - linter: - runs-on: ubuntu-latest - steps: - - name: Checkout Code Repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: "3.12" - # Consider using pre-commit.ci for open source project - - name: Run pre-commit - uses: pre-commit/action@v3.0.1 - - # With no caching at all the entire ci process takes 3m to complete! - pytest: - runs-on: ubuntu-latest - - steps: - - name: Checkout Code Repository - uses: actions/checkout@v4 - - - name: Build the Stack - run: docker compose -f compose.local.yaml build django - - - name: Build the docs - run: docker compose -f compose.docs.yaml build docs - - - name: Run DB Migrations - run: docker compose -f compose.local.yaml run --rm django uv run manage.py migrate - - - name: Run Django Tests - run: docker compose -f compose.local.yaml run --rm django uv run manage.py test - - - name: Tear down the Stack - run: docker compose -f compose.local.yaml down diff --git a/gateway/compose.ci.yaml b/gateway/compose.ci.yaml index 14ea08c80..ae9350c17 100644 --- a/gateway/compose.ci.yaml +++ b/gateway/compose.ci.yaml @@ -13,7 +13,7 @@ volumes: sds-gateway-ci-uv-venv-worker: {} sds-gateway-ci-uv-venv-beat: {} sds-gateway-ci-uv-venv-flower: {} - sds-gateway-ci-minio-files: {} + sds-gateway-ci-rustfs-files: {} sds-gateway-ci-opensearch-data: {} sds-gateway-ci-postgres-data-backups: {} sds-gateway-ci-postgres-data: {} @@ -21,15 +21,15 @@ networks: # for safety, all gateway CI networks start with "sds-gateway-ci-" - sds-gateway-ci-minio-net: + sds-gateway-ci-rustfs-net: driver: bridge sds-gateway-ci-opensearch-net: driver: bridge + sds-gateway-ci-postgres-net: + driver: bridge sds-network-ci: - # external: true # make it external if running with traefik on this machine - # should match traefik's network name + external: true name: sds-network-ci - driver: bridge services: sds-gateway-ci-app: build: @@ -45,8 +45,6 @@ condition: service_healthy redis: condition: service_healthy - minio: - condition: service_healthy volumes: - sds-gateway-ci-uv-cache:/opt/uv-cache/ - sds-gateway-ci-uv-venv-app:/opt/uv-venv/ @@ -74,7 +72,7 @@ # - ./staticfiles/:/app/staticfiles/:z # used in prod only env_file: - ./.envs/ci/django.env - - ./.envs/ci/minio.env + - ./.envs/ci/storage.env # PRIMARY (RustFS) — local/CI: primary only, no secondary - ./.envs/ci/postgres.env - ./.envs/ci/opensearch.env # remember /entrypoint runs first @@ -82,8 +80,9 @@ ports: - "8000:8000" # make sure this port matches traefik's config, if used networks: - - sds-gateway-ci-minio-net + - sds-gateway-ci-rustfs-net - sds-gateway-ci-opensearch-net + - sds-gateway-ci-postgres-net - sds-network-ci healthcheck: test: ["CMD-SHELL", "curl -f http://localhost:8000/ || exit 1"] @@ -124,34 +123,49 @@ selinux: z networks: - sds-network-ci + healthcheck: + test: + [ + "CMD-SHELL", + "wget -q -O /dev/null http://localhost/healthz || exit 1", + ] + interval: 30s + timeout: 5s + retries: 5 + start_period: 10s - minio: - # main file storage for sds - # minio uses rolling upgrades
that are non-disruptive, so we can target latest - # For more information on how to upgrade MinIO deployment, refer to the MinIO documentation: - # https://min.io/docs/minio/container/operations/install-deploy-manage/upgrade-minio-deployment.html - image: minio/minio:latest - container_name: sds-gateway-ci-minio + # Primary storage (RustFS) — S3-compatible, default for local/CI + rustfs: + image: rustfs/rustfs:latest + container_name: sds-gateway-ci-rustfs volumes: - - sds-gateway-ci-minio-files:/files + - sds-gateway-ci-rustfs-files:/data ports: - - "9000:9000" + - "19000:9000" - "9001:9001" env_file: - - ./.envs/ci/minio.env + - ./.envs/ci/storage.env + environment: + - RUSTFS_VOLUMES=/data + - RUSTFS_ADDRESS=0.0.0.0:9000 + - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001 + - RUSTFS_CONSOLE_ENABLE=true + - RUSTFS_CORS_ALLOWED_ORIGINS=* + - RUSTFS_CONSOLE_CORS_ALLOWED_ORIGINS=* + - RUSTFS_ACCESS_KEY=${PRIMARY_ACCESS_KEY_ID} + - RUSTFS_SECRET_KEY=${PRIMARY_SECRET_ACCESS_KEY} + networks: + - sds-gateway-ci-rustfs-net healthcheck: test: [ "CMD-SHELL", - "curl -f http://localhost:9000/minio/health/live || exit 1", + "curl -f http://localhost:9000/health || exit 1", ] interval: 30s timeout: 5s retries: 5 start_period: 10s - command: 'server /files --console-address ":9001"' - networks: - - sds-gateway-ci-minio-net opensearch: # used for indexing and searching documents @@ -191,7 +205,7 @@ build: context: . dockerfile: ./compose/production/postgres/Dockerfile - # this dockerfile is used for both local/CI and prod + # this dockerfile is used for both local and prod image: sds-gateway-ci-postgres container_name: sds-gateway-ci-postgres volumes: @@ -200,7 +214,7 @@ env_file: - ./.envs/ci/postgres.env networks: - - sds-gateway-ci-minio-net + - sds-gateway-ci-postgres-net healthcheck: test: [ @@ -264,14 +278,25 @@ selinux: z env_file: - ./.envs/ci/django.env - - ./.envs/ci/minio.env + - ./.envs/ci/storage.env # PRIMARY (RustFS) — local/CI: primary only, no secondary - ./.envs/ci/postgres.env - ./.envs/ci/opensearch.env command: "/worker-start" networks: - - sds-gateway-ci-minio-net + - sds-gateway-ci-rustfs-net - sds-gateway-ci-opensearch-net + - sds-gateway-ci-postgres-net - sds-network-ci + healthcheck: + test: + [ + "CMD-SHELL", + 'uv run celery -A config.celery_app inspect ping -d "celery@$$HOSTNAME" | grep -q "OK"', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s celery-beat: # Celery Beat scheduler for periodic tasks @@ -309,14 +334,25 @@ selinux: z env_file: - ./.envs/ci/django.env - - ./.envs/ci/minio.env + - ./.envs/ci/storage.env # PRIMARY (RustFS) — local/CI: primary only, no secondary - ./.envs/ci/postgres.env - ./.envs/ci/opensearch.env command: "/beat-start" networks: - - sds-gateway-ci-minio-net + - sds-gateway-ci-rustfs-net - sds-gateway-ci-opensearch-net + - sds-gateway-ci-postgres-net - sds-network-ci + healthcheck: + test: + [ + "CMD-SHELL", + 'uv run python -c "import pathlib,sys; ok=any((b\"beat\" in data) and ((b\"celery\" in data) or (b\"watchfiles\" in data)) for data in (path.read_bytes() for path in pathlib.Path(\"/proc\").glob(\"[0-9]*/cmdline\"))); sys.exit(0 if ok else 1)"', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s celery-flower: # Celery monitoring and administration tool @@ -354,16 +390,27 @@ selinux: z env_file: - ./.envs/ci/django.env - - ./.envs/ci/minio.env + - ./.envs/ci/storage.env # PRIMARY (RustFS) — local/CI: primary only, no secondary - ./.envs/ci/postgres.env
- ./.envs/ci/opensearch.env command: "/flower-start" ports: - "5555:5555" # Flower web interface networks: - - sds-gateway-ci-minio-net + - sds-gateway-ci-rustfs-net - sds-gateway-ci-opensearch-net + - sds-gateway-ci-postgres-net - sds-network-ci + healthcheck: + test: + [ + "CMD-SHELL", + 'curl -f --header "Authorization: Basic $(echo -n "$$CELERY_FLOWER_USER:$$CELERY_FLOWER_PASSWORD" | base64)" http://localhost:5555/api/workers || exit 1', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s # ========================== # local development services @@ -395,6 +442,16 @@ services: - action: sync path: ./ target: /app/ + healthcheck: + test: + [ + "CMD-SHELL", + 'node -e "const http=require(\"http\"); const req=http.get(\"http://127.0.0.1:3000\", res => process.exit(res.statusCode < 500 ? 0 : 1)); req.on(\"error\", () => process.exit(1)); req.setTimeout(5000, () => { req.destroy(); process.exit(1); });"', + ] + interval: 30s + timeout: 10s + retries: 5 + start_period: 45s mailhog: # email testing service for local development @@ -405,3 +462,13 @@ services: - "8025:8025" # Web UI networks: - sds-network-ci + healthcheck: + test: + [ + "CMD-SHELL", + "wget -q -O /dev/null http://localhost:8025/api/v2/messages || exit 1", + ] + interval: 30s + timeout: 5s + retries: 5 + start_period: 10s diff --git a/gateway/compose.local.yaml b/gateway/compose.local.yaml index b0358c8ca..5e687ca50 100644 --- a/gateway/compose.local.yaml +++ b/gateway/compose.local.yaml @@ -13,7 +13,7 @@ volumes: sds-gateway-local-uv-venv-worker: {} sds-gateway-local-uv-venv-beat: {} sds-gateway-local-uv-venv-flower: {} - sds-gateway-local-minio-files: {} + sds-gateway-local-rustfs-files: {} sds-gateway-local-opensearch-data: {} sds-gateway-local-postgres-data-backups: {} sds-gateway-local-postgres-data: {} @@ -21,12 +21,18 @@ volumes: networks: # for safety, all gateway local networks start with "sds-gateway-local-" - sds-gateway-local-minio-net: + sds-gateway-local-rustfs-net: driver: bridge + name: sds-gateway-local-rustfs-net sds-gateway-local-opensearch-net: driver: bridge + name: sds-gateway-local-opensearch-net + sds-gateway-local-postgres-net: + driver: bridge + name: sds-gateway-local-postgres-net sds-network-local: - # external: true # make it external if running with traefik on this machine + # externally defined in traefik and/or in the primary storage compose file + external: true # should match traefik's network name name: sds-network-local driver: bridge @@ -45,8 +51,6 @@ services: condition: service_healthy redis: condition: service_healthy - minio: - condition: service_healthy volumes: - sds-gateway-local-uv-cache:/opt/uv-cache/ - sds-gateway-local-uv-venv-app:/opt/uv-venv/ @@ -74,7 +78,7 @@ services: # - ./staticfiles/:/app/staticfiles/:z # used in prod only env_file: - ./.envs/local/django.env - - ./.envs/local/minio.env + - ./.envs/local/storage.env - ./.envs/local/postgres.env - ./.envs/local/opensearch.env # remember /entrypoint runs first @@ -82,8 +86,9 @@ services: ports: - "8000:8000" # make sure this port matches traefik's config, if used networks: - - sds-gateway-local-minio-net - sds-gateway-local-opensearch-net + - sds-gateway-local-rustfs-net + - sds-gateway-local-postgres-net - sds-network-local healthcheck: test: ["CMD-SHELL", "curl -f http://localhost:8000/ || exit 1"] @@ -124,34 +129,49 @@ services: selinux: z networks: - sds-network-local + healthcheck: + test: + [ + "CMD-SHELL", + "wget -q -O /dev/null http://localhost/healthz || exit 1", + ] + interval: 30s + timeout: 5s + 
retries: 5 + start_period: 10s - minio: - # main file storage for sds - # minio uses rolling upgrades that are non-disruptive, so we can target latest - # For more information on how to upgrade MinIO deployment, refer to the MinIO documentation: - # https://min.io/docs/minio/container/operations/install-deploy-manage/upgrade-minio-deployment.html - image: minio/minio:latest - container_name: sds-gateway-local-minio + # Primary storage (RustFS) — S3-compatible, default for local/CI + rustfs: + image: rustfs/rustfs:latest + container_name: sds-gateway-local-rustfs volumes: - - sds-gateway-local-minio-files:/files + - sds-gateway-local-rustfs-files:/data ports: - - "9000:9000" + - "19000:9000" - "9001:9001" env_file: - - ./.envs/local/minio.env + - ./.envs/local/storage.env + environment: + - RUSTFS_VOLUMES=/data + - RUSTFS_ADDRESS=0.0.0.0:9000 + - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001 + - RUSTFS_CONSOLE_ENABLE=true + - RUSTFS_CORS_ALLOWED_ORIGINS=* + - RUSTFS_CONSOLE_CORS_ALLOWED_ORIGINS=* + - RUSTFS_ACCESS_KEY=${PRIMARY_ACCESS_KEY_ID} + - RUSTFS_SECRET_KEY=${PRIMARY_SECRET_ACCESS_KEY} + networks: + - sds-gateway-local-rustfs-net healthcheck: test: [ "CMD-SHELL", - "curl -f http://localhost:9000/minio/health/live || exit 1", + "curl -f http://localhost:9000/health || exit 1", ] interval: 30s timeout: 5s retries: 5 start_period: 10s - command: 'server /files --console-address ":9001"' - networks: - - sds-gateway-local-minio-net opensearch: # used for indexing and searching documents @@ -200,7 +220,7 @@ env_file: - ./.envs/local/postgres.env networks: - - sds-gateway-local-minio-net + - sds-gateway-local-postgres-net healthcheck: test: [ @@ -264,14 +284,26 @@ selinux: z env_file: - ./.envs/local/django.env - - ./.envs/local/minio.env + - ./.envs/local/storage.env - ./.envs/local/postgres.env - ./.envs/local/opensearch.env command: "/worker-start" networks: - - sds-gateway-local-minio-net + # additional networks are used for health checks - sds-gateway-local-opensearch-net + - sds-gateway-local-postgres-net + - sds-gateway-local-rustfs-net - sds-network-local + healthcheck: + test: + [ + "CMD-SHELL", + 'uv run celery -A config.celery_app inspect ping -d "celery@$$HOSTNAME" | grep -q "OK"', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s celery-beat: # Celery Beat scheduler for periodic tasks @@ -309,61 +341,25 @@ selinux: z env_file: - ./.envs/local/django.env - - ./.envs/local/minio.env + - ./.envs/local/storage.env - ./.envs/local/postgres.env - ./.envs/local/opensearch.env command: "/beat-start" networks: - - sds-gateway-local-minio-net - - sds-gateway-local-opensearch-net - - sds-network-local - - celery-flower: - # Celery monitoring and administration tool - build: - context: .
- dockerfile: ./compose/local/django/Dockerfile - image: sds-gateway-local-app - container_name: sds-gateway-local-celery-flower - tty: true - depends_on: - sds-gateway-local-app: - condition: service_healthy - volumes: - - sds-gateway-local-uv-cache:/opt/uv-cache/ - - sds-gateway-local-uv-venv-flower:/opt/uv-venv/ - - sds-gateway-local-app-media:/app/sds_gateway/media - - sds-gateway-local-temp-zips:/app/sds_gateway/media/temp_zips - - source: ./sds_gateway/api_methods/migrations - target: /app/sds_gateway/api_methods/migrations - type: bind - read_only: false - bind: - selinux: z - - source: ./sds_gateway/users/migrations - target: /app/sds_gateway/users/migrations - type: bind - read_only: false - bind: - selinux: z - - source: ./sds_gateway/visualizations/migrations - target: /app/sds_gateway/visualizations/migrations - type: bind - read_only: false - bind: - selinux: z - env_file: - - ./.envs/local/django.env - - ./.envs/local/minio.env - - ./.envs/local/postgres.env - - ./.envs/local/opensearch.env - command: "/flower-start" - ports: - - "5555:5555" # Flower web interface - networks: - - sds-gateway-local-minio-net + - sds-gateway-local-rustfs-net - sds-gateway-local-opensearch-net + - sds-gateway-local-postgres-net - sds-network-local + healthcheck: + test: + [ + "CMD-SHELL", + 'uv run python -c "import pathlib,sys; ok=any((b\"beat\" in data) and ((b\"celery\" in data) or (b\"watchfiles\" in data)) for data in (path.read_bytes() for path in pathlib.Path(\"/proc\").glob(\"[0-9]*/cmdline\"))); sys.exit(0 if ok else 1)"', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s # ========================== # local development services @@ -395,13 +391,33 @@ - action: sync path: ./ target: /app/ + healthcheck: + test: + [ + "CMD-SHELL", + 'node -e "const http=require(\"http\"); const req=http.get(\"http://127.0.0.1:3000\", res => process.exit(res.statusCode < 500 ?
0 : 1)); req.on(\"error\", () => process.exit(1)); req.setTimeout(5000, () => { req.destroy(); process.exit(1); });"', + ] + interval: 30s + timeout: 10s + retries: 5 + start_period: 45s mailhog: # email testing service for local development - image: mailhog/mailhog:latest + image: docker.io/mailhog/mailhog:latest container_name: sds-gateway-local-mailhog ports: - "1025:1025" # SMTP server - "8025:8025" # Web UI networks: - sds-network-local + healthcheck: + test: + [ + "CMD-SHELL", + "wget -q -O /dev/null http://localhost:8025/api/v2/messages || exit 1", + ] + interval: 30s + timeout: 5s + retries: 5 + start_period: 10s diff --git a/gateway/compose.production.yaml b/gateway/compose.production.yaml index 64922c875..26dac6185 100644 --- a/gateway/compose.production.yaml +++ b/gateway/compose.production.yaml @@ -26,6 +26,8 @@ networks: driver: bridge sds-gateway-prod-opensearch-net: driver: bridge + sds-gateway-prod-postgres-net: + driver: bridge sds-network-prod: external: true @@ -44,8 +46,6 @@ services: condition: service_started redis: condition: service_started - minio: - condition: service_started volumes: - source: sds-gateway-prod-app-media target: /app/sds_gateway/media @@ -74,11 +74,12 @@ services: type: volume read_only: false post_start: - - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ /opt/uv-venv/ + - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ + /opt/uv-venv/ user: root env_file: - ./.envs/production/django.env - - ./.envs/production/minio.env + - ./.envs/production/storage.prod.env - ./.envs/production/postgres.env - ./.envs/production/opensearch.env ports: @@ -88,10 +89,10 @@ services: command: "/start" networks: - sds-gateway-prod-opensearch-net - - sds-gateway-prod-minio-net - - sds-network-prod + - sds-gateway-prod-postgres-net + - sds-network-prod # also carries SeaweedFS S3 traffic — see seaweedfs/compose.yaml healthcheck: - test: ["CMD-SHELL", "curl -f http://localhost:18000/ || exit 1"] + test: [ "CMD-SHELL", "curl -f http://localhost:18000/ || exit 1" ] interval: 30s timeout: 10s retries: 5 @@ -116,12 +117,16 @@ services: read_only: true networks: - sds-network-prod + healthcheck: + test: [ "CMD-SHELL", "wget -q -O /dev/null http://localhost/healthz || exit 1" ] + interval: 30s + timeout: 5s + retries: 5 + start_period: 10s - minio: - # main file storage for sds - # minio uses rolling upgrades that are non-disruptive, so we can target latest - # For more information on how to upgrade MinIO deployment, refer to the MinIO documentation: - # https://min.io/docs/minio/container/operations/install-deploy-manage/upgrade-minio-deployment.html + # DEPRECATED: being replaced by SeaweedFS. Keep running during migration. 
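+ # (This service was renamed from `minio` to `minio-deprecated`, so commands that
+ # targeted it by name, such as `just dc stop minio`, must now reference
+ # `minio-deprecated`.)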
+ # Remove after data migration is complete — see docs/migration-minio-to-seaweedfs.md + minio-deprecated: image: minio/minio:latest container_name: sds-gateway-prod-minio volumes: @@ -130,11 +135,115 @@ - "19000:9000" - "19001:9001" env_file: - - ./.envs/production/minio.env + - ./.envs/production/storage.prod.env restart: unless-stopped - command: 'server /files --console-address ":9001"' + healthcheck: + test: [ "CMD-SHELL", "curl -f http://localhost:9000/minio/health/live || exit 1" ] + interval: 30s + timeout: 5s + retries: 5 + start_period: 10s + command: "server /files --console-address \":9001\"" + networks: + - sds-gateway-prod-minio-net + + # prod-secondary-minio: + # # https://min.io/docs/minio/container/operations/install-deploy-manage/upgrade-minio-deployment.html + # image: docker.io/minio/minio:latest + # container_name: sds-gateway-prod-secondary-minio + # volumes: + # - /disk1:/data/disk1 + # - /disk2:/data/disk2 + # - /disk3:/data/disk3 + # # - ./.envs/production/minio-config.json:/tmp/.mc/config.json + # ports: + # - "19100:9000" # deprecated minio S3 API is 19000 + # - "19101:9001" # deprecated minio console is 19001 + # env_file: + # - ./.envs/production/storage.prod.env + # restart: unless-stopped + # healthcheck: + # test: [ "CMD-SHELL", "curl -f http://localhost:9000/minio/health/live || exit 1" ] + # interval: 30s + # timeout: 5s + # retries: 5 + # start_period: 10s + # command: "server --json /data/disk{1...3} --console-address \":9001\"" + # networks: + # - sds-gateway-prod-minio-net + # ulimits: + # nofile: + # soft: 131072 + # hard: 131072 + + # RustFS S3-compatible storage service, used as the secondary storage backend for + # the gateway in production. The primary S3 storage backend in production is + # SeaweedFS, defined in ../seaweedfs/compose.production.yaml. + # At the time of writing, RustFS is not yet ready for production use, so we keep it + # as our secondary backend, for redundancy.
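+ # How the OBJECT_STORE_* transition flags in storage.prod.env are expected to
+ # combine (an assumption inferred from the setting names in
+ # config/settings/base.py, not a verified contract of DualObjectStoreS3Storage):
+ # OBJECT_STORE_WRITE_BOTH_ENABLED=true mirrors each write to the secondary store,
+ # OBJECT_STORE_DUAL_WRITE_STRICT=true makes a failed secondary write fail the
+ # whole request instead of only being logged, and
+ # OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED=true retries reads against the
+ # secondary store when the primary misses.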
+ prod-secondary-rustfs: + image: docker.io/rustfs/rustfs:latest + container_name: sds-gateway-prod-secondary-rustfs + security_opt: + - "no-new-privileges:true" + ports: + - "19400:9000" # S3 API port + - "19401:9001" # Console port + env_file: + - ./.envs/production/storage.prod.env + environment: + - RUSTFS_VOLUMES=/data/rustfs{1...3} + - RUSTFS_ADDRESS=0.0.0.0:9000 + - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001 + - RUSTFS_CONSOLE_ENABLE=true + - RUSTFS_CORS_ALLOWED_ORIGINS=* + - RUSTFS_CONSOLE_CORS_ALLOWED_ORIGINS=* + # - RUSTFS_ACCESS_KEY=rustfsadmin # CHANGEME + # - RUSTFS_SECRET_KEY=rustfsadmin # CHANGEME + - RUSTFS_OBS_LOGGER_LEVEL=debug + - RUSTFS_TLS_PATH=/opt/tls + + volumes: + - /disk6:/data/rustfs1 + - /disk7:/data/rustfs2 + - /disk8:/data/rustfs3 + - sds-gateway-prod-rustfs-logs:/app/logs networks: - sds-gateway-prod-minio-net + ulimits: + nofile: + soft: 131072 + hard: 131072 + restart: unless-stopped + healthcheck: + test: + [ + "CMD", + "sh", + "-c", + "curl -f http://127.0.0.1:9000/health && curl -f http://127.0.0.1:9001/rustfs/console/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + # RustFS volume permissions fixer service + rustfs-volume-permission-helper: + image: alpine + volumes: + - /disk6:/data1 + - /disk7:/data2 + - /disk8:/data3 + - sds-gateway-prod-rustfs-logs:/logs + command: > + sh -c " + chown -R 10001:10001 /data1 /data2 /data3 /logs && + echo 'Volume permissions fixed' && + exit 0 + " + restart: "no" opensearch: # used for indexing and searching documents @@ -185,7 +294,9 @@ test: [ "CMD-SHELL", - 'curl -k -u "$OPENSEARCH_ADMIN_USER:$OPENSEARCH_INITIAL_ADMIN_PASSWORD" https://localhost:9200/_cluster/health || exit 1', + "curl -k -u \"$OPENSEARCH_ADMIN_USER:$OPENSEARCH_INITIAL_ADMIN_PASSWORD\" https://localhost:9200/_cluster/health || exit 1", ] interval: 5s timeout: 5s @@ -206,7 +317,19 @@ env_file: - ./.envs/production/postgres.env networks: - - sds-gateway-prod-minio-net + - sds-gateway-prod-postgres-net + - sds-gateway-prod-opensearch-net healthcheck: + test: + [ + "CMD-SHELL", + "pg_isready -U \"$$POSTGRES_USER\" -d \"$$POSTGRES_DB\" -h localhost", + ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s redis: # used as caching layer for the gateway app @@ -217,6 +340,12 @@ - sds-gateway-prod-redis-data:/data networks: - sds-network-prod + healthcheck: + test: [ "CMD", "redis-cli", "ping" ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 5s # =================== # Celery services for background tasks @@ -254,19 +383,30 @@ type: volume read_only: false post_start: - - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ /opt/uv-venv/ + - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ + /opt/uv-venv/ user: root env_file: - ./.envs/production/django.env - - ./.envs/production/minio.env - - ./.envs/production/postgres.env + - ./.envs/production/storage.prod.env - ./.envs/production/opensearch.env command: "/worker-start" restart: unless-stopped networks: - sds-gateway-prod-opensearch-net - - sds-gateway-prod-minio-net - - sds-network-prod + - sds-gateway-prod-postgres-net + - sds-network-prod # also carries SeaweedFS S3 traffic + healthcheck: + test: + [ + "CMD-SHELL", + "uv run celery -A config.celery_app inspect ping -d \"celery@$$HOSTNAME\" | grep -q \"OK\"", + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s celery-beat: # Celery Beat scheduler for periodic tasks @@ -302,19 +442,34 @@
type: volume read_only: false post_start: - - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ /opt/uv-venv/ + - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ + /opt/uv-venv/ user: root env_file: - ./.envs/production/django.env - - ./.envs/production/minio.env - - ./.envs/production/postgres.env + - ./.envs/production/storage.prod.env - ./.envs/production/opensearch.env command: "/beat-start" restart: unless-stopped networks: - sds-gateway-prod-opensearch-net - - sds-gateway-prod-minio-net - - sds-network-prod + - sds-gateway-prod-postgres-net + - sds-network-prod # also carries SeaweedFS S3 traffic + healthcheck: + test: + [ + "CMD-SHELL", + 'uv run python -c "import pathlib,sys; ok=any((b\"beat\" in data) and ((b\"celery\" in data) or (b\"watchfiles\" in data)) for data in (path.read_bytes() for path in pathlib.Path(\"/proc\").glob(\"[0-9]*/cmdline\"))); sys.exit(0 if ok else 1)"', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s celery-flower: # Celery monitoring and administration tool @@ -345,18 +500,30 @@ type: volume read_only: false post_start: - - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ /opt/uv-venv/ + - command: chown -R django:django /app/sds_gateway/media/ /opt/uv-cache/ + /opt/uv-venv/ user: root env_file: - ./.envs/production/django.env - - ./.envs/production/minio.env - - ./.envs/production/postgres.env + - ./.envs/production/storage.prod.env - ./.envs/production/opensearch.env command: "/flower-start" restart: unless-stopped ports: - "15555:5555" # Flower web interface networks: - - sds-gateway-prod-minio-net - sds-gateway-prod-opensearch-net - - sds-network-prod + - sds-gateway-prod-postgres-net + - sds-network-prod # also carries SeaweedFS S3 traffic + healthcheck: + test: + [ + "CMD-SHELL", + 'curl -f --header "Authorization: Basic $(echo -n "$$CELERY_FLOWER_USER:$$CELERY_FLOWER_PASSWORD" | base64)" http://localhost:5555/api/workers || exit 1', + ] + interval: 30s + timeout: 30s + retries: 5 + start_period: 30s diff --git a/gateway/compose/production/django/celery/worker-start b/gateway/compose/production/django/celery/worker-start index 1caba3f8e..d2ab19bdc 100644 --- a/gateway/compose/production/django/celery/worker-start +++ b/gateway/compose/production/django/celery/worker-start @@ -4,4 +4,4 @@ set -o errexit set -o pipefail set -o nounset -exec uv run celery -A config.celery_app worker -l INFO +exec uv run celery -A config.celery_app worker -l INFO --concurrency "${CELERY_WORKER_CONCURRENCY:-4}" diff --git a/gateway/compose/production/nginx/nginx-default.conf b/gateway/compose/production/nginx/nginx-default.conf index 69fd8339a..d0503de26 100644 --- a/gateway/compose/production/nginx/nginx-default.conf +++ b/gateway/compose/production/nginx/nginx-default.conf @@ -1,22 +1,68 @@ -error_log /var/log/nginx/error.log debug; +error_log /var/log/nginx/error.log warn; server { - # serving static files - # TLS is handled by Traefik listen 80; server_name localhost; + server_tokens off; - # Set MIME types include /etc/nginx/mime.types; default_type application/octet-stream; + add_header X-Content-Type-Options nosniff always; location /static/ { alias /usr/share/nginx/static/; + autoindex off; + + if ($request_method = OPTIONS) { + add_header Access-Control-Allow-Origin *; + add_header Access-Control-Allow-Methods 'GET, HEAD, OPTIONS'; + add_header Access-Control-Allow-Headers 'Range'; + add_header Access-Control-Max-Age 86400; +
add_header Content-Length 0; + add_header Content-Type 'text/plain; charset=utf-8'; + add_header X-Content-Type-Options nosniff always; + return 204; + } + + limit_except GET HEAD { + deny all; + } - # Add CORS headers add_header Access-Control-Allow-Origin *; - add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS'; - add_header Access-Control-Allow-Headers 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range'; + add_header Access-Control-Allow-Methods 'GET, HEAD, OPTIONS'; + add_header Access-Control-Allow-Headers 'Range'; + add_header X-Content-Type-Options nosniff always; + + location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { + if ($request_method = OPTIONS) { + add_header Access-Control-Allow-Origin *; + add_header Access-Control-Allow-Methods 'GET, HEAD, OPTIONS'; + add_header Access-Control-Allow-Headers 'Range'; + add_header Access-Control-Max-Age 86400; + add_header Content-Length 0; + add_header Content-Type 'text/plain; charset=utf-8'; + add_header X-Content-Type-Options nosniff always; + return 204; + } + + limit_except GET HEAD { + deny all; + } + + expires 1d; + add_header Cache-Control "public, immutable"; + add_header Access-Control-Allow-Origin *; + add_header Access-Control-Allow-Methods 'GET, HEAD, OPTIONS'; + add_header Access-Control-Allow-Headers 'Range'; + add_header X-Content-Type-Options nosniff always; + } + } + + location = /healthz { + access_log off; + default_type 'text/plain; charset=utf-8'; + add_header X-Content-Type-Options nosniff always; + return 200 'OK'; + } } diff --git a/gateway/config/settings/base.py b/gateway/config/settings/base.py index f95d18945..9b0fa4508 100644 --- a/gateway/config/settings/base.py +++ b/gateway/config/settings/base.py @@ -5,6 +5,7 @@ import string from pathlib import Path from typing import Any +from urllib.parse import urlparse from celery.schedules import crontab from environs import env @@ -48,25 +49,123 @@ OPENSEARCH_VERIFY_CERTS: bool = env.bool("OPENSEARCH_VERIFY_CERTS", default=False) OPENSEARCH_CA_CERTS: str | None = env.str("OPENSEARCH_CA_CERTS", default=None) -# MinIO configuration +# S3-compatible object storage (SeaweedFS / RustFS / MinIO) + + +def _build_endpoint_url(endpoint: str, *, secure: bool) -> str: + """Build endpoint URL with scheme if endpoint does not include one.""" + parsed_endpoint = urlparse(endpoint) + if parsed_endpoint.scheme: + return endpoint + + protocol = "https" if secure else "http" + return f"{protocol}://{endpoint}" + + +def _strip_endpoint_scheme(endpoint_url: str) -> str: + """Strip scheme from endpoint URL for MinIO client compatibility.""" + parsed_endpoint = urlparse(endpoint_url) + if parsed_endpoint.netloc: + return parsed_endpoint.netloc + return endpoint_url + + STORAGES = { "default": { - "BACKEND": "storages.backends.s3boto3.S3Boto3Storage", + "BACKEND": ( + "sds_gateway.api_methods.utils."
+ "dual_object_store_storage.DualObjectStoreS3Storage" + ), }, "staticfiles": { "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", }, } -MINIO_ENDPOINT_URL = env.str("MINIO_ENDPOINT_URL", default="minio:9000") -MINIO_STORAGE_USE_HTTPS = env.bool("MINIO_STORAGE_USE_HTTPS", default=False) - -AWS_ACCESS_KEY_ID: str = env.str("AWS_ACCESS_KEY_ID", default="minioadmin") -AWS_SECRET_ACCESS_KEY: str = env.str("AWS_SECRET_ACCESS_KEY", default="miniopassword") -AWS_STORAGE_BUCKET_NAME: str = env.str("AWS_STORAGE_BUCKET_NAME", default="spectrumx") -AWS_S3_ENDPOINT_URL: str = env.str( +# env var names kept for backward compatibility with existing deployments +LEGACY_AWS_ACCESS_KEY_ID: str = env.str("AWS_ACCESS_KEY_ID", default="admin") +LEGACY_AWS_SECRET_ACCESS_KEY: str = env.str("AWS_SECRET_ACCESS_KEY", default="admin") +LEGACY_AWS_STORAGE_BUCKET_NAME: str = env.str( + "AWS_STORAGE_BUCKET_NAME", default="spectrumx" +) +LEGACY_AWS_S3_ENDPOINT_URL: str = env.str( "AWS_S3_ENDPOINT_URL", - default="http://minio:9000", + default="http://sds-gateway-local-sfs-s3:8333", +) + +# Primary (SeaweedFS) +PRIMARY_ACCESS_KEY_ID: str = env.str( + "PRIMARY_ACCESS_KEY_ID", + default=LEGACY_AWS_ACCESS_KEY_ID, +) +PRIMARY_SECRET_ACCESS_KEY: str = env.str( + "PRIMARY_SECRET_ACCESS_KEY", + default=LEGACY_AWS_SECRET_ACCESS_KEY, +) +PRIMARY_STORAGE_BUCKET_NAME: str = env.str( + "PRIMARY_STORAGE_BUCKET_NAME", + default=LEGACY_AWS_STORAGE_BUCKET_NAME, +) +PRIMARY_S3_ENDPOINT_URL: str = env.str( + "PRIMARY_S3_ENDPOINT_URL", + default=LEGACY_AWS_S3_ENDPOINT_URL, +) +PRIMARY_STORAGE_USE_HTTPS: bool = env.bool( + "PRIMARY_STORAGE_USE_HTTPS", + default=PRIMARY_S3_ENDPOINT_URL.startswith("https://"), +) +PRIMARY_ENDPOINT_URL: str = env.str( + "PRIMARY_ENDPOINT_URL", + default=_strip_endpoint_scheme(PRIMARY_S3_ENDPOINT_URL), ) + +# Secondary (minio/rustfs) +SECONDARY_STORAGE_USE_HTTPS: bool = env.bool( + "SECONDARY_STORAGE_USE_HTTPS", default=False +) +SECONDARY_ENDPOINT_URL: str = env.str( + "SECONDARY_ENDPOINT_URL", + default="sds-gateway-local-sfs-s3:8333", +) +SECONDARY_S3_ENDPOINT_URL: str = env.str( + "SECONDARY_S3_ENDPOINT_URL", + default=_build_endpoint_url( + SECONDARY_ENDPOINT_URL, + secure=SECONDARY_STORAGE_USE_HTTPS, + ), +) +SECONDARY_ACCESS_KEY_ID: str = env.str( + "SECONDARY_ACCESS_KEY_ID", + default=LEGACY_AWS_ACCESS_KEY_ID, +) +SECONDARY_SECRET_ACCESS_KEY: str = env.str( + "SECONDARY_SECRET_ACCESS_KEY", + default=LEGACY_AWS_SECRET_ACCESS_KEY, +) +SECONDARY_STORAGE_BUCKET_NAME: str = env.str( + "SECONDARY_STORAGE_BUCKET_NAME", + default=LEGACY_AWS_STORAGE_BUCKET_NAME, +) + +# transition controls +OBJECT_STORE_WRITE_BOTH_ENABLED: bool = env.bool( + "OBJECT_STORE_WRITE_BOTH_ENABLED", + default=False, +) +OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED: bool = env.bool( + "OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED", + default=False, +) +OBJECT_STORE_DUAL_WRITE_STRICT: bool = env.bool( + "OBJECT_STORE_DUAL_WRITE_STRICT", + default=False, +) + +# keep AWS_* aliases mapped to primary store for backward compatibility +# django-storages expects these values +AWS_S3_ACCESS_KEY_ID: str = PRIMARY_ACCESS_KEY_ID +AWS_S3_SECRET_ACCESS_KEY: str = PRIMARY_SECRET_ACCESS_KEY +AWS_STORAGE_BUCKET_NAME: str = PRIMARY_STORAGE_BUCKET_NAME +AWS_S3_ENDPOINT_URL: str = PRIMARY_S3_ENDPOINT_URL AWS_S3_REGION_NAME: str = "us-east-1" AWS_S3_SIGNATURE_VERSION: str = "s3v4" AWS_S3_FILE_OVERWRITE: bool = False diff --git a/gateway/config/settings/local.py b/gateway/config/settings/local.py index 78458464c..876afee07 100644 --- 
a/gateway/config/settings/local.py +++ b/gateway/config/settings/local.py @@ -116,6 +116,8 @@ # CELERY # ------------------------------------------------------------------------------ +# Worker concurrency; override with env var CELERY_WORKER_CONCURRENCY +CELERY_WORKER_CONCURRENCY: int = env.int("CELERY_WORKER_CONCURRENCY", default=1) # https://docs.celeryq.dev/en/stable/userguide/configuration.html#task-eager-propagates # CELERY_TASK_EAGER_PROPAGATES: bool = True # noqa: ERA001 diff --git a/gateway/config/settings/production.py b/gateway/config/settings/production.py index d5303363e..b3da8967d 100644 --- a/gateway/config/settings/production.py +++ b/gateway/config/settings/production.py @@ -1,6 +1,8 @@ """⚠️ Setting overrides for PRODUCTION ⚠️""" # ruff: noqa: F405, ERA001 +import os + import sentry_sdk from django.utils.log import DEFAULT_LOGGING from loguru import logger as log @@ -199,6 +201,14 @@ send_default_pii=False, ) +# CELERY +# ------------------------------------------------------------------------------ +# Worker concurrency: override with env CELERY_WORKER_CONCURRENCY. +_nproc = os.cpu_count() or 1 +CELERY_WORKER_CONCURRENCY: int = env.int( + "CELERY_WORKER_CONCURRENCY", default=min(8, _nproc) +) + # DJANGO-REST-FRAMEWORK # ------------------------------------------------------------------------------- # Tools that generate code samples can use SERVERS to point to the correct domain diff --git a/gateway/docs/detailed-deploy.md b/gateway/docs/detailed-deploy.md index 8f0026097..8a44a2843 100644 --- a/gateway/docs/detailed-deploy.md +++ b/gateway/docs/detailed-deploy.md @@ -103,8 +103,8 @@ Then proceed to the [first deployment steps](#first-deployment-automated) below. # manually set the secrets in .envs/local/*.env files ``` - > [!NOTE] - > In `minio.env`, set `AWS_SECRET_ACCESS_KEY == MINIO_ROOT_PASSWORD`; +> [!NOTE] +> In `storage.env`, set `SECONDARY_SECRET_ACCESS_KEY` to match the secondary store's `SECONDARY_ROOT_PASSWORD`; > > In `django.env`, to generate the `API_KEY` get it running first, then navigate to > [localhost:8000/users/generate-api-key](http://localhost:8000/users/generate-api-key). @@ -166,10 +166,10 @@ differ. This also tests the connection between the application and the OpenSearch instance. -3. Create the MinIO bucket: +3. Create the storage bucket: Go to [localhost:9001](http://localhost:9001) (or `localhost:19001` in production) - and create a bucket named `spectrumx` with the credentials set in `minio.env`. + and create a bucket named `spectrumx` with the credentials set in `storage.env`. Optionally apply a storage quota to this bucket (you can modify it later if needed). ## First deployment: not automated @@ -267,8 +267,8 @@ rsync -aP ./.envs/example/ ./.envs/production echo $(head /dev/urandom | tr -dc 'a-zA-Z0-9' | head -c 40) ``` -+ In `minio.env`, **`AWS_SECRET_ACCESS_KEY` must be equal to - `MINIO_ROOT_PASSWORD`**; ++ In `storage.prod.env`, **`SECONDARY_SECRET_ACCESS_KEY` must be equal to + `SECONDARY_ROOT_PASSWORD`**; + In `django.env`, the **`DJANGO_ADMIN_URL` must end with a slash `/`**. + In `django.env`, to generate the `API_KEY` get it running first, then navigate to [localhost:18000/users/generate-api-key-form](http://localhost:18000/users/generate-api-key-form/) @@ -380,37 +380,62 @@ production hosts. Open the web interface at [localhost:18000](http://localhost:18000). You can create regular users by signing up there. - You can sign in with the superuser credentials at `localhost:18000/` to access the admin interface.
+ You can sign in with the superuser credentials at + `localhost:18000/` + to access the admin interface. -4. MinIO setup: +4. RustFS setup: - This is a multi-drive, single-node setup of MinIO. For a distributed setup - (multi-node), see the [MinIO - documentation](https://min.io/docs/minio/linux/operations/install-deploy-manage/deploy-minio-multi-node-multi-drive.html#deploy-minio-distributed). + > [!NOTE] + > As of May 2026, RustFS is the secondary storage backend for production deployments + > of SDS, and the primary is SeaweedFS. MinIO was replaced by a combination of + > SeaweedFS (primary) and RustFS (secondary) after project maintainers abandoned the + > open source community version of MinIO. For more details, see the [MinIO to + > SeaweedFS migration documentation](./migration-minio-to-seaweedfs.md). + + The instructions below are for setting up the RustFS instance if you choose to use + it; they closely mirror the pre-existing MinIO instructions. This is a + multi-drive, single-node setup of RustFS. For other kinds of deployment, check the + RustFS documentation. + + The `mc` commands below refer to the MinIO CLI client, which can be used with RustFS + endpoints. Unfortunately it also seems unmaintained, so you may want to use a + community fork or the RustFS CLI instead: + + + Official `mc` repo: + + Pigsty community fork of `mc`: (most starred fork) + + Docker Hub mirror + + RustFS CLI (alpha): + + Most `mc` commands can be replaced with `rc` as-is, but the API is not an exact + drop-in replacement. >[!NOTE] > - > We're using `local` in the example commands below as our MinIO alias. Change it - > accordingly if you're using a different alias in your MinIO configuration. + > We're using `prod-secondary-rustfs` in the example commands below as our `mc` alias. + > Change it accordingly if you're using a different alias in your config. + > To see all aliases, run `mc alias list`. 1. Establish the connection alias: ```bash - just dc exec minio mc alias set local http://127.0.0.1:9000 minioadmin - # paste your MinIO credentials from .envs/production/minio.env; - # change `minioadmin` above to match that file, if needed. + mc alias set prod-secondary-rustfs http://127.0.0.1:9000 rustfsadmin + # paste your storage credentials from .envs/production/storage.prod.env; + # change `rustfsadmin` above to match that file, if needed. # in prod, that is equivalent to: - # docker exec -it sds-gateway-prod-minio mc alias set local http://127.0.0.1:9000 minioadmin + # docker exec -it sds-gateway-prod-secondary-rustfs mc alias set prod-secondary-rustfs http://127.0.0.1:9000 rustfsadmin ``` - Optionally, set up a local `mc` client if you're managing the cluster remotely: + Optionally, register the `prod-secondary-rustfs` alias in a local `mc` client if + you're managing the cluster remotely: ```bash - mc alias set local http://:19000 + mc alias set prod-secondary-rustfs http://localhost:19400 rustfsadmin ``` + When running from another Docker container, you can use the container name in + the stack instead of `localhost`. + 2. Set admin settings: + [MinIO reference @@ -419,7 +444,7 @@ ```bash # enable object compression for all objects, except the ones excluded by default # NOTE: compression is not recommended by MinIO when also using encryption.
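+ # Assumption: RustFS accepts these MinIO admin API calls. If a given `mc admin`
+ # command is rejected by RustFS, set the equivalent option through the RUSTFS_*
+ # environment variables in the compose file (or via `rc`) instead.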
- mc admin config set local compression enable=on extensions= mime_types= + mc admin config set prod-secondary-rustfs compression enable=on extensions= mime_types= # https://min.io/docs/minio/container/administration/object-management/data-compression.html#id6 @@ -432,36 +457,41 @@ production hosts. # References: # https://min.io/docs/minio/linux/reference/minio-server/settings/storage-class.html#mc-conf.storage_class.standard # https://min.io/product/erasure-code-calculator - mc admin config set local storage_class standard=EC:2 - mc admin config set local storage_class rrs=EC:1 + mc admin config set prod-secondary-rustfs storage_class standard=EC:2 + mc admin config set prod-secondary-rustfs storage_class rrs=EC:1 ``` - 3. Create the MinIO bucket: + 3. Create the bucket: ```bash - mc mb local/spectrumx + mc mb --ignore-existing "prod-secondary-rustfs/spectrumx" ``` 4. (Optional) Diagnostic checks: + > [!TIP] + > If using `rc`, check their documentation. They have additional commands like: + > `rc admin info disk prod-secondary-rustfs` and + > `rc admin info cluster prod-secondary-rustfs` + Check the output of these commands to make sure everything is as expected: ```bash - mc admin info local - mc admin config get local + mc admin info prod-secondary-rustfs + mc admin config get prod-secondary-rustfs # --- cluster health # liveness check curl -I "http://localhost:19000/minio/health/live" - # A response code of 200 OK indicates the MinIO server is online and functional. + # A response code of 200 OK indicates the server is online and functional. # Any other HTTP codes indicate an issue with reaching the server, such as a # transient network issue or potential downtime. # write quorum check curl -I "http://localhost:19000/minio/health/cluster" - # a response code of 200 OK indicates that the MinIO cluster has sufficient MinIO + # a response code of 200 OK indicates that the cluster has sufficient MinIO # servers online to meet write quorum. A response code of 503 Service Unavailable # indicates the cluster does not currently have write quorum. diff --git a/gateway/docs/github-actions-ephemeral-env.md b/gateway/docs/github-actions-ephemeral-env.md index c930b646c..533783cd2 100644 --- a/gateway/docs/github-actions-ephemeral-env.md +++ b/gateway/docs/github-actions-ephemeral-env.md @@ -101,8 +101,8 @@ The CI environment uses safe, deterministic values: | Service | Variable | Value | | ------------- | ----------------------------------- | ------------------------------- | | Postgres | `POSTGRES_PASSWORD` | `ci-postgres-pass` | -| MinIO | `MINIO_ROOT_PASSWORD` | `ci-minio-secret` | -| MinIO | `AWS_SECRET_ACCESS_KEY` | `ci-minio-secret` | +| Secondary | `SECONDARY_ROOT_PASSWORD` | `ci-minio-secret` | +| Secondary | `AWS_SECRET_ACCESS_KEY` | `ci-minio-secret` | | OpenSearch | `OPENSEARCH_INITIAL_ADMIN_PASSWORD` | `CiAdmin123!` | | OpenSearch | `OPENSEARCH_PASSWORD` | `CiDjango123!` | | Celery Flower | `CELERY_FLOWER_PASSWORD` | `ci-flower-pass` | @@ -180,7 +180,7 @@ Check that all env files were generated: ```bash ls -la .envs/ci/ -# Should show: django.env, minio.env, opensearch.env, postgres.env +# Should show: django.env, storage.env, opensearch.env, postgres.env ``` ### Secrets not populated diff --git a/gateway/docs/migration-minio-to-seaweedfs.md b/gateway/docs/migration-minio-to-seaweedfs.md index 2f03b3827..ce20e4f74 100644 --- a/gateway/docs/migration-minio-to-seaweedfs.md +++ b/gateway/docs/migration-minio-to-seaweedfs.md @@ -7,6 +7,7 @@ SeaweedFS setup is fully automated. 
This document covers data migration from a running MinIO instance and production-specific configuration. + [Migration: MinIO → SeaweedFS](#migration-minio--seaweedfs) + + [Diagram](#diagram) + [Prerequisites](#prerequisites) + [1. Start both stacks](#1-start-both-stacks) + [2. Configure `mc` aliases](#2-configure-mc-aliases) @@ -22,6 +23,56 @@ MinIO instance and production-specific configuration. --- +## Diagram + +```mermaid +timeline + title CRC SDS storage backend migration (2026) + March Week 2 : ✅ Run a standalone prototype for SeaweedFS + : ✅ Initial SFS configuration + April Week 2 : ✅ Draft the data migration plan + April Week 3 : ✅ Automate deployment (local/ci/production) + : ✅ Integrate SFS as an additional storage backend + : ✅ Create backup deployment of MinIO on NFS for the transition period + April Week 4 : ✅ Verify backup integrity + : ⬜ Unmount 3 (/8) MinIO drives (entering RO mode); rsync data in them to separate location + : ⬜ Deploy a new MinIO instance on those 3 drives with `EC:1` + : ⬜ Mirror data from RO MinIO to the new instance + : ⬜ Check data integrity of new instance + : ⬜ Switch production to use the new instance (leaving RO mode) + April Week 5 : ⬜ Stop older MinIO instance; wipe drives + : ⬜ Repurpose drives for SeaweedFS + : ⬜ Mirror existing production data to SeaweedFS + : ⬜ Switch production primary to SeaweedFS, leave MinIO as secondary; monitor stability + May Week 1 : ⬜ Remove `prod-backup`; finalize migration; keep monitoring +``` + ++ March Week 2 + + [x] Run a standalone prototype for SeaweedFS + + [x] Initial SFS configuration ++ April Week 2 + + [x] Draft the data migration plan ++ April Week 3 + + [x] Automate deployment (local/ci/production) + + [x] Integrate SFS as an additional storage backend + + [x] Create backup deployment of MinIO on NFS for the transition period ++ April Week 4 + + [x] Verify backup integrity + + [ ] Unmount 3 (/8) MinIO drives (entering RO mode); rsync data in them to separate location + + [ ] Deploy a new MinIO instance on those 3 drives with `EC:1` + + [ ] Mirror data from RO MinIO to the new instance + + [ ] Check data integrity of new instance + + [ ] Switch production to use the new instance (leaving RO mode) ++ April Week 5 + + [ ] Stop older MinIO instance; wipe drives + + [ ] Repurpose drives for SeaweedFS + + [ ] Mirror existing production data to SeaweedFS + + [ ] Switch production primary to SeaweedFS, leave MinIO as secondary; monitor stability ++ May Week 1 + + [ ] Remove `prod-backup`; finalize migration; keep monitoring + +--- + ## Prerequisites | Tool | Purpose | @@ -59,13 +110,13 @@ curl -s http://localhost:8333/healthz # SFS S3 endpoint: expected empty 200 ```bash # read credentials from env files -MINIO_USER=$(grep MINIO_ROOT_USER .envs/local/minio.env | cut -d= -f2) -MINIO_PASS=$(grep MINIO_ROOT_PASSWORD .envs/local/minio.env | cut -d= -f2) -SFS_KEY=$(grep AWS_ACCESS_KEY_ID .envs/local/sfs.env | cut -d= -f2) -SFS_SECRET=$(grep AWS_SECRET_ACCESS_KEY .envs/local/sfs.env | cut -d= -f2) +SECONDARY_USER=$(grep SECONDARY_ROOT_USER .envs/local/storage.env | cut -d= -f2) +SECONDARY_PASS=$(grep SECONDARY_ROOT_PASSWORD .envs/local/storage.env | cut -d= -f2) +PRIMARY_KEY=$(grep PRIMARY_ACCESS_KEY_ID .envs/local/storage.env | cut -d= -f2) +PRIMARY_SECRET=$(grep PRIMARY_SECRET_ACCESS_KEY .envs/local/storage.env | cut -d= -f2) -mc alias set minio http://localhost:9000 "${MINIO_USER}" "${MINIO_PASS}" -mc alias set sfs http://localhost:8333 "${SFS_KEY}" "${SFS_SECRET}" +mc alias set minio http://localhost:9000
"${SECONDARY_USER}" "${SECONDARY_PASS}" +mc alias set sfs http://localhost:8333 "${PRIMARY_KEY}" "${PRIMARY_SECRET}" ``` Verify: @@ -103,7 +154,7 @@ mc diff minio/spectrumx sfs/spectrumx ## 5. Switch the application to SFS -The compose files already reference `sfs.env` instead of `minio.env`. Restart the +The compose files already reference `storage.env` for both backends. Restart the gateway to confirm: ```bash @@ -118,7 +169,7 @@ curl -s http://localhost:8000/api/v1/files/ | head Once migration is verified: 1. Stop MinIO: `just dc stop minio` -2. Remove `minio.env` entries from `env_file` lists in the compose file (lines marked `# legacy`). +2. Remove `storage.env` entries from `env_file` lists in the compose file (lines marked `# legacy`). 3. Remove the `minio:` service block. 4. Remove the `sds-gateway--minio-net` network and `sds-gateway--minio-files` volume. 5. Restart: `just down && just up` @@ -163,12 +214,15 @@ Generate production credentials and keep both files in sync: ACCESS_KEY=$(openssl rand -hex 16) SECRET_KEY=$(openssl rand -base64 32 | tr -d '=+/') -sed -i "s/^AWS_ACCESS_KEY_ID=.*/AWS_ACCESS_KEY_ID=${ACCESS_KEY}/" \ - gateway/.envs/production/sfs.env \ +ACCESS_KEY=$(grep PRIMARY_ACCESS_KEY_ID .envs/local/storage.env | cut -d= -f2) +SECRET_KEY=$(grep PRIMARY_SECRET_ACCESS_KEY .envs/local/storage.env | cut -d= -f2) + +sed -i "s/^PRIMARY_ACCESS_KEY_ID=.*/PRIMARY_ACCESS_KEY_ID=${ACCESS_KEY}/" \ + gateway/.envs/production/storage.env \ seaweedfs/.envs/production/sfs.env -sed -i "s/^AWS_SECRET_ACCESS_KEY=.*/AWS_SECRET_ACCESS_KEY=${SECRET_KEY}/" \ - gateway/.envs/production/sfs.env \ +sed -i "s/^PRIMARY_SECRET_ACCESS_KEY=.*/PRIMARY_SECRET_ACCESS_KEY=${SECRET_KEY}/" \ + gateway/.envs/production/storage.env \ seaweedfs/.envs/production/sfs.env ``` @@ -177,7 +231,7 @@ sed -i "s/^AWS_SECRET_ACCESS_KEY=.*/AWS_SECRET_ACCESS_KEY=${SECRET_KEY}/" \ 1. Add the server hostname to `seaweedfs/scripts/prod-hostnames.env` and `gateway/scripts/prod-hostnames.env` — deploy scripts validate this. -2. Confirm `seaweedfs/.envs/production/sfs.env` and `gateway/.envs/production/sfs.env` +2. Confirm `seaweedfs/.envs/production/sfs.env` and `gateway/.envs/production/storage.env` have matching non-empty credentials. 3. The `sds-network-prod` Docker network must exist (the deploy script creates it @@ -193,5 +247,5 @@ sed -i "s/^AWS_SECRET_ACCESS_KEY=.*/AWS_SECRET_ACCESS_KEY=${SECRET_KEY}/" \ ## Rollback -Replace `sfs.env` with `minio.env` in the `env_file` lists of the compose file, then -restart the gateway. MinIO data is untouched until its volume is explicitly deleted. +Replace `storage.prod.env` with `storage.env` in the `env_file` lists of the compose file, then +restart the gateway. diff --git a/gateway/justfile b/gateway/justfile index 7def1e43a..6569c19ca 100644 --- a/gateway/justfile +++ b/gateway/justfile @@ -14,7 +14,7 @@ app_container := shell(env_selection_script + ' $1', "app_container") compose_file := shell(env_selection_script + ' $1', "compose_file") env := shell(env_selection_script + ' $1', "env") env_file := shell(env_selection_script + ' $1', "env_file") -docker_compose := "COMPOSE_FILE=" + compose_file + " docker compose --env-file " + env_file +docker_compose := "COMPOSE_FILE=" + compose_file + " docker compose --env-file " + env_file + " --env-file ./.envs/" + env + "/storage.env" gwy_root := justfile_directory() git_root := gwy_root + "/.." 
uv_cmd := docker_compose + " run '" + app_container + "' uv" @@ -146,7 +146,6 @@ dev-setup: [group('utilities')] env: #!/usr/bin/env bash - set -euo pipefail echo -e "\nSelected env:\n" echo -e "\tEnvironment: \e[34m '{{ env }}'\e[0m" echo -e "\tEnvironment file: \e[34m '{{ env_file }}'\e[0m" diff --git a/gateway/scripts/deploy.sh b/gateway/scripts/deploy.sh index dfe3adb83..31dcbbb8c 100755 --- a/gateway/scripts/deploy.sh +++ b/gateway/scripts/deploy.sh @@ -8,473 +8,530 @@ # SDS_FORCE_SECRETS - Set to 'true' to overwrite existing secrets (default: false) # SDS_SKIP_SECRETS - Set to 'true' to skip secret generation (default: false) # SDS_SKIP_NETWORK - Set to 'true' to skip network creation (default: false) +# SDS_SKIP_SFS - Set to 'true' to skip SeaweedFS stack deployment (default: false) # SDS_DETACH - Set to 'true' to run in detached mode (default: true for prod) # # USAGE EXAMPLES: # ./deploy.sh [OPTIONS] # SDS_SKIP_SECRETS=true ./deploy.sh local # SDS_FORCE_SECRETS=true SDS_DETACH=false ./deploy.sh production +# SDS_SKIP_SFS=true ./deploy.sh local set -euo pipefail SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) PROJECT_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd) +SFS_ROOT=$(cd "${PROJECT_ROOT}/../seaweedfs" 2>/dev/null && pwd) || SFS_ROOT="" # shellcheck disable=SC1091 source "${SCRIPT_DIR}/common.sh" function show_usage() { - echo -e "Usage: ${0} [OPTIONS] " - echo "" - echo "Deploy the SDS Gateway environment following README instructions." - echo "" - echo -e "\e[34mThis is a high level script that automates:\e[0m" - echo " 1. Secret generation" - echo " 2. Docker network creation" - echo " 3. Service deployment" - echo " 4. Database migrations" - echo " 5. Superuser creation (interactive)" - echo " 6. MinIO bucket creation" - echo "" - echo -e "\e[34mOPTIONS:\e[0m" - echo " -f, --force Overwrite existing env files when generating secrets" - echo " -s, --skip-secrets Skip secret generation (use existing secrets)" - echo " -n, --skip-network Skip network creation" - echo " -d, --detach Run services in detached mode (default for prod)" - echo " -h, --help Show this help message" - echo "" - echo -e "\e[34mARGUMENTS:\e[0m" - echo " Target environment to deploy" - echo "" - echo -e "\e[34mENVIRONMENT VARIABLES:\e[0m" - echo " SDS_FORCE_SECRETS Overwrite existing secrets (true/false, default: false)" - echo " SDS_SKIP_SECRETS Skip secret generation (true/false, default: false)" - echo " SDS_SKIP_NETWORK Skip network creation (true/false, default: false)" - echo " SDS_DETACH Run in detached mode (true/false, default: true for prod)" - echo "" - echo " Note: Command-line options take precedence over environment variables." - echo "" - echo -e "\e[34mEXAMPLES:\e[0m" - echo " ${0} local # Quick local deploy" - echo " ${0} --force production # Production deploy, regenerate secrets" - echo " ${0} --skip-secrets ci # CI deploy using existing secrets" - echo " SDS_SKIP_SECRETS=true ${0} local # Use env var to skip secrets" - echo " SDS_DETACH=false ${0} production # Production in foreground mode" - echo "" - echo -e "\e[34mNOTES:\e[0m" - echo " - For production, ensure prod-hostnames.env is configured first" - echo " - Superuser creation is interactive by default" - echo " - MinIO bucket must be created manually via web UI (localhost:9001 or 19001)" - echo " - Use 'just redeploy' for quick rebuilds after initial deploy" - exit 0 + echo -e "Usage: ${0} [OPTIONS] " + echo "" + echo "Deploy the SDS Gateway environment following README instructions." 
+    echo ""
+    echo -e "\e[34mThis is a high level script that automates:\e[0m"
+    echo " 1. Secret generation"
+    echo " 2. Docker network creation"
+    echo " 3. SeaweedFS stack deployment (start + configure credentials + create bucket)"
+    echo " 4. Gateway service deployment"
+    echo " 5. Database migrations"
+    echo " 6. Superuser creation (interactive)"
+    echo ""
+    echo -e "\e[34mOPTIONS:\e[0m"
+    echo " -f, --force Overwrite existing env files when generating secrets"
+    echo " -s, --skip-secrets Skip secret generation (use existing secrets)"
+    echo " -n, --skip-network Skip network creation"
+    echo " --skip-sfs Skip SeaweedFS stack deployment"
+    echo " -d, --detach Run services in detached mode (default for prod)"
+    echo " -h, --help Show this help message"
+    echo ""
+    echo -e "\e[34mARGUMENTS:\e[0m"
+    echo " <environment> Target environment to deploy"
+    echo ""
+    echo -e "\e[34mENVIRONMENT VARIABLES:\e[0m"
+    echo " SDS_FORCE_SECRETS Overwrite existing secrets (true/false, default: false)"
+    echo " SDS_SKIP_SECRETS Skip secret generation (true/false, default: false)"
+    echo " SDS_SKIP_NETWORK Skip network creation (true/false, default: false)"
+    echo " SDS_SKIP_SFS Skip SeaweedFS deployment (true/false, default: false)"
+    echo " SDS_DETACH Run in detached mode (true/false, default: true for prod)"
+    echo ""
+    echo " Note: Command-line options take precedence over environment variables."
+    echo ""
+    echo -e "\e[34mEXAMPLES:\e[0m"
+    echo " ${0} local # Quick local deploy"
+    echo " ${0} --force production # Production deploy, regenerate secrets"
+    echo " ${0} --skip-secrets ci # CI deploy using existing secrets"
+    echo " SDS_SKIP_SECRETS=true ${0} local # Use env var to skip secrets"
+    echo " SDS_DETACH=false ${0} production # Production in foreground mode"
+    echo ""
+    echo -e "\e[34mNOTES:\e[0m"
+    echo " - For production, ensure prod-hostnames.env is configured first"
+    echo " - Superuser creation is interactive by default"
+    echo " - S3 credentials are read from PRIMARY_* vars in .envs/<env>/storage.env"
+    echo "   and configured automatically via SeaweedFS weed shell"
+    echo " - Use 'just redeploy' for quick rebuilds after initial deploy"
+    exit 0
 }

 function setup_prod_hostnames() {
-    local script_dir="$1"
-    local env_type="$2"
-    local example_file="${script_dir}/prod-hostnames.example.env"
-    local target_file="${script_dir}/prod-hostnames.env"
-
-    if [[ -f "${example_file}" && ! -f "${target_file}" ]]; then
-        log_msg "Creating prod-hostnames.env from example..."
-        cp "${example_file}" "${target_file}"
-        log_success "Created: ${target_file}"
-
-        if [[ "${env_type}" == "production" ]]; then
-            local current_hostname
-            current_hostname=$(hostname)
-            if [[ -n "${current_hostname}" ]]; then
-                echo "${current_hostname}" >> "${target_file}"
-                log_success "Appended hostname to ${target_file}: ${current_hostname}"
-            else
-                log_warning "Could not determine current hostname; skipping append"
-            fi
-        fi
-    fi
-
-    # if we're running a production deploy, check the hostname is
-    # listed in the file first, otherwise abort the deployment
-    if [[ "${env_type}" == "production" && -f "${target_file}" ]]; then
-        local current_hostname
-        local target_file_cur_dir
-        current_hostname=$(hostname)
-        target_file_cur_dir=$(realpath --relative-to="." "${target_file}")
-        if [[ -n "${current_hostname}" ]]; then
-            if ! grep -Fxq "${current_hostname}" "${target_file}"; then
-                log_error "Current hostname '${current_hostname}' not a production host listed in '${target_file_cur_dir}'."
- log_msg "Add it manually:\n\n\techo '${current_hostname}' >> ${target_file_cur_dir}" - exit 1 - fi - else - log_warning "Could not determine current hostname; cannot validate ${target_file_cur_dir}" - fi - fi + local script_dir="$1" + local env_type="$2" + local example_file="${script_dir}/prod-hostnames.example.env" + local target_file="${script_dir}/prod-hostnames.env" + + if [[ -f "${example_file}" && ! -f "${target_file}" ]]; then + log_msg "Creating prod-hostnames.env from example..." + cp "${example_file}" "${target_file}" + log_success "Created: ${target_file}" + + if [[ "${env_type}" == "production" ]]; then + local current_hostname + current_hostname=$(hostname) + if [[ -n "${current_hostname}" ]]; then + echo "${current_hostname}" >>"${target_file}" + log_success "Appended hostname to ${target_file}: ${current_hostname}" + else + log_warning "Could not determine current hostname; skipping append" + fi + fi + fi + + # if we're running a production deploy, check the hostname is + # listed in the file first, otherwise abort the deployment + if [[ "${env_type}" == "production" && -f "${target_file}" ]]; then + local current_hostname + local target_file_cur_dir + current_hostname=$(hostname) + target_file_cur_dir=$(realpath --relative-to="." "${target_file}") + if [[ -n "${current_hostname}" ]]; then + if ! grep -Fxq "${current_hostname}" "${target_file}"; then + log_error "Current hostname '${current_hostname}' not a production host listed in '${target_file_cur_dir}'." + log_msg "Add it manually:\n\n\techo '${current_hostname}' >> ${target_file_cur_dir}" + exit 1 + fi + else + log_warning "Could not determine current hostname; cannot validate ${target_file_cur_dir}" + fi + fi } function create_docker_network() { - local env_type="$1" - local network_name="sds-network-${env_type}" - - log_header "Docker Network Setup" - - if docker network inspect "${network_name}" &>/dev/null; then - log_msg "Network '${network_name}' already exists" - else - log_msg "Creating Docker network: ${network_name}" - docker network create "${network_name}" --driver=bridge - log_success "Network created: ${network_name}" - fi + local env_type="$1" + local network_name="sds-network-${env_type}" + + log_header "Docker Network Setup" + + if docker network inspect "${network_name}" &>/dev/null; then + log_msg "Network '${network_name}' already exists" + else + log_msg "Creating Docker network: ${network_name}" + docker network create "${network_name}" --driver=bridge + log_success "Network created: ${network_name}" + fi } function generate_secrets() { - local env_type="$1" - local force="$2" + local env_type="$1" + local force="$2" - log_header "Secret Generation" + log_header "Secret Generation" - local force_flag="" - if [[ "${force}" == "true" ]]; then - force_flag="--force" - fi + local force_flag="" + if [[ "${force}" == "true" ]]; then + force_flag="--force" + fi - log_msg "Generating secrets for '${env_type}' environment..." - just generate-secrets "${env_type}" ${force_flag} + log_msg "Generating secrets for '${env_type}' environment..." 
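+    # force_flag is left unquoted on purpose: when it is empty it expands to
+    # no argument at all, rather than an empty-string positional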
+ just generate-secrets "${env_type}" ${force_flag} } function build_app() { - local service_name - service_name="$1" - log_header "Building stack" - if [[ -n "${service_name}" ]]; then - log_msg "Pulling images and building only service: ${service_name}" - else - log_msg "Pulling images and building all services" - fi - just build "${service_name}" + local service_name + service_name="$1" + log_header "Building stack" + if [[ -n "${service_name}" ]]; then + log_msg "Pulling images and building only service: ${service_name}" + else + log_msg "Pulling images and building all services" + fi + just build "${service_name}" } function first_start() { - log_header "First Stack Startup" + log_header "First Stack Startup" - log_msg "Building images" - just build + log_msg "Building images" + just build - log_msg "Starting opensearch" - just up opensearch + log_msg "Starting opensearch" + just up opensearch - log_msg "Waiting for OpenSearch to be healthy..." - wait_for_service "opensearch" 60 || { - log_warning "OpenSearch health check timed out, tearing down anyway" - } - just up || true + log_msg "Waiting for OpenSearch to be healthy..." + wait_for_service "opensearch" 60 || { + log_warning "OpenSearch health check timed out, tearing down anyway" + } + just up || true } function start_stack() { - log_header "Starting SDS stack" - log_msg "Starting stack..." - { - just build - just up - } &>/dev/null & + log_header "Starting SDS stack" + log_msg "Starting stack..." + { + just build + just up + } &>/dev/null & } function stop_stack() { - log_msg "Stopping stack..." - just down + log_msg "Stopping stack..." + just down } function wait_for_service() { - local container_name="$1" - local max_attempts="${2:-30}" - local attempt=1 - - log_msg "Waiting for container '${container_name}' to be ready..." - - while [[ ${attempt} -le ${max_attempts} ]]; do - if just dc exec "${container_name}" echo "ready" &>/dev/null; then - log_success "Container '${container_name}' is ready" - return 0 - fi - - if [[ $((attempt % 5)) -eq 0 ]]; then - log_msg "Still waiting... (attempt ${attempt}/${max_attempts})" - fi - - sleep 2 - attempt=$((attempt + 1)) - done - - log_error "Container '${container_name}' did not become ready in time" - return 1 + local container_name="$1" + local max_attempts="${2:-30}" + local attempt=1 + + log_msg "Waiting for container '${container_name}' to be ready..." + + while [[ ${attempt} -le ${max_attempts} ]]; do + if just dc exec "${container_name}" echo "ready" &>/dev/null; then + log_success "Container '${container_name}' is ready" + return 0 + fi + + if [[ $((attempt % 5)) -eq 0 ]]; then + log_msg "Still waiting... (attempt ${attempt}/${max_attempts})" + log_msg "=== Container logs (last 20 lines) ===" + docker logs --tail 20 "${container_name}" 2>&1 | while IFS= read -r line; do + log_msg " ${line}" + done + log_msg "==========================================" + fi + + sleep 2 + attempt=$((attempt + 1)) + done + + log_error "Container '${container_name}' did not become ready in time" + return 1 } function run_migrations() { - local container_name="$1" + local container_name="$1" - log_header "Database Migrations" + log_header "Database Migrations" - log_msg "Running Django migrations..." - # you probably don't need/want makemigrations at this stage; here for documentation - # just uv run manage.py makemigrations - just uv run manage.py migrate - log_success "Migrations applied" + log_msg "Running Django migrations..." 
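+    # 'manage.py migrate' is idempotent: already-applied migrations are
+    # skipped, so this is safe to re-run on every deploy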
+ # you probably don't need/want makemigrations at this stage; here for documentation + # just uv run manage.py makemigrations + just uv run manage.py migrate + log_success "Migrations applied" } function create_superuser() { - local container_name="$1" - local env_type="$2" - - log_header "Superuser Creation" - - local has_superuser - has_superuser=$(just uv run manage.py check_superuser_exists 2>/dev/null | tail -n1 | tr -d '[:space:]') - - case "${has_superuser}" in - yes|no) ;; - *) - log_error "Unexpected output from check_superuser_exists: '${has_superuser}'" - return 1 - ;; - esac - - if [[ "${has_superuser}" == "yes" ]]; then - log_msg "Superuser already exists, skipping creation" - return 0 - fi - - if [[ "${env_type}" == "ci" ]]; then - log_msg "Creating superuser for CI environment (non-interactive)..." - just uv run manage.py create_ci_superuser - else - log_msg "Creating superuser (interactive)..." - log_msg "You will be prompted for username, email, and password" - echo "" - just uv run manage.py createsuperuser || { - log_warning "Superuser creation skipped or failed" - log_msg "You can create it later with: just uv run manage.py createsuperuser" - } - fi + local container_name="$1" + local env_type="$2" + + log_header "Superuser Creation" + + local has_superuser + has_superuser=$(just uv run manage.py check_superuser_exists 2>/dev/null | tail -n1 | tr -d '[:space:]') + + case "${has_superuser}" in + yes | no) ;; + *) + log_error "Unexpected output from check_superuser_exists: '${has_superuser}'" + return 1 + ;; + esac + + if [[ "${has_superuser}" == "yes" ]]; then + log_msg "Superuser already exists, skipping creation" + return 0 + fi + + if [[ "${env_type}" == "ci" ]]; then + log_msg "Creating superuser for CI environment (non-interactive)..." + just uv run manage.py create_ci_superuser + else + log_msg "Creating superuser (interactive)..." + log_msg "You will be prompted for username, email, and password" + echo "" + just uv run manage.py createsuperuser || { + log_warning "Superuser creation skipped or failed" + log_msg "You can create it later with: just uv run manage.py createsuperuser" + } + fi } function show_next_steps() { - local env_type="$1" - local port_prefix="" - - if [[ "${env_type}" == "production" ]]; then - port_prefix="1" - fi - - log_header "Deployment Complete!" - - echo "" - echo "🎉 Gateway deployed successfully!" - echo "" - echo "Next steps:" - echo "" - echo " 1. Access the web interface:" - echo " - Gateway: http://localhost:${port_prefix}8000" - echo " - Admin panel: http://localhost:${port_prefix}8000/admin" - echo "" - echo " 2. Run tests to verify installation:" - echo " just test" - echo "" - echo " 3. For production SDK API key generation:" - echo " - Visit http://localhost:${port_prefix}8000/users/generate-api-key-form/" - echo " - Copy the key to .envs/${env_type}/django.env" - echo "" - - if [[ "${env_type}" == "local" ]]; then - echo " 4. Check webpack dev server:" - echo " http://localhost:3000/webpack-dev-server" - echo "" - fi - - echo "📚 For more information, see gateway/README.md" - echo "" + local env_type="$1" + local port_prefix="" + + if [[ "${env_type}" == "production" ]]; then + port_prefix="1" + fi + + log_header "Deployment Complete!" + + echo "" + echo "🎉 Gateway deployed successfully!" + echo "" + echo "Next steps:" + echo "" + echo " 1. Access the web interface:" + echo " - Gateway: http://localhost:${port_prefix}8000" + echo " - Admin panel: http://localhost:${port_prefix}8000/admin" + echo "" + echo " 2. 
Run tests to verify installation:" + echo " just test" + echo "" + echo " 3. For production SDK API key generation:" + echo " - Visit http://localhost:${port_prefix}8000/users/generate-api-key-form/" + echo " - Copy the key to .envs/${env_type}/django.env" + echo "" + + if [[ "${env_type}" == "local" ]]; then + echo " 4. Check webpack dev server:" + echo " http://localhost:3000/webpack-dev-server" + echo "" + fi + + echo "📚 For more information, see gateway/README.md" + echo "" } function parse_arguments() { - local -n args_ref=$1 - shift - - # read from environment variables first (command-line args will override) - if [[ "${SDS_FORCE_SECRETS:-}" == "true" ]]; then - args_ref[force_secrets]="true" - fi - if [[ "${SDS_SKIP_SECRETS:-}" == "true" ]]; then - args_ref[skip_secrets]="true" - fi - if [[ "${SDS_SKIP_NETWORK:-}" == "true" ]]; then - args_ref[skip_network]="true" - fi - if [[ "${SDS_DETACH:-}" == "true" ]]; then - args_ref[detach]="true" - elif [[ "${SDS_DETACH:-}" == "false" ]]; then - args_ref[detach]="false" - fi - - # parse command-line arguments (these override env vars) - while [[ $# -gt 0 ]]; do - case "$1" in - -f|--force) - args_ref[force_secrets]="true" - shift - ;; - -s|--skip-secrets) - args_ref[skip_secrets]="true" - shift - ;; - -n|--skip-network) - args_ref[skip_network]="true" - shift - ;; - -d|--detach) - args_ref[detach]="true" - shift - ;; - -h|--help) - show_usage - ;; - local|production|ci) - args_ref[env_type]="$1" - shift - ;; - *) - log_error "Unknown argument: $1" - show_usage - ;; - esac - done - - if [[ -z "${args_ref[env_type]}" ]]; then - log_error "Environment type required (local, production, or ci)" - show_usage - fi - - # auto-detach for production unless explicitly overridden - if [[ "${args_ref[env_type]}" == "production" && "${SDS_DETACH:-}" != "false" ]]; then - args_ref[detach]="true" - fi + local -n _args_ref=$1 + shift + + # Ensure all keys exist (shellcheck can't follow nameref) + if [[ -z "${_args_ref[force_secrets]+x}" ]]; then + _args_ref[force_secrets]="false" + fi + if [[ -z "${_args_ref[skip_secrets]+x}" ]]; then + _args_ref[skip_secrets]="false" + fi + if [[ -z "${_args_ref[skip_network]+x}" ]]; then + _args_ref[skip_network]="false" + fi + if [[ -z "${_args_ref[skip_sfs]+x}" ]]; then + _args_ref[skip_sfs]="false" + fi + if [[ -z "${_args_ref[detach]+x}" ]]; then + _args_ref[detach]="false" + fi + # read from environment variables first (command-line args will override) + if [[ "${SDS_FORCE_SECRETS:-}" == "true" ]]; then + _args_ref[force_secrets]="true" + fi + if [[ "${SDS_SKIP_SECRETS:-}" == "true" ]]; then + _args_ref[skip_secrets]="true" + fi + if [[ "${SDS_SKIP_NETWORK:-}" == "true" ]]; then + _args_ref[skip_network]="true" + fi + if [[ "${SDS_SKIP_SFS:-}" == "true" ]]; then + _args_ref[skip_sfs]="true" + fi + if [[ "${SDS_DETACH:-}" == "true" ]]; then + _args_ref[detach]="true" + elif [[ "${SDS_DETACH:-}" == "false" ]]; then + _args_ref[detach]="false" + fi + + # parse command-line arguments (these override env vars) + while [[ $# -gt 0 ]]; do + case "$1" in + -f | --force) + _args_ref[force_secrets]="true" + shift + ;; + -s | --skip-secrets) + _args_ref[skip_secrets]="true" + shift + ;; + -n | --skip-network) + _args_ref[skip_network]="true" + shift + ;; + --skip-sfs) + _args_ref[skip_sfs]="true" + shift + ;; + -d | --detach) + _args_ref[detach]="true" + shift + ;; + -h | --help) + show_usage + ;; + local | production | ci) + _args_ref[env_type]="$1" + shift + ;; + *) + log_error "Unknown argument: $1" + show_usage + ;; + esac 
+ done + + if [[ -z "${_args_ref[env_type]}" ]]; then + log_error "Environment type required (local, production, or ci)" + show_usage + fi + + # auto-detach for production unless explicitly overridden + if [[ "${_args_ref[env_type]}" == "production" && "${SDS_DETACH:-}" != "false" ]]; then + _args_ref[detach]="true" + fi } function determine_container_name() { - local env_type="$1" - if [[ "${env_type}" == "production" ]]; then - echo "sds-gateway-prod-app" - elif [[ "${env_type}" == "ci" ]]; then - echo "sds-gateway-ci-app" - elif [[ "${env_type}" == "local" ]]; then - echo "sds-gateway-local-app" - else - log_error "Unknown environment type: ${env_type}" - return 1 - fi + local env_type="$1" + if [[ "${env_type}" == "production" ]]; then + echo "sds-gateway-prod-app" + elif [[ "${env_type}" == "ci" ]]; then + echo "sds-gateway-ci-app" + elif [[ "${env_type}" == "local" ]]; then + echo "sds-gateway-local-app" + else + log_error "Unknown environment type: ${env_type}" + return 1 + fi } function setup_secrets_and_network() { - local env_type="$1" - local skip_secrets="$2" - local force_secrets="$3" - local skip_network="$4" - - if [[ "${skip_secrets}" == "false" ]]; then - generate_secrets "${env_type}" "${force_secrets}" - else - log_msg "Skipping secret generation (using existing secrets)" - fi - - if [[ "${skip_network}" == "false" ]]; then - create_docker_network "${env_type}" - else - log_msg "Skipping network creation" - fi + local env_type="$1" + local skip_secrets="$2" + local force_secrets="$3" + local skip_network="$4" + + if [[ "${skip_secrets}" == "false" ]]; then + generate_secrets "${env_type}" "${force_secrets}" + else + log_msg "Skipping secret generation (using existing secrets)" + fi + + if [[ "${skip_network}" == "false" ]]; then + create_docker_network "${env_type}" + else + log_msg "Skipping network creation" + fi } function setup_database() { - local container_name="$1" - local env_type="$2" + local container_name="$1" + local env_type="$2" - log_header "Setting up Database" + log_header "Setting up Database" - wait_for_service "${container_name}" 60 || { - log_error "Failed to start services" - log_msg "Check logs with: just logs" - exit 1 - } + wait_for_service "${container_name}" 60 || { + log_error "Failed to start services" + log_msg "Check logs with: just logs" + exit 1 + } - run_migrations "${container_name}" - create_superuser "${container_name}" "${env_type}" + run_migrations "${container_name}" + create_superuser "${container_name}" "${env_type}" } -function create_minio_bucket() { - local env_type="$1" - local minio_env_file="${PROJECT_ROOT}/.envs/${env_type}/minio.env" +function create_storage_buckets() { + local env_type="$1" + log_header "Creating Object Store Buckets" + log_msg "Ensuring storage buckets exist on configured object stores..." + set +e + just uv run manage.py create_storage_buckets + local mgmt_exit=$? + set -e + if [[ ${mgmt_exit} -ne 0 ]]; then + log_warning "Bucket creation had non-zero exit (may be expected if secondary is unreachable)" + fi + log_success "Storage buckets ready" +} + +function deploy_sfs_stack() { + local env_type="$1" + local sfs_env_file="${PROJECT_ROOT}/.envs/${env_type}/storage.env" - log_header "MinIO Bucket Setup" + log_header "SeaweedFS Stack Deployment" - if [[ ! -f "${minio_env_file}" ]]; then - log_error "MinIO environment file not found: ${minio_env_file}" - return 1 - fi + if [[ -z "${SFS_ROOT}" || ! 
-d "${SFS_ROOT}" ]]; then + log_warning "SeaweedFS directory not found at '${PROJECT_ROOT}/../seaweedfs' — skipping SFS deployment" + log_msg "Run the SFS stack manually from the seaweedfs/ directory before starting the gateway." + return 0 + fi - local minio_user - local minio_password - minio_user=$(grep -E '^MINIO_ROOT_USER=' "${minio_env_file}" | cut -d'=' -f2) - minio_password=$(grep -E '^MINIO_ROOT_PASSWORD=' "${minio_env_file}" | cut -d'=' -f2) + if [[ ! -f "${SFS_ROOT}/scripts/deploy.sh" ]]; then + log_warning "SeaweedFS deploy script not found at '${SFS_ROOT}/scripts/deploy.sh' — skipping" + return 0 + fi - if [[ -z "${minio_user}" || -z "${minio_password}" ]]; then - log_error "Failed to extract MinIO credentials from ${minio_env_file}" - return 1 - fi + # ensure the shared network exists before SFS references it as external (CI/prod) + create_docker_network "${env_type}" - local alias_name="local" # always "local", doesn't depend on env_type + log_msg "Deploying SeaweedFS stack (env: ${env_type})..." + "${SFS_ROOT}/scripts/deploy.sh" \ + --sfs-env "${sfs_env_file}" \ + "${env_type}" - just dc exec -it minio mc alias set "${alias_name}" "http://localhost:9000" "${minio_user}" "${minio_password}" - just dc exec -it minio mc mb --ignore-existing "${alias_name}/spectrumx" + log_success "SeaweedFS stack deployed" } function finalize_deployment() { - local env_type="$1" - local detach="$2" + local env_type="$1" + local detach="$2" - log_header "Finalizing Deployment" - start_stack - show_next_steps "${env_type}" + log_header "Finalizing Deployment" + start_stack + show_next_steps "${env_type}" } function main() { - declare -A args=( - [force_secrets]="false" - [skip_secrets]="false" - [skip_network]="true" # usually works when skipped - [detach]="false" - [env_type]="" - ) + declare -A args=( + [force_secrets]="false" + [skip_secrets]="false" + [skip_network]="false" + [skip_sfs]="false" + [detach]="false" + [env_type]="" + ) + + parse_arguments args "$@" + + cd "${PROJECT_ROOT}" + log_header "SDS Gateway Deployment - ${args[env_type]} environment" - parse_arguments args "$@" + local container_name + container_name=$(determine_container_name "${args[env_type]}") - cd "${PROJECT_ROOT}" - log_header "SDS Gateway Deployment - ${args[env_type]} environment" + setup_secrets_and_network \ + "${args[env_type]}" \ + "${args[skip_secrets]}" \ + "${args[force_secrets]}" \ + "${args[skip_network]}" - local container_name - container_name=$(determine_container_name "${args[env_type]}") + setup_prod_hostnames "${SCRIPT_DIR}" "${args[env_type]}" - setup_secrets_and_network \ - "${args[env_type]}" \ - "${args[skip_secrets]}" \ - "${args[force_secrets]}" \ - "${args[skip_network]}" + if [[ "${args[skip_sfs]}" == "false" ]]; then + deploy_sfs_stack "${args[env_type]}" + else + log_msg "Skipping SeaweedFS stack deployment (--skip-sfs)" + fi - setup_prod_hostnames "${SCRIPT_DIR}" "${args[env_type]}" + build_app "${container_name}" + first_start - build_app "${container_name}" - first_start + create_storage_buckets "${args[env_type]}" - setup_database "${container_name}" "${args[env_type]}" - create_minio_bucket "${args[env_type]}" - finalize_deployment "${args[env_type]}" "${args[detach]}" + setup_database "${container_name}" "${args[env_type]}" + finalize_deployment "${args[env_type]}" "${args[detach]}" } main "$@" diff --git a/gateway/scripts/env-selection.sh b/gateway/scripts/env-selection.sh index 57e35a96e..2073cb053 100755 --- a/gateway/scripts/env-selection.sh +++ 
b/gateway/scripts/env-selection.sh @@ -2,132 +2,142 @@ set -euo pipefail IFS=$'\n\t' -is_production_host() { - local script_dir - script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) - local host - host=$(hostname) - local prod_hosts_file="${script_dir}/prod-hostnames.env" +function is_production_host() { + local script_dir + script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + local host + host=$(hostname) + local prod_hosts_file="${script_dir}/prod-hostnames.env" - if [[ ! -f "${prod_hosts_file}" ]]; then - printf '\033[33mProduction host list not found at %s: defaulting to local\033[0m\n' "${prod_hosts_file}" >&2 - printf 'Create this file to make the warning go away:\n\n\tcp %s/prod-hostnames.example.env %s\n\n' "${script_dir}" "${prod_hosts_file}" >&2 - return 1 - fi + if [[ ! -f "${prod_hosts_file}" ]]; then + printf '\033[33mProduction host list not found at %s: defaulting to local\033[0m\n' "${prod_hosts_file}" >&2 + printf 'Create this file to make the warning go away:\n\n\tcp %s/prod-hostnames.example.env %s\n\n' "${script_dir}" "${prod_hosts_file}" >&2 + return 1 + fi - while read -r line; do - # trim leading/trailing whitespace - line=$(echo "${line}" | xargs) - # skip comments - [[ -z "${line}" || ${line:0:1} == '#' ]] && continue - # check if the line matches the current host - if [[ "${line}" == "${host}" ]]; then - return 0 - fi - done < "${prod_hosts_file}" + while read -r line; do + # trim leading/trailing whitespace + line=$(echo "${line}" | xargs) + # skip comments + [[ -z "${line}" || ${line:0:1} == '#' ]] && continue + # check if the line matches the current host + if [[ "${line}" == "${host}" ]]; then + return 0 + fi + done <"${prod_hosts_file}" - return 1 + return 1 } -is_ci_env() { - if [[ -n "${CI:-}" ]] || [[ -n "${GITHUB_ACTIONS:-}" ]] || [[ -n "${GITLAB_CI:-}" ]] || [[ -n "${BUILD_ID:-}" ]] || [[ -n "${JENKINS_URL:-}" ]]; then - return 0 - fi - return 1 +function is_ci_env() { + if [[ -n "${CI:-}" ]] || [[ -n "${GITHUB_ACTIONS:-}" ]] || [[ -n "${GITLAB_CI:-}" ]] || [[ -n "${BUILD_ID:-}" ]] || [[ -n "${JENKINS_URL:-}" ]]; then + return 0 + fi + return 1 } -get_target_value() { - local target=$1 - local env_type=$2 - local local_env_file=".envs/local/opensearch.env" - local production_env_file=".envs/production/opensearch.env" - local ci_env_file=".envs/ci/opensearch.env" - local value +function get_target_value() { + local target=$1 + local env_type=$2 + local local_env_file=".envs/local/opensearch.env" + local production_env_file=".envs/production/opensearch.env" + local ci_env_file=".envs/ci/opensearch.env" + local value - case "${target}" in - env) - value="${env_type}" - ;; - compose_file) - case "${env_type}" in - production) - value='compose.production.yaml' - ;; - local) - value='compose.local.yaml' - ;; - ci) - value='compose.ci.yaml' - ;; - esac - ;; - app_container) - case "${env_type}" in - ci) - value='sds-gateway-ci-app' - ;; - local) - value='sds-gateway-local-app' - ;; - production) - value='sds-gateway-prod-app' - ;; - *) - printf 'unsupported environment type: %s\n' "${env_type}" >&2 - exit 1 - ;; - esac - ;; - env_file) - case "${env_type}" in - ci) - value="${ci_env_file}" - ;; - local) - value="${local_env_file}" - ;; - production) - value="${production_env_file}" - ;; - *) - printf 'unsupported environment type: %s\n' "${env_type}" >&2 - exit 1 - ;; - esac - ;; - *) - printf 'unsupported target: %s\n' "${target}" >&2 - exit 1 - ;; - esac + case "${target}" in + env) + value="${env_type}" + ;; + compose_file) + case "${env_type}" in 
+        production)
+            value='compose.production.yaml'
+            ;;
+        local)
+            value='compose.local.yaml'
+            ;;
+        ci)
+            value='compose.ci.yaml'
+            ;;
+        esac
+        ;;
+    app_container)
+        case "${env_type}" in
+        ci)
+            value='sds-gateway-ci-app'
+            ;;
+        local)
+            value='sds-gateway-local-app'
+            ;;
+        production)
+            value='sds-gateway-prod-app'
+            ;;
+        *)
+            printf 'unsupported environment type: %s\n' "${env_type}" >&2
+            exit 1
+            ;;
+        esac
+        ;;
+    env_file)
+        case "${env_type}" in
+        ci)
+            value="${ci_env_file}"
+            ;;
+        local)
+            value="${local_env_file}"
+            ;;
+        production)
+            value="${production_env_file}"
+            ;;
+        *)
+            printf 'unsupported environment type: %s\n' "${env_type}" >&2
+            exit 1
+            ;;
+        esac
+        ;;
+    *)
+        printf 'unsupported target: %s\n' "${target}" >&2
+        exit 1
+        ;;
+    esac

-  if [[ "${target}" == "compose_file" && ! -f "${value}" ]]; then
-    printf '\033[31mERROR: selected compose file "%s" does not exist\033[0m\n' "${value}" >&2
-  fi
-  if [[ "${target}" == "env_file" && ! -f "${value}" ]]; then
-    printf '\033[31mERROR: selected env file "%s" does not exist\033[0m\n' "${value}" >&2
-  fi
+    if [[ "${target}" == "compose_file" && ! -f "${value}" ]]; then
+        printf '\033[31mERROR: selected compose file "%s" does not exist\033[0m\n' "${value}" >&2
+    fi
+    if [[ "${target}" == "env_file" && ! -f "${value}" ]]; then
+        printf '\033[31mERROR: selected env file "%s" does not exist\033[0m\n' "${value}" >&2
+    fi

-  printf '%s\n' "${value}"
+    printf '%s\n' "${value}"
 }

-main() {
-  if [[ $# -ne 1 ]]; then
-    printf 'usage: %s <target>\n' "${0}" >&2
-    exit 1
-  fi
+function main() {
+    if [[ $# -ne 1 ]]; then
+        printf 'usage: %s <target>\n' "${0}" >&2
+        exit 1
+    fi

-  local target=$1
-  local env_type
-  if is_ci_env; then
-    env_type='ci'
-  elif is_production_host; then
-    env_type='production'
-  else
-    env_type='local'
-  fi
+    local target=${1:-}
+    local env_type

-  get_target_value "${target}" "${env_type}"
+    # allow explicit override via SDS_ENV (e.g., SDS_ENV=ci just env)
+    if [[ -n "${SDS_ENV:-}" ]]; then
+        case "${SDS_ENV}" in
+        ci | local | production) env_type="${SDS_ENV}" ;;
+        *)
+            printf '\033[33mUnknown SDS_ENV="%s": must be ci, local, or production\033[0m\n' "${SDS_ENV}" >&2
+            exit 1
+            ;;
+        esac
+    elif is_ci_env; then
+        env_type='ci'
+    elif is_production_host; then
+        env_type='production'
+    else
+        env_type='local'
+    fi
+
+    get_target_value "${target}" "${env_type}"
 }

 main "$@"
diff --git a/gateway/scripts/generate-secrets.sh b/gateway/scripts/generate-secrets.sh
index 73757980d..690926173 100755
--- a/gateway/scripts/generate-secrets.sh
+++ b/gateway/scripts/generate-secrets.sh
@@ -1,12 +1,25 @@
 #!/usr/bin/env bash
-set -euo pipefail
+set -Eeuo pipefail

 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-PROJECT_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
-EXAMPLE_DIR="${PROJECT_ROOT}/.envs/example"
+GATEWAY_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
+SFS_ROOT=$(cd "${GATEWAY_ROOT}/../seaweedfs" && pwd)
+EXAMPLE_DIR="${GATEWAY_ROOT}/.envs/example"

-usage() {
-  cat << EOF
+# PRIMARY (RustFS or SeaweedFS)
+PRIMARY_ACCESS_KEY_ID=""
+PRIMARY_SECRET_ACCESS_KEY=""
+PRIMARY_ENDPOINT_URL=""
+PRIMARY_S3_ENDPOINT_URL=""
+
+# SECONDARY (RustFS or SeaweedFS) — only for production
+SECONDARY_ACCESS_KEY_ID=""
+SECONDARY_SECRET_ACCESS_KEY=""
+SECONDARY_ROOT_USER="minioadmin"
+SECONDARY_ROOT_PASSWORD=""
+
+function usage() {
+    cat <<EOF
 Usage: ${0} [OPTIONS] <environment>

 Generate environment secrets for the gateway component.
@@ -23,162 +36,292 @@ EXAMPLES:
   ${0} --force ci # Generate CI env files (overwrite if exist)
   ${0} production # Generate production env files

-NOTES:
-  - Generated files are placed in .envs/<env>/ directory
-  - Example templates are read from .envs/example/
-  - Secrets are randomly generated using OpenSSL
-  - CI environment uses insecure but deterministic values for ephemeral usage
+NOTES:
+  - Generated files are placed in .envs/<env>/ directory
+  - Example templates are read from .envs/example/
+  - Secrets are randomly generated using OpenSSL
+  - CI environment uses insecure but deterministic values for ephemeral usage
+  - local: PRIMARY (RustFS) + SECONDARY (SeaweedFS)
+  - production: PRIMARY (SeaweedFS) + SECONDARY (RustFS)
+  - ci: PRIMARY only (RustFS). No secondary storage.
 EOF
-  exit 0
+    exit 0
 }

-generate_secret() {
-  local length="${1:-40}"
-  openssl rand -base64 48 | tr -d "=+/" | cut -c1-"${length}"
+function configure_object_store_defaults() {
+    local env_type="$1"
+
+    if [[ -n "${PRIMARY_ENDPOINT_URL}" ]]; then
+        return 0
+    fi
+
+    case "${env_type}" in
+    local)
+        PRIMARY_ENDPOINT_URL="sds-gateway-local-rustfs:9000"
+        PRIMARY_ACCESS_KEY_ID=$(generate_secret 32)
+        PRIMARY_SECRET_ACCESS_KEY=$(generate_secret 32)
+        # SECONDARY = SeaweedFS (S3 gateway)
+        SECONDARY_ENDPOINT_URL="sds-gateway-local-sfs-s3:8333"
+        SECONDARY_ACCESS_KEY_ID=$(generate_secret 32)
+        SECONDARY_SECRET_ACCESS_KEY=$(generate_secret 32)
+        ;;
+    ci)
+        PRIMARY_ENDPOINT_URL="sds-gateway-ci-rustfs:9000"
+        ;;
+    production)
+        PRIMARY_ENDPOINT_URL="sds-gateway-prod-sfs-s3:8333"
+        ;;
+    *)
+        echo "ERROR: Unsupported environment type: ${env_type}" >&2
+        return 1
+        ;;
+    esac
+
+    PRIMARY_S3_ENDPOINT_URL="http://${PRIMARY_ENDPOINT_URL}"
+
+    # Set SECONDARY S3 endpoint URL for environments that have a secondary
+    if [[ -n "${SECONDARY_ENDPOINT_URL:-}" ]]; then
+        SECONDARY_S3_ENDPOINT_URL="http://${SECONDARY_ENDPOINT_URL}"
+    fi
+
+    # SECONDARY only in local and production (no secondary for CI)
+    if [[ "${env_type}" == "ci" ]]; then
+        PRIMARY_ACCESS_KEY_ID="ci-rustfs-access-key"
+        PRIMARY_SECRET_ACCESS_KEY="ci-rustfs-secret-key"
+        return 0
+    fi
+
+    if [[ "${env_type}" == "production" ]]; then
+        SECONDARY_ACCESS_KEY_ID="rustfs-secondary-access-key"
+        SECONDARY_SECRET_ACCESS_KEY="rustfs-secondary-secret-key"
+        SECONDARY_ROOT_USER="minioadmin"
+    fi
 }

-generate_django_secret_key() {
-  # Django needs 50+ chars with special characters
-  openssl rand -base64 64 | tr -d "\n"
+function generate_secret() {
+    local length="${1:-40}"
+    openssl rand -base64 48 | tr -d "=+/" | cut -c1-"${length}"
 }

-process_env_file() {
-  local template="$1"
-  local output="$2"
-  local env_type="$3"
-  local force="$4"
-
-  if [[ -f "${output}" && "${force}" != "true" ]]; then
-    echo " ⏭ ${output} already exists (use --force to overwrite)"
-    return 0
-  fi
-
-  echo " ✓ Generating ${output}"
-
-  local content
-  content=$(cat "${template}")
-
-  # calculate WEB_CONCURRENCY based on CPU cores: (2 x num_cores) + 1
-  local num_cores
-  num_cores=$(nproc 2>/dev/null || echo "2")
-  local web_concurrency=$(( (num_cores * 2) + 1 ))
-
-  # generate secrets based on environment type
-  if [[ "${env_type}" == "ci" ]]; then
-    # CI: use predictable but acceptable secrets for ephemeral environments
-    content="${content//DJANGO_SECRET_KEY=/DJANGO_SECRET_KEY=ci-django-secret-key-insecure-for-testing-only}"
-    content="${content//DJANGO_ADMIN_URL=/DJANGO_ADMIN_URL=ci-admin/}"
-    content="${content//CELERY_FLOWER_PASSWORD=/CELERY_FLOWER_PASSWORD=ci-flower-pass}"
-
content="${content//SVI_SERVER_API_KEY=/SVI_SERVER_API_KEY=ci-svi-api-key-01234567890123456789abcde}" # 40 chars - content="${content//MINIO_ROOT_PASSWORD=/MINIO_ROOT_PASSWORD=ci-minio-secret}" - content="${content//AWS_SECRET_ACCESS_KEY=/AWS_SECRET_ACCESS_KEY=ci-minio-secret}" - content="${content//POSTGRES_PASSWORD=your-specific-password/POSTGRES_PASSWORD=ci-postgres-pass}" - content="${content//:your-specific-password@/:ci-postgres-pass@}" - content="${content//OPENSEARCH_INITIAL_ADMIN_PASSWORD=/OPENSEARCH_INITIAL_ADMIN_PASSWORD=CiAdmin123!}" - content="${content//OPENSEARCH_PASSWORD=/OPENSEARCH_PASSWORD=CiDjango123!}" - else - # local/production: generate random secure secrets - local django_secret_key django_admin_url flower_pass minio_pass postgres_pass opensearch_admin_pass opensearch_user_pass svi_api_key - django_secret_key=$(generate_django_secret_key) - django_admin_url="$(generate_secret 16)/" - flower_pass=$(generate_secret 32) - minio_pass=$(generate_secret 40) - postgres_pass=$(generate_secret 32) - opensearch_admin_pass=$(generate_secret 32) - opensearch_user_pass=$(generate_secret 32) - svi_api_key=$(generate_secret 40) - - content="${content//DJANGO_SECRET_KEY=/DJANGO_SECRET_KEY=${django_secret_key}}" - content="${content//DJANGO_ADMIN_URL=/DJANGO_ADMIN_URL=${django_admin_url}}" - content="${content//CELERY_FLOWER_PASSWORD=/CELERY_FLOWER_PASSWORD=${flower_pass}}" - content="${content//SVI_SERVER_API_KEY=/SVI_SERVER_API_KEY=${svi_api_key}}" - content="${content//MINIO_ROOT_PASSWORD=/MINIO_ROOT_PASSWORD=${minio_pass}}" - content="${content//AWS_SECRET_ACCESS_KEY=/AWS_SECRET_ACCESS_KEY=${minio_pass}}" - content="${content//POSTGRES_PASSWORD=your-specific-password/POSTGRES_PASSWORD=${postgres_pass}}" - content="${content//:your-specific-password@/:${postgres_pass}@}" - content="${content//OPENSEARCH_INITIAL_ADMIN_PASSWORD=/OPENSEARCH_INITIAL_ADMIN_PASSWORD=${opensearch_admin_pass}}" - content="${content//OPENSEARCH_PASSWORD=/OPENSEARCH_PASSWORD=${opensearch_user_pass}}" - fi - - # set WEB_CONCURRENCY based on CPU cores (applies to all environments) - content="${content//WEB_CONCURRENCY=4/WEB_CONCURRENCY=${web_concurrency}}" - - # write to output - mkdir -p "$(dirname "${output}")" - echo "${content}" > "${output}" +function generate_django_secret_key() { + # Django needs 50+ chars with special characters + openssl rand -base64 64 | tr -d "\n" } -main() { - local force="false" - local env_type="" - - # parse arguments - while [[ $# -gt 0 ]]; do - case "$1" in - -f|--force) - force="true" - shift - ;; - -h|--help) - usage - ;; - local|production|ci) - env_type="$1" - shift - ;; - *) - echo "ERROR: Unknown argument: $1" >&2 - usage - ;; - esac - done - - if [[ -z "${env_type}" ]]; then - echo "ERROR: Environment type required (local, production, or ci)" >&2 - usage - fi - - echo "🔐 Generating secrets for '${env_type}' environment..." 
- - local target_dir="${PROJECT_ROOT}/.envs/${env_type}" - - # process each env file from examples - for template in "${EXAMPLE_DIR}"/*.env; do - local filename - filename=$(basename "${template}") - - # skip production-specific example files for non-production envs - if [[ "${filename}" == *.prod-example.env ]]; then - if [[ "${env_type}" == "production" ]]; then - # use prod-example for production django.env - if [[ "${filename}" == "django.prod-example.env" ]]; then - process_env_file "${template}" "${target_dir}/django.env" "${env_type}" "${force}" - fi - fi - continue - fi - - # skip regular django.env for production (we use prod-example instead) - if [[ "${env_type}" == "production" && "${filename}" == "django.env" ]]; then - continue - fi - - local output="${target_dir}/${filename}" - process_env_file "${template}" "${output}" "${env_type}" "${force}" - done - - echo "" - echo "✅ Secrets generated successfully in ${target_dir}/" - echo "" - echo "Next steps:" - if [[ "${env_type}" == "ci" ]]; then - echo " - Review generated secrets (safe for ephemeral CI usage)" - else - echo " - Review and customize ${target_dir}/*.env as needed" - echo " - Set additional optional vars (AUTH0, SENTRY, etc.)" - fi - echo " - Use 'just env' to check the environment setup" - echo " - Use 'just up' to start the stack" +function process_env_file() { + local template="$1" + local output="$2" + local env_type="$3" + local force="$4" + local filename + filename=$(basename "${template}") + + configure_object_store_defaults "${env_type}" + + if [[ -f "${output}" && "${force}" != "true" ]]; then + echo " ⏭ ${output} already exists (use --force to overwrite)" + return 0 + fi + + echo " ✓ Generating ${output}" + + local content + content=$(cat "${template}") + + # calculate WEB_CONCURRENCY based on CPU cores: (2 x num_cores) + 1 + local num_cores + num_cores=$(nproc 2>/dev/null || echo "2") + local web_concurrency=$(((num_cores * 2) + 1)) + + # generate secrets based on environment type + if [[ "${env_type}" == "ci" ]]; then + # CI: use predictable but acceptable secrets for ephemeral environments + content="${content//:your-specific-password@/:ci-postgres-pass@}" + content="${content//AWS_SECRET_ACCESS_KEY=/AWS_SECRET_ACCESS_KEY=ci-rustfs-secret}" + content="${content//CELERY_FLOWER_PASSWORD=/CELERY_FLOWER_PASSWORD=ci-flower-pass}" + content="${content//DJANGO_ADMIN_URL=/DJANGO_ADMIN_URL=ci-admin/}" + content="${content//DJANGO_SECRET_KEY=/DJANGO_SECRET_KEY=ci-django-secret-key-insecure-for-testing-only}" + content="${content//OPENSEARCH_INITIAL_ADMIN_PASSWORD=/OPENSEARCH_INITIAL_ADMIN_PASSWORD=CiAdmin123!}" + content="${content//OPENSEARCH_PASSWORD=/OPENSEARCH_PASSWORD=CiDjango123!}" + content="${content//POSTGRES_PASSWORD=your-specific-password/POSTGRES_PASSWORD=ci-postgres-pass}" + content="${content//SVI_SERVER_API_KEY=/SVI_SERVER_API_KEY=ci-svi-api-key-01234567890123456789abcde}" # 40 chars + else + # local/production: generate random secure secrets + local django_secret_key django_admin_url flower_pass postgres_pass opensearch_admin_pass opensearch_user_pass svi_api_key + django_secret_key=$(generate_django_secret_key) + django_admin_url="$(generate_secret 16)/" + flower_pass=$(generate_secret 32) + postgres_pass=$(generate_secret 32) + opensearch_admin_pass=$(generate_secret 32) + opensearch_user_pass=$(generate_secret 32) + svi_api_key=$(generate_secret 40) + + content="${content//:your-specific-password@/:${postgres_pass}@}" + 
content="${content//AWS_SECRET_ACCESS_KEY=/AWS_SECRET_ACCESS_KEY=${PRIMARY_SECRET_ACCESS_KEY}}" + content="${content//CELERY_FLOWER_PASSWORD=/CELERY_FLOWER_PASSWORD=${flower_pass}}" + content="${content//DJANGO_ADMIN_URL=/DJANGO_ADMIN_URL=${django_admin_url}}" + content="${content//DJANGO_SECRET_KEY=/DJANGO_SECRET_KEY=${django_secret_key}}" + content="${content//OPENSEARCH_INITIAL_ADMIN_PASSWORD=/OPENSEARCH_INITIAL_ADMIN_PASSWORD=${opensearch_admin_pass}}" + content="${content//OPENSEARCH_PASSWORD=/OPENSEARCH_PASSWORD=${opensearch_user_pass}}" + content="${content//POSTGRES_PASSWORD=your-specific-password/POSTGRES_PASSWORD=${postgres_pass}}" + content="${content//SVI_SERVER_API_KEY=/SVI_SERVER_API_KEY=${svi_api_key}}" + fi + + # set WEB_CONCURRENCY based on CPU cores (applies to all environments) + content="${content//WEB_CONCURRENCY=4/WEB_CONCURRENCY=${web_concurrency}}" + + if [[ "${filename}" == "storage.env" ]]; then + # PRIMARY vars + content="${content//PRIMARY_ACCESS_KEY_ID=admin/PRIMARY_ACCESS_KEY_ID=${PRIMARY_ACCESS_KEY_ID}}" + content="${content//PRIMARY_S3_ENDPOINT_URL=http:\/\/sds-gateway-local-rustfs:9000/PRIMARY_S3_ENDPOINT_URL=${PRIMARY_S3_ENDPOINT_URL}}" + content="${content//PRIMARY_SECRET_ACCESS_KEY=admin/PRIMARY_SECRET_ACCESS_KEY=${PRIMARY_SECRET_ACCESS_KEY}}" + content="${content//PRIMARY_ENDPOINT_URL=sds-gateway-local-rustfs:9000/PRIMARY_ENDPOINT_URL=${PRIMARY_ENDPOINT_URL}}" + + # SECONDARY vars (local only — SeaweedFS) + if [[ -n "${SECONDARY_ENDPOINT_URL:-}" ]]; then + content="${content//SECONDARY_ACCESS_KEY_ID=admin/SECONDARY_ACCESS_KEY_ID=${SECONDARY_ACCESS_KEY_ID}}" + content="${content//SECONDARY_S3_ENDPOINT_URL=http:\/\/sds-gateway-local-sfs-s3:8333/SECONDARY_S3_ENDPOINT_URL=${SECONDARY_S3_ENDPOINT_URL}}" + content="${content//SECONDARY_SECRET_ACCESS_KEY=admin/SECONDARY_SECRET_ACCESS_KEY=${SECONDARY_SECRET_ACCESS_KEY}}" + content="${content//SECONDARY_ENDPOINT_URL=sds-gateway-local-sfs-s3:8333/SECONDARY_ENDPOINT_URL=${SECONDARY_ENDPOINT_URL}}" + fi + + # deprecated: + # content="${content//AWS_ACCESS_KEY_ID=admin/AWS_ACCESS_KEY_ID=${PRIMARY_ACCESS_KEY_ID}}" + # content="${content//AWS_SECRET_ACCESS_KEY=admin/AWS_SECRET_ACCESS_KEY=${PRIMARY_SECRET_ACCESS_KEY}}" + fi + + if [[ "${filename}" == "storage.prod.env" ]]; then + # PRIMARY (SeaweedFS) vars + content="${content//PRIMARY_ACCESS_KEY_ID=admin/PRIMARY_ACCESS_KEY_ID=${PRIMARY_ACCESS_KEY_ID}}" + content="${content//PRIMARY_S3_ENDPOINT_URL=http:\/\/sds-gateway-prod-sfs-s3:8333/PRIMARY_S3_ENDPOINT_URL=${PRIMARY_S3_ENDPOINT_URL}}" + content="${content//PRIMARY_SECRET_ACCESS_KEY=admin/PRIMARY_SECRET_ACCESS_KEY=${PRIMARY_SECRET_ACCESS_KEY}}" + content="${content//PRIMARY_ENDPOINT_URL=sds-gateway-prod-sfs-s3:8333/PRIMARY_ENDPOINT_URL=${PRIMARY_ENDPOINT_URL}}" + # SECONDARY (RustFS) vars + content="${content//SECONDARY_ACCESS_KEY_ID=minioadmin/SECONDARY_ACCESS_KEY_ID=${SECONDARY_ACCESS_KEY_ID}}" + content="${content//SECONDARY_ROOT_USER=minioadmin/SECONDARY_ROOT_USER=${SECONDARY_ROOT_USER}}" + if [[ -n "${SECONDARY_ROOT_PASSWORD}" ]]; then + content="${content//SECONDARY_ROOT_PASSWORD=/SECONDARY_ROOT_PASSWORD=${SECONDARY_ROOT_PASSWORD}}" + content="${content//SECONDARY_SECRET_ACCESS_KEY=/SECONDARY_SECRET_ACCESS_KEY=${SECONDARY_SECRET_ACCESS_KEY}}" + fi + + # deprecated / unused env vars safe to rename in your .env files: + + # AWS_ACCESS_KEY_ID -> PRIMARY_ACCESS_KEY_ID and SECONDARY_ACCESS_KEY_ID + # AWS_SECRET_ACCESS_KEY -> PRIMARY_SECRET_ACCESS_KEY and SECONDARY_SECRET_ACCESS_KEY + # MINIO_ROOT_PASSWORD -> 
removed: MinIO is not used anymore + # MINIO_SECRET_ACCESS_KEY -> removed: MinIO is not used anymore + # RUSTFS_ACCESS_KEY_ID -> PRIMARY_ACCESS_KEY_ID or SECONDARY_ACCESS_KEY_ID depending on your setup + # RUSTFS_ROOT_PASSWORD -> PRIMARY_SECRET_ACCESS_KEY or SECONDARY_ROOT_PASSWORD depending on your setup + # RUSTFS_ROOT_USER -> PRIMARY_ROOT_USER or SECONDARY_ROOT_USER depending on your setup + # RUSTFS_SECRET_ACCESS_KEY -> PRIMARY_SECRET_ACCESS_KEY or SECONDARY_SECRET_ACCESS_KEY depending on your setup + + # content="${content//AWS_ACCESS_KEY_ID=admin/AWS_ACCESS_KEY_ID=${PRIMARY_ACCESS_KEY_ID}}" + # content="${content//AWS_SECRET_ACCESS_KEY=admin/AWS_SECRET_ACCESS_KEY=${PRIMARY_SECRET_ACCESS_KEY}}" + # content="${content//MINIO_ROOT_PASSWORD=/MINIO_ROOT_PASSWORD=${SECONDARY_ROOT_PASSWORD}}" + # content="${content//MINIO_SECRET_ACCESS_KEY=/MINIO_SECRET_ACCESS_KEY=${SECONDARY_SECRET_ACCESS_KEY}}" + # content="${content//RUSTFS_ACCESS_KEY_ID=minioadmin/RUSTFS_ACCESS_KEY_ID=${SECONDARY_ACCESS_KEY_ID}}" + # content="${content//RUSTFS_ROOT_PASSWORD=/RUSTFS_ROOT_PASSWORD=${SECONDARY_ROOT_PASSWORD}}" + # content="${content//RUSTFS_ROOT_USER=minioadmin/RUSTFS_ROOT_USER=${SECONDARY_ROOT_USER}}" + # content="${content//RUSTFS_SECRET_ACCESS_KEY=/RUSTFS_SECRET_ACCESS_KEY=${SECONDARY_SECRET_ACCESS_KEY}}" + fi + + # write to output + mkdir -p "$(dirname "${output}")" + echo "${content}" >"${output}" + chmod 600 "${output}" +} + +function set_permissions() { + declare -a env_dirs + env_dirs=( + "${GATEWAY_ROOT}/.envs" + "${SFS_ROOT}/.envs" + ) + for dir in "${env_dirs[@]}"; do + if [ -d "${dir}" ]; then + find "${dir}" -type f -name "*.env" -exec chmod --changes 600 {} \; + fi + done +} + +function main() { + local force="false" + local env_type="" + + # parse arguments + while [[ $# -gt 0 ]]; do + case "$1" in + -f | --force) + force="true" + shift + ;; + -h | --help) + usage + ;; + local | production | ci) + env_type="$1" + shift + ;; + *) + echo "ERROR: Unknown argument: $1" >&2 + usage + ;; + esac + done + + if [[ -z "${env_type}" ]]; then + echo "ERROR: Environment type required (local, production, or ci)" >&2 + usage + fi + + echo "🔐 Generating secrets for '${env_type}' environment..." 
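+    # generated files are written under the gateway's .envs/ tree only; the
+    # seaweedfs .envs/ tree is just re-permissioned by set_permissions below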
+ + local target_dir_gwy="${GATEWAY_ROOT}/.envs/${env_type}" + + # process each env file from examples + for template in "${EXAMPLE_DIR}"/*.env; do + local filename + filename=$(basename "${template}") + + # skip production-specific example files for non-production envs + if [[ "${filename}" == *.prod-example.env ]]; then + if [[ "${env_type}" == "production" ]]; then + # use prod-example for production django.env + if [[ "${filename}" == "django.prod-example.env" ]]; then + process_env_file "${template}" "${target_dir_gwy}/django.env" "${env_type}" "${force}" + fi + fi + continue + fi + + # skip regular django.env for production (we use prod-example instead) + if [[ "${env_type}" == "production" && "${filename}" == "django.env" ]]; then + continue + fi + + # skip storage.prod.env for local/CI + if [[ "${env_type}" != "production" && "${filename}" == "storage.prod.env" ]]; then + continue + fi + + local output="${target_dir_gwy}/${filename}" + process_env_file "${template}" "${output}" "${env_type}" "${force}" + done + + set_permissions + + echo "" + echo "✅ Secrets generated successfully in ${target_dir_gwy}/" + echo "" + echo "Next steps:" + if [[ "${env_type}" == "ci" ]]; then + echo " - Review generated secrets (safe for ephemeral CI usage)" + else + echo " - Review and customize ${target_dir_gwy}/*.env as needed" + echo " - Set additional optional vars (AUTH0, SENTRY, etc.)" + fi + echo " - Use 'just env' to check the environment setup" + echo " - Use 'just up' to start the stack" } main "$@" diff --git a/gateway/sds_gateway/api_methods/management/commands/create_storage_buckets.py b/gateway/sds_gateway/api_methods/management/commands/create_storage_buckets.py new file mode 100644 index 000000000..20b2a358b --- /dev/null +++ b/gateway/sds_gateway/api_methods/management/commands/create_storage_buckets.py @@ -0,0 +1,58 @@ +"""Management command to create/ensure buckets exist on configured object stores.""" + +from django.conf import settings +from django.core.management.base import BaseCommand +from loguru import logger as log + +from sds_gateway.api_methods.utils.minio_client import _build_minio_client + + +class Command(BaseCommand): + """Create or ensure buckets exist on primary and optional secondary stores.""" + + help = "Create/ensure buckets exist on configured object stores" + + def handle(self, *args, **options) -> None: + """Execute the command.""" + # Primary store (required) + primary_client = _build_minio_client( + endpoint=settings.PRIMARY_ENDPOINT_URL, + access_key=settings.PRIMARY_ACCESS_KEY_ID, + secret_key=settings.PRIMARY_SECRET_ACCESS_KEY, + secure=settings.PRIMARY_STORAGE_USE_HTTPS, + ) + self._ensure_bucket(primary_client, settings.PRIMARY_STORAGE_BUCKET_NAME) + + # Secondary store (optional — may be unreachable) + # Skip entirely if access key is still the LEGACY fallback default; + # that means no secondary was ever configured for this environment. 
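+        # (LEGACY_AWS_ACCESS_KEY_ID is assumed here to be the old
+        # AWS_ACCESS_KEY_ID default that SECONDARY_ACCESS_KEY_ID falls back
+        # to when no secondary credentials are configured.)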
+ if settings.SECONDARY_ACCESS_KEY_ID == settings.LEGACY_AWS_ACCESS_KEY_ID: + log.info( + "Secondary object store not configured (LEGACY fallback creds), " + "skipping" + ) + else: + try: + secondary_client = _build_minio_client( + endpoint=settings.SECONDARY_ENDPOINT_URL, + access_key=settings.SECONDARY_ACCESS_KEY_ID, + secret_key=settings.SECONDARY_SECRET_ACCESS_KEY, + secure=settings.SECONDARY_STORAGE_USE_HTTPS, + ) + self._ensure_bucket( + secondary_client, settings.SECONDARY_STORAGE_BUCKET_NAME + ) + except Exception as exc: # noqa: BLE001 + log.warning( + "Secondary object store unreachable or bucket creation failed: {}", + exc, + ) + + def _ensure_bucket(self, client, bucket_name: str) -> None: + """Check if a bucket exists; create it if it does not.""" + if client.bucket_exists(bucket_name): + log.info("Bucket '{}' already exists", bucket_name) + return + + client.make_bucket(bucket_name) + log.success("Created bucket '{}'", bucket_name) diff --git a/gateway/sds_gateway/api_methods/tests/test_object_store_migration.py b/gateway/sds_gateway/api_methods/tests/test_object_store_migration.py new file mode 100644 index 000000000..189e37c30 --- /dev/null +++ b/gateway/sds_gateway/api_methods/tests/test_object_store_migration.py @@ -0,0 +1,361 @@ +"""Tests for object-store migration adapter and dual Django storage backend.""" + +# ruff: noqa: SLF001 +# pyright: reportPrivateUsage=false + +import logging +from unittest.mock import MagicMock + +import pytest +from django.core.files.base import ContentFile + +from sds_gateway.api_methods.utils.dual_object_store_storage import ( + DualObjectStoreS3Storage, +) +from sds_gateway.api_methods.utils.minio_client import ObjectStoreFacade + +EXPECTED_SIZE = 42 + + +class MissingObjectError(Exception): + """Test-only exception to simulate missing-object failures.""" + + code = "NoSuchKey" + + +def _configure_bucket_settings(settings) -> None: + settings.PRIMARY_STORAGE_BUCKET_NAME = "sfs-bucket" + settings.SECONDARY_STORAGE_BUCKET_NAME = "secondary-bucket" + + +def _build_storage_with_mocks( + *, + monkeypatch: pytest.MonkeyPatch, + settings, + primary_storage: MagicMock, + secondary_storage: MagicMock, + read_fallback_enabled: bool, + write_both_enabled: bool, + dual_write_strict: bool, +) -> DualObjectStoreS3Storage: + settings.OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED = read_fallback_enabled + settings.OBJECT_STORE_WRITE_BOTH_ENABLED = write_both_enabled + settings.OBJECT_STORE_DUAL_WRITE_STRICT = dual_write_strict + + backends = [primary_storage, secondary_storage] + + def _create_backend(_self, *, store_prefix: str): + _ = store_prefix + return backends.pop(0) + + monkeypatch.setattr(DualObjectStoreS3Storage, "_create_backend", _create_backend) + return DualObjectStoreS3Storage() + + +def test_adapter_read_falls_back_on_missing(settings) -> None: + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + expected_response = object() + primary_client.get_object.side_effect = MissingObjectError("missing") + secondary_client.get_object.return_value = expected_response + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=True, + write_both_enabled=False, + dual_write_strict=False, + ) + + result = facade.get_object(bucket_name="bucket", object_name="path/to/object") + + assert result is expected_response + secondary_client.get_object.assert_called_once_with( + bucket_name="secondary-bucket", + 
object_name="path/to/object", + ) + + +def test_adapter_does_not_fallback_on_non_missing_errors(settings) -> None: + """Only missing-object errors should trigger fallback when enabled, other errors + should raise immediately.""" + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + primary_client.get_object.side_effect = RuntimeError("boom") + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=True, + write_both_enabled=False, + dual_write_strict=False, + ) + + with pytest.raises(RuntimeError, match="boom"): + facade.get_object(bucket_name="bucket", object_name="path/to/object") + + secondary_client.get_object.assert_not_called() + + +def test_adapter_dual_write_non_strict_allows_secondary_failure(settings) -> None: + """In non-strict dual-write mode, secondary write failures should not raise and + should be logged.""" + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + primary_client.put_object.return_value = "primary-result" + secondary_client.put_object.side_effect = RuntimeError("secondary write failed") + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=False, + write_both_enabled=True, + dual_write_strict=False, + ) + + result = facade.put_object(bucket_name="bucket", object_name="path/to/object") + + assert result == "primary-result" + + +def test_adapter_dual_write_strict_raises_on_secondary_failure(settings) -> None: + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + primary_client.put_object.return_value = "primary-result" + secondary_client.put_object.side_effect = RuntimeError("secondary write failed") + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=False, + write_both_enabled=True, + dual_write_strict=True, + ) + + with pytest.raises(RuntimeError, match="secondary write failed"): + facade.put_object(bucket_name="bucket", object_name="path/to/object") + + +def test_adapter_maps_bucket_name_kwargs_per_store(settings) -> None: + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + primary_client.put_object.return_value = "primary-result" + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=False, + write_both_enabled=True, + dual_write_strict=False, + ) + + facade.put_object(bucket_name="caller-bucket", object_name="path/to/object") + + primary_client.put_object.assert_called_once_with( + bucket_name="sfs-bucket", + object_name="path/to/object", + ) + secondary_client.put_object.assert_called_once_with( + bucket_name="secondary-bucket", + object_name="path/to/object", + ) + + +def test_adapter_maps_bucket_name_positionally_per_store(settings) -> None: + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=False, + write_both_enabled=True, + dual_write_strict=False, + ) + + facade.remove_object("caller-bucket", "path/to/object") + + primary_client.remove_object.assert_called_once_with( + "sfs-bucket", + "path/to/object", + ) + 
secondary_client.remove_object.assert_called_once_with( + "secondary-bucket", + "path/to/object", + ) + + +def test_adapter_remove_object_is_strict_when_fallback_is_enabled(settings) -> None: + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + secondary_client.remove_object.side_effect = RuntimeError("secondary delete failed") + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=True, + write_both_enabled=False, + dual_write_strict=False, + ) + + with pytest.raises(RuntimeError, match="secondary delete failed"): + facade.remove_object(bucket_name="bucket", object_name="path/to/object") + + +def test_adapter_fallback_logging_redacts_object_key( + caplog: pytest.LogCaptureFixture, + settings, +) -> None: + _configure_bucket_settings(settings) + + primary_client = MagicMock() + secondary_client = MagicMock() + + full_key = "customers/acme-corp/private/export-2026-04-14.csv" + primary_client.get_object.side_effect = MissingObjectError("missing") + secondary_client.get_object.return_value = object() + + facade = ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=True, + write_both_enabled=False, + dual_write_strict=False, + ) + + with caplog.at_level( + logging.WARNING, + logger="sds_gateway.api_methods.utils.minio_client", + ): + facade.get_object(bucket_name="bucket", object_name=full_key) + + logged_messages = " ".join(record.getMessage() for record in caplog.records) + assert full_key not in logged_messages + assert "sha256=" in logged_messages + assert "len=" in logged_messages + + +def test_storage_open_falls_back_on_missing( + monkeypatch: pytest.MonkeyPatch, + settings, +) -> None: + primary_storage = MagicMock() + secondary_storage = MagicMock() + + expected_file = MagicMock() + primary_storage._open.side_effect = MissingObjectError("missing") + secondary_storage._open.return_value = expected_file + + storage = _build_storage_with_mocks( + monkeypatch=monkeypatch, + settings=settings, + primary_storage=primary_storage, + secondary_storage=secondary_storage, + read_fallback_enabled=True, + write_both_enabled=False, + dual_write_strict=False, + ) + + result = storage._open("path/to/object", mode="rb") + + assert result is expected_file + secondary_storage._open.assert_called_once_with("path/to/object", mode="rb") + + +def test_storage_save_dual_write_non_strict( + monkeypatch: pytest.MonkeyPatch, + settings, +) -> None: + primary_storage = MagicMock() + secondary_storage = MagicMock() + + primary_storage._save.return_value = "saved/name.bin" + secondary_storage._save.side_effect = RuntimeError("secondary save failed") + + storage = _build_storage_with_mocks( + monkeypatch=monkeypatch, + settings=settings, + primary_storage=primary_storage, + secondary_storage=secondary_storage, + read_fallback_enabled=False, + write_both_enabled=True, + dual_write_strict=False, + ) + + content = ContentFile(b"payload", name="name.bin") + saved_name = storage._save("name.bin", content) + + assert saved_name == "saved/name.bin" + secondary_storage._save.assert_called_once() + + +def test_storage_delete_is_strict_when_fallback_is_enabled( + monkeypatch: pytest.MonkeyPatch, + settings, +) -> None: + primary_storage = MagicMock() + secondary_storage = MagicMock() + + secondary_storage.delete.side_effect = RuntimeError("secondary delete failed") + + storage = _build_storage_with_mocks( + 
monkeypatch=monkeypatch,
+        settings=settings,
+        primary_storage=primary_storage,
+        secondary_storage=secondary_storage,
+        read_fallback_enabled=True,
+        write_both_enabled=False,
+        dual_write_strict=False,
+    )
+
+    with pytest.raises(RuntimeError, match="secondary delete failed"):
+        storage.delete("path/to/object")
+
+
+def test_storage_size_delegates_to_primary(
+    monkeypatch: pytest.MonkeyPatch,
+    settings,
+) -> None:
+    """DualObjectStoreS3Storage.size() must be implemented so Django's
+    FileField run_validation can read file size without raising
+    NotImplementedError."""
+    primary_storage = MagicMock()
+    secondary_storage = MagicMock()
+
+    primary_storage.size.return_value = EXPECTED_SIZE
+
+    storage = _build_storage_with_mocks(
+        monkeypatch=monkeypatch,
+        settings=settings,
+        primary_storage=primary_storage,
+        secondary_storage=secondary_storage,
+        read_fallback_enabled=False,
+        write_both_enabled=False,
+        dual_write_strict=False,
+    )
+
+    result = storage.size("path/to/object")
+
+    assert result == EXPECTED_SIZE
+    primary_storage.size.assert_called_once_with("path/to/object")
+    secondary_storage.size.assert_not_called()
diff --git a/gateway/sds_gateway/api_methods/utils/dual_object_store_storage.py b/gateway/sds_gateway/api_methods/utils/dual_object_store_storage.py
new file mode 100644
index 000000000..f58aba53e
--- /dev/null
+++ b/gateway/sds_gateway/api_methods/utils/dual_object_store_storage.py
@@ -0,0 +1,172 @@
+"""Dual-store Django storage backend for primary + secondary.
+
+Primary and secondary backends may be any S3-compatible object store, usually one of:
+- Primary: RustFS (local/CI), SeaweedFS (production), or MinIO (deprecated)
+- Secondary: RustFS, Garage, or MinIO (deprecated)
+
+The secondary backend is optional unless any of these settings is True:
+  - OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED
+  - OBJECT_STORE_WRITE_BOTH_ENABLED
+  - OBJECT_STORE_DUAL_WRITE_STRICT
+"""
+
+import hashlib
+import logging
+from typing import Any
+
+from django.conf import settings
+from django.core.files.base import ContentFile
+from django.core.files.base import File
+from django.core.files.storage import Storage
+from storages.backends.s3boto3 import S3Boto3Storage
+
+log = logging.getLogger(__name__)
+
+_MISSING_OBJECT_ERROR_CODES = {
+    "404",
+    "NoSuchBucket",
+    "NoSuchKey",
+    "NoSuchObject",
+    "NoSuchVersion",
+    "NotFound",
+}
+
+
+def _is_missing_object_error(error: Exception) -> bool:
+    """Return True when the error represents a missing object/bucket condition."""
+    error_code = str(getattr(error, "code", ""))
+    if error_code in _MISSING_OBJECT_ERROR_CODES:
+        return True
+
+    response = getattr(error, "response", None)
+    if isinstance(response, dict):
+        response_error = response.get("Error", {})
+        code = str(response_error.get("Code", ""))
+        if code in _MISSING_OBJECT_ERROR_CODES:
+            return True
+
+    status_code = str(getattr(error, "status", ""))
+    return status_code == "404"
+
+
+def _build_storage_options(store_prefix: str) -> dict[str, Any]:
+    """Build S3Boto3Storage options for a configured object store prefix."""
+    return {
+        "access_key": getattr(settings, f"{store_prefix}_ACCESS_KEY_ID"),
+        "secret_key": getattr(settings, f"{store_prefix}_SECRET_ACCESS_KEY"),
+        "bucket_name": getattr(settings, f"{store_prefix}_STORAGE_BUCKET_NAME"),
+        "endpoint_url": getattr(settings, f"{store_prefix}_S3_ENDPOINT_URL"),
+        "region_name": settings.AWS_S3_REGION_NAME,
+        "signature_version": settings.AWS_S3_SIGNATURE_VERSION,
+        "default_acl": settings.AWS_DEFAULT_ACL,
+        "file_overwrite": settings.AWS_S3_FILE_OVERWRITE,
+
} + + +def _safe_object_reference(name: str) -> str: + """Return a non-reversible identifier suitable for operational logs.""" + object_name_digest = hashlib.sha256(name.encode()).hexdigest()[:12] + return f"sha256={object_name_digest} len={len(name)}" + + +class DualObjectStoreS3Storage(Storage): + """Django storage backend with primary and fallback.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__() + self._primary_storage = self._create_backend(store_prefix="PRIMARY") + self._secondary_storage = self._create_backend(store_prefix="SECONDARY") + + def _create_backend(self, *, store_prefix: str) -> S3Boto3Storage: + """Create storage backend for a given settings prefix.""" + return S3Boto3Storage(**_build_storage_options(store_prefix=store_prefix)) + + def _clone_content(self, content: File[Any]) -> ContentFile[Any]: + """Clone content for secondary writes while preserving the primary stream.""" + if hasattr(content, "seek"): + content.seek(0) + payload = content.read() + if isinstance(payload, str): + payload = payload.encode() + if hasattr(content, "seek"): + content.seek(0) + + return ContentFile(payload, name=getattr(content, "name", None)) + + def _open(self, name: str, mode: str = "rb") -> File[Any]: + try: + return self._primary_storage._open(name, mode=mode) # pyright: ignore[reportPrivateUsage] # noqa: SLF001 + except Exception as error: + if not settings.OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED: + raise + if not _is_missing_object_error(error): + raise + + log.warning( + "Object %s not in primary storage, falling back to secondary", + _safe_object_reference(name), + ) + return self._secondary_storage._open(name, mode=mode) # pyright: ignore[reportPrivateUsage] # noqa: SLF001 + + def _save(self, name: str, content: File[Any]) -> str: + if not settings.OBJECT_STORE_WRITE_BOTH_ENABLED: + return self._primary_storage._save(name, content) # pyright: ignore[reportPrivateUsage] # noqa: SLF001 + + secondary_content = self._clone_content(content) + saved_name = self._primary_storage._save(name, content) # pyright: ignore[reportPrivateUsage] # noqa: SLF001 + + try: + self._secondary_storage._save(saved_name, secondary_content) # pyright: ignore[reportPrivateUsage] # noqa: SLF001 + except Exception: + if settings.OBJECT_STORE_DUAL_WRITE_STRICT: + raise + + log.exception( + "Secondary storage write failed in non-strict dual-write mode" + ) + + return saved_name + + def exists(self, name: str) -> bool: + if self._primary_storage.exists(name): + return True + + if settings.OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED: + return self._secondary_storage.exists(name) + + return False + + def delete(self, name: str) -> None: + self._primary_storage.delete(name) + if not ( + settings.OBJECT_STORE_WRITE_BOTH_ENABLED + or settings.OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED + ): + return + + try: + self._secondary_storage.delete(name) + except Exception: + if ( + settings.OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED + or settings.OBJECT_STORE_DUAL_WRITE_STRICT + ): + raise + + log.exception( + "Secondary storage delete failed in non-strict dual-write mode" + ) + + def size(self, name: str) -> int: + """Return the size of the file in the primary storage.""" + return self._primary_storage.size(name) + + def path(self, name: str) -> str: + """Return the absolute path of the file in the primary storage.""" + return self._primary_storage.path(name) # pyright: ignore[reportUnknownMemberType] + + def url(self, name: str) -> str: + return 
self._primary_storage.url(name)
+
+    def __getattr__(self, name: str) -> Any:
+        return getattr(self._primary_storage, name)
diff --git a/gateway/sds_gateway/api_methods/utils/minio_client.py b/gateway/sds_gateway/api_methods/utils/minio_client.py
index 3c1606926..74c0a416d 100644
--- a/gateway/sds_gateway/api_methods/utils/minio_client.py
+++ b/gateway/sds_gateway/api_methods/utils/minio_client.py
@@ -1,12 +1,291 @@
+"""Object storage client facade for SeaweedFS + MinIO migration."""
+
+import hashlib
+import logging
+from typing import Any
+from urllib.parse import urlparse
+
 from django.conf import settings
 from minio import Minio
 
+log = logging.getLogger(__name__)
+
+_MISSING_OBJECT_ERROR_CODES = {
+    "404",
+    "NoSuchBucket",
+    "NoSuchKey",
+    "NoSuchObject",
+    "NoSuchVersion",
+    "NotFound",
+}
+
+_BUCKET_NAME_POSITION = 0
+_OBJECT_NAME_POSITION = 1
+_BUCKET_AND_OBJECT_ARGUMENT_COUNT = 2
+
+
+def _is_missing_object_error(error: Exception) -> bool:
+    """Return True when the error represents a missing object/bucket condition."""
+    error_code = str(getattr(error, "code", ""))
+    if error_code in _MISSING_OBJECT_ERROR_CODES:
+        return True
+
+    status_code = str(getattr(error, "status", ""))
+    return status_code == "404"
+
+
+def _normalize_endpoint(endpoint: str) -> str:
+    """Convert an endpoint URL to the host:port format the MinIO client accepts."""
+    parsed_endpoint = urlparse(endpoint)
+    if parsed_endpoint.netloc:
+        return parsed_endpoint.netloc
+    return endpoint
+
+
+def _safe_object_reference(object_name: Any) -> str:
+    """Return a non-reversible identifier suitable for operational logs."""
+    object_name_text = str(object_name)
+    object_name_digest = hashlib.sha256(object_name_text.encode()).hexdigest()[:12]
+    return f"sha256={object_name_digest} len={len(object_name_text)}"
+
 
-def get_minio_client() -> Minio:
-    # Initialize MinIO client
+def _build_minio_client(
+    *,
+    endpoint: str,
+    access_key: str,
+    secret_key: str,
+    secure: bool,
+) -> Minio:
+    """Build a MinIO API-compatible client."""
     return Minio(
-        settings.MINIO_ENDPOINT_URL,
-        access_key=settings.AWS_ACCESS_KEY_ID,
-        secret_key=settings.AWS_SECRET_ACCESS_KEY,
-        secure=settings.MINIO_STORAGE_USE_HTTPS,
+        _normalize_endpoint(endpoint),
+        access_key=access_key,
+        secret_key=secret_key,
+        secure=secure,
+    )
+
+
+class ObjectStoreFacade:
+    """Facade exposing MinIO-compatible methods with primary/fallback behavior.
+
+    It encapsulates two storage clients (primary and secondary) and provides
+    methods that implement the desired read/write behavior based on
+    configuration flags. The facade also handles argument rewriting to target
+    the correct buckets for each store and provides safe object references
+    for logging.
+    """
+
+    def __init__(
+        self,
+        *,
+        primary_client: Minio,
+        secondary_client: Minio,
+        read_fallback_to_secondary_enabled: bool,
+        write_both_enabled: bool,
+        dual_write_strict: bool,
+    ) -> None:
+        """Initialize the ObjectStoreFacade with the given clients and behavior flags.
+
+        Args:
+            primary_client: MinIO-compatible client for the primary object
+                store (RustFS in local/CI, SeaweedFS in production).
+            secondary_client: MinIO-compatible client for the secondary object
+                store (RustFS, Garage, or MinIO).
+            read_fallback_to_secondary_enabled: Whether to fall back to the
+                secondary store when the primary reports a missing object.
+            write_both_enabled: Whether to perform writes on both stores.
+            dual_write_strict: Require both writes to succeed; raise if the
+                secondary write fails.
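+                When False, secondary write failures are logged, not raised.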
+ """ + self._primary_client = primary_client + self._secondary_client = secondary_client + self._read_fallback_to_secondary_enabled = read_fallback_to_secondary_enabled + self._write_both_enabled = write_both_enabled + self._dual_write_strict = dual_write_strict + + def _rewrite_bucket_name( + self, + bucket_name: str, + *args: Any, + **kwargs: Any, + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + """Return arguments rewritten for the target store bucket.""" + rewritten_args = list(args) + rewritten_kwargs = dict(kwargs) + + if "bucket_name" in rewritten_kwargs or not rewritten_args: + rewritten_kwargs["bucket_name"] = bucket_name + else: + rewritten_args[0] = bucket_name + + return tuple(rewritten_args), rewritten_kwargs + + def _primary_call_arguments( + self, + *args: Any, + **kwargs: Any, + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + """Build call arguments targeting the primary object-store bucket.""" + kwargs.pop("bucket_name", None) + return self._rewrite_bucket_name( + settings.PRIMARY_STORAGE_BUCKET_NAME, + *args, + **kwargs, + ) + + def _secondary_call_arguments( + self, + *args: Any, + **kwargs: Any, + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + """Build call arguments targeting the secondary object-store bucket.""" + kwargs.pop("bucket_name", None) + return self._rewrite_bucket_name( + settings.SECONDARY_STORAGE_BUCKET_NAME, + *args, + **kwargs, + ) + + def _object_reference(self, *args: Any, **kwargs: Any) -> str: + """Return a safe object identifier for logs.""" + object_name = kwargs.get("object_name") + if object_name is None: + if len(args) >= _BUCKET_AND_OBJECT_ARGUMENT_COUNT: + object_name = args[_OBJECT_NAME_POSITION] + elif args and "bucket_name" not in kwargs: + object_name = args[_BUCKET_NAME_POSITION] + else: + object_name = "unknown" + + return _safe_object_reference(object_name) + + def _read_with_optional_fallback( + self, + method_name: str, + *args: Any, + **kwargs: Any, + ) -> Any: + primary_method = getattr(self._primary_client, method_name) + primary_args, primary_kwargs = self._primary_call_arguments(*args, **kwargs) + try: + return primary_method(*primary_args, **primary_kwargs) + except Exception as error: + if not self._read_fallback_to_secondary_enabled: + raise + if not _is_missing_object_error(error): + raise + + log.warning( + "Object %s not found in primary store, falling back to secondary", + self._object_reference(*args, **kwargs), + ) + secondary_method = getattr(self._secondary_client, method_name) + secondary_args, secondary_kwargs = self._secondary_call_arguments( + *args, + **kwargs, + ) + return secondary_method(*secondary_args, **secondary_kwargs) + + def _write_with_optional_dual_write( + self, + method_name: str, + *args: Any, + **kwargs: Any, + ) -> Any: + primary_method = getattr(self._primary_client, method_name) + primary_args, primary_kwargs = self._primary_call_arguments(*args, **kwargs) + primary_result = primary_method(*primary_args, **primary_kwargs) + + if not self._write_both_enabled: + return primary_result + + secondary_method = getattr(self._secondary_client, method_name) + secondary_args, secondary_kwargs = self._secondary_call_arguments( + *args, + **kwargs, + ) + try: + secondary_method(*secondary_args, **secondary_kwargs) + except Exception: + if self._dual_write_strict: + raise + + log.exception( + "Secondary object-store write failed in non-strict dual-write mode" + ) + + return primary_result + + def _delete_from_both_stores(self, *args: Any, **kwargs: Any) -> Any: + """Delete from primary and, when needed, 
from secondary store too.""" + primary_args, primary_kwargs = self._primary_call_arguments(*args, **kwargs) + primary_result = self._primary_client.remove_object( + *primary_args, + **primary_kwargs, + ) + + if not (self._write_both_enabled or self._read_fallback_to_secondary_enabled): + return primary_result + + secondary_args, secondary_kwargs = self._secondary_call_arguments( + *args, + **kwargs, + ) + try: + self._secondary_client.remove_object(*secondary_args, **secondary_kwargs) + except Exception: + if self._read_fallback_to_secondary_enabled or self._dual_write_strict: + raise + + log.exception( + "Secondary object-store delete failed in non-strict dual-write mode" + ) + + return primary_result + + def get_object(self, *args: Any, **kwargs: Any) -> Any: + """Get object stream from primary store with optional fallback.""" + return self._read_with_optional_fallback("get_object", *args, **kwargs) + + def fget_object(self, *args: Any, **kwargs: Any) -> Any: + """Download object to local file from primary store with optional fallback.""" + return self._read_with_optional_fallback("fget_object", *args, **kwargs) + + def put_object(self, *args: Any, **kwargs: Any) -> Any: + """Upload object from stream with optional dual-write behavior.""" + return self._write_with_optional_dual_write("put_object", *args, **kwargs) + + def fput_object(self, *args: Any, **kwargs: Any) -> Any: + """Upload object from local file with optional dual-write behavior.""" + return self._write_with_optional_dual_write("fput_object", *args, **kwargs) + + def remove_object(self, *args: Any, **kwargs: Any) -> Any: + """Remove object from primary store with optional dual-write behavior.""" + return self._delete_from_both_stores(*args, **kwargs) + + def __getattr__(self, name: str) -> Any: + """Delegate unknown methods to the primary client for compatibility.""" + return getattr(self._primary_client, name) + + +def get_minio_client() -> ObjectStoreFacade: + """Return migration-aware object store facade while keeping API name stable.""" + primary_client = _build_minio_client( + endpoint=settings.PRIMARY_ENDPOINT_URL, + access_key=settings.PRIMARY_ACCESS_KEY_ID, + secret_key=settings.PRIMARY_SECRET_ACCESS_KEY, + secure=settings.PRIMARY_STORAGE_USE_HTTPS, + ) + secondary_client = _build_minio_client( + endpoint=settings.SECONDARY_ENDPOINT_URL, + access_key=settings.SECONDARY_ACCESS_KEY_ID, + secret_key=settings.SECONDARY_SECRET_ACCESS_KEY, + secure=settings.SECONDARY_STORAGE_USE_HTTPS, + ) + + return ObjectStoreFacade( + primary_client=primary_client, + secondary_client=secondary_client, + read_fallback_to_secondary_enabled=( + settings.OBJECT_STORE_READ_FALLBACK_TO_SECONDARY_ENABLED + ), + write_both_enabled=settings.OBJECT_STORE_WRITE_BOTH_ENABLED, + dual_write_strict=settings.OBJECT_STORE_DUAL_WRITE_STRICT, ) diff --git a/gateway/sds_gateway/monitoring/services.py b/gateway/sds_gateway/monitoring/services.py index 8600234c6..edce635c9 100644 --- a/gateway/sds_gateway/monitoring/services.py +++ b/gateway/sds_gateway/monitoring/services.py @@ -50,21 +50,25 @@ def _split_host_port(endpoint: str, *, default_port: int) -> tuple[str, int]: def get_default_service_definitions() -> list[ServiceDefinition]: services: list[ServiceDefinition] = [] - sfs_endpoint = getattr(settings, "SFS_ENDPOINT_URL", None) - if sfs_endpoint is not None: - sfs_host, sfs_port = _split_host_port(sfs_endpoint, default_port=8333) + primary_endpoint = getattr(settings, "PRIMARY_ENDPOINT_URL", None) + if primary_endpoint is not None: + primary_host, 
primary_port = _split_host_port( + primary_endpoint, default_port=9000 + ) services.append( ServiceDefinition( - name="seaweedfs", kind="tcp", host=sfs_host, port=sfs_port + name="primary-storage", kind="tcp", host=primary_host, port=primary_port ) ) - minio_endpoint = getattr(settings, "MINIO_ENDPOINT_URL", None) - if minio_endpoint is not None: - minio_host, minio_port = _split_host_port(minio_endpoint, default_port=9000) + secondary_endpoint = getattr(settings, "SECONDARY_ENDPOINT_URL", None) + if secondary_endpoint is not None: + secondary_host, secondary_port = _split_host_port( + secondary_endpoint, default_port=9000 + ) services.append( ServiceDefinition( - name="minio", kind="tcp", host=minio_host, port=minio_port + name="secondary", kind="tcp", host=secondary_host, port=secondary_port ) ) diff --git a/jupyter/docs/agents.md b/jupyter/docs/agents.md new file mode 100644 index 000000000..c8ad22479 --- /dev/null +++ b/jupyter/docs/agents.md @@ -0,0 +1,65 @@ +# JupyterHub Agent Documentation + +## Purpose + +JupyterHub deployment for SDS: spawns per-user notebook containers with spectrumx SDK access via custom Docker spawner. + +## Architecture + +- **Base image**: `quay.io/jupyterhub/jupyterhub:` (JUPYTERHUB_VERSION arg) +- **Spawner**: Custom `MyDockerSpawner` → `dockerspawner.DockerSpawner` subclass +- **Auth**: Auth0OAuthenticator in prod; `DummyAuthenticator(admin=admin)` locally +- **Notebook image**: `quay.io/jupyter/base-notebook:latest` (DOCKER_NOTEBOOK_IMAGE env) +- **Lab interface**: JupyterLab via `jupyter-labhub` command + `JUPYTER_ENABLE_LAB=yes` +- **Idle culling**: `jupyterhub-idle-culler` service +- **DB**: SQLite at `/data/jupyterhub.sqlite` +- **Cookie secret**: Generated on build, stored at `/data/jupyterhub_cookie_secret` (600 perms) + +## Key Configuration (`jupyterhub_config.py`) + +- `hub_connect_ip` → container name (env-driven) +- `hub_ip/port` → bound to container interface +- `notebook_dir` → `/home/jovyan/work` +- All other settings (limits, timeouts, active_server_limit, cpu/mem limits) are environment-specific and vary by deployment + +### MyDockerSpawner overrides + +- Sets `CHOWN_HOME=yes`, `CHOWN_HOME_OPTS=-R`, `NB_GROUP=nb_users` +- Post-start: `pip install ipywidgets spectrumx` +- Network prefix: `sds-jupyter-local_` + `DOCKER_NETWORK_NAME` +- Volume mounts: `{username}` named volume → `/home/jovyan/work`; `sample_scripts/` → `/home/jovyan/work/sample_scripts` (ro) +- Prefix for user containers: `sds-jupyter-user` + +Docker socket `/var/run/docker.sock` bind-mounted ro into hub (but `sudo` granted for chown/chmod). 
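+
+A minimal sketch of the spawner wiring described above (the trait values here
+are illustrative assumptions, not the deployed config; `post_start_cmd` is a
+`dockerspawner` hook that runs a command in the container after start):
+
+```python
+from dockerspawner import DockerSpawner
+
+
+class MyDockerSpawner(DockerSpawner):
+    """Spawner that fixes home-dir ownership and installs the SDK."""
+
+    def get_env(self):
+        env = super().get_env()
+        # Ask the jupyter base image entrypoint to recursively chown /home/jovyan.
+        env.update(CHOWN_HOME="yes", CHOWN_HOME_OPTS="-R", NB_GROUP="nb_users")
+        return env
+
+
+# In jupyterhub_config.py, where `c` is the config object:
+c.JupyterHub.spawner_class = MyDockerSpawner
+c.DockerSpawner.notebook_dir = "/home/jovyan/work"
+# Runs inside the user container right after it starts.
+c.DockerSpawner.post_start_cmd = "pip install ipywidgets spectrumx"
+```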
+ +## Deployment + +- Local compose: `compose.local.yaml` +- Prod compose: `compose.production.yaml` +- Hub service image: `sds-jupyter-local`, port `8888:8000` (Traefik reverse proxy) +- Traefik labels configured for `/notebook` prefix strip on `sds-dev.crc.nd.edu` +- Env file: `.envs/local/jupyterhub.env` +- Networks: `sds-jupyter-local-net-clients` (bridge, alias `jupyterhub`) + +## Directory Structure + +- `compose/local/` → local dev compose files + Dockerfile +- `compose/production/` → prod compose files + Dockerfile + jupyterhub_config override +- `scripts/` → deployment utilities (`env-selection.sh`, `prod-hostnames.env`) +- `.envs/local/` → local env vars +- `.envs/example/` → env var template + +## Key Files + +| Path | Purpose | +|--|-| +| `compose.local.yaml` | Local compose stack definition | +| `compose.production.yaml` | Production compose stack | +| `compose/local/jupyter/Dockerfile` | Hub image build — installs docker.io, sudo, curl; creates users/groups | +| `compose/production/jupyter/Dockerfile` | Prod hub Dockerfile (same base + chown fix) | +| `compose/local/jupyter/jupyterhub_config.py` | Local dev Hub config + spawner override | +| `compose/production/jupyter/jupyterhub_config.py` | Prod-specific Hub config override | +| `scripts/env-selection.sh` | Staging env file selector (local vs prod) | +| `scripts/prod-hostnames.env` | Production hostname overrides | +| `.envs/local/jupyterhub.env` | Local environment variables | +| `.envs/example/jupyterhub.env` | Template for all required env vars | diff --git a/jupyter/scripts/env-selection.sh b/jupyter/scripts/env-selection.sh index 36861829d..86ad64004 100755 --- a/jupyter/scripts/env-selection.sh +++ b/jupyter/scripts/env-selection.sh @@ -6,99 +6,109 @@ script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) prod_hosts_file="${script_dir}/prod-hostnames.env" is_production_host() { - local host - host=$(hostname) - - if [[ ! -f "${prod_hosts_file}" ]]; then - return 1 - fi - - while read -r line || [[ -n "${line}" ]]; do - line=$(echo "${line}" | xargs) - [[ -z "${line}" || ${line:0:1} == '#' ]] && continue - if [[ "${line}" == "${host}" ]]; then - return 0 - fi - done < "${prod_hosts_file}" - - return 1 + local host + host=$(hostname) + + if [[ ! -f "${prod_hosts_file}" ]]; then + return 1 + fi + + while read -r line || [[ -n "${line}" ]]; do + line=$(echo "${line}" | xargs) + [[ -z "${line}" || ${line:0:1} == '#' ]] && continue + if [[ "${line}" == "${host}" ]]; then + return 0 + fi + done <"${prod_hosts_file}" + + return 1 } get_target_value() { - local target=$1 - local is_prod=$2 - - local local_env_file=".envs/local/jupyterhub.env" - local production_env_file=".envs/production/jupyterhub.env" - - local output - - case "${target}" in - env) - if [[ "${is_prod}" == true ]]; then - output='production' - else - output='local' - fi - ;; - compose_file) - if [[ "${is_prod}" == true ]]; then - output='compose.production.yaml' - else - output='compose.local.yaml' - fi - ;; - env_file) - if [[ "${is_prod}" == true ]]; then - output="${production_env_file}" - else - output="${local_env_file}" - fi - ;; - client_network) - if [[ "${is_prod}" == true ]]; then - output='sds-jupyter-prod-net-clients' - else - output='sds-jupyter-local-net-clients' - fi - ;; - compose_project_name) - if [[ "${is_prod}" == true ]]; then - output='sds-jupyter-prod' - else - output='sds-jupyter-local' - fi - ;; - *) - printf 'unsupported target: %s\n' "${target}" >&2 - exit 1 - ;; - esac - - if [[ "${target}" == "compose_file" && ! 
-f "${output}" ]]; then - printf '\033[31mERROR: selected compose file "%s" does not exist\033[0m\n' "${output}" >&2 - fi - if [[ "${target}" == "env_file" && ! -f "${output}" ]]; then - printf '\033[31mERROR: selected env file "%s" does not exist\033[0m\n' "${output}" >&2 - fi - - printf '%s\n' "${output}" + local target=$1 + local is_prod=$2 + + local local_env_file=".envs/local/jupyterhub.env" + local production_env_file=".envs/production/jupyterhub.env" + + local output + + case "${target}" in + env) + if [[ "${is_prod}" == true ]]; then + output='production' + else + output='local' + fi + ;; + compose_file) + if [[ "${is_prod}" == true ]]; then + output='compose.production.yaml' + else + output='compose.local.yaml' + fi + ;; + env_file) + if [[ "${is_prod}" == true ]]; then + output="${production_env_file}" + else + output="${local_env_file}" + fi + ;; + client_network) + if [[ "${is_prod}" == true ]]; then + output='sds-jupyter-prod-net-clients' + else + output='sds-jupyter-local-net-clients' + fi + ;; + compose_project_name) + if [[ "${is_prod}" == true ]]; then + output='sds-jupyter-prod' + else + output='sds-jupyter-local' + fi + ;; + *) + printf 'unsupported target: %s\n' "${target}" >&2 + exit 1 + ;; + esac + + if [[ "${target}" == "compose_file" && ! -f "${output}" ]]; then + printf '\033[31mERROR: selected compose file "%s" does not exist\033[0m\n' "${output}" >&2 + fi + if [[ "${target}" == "env_file" && ! -f "${output}" ]]; then + printf '\033[31mERROR: selected env file "%s" does not exist\033[0m\n' "${output}" >&2 + fi + + printf '%s\n' "${output}" } main() { - if [[ $# -ne 1 ]]; then - printf 'usage: %s \n' "$0" >&2 - exit 1 - fi - - local target=$1 - local is_prod=false - - if is_production_host; then - is_prod=true - fi - - get_target_value "${target}" "${is_prod}" + if [[ $# -ne 1 ]]; then + printf 'usage: %s \n' "$0" >&2 + exit 1 + fi + + local target=$1 + local is_prod=false + + # allow explicit override via SDS_ENV (e.g., SDS_ENV=prod just env) + if [[ -n "${SDS_ENV:-}" ]]; then + case "${SDS_ENV}" in + local) is_prod=false ;; + prod | production) is_prod=true ;; + *) + printf '\033[33mUnknown SDS_ENV="%s": must be local, prod, or production\033[0m\n' "${SDS_ENV}" >&2 + exit 1 + ;; + esac + elif is_production_host; then + is_prod=true + fi + + get_target_value "${target}" "${is_prod}" } main "$@" diff --git a/sdk/README.md b/sdk/README.md index f1f086902..5070c03d7 100644 --- a/sdk/README.md +++ b/sdk/README.md @@ -121,7 +121,7 @@ components, create a test user, and set up the integration test environment: 2. Follow the Gateway instructions in the [Gateway README](../gateway/README.md); In summary: 1. Deploy the Docker Compose stack; - 2. Create a MinIO user and bucket with same credentials as in `minio.env`; + 1. Create a storage user and bucket with same credentials as in `storage.env`; 3. Create a test user and API key: 1. Create a Gateway superuser and a regular user (they may be the same); 2. 
Enable their `is_approved` flag in the [admin diff --git a/sdk/config/nginx/nginx.conf b/sdk/config/nginx/nginx.conf index d6f74b54d..54dff35d1 100644 --- a/sdk/config/nginx/nginx.conf +++ b/sdk/config/nginx/nginx.conf @@ -47,7 +47,7 @@ http { # Cache configuration for static assets location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { - expires 30d; + expires 1d; add_header Cache-Control "public, immutable"; } } @@ -58,6 +58,13 @@ http { expires -1; add_header Cache-Control "no-cache, no-store, must-revalidate"; } + + # health check endpoint + location = /healthz { + access_log off; + return 200 'OK'; + } + } # end server } # end http diff --git a/sds-code.code-workspace b/sds-code.code-workspace index 04bdb837c..54231c410 100644 --- a/sds-code.code-workspace +++ b/sds-code.code-workspace @@ -36,6 +36,10 @@ "name": "jupyter", "path": "./jupyter" }, + { + "name": "seaweedfs", + "path": "./seaweedfs" + }, ], "settings": { "[python]": { diff --git a/seaweedfs/.envs/example/seaweedfs.env b/seaweedfs/.envs/example/seaweedfs.env new file mode 100644 index 000000000..2d194af88 --- /dev/null +++ b/seaweedfs/.envs/example/seaweedfs.env @@ -0,0 +1,53 @@ +# ───────────────────────────────────────────────────────── +# SeaweedFS Example Environment Variables +# ───────────────────────────────────────────────────────── +# Copy this to .envs//sfs.env and fill in secrets. +# Never commit .env files to git. +# +# Generate secrets: +# JWT_SIGNING_KEY=$(openssl rand -hex 32) +# JWT_FILER_SIGNING_KEY=$(openssl rand -hex 32) +# S3_SSE_KEK=$(openssl rand -hex 32) +# GRAFANA_PASSWORD= + +# User / Group for file ownership inside containers +UID=1000 +GID=1000 + +# ── Ports ────────────────────────────────────────────── +SFS_MASTER_PORT=9333 +SFS_MASTER_GRPC_PORT=19333 +SFS_MASTER_METRICS_PORT=9324 + +SFS_VOLUME_PORT=8080 +SFS_VOLUME_GRPC_PORT=18080 +SFS_VOLUME_METRICS_PORT=9325 + +SFS_FILER_PORT=8888 +SFS_FILER_GRPC_PORT=18888 +SFS_FILER_METRICS_PORT=9326 + +SFS_S3_PORT=8333 +SFS_S3_METRICS_PORT=9327 + +SFS_WEBDAV_PORT=7333 + +SFS_PROMETHEUS_HOST_PORT=9000 +SFS_PROMETHEUS_CONTAINER_PORT=9090 + +# ── Secrets (set real values, never commit this file) ── +# JWT signing key for volume write authorization. +JWT_SIGNING_KEY= + +# JWT signing key for filer HTTP write/read authorization. +JWT_FILER_SIGNING_KEY= + +# SSE-S3 Key Encryption Key (KEK). +S3_SSE_KEK= + +# Grafana admin password. +GRAFANA_PASSWORD= + +# MinIO backup credentials (for filer.backup S3 sink). +MINIO_BACKUP_ACCESS_KEY= +MINIO_BACKUP_SECRET_KEY= diff --git a/seaweedfs/.gitignore b/seaweedfs/.gitignore new file mode 100644 index 000000000..acae57224 --- /dev/null +++ b/seaweedfs/.gitignore @@ -0,0 +1,4 @@ +.env +data/ +.envs/* +!.envs/example/ diff --git a/seaweedfs/compose.ci.yaml b/seaweedfs/compose.ci.yaml new file mode 100644 index 000000000..0aa24d06f --- /dev/null +++ b/seaweedfs/compose.ci.yaml @@ -0,0 +1,157 @@ +# CI COMPOSE — SeaweedFS stack (minimal subset for CI/testing) +# 4 services only: master, single volume, filer, s3 gateway. +# Uses bind mounts under ./data/ (ephemeral). No JWT, no metrics infra. 
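+#
+# Example bring-up (a sketch; the external network must exist before `up`):
+#   docker network create sds-network-ci
+#   docker compose -f compose.ci.yaml up -d --wait
+#   curl -fsS http://localhost:8333/healthz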
+ +x-logging: &default-logging + driver: "json-file" + options: + max-size: "100m" + max-file: "3" + +networks: + sds-gateway-ci-seaweed-net: + driver: bridge + sds-network-ci: + external: true + +services: + # ───────────────────────────────────────────────────────── + # MASTER + # ───────────────────────────────────────────────────────── + sds-gateway-ci-sfs-master: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-ci-sfs-master + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + networks: + - sds-gateway-ci-seaweed-net + ports: + - "${SFS_MASTER_PORT:-9333}:${SFS_MASTER_PORT:-9333}" + - "${SFS_MASTER_GRPC_PORT:-19333}:${SFS_MASTER_GRPC_PORT:-19333}" + volumes: + - ./data/master:/data + - ./config/master.toml:/etc/seaweedfs/master.toml:ro + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_MASTER_PORT:-9333}/cluster/status >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + master + -mdir=/data + -ip=sds-gateway-ci-sfs-master + -ip.bind=0.0.0.0 + -port=${SFS_MASTER_PORT:-9333} + -metricsPort=${SFS_MASTER_METRICS_PORT:-9324} + + # ───────────────────────────────────────────────────────── + # VOLUME + # ───────────────────────────────────────────────────────── + sds-gateway-ci-sfs-volume: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-ci-sfs-volume + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-ci-sfs-master + networks: + - sds-gateway-ci-seaweed-net + ports: + - "${SFS_VOLUME_PORT:-8080}:${SFS_VOLUME_PORT:-8080}" + - "${SFS_VOLUME_GRPC_PORT:-18080}:${SFS_VOLUME_GRPC_PORT:-18080}" + volumes: + - ./data/volumes:/data + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_VOLUME_PORT:-8080}/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-ci-sfs-master:${SFS_MASTER_PORT:-9333} + -ip=sds-gateway-ci-sfs-volume + -ip.bind=0.0.0.0 + -port=${SFS_VOLUME_PORT:-8080} + -max=0 + -dir=/data + -index=leveldb + -metricsPort=${SFS_VOLUME_METRICS_PORT:-9325} + + # ───────────────────────────────────────────────────────── + # FILER + # ───────────────────────────────────────────────────────── + sds-gateway-ci-sfs-filer: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-ci-sfs-filer + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-ci-sfs-master + - sds-gateway-ci-sfs-volume + networks: + - sds-gateway-ci-seaweed-net + ports: + - "${SFS_FILER_PORT:-8888}:${SFS_FILER_PORT:-8888}" + - "${SFS_FILER_GRPC_PORT:-18888}:${SFS_FILER_GRPC_PORT:-18888}" + volumes: + - ./data/filer:/data + - ./config/filer.toml:/etc/seaweedfs/filer.toml:ro + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_FILER_PORT:-8888}/ >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + filer + -master=sds-gateway-ci-sfs-master:${SFS_MASTER_PORT:-9333} + -ip=sds-gateway-ci-sfs-filer + -ip.bind=0.0.0.0 + -port=${SFS_FILER_PORT:-8888} + -metricsPort=${SFS_FILER_METRICS_PORT:-9326} + + # 
───────────────────────────────────────────────────────── + # S3 GATEWAY + # ───────────────────────────────────────────────────────── + sds-gateway-ci-sfs-s3: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-ci-sfs-s3 + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-ci-sfs-master + - sds-gateway-ci-sfs-volume + - sds-gateway-ci-sfs-filer + networks: + - sds-gateway-ci-seaweed-net + - sds-network-ci + ports: + - "${SFS_S3_PORT:-8333}:${SFS_S3_PORT:-8333}" + volumes: + - ./config/s3-config.json:/etc/seaweedfs/s3.json:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_S3_PORT:-8333}/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + s3 + -filer=sds-gateway-ci-sfs-filer:${SFS_FILER_PORT:-8888} + -ip.bind=0.0.0.0 + -port=${SFS_S3_PORT:-8333} + -config=/etc/seaweedfs/s3.json + -metricsPort=${SFS_S3_METRICS_PORT:-9327} diff --git a/seaweedfs/compose.local.yaml b/seaweedfs/compose.local.yaml new file mode 100644 index 000000000..67ce323e4 --- /dev/null +++ b/seaweedfs/compose.local.yaml @@ -0,0 +1,212 @@ +x-logging: &default-logging + driver: "json-file" + options: + max-size: "100m" + max-file: "3" + +networks: + sds-gateway-local-seaweed-net: + driver: bridge + sds-network-local: + external: true + +volumes: + prometheus-data: + +services: + # ───────────────────────────────────────────────────────── + # MASTER + # ───────────────────────────────────────────────────────── + sds-gateway-local-sfs-master: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-local-sfs-master + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + networks: + - sds-gateway-local-seaweed-net + ports: + - "${SFS_MASTER_PORT:-9333}:${SFS_MASTER_PORT:-9333}" + - "${SFS_MASTER_GRPC_PORT:-19333}:${SFS_MASTER_GRPC_PORT:-19333}" + volumes: + - ./data/master:/data + - ./config/master.toml:/etc/seaweedfs/master.toml:ro + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_MASTER_PORT:-9333}/cluster/status >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + master + -mdir=/data + -ip=sds-gateway-local-sfs-master + -ip.bind=0.0.0.0 + -port=${SFS_MASTER_PORT:-9333} + -metricsPort=${SFS_MASTER_METRICS_PORT:-9324} + + # ───────────────────────────────────────────────────────── + # VOLUME — single volume server + # ───────────────────────────────────────────────────────── + sds-gateway-local-sfs-volume: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-local-sfs-volume + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-local-sfs-master + networks: + - sds-gateway-local-seaweed-net + ports: + - "${SFS_VOLUME_PORT:-8080}:${SFS_VOLUME_PORT:-8080}" + - "${SFS_VOLUME_GRPC_PORT:-18080}:${SFS_VOLUME_GRPC_PORT:-18080}" + volumes: + - ./data/volumes:/data + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_VOLUME_PORT:-8080}/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-local-sfs-master:${SFS_MASTER_PORT:-9333} + -ip=sds-gateway-local-sfs-volume + 
-ip.bind=0.0.0.0 + -port=${SFS_VOLUME_PORT:-8080} + -max=0 + -dir=/data + -index=leveldb + -metricsPort=${SFS_VOLUME_METRICS_PORT:-9325} + + # ───────────────────────────────────────────────────────── + # FILER + # ───────────────────────────────────────────────────────── + sds-gateway-local-sfs-filer: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-local-sfs-filer + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-local-sfs-master + - sds-gateway-local-sfs-volume + networks: + - sds-gateway-local-seaweed-net + ports: + - "${SFS_FILER_PORT:-8888}:${SFS_FILER_PORT:-8888}" + - "${SFS_FILER_GRPC_PORT:-18888}:${SFS_FILER_GRPC_PORT:-18888}" + volumes: + - ./data/filer:/data + - ./config/filer.toml:/etc/seaweedfs/filer.toml:ro + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_FILER_PORT:-8888}/ >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + filer + -master=sds-gateway-local-sfs-master:${SFS_MASTER_PORT:-9333} + -ip=sds-gateway-local-sfs-filer + -ip.bind=0.0.0.0 + -port=${SFS_FILER_PORT:-8888} + -metricsPort=${SFS_FILER_METRICS_PORT:-9326} + + # ───────────────────────────────────────────────────────── + # S3 GATEWAY + # ───────────────────────────────────────────────────────── + sds-gateway-local-sfs-s3: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-local-sfs-s3 + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-local-sfs-master + - sds-gateway-local-sfs-volume + - sds-gateway-local-sfs-filer + networks: + - sds-gateway-local-seaweed-net + - sds-network-local + ports: + - "${SFS_S3_PORT:-8333}:${SFS_S3_PORT:-8333}" + volumes: + - ./config/s3-config.json:/etc/seaweedfs/s3.json:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:${SFS_S3_PORT:-8333}/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + s3 + -filer=sds-gateway-local-sfs-filer:${SFS_FILER_PORT:-8888} + -ip.bind=0.0.0.0 + -port=${SFS_S3_PORT:-8333} + -config=/etc/seaweedfs/s3.json + -metricsPort=${SFS_S3_METRICS_PORT:-9327} + + # ───────────────────────────────────────────────────────── + # WEBDAV + # ───────────────────────────────────────────────────────── + sds-gateway-local-sfs-webdav: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-local-sfs-webdav + user: "${UID:-1000}:${GID:-1000}" + restart: unless-stopped + depends_on: + - sds-gateway-local-sfs-master + - sds-gateway-local-sfs-volume + - sds-gateway-local-sfs-filer + networks: + - sds-gateway-local-seaweed-net + ports: + - "${SFS_WEBDAV_PORT:-7333}:${SFS_WEBDAV_PORT:-7333}" + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -sS -o /dev/null http://localhost:${SFS_WEBDAV_PORT:-7333}/"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + webdav + -filer=sds-gateway-local-sfs-filer:${SFS_FILER_PORT:-8888} + + # ───────────────────────────────────────────────────────── + # PROMETHEUS — pull-based metrics + # ───────────────────────────────────────────────────────── + sds-gateway-local-sfs-prometheus: + image: docker.io/prom/prometheus:v2.53.0 + container_name: sds-gateway-local-sfs-prometheus + restart: unless-stopped + 
depends_on: + - sds-gateway-local-sfs-s3 + healthcheck: + test: ["CMD-SHELL", "wget --spider -q http://localhost:${SFS_PROMETHEUS_CONTAINER_PORT:-9090}/-/healthy || exit 1"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + networks: + - sds-gateway-local-seaweed-net + ports: + - "${SFS_PROMETHEUS_HOST_PORT:-9000}:${SFS_PROMETHEUS_CONTAINER_PORT:-9090}" + volumes: + - prometheus-data:/prometheus + - ./prometheus/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro + command: + - "--config.file=/etc/prometheus/prometheus.yaml" + - "--storage.tsdb.path=/prometheus" + - "--web.enable-lifecycle" diff --git a/seaweedfs/compose.production.yaml b/seaweedfs/compose.production.yaml new file mode 100644 index 000000000..35a73a58f --- /dev/null +++ b/seaweedfs/compose.production.yaml @@ -0,0 +1,501 @@ +# ⚠️ PRODUCTION COMPOSE — SeaweedFS 5×22TB + EC RS(10+4) ⚠️ +# Following sfs-deployment-checklist.md for safe production deployment. +# +# Architecture: +# - Single master (restartable, light load) +# - 5 volume servers (1 per 22TB XFS drive, ports 8081-8085) +# - Filer with leveldb2 (embedded metadata store) +# - S3 gateway for S3-compatible access +# - WebDAV access +# - Admin + Worker for Erasure Coding (RS 10+4) + cluster maintenance +# - Pushgateway + Prometheus (push metrics mode) + Grafana +# - Async filer backup to MinIO (S3 sink) +# +# PRE-DEPLOYMENT (run once): +# docker network create sds-gateway-prod-seaweed-net +# mkdir -p /disk{1,2,3,4,5}/{data,idx} +# mkdir -p /data/seaweedfs/{master,filer} +# +# SECURITY: Set these in your .env file (never commit to git): +# JWT_SIGNING_KEY — master signs, volumes validate on write +# JWT_FILER_SIGNING_KEY— S3 gateway signs, filer validates +# S3_SSE_KEK — SSE-S3 encryption key +# GRAFANA_PASSWORD — Grafana admin password +# +# IMAGE: 4.23_large_disk_full — supports large volumes, full backend suite. 
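+#
+# POST-DEPLOYMENT smoke test (a sketch; assumes the aws CLI and the S3
+# credentials configured in config/s3-config.json):
+#   curl -fsS http://localhost:9333/cluster/status
+#   aws --endpoint-url http://localhost:8333 s3 ls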
+ +x-logging: &default-logging + driver: "json-file" + options: + max-size: "100m" + max-file: "3" + +networks: + # Internal SeaweedFS network (created before deploy) + sds-gateway-prod-seaweed-net: + external: true + # Shared network with gateway services + sds-network-prod: + external: true + +volumes: + prometheus-data: + grafana-data: + +services: + # ───────────────────────────────────────────────────────── + # MASTER — cluster coordinator, assigns volumes, signs JWTs + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-master: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-master + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "9333:9333" # HTTP + - "19333:19333" # gRPC + environment: + # JWT signing key for volume write auth + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:9333/cluster/status >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + volumes: + # Persistent metadata (filer store, master state) + - /data/seaweedfs/master:/data + # Config files + - ./config/master.toml:/etc/seaweedfs/master.toml:ro + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + command: | + master + -mdir=/data + -ip=sds-gateway-prod-sfs-master + -ip.bind=0.0.0.0 + -port=9333 + -volumePreallocate + -volumeSizeLimitMB=30000 + -master.metrics.address=http://sds-gateway-prod-sfs-pushgateway:9091 + + # ───────────────────────────────────────────────────────── + # 5 VOLUME SERVERS — one per 22TB XFS drive + # Each has dedicated data + idx paths, leveldb index, + # and per-drive healthcheck. + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-volume1: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-volume1 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "8081:8081" # HTTP + - "18081:18081" # gRPC + environment: + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk1/data:/data + - /disk1/idx:/idx + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8081/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-prod-sfs-master:9333 + -ip=sds-gateway-prod-sfs-volume1 + -ip.bind=0.0.0.0 + -port=8081 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + sds-gateway-prod-sfs-volume2: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-volume2 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "8082:8082" + - "18082:18082" + environment: + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk2/data:/data + - /disk2/idx:/idx + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8082/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-prod-sfs-master:9333 + -ip=sds-gateway-prod-sfs-volume2 + -ip.bind=0.0.0.0 + -port=8082 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + sds-gateway-prod-sfs-volume3: + image: 
docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-volume3 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "8083:8083" + - "18083:18083" + environment: + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk3/data:/data + - /disk3/idx:/idx + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8083/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-prod-sfs-master:9333 + -ip=sds-gateway-prod-sfs-volume3 + -ip.bind=0.0.0.0 + -port=8083 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + sds-gateway-prod-sfs-volume4: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-volume4 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "8084:8084" + - "18084:18084" + environment: + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk4/data:/data + - /disk4/idx:/idx + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8084/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-prod-sfs-master:9333 + -ip=sds-gateway-prod-sfs-volume4 + -ip.bind=0.0.0.0 + -port=8084 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + sds-gateway-prod-sfs-volume5: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-volume5 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "8085:8085" + - "18085:18085" + environment: + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk5/data:/data + - /disk5/idx:/idx + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8085/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + volume + -master=sds-gateway-prod-sfs-master:9333 + -ip=sds-gateway-prod-sfs-volume5 + -ip.bind=0.0.0.0 + -port=8085 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + # ───────────────────────────────────────────────────────── + # FILER — metadata store, file namespace, HTTP file browser + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-filer: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-filer + restart: unless-stopped + depends_on: + - sds-gateway-prod-sfs-master + networks: + - sds-gateway-prod-seaweed-net + ports: + - "8888:8888" # HTTP + - "18888:18888" # gRPC + environment: + # JWT key for volume write auth + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + # JWT key for filer HTTP write auth — S3 gateway signs, filer validates + WEED_JWT_FILER_SIGNING_KEY: "${JWT_FILER_SIGNING_KEY}" + volumes: + # Persistent filer metadata (leveldb2 store) + - /data/seaweedfs/filer:/data + # Config files + - ./config/filer.toml:/etc/seaweedfs/filer.toml:ro + - ./config/security.toml:/etc/seaweedfs/security.toml:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8888/ >/dev/null"] + interval: 15s + retries: 5 + 
start_interval: 5s + start_period: 30s + timeout: 5s + command: | + filer + -master=sds-gateway-prod-sfs-master:9333 + -ip=sds-gateway-prod-sfs-filer + -ip.bind=0.0.0.0 + -port=8888 + -encryptVolumeData=false + -maxMB=32 + + # ───────────────────────────────────────────────────────── + # S3 GATEWAY — S3-compatible API, connects to filer + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-s3: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-s3 + restart: unless-stopped + depends_on: + - sds-gateway-prod-sfs-filer + networks: + # Internal: connects to filer/volume + - sds-gateway-prod-seaweed-net + # External: gateway services connect here + - sds-network-prod + ports: + - "8333:8333" + environment: + # Must match filer's WEED_JWT_FILER_SIGNING_KEY + WEED_JWT_FILER_SIGNING_KEY: "${JWT_FILER_SIGNING_KEY}" + # SSE-S3 Key Encryption Key + WEED_S3_SSE_KEK: "${S3_SSE_KEK}" + volumes: + - ./config/s3-config.json:/etc/seaweedfs/s3.json:ro + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8333/healthz >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + s3 + -filer=sds-gateway-prod-sfs-filer:8888 + -port=8333 + -config=/etc/seaweedfs/s3.json + -domain=.s3.example.com + + # ───────────────────────────────────────────────────────── + # WEBDAV — WebDAV access to filer namespace + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-webdav: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-webdav + restart: unless-stopped + depends_on: + - sds-gateway-prod-sfs-master + - sds-gateway-prod-sfs-filer + networks: + - sds-gateway-prod-seaweed-net + logging: *default-logging + command: | + webdav + -filer=sds-gateway-prod-sfs-filer:8888 + + # ───────────────────────────────────────────────────────── + # ADMIN — cluster admin server (EC management, maintenance) + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-admin: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-admin + restart: unless-stopped + depends_on: + - sds-gateway-prod-sfs-master + networks: + - sds-gateway-prod-seaweed-net + ports: + - "23646:23646" # Admin HTTP + logging: *default-logging + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:23646/ >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + command: | + admin + -master=sds-gateway-prod-sfs-master:9333 + + # ───────────────────────────────────────────────────────── + # WORKER — runs erasure_coding plugin and maintenance scripts + # Continuously converts full/quiet volumes to EC shards. + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-worker: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-worker + restart: unless-stopped + depends_on: + - sds-gateway-prod-sfs-admin + networks: + - sds-gateway-prod-seaweed-net + logging: *default-logging + command: | + worker + -admin=sds-gateway-prod-sfs-admin:23646 + + # ───────────────────────────────────────────────────────── + # PROMETHEUS + PUSHGATEWAY — push-based metrics collection + # SeaweedFS components push metrics to pushgateway; + # Prometheus scrapes from pushgateway (simpler than + # dynamic target discovery for volume servers). 
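+  #
+  # A minimal scrape job for this layout might look like the following
+  # (a sketch; the real config lives in ./prometheus/prometheus.yaml):
+  #   scrape_configs:
+  #     - job_name: "seaweedfs"
+  #       honor_labels: true
+  #       static_configs:
+  #         - targets: ["sds-gateway-prod-sfs-pushgateway:9091"]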
+ # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-prometheus: + image: docker.io/prom/prometheus:v2.53.0 + container_name: sds-gateway-prod-sfs-prometheus + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "9090:9090" + healthcheck: + test: ["CMD-SHELL", "wget --spider -q http://localhost:9090/-/healthy || exit 1"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + volumes: + - prometheus-data:/prometheus + - ./prometheus/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro + command: + - "--config.file=/etc/prometheus/prometheus.yaml" + - "--storage.tsdb.path=/prometheus" + - "--web.enable-lifecycle" + + sds-gateway-prod-sfs-pushgateway: + image: docker.io/prom/pushgateway:v1.9.0 + container_name: sds-gateway-prod-sfs-pushgateway + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "9091:9091" + healthcheck: + test: ["CMD-SHELL", "wget --spider -q http://localhost:9091/-/healthy || exit 1"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + + # ───────────────────────────────────────────────────────── + # GRAFANA — dashboards + alerting + # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-grafana: + image: docker.io/grafana/grafana:11.1.0 + container_name: sds-gateway-prod-sfs-grafana + restart: unless-stopped + networks: + - sds-gateway-prod-seaweed-net + ports: + - "3000:3000" + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:3000/api/health >/dev/null"] + interval: 15s + retries: 5 + start_interval: 5s + start_period: 30s + timeout: 5s + environment: + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_PASSWORD}" + volumes: + - grafana-data:/var/lib/grafana + + # ───────────────────────────────────────────────────────── + # FILER BACKUP — async replication to MinIO (S3 sink) + # Subscribes to filer metadata change log (CDC) and + # replicates file content to the configured S3-compatible + # storage (MinIO). Checkpointed for safe restarts. 
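+  #
+  # The S3 sink in config/replication.toml follows the template from
+  # `weed scaffold -config replication`; a sketch with assumed values:
+  #   [sink.s3]
+  #   enabled = true
+  #   endpoint = "http://minio:9000"
+  #   bucket = "spectrumx-backup"
+  #   directory = "/"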
+ # ───────────────────────────────────────────────────────── + sds-gateway-prod-sfs-filer-backup: + image: docker.io/chrislusf/seaweedfs:4.23_large_disk_full + container_name: sds-gateway-prod-sfs-filer-backup + restart: unless-stopped + depends_on: + - sds-gateway-prod-sfs-filer + networks: + - sds-gateway-prod-seaweed-net + volumes: + - ./config/replication.toml:/etc/seaweedfs/replication.toml:ro + command: | + filer.backup + -filer=sds-gateway-prod-sfs-filer:8888 + -config=/etc/seaweedfs/replication.toml diff --git a/seaweedfs/config/credential.toml b/seaweedfs/config/credential.toml new file mode 100644 index 000000000..7e3bde779 --- /dev/null +++ b/seaweedfs/config/credential.toml @@ -0,0 +1,47 @@ +# Put this file to one of the location, with descending priority +# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-master weed scaffold -config credential' +# ./credential.toml +# $HOME/.seaweedfs/credential.toml +# /etc/seaweedfs/credential.toml +# this file is read by S3 API and IAM API servers + +# Choose one of the credential stores below +# Only one store can be enabled at a time + +# Filer-based credential store (default, uses existing filer storage) +[credential.filer_etc] + enabled = true + # filer address and grpc_dial_option will be automatically configured by the server + + # PostgreSQL credential store (recommended for multi-node deployments) + # [credential.postgres] + # database = "seaweedfs" + # enabled = false + # hostname = "localhost" + # password = "your_password" + # port = 5432 + # schema = "public" + # sslmode = "disable" + # username = "seaweedfs" + # # Optional: table name prefix (default: "sw_") + # table_prefix = "sw_" + # # Connection pool settings + # connection_max_idle = 10 + # connection_max_lifetime_seconds = 3600 + # connection_max_open = 100 + + # Memory credential store (for testing only, data is lost on restart) + # [credential.memory] + # enabled = false + + # # Environment variable overrides: + # # Any configuration value can be overridden by environment variables + # # Rules: + # # * Prefix with "WEED_CREDENTIAL_" + # # * Convert to uppercase + # # * Replace '.' with '_' + # # + # # Examples: + # # export WEED_CREDENTIAL_POSTGRES_PASSWORD=secret + # # export WEED_CREDENTIAL_POSTGRES_HOSTNAME=db.example.com + # # export WEED_CREDENTIAL_FILER_ETC_ENABLED=true diff --git a/seaweedfs/config/filer.toml b/seaweedfs/config/filer.toml new file mode 100644 index 000000000..e57ca931b --- /dev/null +++ b/seaweedfs/config/filer.toml @@ -0,0 +1,436 @@ +# A sample TOML config file for SeaweedFS filer store +# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-volume weed scaffold -config filer' +# Used with "weed filer" or "weed server -filer" +# Put this file to one of the location, with descending priority +# ./filer.toml +# $HOME/.seaweedfs/filer.toml +# /etc/seaweedfs/filer.toml + +#################################################### +# Customizable filer server options +#################################################### +[filer.options] + # with http DELETE, by default the filer would check whether a folder is empty. 
+ # recursive_delete will delete all sub folders and files, similar to "rm -Rf" + recursive_delete = false + #max_file_name_length = 255 + + #################################################### + # The following are filer store options + #################################################### + +[leveldb2] + # local on disk, mostly for simple single-machine setup, fairly scalable + # faster than previous leveldb, recommended. + dir = "/data/filer/filerldb2" # directory to store level db files + enabled = true + +[leveldb3] + # similar to leveldb2. + # each bucket has its own meta store. + dir = "./filerldb3" # directory to store level db files + enabled = false + +[rocksdb] + # local on disk, similar to leveldb + # since it is using a C wrapper, you need to install rocksdb and build it by yourself + dir = "./filerrdb" # directory to store rocksdb files + enabled = false + +[sqlite] + # local on disk, similar to leveldb + dbFile = "./filer.db" # sqlite db file + enabled = false + +[mysql] # or memsql, tidb + # CREATE TABLE IF NOT EXISTS `filemeta` ( + # `dirhash` BIGINT NOT NULL COMMENT 'first 64 bits of MD5 hash value of directory field', + # `name` VARCHAR(766) NOT NULL COMMENT 'directory or file name', + # `directory` TEXT NOT NULL COMMENT 'full path to parent directory', + # `meta` LONGBLOB, + # PRIMARY KEY (`dirhash`, `name`) + # ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; + + enabled = false + # dsn will take priority over "hostname, port, username, password, database". + # [username[:password]@][protocol[(address)]]/dbname[?param1=value1&...¶mN=valueN] + ca_crt = "" # ca.crt dir when enable_tls set true + client_crt = "" # mysql client.crt dir when enable_tls set true + client_key = "" # mysql client.key dir when enable_tls set true + connection_max_idle = 10 + connection_max_lifetime_seconds = 300 + connection_max_open = 50 + database = "" # create or use an existing database + dsn = "root@tcp(localhost:3306)/seaweedfs?collation=utf8mb4_bin" + enable_tls = false + hostname = "localhost" + interpolateParams = false + password = "" + port = 3306 + username = "root" + # if insert/upsert failing, you can disable upsert or update query syntax to match your RDBMS syntax: + enableUpsert = true + upsertQuery = """INSERT INTO `%s` (`dirhash`,`name`,`directory`,`meta`) VALUES (?,?,?,?) AS `new` ON DUPLICATE KEY UPDATE `meta` = `new`.`meta`""" + +[mysql2] # or memsql, tidb + connection_max_idle = 10 + connection_max_lifetime_seconds = 300 + connection_max_open = 50 + createTable = """ + CREATE TABLE IF NOT EXISTS `%s` ( + `dirhash` BIGINT NOT NULL, + `name` VARCHAR(766) NOT NULL, + `directory` TEXT NOT NULL, + `meta` LONGBLOB, + PRIMARY KEY (`dirhash`, `name`) + ) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; +""" + database = "" # create or use an existing database + enabled = false + hostname = "localhost" + interpolateParams = false + password = "" + port = 3306 + username = "root" + # if insert/upsert failing, you can disable upsert or update query syntax to match your RDBMS syntax: + enableUpsert = true + upsertQuery = """INSERT INTO `%s` (`dirhash`,`name`,`directory`,`meta`) VALUES (?,?,?,?) 
AS `new` ON DUPLICATE KEY UPDATE `meta` = `new`.`meta`""" + +[postgres] # or cockroachdb, YugabyteDB + # CREATE TABLE IF NOT EXISTS filemeta ( + # dirhash BIGINT, + # name VARCHAR(65535), + # directory VARCHAR(65535), + # meta bytea, + # PRIMARY KEY (dirhash, name) + # ); + database = "postgres" # create or use an existing database + enabled = false + hostname = "localhost" + password = "" + port = 5432 + schema = "" + sslmode = "disable" + username = "postgres" + # SSL certificate options for secure connections + # For sslmode=verify-full, uncomment and configure the following: + # sslcert = "/path/to/client.crt" # client certificate file + # sslkey = "/path/to/client.key" # client private key file + # sslrootcert = "/path/to/ca.crt" # CA certificate file + # sslcrl = "/path/to/client.crl" # Certificate Revocation List (CRL) (optional) + connection_max_idle = 10 + connection_max_lifetime_seconds = 300 + connection_max_open = 50 + # Set to true when using PgBouncer connection pooler + pgbouncer_compatible = false + # if insert/upsert failing, you can disable upsert or update query syntax to match your RDBMS syntax: + enableUpsert = true + upsertQuery = """ + INSERT INTO "%[1]s" (dirhash, name, directory, meta) + VALUES($1, $2, $3, $4) + ON CONFLICT (dirhash, name) DO UPDATE SET + directory=EXCLUDED.directory, + meta=EXCLUDED.meta +""" + +[postgres2] + createTable = """ + CREATE TABLE IF NOT EXISTS "%s" ( + dirhash BIGINT, + name VARCHAR(65535), + directory VARCHAR(65535), + meta bytea, + PRIMARY KEY (dirhash, name) + ); +""" + database = "postgres" # create or use an existing database + enabled = false + hostname = "localhost" + password = "" + port = 5432 + schema = "" + sslmode = "disable" + username = "postgres" + # SSL certificate options for secure connections + # For sslmode=verify-full, uncomment and configure the following: + # sslcert = "/path/to/client.crt" # client certificate file + # sslkey = "/path/to/client.key" # client private key file + # sslrootcert = "/path/to/ca.crt" # CA certificate file + # sslcrl = "/path/to/client.crl" # Certificate Revocation List (CRL) (optional) + connection_max_idle = 10 + connection_max_lifetime_seconds = 300 + connection_max_open = 50 + # Set to true when using PgBouncer connection pooler + pgbouncer_compatible = false + # if insert/upsert failing, you can disable upsert or update query syntax to match your RDBMS syntax: + enableUpsert = true + upsertQuery = """ + INSERT INTO "%[1]s" (dirhash, name, directory, meta) + VALUES($1, $2, $3, $4) + ON CONFLICT (dirhash, name) DO UPDATE SET + directory=EXCLUDED.directory, + meta=EXCLUDED.meta +""" + +[cassandra2] + # CREATE TABLE filemeta ( + # dirhash bigint, + # directory varchar, + # name varchar, + # meta blob, + # PRIMARY KEY ((dirhash, directory), name) + # ) WITH CLUSTERING ORDER BY (name ASC); + enabled = false + hosts = ["localhost:9042"] + keyspace = "seaweedfs" + password = "" + username = "" + # Set the CA certificate path + ssl_ca_path = "" + # Set the client certificate path + ssl_cert_path = "" + # Set the client private key path + ssl_key_path = "" + # Check host name in the certificate + ssl_enable_host_verification = true + # This changes the data layout. Only add new directories. Removing/Updating will cause data loss. + superLargeDirectories = [] + # Name of the datacenter local to this filer, used as host selection fallback. 
+ localDC = "" + # Gocql connection timeout, default: 600ms + connection_timeout_millisecond = 600 + +[hbase] + enabled = false + table = "seaweedfs" + zkquorum = "" + +[redis2] + address = "localhost:6379" + database = 0 + enabled = false + password = "" + username = "" + # prefix for filer redis keys + ca_cert_path = "" + client_cert_path = "" + client_key_path = "" + enable_tls = false + keyPrefix = "" + # This changes the data layout. Only add new directories. Removing/Updating will cause data loss. + superLargeDirectories = [] + +[redis2_sentinel] + addresses = ["172.22.12.7:26379", "172.22.12.8:26379", "172.22.12.9:26379"] + database = 0 + enabled = false + masterName = "master" + password = "" + username = "" + # prefix for filer redis keys + ca_cert_path = "" + client_cert_path = "" + client_key_path = "" + enable_tls = false + keyPrefix = "" + +[redis_cluster2] + addresses = [ + "localhost:30001", + "localhost:30002", + "localhost:30003", + "localhost:30004", + "localhost:30005", + "localhost:30006", + ] + enabled = false + password = "" + username = "" + # prefix for filer redis keys + ca_cert_path = "" + client_cert_path = "" + client_key_path = "" + enable_tls = false + keyPrefix = "" + # allows reads from slave servers or the master, but all writes still go to the master + readOnly = false + # automatically use the closest Redis server for reads + routeByLatency = false + # This changes the data layout. Only add new directories. Removing/Updating will cause data loss. + superLargeDirectories = [] + +# The following lua redis stores uses lua to ensure atomicity +[redis_lua] + address = "localhost:6379" + database = 0 + enabled = false + password = "" + username = "" + # prefix for filer redis keys + ca_cert_path = "" + client_cert_path = "" + client_key_path = "" + enable_tls = false + keyPrefix = "" + # This changes the data layout. Only add new directories. Removing/Updating will cause data loss. + superLargeDirectories = [] + +[redis_lua_sentinel] + addresses = ["172.22.12.7:26379", "172.22.12.8:26379", "172.22.12.9:26379"] + database = 0 + enabled = false + masterName = "master" + password = "" + username = "" + # prefix for filer redis keys + ca_cert_path = "" + client_cert_path = "" + client_key_path = "" + enable_tls = false + keyPrefix = "" + +[redis_lua_cluster] + addresses = [ + "localhost:30001", + "localhost:30002", + "localhost:30003", + "localhost:30004", + "localhost:30005", + "localhost:30006", + ] + enabled = false + password = "" + username = "" + # prefix for filer redis keys + ca_cert_path = "" + client_cert_path = "" + client_key_path = "" + enable_tls = false + keyPrefix = "" + # allows reads from slave servers or the master, but all writes still go to the master + readOnly = false + # automatically use the closest Redis server for reads + routeByLatency = false + # This changes the data layout. Only add new directories. Removing/Updating will cause data loss. + superLargeDirectories = [] + +[etcd] + enabled = false + key_prefix = "seaweedfs." 
+ password = "" + servers = "localhost:2379" + timeout = "3s" + username = "" + # Set the CA certificate path + tls_ca_file = "" + # Set the client certificate path + tls_client_crt_file = "" + # Set the client private key path + tls_client_key_file = "" + +[mongodb] + database = "seaweedfs" + enabled = false + insecure_skip_verify = false + option_pool_size = 0 + password = "" + ssl = false + ssl_ca_file = "" + ssl_cert_file = "" + ssl_key_file = "" + uri = "mongodb://localhost:27017" + username = "" + +[elastic7] + enabled = false + healthcheck_enabled = false + password = "" + servers = ["http://localhost1:9200", "http://localhost2:9200", "http://localhost3:9200"] + sniff_enabled = false + username = "" + # increase the value is recommend, be sure the value in Elastic is greater or equal here + index.max_result_window = 10000 + + +[arangodb] # in development dont use it + db_name = "seaweedfs" + enabled = false + servers = ["http://localhost:8529"] # list of servers to connect to + # only basic auth supported for now + password = "" + username = "" + # skip tls cert validation + insecure_skip_verify = true + +[ydb] # https://ydb.tech/ + dialTimeOut = 10 + dsn = "grpc://localhost:2136?database=/local" + enabled = false + poolSizeLimit = 50 + prefix = "seaweedfs" + useBucketPrefix = true # Fast Bucket Deletion + + # Authenticate produced with one of next environment variables: + # YDB_SERVICE_ACCOUNT_KEY_FILE_CREDENTIALS= — used service account key file by path + # YDB_ANONYMOUS_CREDENTIALS="1" — used for authenticate with anonymous access. Anonymous access needs for connect to testing YDB installation + # YDB_METADATA_CREDENTIALS="1" — used metadata service for authenticate to YDB from yandex cloud virtual machine or from yandex function + # YDB_ACCESS_TOKEN_CREDENTIALS= — used for authenticate to YDB with short-life access token. For example, access token may be IAM token + + ########################## + ########################## + # To add path-specific filer store: + # + # 1. Add a name following the store type separated by a dot ".". E.g., cassandra2.tmp + # 2. Add a location configuration. E.g., location = "/tmp/" + # 3. Copy and customize all other configurations. + # Make sure they are not the same if using the same store type! + # 4. Set enabled to true + # + # The following is just using redis as an example + ########################## + [redis2.tmp] + address = "localhost:6379" + database = 1 + enabled = false + keyPrefix = "" + location = "/tmp/" + password = "" + username = "" + +[tikv] + enabled = false + # If you have many pd address, use ',' split then: + # pdaddrs = "pdhost1:2379, pdhost2:2379, pdhost3:2379" + pdaddrs = "localhost:2379" + # prefix for filer TiKV keys, useful for sharing a TiKV cluster with multiple seaweedfs clusters + keyPrefix = "" + # Enable 1PC + enable_1pc = false + # batch delete count, default 10000 in code + #batchdelete_count = 20000 + + # Set the CA certificate path + ca_path = "" + # Set the certificate path + cert_path = "" + # Set the private key path + key_path = "" + # The name list used to verify the cn name + verify_cn = "" + +[foundationdb] + # FoundationDB provides ACID transactions and horizontal scalability. 
+ # Requires: go build -tags foundationdb + cluster_file = "/etc/foundationdb/fdb.cluster" + enabled = false + # api_version = 740 + # timeout = "5s" + # directory_prefix = "seaweedfs" + # For bulk ingestion, enable batching: batch_enabled = true + +[tarantool] + address = "localhost:3301" + maxReconnects = 1000 + password = "" + timeout = "5s" + user = "guest" diff --git a/seaweedfs/config/master.toml b/seaweedfs/config/master.toml new file mode 100644 index 000000000..4e24ccc80 --- /dev/null +++ b/seaweedfs/config/master.toml @@ -0,0 +1,64 @@ +# Put this file to one of the location, with descending priority +# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-master weed scaffold -config master' +# ./master.toml +# $HOME/.seaweedfs/master.toml +# /etc/seaweedfs/master.toml +# this file is read by master + +[master.maintenance] + # periodically run these scripts are the same as running them from 'weed shell' + # Scripts are skipped while an admin server is connected. + scripts = """ + lock + ec.encode -fullPercent=95 -quietFor=1h + ec.rebuild -apply + ec.balance -apply + fs.log.purge -daysAgo=7 + volume.deleteEmpty -quietFor=24h -apply + volume.balance -apply + volume.fix.replication -apply + s3.clean.uploads -timeAgo=24h + unlock +""" + sleep_minutes = 17 # sleep minutes between each script execution + + +[master.sequencer] + type = "raft" # Choose [raft|snowflake] type for storing the file id sequence + # when sequencer.type = snowflake, the snowflake id must be different from other masters + sequencer_snowflake_id = 0 # any number between 1~1023 + + + # configurations for tiered cloud storage + # old volumes are transparently moved to cloud for cost efficiency + # [storage.backend] + # [storage.backend.s3.default] + # aws_access_key_id = "" # if empty, loads from the shared credentials file (~/.aws/credentials). + # aws_secret_access_key = "" # if empty, loads from the shared credentials file (~/.aws/credentials). + # bucket = "your_bucket_name" # an existing bucket + # enabled = false + # endpoint = "" + # region = "us-east-2" + # storage_class = "STANDARD_IA" + +# create this number of logical volumes if no more writable volumes +# count_x means how many copies of data. +# e.g.: +# 000 has only one copy, copy_1 +# 010 and 001 has two copies, copy_2 +# 011 has only 3 copies, copy_3 +[master.volume_growth] + copy_1 = 7 # create 1 x 7 = 7 actual volumes + copy_2 = 6 # create 2 x 6 = 12 actual volumes + copy_3 = 3 # create 3 x 3 = 9 actual volumes + copy_other = 1 # create n x 1 = n actual volumes + disable = false # disables volume growth if true + threshold = 0.9 # create threshold + +# configuration flags for replication +[master.replication] + # any replication counts should be considered minimums. If you specify 010 and + # have 3 different racks, that's still considered writable. Writes will still + # try to replicate to all available volumes. You should only use this option + # if you are doing your own replication or periodic sync of volumes. 
+ treat_replication_as_minimums = false diff --git a/seaweedfs/config/notification.toml b/seaweedfs/config/notification.toml new file mode 100644 index 000000000..af869abaa --- /dev/null +++ b/seaweedfs/config/notification.toml @@ -0,0 +1,70 @@ +# A sample TOML config file for SeaweedFS filer store +# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-master weed scaffold -config notification' +# Used by both "weed filer" or "weed server -filer" and "weed filer.replicate" +# Put this file to one of the location, with descending priority +# ./notification.toml +# $HOME/.seaweedfs/notification.toml +# /etc/seaweedfs/notification.toml + +#################################################### +# notification +# send and receive filer updates for each file to an external message queue +#################################################### +[notification.log] + # this is only for debugging purpose and does not work with "weed filer.replicate" + enabled = false + + +[notification.kafka] + enabled = false + hosts = ["localhost:9092"] + offsetFile = "./last.offset" + offsetSaveIntervalSeconds = 10 + topic = "seaweedfs_filer" + + +[notification.aws_sqs] + # experimental, let me know if it works + aws_access_key_id = "" # if empty, loads from the shared credentials file (~/.aws/credentials). + aws_secret_access_key = "" # if empty, loads from the shared credentials file (~/.aws/credentials). + enabled = false + region = "us-east-2" + sqs_queue_name = "my_filer_queue" # an existing queue name + + +[notification.google_pub_sub] + # read credentials doc at https://cloud.google.com/docs/authentication/getting-started + enabled = false + google_application_credentials = "/path/to/x.json" # path to json credential file + project_id = "" # an existing project id + topic = "seaweedfs_filer_topic" # a topic, auto created if does not exists + +[notification.gocdk_pub_sub] + # The Go Cloud Development Kit (https://gocloud.dev). + # PubSub API (https://godoc.org/gocloud.dev/pubsub). + # Supports AWS SNS/SQS, Azure Service Bus, Google PubSub, NATS and RabbitMQ. + enabled = false + # This URL will Dial the RabbitMQ server at the URL in the environment + # variable RABBIT_SERVER_URL and open the exchange "myexchange". + # The exchange must have already been created by some other means, like + # the RabbitMQ management plugin. 
Create myexchange of type fanout and myqueue then
+  # create binding myexchange => myqueue
+  sub_url = "rabbit://myqueue"
+  topic_url = "rabbit://myexchange"
+
+[notification.webhook]
+  # Send file system events to HTTP webhook endpoints (push model)
+  # BEST FOR: Low to moderate traffic (< 100 events/second sustained)
+  # FOR HIGH TRAFFIC: Consider using Kafka, SQS, or pull-based event logs instead
+  # Documentation: https://github.com/seaweedfs/seaweedfs/wiki/Filer-Notification-Webhook
+  backoff_seconds = 3 # optional: initial backoff delay (default: 3, range: 1-60)
+  bearer_token = "" # optional: bearer token for authentication
+  buffer_size = 10000 # optional: event buffer size (default: 10000, range: 100-1000000)
+  enabled = false
+  endpoint = "https://your-server.com/webhook" # required: HTTP endpoint URL
+  max_backoff_seconds = 30 # optional: max backoff delay (default: 30, range: backoff_seconds-300)
+  max_retries = 3 # optional: retry attempts (default: 3, range: 0-10)
+  timeout_seconds = 10 # optional: HTTP timeout (default: 10, range: 1-300)
+  workers = 5 # optional: concurrent workers (default: 5, range: 1-100)
+  # event_types = ["create", "update", "delete", "rename"] # optional: filter by event types (default: all)
+  # path_prefixes = ["/important", "/data"] # optional: filter by path prefixes (default: all)
diff --git a/seaweedfs/config/replication.toml b/seaweedfs/config/replication.toml
new file mode 100644
index 000000000..fb827636a
--- /dev/null
+++ b/seaweedfs/config/replication.toml
@@ -0,0 +1,75 @@
+# A sample TOML config file for replicating SeaweedFS filer
+# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-master weed scaffold -config replication'
+# Used with "weed filer.backup"
+# Using with "weed filer.replicate" is deprecated.
+# Put this file to one of the location, with descending priority
+# ./replication.toml
+# $HOME/.seaweedfs/replication.toml
+# /etc/seaweedfs/replication.toml
+
+# [source.filer] # deprecated. Only useful with "weed filer.replicate"
+# enabled = true
+# grpcAddress = "localhost:18888"
+# # all files under this directory tree are replicated.
+# # this is not a directory on your hard drive, but on your filer.
+# # i.e., all files with this "prefix" are sent to notification message queue.
+# directory = "/buckets"
+# # files from the directory separated by space are excluded from sending notifications
+# excludeDirectories = "/buckets/tmp"
+
+[sink.local]
+  directory = "/data"
+  enabled = false
+  # all replicated files are under modified time as yyyy-mm-dd directories
+  # so each date directory contains all new and updated files.
+  is_incremental = false
+
+[sink.filer]
+  enabled = false
+  grpcAddress = "localhost:18888"
+  # all replicated files are under this directory tree
+  # this is not a directory on your hard drive, but on your filer.
+  # i.e., all received files will be "prefixed" to this directory.
+  collection = ""
+  directory = "/backup"
+  is_incremental = false
+  replication = ""
+  ttlSec = 0
+
+  [sink.s3]
+  # read credentials doc at https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/sessions.html
+  # default loads credentials from the shared credentials file (~/.aws/credentials).
+  aws_access_key_id = "${MINIO_BACKUP_ACCESS_KEY}" # if empty, loads from the shared credentials file (~/.aws/credentials).
+  aws_secret_access_key = "${MINIO_BACKUP_SECRET_KEY}" # if empty, loads from the shared credentials file (~/.aws/credentials).
+ bucket = "spectrumx" # an existing bucket in MinIO + directory = "/spectrumx" # prefix inside the bucket + enabled = true + endpoint = "https://minio.example.com" # your MinIO endpoint URL + is_incremental = false + region = "us-east-1" # can be anything for MinIO + + # [sink.google_cloud_storage] + # # read credentials doc at https://cloud.google.com/docs/authentication/getting-started + # bucket = "spectrumx" # an existing bucket + # directory = "/" # destination directory + # enabled = false + # google_application_credentials = "/path/to/x.json" # path to json credential file + # is_incremental = false + + # [sink.azure] + # # experimental, let me know if it works + # account_key = "" + # account_name = "" + # container = "mycontainer" # an existing container + # directory = "/" # destination directory + # enabled = false + # is_incremental = false + + # [sink.backblaze] + # b2_account_id = "" + # b2_master_application_key = "" + # b2_region = "" + # bucket = "mybucket" # an existing bucket + # directory = "/" # destination directory + # enabled = false + # is_incremental = false diff --git a/seaweedfs/config/s3-config.json b/seaweedfs/config/s3-config.json new file mode 100644 index 000000000..5de1f4fae --- /dev/null +++ b/seaweedfs/config/s3-config.json @@ -0,0 +1,24 @@ +{ + "identities": [ + { + "name": "admin", + "credentials": [ + { + "accessKey": "admin-access-key", + "secretKey": "admin-secret-key" + } + ], + "actions": ["Admin", "Read", "Write", "List", "Tagging"] + }, + { + "name": "backup-user", + "credentials": [ + { + "accessKey": "backup-access-key", + "secretKey": "backup-secret-key" + } + ], + "actions": ["Read", "List"] + } + ] +} diff --git a/seaweedfs/config/security.toml b/seaweedfs/config/security.toml new file mode 100644 index 000000000..8f2f8ab67 --- /dev/null +++ b/seaweedfs/config/security.toml @@ -0,0 +1,174 @@ +# Put this file to one of the location, with descending priority +# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-master weed scaffold -config security' +# ./security.toml +# $HOME/.seaweedfs/security.toml +# /etc/seaweedfs/security.toml +# this file is read by master, volume server, filer, and worker + +# comma separated origins allowed to make requests to the filer and s3 gateway. +# enter in this format: https://domain.com, or http://localhost:port +[cors.allowed_origins] + values = "*" + +# this jwt signing key is read by master and volume server, and it is used for write operations: +# - the Master server generates the JWT, which can be used to write a certain file on a volume server +# - the Volume server validates the JWT on writing +# the jwt defaults to expire after 10 seconds. +# PRODUCTION: Set via WEED_JWT_SIGNING_KEY env var in compose (overrides this empty value). +[jwt.signing] + expires_after_seconds = 10 # seconds + key = "" + +# by default, if the signing key above is set, the Volume UI over HTTP is disabled. +# by setting ui.access to true, you can re-enable the Volume UI. Despite +# some information leakage (as the UI is not authenticated), this should not +# pose a security risk. +[access] + ui = false + +# by default the filer UI is enabled. This can be a security risk if the filer is exposed to the public +# and the JWT for reads is not set. If you don't want the public to have access to the objects in your +# storage, and you haven't set the JWT for reads it is wise to disable access to directory metadata. 
+# This disables access to the Filer UI, and will no longer return directory metadata in GET requests. +[filer.expose_directory_metadata] + enabled = true + + # this jwt signing key is read by master and volume server, and it is used for read operations: + # - the Master server generates the JWT, which can be used to read a certain file on a volume server + # - the Volume server validates the JWT on reading + # NOTE: jwt for read is only supported with master+volume setup. Filer does not support this mode. + # Not set for production read auth — gRPC traffic stays within Docker network. + [jwt.signing.read] + expires_after_seconds = 10 # seconds + key = "" + + +# If this JWT key is configured, Filer only accepts writes over HTTP if they are signed with this JWT: +# - f.e. the S3 API Shim generates the JWT +# - the Filer server validates the JWT on writing +# NOTE: This key is ALSO used as a fallback signing key for S3 STS if s3.iam.config does not specify a signingKey. +# the jwt defaults to expire after 10 seconds. +# PRODUCTION: Set via WEED_JWT_FILER_SIGNING_KEY env var in compose (overrides this empty value). +[jwt.filer_signing] + expires_after_seconds = 10 # seconds + key = "" + + # If this JWT key is configured, Filer only accepts reads over HTTP if they are signed with this JWT: + # - f.e. the S3 API Shim generates the JWT + # - the Filer server validates the JWT on reading + # the jwt defaults to expire after 10 seconds. + [jwt.filer_signing.read] + expires_after_seconds = 10 # seconds + key = "" + +# gRPC mTLS configuration +# All gRPC TLS authentications are mutual (mTLS) +# The values for ca, cert, and key are paths to the certificate/key files +# The host name is not checked, so the certificate files can be shared +[grpc] + ca = "" + # Set wildcard domain for enable TLS authentication by common names + allowed_wildcard_domain = "" # .mycompany.com + + # Volume server gRPC options (server-side) + # Enables mTLS for incoming gRPC connections to volume server + [grpc.volume] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + # Master server gRPC options (server-side) + # Enables mTLS for incoming gRPC connections to master server + [grpc.master] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + # Filer server gRPC options (server-side) + # Enables mTLS for incoming gRPC connections to filer server + [grpc.filer] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + # S3 server gRPC options (server-side) + # Enables mTLS for incoming gRPC connections to S3 server + [grpc.s3] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + [grpc.msg_broker] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + [grpc.msg_agent] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + [grpc.admin] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + [grpc.worker] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + [grpc.mq] + allowed_commonNames = "" # comma-separated SSL certificate common names + cert = "" + key = "" + + # gRPC client configuration for outgoing gRPC connections + # Used by clients (S3, mount, backup, benchmark, filer.copy, filer.replicate, upload, etc.) 
+ # when connecting to any gRPC server (master, volume, filer) + [grpc.client] + cert = "" + key = "" + +# HTTPS client configuration for outgoing HTTP connections +# Used by S3, mount, filer.copy, backup, and other clients when communicating with master/volume/filer +# Set enabled=true to use HTTPS instead of HTTP for data operations (separate from gRPC) +# If [https.filer] or [https.volume] are enabled on servers, clients must have [https.client] enabled=true +[https.client] + ca = "" # CA certificate to verify server certificates (required when enabled=true) + cert = "" # Client certificate for mTLS (optional if server doesn't require client cert) + enabled = false # Set to true to enable HTTPS for all outgoing HTTP client connections + key = "" # Client key for mTLS (optional if server doesn't require client cert) + +# Volume server HTTPS options (server-side) +# Enables HTTPS for incoming HTTP connections to volume server +[https.volume] + ca = "" + cert = "" + key = "" + +# Master server HTTPS options (server-side) +# Enables HTTPS for incoming HTTP connections to master server (web UI, HTTP API) +[https.master] + ca = "" + cert = "" + key = "" + +# Filer server HTTPS options (server-side) +# Enables HTTPS for incoming HTTP connections to filer server (web UI, HTTP API) +[https.filer] + ca = "" + cert = "" + key = "" + # disable_tls_verify_client_cert = true|false (default: false) + +# Admin server HTTPS options (server-side) +# Enables HTTPS for incoming HTTP connections to admin server +[https.admin] + ca = "" + cert = "" + key = "" + +# white list. It's checking request ip address. +[guard] + white_list = "" diff --git a/seaweedfs/config/shell.toml b/seaweedfs/config/shell.toml new file mode 100644 index 000000000..701519c95 --- /dev/null +++ b/seaweedfs/config/shell.toml @@ -0,0 +1,11 @@ +# A sample TOML config file for SeaweedFS cluster +# Based on 'docker compose -p seaweedfs exec -it sds-gateway-local-sfs-master weed scaffold -config shell' + +[cluster] + default = "c1" + + [cluster.c1] + master = "localhost:9333" # comma-separated master servers + + [cluster.c2] + master = "" diff --git a/seaweedfs/data/filer/.gitkeep b/seaweedfs/data/filer/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/seaweedfs/data/volumes/.gitkeep b/seaweedfs/data/volumes/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/seaweedfs/docs/.gitignore b/seaweedfs/docs/.gitignore new file mode 100644 index 000000000..15261f1e3 --- /dev/null +++ b/seaweedfs/docs/.gitignore @@ -0,0 +1 @@ +sfs-wiki diff --git a/seaweedfs/docs/operations.md b/seaweedfs/docs/operations.md new file mode 100644 index 000000000..7f2cd5be4 --- /dev/null +++ b/seaweedfs/docs/operations.md @@ -0,0 +1,496 @@ +# SeaweedFS Operations Guide + +Reference guide for managing this deployment. All commands target the Docker Compose +stack defined in `compose.yaml`. 
+ ++ [SeaweedFS Operations Guide](#seaweedfs-operations-guide) + + [Architecture](#architecture) + + [Data flow](#data-flow) + + [Deployment](#deployment) + + [Data directory ownership](#data-directory-ownership) + + [Standard compose commands](#standard-compose-commands) + + [Full teardown (destroy all data)](#full-teardown-destroy-all-data) + + [View logs](#view-logs) + + [Web UIs](#web-uis) + + [S3 API](#s3-api) + + [Create or find S3 credentials (required)](#create-or-find-s3-credentials-required) + + [AWS CLI setup](#aws-cli-setup) + + [Common operations with AWS CLI](#common-operations-with-aws-cli) + + [MinIO client setup](#minio-client-setup) + + [Common operations with MinIO client](#common-operations-with-minio-client) + + [Filer HTTP API](#filer-http-api) + + [Maintenance](#maintenance) + + [Open the admin shell](#open-the-admin-shell) + + [Garbage collection (reclaim space from deleted files)](#garbage-collection-reclaim-space-from-deleted-files) + + [Delete empty / orphaned volumes](#delete-empty--orphaned-volumes) + + [Check volume filesystem integrity](#check-volume-filesystem-integrity) + + [Fix replication](#fix-replication) + + [Balance volume distribution across servers](#balance-volume-distribution-across-servers) + + [Backup and Restore](#backup-and-restore) + + [Save filer metadata to a file](#save-filer-metadata-to-a-file) + + [Restore filer metadata from a file](#restore-filer-metadata-from-a-file) + + [Backup volume data incrementally](#backup-volume-data-incrementally) + + [Troubleshooting](#troubleshooting) + + [Filer metadata not persisting after restart](#filer-metadata-not-persisting-after-restart) + + [Disk space used but files not visible](#disk-space-used-but-files-not-visible) + + [Volume server not registering with master](#volume-server-not-registering-with-master) + + [No free volumes error](#no-free-volumes-error) + +## Architecture + +> For production, replace `local` with `prod`, matching the Gateway's compose file. + +| Component | Container | Default Port | Purpose | +| ---------- | ---------------------------------- | ------------ | ------------------------------------ | +| Master | `sds-gateway-local-sfs-master` | 9333 | Cluster coordination, volume routing | +| Volume | `sds-gateway-local-sfs-volume` | 8080 | Raw file chunk storage | +| Filer | `sds-gateway-local-sfs-filer` | 8888 | Metadata + path-based file access | +| S3 Gateway | `sds-gateway-local-sfs-s3` | 8333 | AWS S3-compatible API | +| WebDAV | `sds-gateway-local-sfs-webdav` | 7333 | WebDAV mount access | +| Prometheus | `sds-gateway-local-sfs-prometheus` | 9000 | Metrics scraping | + +### Data flow + +```text +Client → S3/WebDAV/Filer HTTP → Filer (metadata in /data/filer/filerldb2) + ↓ + Volume Server (chunks in ./data/volumes) +``` + +The **Filer** stores only metadata (file paths, sizes, chunk IDs). The **Volume Server** +stores the actual bytes. Both must persist across restarts — see the `volumes` section +in `compose.yaml`. + +--- + +## Deployment + +> [!TIP] Assign `alias dc='docker compose'` for convenience; then run e.g. `dc logs -f` +> instead of `docker compose logs -f`. 
+
+### Data directory ownership
+
+```bash
+sudo chown -R 1000:1000 data/
+# otherwise, match the UID and GID used in compose.yaml
+```
+
+### Standard compose commands
+
+```bash
+cd seaweedfs/
+docker compose build
+docker compose up -d
+docker compose down
+docker compose restart sds-gateway-local-sfs-filer
+docker compose ps
+```
+
+If the alias is set, you can run a one-liner:
+
+```bash
+cd seaweedfs/
+dc pull --ignore-buildable; dc build && dc up -d && dc ps && dc logs -f
+```
+
+### Full teardown (destroy all data)
+
+```bash
+docker compose down -v
+rm -rf data/volumes/* data/filer/*
+```
+
+### View logs
+
+```bash
+# all services
+docker compose logs -f

+# single service
+docker compose logs -f sds-gateway-local-sfs-filer
+```
+
+---
+
+## Web UIs
+
+| UI                    | URL                             |
+| --------------------- | ------------------------------- |
+| Master cluster status | <http://localhost:9333>         |
+| Volume server status  | <http://localhost:8080>         |
+| Filer browser         | <http://localhost:8888>         |
+| Prometheus targets    | <http://localhost:9000/targets> |
+
+---
+
+## S3 API
+
+The S3 gateway is compatible with the AWS CLI and any S3 SDK. The MinIO client also
+works if you are migrating from MinIO.
+
+### Create or find S3 credentials (required)
+
+This deployment stores S3 identities in SeaweedFS (not in `compose.yaml`).
+
++ Credential backend is configured in `config/credential.toml`.
++ In this repo, `[credential.filer_etc] enabled = true`, so identities are persisted in the filer store.
+
+Create a known admin key pair (recommended if you are unsure which keys exist):
+
+```bash
+export S3_ENDPOINT=http://localhost:8333
+export S3_USER=admin
+export S3_ACCESS_KEY=seaweed-sds-main
+export S3_SECRET_KEY=$(LC_ALL=C tr -dc 'A-Za-z0-9' </dev/urandom | head -c 40)
+echo "s3.configure -apply -user ${S3_USER} -access_key ${S3_ACCESS_KEY} -secret_key ${S3_SECRET_KEY} -actions Admin" \
+  | docker exec -i sds-gateway-local-sfs-master weed shell -master=localhost:9333
+```
+
+> [!IMPORTANT]
+> Access key IDs can be listed later, but secret keys cannot be recovered in plain text.
+> If a secret is unknown, create/rotate credentials with `s3.configure` or IAM APIs.
+
+### AWS CLI setup
+
+```bash
+aws configure set aws_access_key_id "${S3_ACCESS_KEY}"
+aws configure set aws_secret_access_key "${S3_SECRET_KEY}"
+aws configure set default.region us-east-1
+aws configure set default.s3.signature_version s3v4
+
+export S3="${S3_ENDPOINT}"
+```
+
+#### Common operations with AWS CLI
+
+```bash
+# list buckets
+aws --endpoint-url "${S3}" s3 ls
+
+# create a bucket
+aws --endpoint-url "${S3}" s3 mb s3://my-bucket
+
+# upload a file
+aws --endpoint-url "${S3}" s3 cp local-file.txt s3://my-bucket/
+
+# list bucket contents
+aws --endpoint-url "${S3}" s3 ls s3://my-bucket
+
+# download a file
+aws --endpoint-url "${S3}" s3 cp s3://my-bucket/file.txt .
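+
+# generate a time-limited presigned download URL (a sketch; assumes the
+# gateway honors SigV4 presigned requests, as most S3-compatible stores do)
+aws --endpoint-url "${S3}" s3 presign s3://my-bucket/file.txt --expires-in 3600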
+
+# delete a file
+aws --endpoint-url "${S3}" s3 rm s3://my-bucket/file.txt
+
+# delete a bucket (must be empty)
+aws --endpoint-url "${S3}" s3 rb s3://my-bucket
+
+# sync a local directory to a bucket
+aws --endpoint-url "${S3}" s3 sync ./local-dir s3://my-bucket/prefix/
+```
+
+### MinIO client setup
+
+Installing the `mc` CLI:
+
+```bash
+MINIO_INSTALL_DIR="${XDG_DATA_HOME:-${HOME}/.local/share}/mc"
+mkdir -p "${MINIO_INSTALL_DIR}"
+ls -alh "${MINIO_INSTALL_DIR}"
+curl --progress-bar -L https://dl.min.io/aistor/mc/release/linux-amd64/mc \
+  -o "${MINIO_INSTALL_DIR}/mc" \
+  && chmod +x "${MINIO_INSTALL_DIR}/mc"
+ln -s "${MINIO_INSTALL_DIR}/mc" "${HOME}/.local/bin/mc"
+```
+
+Bootstrap credentials for `mc` (run once if you do not already have a working key):
+
+```bash
+echo "s3.configure -apply -user ${S3_USER} -access_key ${S3_ACCESS_KEY} -secret_key ${S3_SECRET_KEY} -actions Admin" \
+  | docker exec -i sds-gateway-local-sfs-master weed shell -master=localhost:9333
+```
+
+Usage:
+
+```bash
+# install (choose one)
+# macOS: brew install minio/stable/mc
+# linux: https://min.io/docs/minio/linux/reference/minio-mc.html
+
+# configure an alias pointing to SeaweedFS S3 gateway
+mc alias set sfs "${S3_ENDPOINT}" "${S3_ACCESS_KEY}" "${S3_SECRET_KEY}" --api S3v4
+# Added `sfs` successfully.
+
+# verify alias
+mc alias ls
+# ...
+# sfs
+#   URL       : http://localhost:8333
+#   AccessKey :
+#   SecretKey :
+#   API       : S3v4
+#   Path      : auto
+#   Src       : /home/user/.mc/config.json
+```
+
+Optional: temporary shell-only setup (no local alias file written):
+
+```bash
+export MC_HOST_sfs="http://${S3_ACCESS_KEY}:${S3_SECRET_KEY}@${S3_ENDPOINT#*://}"
+mc ls sfs
+```
+
+#### Common operations with MinIO client
+
+```bash
+# list buckets
+mc ls sfs
+
+# create a bucket
+mc mb sfs/main
+
+# upload a file
+mc cp docs/readme.md sfs/main/
+
+# list bucket contents
+mc ls sfs/main
+
+# download a file
+mc cp sfs/main/readme.md .
+
+# delete a file
+mc rm sfs/main/readme.md
+
+# delete a bucket (must be empty)
+mc rb sfs/main
+
+# sync a local directory to a bucket prefix
+mc mirror ./docs sfs/main/docs && mc ls sfs/main/docs
+# or, more dangerously, include --overwrite:
+# mc mirror --overwrite ./docs sfs/main/docs
+
+# access it via the file browser (opens a browser)
+xdg-open http://localhost:8888/buckets/main/docs/
+```
+
+---
+
+## Filer HTTP API
+
+```bash
+# upload a file
+curl -F file=@report.pdf "http://localhost:8888/path/to/dir/"
+
+# upload with a specific name
+curl -F file=@report.pdf "http://localhost:8888/path/to/dir/renamed.pdf"
+
+# download
+curl "http://localhost:8888/path/to/dir/renamed.pdf" -o renamed.pdf
+
+# list directory (JSON)
+curl -H "Accept: application/json" "http://localhost:8888/path/to/dir/?pretty=y"
+
+# delete a file
+curl -X DELETE "http://localhost:8888/path/to/dir/renamed.pdf"
+
+# server-side copy (no client data transfer)
+curl -X POST "http://localhost:8888/dest/dir/?cp.from=/source/path/file.pdf"
+```
+
+---
+
+## Maintenance
+
+### Open the admin shell
+
+All maintenance operations go through `weed shell`.
+
+> [!IMPORTANT] Always `unlock` before exiting.
+
+```bash
+docker exec -it sds-gateway-local-sfs-master weed shell -master=localhost:9333
+```
+
+### Garbage collection (reclaim space from deleted files)
+
+Deleted file chunks are not immediately removed. Run vacuum to compact volumes and free
+disk space. The master also runs this automatically every 15 minutes, vacuuming volumes
+whose garbage (deleted-data) ratio exceeds 30%.
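+
+A vacuum can also be requested from inside `weed shell`. A minimal sketch, assuming the
+`volume.vacuum` command and its threshold flag are available in this SeaweedFS version:
+
+```bash
+# inside weed shell: vacuum volumes whose garbage ratio exceeds 30%
+lock
+volume.vacuum -garbageThreshold=0.3
+unlock
+```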
+
+```bash
+# trigger immediately via HTTP (no shell needed)
+curl "http://localhost:9333/vol/vacuum"
+
+# or with a custom threshold (only volumes with >=40% garbage are vacuumed)
+curl "http://localhost:9333/vol/vacuum?garbageThreshold=0.4"
+```
+
+### Delete empty / orphaned volumes
+
+Volumes that contain no live data (e.g. left over from previous runs with missing
+metadata) can be removed. Run inside `weed shell`:
+
+```bash
+lock
+volume.deleteEmpty -quietFor=24h -apply
+unlock
+```
+
+`-quietFor=24h` skips volumes that have been written to within the last 24 hours, to
+avoid racing with active writes.
+
+### Check volume filesystem integrity
+
+```bash
+lock
+volume.fsck -findMissingChunks
+unlock
+```
+
+### Fix replication
+
+```bash
+lock
+volume.fix.replication -apply
+unlock
+```
+
+### Balance volume distribution across servers
+
+```bash
+lock
+volume.balance -apply
+unlock
+```
+
+---
+
+## Backup and Restore
+
+### Save filer metadata to a file
+
+Run inside `weed shell` on the source cluster:
+
+```bash
+lock
+fs.cd /
+fs.meta.save -o /tmp/filer-backup.meta
+unlock
+```
+
+Then copy it out of the master container (the shell process runs there, so that is
+where the file is written):
+
+```bash
+docker cp sds-gateway-local-sfs-master:/tmp/filer-backup.meta ./filer-backup.meta
+```
+
+### Restore filer metadata from a file
+
+```bash
+docker cp ./filer-backup.meta sds-gateway-local-sfs-master:/tmp/filer-backup.meta
+```
+
+Then inside `weed shell`:
+
+```bash
+fs.meta.load /tmp/filer-backup.meta
+```
+
+### Backup volume data incrementally
+
+Run on any machine with enough disk space. SeaweedFS fetches only the delta since the
+last backup.
+
+```bash
+weed backup -server=localhost:9333 -dir=/backup/volumes -volumeId=1
+```
+
+Loop over all known volume IDs in a script — non-existent IDs are a no-op, so iterating
+`1..N` is safe.
+
+---
+
+## Troubleshooting
+
+### Filer metadata not persisting after restart
+
+Verify the filer process is writing to the bind-mounted path:
+
+```bash
+docker exec sds-gateway-local-sfs-filer find / -maxdepth 4 -name "filerldb2" -type d 2>/dev/null
+# Expected: /data/filer/filerldb2
+
+docker exec sds-gateway-local-sfs-filer ls /data/filer/
+# Expected: filerldb2/
+```
+
+If `filerldb2` appears outside `/data/filer/`, the `dir` setting in `config/filer.toml`
+is wrong. It must use an absolute path that falls inside the volume mount:
+
+```toml
+[leveldb2]
+  dir = "/data/filer/filerldb2"
+  enabled = true
+```
+
+### Disk space used but files not visible
+
+This means orphaned volume chunks exist without filer metadata (e.g. the filer metadata
+was lost in a previous session). The data is unrecoverable. Reclaim the space with:
+
+```bash
+# inside weed shell
+lock
+volume.deleteEmpty -quietFor=24h -apply
+unlock
+```
+
+Or wipe `data/volumes/` entirely if you have no data to preserve.
+
+### Volume server not registering with master
+
+Check that the master address in `compose.yaml` matches the master container name and
+port. The filer and volume services must be able to reach the master by its container
+name on the internal Docker network.
+
+```bash
+docker exec sds-gateway-local-sfs-volume ping sds-gateway-local-sfs-master
+```
+
+### No free volumes error
+
+The default setup creates 8 volumes of 30 GB each. If you need more (e.g. 
many S3 +buckets each use their own collection): + +```bash +# pre-allocate 4 more volumes +curl "http://localhost:9333/vol/grow?count=4" +``` + +Or reduce the volume size limit in the master command to allow more volumes from the +same disk budget (requires restart): + +```bash +# in compose.yaml master command, add: +-volumeSizeLimitMB=1024 +``` diff --git a/seaweedfs/docs/readme.md b/seaweedfs/docs/readme.md new file mode 100644 index 000000000..222951f6c --- /dev/null +++ b/seaweedfs/docs/readme.md @@ -0,0 +1,17 @@ +# SeaweedFS integration docs + +SeaweedFS is a distributed file system that can be used as a storage backend for SPX. +This document provides instructions on how to set up and integrate SeaweedFS with the +SpectrumX Data System. + +## Documentation pages + ++ [Operations Guide](./operations.md) + +## Additional docs + +Pull the latest SeaweedFS documentation locally: + +```bash +git clone https://github.com/seaweedfs/seaweedfs.wiki.git sfs-wiki +``` diff --git a/seaweedfs/docs/sfs-deployment-checklist.md b/seaweedfs/docs/sfs-deployment-checklist.md new file mode 100644 index 000000000..ace061132 --- /dev/null +++ b/seaweedfs/docs/sfs-deployment-checklist.md @@ -0,0 +1,1261 @@ +# SeaweedFS Production Deployment Checklist + +- [SeaweedFS Production Deployment Checklist](#seaweedfs-production-deployment-checklist) + - [Infrastructure \& Pre-Deployment](#infrastructure--pre-deployment) + - [Single-Server, All-in-One with 5 XFS Drives](#single-server-all-in-one-with-5-xfs-drives) + - [0. Pre-Deployment Decisions](#0-pre-deployment-decisions) + - [EC Design Note](#ec-design-note) + - [1. OS \& Filesystem Preparation](#1-os--filesystem-preparation) + - [1a. Identify Drives (Both Tracks)](#1a-identify-drives-both-tracks) + - [1b. Track A — Fresh Drives (Empty, Can Be Formatted)](#1b-track-a--fresh-drives-empty-can-be-formatted) + - [1c. Track B — Existing Drives (Already Have Data, Cannot Reformat)](#1c-track-b--existing-drives-already-have-data-cannot-reformat) + - [1d. Set Mount Options Persistently (Both Tracks)](#1d-set-mount-options-persistently-both-tracks) + - [Why XFS Settings Matter](#why-xfs-settings-matter) + - [Core Service Configuration](#core-service-configuration) + - [2. Security Configuration](#2-security-configuration) + - [Why JWT Security Matters](#why-jwt-security-matters) + - [gRPC mTLS Note](#grpc-mtls-note) + - [3. Docker Compose Configuration](#3-docker-compose-configuration) + - [Why 5 Separate Volume Servers Instead of One With 5 Dirs](#why-5-separate-volume-servers-instead-of-one-with-5-dirs) + - [Why `-index=leveldb`](#why--indexleveldb) + - [4. S3 API Setup](#4-s3-api-setup) + - [S3 Encryption Note](#s3-encryption-note) + - [Operations \& Maintenance](#operations--maintenance) + - [5. Monitoring — Prometheus + Grafana](#5-monitoring--prometheus--grafana) + - [Push vs Pull Metrics](#push-vs-pull-metrics) + - [6. Backup to MinIO via Async Filer Backup](#6-backup-to-minio-via-async-filer-backup) + - [How Async Backup Works](#how-async-backup-works) + - [Alternative: Volume-Level Backup](#alternative-volume-level-backup) + - [7. Startup \& Verification](#7-startup--verification) + - [Smoke Test: Drive Failure Scenario](#smoke-test-drive-failure-scenario) + - [8. Volume Growth Tuning](#8-volume-growth-tuning) + - [9. 
Maintenance Plan](#9-maintenance-plan) + - [Daily / Automated](#daily--automated) + - [Weekly](#weekly) + - [Monthly](#monthly) + - [Erasure Coding (Always Active)](#erasure-coding-always-active) + - [Drive Replacement Procedure](#drive-replacement-procedure) + - [Appendices](#appendices) + - [Appendix A: Volume Size Calculation](#appendix-a-volume-size-calculation) + - [Appendix B: Port Reference](#appendix-b-port-reference) + - [Appendix C: Recommended Environment `.env` File](#appendix-c-recommended-environment-env-file) + +## Infrastructure & Pre-Deployment + +### Single-Server, All-in-One with 5 XFS Drives + +--- + +### 0. Pre-Deployment Decisions + +Answers to scoping questions gathered before writing this checklist: + +| Question | Decision | Rationale | +| ---------------------- | ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| Topology | Single server, all-in-one | One machine runs master, volume servers, filer, S3, monitoring | +| Master HA | Single master | Acceptable for single-node; master load is light; restartable | +| Filer store | leveldb2 (embedded) | Simplest for single filer; no external dependency | +| Data durability | Erasure Coding (RS 10+4) via admin worker | Writes go to `000` volumes; EC worker auto-converts full/quiet volumes to EC shards; survives up to 4 shard losses with ~1.4x storage overhead | +| Drive size | 5 × 22TB | ~110TB raw, ~74.5TB usable after EC overhead (RS 10+4 = 1.4x) | +| Drive failure target | Up to 4 drives (theoretical max) | RS(10,4) can lose any 4 of 14 shards; with 5 drives, EC shards are spread across all drives — losing 1-2 drives is fully survivable | +| Monitoring | Prometheus + Grafana (push mode) | Full observability with the upstream Grafana dashboard | +| S3 gateway | Yes | Required for S3-compatible access; separate service on port 8333 | +| Backup | Async to existing MinIO (S3 interface) | `weed filer.backup` with S3 sink; user has mc alias ready | +| Volume server approach | 5 separate volume servers (1 per drive) | Cleaner drive isolation; easier replacement on failure | + +#### EC Design Note + +This deployment uses **Erasure Coding (RS 10+4)** as the primary data durability +mechanism instead of replication. Here is how it works: + +**Write path:** New data is written to normal volumes with **`000` replication** (no +copies). This is the initial landing zone. Data is temporarily at single-copy risk +during the brief window before EC conversion. + +**EC conversion (automatic):** The `erasure_coding` plugin worker (running via `weed +admin` + `weed worker`) continuously scans for volumes that are: + +- ≥80% full (fullness ratio threshold, configurable) +- Unmodified for ≥300 seconds (quiet period, configurable) +- Larger than 30MB + +When a volume qualifies, the worker encodes it into **14 EC shards** (10 data + 4 +parity) using Reed-Solomon coding. The 14 shards are spread across available volume +servers (drives). After successful encoding, the original volume file is deleted, +freeing space. + +**Failure tolerance:** RS(10,4) can reconstruct data from any **10 of 14 shards**. 
With
+5 drives and shards spread evenly, this means:
+
+- **1-2 drive failures:** Fully survivable — at most ~3 shards lost per volume
+- **3-4 drive failures:** Potentially survivable depending on shard distribution
+- Each volume's 14 shards are spread across all 5 drives, so losing any single drive
+  never takes down more than ~3 shards per volume (well within the 4-shard recovery
+  limit)
+
+**Storage efficiency:** RS(10,4) requires only **1.4×** raw storage (vs 2× for 001
+replication, 3× for 002). For 5 × 22TB = 110TB raw, this yields ~74.5TB usable.
+
+**Trade-offs:**
+
+- Write amplification: EC reads the entire volume to encode it (one-time cost)
+- Read penalty: EC reads may require an extra network hop to reconstruct data from
+  multiple shards (~50% throughput vs normal volumes in benchmarks)
+- Deletes only: EC shards are append-only, so deletes are supported but in-place
+  updates require re-compaction
+- Temporary risk window: Before EC conversion, data lives on a single volume with 000
+  replication — conversion happens within minutes of volume filling up
+
+---
+
+### 1. OS & Filesystem Preparation
+
+This section splits into two tracks depending on whether the XFS drives are **fresh** or
+**already formatted with data**. Mount options can be fixed on either track; mkfs-level
+geometry cannot be changed without reformatting.
+
+#### 1a. Identify Drives (Both Tracks)
+
+- [ ] **Identify 5 drives** — confirm device paths:
+
+  ```bash
+  lsblk -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE
+  ```
+
+- [ ] **Note mount points** — decide on a consistent scheme, e.g. `/disk1` … `/disk5`.
+  Create them:
+
+  ```bash
+  mkdir -p /disk{1,2,3,4,5}
+  ```
+
+---
+
+#### 1b. Track A — Fresh Drives (Empty, Can Be Formatted)
+
+> Use this if the drives are new or contain nothing you need to keep.
+
+- [ ] **XFS mkfs on each drive** with optimal settings:
+
+  ```bash
+  mkfs.xfs -f -d agcount=4 -l size=128m -n size=8192 /dev/vdb1  # repeat for vdc1, vdd1, vde1, vdf1
+  ```
+
+  | Flag      | Value | Why                                                                     |
+  | --------- | ----- | ----------------------------------------------------------------------- |
+  | `agcount` | 4     | More allocation groups → parallel allocation under concurrent writes    |
+  | `l size`  | 128m  | Larger journal → smoother write bursts                                  |
+  | `n size`  | 8192  | Larger dir blocks → better perf for directories with many volume files  |
+
+  > **On 22TB drives** the defaults are often already close to these values (XFS
+  > auto-tunes based on device size). Run `xfs_info /dev/vdb1` after mkfs to confirm.
+
+---
+
+#### 1c. Track B — Existing Drives (Already Have Data, Cannot Reformat)
+
+> Use this when the drives are already in use or carry data you need to preserve.
+
+- [ ] **Check current XFS geometry** — some mkfs-time settings affect performance but
+  **cannot be changed without reformatting**. Run on each drive:
+
+  ```bash
+  xfs_info /dev/vdb1  # repeat for vdc1, vdd1, vde1, vdf1
+
+  # e.g.
+  # meta-data=/dev/vdb1            isize=512    agcount=22, agsize=268435455 blks
+  #          =                     sectsz=4096  attr=2, projid32bit=1
+  #          =                     crc=1        finobt=1, sparse=1, rmapbt=0
+  #          =                     reflink=1    bigtime=1 inobtcount=1 nrext64=0
+  # data     =                     bsize=4096   blocks=5859442176, imaxpct=5
+  #          =                     sunit=0      swidth=0 blks
+  # naming   =version 2            bsize=4096   ascii-ci=0, ftype=1
+  # log      =internal log         bsize=4096   blocks=521728, version=2
+  #          =                     sectsz=4096  sunit=1 blks, lazy-count=1
+  # realtime =none                 extsz=4096   blocks=0, rtextents=0
+  ```
+
+  In the example above:
+
+  - **agcount** = `22` → well above 4, excellent for parallel allocation.
+  - **naming bsize** = `4096` → below the ideal `8192`. This means directory metadata
+    blocks are 4KB instead of 8KB. For SeaweedFS this is a minor factor because volume
+    files are written sequentially and directories hold at most a few thousand entries.
+    The `-n size=8192` mkfs flag is a "nice to have" optimization, not a requirement.
+  - **logsize** = `521728 blocks × 4096 bsize = ~2 GB` → well above the `128m` minimum.
+    The log holds metadata journal entries; a tiny log forces flushes more often under
+    concurrent writes. On 22TB drives XFS auto-sizes the log generously.
+
+  Pay attention to:
+
+  | Parameter | Ideal  | Impact if suboptimal                                              | Can fix?               |
+  | --------- | ------ | ----------------------------------------------------------------- | ---------------------- |
+  | `agcount` | ≥ 4    | Fewer AGs → less parallel allocation; minor perf hit               | **No** — requires mkfs |
+  | `logsize` | ≥ 64m  | Small log → more frequent log rotation under write load            | **No** — requires mkfs |
+  | `naming`  | ≥ 8192 | Small dir blocks → slower directory scans with many volume files   | **No** — requires mkfs |
+
+- [ ] **Check current mount options**:
+
+  ```bash
+  mount | grep /disk
+  # or
+  findmnt /disk1
+  ```
+
+  If `noatime,allocsize=1m` are missing (`noatime` implies `nodiratime`), fix them in
+  the next step.
+
+---
+
+#### 1d. Set Mount Options Persistently (Both Tracks)
+
+Mount options — `noatime`, `nodiratime`, `allocsize` — can be changed at any time by
+updating `/etc/fstab` and remounting. These are the most impactful tuning parameters and
+the main reason to touch the filesystem config.
+
+| Option         | Effect                                                                          |
+| -------------- | ------------------------------------------------------------------------------- |
+| `noatime`      | Skip access-time writes on reads — critical for storage servers                  |
+| `allocsize=1m` | XFS prealloc hint — matches SeaweedFS volume chunk patterns (1MB chunk writes)   |
+
+Sources:
+
+- [`allocsize`](https://oneuptime.com/blog/post/2026-03-04-tune-xfs-file-system-performance-mount-options-rhel-9/view#allocsize)
+
+Other options
+
+| Option         | Effect                                                                    |
+| -------------- | -------------------------------------------------------------------------- |
+| `rw`           | Read-write mode (default)                                                   |
+| `attr2`        | Enable version 2 on-disk inode format (immutable default on modern XFS)     |
+| `nodiratime`   | Skip directory access time updates (`noatime` implies `nodiratime`)         |
+| `inode64`      | Support >16TB files (default on modern XFS)                                 |
+| `logbufs=8`    | More log buffers can improve performance under heavy metadata load          |
+| `logbsize=64k` | Larger log buffer size can help with large transactions                     |
+| `noquota`      | Disable quota checks (not needed if not using XFS quotas)                   |
+
+- [ ] **Add or update fstab entries** for each drive:
+
+  ```text
+  /dev/vdb1  /disk1  xfs  noatime,allocsize=1m  0 0
+  /dev/vdc1  /disk2  xfs  noatime,allocsize=1m  0 0
+  /dev/vdd1  /disk3  xfs  noatime,allocsize=1m  0 0
+  /dev/vde1  /disk4  xfs  noatime,allocsize=1m  0 0
+  /dev/vdf1  /disk5  xfs  noatime,allocsize=1m  0 0
+  ```
+
+  The trailing `0 0` set the dump flag and the fsck order (`fs_passno`):
+
+  `fs_passno`:
+  - 0 means "do not fsck". XFS with journaling rarely needs boot-time fsck, and checking
+    22TB drives at boot would add significant startup delay. This setting also avoids
+    potential hangs if fsck cannot resolve an issue without human intervention.
+  - 1 means "check first" and is reserved for the root filesystem.
+  - 2 means "check after root" and is standard for data drives. Use this instead of 0 if
+    you want periodic fsck checks at boot (e.g. 
+  > These options are **safe for existing data**. They only change how the kernel
+  > interacts with the filesystem going forward; no data rewrite occurs.
+
+- [ ] **Create SeaweedFS data directories** on each drive:
+
+  ```bash
+  mkdir -p /disk{1,2,3,4,5}/{data,idx}
+  ```
+
+- [ ] **Remount all drives** (non-disruptive — active processes continue; the new mount
+  options take effect):
+
+  ```bash
+  mount -o remount /disk1
+  mount -o remount /disk2
+  mount -o remount /disk3
+  mount -o remount /disk4
+  mount -o remount /disk5
+  ```
+
+  If any drive is not yet mounted, pick up all fstab entries at once (note that
+  `mount -a` does not re-apply options to already-mounted filesystems; a reboot remains
+  the cleanest end-to-end check that fstab is correct):
+
+  ```bash
+  mount -a
+  ```
+
+- [ ] **Verify mount options are applied**:
+
+  ```bash
+  mount | grep /disk
+  # Confirm noatime,nodiratime,allocsize=1m appear in the options column
+  ```
+
+- [ ] **Verify disk space**:
+
+  ```bash
+  df -h | grep /disk
+  ```
+
+- [ ] **Set ulimit** (open file limit):
+
+  ```bash
+  echo "* soft nofile 102400" >> /etc/security/limits.conf
+  echo "* hard nofile 102400" >> /etc/security/limits.conf
+  ulimit -n 102400
+  ```
+
+  SeaweedFS can open many network connections under load; the default of 1024 is
+  insufficient. Note that `limits.conf` applies to login sessions, not containers; for
+  the dockerized stack you may also need per-service `ulimits` in the compose file or a
+  higher default on the Docker daemon. See the [Optimization wiki
+  page](https://github.com/seaweedfs/seaweedfs/wiki/Optimization#increase-user-open-file-limit)
+  for details.
+- [ ] **Disable swap** or set `vm.swappiness=1` in `/etc/sysctl.conf` — prevents the
+  kernel from swapping out SeaweedFS processes under memory pressure:
+
+  ```bash
+  echo "vm.swappiness=1" >> /etc/sysctl.conf
+  echo "vm.vfs_cache_pressure=50" >> /etc/sysctl.conf
+  sysctl -p
+  ```
+
+  See the [Linux kernel VM
+  documentation](https://www.kernel.org/doc/html/latest/admin-guide/sysctl/vm.html) for
+  the rationale behind swappiness tuning. SeaweedFS benefits from keeping the page cache
+  hot for frequently accessed volume indexes.
+- [ ] **Optimize network** (if applicable): e.g. raise `net.core.somaxconn` and enable
+  `net.ipv4.tcp_tw_reuse` in `/etc/sysctl.conf` to cope with many short-lived S3
+  connections.
+- [ ] **Install Docker Engine** — follow the [official Docker install
+  guide](https://docs.docker.com/engine/install/) for your distribution.
+- [ ] **Install Docker Compose** (v2 plugin or standalone binary) — see [Docker Compose
+  install docs](https://docs.docker.com/compose/install/).
+- [ ] **Create Docker network** for SeaweedFS:
+
+  ```bash
+  docker network create sds-gateway-prod-seaweedfs-net
+  ```
+
+##### Why XFS Settings Matter
+
+The XFS mount options and mkfs parameters above are tuned for the large sequential I/O
+patterns typical of SeaweedFS volume files. In particular:
+
+| Setting              | Effect                                                                                                       |
+| -------------------- | ------------------------------------------------------------------------------------------------------------ |
+| `noatime`            | Eliminates metadata writes on reads, including directory atime (`nodiratime` is implied on kernels ≥2.6.30)  |
+| `allocsize=1m`       | Hints XFS to allocate 1MB extents — matches SeaweedFS volume chunk patterns                                  |
+| `agcount=4`          | (mkfs option, not mount) More allocation groups = better parallel allocation under concurrent writes         |
+| Volume Preallocation | Master flag `-volumePreallocate` on XFS gives contiguous block allocation, reduces fragmentation             |
+
+See the [Optimization wiki
+page](https://github.com/seaweedfs/seaweedfs/wiki/Optimization#preallocate-volume-file-disk-spaces)
+for details on `-volumePreallocate` and XFS support.
+
+---
+
+## Core Service Configuration
+
+### 2. 
Security Configuration + +- [ ] **Generate `security.toml` scaffold**: + + ```bash + docker run --rm docker.io/chrislusf/seaweedfs:4.23-large_disk_full weed scaffold -config=security > security.toml + ``` + +- [ ] **Set JWT signing key for volume writes** — prevents unauthorized writes to volume + servers: + + ```bash + WEED_JWT_SIGNING_KEY=$(openssl rand -hex 32) + ``` + +- [ ] **Set JWT signing key for filer writes** — secures filer HTTP write endpoints: + + ```bash + WEED_JWT_FILER_SIGNING_KEY=$(openssl rand -hex 32) + ``` + +- [ ] **Set SSE-S3 KEK** — required if S3 clients send `x-amz-server-side-encryption: + AES256`: + + ```bash + WEED_S3_SSE_KEK=$(openssl rand -hex 32) + ``` + + All S3 API servers must use the same KEK value. +- [ ] **Create `.env` file** — Docker Compose [reads variables from a `.env` + file](https://docs.docker.com/compose/environment-variables/env-file/) in the same + directory as `compose.yaml`. Variable names in `.env` are plain (e.g. + `JWT_SIGNING_KEY`), referenced in the compose file as `${JWT_SIGNING_KEY}`. Add these + secrets (do NOT commit `.env` to Git): + + ```ini + # JWT signing key for volume write authorization. + # Master signs JWTs during /dir/assign; volume servers validate them on write. + # Generate: openssl rand -hex 32 + JWT_SIGNING_KEY= + + # JWT signing key for filer HTTP write/read authorization. + # S3 gateway generates these JWTs; filer validates them. + # Generate: openssl rand -hex 32 + JWT_FILER_SIGNING_KEY= + + # SSE-S3 Key Encryption Key (KEK). + # Required if S3 clients send x-amz-server-side-encryption: AES256. + # All S3 API servers in the cluster must use the same value. + # Generate: openssl rand -hex 32 + S3_SSE_KEK= + + # Grafana admin password. + GRAFANA_PASSWORD= + ``` + +- [ ] **Store secrets in a vault/password manager** (Bitwarden, 1Password, pass, etc.) + +#### Why JWT Security Matters + +Without JWT signing keys, any client that can reach the volume servers can write data. +The JWT is generated by the master during `/dir/assign`, so only clients that first +authenticate with the master (or go through the filer/S3 gateway) can write. This +prevents direct unauthorized writes to volume server HTTP endpoints. + +#### gRPC mTLS Note + +For a single-server deployment, gRPC mTLS is **optional**. The gRPC traffic stays within +the Docker network and does not leave the host. Skip unless you need FIPS compliance or +defense-in-depth. + +--- + +### 3. Docker Compose Configuration + +Create `compose.yaml`: + +> **Port allocation**: 5 volume servers on ports 8081-8085 (leaving 8080 free if +> needed). +> +> **Image tag choice**: `4.23-large_disk_full` is used for SeaweedFS because: +> +> - `large_disk` variant supports larger volume indexes without memory issues — critical +> for 22TB drives where default 30GB volumes are not performance-optimal and you may +> want fewer, larger volumes (e.g. 100GB+). +> - `full` variant includes all optional backends (rclone, MySQL, Postgres, etc.), +> avoiding surprises if you later need cloud tiering or migrate the filer store. +> - `4.23` (minimal) omits these — it would work but limits future options. +> - Pinning to a specific version instead of `latest` ensures reproducibility: `latest` +> can change on rebuild and break your deployment. 
+ +```yaml +x-logging: &default-logging + driver: "json-file" + options: + max-size: "100m" + max-file: "3" + +networks: + sds-gateway-prod-seaweedfs-net: + external: true + +volumes: + prometheus-data: + grafana-data: + +services: + master: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-master + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "9333:9333" + - "19333:19333" + environment: + # JWT key for volume write auth — master signs, volume servers validate + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /data/seaweedfs/master:/data + logging: *default-logging + command: | + master + -mdir=/data + -ip=master + -port=9333 + -volumePreallocate + -volumeSizeLimitMB=30000 + -master.metrics.address=http://pushgateway:9091 + + # 5 volume servers — one per XFS drive + volume1: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-volume1 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "8081:8081" + - "18081:18081" + environment: + # JWT key to validate volume write tokens issued by master + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk1/data:/data + - /disk1/idx:/idx + logging: *default-logging + command: | + volume + -master=master:9333 + -ip=volume1 + -port=8081 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + volume2: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-volume2 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "8082:8082" + - "18082:18082" + environment: + # JWT key to validate volume write tokens issued by master + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk2/data:/data + - /disk2/idx:/idx + logging: *default-logging + command: | + volume + -master=master:9333 + -ip=volume2 + -port=8082 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + volume3: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-volume3 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "8083:8083" + - "18083:18083" + environment: + # JWT key to validate volume write tokens issued by master + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk3/data:/data + - /disk3/idx:/idx + logging: *default-logging + command: | + volume + -master=master:9333 + -ip=volume3 + -port=8083 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + volume4: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-volume4 + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "8084:8084" + - "18084:18084" + environment: + # JWT key to validate volume write tokens issued by master + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk4/data:/data + - /disk4/idx:/idx + logging: *default-logging + command: | + volume + -master=master:9333 + -ip=volume4 + -port=8084 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + volume5: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-volume5 + restart: unless-stopped + networks: + - 
sds-gateway-prod-seaweedfs-net + ports: + - "8085:8085" + - "18085:18085" + environment: + # JWT key to validate volume write tokens issued by master + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + volumes: + - /disk5/data:/data + - /disk5/idx:/idx + logging: *default-logging + command: | + volume + -master=master:9333 + -ip=volume5 + -port=8085 + -max=0 + -dir=/data + -dir.idx=/idx + -index=leveldb + -dataCenter=dc1 + -rack=rack1 + -compactionMBps=40 + -minFreeSpacePercent=7 + + filer: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-filer + restart: unless-stopped + depends_on: + - master + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "8888:8888" + - "18888:18888" + environment: + # JWT key for volume write auth — passed through from master + WEED_JWT_SIGNING_KEY: "${JWT_SIGNING_KEY}" + # JWT key for filer HTTP write auth — S3 gateway signs, filer validates + WEED_JWT_FILER_SIGNING_KEY: "${JWT_FILER_SIGNING_KEY}" + volumes: + - /data/seaweedfs/filer:/data + - ./filer.toml:/etc/seaweedfs/filer.toml:ro + logging: *default-logging + command: | + filer + -master=master:9333 + -ip=filer + -port=8888 + -encryptVolumeData=false + -maxMB=32 + + s3: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-s3 + restart: unless-stopped + depends_on: + - filer + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "8333:8333" + environment: + # JWT key for signing filer HTTP requests — must match filer's WEED_JWT_FILER_SIGNING_KEY + WEED_JWT_FILER_SIGNING_KEY: "${JWT_FILER_SIGNING_KEY}" + # SSE-S3 Key Encryption Key — required when clients send x-amz-server-side-encryption: AES256 + WEED_S3_SSE_KEK: "${S3_SSE_KEK}" + volumes: + - ./s3-config.json:/etc/seaweedfs/s3.json:ro + logging: *default-logging + command: | + s3 + -filer=filer:8888 + -port=8333 + -config=/etc/seaweedfs/s3.json + -domain=.s3.example.com + + # Admin server + worker for Erasure Coding and cluster maintenance + admin: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-admin + restart: unless-stopped + depends_on: + - master + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "23646:23646" + logging: *default-logging + command: | + admin + -master=master:9333 + + worker: + image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full + container_name: seaweedfs-worker + restart: unless-stopped + depends_on: + - admin + networks: + - sds-gateway-prod-seaweedfs-net + logging: *default-logging + command: | + worker + -admin=admin:23646 + + prometheus: + image: docker.io/prom/prometheus:v2.53.0 + container_name: seaweedfs-prometheus + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "9090:9090" + volumes: + - prometheus-data:/prometheus + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml:ro + command: + - "--config.file=/etc/prometheus/prometheus.yaml" + - "--storage.tsdb.path=/prometheus" + + pushgateway: + image: docker.io/prom/pushgateway:v1.9.0 + container_name: seaweedfs-pushgateway + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "9091:9091" + + grafana: + image: docker.io/grafana/grafana:11.1.0 + container_name: seaweedfs-grafana + restart: unless-stopped + networks: + - sds-gateway-prod-seaweedfs-net + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_PASSWORD}" + volumes: + - grafana-data:/var/lib/grafana +``` + +- [ ] **Create `filer.toml`** for leveldb2 store (default — file may be empty or + 
scaffolded):
+
+  ```bash
+  docker run --rm docker.io/chrislusf/seaweedfs:4.23-large_disk_full weed scaffold -config=filer > filer.toml
+  ```
+
+- [ ] **Create `prometheus.yaml`** with the pushgateway as a target (see section 5 for
+  contents)
+- [ ] **Set `${GRAFANA_PASSWORD}`** in the same `.env` file (Compose substitutes it into
+  the `grafana` service)
+- [ ] **Create directories**:
+
+  ```bash
+  mkdir -p /data/seaweedfs/{master,filer}
+  ```
+
+#### Why 5 Separate Volume Servers Instead of One With 5 Dirs
+
+| Approach                             | Pros                                                                                                                        | Cons                                                       |
+| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
+| 5 separate volume servers            | Each drive independent; replacing a failed drive = stop one container; cleaner metrics per drive; easier to move/rebalance | More containers; more ports                                |
+| 1 server with 5 comma-separated dirs | Simpler; fewer ports                                                                                                        | Opaque per-drive health; harder to replace a single drive  |
+
+For EC, separate volume servers are equally important. The EC shard placement algorithm
+spreads the 14 shards (10 data + 4 parity) across available volume servers. With 5
+separate servers (drives), shards are naturally distributed across all drives,
+maximizing failure tolerance. A single volume server with 5 dirs is seen as one node by
+the EC placement algorithm — losing that one node means losing the volume entirely,
+defeating the purpose of EC.
+
+| EC shard distribution (5 drives)       | Max survivable failures                             |
+| -------------------------------------- | ---------------------------------------------------- |
+| 14 shards spread across 5 servers      | 1 drive (≤3 shards lost, within the 4-parity limit) |
+| 14 shards on 1 server (5 dirs, 1 node) | 0 drives (server loss = total loss)                 |
+
+#### Why `-index=leveldb`
+
+- **Memory mode** (default): Fast but loads the full index into RAM on startup — slow
+  restarts with large volumes.
+- **LevelDB mode**: ~4MB fixed memory footprint per volume server, faster startup,
+  minimal performance impact since index lookups are dwarfed by network latency.
+- For 5 volume servers with large volumes, leveldb saves significant RAM.
+
+---
+
+### 4. S3 API Setup
+
+- [ ] **Create `s3-config.json`** with identities:
+
+  ```json
+  {
+    "identities": [
+      {
+        "name": "admin",
+        "credentials": [
+          {
+            "accessKey": "admin-access-key",
+            "secretKey": "admin-secret-key"
+          }
+        ],
+        "actions": ["Admin", "Read", "Write", "List", "Tagging"]
+      },
+      {
+        "name": "backup-user",
+        "credentials": [
+          {
+            "accessKey": "backup-access-key",
+            "secretKey": "backup-secret-key"
+          }
+        ],
+        "actions": ["Read", "List"]
+      }
+    ]
+  }
+  ```
+
+- [ ] **Admin actions** allow bucket creation/deletion. Avoid giving `Admin` to everyday
+  users.
+- [ ] **Test S3 access**:
+
+  ```bash
+  aws s3 --endpoint http://localhost:8333 ls
+  aws s3 --endpoint http://localhost:8333 mb s3://test-bucket
+  aws s3 --endpoint http://localhost:8333 cp /etc/hostname s3://test-bucket/
+  ```
+
+#### S3 Encryption Note
+
+If your S3 clients send `x-amz-server-side-encryption: AES256`, the SSE-S3 KEK must be
+configured (already done in step 2). Without it, these requests fail with `400 Bad
+Request`.
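+
+To confirm the KEK is wired up end to end, round-trip one object with SSE-S3 requested
+(a sketch reusing the `test-bucket` and admin credentials from the examples above):
+
+```bash
+# Upload with SSE-S3 requested; fails with 400 Bad Request if WEED_S3_SSE_KEK is unset
+aws s3api put-object \
+  --endpoint http://localhost:8333 \
+  --bucket test-bucket --key sse-check \
+  --body /etc/hostname \
+  --server-side-encryption AES256
+
+# Read it back; the output should match the plaintext that was uploaded
+aws s3 --endpoint http://localhost:8333 cp s3://test-bucket/sse-check -
+```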
+
+---
+
+## Operations & Maintenance
+
+### 5. Monitoring — Prometheus + Grafana
+
+- [ ] **Start the Prometheus pushgateway** (included in compose as the `pushgateway`
+  service)
+- [ ] **Master** is configured with `-master.metrics.address=http://pushgateway:9091` —
+  the other components (volume, filer) pick this up from the master's heartbeat and
+  push their own metrics.
+- [ ] **Configure Prometheus** to scrape the pushgateway:
+
+  ```yaml
+  # prometheus.yaml
+  global:
+    scrape_interval: 15s
+
+  scrape_configs:
+    - job_name: "seaweedfs-pushgateway"
+      honor_labels: true
+      static_configs:
+        - targets: ["pushgateway:9091"]
+  ```
+
+- [ ] **Import the Grafana dashboard** from upstream:
+
+  ```bash
+  # Download the dashboard JSON from the SeaweedFS repo
+  curl -o grafana-seaweedfs.json \
+    https://raw.githubusercontent.com/seaweedfs/seaweedfs/master/other/metrics/grafana_seaweedfs.json
+  ```
+
+  - Log in to Grafana at `http://<host>:3000` (user `admin`, password from
+    `GRAFANA_PASSWORD` in `.env`)
+  - Create a Prometheus datasource pointing to `http://prometheus:9090`
+  - Import `grafana-seaweedfs.json`
+- [ ] **Set up alerting** in Grafana for:
+  - Volume server down (heartbeat missing)
+  - Free volume count = 0 (cluster full)
+  - High compaction backlog
+  - Disk space < 10% on any volume drive
+
+#### Push vs Pull Metrics
+
+SeaweedFS components push metrics to the pushgateway. This is simpler than configuring
+Prometheus to discover dynamic volume server targets. The pushgateway is a lightweight
+bridge.
+
+---
+
+### 6. Backup to MinIO via Async Filer Backup
+
+- [ ] **Create a backup access key** in your MinIO deployment (via mc or the MinIO
+  console) with write permissions to a dedicated backup bucket.
+- [ ] **Generate `replication.toml`**:
+
+  ```bash
+  docker run --rm docker.io/chrislusf/seaweedfs:4.23-large_disk_full weed scaffold -config=replication > replication.toml
+  ```
+
+- [ ] **Edit `replication.toml`** to configure the S3 sink targeting your MinIO:
+
+  ```toml
+  [sink.s3]
+  enabled = true
+  aws_access_key_id = "minio-backup-access-key"
+  aws_secret_access_key = "minio-backup-secret-key"
+  region = "us-east-1" # can be anything for MinIO
+  bucket = "spectrumx" # existing bucket in MinIO
+  directory = "/spectrumx" # prefix inside the bucket
+  endpoint = "https://minio.example.com" # your MinIO endpoint URL
+  is_incremental = false # false = continuous mirroring
+  ```
+
+- [ ] **Create the backup bucket** in MinIO:
+
+  ```bash
+  mc mb --ignore-existing "sds-backup-minio/spectrumx"
+  ```
+
+- [ ] **Start the backup** as an additional Docker service or standalone process:
+
+  ```yaml
+  # Add to compose.yaml
+  filer-backup:
+    image: docker.io/chrislusf/seaweedfs:4.23-large_disk_full
+    container_name: seaweedfs-filer-backup
+    restart: unless-stopped
+    depends_on:
+      - filer
+    networks:
+      - sds-gateway-prod-seaweedfs-net
+    volumes:
+      - ./replication.toml:/etc/seaweedfs/replication.toml:ro
+    command: |
+      filer.backup
+      -filer=filer:8888
+      -config=/etc/seaweedfs/replication.toml
+  ```
+
+#### How Async Backup Works
+
+- `weed filer.backup` subscribes to the filer's metadata change log (CDC).
+- When files are created/updated/deleted, it reads the content from SeaweedFS and
+  replicates it to the configured sink.
+- Progress is checkpointed on the filer — safe to restart.
+- In `is_incremental = false` mode, the remote mirror keeps the same directory structure
+  as the source.
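+
+A quick way to confirm the mirror is keeping up is to write a marker object and look for
+it on the MinIO side (a sketch: the `sds-backup-minio` alias matches the `mc mb` example
+above; the exact key layout under the sink prefix depends on the filer paths, so a
+recursive search is used):
+
+```bash
+# Write a probe object through the SeaweedFS S3 gateway
+aws s3 --endpoint http://localhost:8333 cp /etc/hostname s3://test-bucket/backup-probe
+
+# Give filer.backup a moment to replicate, then search the MinIO mirror for it
+sleep 10
+mc find "sds-backup-minio/spectrumx" --name "backup-probe"
+```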
+
+#### Alternative: Volume-Level Backup
+
+For a full-clone backup (not just file-level), use `weed backup` per volume:
+
+```bash
+weed backup -server=master:9333 -dir=/backup -volumeId=<volume-id>
+```
+
+This is useful for bootstrapping a second cluster but is not continuous.
+
+---
+
+### 7. Startup & Verification
+
+- [ ] **Start all services**:
+
+  ```bash
+  docker compose up -d
+  ```
+
+- [ ] **Verify cluster status** via the master UI:
+
+  ```bash
+  curl http://localhost:9333/ # or open in browser
+  ```
+
+  - Check that all 5 volume servers appear
+  - Check that the free volume count > 0
+- [ ] **Verify volume servers**:
+
+  ```bash
+  curl http://localhost:8081/ # repeat for 8082-8085
+  ```
+
+- [ ] **Verify filer**:
+
+  ```bash
+  curl http://localhost:8888/
+  ```
+
+- [ ] **Verify S3 gateway**:
+
+  ```bash
+  aws s3 --endpoint http://localhost:8333 ls
+  ```
+
+- [ ] **Trigger volume allocation** to test the write path:
+
+  ```bash
+  curl "http://localhost:9333/dir/assign"
+  ```
+
+- [ ] **Run the SeaweedFS benchmark** from within the Docker network:
+
+  ```bash
+  docker run --rm --network sds-gateway-prod-seaweedfs-net docker.io/chrislusf/seaweedfs:4.23-large_disk_full \
+    weed benchmark -master=master:9333 -n 10000
+  ```
+
+- [ ] **Verify Prometheus targets** — check the pushgateway at `http://localhost:9091`
+- [ ] **Verify the Grafana dashboard** — open `http://localhost:3000` and check for data
+
+#### Smoke Test: Drive Failure Scenario
+
+Simulate a drive failure to verify EC durability:
+
+```bash
+# Stop one volume server (simulate drive failure)
+docker stop seaweedfs-volume1
+
+# Verify data is still accessible via S3/filer
+aws s3 --endpoint http://localhost:8333 ls s3://test-bucket/ --recursive
+# Read a file to confirm EC reconstruction works
+aws s3 --endpoint http://localhost:8333 cp s3://test-bucket/test-file /tmp/test-file
+
+# Check EC shard status via weed shell
+docker exec seaweedfs-master weed shell -c "ec.balance"
+
+# Restart the volume server (simulate drive replacement)
+docker start seaweedfs-volume1
+
+# After restart, rebalance EC shards to restore optimal distribution
+docker exec seaweedfs-master weed shell -c "ec.balance -apply"
+```
+
+---
+
+### 8. Volume Growth Tuning
+
+With EC and no replication (`copy_1`), the default growth strategy creates **7 writable
+volumes** initially. As these fill up and get EC-encoded, new volumes are automatically
+created. Given 22TB drives, this is more than sufficient.
+
+If you need more write concurrency (more simultaneous write streams), raise the growth
+count in `master.toml`. Generate the scaffold:
+
+```bash
+docker run --rm docker.io/chrislusf/seaweedfs:4.23-large_disk_full weed scaffold -config=master > master.toml
+```
+
+Edit it and mount it into the master container (see the compose sketch at the end of
+this section):
+
+```toml
+[master.volume_growth]
+copy_1 = 16 # 16 writable volumes for no-replication (more write concurrency)
+threshold = 0.9
+```
+
+**Volume size tuning**: With 22TB drives, the default 30GB volume size means ~733
+volumes per drive. With LevelDB mode (`-index=leveldb`), each volume's index occupies
+roughly 20-40MB of **disk space** in the `idx` directory (~15-30GB total per drive on
+disk). The LevelDB block cache RAM footprint remains fixed at ~4MB per volume server
+regardless of volume count — this is the key advantage of LevelDB over memory mode. See
+the [Optimization wiki
+page](https://github.com/seaweedfs/seaweedfs/wiki/Optimization#use-leveldb) for details
+on index types and memory usage.
+
+To switch to larger volumes (e.g. 100GB), change the master flag in `compose.yaml`:
+
+```text
+- volumeSizeLimitMB=100000 # 100GB volumes → ~220 per drive
+```
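+
+The `master` service in section 3 does not yet mount `master.toml`; a sketch of the
+additions to `compose.yaml` (the container path is where `weed` looks for its TOML
+configs; host paths assumed to match the layout above):
+
+```yaml
+  master:
+    # ...existing master service config from section 3...
+    volumes:
+      - /data/seaweedfs/master:/data
+      - ./master.toml:/etc/seaweedfs/master.toml:ro # volume growth settings from §8
+```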
+
+---
+
+### 9. Maintenance Plan
+
+#### Daily / Automated
+
+- [ ] **Admin script plugin** — the `admin` and `worker` Docker services (already in
+  `compose.yaml`) automatically run these maintenance tasks. Verify they are running:
+
+  ```bash
+  docker ps | grep seaweedfs-admin
+  docker ps | grep seaweedfs-worker
+  ```
+
+  The default script covers:
+
+  - `ec.balance -apply` — balance EC shards
+  - `fs.log.purge -daysAgo=7` — purge old filer logs
+  - `volume.deleteEmpty -quietFor=24h -apply` — delete empty volumes
+  - `volume.fix.replication -apply` — fix missing replicas
+  - `s3.clean.uploads -timeAgo=24h` — clean aborted S3 multipart uploads
+
+- [ ] **Monitor disk usage** on all 5 drives. Alert when any drive exceeds 85% usage.
+
+#### Weekly
+
+- [ ] **Check `weed shell` status**:
+
+  ```bash
+  docker exec seaweedfs-master weed shell -c "volume.status"
+  docker exec seaweedfs-master weed shell -c "volume.list"
+  ```
+
+#### Monthly
+
+- [ ] **Run a full cluster health check**:
+
+  ```bash
+  weed shell -c "volume.fsck"
+  weed shell -c "volume.check.disk"
+  ```
+
+- [ ] **Review Grafana dashboards** for trends: compaction rates, write amplification,
+  disk growth
+- [ ] **Verify the backup is running** — check that the MinIO bucket has recent files
+
+#### Erasure Coding (Always Active)
+
+EC is the **primary durability mechanism** for this deployment, not an afterthought. The
+`erasure_coding` plugin worker runs automatically inside the `worker` container and
+continuously converts full/quiet volumes to RS(10,4) EC shards.
+
+**Detection defaults** (configurable from the admin UI at `/plugin`):
+
+- Fullness ratio threshold: 80%
+- Quiet period: 300 seconds (5 minutes)
+- Minimum volume size: 30 MB
+- Scan interval: 5 minutes
+
+**What to watch for:**
+
+- Ensure the `worker` container is always running — if it stops, volumes will sit at
+  `000` replication (single copy) indefinitely.
+- If the cluster runs low on free volume IDs, pre-create volumes manually with `curl
+  http://localhost:9333/vol/grow?count=10`.
+- Monitor `ec.balance` shard distribution in Grafana after drive replacements.
+
+#### Drive Replacement Procedure
+
+When a drive fails with EC, the procedure differs from a replication-based setup. There
+are no volume replicas to "fix" — instead, the surviving EC shards on other drives can
+reconstruct the missing data once the replacement drive is online.
+
+1. **Do NOT stop the volume container yet** — the volume server may still serve reads
+   from its surviving shards (depending on the failure mode). Only stop it if the drive
+   is fully dead/unresponsive.
+
+2. If the drive is still partially readable, turn on maintenance mode:
+
+   ```bash
+   docker exec seaweedfs-master weed shell -c "volumeServer.state --nodes volume1:8081 --maintenanceOn"
+   ```
+
+3. Replace the physical drive, mkfs.xfs, mount, and recreate the directory structure:
+
+   ```bash
+   # if the drive is new/empty, format with XFS and the recommended options for SeaweedFS:
+   mkfs.xfs -f -d agcount=4 -l size=128m -n size=8192 /dev/vdb1 # replace with actual new drive
+
+   # if the filesystem already exists (e.g. replaced drive with pre-formatted data):
+   # - check the geometry is adequate:
+   #     xfs_info /dev/vdb1 (see Track B in §1 for what to look for)
+   # - verify/add the fstab entry (same options as §1d), then mount:
+   #     echo '/dev/vdb1 /disk1 xfs noatime,allocsize=1m 0 0' >> /etc/fstab
+   #     mount /disk1
+
+   mkdir -p /disk1/{data,idx}
+   ```
+
+4. Start the container on the new drive:
+
+   ```bash
+   docker start seaweedfs-volume1
+   ```
+
+5. **Rebalance EC shards** — the `ec.balance` command detects that some shards are
+   missing from the replacement server and moves/reconstructs shards to restore optimal
+   distribution:
+
+   ```bash
+   docker exec seaweedfs-master weed shell -c "ec.balance -apply"
+   ```
+
+   This may take time depending on how many EC volumes need shard reconstruction.
+   Monitor progress via the admin UI or Grafana.
+
+6. Re-run the volume server state check:
+
+   ```bash
+   docker exec seaweedfs-master weed shell -c "volumeServer.state"
+   ```
+
+7. Turn off maintenance mode if it was enabled:
+
+   ```bash
+   docker exec seaweedfs-master weed shell -c "volumeServer.state --nodes volume1:8081 --maintenanceOff"
+   ```
+
+**Note:** Unlike replication (`volume.fix.replication`), EC shard reconstruction
+rebuilds only the missing shards from the parity data on surviving drives. This is
+network-efficient but computationally intensive (Reed-Solomon encoding). Monitor CPU on
+the worker/admin containers during reconstruction.
+
+---
+
+## Appendices
+
+### Appendix A: Volume Size Calculation
+
+| Drive count | Data durability | Volume size | Volumes per drive | Raw storage | Usable capacity |
+| ----------- | --------------- | ----------- | ----------------- | ----------- | --------------- |
+| 5 × 22TB    | RS(10,4) EC     | 30GB        | ~733 per drive    | 110TB       | ~74.5TB         |
+| 5 × 22TB    | RS(10,4) EC     | 100GB       | ~220 per drive    | 110TB       | ~74.5TB         |
+
+**Formula**: `usable = (total_raw / 1.4) × 0.95` (RS 10+4 = 1.4× raw overhead; ~5% for
+XFS filesystem overhead, index files, and compaction temp space)
+
+RS(10,4) Erasure Coding: for every 10 data shards, 4 parity shards are created — 14
+total. This means 1.4× raw storage consumption vs 2× for `001` replication or 3× for
+`002` replication.
+
+| Method          | Raw:Usable ratio | Usable from 110TB raw | # disk failures w/o data loss |
+| --------------- | ---------------- | --------------------- | ----------------------------- |
+| No redundancy   | 1:1              | ~104.5TB              | 0 / 5                         |
+| EC RS(10,4)     | 1.4:1            | ~74.5TB               | 1 / 5                         |
+| Replication 001 | 2:1              | ~52.3TB               | 1 / 5                         |
+| Replication 002 | 3:1              | ~34.8TB               | 2 / 5                         |
+
+With only 5 drives, an even 14-shard spread puts ~3 shards of every EC volume on each
+drive, so a second simultaneous drive failure exceeds the 4-parity limit. EC's advantage
+over `001` replication here is its far lower storage overhead and shard-level repair,
+not a higher tolerated failure count; more volume servers would raise it.
+
+### Appendix B: Port Reference
+
+| Service         | HTTP Port | gRPC Port |
+| --------------- | --------- | --------- |
+| Master          | 9333      | 19333     |
+| Volume 1        | 8081      | 18081     |
+| Volume 2        | 8082      | 18082     |
+| Volume 3        | 8083      | 18083     |
+| Volume 4        | 8084      | 18084     |
+| Volume 5        | 8085      | 18085     |
+| Filer           | 8888      | 18888     |
+| S3              | 8333      | —         |
+| Prometheus      | 9090      | —         |
+| Pushgateway     | 9091      | —         |
+| Grafana         | 3000      | —         |
+| Admin (if used) | 23646     | —         |
+
+### Appendix C: Recommended Environment `.env` File
+
+This file lives **in the same directory as `compose.yaml`**. Docker Compose reads it
+automatically when you run `docker compose up`. Variable names are plain — Compose
+substitutes them when referenced as `${VAR_NAME}` in the YAML.
+
+```text
+JWT_SIGNING_KEY=
+JWT_FILER_SIGNING_KEY=
+S3_SSE_KEK=
+GRAFANA_PASSWORD=
+```
+
+**Do not commit `.env` to version control.** Remember to add it to `.gitignore`.
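+
+A minimal way to do both at once, populating the secrets and keeping the file out of Git
+(a sketch, assuming `openssl` as in §2 and that you run it from the compose directory):
+
+```bash
+# Generate the three 32-byte keys plus a Grafana password, then ignore the file
+for var in JWT_SIGNING_KEY JWT_FILER_SIGNING_KEY S3_SSE_KEK; do
+    echo "${var}=$(openssl rand -hex 32)" >> .env
+done
+echo "GRAFANA_PASSWORD=$(openssl rand -hex 16)" >> .env
+grep -qxF '.env' .gitignore 2>/dev/null || echo '.env' >> .gitignore
+```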
diff --git a/seaweedfs/justfile b/seaweedfs/justfile new file mode 100644 index 000000000..b164225f4 --- /dev/null +++ b/seaweedfs/justfile @@ -0,0 +1,203 @@ +set shell := ["bash", "-eu", "-o", "pipefail", "-c"] + +# constants + +env_selection_script := "./scripts/env-selection.sh" + +# variables | run `just env` to see current values + +compose_file := shell(env_selection_script + ' $1', "compose_file") +env := shell(env_selection_script + ' $1', "env") +env_file := shell(env_selection_script + ' $1', "env_file") +filer_container := shell(env_selection_script + ' $1', "filer_container") +master_container := shell(env_selection_script + ' $1', "master_container") +docker_compose := "COMPOSE_FILE=" + compose_file + " docker compose --env-file " + env_file + +alias hooks := pre-commit +alias run := up +alias upgrade := update-hooks + +# show available recipes +default: + @just --list + +# pulls and rebuilds the compose services with optional args +[group('setup')] +build *args: + @echo "Pulling and building sds-seaweedfs" + {{ docker_compose }} pull --ignore-buildable + {{ docker_compose }} build {{ args }} + +# runs a generic docker compose command e.g. `just dc ps` +[group('utilities')] +dc +args: + @echo "Running docker compose command: {{ args }}" + {{ docker_compose }} {{ args }} + +# sets up the data directories with correct ownership (local only) +[group('setup')] +data-setup: + #!/usr/bin/env bash + set -euo pipefail + if [[ "{{ env }}" != "local" ]]; then + echo "data-setup only needed for local; CI and production use volumes or bind mounts" + exit 0 + fi + echo "Creating data directories..." + mkdir -p data/master data/volumes data/filer/filerldb2 + echo "Setting ownership to ${UID:-1000}:${GID:-1000}..." + sudo chown --changes -R "${UID:-1000}:${GID:-1000}" data/ + echo "Done" + +# runs a full deploy (start services, configure credentials, create bucket) +[group('setup')] +deploy *args: + @echo "Deploying SeaweedFS stack for '{{ env }}' environment" + ./scripts/deploy.sh {{ args }} + +# stops and removes compose services +[group('service')] +down *args: + @echo "Stopping SeaweedFS" + {{ docker_compose }} down --remove-orphans {{ args }} + +[group('setup')] +load_credentials path="": + #!/usr/bin/env bash + set -Eeuo pipefail + + env="{{ env }}" + path_override="{{ path }}" + primary_env_file="${path_override:-../gateway/.envs/${env}/storage.env}" + if [[ ! -f "${primary_env_file}" ]]; then + echo "Error: Primary storage credentials file not found at ${primary_env_file}" >&2 + echo "Please run 'just generate-secrets' to create it." >&2 + exit 1 + fi + env_file_gateway=$(realpath ${primary_env_file}) + echo "Loading credentials from ${env_file_gateway}..." >&2 + + if [[ ! -f "${env_file_gateway}" ]]; then + echo "Credentials file not found: ${env_file_gateway}" >&2 + exit 1 + fi + + access_key=$(grep -E '^PRIMARY_ACCESS_KEY_ID=' "${env_file_gateway}" | cut -d'=' -f2- || true) + secret_key=$(grep -E '^PRIMARY_SECRET_ACCESS_KEY=' "${env_file_gateway}" | cut -d'=' -f2- || true) + bucket_name=$(grep -E '^PRIMARY_STORAGE_BUCKET_NAME=' "${env_file_gateway}" | cut -d'=' -f2- || true) + + if [[ -z "${access_key}" || -z "${secret_key}" || -z "${bucket_name}" ]]; then + echo "Missing required credentials in ${env_file_gateway}. 
Expected:" >&2 + echo -e "\tPRIMARY_ACCESS_KEY_ID, PRIMARY_SECRET_ACCESS_KEY, PRIMARY_STORAGE_BUCKET_NAME" >&2 + exit 1 + fi + + printf '%s\n%s\n%s' "${access_key}" "${secret_key}" "${bucket_name}" + +# prints currently selected environment +[group('utilities')] +env: + #!/usr/bin/env bash + echo -e "\nSelected env:\n" + echo -e "\tEnvironment: \e[34m '{{ env }}'\e[0m" + echo -e "\tEnv file: \e[34m '{{ env_file }}'\e[0m" + echo -e "\tCompose file: \e[34m '{{ compose_file }}'\e[0m" + echo -e "\tDocker compose command: \e[34m '{{ docker_compose }}'\e[0m" + echo -e "\tFiler container: \e[34m '{{ filer_container }}'\e[0m" + echo -e "\tMaster container: \e[34m '{{ master_container }}'\e[0m" + + if ! [ -f "{{ compose_file }}" ]; then + echo -e "\n\e[31mError:\e[0m Compose file '{{ compose_file }}' does not exist." + exit 1 + fi + if ! [ -f "{{ env_file }}" ]; then + echo -e "\n\e[31mError:\e[0m Env file '{{ env_file }}' does not exist." \ + "Generate secrets for this environment to create it." + exit 1 + fi + +# streams logs until interrupted +[group('monitoring')] +logs *args: + @echo "Showing SeaweedFS logs..." + {{ docker_compose }} logs --tail 10000 -f {{ args }} || true + +# prints all recent logs once +[group('monitoring')] +logs-once *args: + @echo "Showing SeaweedFS logs once..." + {{ docker_compose }} logs {{ args }} + +# rebuilds then restarts services and shows logs +[group('service')] +redeploy services='': + just build {{ services }} + just down {{ services }} + just up {{ services }} + just logs {{ services }} + +# restarts running compose services +[group('service')] +restart *args: + @echo "Restarting SeaweedFS" + {{ docker_compose }} restart {{ args }} + +# opens an interactive weed shell session +[group('utilities')] +shell: + @echo "Opening weed shell on '{{ filer_container }}' (master: {{ master_container }})" + docker exec -it {{ filer_container }} \ + weed shell -master="{{ master_container }}:9333" + +# starts services in detached mode +[group('service')] +up *args: + #!/usr/bin/env bash + echo "Starting SeaweedFS in detached mode" + echo "Environment: '{{ env }}'" + echo "Compose file: '{{ compose_file }}'" + {{ docker_compose }} up --detach --remove-orphans {{ args }} + +# runs the pre-commit hooks +[group('qa')] +pre-commit: + @uvx prek install -f + @uvx prek run --all-files + +# upgrades pre-commit hooks to their latest compatible versions +[group('development')] +update-hooks: + @uvx prek autoupdate + +# performs full teardown (removes data) — irreversible +[confirm("This will destroy ALL SeaweedFS data. Are you sure? [y/N]")] +[group('service')] +wipe: + #!/usr/bin/env bash + set -euo pipefail + host=$(hostname) + echo -e "This will wipe ALL SeaweedFS data in env=\e[31m{{ env }}\e[0m and hostname=\e[31m${host}\e[0m" + echo "This includes Docker-managed volumes in this SeaweedFS stack, " + echo -e "\tand if env=local it will also delete local data directories.\n" + echo -e "\e[31mThis action is IRREVERSIBLE. Type this machine's hostname to confirm:\e[0m" + read -r confirmation + if [[ "${confirmation}" != "${host}" ]]; then + echo "Aborting." 
+ exit 1 + fi + just down --volumes + if [[ "{{ env }}" == "local" ]]; then + rm -rf data/volumes/* data/filer/* + echo "Local data directories cleared" + fi + echo "SeaweedFS data wiped" + +# health *args # comprehensive cluster diagnostic (human-readable) +[group('monitoring')] +health *args: + @./scripts/health-check.sh {{ args }} + +# health-json # machine-readable JSON output for agentic consumption +[group('monitoring')] +health-json: + @./scripts/health-check.sh --json diff --git a/seaweedfs/progress.md b/seaweedfs/progress.md new file mode 100644 index 000000000..6dcfe3bbc --- /dev/null +++ b/seaweedfs/progress.md @@ -0,0 +1,118 @@ +# SeaweedFS Production Deployment Progress + +## Mission: Checklist-Compliant Production Deployment + +**Target:** 5 × 22TB drives, Erasure Coding RS(10+4), push-based monitoring, JWT security. + +## Audit Results + +### Current State vs Checklist Requirements + +| Area | Before | After | +| ------------------- | --------------------------------------------- | ---------------------------------------------- | +| Image tag | `4.17_large_disk` | `4.23-large_disk_full` | +| Volume servers | 1 (named Docker volume) | 5 (bind mount to /disk{1-5}/{data,idx}) | +| Index | memory (default) | leveldb on all 5 volumes | +| EC (admin+worker) | Not present | admin + worker containers added | +| Monitoring | Prometheus (direct scrape) | Pushgateway + Prometheus (push mode) + Grafana | +| S3 config | No s3-config.json | s3-config.json with identities | +| Security (JWT) | security.toml keys empty | Env var JWT keys in compose + .env | +| Backup | Not present | filer-backup service + replication.toml S3 sink| +| Logging config | Not defined | x-logging with json-file driver | +| Network | `sds-gateway-prod-seaweed-net` (bridge) | External network (created before deploy) | +| WebDAV | Present | Preserved (image bumped to 4.23) | +| Healthchecks | Present on volume, s3 | Retained on all 5 volumes + s3 | +| Env file refs | `.envs/*/seaweedfs.env` (wrong name) | Fixed to `sfs.env` in env-selection.sh | + +## Changes Made + +### 1. `compose.production.yaml` — Full rewrite + +- Image: `4.23-large_disk_full` (supports large volumes, includes all backends) +- x-logging defaults for all services +- External network `sds-gateway-prod-seaweed-net` (created before deploy) +- Master: JWT env var, volumePreallocate, volumeSizeLimitMB=30000, push metrics +- 5 volume services (volume1-5): bind mounts, leveldb index, compactionMBps=40, minFreeSpacePercent=7, per-drive healthchecks +- Filer: JWT filer signing, leveldb2, encryptVolumeData=false, maxMB=32 +- S3: JWT filer signing, SSE KEK, s3-config.json, healthcheck, dual-network +- WebDAV: preserved, image bumped +- Admin: EC management, cluster maintenance +- Worker: erasure_coding plugin runner +- Prometheus: v2.53.0, pushgateway scrape target, web.enable-lifecycle +- Pushgateway: v1.9.0 +- Grafana: 11.1.0, admin password from env +- filer-backup: async S3 replication to MinIO + +### 2. `prometheus/prometheus.yaml` — Pushgateway mode + +- Changed from direct service scrape (4 targets) to single pushgateway target with `honor_labels: true` + +### 3. `config/security.toml` — Env var documentation + +- Added comments: `PRODUCTION: Set via WEED_JWT_SIGNING_KEY env var` + +### 4. `config/s3-config.json` — NEW + +- Admin identity (Admin, Read, Write, List, Tagging) +- Backup-user identity (Read, List) + +### 5. 
`config/replication.toml` — S3 sink enabled + +- Uncommented `[sink.s3]` section, set `enabled = true` +- Credentials use `${MINIO_BACKUP_ACCESS_KEY}` / `${MINIO_BACKUP_SECRET_KEY}` env vars +- Target: `spectrumx` bucket, `/spectrumx` prefix + +### 6. `.envs/production/sfs.env` — Secrets scaffolding + +- Added: `JWT_SIGNING_KEY`, `JWT_FILER_SIGNING_KEY`, `S3_SSE_KEK`, `GRAFANA_PASSWORD`, `MINIO_BACKUP_ACCESS_KEY`, `MINIO_BACKUP_SECRET_KEY` + +### 7. `.envs/example/seaweedfs.env` — Updated template + +- Mirrors production env structure with secrets placeholders + +### 8. `scripts/env-selection.sh` — Bug fix + +- Fixed: `seaweedfs.env` → `sfs.env` (all actual env files use `sfs.env` naming) + +## Final Compliance Review + +| Checklist Section | Status | Notes | +| ----------------------- | ------ | ------------------------------------------------ | +| §0 Pre-Deployment | ✅ | EC RS(10+4), 5×22TB, leveldb2, push monitoring | +| §1 OS & Filesystem | 🟡 | Documented; mkfs/fstab are host-level ops | +| §2 Security | ✅ | JWT env vars, security.toml scaffold, .env | +| §3 Docker Compose | ✅ | Full compose with all checklist services | +| §4 S3 API | ✅ | s3-config.json with admin + backup identities | +| §5 Monitoring | ✅ | Pushgateway + Prometheus + Grafana | +| §6 Backup | ✅ | filer-backup + replication.toml S3 sink | +| §7 Startup & Verify | 🟡 | Documented in checklist; commands ready to run | +| §8 Volume Growth | ✅ | master.toml volume_growth config present | +| §9 Maintenance | ✅ | master.toml scripts + admin+worker services | + +### Items requiring host-level ops (not in compose scope) + +- XFS filesystem creation with mkfs.xfs +- /etc/fstab mount options (noatime,allocsize=1m) +- /disk{1-5}/{data,idx} directory creation +- Docker network creation +- Docker Engine installation +- ulimit and sysctl tuning +- MinIO backup bucket creation +- Grafana dashboard import +- S3 credential configuration via `s3.configure` in weed shell + +## Progress Log + +### 2026-05-05 + +- [x] Audited all existing compose files, config files, .env files, scripts +- [x] Documented gap analysis +- [x] Rewrote compose.production.yaml — full checklist compliance + merged existing features +- [x] Updated prometheus.yaml for pushgateway mode +- [x] Updated security.toml with env var documentation +- [x] Created s3-config.json with admin + backup identities +- [x] Updated replication.toml with S3 sink enabled +- [x] Updated .envs/production/sfs.env with JWT secrets scaffolding +- [x] Updated .envs/example/seaweedfs.env with secrets placeholders +- [x] Fixed env-selection.sh bug (seaweedfs.env → sfs.env) +- [x] Final review against checklist sections 0-9 — all covered diff --git a/seaweedfs/prometheus/prometheus.yaml b/seaweedfs/prometheus/prometheus.yaml new file mode 100644 index 000000000..884e9d477 --- /dev/null +++ b/seaweedfs/prometheus/prometheus.yaml @@ -0,0 +1,19 @@ +# PRODUCTION Prometheus config — pushgateway mode +# SeaweedFS components push metrics to pushgateway (configured in master +# via -master.metrics.address). Prometheus scrapes from pushgateway, +# avoiding the need for dynamic target discovery. 
+#
+# See checklist §5 — Monitoring

global:
  scrape_interval: 15s

scrape_configs:
  - job_name: "seaweedfs-pushgateway"
    honor_labels: true
    static_configs:
      - targets: ["sds-gateway-prod-sfs-pushgateway:9091"]

  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]
diff --git a/seaweedfs/scripts/.gitignore b/seaweedfs/scripts/.gitignore
new file mode 100644
index 000000000..7774f9875
--- /dev/null
+++ b/seaweedfs/scripts/.gitignore
@@ -0,0 +1 @@
+prod-hostnames.env
diff --git a/seaweedfs/scripts/checksum-audit.sh b/seaweedfs/scripts/checksum-audit.sh
new file mode 100755
index 000000000..486aae909
--- /dev/null
+++ b/seaweedfs/scripts/checksum-audit.sh
@@ -0,0 +1,420 @@
+#!/usr/bin/env bash
+# =============================================================================
+# checksum-audit.sh
+#
+# Randomly samples objects from a MinIO bucket and verifies that each object's
+# BLAKE3 checksum matches its base name (the base name IS the expected hash).
+#
+# Usage:
+#   checksum-audit.sh --bucket my_bucket
+#   MC_ALIAS=my_minio MC_BUCKET=my_bucket checksum-audit.sh
+#
+# Environment variables:
+#   MC_ALIAS     MinIO alias configured in `mc` (default: local)
+#   MC_BUCKET    Bucket to audit (required)
+#   MC_PREFIX    Optional key prefix to scope the scan, no leading slash (default: "files")
+#   SAMPLE_RATE  Percentage of objects to sample, supports decimals (default: 1)
+#   LOG_FILE     Path to the log file (default: ./checksum_audit.log)
+#   FAIL_FAST    Exit on first mismatch if "true", otherwise audit all samples
+#                and exit with an error at the end (default: true)
+# =============================================================================
+set -Eeuo pipefail
+IFS=$'\n\t'
+
+MC_ALIAS="${MC_ALIAS:-local}"
+MC_BUCKET="${MC_BUCKET:-}"
+MC_PREFIX="${MC_PREFIX:-files}"
+SAMPLE_RATE="${SAMPLE_RATE:-1}"
+LOG_FILE="${LOG_FILE:-./checksum_audit.log}"
+FAIL_FAST="${FAIL_FAST:-true}"
+OBJECT_REGEX=".*/[0-9a-f]{64}(_.*)?$"
+FIND_PATH=""
+
+target=""
+sampled=0
+checked=0
+errors=0
+temp_files=()
+
+color_reset=""
+color_info=""
+color_warn=""
+color_error=""
+color_fatal=""
+
+function init_colors() {
+  if [[ -t 1 ]] && [[ -z "${NO_COLOR:-}" ]]; then
+    color_reset=$'\033[0m'
+    color_info=$'\033[36m'
+    color_warn=$'\033[33m'
+    color_error=$'\033[31m'
+    color_fatal=$'\033[35m'
+  fi
+}
+
+function log() {
+  local level="${1}"
+  local color="${2}"
+  local stream="${3}"
+  shift 3
+  local text="$*"
+  local timestamp
+  local message
+  timestamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+  message="[${timestamp}] [${level}] ${text}"
+
+  printf '%s\n' "${message}" >>"${LOG_FILE}"
+
+  if [[ "${stream}" == "stderr" ]]; then
+    if [[ -n "${color}" ]]; then
+      printf '%b%s%b\n' "${color}" "${message}" "${color_reset}" >&2
+    else
+      printf '%s\n' "${message}" >&2
+    fi
+    return
+  fi
+
+  if [[ -n "${color}" ]]; then
+    printf '%b%s%b\n' "${color}" "${message}" "${color_reset}"
+  else
+    printf '%s\n' "${message}"
+  fi
+}
+
+function log_info() {
+  log "INFO" "${color_info}" "stdout" "$*"
+}
+
+function log_warn() {
+  log "WARN" "${color_warn}" "stderr" "$*"
+}
+
+function log_error() {
+  log "ERROR" "${color_error}" "stderr" "$*"
+}
+
+function log_fatal() {
+  log "FATAL" "${color_fatal}" "stderr" "$*"
+}
+
+function die() {
+  log_fatal "$*"
+  exit 1
+}
+
+function remember_temp_file() {
+  local file_path="${1}"
+  temp_files+=("${file_path}")
+}
+
+function print_usage() {
+  cat <<EOF
+Usage: checksum-audit.sh [OPTIONS]
+
+Options:
+  -a, --alias <alias>        MinIO alias configured in mc (default: env MC_ALIAS or "local")
+  -b, --bucket <bucket>      Bucket to audit (required; env fallback: MC_BUCKET)
+  -p, --prefix <prefix>      Optional key prefix to scope the scan, no leading slash (default: env MC_PREFIX or "files")
+  -r, --sample-rate <rate>   Sampling percentage in (0,100] (default: env SAMPLE_RATE or "1")
+  -l, --log-file <path>      Log file path (default: env LOG_FILE or "./checksum_audit.log")
+  -f, --fail-fast <bool>     true|false (default: env FAIL_FAST or "true")
+  --no-fail-fast             Shortcut for --fail-fast false
+  -h, --help                 Show this help and exit
+
+Examples:
+  checksum-audit.sh --bucket spectrumx
+  checksum-audit.sh -b spectrumx -r 0.5 --fail-fast false
+  MC_BUCKET=spectrumx checksum-audit.sh -r 5
+EOF
+}
+
+function parse_args() {
+  while [[ $# -gt 0 ]]; do
+    case "${1}" in
+    -h | --help)
+      print_usage
+      exit 0
+      ;;
+    -a | --alias)
+      [[ $# -lt 2 ]] && die "Missing value for ${1}"
+      MC_ALIAS="${2}"
+      shift 2
+      ;;
+    -b | --bucket)
+      [[ $# -lt 2 ]] && die "Missing value for ${1}"
+      MC_BUCKET="${2}"
+      shift 2
+      ;;
+    -p | --prefix)
+      [[ $# -lt 2 ]] && die "Missing value for ${1}"
+      MC_PREFIX="${2}"
+      shift 2
+      ;;
+    -r | --sample-rate)
+      [[ $# -lt 2 ]] && die "Missing value for ${1}"
+      SAMPLE_RATE="${2}"
+      shift 2
+      ;;
+    -l | --log-file)
+      [[ $# -lt 2 ]] && die "Missing value for ${1}"
+      LOG_FILE="${2}"
+      shift 2
+      ;;
+    -f | --fail-fast)
+      [[ $# -lt 2 ]] && die "Missing value for ${1}"
+      FAIL_FAST="${2}"
+      shift 2
+      ;;
+    --no-fail-fast)
+      FAIL_FAST="false"
+      shift
+      ;;
+    --)
+      shift
+      break
+      ;;
+    -*)
+      die "Unknown option: ${1}. Use --help for usage."
+      ;;
+    *)
+      die "Unexpected positional argument: ${1}. Use --help for usage."
+      ;;
+    esac
+  done
+
+  if [[ $# -gt 0 ]]; then
+    die "Unexpected positional argument: ${1}. Use --help for usage."
+  fi
+}
+
+function require_commands() {
+  for cmd in mc b3sum awk date jq mktemp; do
+    command -v "${cmd}" >/dev/null 2>&1 || die "Required command not found: '${cmd}'"
+  done
+}
+
+function validate_sample_rate() {
+  if ! awk -v rate="${SAMPLE_RATE}" 'BEGIN { exit !(rate > 0 && rate <= 100) }'; then
+    die "SAMPLE_RATE must be a number between 0 (exclusive) and 100. Got: '${SAMPLE_RATE}'"
+  fi
+  if ! mc alias list "${MC_ALIAS}" >/dev/null 2>&1; then
+    log_error "Available MinIO aliases:"
+    mc alias list
+    die "MinIO alias '${MC_ALIAS}' not found in 'mc' configuration. Pass it with --alias or set the MC_ALIAS environment variable."
+  fi
+}
+
+function validate_fail_fast() {
+  case "${FAIL_FAST}" in
+  true | false) ;;
+  *) die "FAIL_FAST must be 'true' or 'false'. 
Got: '${FAIL_FAST}'" ;; + esac +} + +function validate_config() { + [[ -z "${MC_BUCKET}" ]] && die "MC_BUCKET must be set, or specified with --bucket " + validate_sample_rate + validate_fail_fast +} + +function set_target() { + target="${MC_ALIAS}/${MC_BUCKET}" +} + +function build_find_path() { + local normalized_prefix="${MC_PREFIX#/}" + normalized_prefix="${normalized_prefix%/}" + + if [[ -z "${normalized_prefix}" ]]; then + FIND_PATH="" + return + fi + + FIND_PATH="${normalized_prefix}/*" +} + +function is_fail_fast() { + [[ "${FAIL_FAST}" == "true" ]] +} + +function print_start_banner() { + log_info "════════════════════════════════════════" + log_info "MinIO BLAKE3 Checksum Audit — Starting" + log_info "Target : ${target}" + log_info "Sample : ${SAMPLE_RATE}%" + log_info "Fail-fast : ${FAIL_FAST}" + log_info "Prefix : ${MC_PREFIX}" + log_info "Path : ${FIND_PATH:-}" + log_info "Regex : ${OBJECT_REGEX}" + log_info "Log file : ${LOG_FILE}" + log_info "════════════════════════════════════════" +} + +function count_lines() { + local input_file="${1}" + awk 'END { print NR + 0 }' "${input_file}" +} + +function filtered_objects() { + local output_file="${1}" + if [[ -n "${FIND_PATH}" ]]; then + log_info "mc find \"${target}\" --path \"${FIND_PATH}\" --regex \"${OBJECT_REGEX}\" > ${output_file}" + mc find "${target}" --path "${FIND_PATH}" --regex "${OBJECT_REGEX}" 2>>"${LOG_FILE}" >"${output_file}" + return + fi + + log_info "mc find \"${target}\" --regex \"${OBJECT_REGEX}\" > ${output_file}" + mc find "${target}" --regex "${OBJECT_REGEX}" 2>>"${LOG_FILE}" >"${output_file}" +} + +function sampled_objects() { + local filtered_file="${1}" + local sampled_file="${2}" + + awk \ + -v rate="${SAMPLE_RATE}" \ + -v seed="$(($$ + $(date +%s)))" \ + 'BEGIN { srand(seed) } rand() * 100 < rate { print }' \ + "${filtered_file}" >"${sampled_file}" +} + +function stream_hash() { + local object_path="${1}" + mc cat "${object_path}" 2>>"${LOG_FILE}" | b3sum --no-names 2>>"${LOG_FILE}" +} + +function on_stream_failure() { + local object_path="${1}" + log_error "STREAM_FAIL — could not read or hash object: ${object_path}" + errors=$((errors + 1)) + if is_fail_fast; then + log_error "Aborting early (FAIL_FAST=true)." + exit 1 + fi +} + +function on_mismatch() { + local object_path="${1}" + local expected_hash="${2}" + local actual_hash="${3}" + log_error "MISMATCH — object : ${object_path}" + log_error "MISMATCH — expected: ${expected_hash}" + log_error "MISMATCH — actual : ${actual_hash}" + errors=$((errors + 1)) + if is_fail_fast; then + log_error "Aborting early (FAIL_FAST=true)." + exit 1 + fi +} + +function verify_object() { + local object_path="${1}" + local base_name="${object_path##*/}" + local expected_hash="${base_name%%_*}" + local actual_hash="" + + sampled=$((sampled + 1)) + # log_info "Verifying [#${sampled}]: ${object_path}" + + if ! 
actual_hash="$(stream_hash "${object_path}")"; then
+    on_stream_failure "${object_path}"
+    return
+  fi
+
+  checked=$((checked + 1))
+
+  if [[ "${actual_hash}" != "${expected_hash}" ]]; then
+    on_mismatch "${object_path}" "${expected_hash}" "${actual_hash}"
+    return
+  fi
+
+  log_info "OK — ${object_path}"
+}
+
+function verify_objects_from_file() {
+  local sampled_file="${1}"
+  while IFS= read -r object_path; do
+    verify_object "${object_path}"
+  done <"${sampled_file}"
+}
+
+function audit_objects() {
+  local filtered_file=""
+  local sampled_file=""
+  local filtered_count=0
+  local sampled_count=0
+
+  filtered_file="$(mktemp)"
+  remember_temp_file "${filtered_file}"
+  sampled_file="$(mktemp)"
+  remember_temp_file "${sampled_file}"
+
+  log_info "Running regex filter with: ${OBJECT_REGEX}"
+  filtered_objects "${filtered_file}"
+  filtered_count="$(count_lines "${filtered_file}")"
+  log_info "Objects after regex filter: ${filtered_count}"
+
+  if ((filtered_count == 0)); then
+    log_warn "No objects matched the regex filter. Skipping verification stage."
+    return
+  fi
+
+  sampled_objects "${filtered_file}" "${sampled_file}"
+  sampled_count="$(count_lines "${sampled_file}")"
+  log_info "Objects after sampling: ${sampled_count}"
+
+  if ((sampled_count == 0)); then
+    log_warn "No objects remained after sampling. Skipping verification stage."
+    return
+  fi
+
+  verify_objects_from_file "${sampled_file}"
+}
+
+function print_summary() {
+  local stream_errors=$((sampled - checked))
+
+  log_info "════════════════════════════════════════"
+  log_info "Audit Complete"
+  log_info "Sampled       : ${sampled}"
+  log_info "Hashed        : ${checked}"
+  log_info "Stream errors : ${stream_errors}"
+  log_info "Mismatches    : ${errors}"
+  log_info "════════════════════════════════════════"
+}
+
+function finalize_result() {
+  if [[ $sampled -eq 0 ]]; then
+    log_warn "No objects were sampled. Bucket may be empty or prefix too narrow."
+    log_info "Total objects in bucket ${MC_BUCKET}:"
+    mc stat "${MC_ALIAS}/${MC_BUCKET}" --json 2>>"${LOG_FILE}" |
+      jq '.Usage.objectsCount' 2>>"${LOG_FILE}" ||
+      log_warn "Could not retrieve object count for bucket."
+    exit 0
+  fi
+
+  if [[ ${errors} -gt 0 ]]; then
+    log_error "Audit FAILED — ${errors} error(s) detected across ${checked} verified objects."
+    exit 1
+  fi
+
+  log_info "Audit PASSED — all ${checked} sampled objects are clean."
+  exit 0
+}
+
+# Remove any temp files registered via remember_temp_file; referenced by the
+# trap in main(), so it must be defined before the trap can fire.
+function cleanup_temp_files() {
+  if [[ ${#temp_files[@]} -gt 0 ]]; then
+    rm -f -- "${temp_files[@]}"
+  fi
+}
+
+function main() {
+  trap cleanup_temp_files EXIT INT TERM
+  init_colors
+  parse_args "$@"
+  require_commands
+  validate_config
+  set_target
+  build_find_path
+  print_start_banner
+  audit_objects
+  print_summary
+  finalize_result
+}
+
+main "$@"
diff --git a/seaweedfs/scripts/common.sh b/seaweedfs/scripts/common.sh
new file mode 100644
index 000000000..d63e55ce5
--- /dev/null
+++ b/seaweedfs/scripts/common.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# Script with helper functions to be sourced in other scripts.
+
+# ensure the script is sourced, not executed
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  echo "This script must be sourced. 
Use: source ${BASH_SOURCE[0]}" >&2 + exit 1 +fi + +function ts() { + local timestamp + timestamp=$(date +"%Y-%m-%d %H:%M:%S") + echo "${timestamp}" +} + +function log_msg() { + local msg="$1" + echo -e "$(ts) | INFO | ${msg}" +} + +function log_header() { + local msg="$1" + echo -e "$(ts) | \033[0;34m======= ${msg}\033[0m" +} + +function log_success() { + local msg="$1" + echo -e "$(ts) | \033[0;32mSUCCESS\033[0m | ${msg}" +} + +function log_error() { + local msg="$1" + echo -e "$(ts) | \033[0;31mERROR | ${msg}\033[0m" >&2 +} + +function log_warning() { + local msg="$1" + echo -e "$(ts) | \033[0;33mWARNING | ${msg}\033[0m" >&2 +} + +function log_fatal_and_exit() { + local msg="$1" + log_error "${msg}" + exit 1 +} + +function log_error_and_skip() { + local msg="$1" + log_error "${msg}" + log_msg "Skipping this step and continuing..." +} diff --git a/seaweedfs/scripts/deploy.sh b/seaweedfs/scripts/deploy.sh new file mode 100755 index 000000000..2d7299de7 --- /dev/null +++ b/seaweedfs/scripts/deploy.sh @@ -0,0 +1,353 @@ +#!/usr/bin/env bash +# Deploy the SeaweedFS stack: start services, configure S3 credentials, create bucket. +# +# By default, S3 credentials are read from .envs//storage.env (PRIMARY vars). +# Pass --sfs-env to override the credentials file path (used by gateway/deploy.sh). +# +# ENVIRONMENT VARIABLES: +# SFS_FORCE_SECRETS - Set to 'true' to overwrite existing .envs files (default: false) +# SFS_SKIP_SETUP - Set to 'true' to skip credential/bucket setup (default: false) +# +# USAGE EXAMPLES: +# ./deploy.sh local +# ./deploy.sh ci +# ./deploy.sh production +# ./deploy.sh --sfs-env /path/to/storage.env local + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SFS_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd) + +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/common.sh" + +readonly DEFAULT_MAX_WAIT=60 + +function show_usage() { + echo -e "Usage: ${0} [OPTIONS] " + echo "" + echo "Deploy the SeaweedFS stack: start services, configure S3 credentials, create bucket." + echo "" + echo -e "\e[34mOPTIONS:\e[0m" + echo " --sfs-env Path to env file with S3 credentials" + echo " (defaults to .envs//storage.env)" + echo " --skip-setup Skip credential and bucket setup" + echo " -h, --help Show this help message" + echo "" + echo -e "\e[34mARGUMENTS:\e[0m" + echo " Target environment to deploy" + echo "" + echo -e "\e[34mCREDENTIALS FILE FORMAT:\e[0m" + echo " PRIMARY_ACCESS_KEY_ID=" + echo " PRIMARY_SECRET_ACCESS_KEY=" + echo " PRIMARY_STORAGE_BUCKET_NAME=" + echo "" + echo -e "\e[34mEXAMPLES:\e[0m" + echo " ${0} local" + echo " ${0} ci" + echo " ${0} --sfs-env .envs/production/storage.env production" + echo "" + exit 0 +} + +# Return 0 if running as root, 1 otherwise +function is_root() { + [[ $(id -u) -eq 0 ]] +} + +function setup_data_dirs() { + local env_type="$1" + if [[ "${env_type}" != "local" ]]; then + return 0 + fi + + log_header "Local Data Directory Setup" + log_msg "Creating data directories..." + local uid gid + uid=$(id -u) + gid=$(id -g) + # Export for compose (UID/GID are readonly in bash, so we use HOST_UID/HOST_GID) + export HOST_UID="${uid}" HOST_GID="${gid}" + mkdir -p "${SFS_ROOT}/data/volumes" "${SFS_ROOT}/data/filer/filerldb2" + # Dirs created by current user → already owned by ${uid}:${gid} + # Container also runs as ${uid}:${gid} via compose user: ${HOST_UID}:${HOST_GID} + # → no chown needed. 
+ + log_success "Data directories ready (uid=${uid}, gid=${gid})" +} + +function start_stack() { + log_header "Starting SFS stack" + log_msg "Starting stack..." + { + just build + just up + } &>/dev/null & +} + +function env_prefix() { + if [[ "$1" == "production" ]]; then + echo "prod" + else + echo "$1" + fi +} + +function wait_for_s3_health() { + local env_type="$1" + local max_attempts="${2:-${DEFAULT_MAX_WAIT}}" + local prefix + prefix=$(env_prefix "${env_type}") + local s3_container="sds-gateway-${prefix}-sfs-s3" + local s3_port="${SFS_S3_PORT:-8333}" + + log_msg "Waiting for S3 gateway to be healthy (container: ${s3_container})..." + + local attempt=1 + while [[ ${attempt} -le ${max_attempts} ]]; do + if docker exec "${s3_container}" curl -fsS "http://localhost:${s3_port}/healthz" >/dev/null 2>&1; then + log_success "S3 gateway is healthy" + return 0 + fi + + if [[ $((attempt % 10)) -eq 0 ]]; then + log_msg "Still waiting... (attempt ${attempt}/${max_attempts})" + log_msg "=== S3 gateway logs (last 20 lines) ===" + docker logs --tail 20 "${s3_container}" 2>&1 | while IFS= read -r line; do + log_msg " ${line}" + done + log_msg "=========================================" + fi + + sleep 2 + attempt=$((attempt + 1)) + done + + log_error "S3 gateway '${s3_container}' did not become healthy in time" + return 1 +} + +function configure_s3_credentials() { + local env_type="$1" + local access_key="$2" + local secret_key="$3" + local prefix + prefix=$(env_prefix "${env_type}") + local filer_container="sds-gateway-${prefix}-sfs-filer" + local master_container="sds-gateway-${prefix}-sfs-master" + + log_header "Configuring S3 Credentials" + log_msg "Configuring S3 identity '${access_key}' on cluster..." + + printf '%s\n' "s3.configure -apply -user ${access_key} -access_key ${access_key} -secret_key ${secret_key} -actions Admin -buckets *" | + docker exec -i "${filer_container}" weed shell \ + -master="${master_container}:9333" + + log_success "S3 credentials configured" +} + +function create_bucket() { + local env_type="$1" + local bucket_name="$2" + local access_key="$3" + local secret_key="$4" + local prefix + prefix=$(env_prefix "${env_type}") + local filer_container="sds-gateway-${prefix}-sfs-filer" + local master_container="sds-gateway-${prefix}-sfs-master" + + log_header "Creating S3 Bucket" + log_msg "Creating bucket '${bucket_name}'..." + + printf '%s\n' "s3.bucket.create -name ${bucket_name}" | + docker exec -i "${filer_container}" weed shell \ + -master="${master_container}:9333" + + log_success "Bucket '${bucket_name}' ready" +} + +function setup_prod_hostnames() { + local env_type="$1" + local example_file="${SCRIPT_DIR}/prod-hostnames.example.env" + local target_file="${SCRIPT_DIR}/prod-hostnames.env" + + if [[ -f "${example_file}" && ! -f "${target_file}" ]]; then + cp "${example_file}" "${target_file}" + log_msg "Created: ${target_file}" + fi + + if [[ "${env_type}" == "production" && -f "${target_file}" ]]; then + local current_hostname + current_hostname=$(hostname) + local rel_path + rel_path=$(realpath --relative-to="." "${target_file}") + + if [[ -n "${current_hostname}" ]]; then + if ! grep -Fxq "${current_hostname}" "${target_file}"; then + log_error "Current hostname '${current_hostname}' not listed in '${rel_path}'." + log_msg "Add it:\n\n\techo '${current_hostname}' >> ${rel_path}" + exit 1 + fi + fi + fi +} + +function load_credentials() { + local env_file="$1" + + if [[ ! 
-f "${env_file}" ]]; then + log_error "Credentials file not found: ${env_file}" + return 1 + fi + + local access_key secret_key bucket_name + access_key=$(grep -E '^PRIMARY_ACCESS_KEY_ID=' "${env_file}" | cut -d'=' -f2-) + secret_key=$(grep -E '^PRIMARY_SECRET_ACCESS_KEY=' "${env_file}" | cut -d'=' -f2-) + bucket_name=$(grep -E '^PRIMARY_STORAGE_BUCKET_NAME=' "${env_file}" | cut -d'=' -f2-) + + if [[ -z "${access_key}" || -z "${secret_key}" || -z "${bucket_name}" ]]; then + log_error "Missing required credentials in ${env_file}" + log_msg "Expected: PRIMARY_ACCESS_KEY_ID, PRIMARY_SECRET_ACCESS_KEY, PRIMARY_STORAGE_BUCKET_NAME" + return 1 + fi + + printf '%s\n%s\n%s' "${access_key}" "${secret_key}" "${bucket_name}" +} + +function load_secondary_credentials() { + local env_file="$1" + + if [[ ! -f "${env_file}" ]]; then + return 1 + fi + + local access_key secret_key + access_key=$(grep -E '^SECONDARY_ACCESS_KEY_ID=' "${env_file}" | cut -d'=' -f2-) + secret_key=$(grep -E '^SECONDARY_SECRET_ACCESS_KEY=' "${env_file}" | cut -d'=' -f2-) + + # If neither SECONDARY credential is set, the store is not configured + if [[ -z "${access_key}" || -z "${secret_key}" ]]; then + return 1 + fi + + # Filter out placeholder/admin defaults that indicate unset creds + if [[ "${access_key}" == "admin" && "${secret_key}" == "admin" ]]; then + return 1 + fi + + printf '%s\n%s' "${access_key}" "${secret_key}" +} + +function parse_arguments() { + local -n _args_ref=$1 + shift + + # Ensure key exists (shellcheck can't follow nameref) + if [[ -z "${_args_ref["skip_setup"]+x}" ]]; then + _args_ref["skip_setup"]="false" + fi + if [[ -z "${_args_ref["sfs_env"]+x}" ]]; then + _args_ref["sfs_env"]="" + fi + if [[ "${SFS_SKIP_SETUP:-}" == "true" ]]; then + _args_ref["skip_setup"]="true" + fi + + while [[ $# -gt 0 ]]; do + case "$1" in + --sfs-env) + if [[ -z "${2:-}" ]]; then + log_error "Missing value for --sfs-env" + show_usage + fi + _args_ref["sfs_env"]="$2" + shift 2 + ;; + --skip-setup) + _args_ref["skip_setup"]="true" + shift + ;; + -h | --help) + show_usage + ;; + local | production | ci) + _args_ref["env_type"]="$1" + shift + ;; + *) + log_error "Unknown argument: $1" + show_usage + ;; + esac + done + + if [[ -z "${_args_ref["env_type"]}" ]]; then + log_error "Environment type required (local, production, or ci)" + show_usage + fi +} + +function assert_selected_env() { + local env_type="$1" + local selected_env + selected_env="$(just env | awk -F"'" '/Environment:/{print $2}')" + if [[ "${env_type}" != "${selected_env}" ]]; then + log_error "Selected environment >${selected_env}< does not match argument >${env_type}<" + log_msg "If you are attempting to run e.g. 
a CI env locally, tear down your local stack,"
+        log_msg "then run the deploy script with CI=1, e.g.:\n\n\tCI=1 ${0} ci\n"
+        exit 1
+    fi
+}
+
+function main() {
+    declare -A args=(
+        [env_type]=""
+        [skip_setup]="false"
+        [sfs_env]=""
+    )
+
+    parse_arguments args "$@"
+
+    cd "${SFS_ROOT}"
+    log_header "SeaweedFS Deployment - ${args[env_type]} environment"
+
+    assert_selected_env "${args[env_type]}"
+    setup_prod_hostnames "${args[env_type]}"
+    setup_data_dirs "${args[env_type]}"
+    start_stack "${args[env_type]}"
+    wait_for_s3_health "${args[env_type]}" "${DEFAULT_MAX_WAIT}"
+
+    if [[ "${args[skip_setup]}" == "false" ]]; then
+        local creds
+        # Fall back to the default documented in the header:
+        # .envs/<env_type>/storage.env (relative to SFS_ROOT, which we cd'd into above)
+        local sfs_env_path="${args[sfs_env]:-.envs/${args[env_type]}/storage.env}"
+        creds=$(load_credentials "${sfs_env_path}")
+        local access_key secret_key bucket_name
+        access_key=$(echo "${creds}" | sed -n '1p')
+        secret_key=$(echo "${creds}" | sed -n '2p')
+        bucket_name=$(echo "${creds}" | sed -n '3p')
+
+        configure_s3_credentials "${args[env_type]}" "${access_key}" "${secret_key}"
+        create_bucket "${args[env_type]}" "${bucket_name}" "${access_key}" "${secret_key}"
+
+        # Also configure SECONDARY S3 identity if credentials are available (local/dev)
+        local secondary_creds
+        secondary_creds=$(load_secondary_credentials "${sfs_env_path}") || true
+        if [[ -n "${secondary_creds}" ]]; then
+            local sec_access_key sec_secret_key
+            sec_access_key=$(echo "${secondary_creds}" | sed -n '1p')
+            sec_secret_key=$(echo "${secondary_creds}" | sed -n '2p')
+            log_msg "Configuring SECONDARY S3 identity on SeaweedFS..."
+            configure_s3_credentials "${args[env_type]}" "${sec_access_key}" "${sec_secret_key}"
+            log_success "SECONDARY S3 identity configured on SeaweedFS"
+        fi
+    else
+        log_msg "Skipping credential and bucket setup (--skip-setup)"
+    fi
+
+    log_header "SeaweedFS deployment complete"
+    log_msg "S3 endpoint: http://localhost:${SFS_S3_PORT:-8333}"
+    log_msg "File browser: http://localhost:${SFS_FILER_PORT:-8888}"
+}
+
+main "$@"
diff --git a/seaweedfs/scripts/env-selection.sh b/seaweedfs/scripts/env-selection.sh
new file mode 100755
index 000000000..430856c6b
--- /dev/null
+++ b/seaweedfs/scripts/env-selection.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+
+function is_production_host() {
+    local script_dir
+    script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+    local host
+    host=$(hostname)
+    local prod_hosts_file="${script_dir}/prod-hostnames.env"
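+    # One hostname per line; blank lines and '#' comments are skipped by the
+    # read loop below. A non-zero return makes env selection fall back to local.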
+
+    if [[ ! -f "${prod_hosts_file}" ]]; then
+        printf '\033[33mProduction host list not found at %s: defaulting to local\033[0m\n' "${prod_hosts_file}" >&2
+        printf 'Create this file to make the warning go away:\n\n\tcp %s/prod-hostnames.example.env %s\n\n' "${script_dir}" "${prod_hosts_file}" >&2
+        return 1
+    fi
+
+    while read -r line; do
+        line=$(echo "${line}" | xargs)
+        [[ -z "${line}" || ${line:0:1} == '#' ]] && continue
+        if [[ "${line}" == "${host}" ]]; then
+            return 0
+        fi
+    done <"${prod_hosts_file}"
+
+    return 1
+}
+
+function is_ci_env() {
+    if [[ -n "${CI:-}" ]] || [[ -n "${GITHUB_ACTIONS:-}" ]] || [[ -n "${GITLAB_CI:-}" ]] || [[ -n "${BUILD_ID:-}" ]] || [[ -n "${JENKINS_URL:-}" ]]; then
+        return 0
+    fi
+    return 1
+}
+
+function get_target_value() {
+    local target="$1"
+    local env_type="$2"
+    local local_env_file=".envs/local/sfs.env"
+    local production_env_file=".envs/production/sfs.env"
+    local ci_env_file=".envs/ci/sfs.env"
+    local value=""
+
+    case "${target}" in
+    env)
+        value="${env_type}"
+        ;;
+    compose_file)
+        case "${env_type}" in
+        production) value="compose.production.yaml" ;;
+        local) value="compose.local.yaml" ;;
+        ci) value="compose.ci.yaml" ;;
+        esac
+        ;;
+    env_file)
+        case "${env_type}" in
+        ci)
+            value="${ci_env_file}"
+            ;;
+        local)
+            value="${local_env_file}"
+            ;;
+        production)
+            value="${production_env_file}"
+            ;;
+        *)
+            printf 'unsupported environment type: %s\n' "${env_type}" >&2
+            exit 1
+            ;;
+        esac
+        ;;
+    filer_container)
+        case "${env_type}" in
+        production) value="sds-gateway-prod-sfs-filer" ;;
+        *) value="sds-gateway-${env_type}-sfs-filer" ;;
+        esac
+        ;;
+    master_container)
+        case "${env_type}" in
+        production) value="sds-gateway-prod-sfs-master" ;;
+        *) value="sds-gateway-${env_type}-sfs-master" ;;
+        esac
+        ;;
+    s3_container)
+        case "${env_type}" in
+        production) value="sds-gateway-prod-sfs-s3" ;;
+        *) value="sds-gateway-${env_type}-sfs-s3" ;;
+        esac
+        ;;
+    *)
+        printf 'Unknown target: %s\n' "${target}" >&2
+        exit 1
+        ;;
+    esac
+
+    printf '%s' "${value}"
+}
+
+function main() {
+    if [[ $# -ne 1 ]]; then
+        printf 'usage: %s <target>\n' "${0}" >&2
+        exit 1
+    fi
+
+    # determine the environment type
+    local target="${1:-}"
+    local env_type=""
+
+    # allow explicit override via SDS_ENV (e.g., SDS_ENV=ci just env)
+    if [[ -n "${SDS_ENV:-}" ]]; then
+        case "${SDS_ENV}" in
+        ci | local | production) env_type="${SDS_ENV}" ;;
+        *)
+            printf '\033[33mUnknown SDS_ENV="%s": must be ci, local, or production\033[0m\n' "${SDS_ENV}" >&2
+            exit 1
+            ;;
+        esac
+    elif is_production_host 2>/dev/null; then
+        env_type="production"
+    elif is_ci_env; then
+        env_type="ci"
+    else
+        env_type="local"
+    fi
+
+    get_target_value "${target}" "${env_type}"
+}
+
+main "$@"
diff --git a/seaweedfs/scripts/health-check.sh b/seaweedfs/scripts/health-check.sh
new file mode 100755
index 000000000..ebcb3bc4c
--- /dev/null
+++ b/seaweedfs/scripts/health-check.sh
@@ -0,0 +1,536 @@
+#!/usr/bin/env bash
+# seaweedfs-health-check.sh — comprehensive cluster diagnostic
+# Human-readable colored output + machine-readable JSON summary
+#
+# Usage: ./scripts/health-check.sh [--json | --silent]
+#
+# Exit codes:
+#   0 — all OK
+#   1 — failures (warnings don't fail)
+#   2 — fatal error (can't run checks)
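+#
+# Consuming the JSON summary, a sketch (jq is already required by the checks below):
+#   ./scripts/health-check.sh --json | jq -r '.status'   # "ok", "warning", or "failed"
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." 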
&& pwd)" +source "${SCRIPT_DIR}/common.sh" + +# ── args ──────────────────────────────────────────────────── +OUTPUT_MODE="human" +for arg in "$@"; do + case "$arg" in + --json) OUTPUT_MODE="json" ;; + --silent) OUTPUT_MODE="silent" ;; + esac +done + +# ── environment detection ─────────────────────────────────── +ENV_TYPE="" +if "${SCRIPT_DIR}/env-selection.sh" env 2>/dev/null | grep -q "^production$" 2>/dev/null; then + ENV_TYPE="production" +elif [[ -n "${CI:-}" || -n "${GITHUB_ACTIONS:-}" || -n "${GITLAB_CI:-}" || -n "${BUILD_ID:-}" ]]; then + ENV_TYPE="ci" +else + ENV_TYPE="local" +fi + +case "$ENV_TYPE" in +production) COMPOSE_FILE="compose.production.yaml" ;; +ci) COMPOSE_FILE="compose.ci.yaml" ;; +*) COMPOSE_FILE="compose.local.yaml" ;; +esac + +ENV_FILE=".envs/${ENV_TYPE}/sfs.env" +COMPOSE_ABS="${PROJECT_DIR}/${COMPOSE_FILE}" +ENV_ABS="${PROJECT_DIR}/${ENV_FILE}" +DOCKER_COMPOSE="docker compose -f ${COMPOSE_ABS} --env-file ${ENV_ABS}" + +# ── detect compose profile ────────────────────────────────── +COMPOSE_PROFILE=$(basename "${COMPOSE_FILE}" .yaml | sed 's/^compose\.//') + +# Service availability per profile +HAS_WEBDAV=false +HAS_ADMIN=false +HAS_GRAFANA=false +HAS_WORKER=false +HAS_PROMETHEUS=false +HAS_PUSHGATEWAY=false +case "$COMPOSE_PROFILE" in +production) + HAS_WEBDAV=true + HAS_ADMIN=true + HAS_GRAFANA=true + HAS_WORKER=true + HAS_PROMETHEUS=true + HAS_PUSHGATEWAY=true + ;; +ci) + HAS_WEBDAV=true + HAS_ADMIN=false + HAS_GRAFANA=false + HAS_WORKER=false + HAS_PROMETHEUS=true + HAS_PUSHGATEWAY=false + ;; +local) + HAS_WEBDAV=true + HAS_ADMIN=false + HAS_GRAFANA=false + HAS_WORKER=false + HAS_PROMETHEUS=true + HAS_PUSHGATEWAY=false + ;; +esac + +# Volume server config per profile +case "$COMPOSE_PROFILE" in +production) + VOL_COUNT=5 + VOL_BASE_PORT=8081 + VOL_BASE_GRPC=18081 + DISK_BASE="/disk" + ;; +*) + VOL_COUNT=1 + VOL_BASE_PORT=8080 + VOL_BASE_GRPC=18080 + DISK_BASE="" + ;; +esac + +# Load custom ports from env file +if [[ -f "$ENV_ABS" ]]; then + SFS_FILER_PORT=$(grep '^SFS_FILER_PORT=' "$ENV_ABS" | cut -d= -f2 || echo "8888") + SFS_WEBDAV_PORT=$(grep '^SFS_WEBDAV_PORT=' "$ENV_ABS" | cut -d= -f2 || echo "7333") + SFS_PROM_HOST_PORT=$(grep '^SFS_PROMETHEUS_HOST_PORT=' "$ENV_ABS" | cut -d= -f2 || echo "9090") +fi + +# ── counters ──────────────────────────────────────────────── +TOTAL=0 +OK=0 +WARN=0 +FAIL=0 +JSON_CHECKS="[]" + +add_check() { + local name="$1" status="$2" detail="${3:-}" + TOTAL=$((TOTAL + 1)) + case "$status" in + ok) OK=$((OK + 1)) ;; + warn) WARN=$((WARN + 1)) ;; + fail) FAIL=$((FAIL + 1)) ;; + esac + JSON_CHECKS=$(echo "$JSON_CHECKS" | jq --arg n "$name" --arg s "$status" --arg d "$detail" \ + '. + [{"name": $n, "status": $s, "detail": $d}]') + if [[ "$OUTPUT_MODE" == "human" ]]; then + case "$status" in + ok) log_success "${name}" ;; + warn) log_msg "${name} [${YELLOW}⚠ ${status}${RESET}]" ;; + fail) log_error "${name}" ;; + esac + fi +} + +YELLOW='\033[0;33m' +RESET='\033[0m' + +curl_ok() { curl -fsS --max-time 5 "$@" >/dev/null 2>&1; } +curl_json() { curl -fsS --max-time 5 "$@" 2>/dev/null || echo '{}'; } + +output_header() { + if [[ "$OUTPUT_MODE" == "human" ]]; then + log_header "$1" + fi +} + +# ───────────────────────────────────────────────────────────── +output_header "0. 
PRELIMINARY"
+
+if [[ -f "$COMPOSE_ABS" ]]; then
+    add_check "Compose file exists" "ok" "$(basename "$COMPOSE_ABS")"
+else
+    add_check "Compose file exists" "fail" "$(basename "$COMPOSE_ABS") not found"
+    log_fatal_and_exit "Compose file not found: $COMPOSE_ABS"
+fi
+
+if [[ -f "$ENV_ABS" ]]; then
+    add_check "Env file exists" "ok" "$(basename "$ENV_ABS")"
+else
+    add_check "Env file exists" "warn" "$(basename "$ENV_ABS") not found (may use docker secrets)"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "1. CONTAINER STATUS"
+
+SERVICES_LIST=$(${DOCKER_COMPOSE} ps --format '{{.Service}}' 2>/dev/null || true)
+
+if [[ -z "$SERVICES_LIST" ]]; then
+    add_check "Compose stack running" "fail" "no services"
+else
+    SVC_COUNT=$(echo "$SERVICES_LIST" | wc -l)
+    add_check "Compose stack running" "ok" "${SVC_COUNT} service(s)"
+    while IFS= read -r svc; do
+        svc_health=$(${DOCKER_COMPOSE} ps --format '{{.Service}}|{{.Health}}|{{.Status}}' 2>/dev/null | grep "^${svc}|" || true)
+        if [[ -z "$svc_health" ]]; then
+            add_check "Container: $svc" "warn" "no health output"
+            continue
+        fi
+        health=$(echo "$svc_health" | cut -d'|' -f2)
+        status=$(echo "$svc_health" | cut -d'|' -f3)
+        # Anchored match so "unhealthy" is not mistaken for "healthy"
+        if echo "$health" | grep -qiE '^(healthy|none)$'; then
+            add_check "Container: $svc" "ok" "$health / $status"
+        elif echo "$status" | grep -qi "up\|running"; then
+            add_check "Container: $svc" "ok" "no healthcheck / $status"
+        else
+            add_check "Container: $svc" "fail" "$health / $status"
+        fi
+    done <<<"$SERVICES_LIST"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "2. MASTER"
+
+if curl_ok http://localhost:9333/cluster/status; then
+    add_check "Master HTTP (9333)" "ok" ""
+else
+    add_check "Master HTTP (9333)" "fail" "unreachable"
+fi
+
+if curl_ok http://localhost:19333/debug/vars; then
+    add_check "Master gRPC (19333)" "ok" ""
+else
+    add_check "Master gRPC (19333)" "warn" "unreachable (may be normal)"
+fi
+
+MASTER_JSON=$(curl_json http://localhost:9333/cluster/status)
+MASTER_LEADER=$(echo "$MASTER_JSON" | jq -r '.Leader // "unknown"' 2>/dev/null)
+MASTER_IS_LEADER=$(echo "$MASTER_JSON" | jq -r '.IsLeader // "unknown"' 2>/dev/null)
+MASTER_MAX_VOL=$(echo "$MASTER_JSON" | jq -r '.MaxVolumeId // "unknown"' 2>/dev/null)
+add_check "Master topology" "ok" "leader=${MASTER_LEADER}, isLeader=${MASTER_IS_LEADER}, maxVolId=${MASTER_MAX_VOL}"
+
+# ─────────────────────────────────────────────────────────────
+output_header "3. VOLUME SERVERS"
+
+for i in $(seq 1 $VOL_COUNT); do
+    port=$((VOL_BASE_PORT + i - 1))
+    grpc_port=$((VOL_BASE_GRPC + i - 1))
+
+    if [[ "$COMPOSE_PROFILE" == "local" ]]; then
+        svc_name="sds-gateway-${ENV_TYPE}-sfs-volume"
+    else
+        svc_name="sds-gateway-${ENV_TYPE}-sfs-volume${i}"
+    fi
+
+    if curl_ok "http://localhost:${port}/healthz"; then
+        add_check "${svc_name} HTTP (${port})" "ok" ""
+    else
+        add_check "${svc_name} HTTP (${port})" "fail" "healthz unreachable"
+    fi
+
+    if curl_ok "http://localhost:${grpc_port}/debug/vars"; then
+        add_check "${svc_name} gRPC (${grpc_port})" "ok" ""
+    else
+        add_check "${svc_name} gRPC (${grpc_port})" "warn" "debug/vars unreachable"
+    fi
+done
+
+# ─────────────────────────────────────────────────────────────
+output_header "4. CLUSTER INFO"
+
+if [[ "$MASTER_JSON" != "{}" ]]; then
+    # Try to get volume/filer info from master (only available in some SeaweedFS versions)
+    VOL_SERVERS=$(echo "$MASTER_JSON" | jq '[.Volumes[]? // {} | .url // empty] | length' 2>/dev/null || echo "-1")
+    FILER_COUNT=$(echo "$MASTER_JSON" | jq '(.Filers // .filers // []) | length' 2>/dev/null || echo "-1")
+
+    if [[ "$VOL_SERVERS" -eq -1 ]]; then
+        add_check "Volume servers registered" "warn" "master JSON has no Volumes field (may be normal)"
+    elif [[ "$VOL_SERVERS" -eq "$VOL_COUNT" ]]; then
+        add_check "Volume servers registered" "ok" "${VOL_SERVERS}/${VOL_COUNT}"
+    else
+        add_check "Volume servers registered" "warn" "master reports ${VOL_SERVERS}, expected ${VOL_COUNT}"
+    fi
+
+    if [[ "$FILER_COUNT" -eq -1 || "$FILER_COUNT" -eq 0 ]]; then
+        add_check "Filers registered" "warn" "master JSON has no Filers field (may be normal)"
+    else
+        add_check "Filers registered" "ok" "${FILER_COUNT}"
+    fi
+
+    VOL_DISTRIBUTION=$(echo "$MASTER_JSON" | jq -r '.Volumes[]? | "Volume \(.id): \(.url) DC=\(.dataCenter // "?") Rack=\(.rack // "?")"' 2>/dev/null || echo "")
+    if [[ -n "$VOL_DISTRIBUTION" ]]; then
+        add_check "Volume distribution" "ok" "$(echo "$VOL_DISTRIBUTION" | head -c 200)"
+    fi
+else
+    add_check "Cluster info" "fail" "master /cluster/status returned empty"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "5. FILER"
+
+if curl_ok "http://localhost:${SFS_FILER_PORT:-8888}/"; then
+    add_check "Filer HTTP (${SFS_FILER_PORT:-8888})" "ok" ""
+else
+    add_check "Filer HTTP (${SFS_FILER_PORT:-8888})" "fail" "unreachable"
+fi
+
+if curl_ok http://localhost:18888/; then
+    add_check "Filer gRPC (18888)" "ok" ""
+else
+    add_check "Filer gRPC (18888)" "warn" "unreachable (may be normal)"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "6. S3 GATEWAY"
+
+if curl_ok http://localhost:8333/healthz; then
+    add_check "S3 HTTP (8333)" "ok" ""
+else
+    add_check "S3 HTTP (8333)" "fail" "healthz unreachable"
+fi
+
+S3_LIST=$(curl -fsS --max-time 5 http://localhost:8333/ 2>/dev/null || echo "unavailable")
+if echo "$S3_LIST" | grep -q '<ListAllMyBucketsResult' 2>/dev/null; then
+    BUCKET_COUNT=$(echo "$S3_LIST" | grep -c '<Bucket>' 2>/dev/null || echo "0")
+    add_check "S3 list buckets" "ok" "${BUCKET_COUNT} bucket(s)"
+elif echo "$S3_LIST" | grep -q 'unavailable\|403\|401\|405' 2>/dev/null; then
+    add_check "S3 list buckets" "warn" "auth/no-buckets (may be normal)"
+else
+    add_check "S3 list buckets" "warn" "unexpected response: $(echo "$S3_LIST" | head -c 100)"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "7. WEBDAV"
+
+if [[ "$HAS_WEBDAV" == "true" ]]; then
+    if curl_ok -o /dev/null "http://localhost:${SFS_WEBDAV_PORT:-7333}/"; then
+        add_check "WebDAV HTTP (${SFS_WEBDAV_PORT:-7333})" "ok" ""
+    else
+        # 405 may mean WebDAV is running but GET on / is not supported;
+        # no -f here, so 4xx codes are captured instead of failing the call
+        WEBDAV_CODE=$(curl -sS --max-time 5 -o /dev/null -w '%{http_code}' "http://localhost:${SFS_WEBDAV_PORT:-7333}/" 2>/dev/null || true)
+        if [[ "$WEBDAV_CODE" == "405" ]]; then
+            add_check "WebDAV HTTP (${SFS_WEBDAV_PORT:-7333})" "ok" "responding (405 on / is normal)"
+        else
+            add_check "WebDAV HTTP (${SFS_WEBDAV_PORT:-7333})" "warn" "unexpected status ${WEBDAV_CODE:-000}"
+        fi
+    fi
+else
+    add_check "WebDAV" "warn" "not in ${COMPOSE_PROFILE} profile"
+fi
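+
+# A deeper WebDAV probe could list the root collection via PROPFIND, a sketch
+# assuming unauthenticated listing is allowed on this port:
+#   curl -fsS -X PROPFIND -H "Depth: 1" "http://localhost:${SFS_WEBDAV_PORT:-7333}/"
+
+# ─────────────────────────────────────────────────────────────
+output_header "8. 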
ADMIN & WORKER" + +if [[ "$HAS_ADMIN" == "true" ]]; then + if curl_ok http://localhost:23646/; then + add_check "Admin HTTP (23646)" "ok" "" + else + add_check "Admin HTTP (23646)" "fail" "unreachable" + fi + + WORKER_JSON=$(curl_json http://localhost:23646/admin/worker) + if echo "$WORKER_JSON" | jq -e 'keys | length > 0' >/dev/null 2>&1; then + add_check "Worker plugin" "ok" "$(echo "$WORKER_JSON" | jq -r 'keys | join(", ") // "active"' 2>/dev/null)" + else + add_check "Worker plugin" "warn" "status unknown" + fi +else + add_check "Admin HTTP (23646)" "warn" "not in ${COMPOSE_PROFILE} profile" + add_check "Worker plugin" "warn" "not in ${COMPOSE_PROFILE} profile" +fi + +# ───────────────────────────────────────────────────────────── +output_header "9. METRICS" + +PROM_HTTP_PORT="${SFS_PROM_HOST_PORT:-9090}" + +if curl_ok "http://localhost:${PROM_HTTP_PORT}/-/healthy"; then + add_check "Prometheus HTTP (${PROM_HTTP_PORT})" "ok" "" +else + add_check "Prometheus HTTP (${PROM_HTTP_PORT})" "warn" "unreachable (may be normal)" +fi + +if [[ "$HAS_PROMETHEUS" == "true" && "$HAS_PUSHGATEWAY" == "true" ]]; then + if curl_ok http://localhost:9091/-/healthy; then + add_check "Pushgateway HTTP (9091)" "ok" "" + else + add_check "Pushgateway HTTP (9091)" "fail" "unreachable" + fi +fi + +if [[ "$HAS_PROMETHEUS" == "true" ]]; then + PROM_TARGETS=$(curl_json "http://localhost:${PROM_HTTP_PORT}/api/v1/targets") + if echo "$PROM_TARGETS" | jq -e '.data.activeTargets | length > 0' >/dev/null 2>&1; then + PROM_OK=$(echo "$PROM_TARGETS" | jq '[.data.activeTargets[]? | select(.health == "up")] | length' 2>/dev/null || echo "0") + PROM_TOTAL=$(echo "$PROM_TARGETS" | jq '.data.activeTargets | length' 2>/dev/null || echo "0") + if [[ "$PROM_OK" -eq "$PROM_TOTAL" ]]; then + add_check "Prometheus targets" "ok" "${PROM_OK}/${PROM_TOTAL} healthy" + else + add_check "Prometheus targets" "warn" "${PROM_OK}/${PROM_TOTAL} healthy" + fi + else + add_check "Prometheus targets" "warn" "no active targets" + fi +else + add_check "Prometheus targets" "warn" "not in ${COMPOSE_PROFILE} profile" +fi + +if [[ "$HAS_GRAFANA" == "true" ]]; then + if curl_ok http://localhost:3000/api/health; then + GRAFANA_HEALTH=$(curl_json http://localhost:3000/api/health) + add_check "Grafana HTTP (3000)" "ok" "$(echo "$GRAFANA_HEALTH" | jq -r '.version // "ok"' 2>/dev/null || echo "ok")" + else + add_check "Grafana HTTP (3000)" "fail" "unreachable" + fi +else + add_check "Grafana" "warn" "not in ${COMPOSE_PROFILE} profile" +fi + +# ───────────────────────────────────────────────────────────── +output_header "10. 
DISK SPACE"
+
+if [[ "$COMPOSE_PROFILE" == "production" && -n "$DISK_BASE" ]]; then
+    for disk in 1 2 3 4 5; do
+        if [[ -d "${DISK_BASE}${disk}/data" ]]; then
+            DF_RESULT=$(df -h "${DISK_BASE}${disk}/data" 2>/dev/null || echo "unavailable")
+            if echo "$DF_RESULT" | grep -q "Filesystem"; then
+                USE_PCT=$(echo "$DF_RESULT" | tail -1 | awk '{print $5}' | tr -d '%')
+                AVAIL=$(echo "$DF_RESULT" | tail -1 | awk '{print $4}')
+                if [[ "$USE_PCT" =~ ^[0-9]+$ ]] && [[ "$USE_PCT" -ge 90 ]]; then
+                    add_check "Disk /disk${disk}/data" "warn" "${USE_PCT}% used (${AVAIL} avail)"
+                else
+                    add_check "Disk /disk${disk}/data" "ok" "${USE_PCT}% used (${AVAIL} avail)"
+                fi
+            else
+                add_check "Disk /disk${disk}/data" "warn" "not mounted"
+            fi
+        else
+            add_check "Disk /disk${disk}/data" "warn" "directory not found"
+        fi
+    done
+else
+    for d in data/master data/volumes data/filer; do
+        if [[ -d "${PROJECT_DIR}/${d}" ]]; then
+            # Strip the trailing '%' so the numeric threshold below can fire
+            USE_PCT=$(df "${PROJECT_DIR}/${d}" 2>/dev/null | tail -1 | awk '{print $5}' | tr -d '%' || echo "?")
+            if [[ "$USE_PCT" =~ ^[0-9]+$ ]] && [[ "$USE_PCT" -ge 90 ]]; then
+                add_check "Dir ${d}" "warn" "${USE_PCT}% (high)"
+            else
+                add_check "Dir ${d}" "ok" "${USE_PCT}%"
+            fi
+        else
+            add_check "Dir ${d}" "warn" "not found"
+        fi
+    done
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "11. CROSS-SERVICE DEPENDENCIES"
+
+# Volume → master registration
+if [[ "$MASTER_JSON" != "{}" ]]; then
+    VOL_SERVERS_CHECK=$(echo "$MASTER_JSON" | jq '[.Volumes[]? // {} | .url // empty] | length' 2>/dev/null || echo "-1")
+    if [[ "$VOL_SERVERS_CHECK" -ne -1 ]]; then
+        for i in $(seq 1 $VOL_COUNT); do
+            if [[ "$COMPOSE_PROFILE" == "local" ]]; then
+                svc_name="sds-gateway-${ENV_TYPE}-sfs-volume"
+            else
+                svc_name="sds-gateway-${ENV_TYPE}-sfs-volume${i}"
+            fi
+            if [[ "$VOL_SERVERS_CHECK" -gt 0 ]]; then
+                add_check "${svc_name} → master" "ok" "registered"
+            else
+                add_check "${svc_name} → master" "warn" "not in master registry"
+            fi
+        done
+    else
+        # Fallback: master HTTP is up, assume connectivity
+        for i in $(seq 1 $VOL_COUNT); do
+            if [[ "$COMPOSE_PROFILE" == "local" ]]; then
+                svc_name="sds-gateway-${ENV_TYPE}-sfs-volume"
+            else
+                svc_name="sds-gateway-${ENV_TYPE}-sfs-volume${i}"
+            fi
+            add_check "${svc_name} → master" "ok" "master HTTP reachable"
+        done
+    fi
+fi
+
+# Filer → master connectivity
+if curl_ok "http://localhost:${SFS_FILER_PORT:-8888}/"; then
+    add_check "Filer → master" "ok" "filer responding"
+else
+    add_check "Filer → master" "fail" "filer unreachable"
+fi
+
+# S3 → filer connectivity
+S3_FILER=$(docker exec "sds-gateway-${ENV_TYPE}-sfs-s3" \
+    weed s3.filer 2>/dev/null || echo "unknown")
+if [[ "$S3_FILER" != "unknown" ]]; then
+    add_check "S3 → filer" "ok" "connected to ${S3_FILER}"
+else
+    add_check "S3 → filer" "warn" "can't verify connection"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "12. DOCKER CLEANUP"
+
+RUNNING_COUNT=$(${DOCKER_COMPOSE} ps --format '{{.Service}}' 2>/dev/null | wc -l || echo "0")
+add_check "Running services" "ok" "${RUNNING_COUNT}"
+
+NETWORK_NAME="sds-gateway-${ENV_TYPE}-seaweed-net"
+ORPHANS=$(${DOCKER_COMPOSE} ps --format '{{.Name}}' 2>/dev/null || echo "")
+ORPHAN_LIST=$(docker ps -q --filter "network=${NETWORK_NAME}" 2>/dev/null | while read -r cid; do
+    cname=$(docker inspect --format '{{.Name}}' "$cid" 2>/dev/null | sed 's|^/||')
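+    # A container attached to the stack network but not listed as a compose
+    # service is treated as an orphan.
+    if ! 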
echo "$ORPHANS" | grep -qw "$cname"; then
+        echo "$cname"
+    fi
+done || true)
+
+if [[ -n "$ORPHAN_LIST" ]]; then
+    add_check "Orphaned containers" "warn" "$ORPHAN_LIST"
+else
+    add_check "Orphaned containers" "ok" "none"
+fi
+
+# ─────────────────────────────────────────────────────────────
+output_header "SUMMARY"
+
+if [[ "$OUTPUT_MODE" == "human" ]]; then
+    printf "  Checks: %d | ✓ %d OK | ⚠ %d WARN | ✗ %d FAIL\n" "$TOTAL" "$OK" "$WARN" "$FAIL"
+fi
+
+# ── JSON output ───────────────────────────────────────────────
+if [[ "$OUTPUT_MODE" == "json" ]]; then
+    jq -n \
+        --argjson checks "$JSON_CHECKS" \
+        --arg total "$TOTAL" \
+        --arg ok "$OK" \
+        --arg warn "$WARN" \
+        --arg fail "$FAIL" \
+        --arg env "$ENV_TYPE" \
+        --arg profile "$COMPOSE_PROFILE" \
+        --arg compose_file "$COMPOSE_FILE" \
+        '{
+            env: $env,
+            profile: $profile,
+            compose_file: $compose_file,
+            total: ($total | tonumber),
+            ok: ($ok | tonumber),
+            warn: ($warn | tonumber),
+            fail: ($fail | tonumber),
+            status: (if ($fail | tonumber) > 0 then "failed" elif ($warn | tonumber) > 0 then "warning" else "ok" end),
+            checks: $checks
+        }'
+fi
+
+# ── EXIT ──────────────────────────────────────────────────────
+# Log only in human mode so --json/--silent output stays machine-readable
+if [[ "$FAIL" -gt 0 ]]; then
+    [[ "$OUTPUT_MODE" == "human" ]] && log_error "HEALTH CHECK FAILED"
+    exit 1
+elif [[ "$WARN" -gt 0 ]]; then
+    [[ "$OUTPUT_MODE" == "human" ]] && log_msg "HEALTH CHECK PASSED WITH WARNINGS"
+    exit 0
+else
+    [[ "$OUTPUT_MODE" == "human" ]] && log_success "ALL HEALTH CHECKS PASSED"
+    exit 0
+fi
diff --git a/seaweedfs/scripts/prod-hostnames.example.env b/seaweedfs/scripts/prod-hostnames.example.env
new file mode 100644
index 000000000..7f0613204
--- /dev/null
+++ b/seaweedfs/scripts/prod-hostnames.example.env
@@ -0,0 +1,9 @@
+# Production hostnames — one per line.
+# The deploy script checks the current hostname against this list when deploying
+# to production, preventing accidental deploys on non-production machines.
+#
+# Add the hostname of each production server below, one per line.
+# Get the hostname with: hostname
+#
+# example-prod-host-01
+# example-prod-host-02
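+#
+# To allowlist the current machine, append its hostname (deploy.sh prints the
+# same hint when this check fails):
+#   echo "$(hostname)" >> seaweedfs/scripts/prod-hostnames.env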