diff --git a/.gitignore b/.gitignore index 1c0bd60e4..d4afd0427 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ secrets_cache/ terraform.tfstate terraform.tfstate.backup +*.auto.tfvars.json kubeconfig.new .vscode/* diff --git a/ansible/inventory/demo/host.yml b/ansible/inventory/demo/host.yml index b7e2493a2..1ef28e0a5 100644 --- a/ansible/inventory/demo/host.yml +++ b/ansible/inventory/demo/host.yml @@ -39,6 +39,9 @@ wiab: pod_network_cidr: "10.233.0.0/16" minikube_node_subnet: "192.168.99.0/24" + # will dump logs on failure when deploying helm charts + dump_logs_on_failure: true + # will use certmanager for certs use_cert_manager: true # networking iptables dnat rules http_dnat_rules: diff --git a/ansible/wiab-demo/helm_install.yml b/ansible/wiab-demo/helm_install.yml index 669598ea1..f4f0b7b98 100644 --- a/ansible/wiab-demo/helm_install.yml +++ b/ansible/wiab-demo/helm_install.yml @@ -117,6 +117,13 @@ - name: Deploy core Wire service Helm charts block: + - name: Reset core Helm deployment status + set_fact: + helm_deploy_failed: false + helm_deploy_failure_task: '' + helm_deploy_failure_message: '' + deployment_messages: [] + - name: Display charts that will be deployed debug: msg: "Following charts will be deployed: {{ charts_to_deploy | join(', ') }}" @@ -165,32 +172,46 @@ loop: "{{ charts_to_deploy }}" register: helm_deploy_result - - name: Report deployment status for all core charts - block: + rescue: + - name: Store core Helm deployment failure details + set_fact: + helm_deploy_failed: true + helm_deploy_failure_task: "{{ ansible_failed_task.name | default('Deploy core Wire charts using their available configuration files') }}" + helm_deploy_failure_message: "{{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('Unknown error during Helm chart deployment')) }}" + + always: - name: Build deployment status list set_fact: - deployment_messages: "{{ deployment_messages | default([]) + [item.item + ': ' + ('Deployed' if 
item.changed else 'Already up-to-date')] }}" - loop: "{{ helm_deploy_result.results }}" - when: helm_deploy_result.results is defined + deployment_messages: "{{ deployment_messages + [item.item + ': ' + ('Failed' if item.failed | default(false) else ('Deployed' if item.changed | default(false) else 'Already up-to-date'))] }}" + loop: "{{ helm_deploy_result.results | default([]) }}" no_log: true - name: Display chart deployment status debug: msg: "{{ ['Chart deployment status:'] + (deployment_messages | map('regex_replace', '^', '- ') | list) }}" - when: helm_deploy_result.results is defined + when: deployment_messages | length > 0 - - name: Retrieve running pods from default namespace - kubernetes.core.k8s_info: - kind: Pod - namespace: default - kubeconfig: "{{ kube_config }}" - register: pods_info + - name: Display core Helm deployment failure details + debug: + msg: + - "Core Wire chart deployment failed." + - "Failed task: {{ helm_deploy_failure_task }}" + - "Error: {{ helm_deploy_failure_message }}" + when: helm_deploy_failed | default(false) + + - name: Retrieve running pods from default namespace + kubernetes.core.k8s_info: + kind: Pod + namespace: default + kubeconfig: "{{ kube_config }}" + register: pods_info - - name: Display running pods sorted by creation time - block: - name: Count running pods set_fact: - running_pods_count: "{{ pods_info.resources | length }}" + total_pods_count: "{{ pods_info.resources | length }}" + all_pods: "{{ pods_info.resources | map(attribute='metadata.name') | list }}" + failing_pod_resources: "{{ pods_info.resources | rejectattr('status.phase', 'equalto', 'Running') | rejectattr('status.phase', 'equalto', 'Succeeded') | list }}" + failing_pods: "{{ pods_info.resources | rejectattr('status.phase', 'equalto', 'Running') | rejectattr('status.phase', 'equalto', 'Succeeded') | map(attribute='metadata.name') | list }}" running_pods: "{{ pods_info.resources | selectattr('status.phase', 'equalto', 'Running') | 
map(attribute='metadata.name') | list }}" succeeded_pods: "{{ pods_info.resources | selectattr('status.phase', 'equalto', 'Succeeded') | map(attribute='metadata.name') | list }}" pending_pods: "{{ pods_info.resources | selectattr('status.phase', 'equalto', 'Pending') | map(attribute='metadata.name') | list }}" @@ -198,7 +219,7 @@ - name: Display pods summary debug: msg: - - "Total running pods: {{ running_pods_count }}" + - "Total pods: {{ total_pods_count }}" - "" - "Running ({{ running_pods | length }}):" - "{{ running_pods | map('regex_replace', '^', ' - ') | list }}" @@ -209,6 +230,59 @@ - "Pending ({{ pending_pods | length }}):" - "{{ pending_pods | map('regex_replace', '^', ' - ') | list }}" + - name: Display failing pod details + debug: + msg: | + Failing pod details: + {{ failing_pod_resources | to_nice_yaml }} + when: + - helm_deploy_failed | default(false) + - dump_logs_on_failure | default(false) + - failing_pod_resources | length > 0 + + - name: Note when no failing pods were found + debug: + msg: "No failing pods found in pod inventory; skipping pod detail dump." 
+ when: + - helm_deploy_failed | default(false) + - dump_logs_on_failure | default(false) + - failing_pod_resources | length == 0 + + - name: Collect logs from all pods + kubernetes.core.k8s_log: + name: "{{ item }}" + namespace: default + kubeconfig: "{{ kube_config }}" + tail_lines: 30 + all_containers: true + failed_when: false + register: pod_logs + loop: "{{ all_pods }}" + when: + - helm_deploy_failed | default(false) + - dump_logs_on_failure | default(false) + + - name: Display pod logs + debug: + msg: | + Pod logs for {{ item.item }}: + {% if item.failed | default(false) %} + Failed to collect logs: {{ item.msg | default('Unknown error') }} + {% else %} + {{ item.log | default(item.content | default('No logs returned')) }} + {% endif %} + loop: "{{ pod_logs.results | default([]) }}" + loop_control: + label: "{{ item.item }}" + when: + - helm_deploy_failed | default(false) + - dump_logs_on_failure | default(false) + + - name: Stop play after core Helm deployment failure + fail: + msg: "Core Wire chart deployment failed in task '{{ helm_deploy_failure_task }}': {{ helm_deploy_failure_message }}" + when: helm_deploy_failed | default(false) + - name: Deploy nginx-ingress-services with TLS configuration block: diff --git a/ansible/wiab-demo/wire_secrets.yml b/ansible/wiab-demo/wire_secrets.yml index 4afd87b8f..4d15fe9fd 100644 --- a/ansible/wiab-demo/wire_secrets.yml +++ b/ansible/wiab-demo/wire_secrets.yml @@ -369,6 +369,120 @@ when: "'postgresql' in charts_to_deploy" + - name: Manage MLS private keys for galley + block: + - name: Create temporary directory for MLS key files + tempfile: + state: directory + suffix: _mls_keys + register: mls_temp_dir + changed_when: false + + - name: Generate MLS private keys using openssl + shell: >- + openssl genpkey {{ item.command }} -out '{{ mls_temp_dir.path }}/{{ item.filename }}' 2>/dev/null + args: + executable: /bin/bash + changed_when: false + no_log: true + loop: + - name: mls_ed25519_key + filename: ed25519.pem + 
command: "-algorithm ed25519" + pem_bytes: 119 + der_bytes: 48 + - name: mls_ecdsa_p256_key + filename: ecdsa_p256.pem + command: "-algorithm ec -pkeyopt ec_paramgen_curve:P-256" + pem_bytes: 241 + der_bytes: 121 + - name: mls_ecdsa_p384_key + filename: ecdsa_p384.pem + command: "-algorithm ec -pkeyopt ec_paramgen_curve:P-384" + pem_bytes: 306 + der_bytes: 167 + - name: mls_ecdsa_p521_key + filename: ecdsa_p521.pem + command: "-algorithm ec -pkeyopt ec_paramgen_curve:P-521" + pem_bytes: 384 + der_bytes: 223 + + - name: Read generated MLS private key files + slurp: + src: "{{ mls_temp_dir.path }}/{{ item.filename }}" + register: mls_key_files + no_log: true + loop: + - name: mls_ed25519_key + filename: ed25519.pem + pem_bytes: 119 + der_bytes: 48 + - name: mls_ecdsa_p256_key + filename: ecdsa_p256.pem + pem_bytes: 241 + der_bytes: 121 + - name: mls_ecdsa_p384_key + filename: ecdsa_p384.pem + pem_bytes: 306 + der_bytes: 167 + - name: mls_ecdsa_p521_key + filename: ecdsa_p521.pem + pem_bytes: 384 + der_bytes: 223 + + - name: Set MLS private keys as facts + set_fact: + "{{ item.item.name }}": "{{ item.content | b64decode }}" + loop: "{{ mls_key_files.results }}" + no_log: true + + - name: Validate exact MLS private key PEM sizes + assert: + that: + - item.content | b64decode | length == item.item.pem_bytes + fail_msg: "MLS private key PEM size mismatch for {{ item.item.name }}" + quiet: yes + loop: "{{ mls_key_files.results }}" + no_log: true + + - name: Validate exact MLS private key DER sizes + shell: >- + openssl pkey -in '{{ mls_temp_dir.path }}/{{ item.filename }}' -outform DER 2>/dev/null | wc -c + args: + executable: /bin/bash + register: mls_key_der_sizes + changed_when: false + no_log: true + loop: + - name: mls_ed25519_key + filename: ed25519.pem + der_bytes: 48 + - name: mls_ecdsa_p256_key + filename: ecdsa_p256.pem + der_bytes: 121 + - name: mls_ecdsa_p384_key + filename: ecdsa_p384.pem + der_bytes: 167 + - name: mls_ecdsa_p521_key + filename: 
ecdsa_p521.pem + der_bytes: 223 + + - name: Assert exact MLS private key DER sizes + assert: + that: + - item.stdout | int == item.item.der_bytes + fail_msg: "MLS private key DER size mismatch for {{ item.item.name }}" + quiet: yes + loop: "{{ mls_key_der_sizes.results }}" + + always: + - name: Cleanup MLS temporary directory + file: + path: "{{ mls_temp_dir.path }}" + state: absent + changed_when: false + when: mls_temp_dir.path is defined + - name: Configure wire-server service secrets (brig, nginz, cargohold, galley) block: - name: Check if wire-server secrets file exists @@ -405,6 +519,14 @@ secrets: awsKeyId: "{{ minio_access_key }}" awsSecretKey: "{{ minio_secret_key }}" + galley: + secrets: + mlsPrivateKeys: + removal: + ed25519: "{{ mls_ed25519_key }}" + ecdsa_secp256r1_sha256: "{{ mls_ecdsa_p256_key }}" + ecdsa_secp384r1_sha384: "{{ mls_ecdsa_p384_key }}" + ecdsa_secp521r1_sha512: "{{ mls_ecdsa_p521_key }}" no_log: true - name: Add pgPassword to update dictionary diff --git a/bin/helm-operations.sh b/bin/helm-operations.sh index 46d4a75a8..164aac3ac 100755 --- a/bin/helm-operations.sh +++ b/bin/helm-operations.sh @@ -78,7 +78,7 @@ process_values() { ENV=$1 TYPE=$2 - charts=(fake-aws demo-smtp rabbitmq databases-ephemeral reaper wire-server webapp account-pages team-settings ingress-nginx-controller) + charts=(fake-aws smtp rabbitmq databases-ephemeral reaper wire-server webapp account-pages team-settings ingress-nginx-controller) if [[ "$DEPLOY_CERT_MANAGER" == "TRUE" ]]; then charts+=(nginx-ingress-services cert-manager) diff --git a/changelog.d/2-wire-builds/release-5.25 b/changelog.d/2-wire-builds/release-5.25 new file mode 100644 index 000000000..fb05a2866 --- /dev/null +++ b/changelog.d/2-wire-builds/release-5.25 @@ -0,0 +1,2 @@ +Fixed: update reference for 5.25 to 5.25.21 without any pinned component +Added: logging in case of helm chart failure diff --git a/changelog.d/2-wire-builds/smpt-values-fix b/changelog.d/2-wire-builds/smpt-values-fix new 
file mode 100644 index 000000000..5c55e4a80 --- /dev/null +++ b/changelog.d/2-wire-builds/smpt-values-fix @@ -0,0 +1 @@ +Fixed: values for smtp helm chart diff --git a/changelog.d/3-deploy-builds/heztner-deployment b/changelog.d/3-deploy-builds/heztner-deployment new file mode 100644 index 000000000..7e5d50bcb --- /dev/null +++ b/changelog.d/3-deploy-builds/heztner-deployment @@ -0,0 +1 @@ +Fixed: Refactored terraform logic for CD purposes for all solutions wiab-dev(demo), wiab-staging and default (equivalent). All logic to pick up the region and server type remains in the respective scripts, there will be an iteration over regions first, terraform would just validate the regions and server types diff --git a/changelog.d/3-deploy-builds/wiab-dev-mls-kys b/changelog.d/3-deploy-builds/wiab-dev-mls-kys new file mode 100644 index 000000000..2ee9129ce --- /dev/null +++ b/changelog.d/3-deploy-builds/wiab-dev-mls-kys @@ -0,0 +1,2 @@ +Fixed: smtp helm chart values +Fixed: issue due to requirement of mls keys for webapp for wiab-dev when MLS is not required diff --git a/offline/cd.sh b/offline/cd.sh index 8d879f504..069566d94 100755 --- a/offline/cd.sh +++ b/offline/cd.sh @@ -5,13 +5,19 @@ set -euo pipefail CD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TF_DIR="${CD_DIR}/../terraform/examples/wire-server-deploy-offline-hetzner" VALUES_DIR="${CD_DIR}/../values" +TF_VARS_FILE="${TF_DIR}/retry-selection.auto.tfvars.json" COMMIT_HASH="${GITHUB_SHA}" ARTIFACT="wire-server-deploy-static-${COMMIT_HASH}" +# Retry matrix +LOCATIONS=("hel1" "fsn1" "nbg1") +SMALL_SERVER_TYPES=("cx23" "cx33" "cpx22") +MEDIUM_SERVER_TYPES=("cx33" "cx43" "cpx32") + # Retry configuration -MAX_RETRIES=3 RETRY_DELAY=30 +APPLY_TIMEOUT_SECONDS=300 echo "Wire Offline Deployment with Retry Logic" echo "========================================" @@ -22,79 +28,100 @@ function cleanup { } trap cleanup EXIT +function persist_terraform_vars { + local location="$1" + local small_server_type="$2" + local 
medium_server_type="$3" + + printf '{\n "location": "%s",\n "small_server_type": "%s",\n "medium_server_type": "%s"\n}\n' \ + "$location" \ + "$small_server_type" \ + "$medium_server_type" > "$TF_VARS_FILE" +} + cd "$TF_DIR" terraform init +if ! command -v timeout >/dev/null 2>&1; then + echo "The 'timeout' command is required but not installed" + exit 1 +fi + +if [[ ${#SMALL_SERVER_TYPES[@]} -ne ${#MEDIUM_SERVER_TYPES[@]} ]]; then + echo "Small and medium server type retry lists must have the same length" + exit 1 +fi + +if [[ ${#LOCATIONS[@]} -eq 0 || ${#SMALL_SERVER_TYPES[@]} -eq 0 ]]; then + echo "No location or server type preferences configured in the retry matrix" + exit 1 +fi + +location_count=${#LOCATIONS[@]} +server_type_count=${#SMALL_SERVER_TYPES[@]} +MAX_RETRIES=$((location_count * server_type_count)) + # Retry loop for terraform apply echo "Starting deployment with automatic retry on resource unavailability..." -for attempt in $(seq 1 $MAX_RETRIES); do - echo "" - echo "Deployment attempt $attempt of $MAX_RETRIES" - date +echo "Retry plan: ${location_count} locations x ${server_type_count} server type pairs = ${MAX_RETRIES} attempts" + +attempt=1 +deployment_succeeded=false - if terraform apply -auto-approve; then - echo "Infrastructure deployment successful on attempt $attempt!" 
- break - else - echo "Infrastructure deployment failed on attempt $attempt" +for server_type_index in $(seq 0 $((server_type_count - 1))); do + for location_index in $(seq 0 $((location_count - 1))); do + attempt_location="${LOCATIONS[$location_index]}" + attempt_small_server_type="${SMALL_SERVER_TYPES[$server_type_index]}" + attempt_medium_server_type="${MEDIUM_SERVER_TYPES[$server_type_index]}" + + persist_terraform_vars "$attempt_location" "$attempt_small_server_type" "$attempt_medium_server_type" + + echo "" + echo "Deployment attempt $attempt of $MAX_RETRIES" + echo " -> location=${attempt_location}, small=${attempt_small_server_type}, medium=${attempt_medium_server_type}" + date + + if timeout "${APPLY_TIMEOUT_SECONDS}s" terraform apply -auto-approve; then + echo "Infrastructure deployment successful on attempt $attempt!" + deployment_succeeded=true + break 2 + else + apply_exit_code=$? # capture in the else branch; after 'fi' the status is always 0 + fi + + if [[ $apply_exit_code -eq 124 ]]; then + echo "Infrastructure deployment timed out after ${APPLY_TIMEOUT_SECONDS}s on attempt $attempt" + else + echo "Infrastructure deployment failed on attempt $attempt" + fi if [[ $attempt -lt $MAX_RETRIES ]]; then - echo "Will retry with different configuration..." + echo "Will retry with the next location and server type combination..." - # Clean up partial deployment echo "Cleaning up partial deployment..." terraform destroy -auto-approve || true - # Wait for resources to potentially become available echo "Waiting ${RETRY_DELAY}s for resources to become available..." sleep $RETRY_DELAY - - # Modify configuration for better availability - echo "Adjusting server type preferences for attempt $((attempt + 1))..." 
- case $attempt in - 1) - # Attempt 2: Prioritize cx22 and cx41 - sed -i.bak 's/"cx23", "cx33", "cpx22"/"cx33", "cpx22", "cx23"/' main.tf - sed -i.bak 's/"cx33", "cx43", "cpx32"/"cx43", "cpx32", "cx33"/' main.tf - echo " -> Prioritizing cx33 and cx43 server types" - ;; - 2) - # Attempt 3: Use smallest available types - sed -i.bak 's/"cx33", "cpx22", "cx23"/"cpx22", "cx23", "cx33"/' main.tf - sed -i.bak 's/"cx43", "cpx32", "cx33"/"cpx32", "cx33", "cx43"/' main.tf - echo " -> Using smallest available server types" - ;; - esac - - terraform init -reconfigure - else - echo "All deployment attempts failed after $MAX_RETRIES tries" - echo "" - echo "This usually means:" - echo " 1. High demand for Hetzner Cloud resources in EU regions" - echo " 2. Your account may have resource limits" - echo " 3. Try again later when resources become available" - echo "" - echo "Manual solutions:" - echo " 1. Check Hetzner Console for resource limits" - echo " 2. Try different server types manually" - echo " 3. Contact Hetzner support for resource availability" - - # Restore original config - if [[ -f main.tf.bak ]]; then - mv main.tf.bak main.tf - terraform init -reconfigure - fi - - exit 1 fi - fi + + attempt=$((attempt + 1)) + done done -# Restore original config after successful deployment -if [[ -f main.tf.bak ]]; then - mv main.tf.bak main.tf - terraform init -reconfigure +if [[ "$deployment_succeeded" != true ]]; then + echo "All deployment attempts failed after $MAX_RETRIES tries" + echo "" + echo "This usually means:" + echo " 1. High demand for Hetzner Cloud resources in EU regions" + echo " 2. Your account may have resource limits" + echo " 3. Try again later when resources become available" + echo "" + echo "Manual solutions:" + echo " 1. Check Hetzner Console for resource limits" + echo " 2. Try different server types manually" + echo " 3. 
Contact Hetzner support for resource availability" + exit 1 fi echo "" @@ -134,4 +161,3 @@ ssh $SSH_OPTS -A "root@$adminhost" ./bin/offline-deploy.sh echo "" echo "Wire offline deployment completed successfully!" -cleanup diff --git a/offline/cd_demo.sh b/offline/cd_demo.sh index d06b52256..d9b0ed932 100755 --- a/offline/cd_demo.sh +++ b/offline/cd_demo.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -euxo pipefail +set -euo pipefail CD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TF_DIR="${CD_DIR}/../terraform/examples/wiab-demo-hetzner" @@ -11,9 +11,19 @@ ARTIFACTS_DIR="${CD_DIR}/demo-build/output" ANSIBLE_DIR="${CD_DIR}/../ansible" INVENTORY_DIR="${ANSIBLE_DIR}/inventory/demo" INVENTORY_FILE="${INVENTORY_DIR}/host.yml" +TF_VARS_FILE="${TF_DIR}/retry-selection.auto.tfvars.json" TEST_USER="demo" COMMIT_HASH="${GITHUB_SHA}" +# Retry matrix +LOCATIONS=("hel1" "fsn1" "nbg1") +SERVER_TYPES=("cx53" "cpx62") + +# Retry configuration +RETRY_DELAY=30 +APPLY_TIMEOUT_SECONDS=300 + + function cleanup { (cd "$TF_DIR" && terraform destroy -auto-approve) echo "done" @@ -21,8 +31,80 @@ function cleanup { trap cleanup EXIT +function persist_terraform_vars { + local location="$1" + local server_type="$2" + + printf '{\n "location": "%s",\n "server_type": "%s"\n}\n' \ + "$location" \ + "$server_type" > "$TF_VARS_FILE" +} + cd "$TF_DIR" -terraform init && terraform apply -auto-approve +terraform init + +if ! 
command -v timeout >/dev/null 2>&1; then + echo "The 'timeout' command is required but not installed" + exit 1 +fi + +if [[ ${#LOCATIONS[@]} -eq 0 || ${#SERVER_TYPES[@]} -eq 0 ]]; then + echo "No location or server type preferences configured in the retry matrix" + exit 1 +fi + +location_count=${#LOCATIONS[@]} +server_type_count=${#SERVER_TYPES[@]} +MAX_RETRIES=$((location_count * server_type_count)) + +echo "Retry plan: ${location_count} locations x ${server_type_count} server types = ${MAX_RETRIES} attempts" + +attempt=1 +deployment_succeeded=false + +for server_type_index in $(seq 0 $((server_type_count - 1))); do + for location_index in $(seq 0 $((location_count - 1))); do + attempt_location="${LOCATIONS[$location_index]}" + attempt_server_type="${SERVER_TYPES[$server_type_index]}" + + persist_terraform_vars "$attempt_location" "$attempt_server_type" + + echo "Deployment attempt $attempt of $MAX_RETRIES" + echo " -> location=${attempt_location}, size=${attempt_server_type}" + date + + if timeout "${APPLY_TIMEOUT_SECONDS}s" terraform apply -auto-approve; then + deployment_succeeded=true + break 2 + else + apply_exit_code=$? # capture in the else branch; after 'fi' the status is always 0 + fi + + if [[ $apply_exit_code -eq 124 ]]; then + echo "Infrastructure deployment timed out after ${APPLY_TIMEOUT_SECONDS}s on attempt $attempt" + else + echo "Infrastructure deployment failed on attempt $attempt" + fi + + if [[ $attempt -lt $MAX_RETRIES ]]; then + echo "Cleaning up partial deployment..." + terraform destroy -auto-approve || true + + echo "Waiting ${RETRY_DELAY}s for resources to become available..." + sleep $RETRY_DELAY + fi + + attempt=$((attempt + 1)) + done +done + +if [[ "$deployment_succeeded" != true ]]; then + echo "All deployment attempts failed after $MAX_RETRIES tries" + exit 1 +fi + +echo "" +echo "Infrastructure ready! Proceeding with application deployment..." 
host=$(terraform output -raw host) ssh_private_key=$(terraform output ssh_private_key) @@ -56,5 +138,3 @@ echo "Running ansible playbook deploy_wiab.yml against node $host" ansible-playbook -i "${INVENTORY_FILE}" "${ANSIBLE_DIR}/wiab-demo/deploy_wiab.yml" --skip-tags verify_dns # cleaning demo-wiab ansible-playbook -i "${INVENTORY_FILE}" "${ANSIBLE_DIR}/wiab-demo/clean_cluster.yml" --tags remove_minikube,remove_artifacts,remove_packages,remove_iptables,remove_ssh - -cleanup diff --git a/offline/cd_staging.sh b/offline/cd_staging.sh index 6cc218994..cd4d2ccbc 100755 --- a/offline/cd_staging.sh +++ b/offline/cd_staging.sh @@ -5,12 +5,18 @@ set -euo pipefail CD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TF_DIR="${CD_DIR}/../terraform/examples/wiab-staging-hetzner" VALUES_DIR="${CD_DIR}/../values" +TF_VARS_FILE="${TF_DIR}/retry-selection.auto.tfvars.json" COMMIT_HASH="${GITHUB_SHA}" ARTIFACT="wire-server-deploy-static-${COMMIT_HASH}" +# Retry matrix +LOCATIONS=("hel1" "nbg1" "fsn1") +SMALL_SERVER_TYPES=("cpx22" "cpx32" "cpx42") +MEDIUM_SERVER_TYPES=("cpx42" "cpx52" "cpx62") + # Retry configuration -MAX_RETRIES=3 RETRY_DELAY=30 +APPLY_TIMEOUT_SECONDS=300 echo "Wire Offline Deployment with Retry Logic" echo "========================================" @@ -21,79 +27,101 @@ function cleanup { } trap cleanup EXIT +function persist_terraform_vars { + local location="$1" + local small_server_type="$2" + local medium_server_type="$3" + + printf '{\n "location": "%s",\n "small_server_type": "%s",\n "medium_server_type": "%s"\n}\n' \ + "$location" \ + "$small_server_type" \ + "$medium_server_type" > "$TF_VARS_FILE" +} + cd "$TF_DIR" terraform init +if ! 
command -v timeout >/dev/null 2>&1; then + echo "The 'timeout' command is required but not installed" + exit 1 +fi + +if [[ ${#SMALL_SERVER_TYPES[@]} -ne ${#MEDIUM_SERVER_TYPES[@]} ]]; then + echo "Small and medium server type retry lists must have the same length" + exit 1 +fi + +if [[ ${#LOCATIONS[@]} -eq 0 || ${#SMALL_SERVER_TYPES[@]} -eq 0 ]]; then + echo "No location or server type preferences configured in the retry matrix" + exit 1 +fi + +location_count=${#LOCATIONS[@]} +server_type_count=${#SMALL_SERVER_TYPES[@]} +MAX_RETRIES=$((location_count * server_type_count)) + +echo "Retry plan: ${location_count} locations x ${server_type_count} server type pairs = ${MAX_RETRIES} attempts" + # Retry loop for terraform apply echo "Starting deployment with automatic retry on resource unavailability..." -for attempt in $(seq 1 $MAX_RETRIES); do - echo "" - echo "Deployment attempt $attempt of $MAX_RETRIES" - date +attempt=1 +deployment_succeeded=false + +for server_type_index in $(seq 0 $((server_type_count - 1))); do + for location_index in $(seq 0 $((location_count - 1))); do + attempt_location="${LOCATIONS[$location_index]}" + attempt_small_server_type="${SMALL_SERVER_TYPES[$server_type_index]}" + attempt_medium_server_type="${MEDIUM_SERVER_TYPES[$server_type_index]}" - if terraform apply -auto-approve; then - echo "Infrastructure deployment successful on attempt $attempt!" - break - else - echo "Infrastructure deployment failed on attempt $attempt" + persist_terraform_vars "$attempt_location" "$attempt_small_server_type" "$attempt_medium_server_type" + + + echo "" + echo "Deployment attempt $attempt of $MAX_RETRIES" + echo " -> location=${attempt_location}, small=${attempt_small_server_type}, medium=${attempt_medium_server_type}" + date + + if timeout "${APPLY_TIMEOUT_SECONDS}s" terraform apply -auto-approve; then + echo "Infrastructure deployment successful on attempt $attempt!" + deployment_succeeded=true + break 2 + else + apply_exit_code=$? # capture in the else branch; after 'fi' the status is always 0 + fi 
+ + if [[ $apply_exit_code -eq 124 ]]; then + echo "Infrastructure deployment timed out after ${APPLY_TIMEOUT_SECONDS}s on attempt $attempt" + else + echo "Infrastructure deployment failed on attempt $attempt" + fi if [[ $attempt -lt $MAX_RETRIES ]]; then - echo "Will retry with different configuration..." + echo "Will retry with the next location and server type combination..." - # Clean up partial deployment echo "Cleaning up partial deployment..." terraform destroy -auto-approve || true - # Wait for resources to potentially become available echo "Waiting ${RETRY_DELAY}s for resources to become available..." sleep $RETRY_DELAY - - # Modify configuration for better availability - echo "Adjusting server type preferences for attempt $((attempt + 1))..." - case $attempt in - 1) - # Attempt 2: Prioritize cpx22 and cx53 - sed -i.bak 's/"cx33", "cpx22", "cx43"/"cpx22", "cx43", "cx33"/' main.tf - sed -i.bak 's/"cx43", "cx53", "cpx42"/"cx53", "cpx42", "cx43"/' main.tf - echo " -> Prioritizing cpx22 and cx53 server types" - ;; - 2) - # Attempt 3: Use biggest available types - sed -i.bak 's/"cpx22", "cx43", "cx33"/"cx43", "cx33", "cpx22"/' main.tf - sed -i.bak 's/"cx53", "cpx42", "cx43"/"cpx42", "cx43", "cx53"/' main.tf - echo " -> Using Biggest available server types" - ;; - esac - - terraform init -reconfigure - else - echo "All deployment attempts failed after $MAX_RETRIES tries" - echo "" - echo "This usually means:" - echo " 1. High demand for Hetzner Cloud resources in EU regions" - echo " 2. Your account may have resource limits" - echo " 3. Try again later when resources become available" - echo "" - echo "Manual solutions:" - echo " 1. Check Hetzner Console for resource limits" - echo " 2. Try different server types manually" - echo " 3. 
Contact Hetzner support for resource availability" - - # Restore original config - if [[ -f main.tf.bak ]]; then - mv main.tf.bak main.tf - terraform init -reconfigure - fi - - exit 1 fi - fi + + attempt=$((attempt + 1)) + done done -# Restore original config after successful deployment -if [[ -f main.tf.bak ]]; then - mv main.tf.bak main.tf - terraform init -reconfigure +if [[ "$deployment_succeeded" != true ]]; then + echo "All deployment attempts failed after $MAX_RETRIES tries" + echo "" + echo "This usually means:" + echo " 1. High demand for Hetzner Cloud resources in EU regions" + echo " 2. Your account may have resource limits" + echo " 3. Try again later when resources become available" + echo "" + echo "Manual solutions:" + echo " 1. Check Hetzner Console for resource limits" + echo " 2. Try different server types manually" + echo " 3. Contact Hetzner support for resource availability" + exit 1 fi echo "" @@ -208,4 +236,3 @@ ssh $SSH_OPTS -A "demo@$adminhost" ./bin/offline-deploy.sh echo "" echo "Wire offline deployment completed successfully!" 
-cleanup diff --git a/offline/default-build/build.sh b/offline/default-build/build.sh index 23e608dc6..a94855536 100755 --- a/offline/default-build/build.sh +++ b/offline/default-build/build.sh @@ -26,7 +26,7 @@ TASKS_DIR="${SCRIPT_DIR}/../tasks" # -------------------------- # pulling the charts based on builds.json, charts to be skipped are passed as arguments HELM_CHART_EXCLUDE_LIST -"${TASKS_DIR}"/proc_pull_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,postgresql" +"${TASKS_DIR}"/proc_pull_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,postgresql,rust-sft,fluent-bit" # pulling the charts from helm-charts repo, charts to be included are passed as arguments HELM_CHART_INCLUDE_LIST # "${TASKS_DIR}"/proc_pull_ext_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_INCLUDE_LIST="postgresql-external" diff --git a/offline/demo-build/build.sh b/offline/demo-build/build.sh index 9b00679e2..7aafee54f 100755 --- a/offline/demo-build/build.sh +++ b/offline/demo-build/build.sh @@ -20,7 +20,7 @@ TASKS_DIR="${SCRIPT_DIR}/../tasks" # Processing helm charts # -------------------------- -HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,k8ssandra-operator,k8ssandra-test-cluster,elasticsearch-curator,keycloakx,openebs,nginx-ingress-controller,kibana,restund,fluent-bit,aws-ingress,redis-cluster,calling-test,demo-smtp,cassandra-external,elasticsearch-external,minio-external,postgresql-external,rabbitmq-external" +HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,k8ssandra-operator,k8ssandra-test-cluster,elasticsearch-curator,keycloakx,openebs,nginx-ingress-controller,kibana,restund,fluent-bit,aws-ingress,redis-cluster,calling-test,demo-smtp,cassandra-external,elasticsearch-external,minio-external,postgresql-external,rabbitmq-external,rust-sft" # pulling the charts, charts to be skipped are passed as arguments HELM_CHART_EXCLUDE_LIST "${TASKS_DIR}"/proc_pull_charts.sh 
OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_EXCLUDE_LIST="${HELM_CHART_EXCLUDE_LIST}" diff --git a/offline/min-build/build.sh b/offline/min-build/build.sh index 8aff72848..00f0f2421 100755 --- a/offline/min-build/build.sh +++ b/offline/min-build/build.sh @@ -26,7 +26,7 @@ TASKS_DIR="${SCRIPT_DIR}/../tasks" # -------------------------- # pulling the charts, charts to be skipped are passed as arguments HELM_CHART_EXCLUDE_LIST -HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,k8ssandra-operator,k8ssandra-test-cluster,elasticsearch-ephemeral,elasticsearch-curator,rabbitmq,smtp,fake-aws,fake-aws-s3,postgresql,keycloakx,openebs,nginx-ingress-controller,kibana,restund,fluent-bit,aws-ingress,databases-ephemeral,redis-cluster,calling-test,cert-manager,kube-prometheus-stack,demo-smtp,wire-utility" +HELM_CHART_EXCLUDE_LIST="inbucket,wire-server-enterprise,k8ssandra-operator,k8ssandra-test-cluster,elasticsearch-ephemeral,elasticsearch-curator,rabbitmq,smtp,fake-aws,fake-aws-s3,postgresql,keycloakx,openebs,nginx-ingress-controller,kibana,restund,fluent-bit,aws-ingress,databases-ephemeral,redis-cluster,calling-test,cert-manager,kube-prometheus-stack,demo-smtp,wire-utility,rust-sft" "${TASKS_DIR}"/proc_pull_charts.sh OUTPUT_DIR="${OUTPUT_DIR}" HELM_CHART_EXCLUDE_LIST="${HELM_CHART_EXCLUDE_LIST}" diff --git a/offline/tasks/proc_pull_charts.sh b/offline/tasks/proc_pull_charts.sh index c069320c2..20b1e7fb1 100755 --- a/offline/tasks/proc_pull_charts.sh +++ b/offline/tasks/proc_pull_charts.sh @@ -82,17 +82,7 @@ pull_charts() { done echo "Pulling charts done." - # Patch bitnami repository references in pulled charts - # Remove the extraction and replacement when there will be no more bitnami charts - #echo "Patching bitnami repository references..." 
- #SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - #PATCH_SCRIPT="${SCRIPT_DIR}/patch-chart-images.sh" - #if [[ -f "$PATCH_SCRIPT" ]]; then - # "$PATCH_SCRIPT" "${OUTPUT_DIR}/charts" - #else - # echo "Warning: patch-chart-images.sh not found at $PATCH_SCRIPT, skipping chart patching" - #fi } -wire_build="https://raw.githubusercontent.com/wireapp/wire-builds/pinned-offline-multi-20260224-142104/build.json" +wire_build="https://raw.githubusercontent.com/wireapp/wire-builds/f941851bee7666441a23ba782766cd8d7de5043e/build.json" wire_build_chart_release "$wire_build" | pull_charts diff --git a/terraform/examples/wiab-demo-hetzner/main.tf b/terraform/examples/wiab-demo-hetzner/main.tf index 5b6b7e8ac..e667fecd1 100644 --- a/terraform/examples/wiab-demo-hetzner/main.tf +++ b/terraform/examples/wiab-demo-hetzner/main.tf @@ -1,25 +1,72 @@ locals { - # Server type preferences with fallbacks - preferred_server_types = { - size = ["cx53", "cpx62"] } } -# Get available server types in the specified location -data "hcloud_server_types" "available" { +variable "location" { + description = "Hetzner location selected by the deployment script" + type = string + default = "hel1" } -# Helper locals to select available server types +variable "server_type" { + description = "Server type selected by the deployment script" + type = string + default = "cx53" +} + +# Get available server types and locations +data "hcloud_server_types" "available" {} +data "hcloud_datacenters" "available" {} + locals { available_server_type_names = [for st in data.hcloud_server_types.available.server_types : st.name] + available_location_names = [for dc in data.hcloud_datacenters.available.datacenters : dc.location.name] +} + +resource "null_resource" "location_validation" { + count = contains(local.available_location_names, var.location) ? 
0 : 1 + + provisioner "local-exec" { + command = <<-EOT + echo "DEPLOYMENT FAILED: Requested location is unavailable" + echo "Requested location: ${var.location}" + echo "Available locations: ${join(", ", local.available_location_names)}" + echo "Please check Hetzner Cloud region availability" + exit 1 + EOT + } +} + +resource "null_resource" "server_type_validation" { + count = contains(local.available_server_type_names, var.server_type) ? 0 : 1 + + provisioner "local-exec" { + command = <<-EOT + echo "DEPLOYMENT FAILED: Requested server type is currently unavailable" + echo "Requested server type: ${var.server_type}" + echo "Available types: ${join(", ", local.available_server_type_names)}" + echo "Please check server type availability" + exit 1 + EOT + } +} + +resource "null_resource" "deployment_info" { + depends_on = [ + null_resource.location_validation, + null_resource.server_type_validation + ] - # Select the first available server type from the preference list - server_type = [ - for preferred in local.preferred_server_types.size : - preferred if contains(local.available_server_type_names, preferred) - ][0] + provisioner "local-exec" { + command = <<-EOT + echo "VALIDATION PASSED: Deploying WIAB dev infrastructure" + echo "Location: ${var.location}" + echo "Server type: ${var.server_type}" + EOT + } } resource "random_pet" "host" { + depends_on = [null_resource.deployment_info] } resource "tls_private_key" "host" { @@ -33,10 +80,10 @@ resource "hcloud_ssh_key" "host" { } resource "hcloud_server" "host" { - location = "fsn1" + location = var.location name = "host-${random_pet.host.id}" image = "ubuntu-24.04" ssh_keys = [hcloud_ssh_key.host.name] - server_type = local.server_type + server_type = var.server_type } diff --git a/terraform/examples/wiab-demo-hetzner/outputs.tf b/terraform/examples/wiab-demo-hetzner/outputs.tf index f2e2a63e6..de46ae866 100644 --- a/terraform/examples/wiab-demo-hetzner/outputs.tf +++ 
b/terraform/examples/wiab-demo-hetzner/outputs.tf @@ -1,16 +1,33 @@ output "ssh_private_key" { sensitive = true - value = tls_private_key.host.private_key_pem + value = tls_private_key.host.private_key_pem } output "selected_server_types" { - description = "Server types selected after checking availability" + description = "Server types selected for the current deployment attempt" value = { - server_type = local.server_type + server_type = var.server_type + } +} + +output "selected_location" { + description = "Location selected for the current deployment attempt" + value = var.location +} + +output "resource_fallback_info" { + description = "Information about the requested deployment combination and its availability" + value = { + requested_location = var.location + selected_location = var.location + requested_server_type = var.server_type + selected_server_type = var.server_type + available_locations = local.available_location_names + available_server_types = local.available_server_type_names } } output "host" { sensitive = true - value = hcloud_server.host.ipv4_address + value = hcloud_server.host.ipv4_address } diff --git a/terraform/examples/wiab-staging-hetzner/main.tf b/terraform/examples/wiab-staging-hetzner/main.tf index 890ddd7fa..5fb916526 100644 --- a/terraform/examples/wiab-staging-hetzner/main.tf +++ b/terraform/examples/wiab-staging-hetzner/main.tf @@ -1,57 +1,45 @@ locals { - rfc1918_cidr = "10.0.0.0/8" - kubenode_count = 3 - datanode_count = 3 - ssh_keys = [hcloud_ssh_key.adminhost.name] - - # Location preferences with fallbacks (EU only) - preferred_locations = ["fsn1", "hel1", "nbg1"] - - # Server type preferences with fallbacks (optimized for availability) - preferred_server_types = { - small = ["cx33", "cpx22", "cx43"] # For assethost and adminhost - medium = ["cx43", "cx53", "cpx42"] # For datanodes and k8s_nodes - } + rfc1918_cidr = "10.0.0.0/8" + kubenode_count = 3 + datanode_count = 3 + ssh_keys = [hcloud_ssh_key.adminhost.name] +} + 
+variable "location" { + description = "Hetzner location selected by the deployment script" + type = string + default = "hel1" +} + +variable "small_server_type" { + description = "Server type for assethost and adminhost selected by the deployment script" + type = string + default = "cx33" +} + +variable "medium_server_type" { + description = "Server type for datanodes and Kubernetes nodes selected by the deployment script" + type = string + default = "cx43" } # Get available server types and locations data "hcloud_server_types" "available" {} data "hcloud_datacenters" "available" {} -# Helper locals to select available resources with robust fallback logic +# Validate the exact combination requested by the deployment script. locals { available_server_type_names = [for st in data.hcloud_server_types.available.server_types : st.name] available_location_names = [for dc in data.hcloud_datacenters.available.datacenters : dc.location.name] - - # Select the first available location from the preference list - available_preferred_locations = [ - for preferred in local.preferred_locations : - preferred if contains(local.available_location_names, preferred) - ] - selected_location = length(local.available_preferred_locations) > 0 ? local.available_preferred_locations[0] : null - - # Select the first available server type from the preference list (with validation) - available_small_server_types = [ - for preferred in local.preferred_server_types.small : - preferred if contains(local.available_server_type_names, preferred) - ] - small_server_type = length(local.available_small_server_types) > 0 ? local.available_small_server_types[0] : null - - available_medium_server_types = [ - for preferred in local.preferred_server_types.medium : - preferred if contains(local.available_server_type_names, preferred) - ] - medium_server_type = length(local.available_medium_server_types) > 0 ? 
local.available_medium_server_types[0] : null } -# Validation checks - fail early with helpful error messages resource "null_resource" "location_validation" { - count = local.selected_location != null ? 0 : 1 + count = contains(local.available_location_names, var.location) ? 0 : 1 provisioner "local-exec" { command = <<-EOT - echo "DEPLOYMENT FAILED: No suitable location available" - echo "Requested locations: ${join(", ", local.preferred_locations)}" + echo "DEPLOYMENT FAILED: Requested location is unavailable" + echo "Requested location: ${var.location}" echo "Available locations: ${join(", ", local.available_location_names)}" echo "Please check Hetzner Cloud region availability" exit 1 @@ -60,28 +48,28 @@ resource "null_resource" "location_validation" { } resource "null_resource" "small_server_type_validation" { - count = local.small_server_type != null ? 0 : 1 + count = contains(local.available_server_type_names, var.small_server_type) ? 0 : 1 provisioner "local-exec" { command = <<-EOT - echo "DEPLOYMENT FAILED: No suitable database server types available" - echo "Requested types: ${join(", ", local.preferred_server_types.small)}" + echo "DEPLOYMENT FAILED: Requested small server type is currently unavailable" + echo "Requested small server type: ${var.small_server_type}" echo "Available types: ${join(", ", local.available_server_type_names)}" - echo "Please check server type availability in the selected region" + echo "Please check server type availability" exit 1 EOT } } resource "null_resource" "medium_server_type_validation" { - count = local.medium_server_type != null ? 0 : 1 + count = contains(local.available_server_type_names, var.medium_server_type) ? 
0 : 1 provisioner "local-exec" { command = <<-EOT - echo "DEPLOYMENT FAILED: No suitable Kubernetes server types available" - echo "Requested types: ${join(", ", local.preferred_server_types.medium)}" + echo "DEPLOYMENT FAILED: Requested medium server type is currently unavailable" + echo "Requested medium server type: ${var.medium_server_type}" echo "Available types: ${join(", ", local.available_server_type_names)}" - echo "Please check server type availability in the selected region" + echo "Please check server type availability" exit 1 EOT } @@ -96,10 +84,10 @@ resource "null_resource" "deployment_info" { provisioner "local-exec" { command = <<-EOT - echo "VALIDATION PASSED: Deploying Wire offline infrastructure" - echo "Location: ${local.selected_location}" - echo "Database server type: ${local.medium_server_type}" - echo "Kubernetes server type: ${local.medium_server_type}" + echo "VALIDATION PASSED: Deploying WIAB staging infrastructure" + echo "Location: ${var.location}" + echo "Small server type: ${var.small_server_type}" + echo "Medium server type: ${var.medium_server_type}" echo "Total instances: ${local.datanode_count + local.kubenode_count + 2}" EOT } @@ -141,11 +129,11 @@ resource "hcloud_server" "adminhost" { null_resource.deployment_info, hcloud_network_subnet.main ] - location = local.selected_location + location = var.location name = "adminhost-${random_pet.adminhost.id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.small_server_type + server_type = var.small_server_type network { network_id = hcloud_network.main.id ip = "" @@ -161,11 +149,11 @@ resource "hcloud_server" "assethost" { null_resource.deployment_info, hcloud_network_subnet.main ] - location = local.selected_location + location = var.location name = "assethost-${random_pet.assethost.id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.small_server_type + server_type = var.small_server_type public_net { ipv4_enabled = false ipv6_enabled = 
false @@ -186,11 +174,11 @@ resource "hcloud_server" "kubenode" { hcloud_network_subnet.main ] count = local.kubenode_count - location = local.selected_location + location = var.location name = "kubenode-${random_pet.kubenode[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.medium_server_type + server_type = var.medium_server_type public_net { ipv4_enabled = false ipv6_enabled = false @@ -211,11 +199,11 @@ resource "hcloud_server" "datanode" { hcloud_network_subnet.main ] count = local.datanode_count - location = local.selected_location + location = var.location name = "datanode-${random_pet.datanode[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.medium_server_type + server_type = var.medium_server_type public_net { ipv4_enabled = false ipv6_enabled = false diff --git a/terraform/examples/wiab-staging-hetzner/outputs.tf b/terraform/examples/wiab-staging-hetzner/outputs.tf index c758df645..8fc10ec4e 100644 --- a/terraform/examples/wiab-staging-hetzner/outputs.tf +++ b/terraform/examples/wiab-staging-hetzner/outputs.tf @@ -4,32 +4,32 @@ output "ssh_private_key" { } output "selected_server_types" { - description = "Server types selected after checking availability" + description = "Server types selected for the current deployment attempt" value = { - small_server_type = local.small_server_type - medium_server_type = local.medium_server_type + small_server_type = var.small_server_type + medium_server_type = var.medium_server_type } } output "selected_location" { - description = "Location selected after checking availability" - value = local.selected_location + description = "Location selected for the current deployment attempt" + value = var.location } output "resource_fallback_info" { - description = "Information about resource fallback selections" + description = "Information about the requested deployment combination and its availability" value = { - requested_locations = 
local.preferred_locations + requested_location = var.location available_locations = local.available_location_names - selected_location = local.selected_location + selected_location = var.location - requested_small_types = local.preferred_server_types.small - available_small_types = local.available_small_server_types - selected_small_type = local.small_server_type + requested_small_type = var.small_server_type + selected_small_type = var.small_server_type - requested_medium_types = local.preferred_server_types.medium - available_medium_types = local.available_medium_server_types - selected_medium_type = local.medium_server_type + requested_medium_type = var.medium_server_type + selected_medium_type = var.medium_server_type + + available_server_types = local.available_server_type_names } } @@ -86,8 +86,8 @@ output "static-inventory" { kube-node = { hosts = { for index, server in hcloud_server.kubenode : server.name => { - ansible_host = tolist(hcloud_server.kubenode[index].network)[0].ip - ip = tolist(hcloud_server.kubenode[index].network)[0].ip + ansible_host = tolist(hcloud_server.kubenode[index].network)[0].ip + ip = tolist(hcloud_server.kubenode[index].network)[0].ip } } # NOTE: Necessary for the Hetzner Cloud until Calico v3.17 arrives in Kubespray diff --git a/terraform/examples/wire-server-deploy-offline-hetzner/main.tf b/terraform/examples/wire-server-deploy-offline-hetzner/main.tf index ffb6e3a1b..657f6ed0c 100644 --- a/terraform/examples/wire-server-deploy-offline-hetzner/main.tf +++ b/terraform/examples/wire-server-deploy-offline-hetzner/main.tf @@ -7,55 +7,43 @@ locals { postgresql_count = 3 rabbitmq_count = 3 ssh_keys = [hcloud_ssh_key.adminhost.name] +} - # Location preferences with fallbacks (EU only) - preferred_locations = ["fsn1", "hel1", "nbg1"] +variable "location" { + description = "Hetzner location selected by the deployment script" + type = string + default = "hel1" +} - # Server type preferences with fallbacks (optimized for availability) - 
preferred_server_types = { - small = ["cx23", "cx33", "cpx22"] # For cassandra, elasticsearch, minio, postgresql, rabbitmq - medium = ["cx33", "cx43", "cpx32"] # For adminhost, assethost, kubenode - } +variable "small_server_type" { + description = "Server type for cassandra, elasticsearch, minio, postgresql, and rabbitmq selected by the deployment script" + type = string + default = "cx23" +} + +variable "medium_server_type" { + description = "Server type for adminhost, assethost, and kubenode selected by the deployment script" + type = string + default = "cx33" } # Get available server types and locations data "hcloud_server_types" "available" {} data "hcloud_datacenters" "available" {} -# Helper locals to select available resources with robust fallback logic +# Validate the exact combination requested by the deployment script. locals { available_server_type_names = [for st in data.hcloud_server_types.available.server_types : st.name] available_location_names = [for dc in data.hcloud_datacenters.available.datacenters : dc.location.name] - - # Select the first available location from the preference list - available_preferred_locations = [ - for preferred in local.preferred_locations : - preferred if contains(local.available_location_names, preferred) - ] - selected_location = length(local.available_preferred_locations) > 0 ? local.available_preferred_locations[0] : null - - # Select the first available server type from the preference list (with validation) - available_small_server_types = [ - for preferred in local.preferred_server_types.small : - preferred if contains(local.available_server_type_names, preferred) - ] - small_server_type = length(local.available_small_server_types) > 0 ? 
local.available_small_server_types[0] : null - - available_medium_server_types = [ - for preferred in local.preferred_server_types.medium : - preferred if contains(local.available_server_type_names, preferred) - ] - medium_server_type = length(local.available_medium_server_types) > 0 ? local.available_medium_server_types[0] : null } -# Validation checks - fail early with helpful error messages resource "null_resource" "location_validation" { - count = local.selected_location != null ? 0 : 1 + count = contains(local.available_location_names, var.location) ? 0 : 1 provisioner "local-exec" { command = <<-EOT - echo "DEPLOYMENT FAILED: No suitable location available" - echo "Requested locations: ${join(", ", local.preferred_locations)}" + echo "DEPLOYMENT FAILED: Requested location is unavailable" + echo "Requested location: ${var.location}" echo "Available locations: ${join(", ", local.available_location_names)}" echo "Please check Hetzner Cloud region availability" exit 1 @@ -64,28 +52,28 @@ resource "null_resource" "location_validation" { } resource "null_resource" "small_server_type_validation" { - count = local.small_server_type != null ? 0 : 1 + count = contains(local.available_server_type_names, var.small_server_type) ? 0 : 1 provisioner "local-exec" { command = <<-EOT echo "DEPLOYMENT FAILED: No suitable database server types available" - echo "Requested types: ${join(", ", local.preferred_server_types.small)}" + echo "Requested small server type: ${var.small_server_type}" echo "Available types: ${join(", ", local.available_server_type_names)}" - echo "Please check server type availability in the selected region" + echo "Please check server type availability" exit 1 EOT } } resource "null_resource" "medium_server_type_validation" { - count = local.medium_server_type != null ? 0 : 1 + count = contains(local.available_server_type_names, var.medium_server_type) ? 
0 : 1 provisioner "local-exec" { command = <<-EOT echo "DEPLOYMENT FAILED: No suitable Kubernetes server types available" - echo "Requested types: ${join(", ", local.preferred_server_types.medium)}" + echo "Requested medium server type: ${var.medium_server_type}" echo "Available types: ${join(", ", local.available_server_type_names)}" - echo "Please check server type availability in the selected region" + echo "Please check server type availability" exit 1 EOT } @@ -100,10 +88,10 @@ resource "null_resource" "deployment_info" { provisioner "local-exec" { command = <<-EOT - echo "VALIDATION PASSED: Deploying Wire offline infrastructure" - echo "Location: ${local.selected_location}" - echo "Database server type: ${local.small_server_type}" - echo "Kubernetes server type: ${local.medium_server_type}" + echo "VALIDATION PASSED: Deploying WSD default infrastructure" + echo "Location: ${var.location}" + echo "Database server type: ${var.small_server_type}" + echo "Kubernetes server type: ${var.medium_server_type}" echo "Total instances: ${local.cassandra_count + local.postgresql_count + local.elasticsearch_count + local.minio_count + local.kubenode_count + 2}" EOT } @@ -145,11 +133,11 @@ resource "hcloud_server" "adminhost" { null_resource.deployment_info, hcloud_network_subnet.main ] - location = local.selected_location + location = var.location name = "adminhost-${random_pet.adminhost.id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.medium_server_type + server_type = var.medium_server_type network { network_id = hcloud_network.main.id ip = "" @@ -165,11 +153,11 @@ resource "hcloud_server" "assethost" { null_resource.deployment_info, hcloud_network_subnet.main ] - location = local.selected_location + location = var.location name = "assethost-${random_pet.assethost.id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.medium_server_type + server_type = var.medium_server_type public_net { ipv4_enabled = false ipv6_enabled = 
false @@ -190,11 +178,11 @@ resource "hcloud_server" "kubenode" { hcloud_network_subnet.main ] count = local.kubenode_count - location = local.selected_location + location = var.location name = "kubenode-${random_pet.kubenode[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.medium_server_type + server_type = var.medium_server_type public_net { ipv4_enabled = false ipv6_enabled = false @@ -215,11 +203,11 @@ resource "hcloud_server" "cassandra" { hcloud_network_subnet.main ] count = local.cassandra_count - location = local.selected_location + location = var.location name = "cassandra-${random_pet.cassandra[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.small_server_type + server_type = var.small_server_type public_net { ipv4_enabled = false ipv6_enabled = false @@ -240,11 +228,11 @@ resource "hcloud_server" "elasticsearch" { hcloud_network_subnet.main ] count = local.elasticsearch_count - location = local.selected_location + location = var.location name = "elasticsearch-${random_pet.elasticsearch[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.small_server_type + server_type = var.small_server_type public_net { ipv4_enabled = false ipv6_enabled = false @@ -265,11 +253,11 @@ resource "hcloud_server" "minio" { hcloud_network_subnet.main ] count = local.minio_count - location = local.selected_location + location = var.location name = "minio-${random_pet.minio[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.small_server_type + server_type = var.small_server_type public_net { ipv4_enabled = false ipv6_enabled = false @@ -290,11 +278,11 @@ resource "hcloud_server" "postgresql" { hcloud_network_subnet.main ] count = local.postgresql_count - location = local.selected_location + location = var.location name = "postgresql-${random_pet.postgresql[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - 
server_type = local.small_server_type + server_type = var.small_server_type public_net { ipv4_enabled = false ipv6_enabled = false @@ -315,11 +303,11 @@ resource "hcloud_server" "rabbitmq" { hcloud_network_subnet.main ] count = local.rabbitmq_count - location = local.selected_location + location = var.location name = "rabbitmq-${random_pet.rabbitmq[count.index].id}" image = "ubuntu-22.04" ssh_keys = local.ssh_keys - server_type = local.small_server_type + server_type = var.small_server_type public_net { ipv4_enabled = false ipv6_enabled = false diff --git a/terraform/examples/wire-server-deploy-offline-hetzner/outputs.tf b/terraform/examples/wire-server-deploy-offline-hetzner/outputs.tf index 5bfebe101..e7cc5fd21 100644 --- a/terraform/examples/wire-server-deploy-offline-hetzner/outputs.tf +++ b/terraform/examples/wire-server-deploy-offline-hetzner/outputs.tf @@ -4,32 +4,32 @@ output "ssh_private_key" { } output "selected_server_types" { - description = "Server types selected after checking availability" + description = "Server types selected for the current deployment attempt" value = { - small_server_type = local.small_server_type - medium_server_type = local.medium_server_type + small_server_type = var.small_server_type + medium_server_type = var.medium_server_type } } output "selected_location" { - description = "Location selected after checking availability" - value = local.selected_location + description = "Location selected for the current deployment attempt" + value = var.location } output "resource_fallback_info" { - description = "Information about resource fallback selections" + description = "Information about the requested deployment combination and its availability" value = { - requested_locations = local.preferred_locations + requested_location = var.location available_locations = local.available_location_names - selected_location = local.selected_location + selected_location = var.location - requested_small_types = local.preferred_server_types.small 
- available_small_types = local.available_small_server_types - selected_small_type = local.small_server_type + requested_small_type = var.small_server_type + selected_small_type = var.small_server_type - requested_medium_types = local.preferred_server_types.medium - available_medium_types = local.available_medium_server_types - selected_medium_type = local.medium_server_type + requested_medium_type = var.medium_server_type + selected_medium_type = var.medium_server_type + + available_server_types = local.available_server_type_names } } @@ -43,9 +43,9 @@ output "static-inventory" { value = { all = { vars = { - ansible_user = "root" - private_interface = "enp7s0" - adminhost_ip = tolist(hcloud_server.adminhost.network)[0].ip + ansible_user = "root" + private_interface = "enp7s0" + adminhost_ip = tolist(hcloud_server.adminhost.network)[0].ip } } adminhost = { diff --git a/values/smtp/demo-values.example.yaml b/values/smtp/demo-values.example.yaml index 61af4315b..297b57297 100644 --- a/values/smtp/demo-values.example.yaml +++ b/values/smtp/demo-values.example.yaml @@ -1,43 +1,43 @@ -# CHANGEME-PROD: This is often a good default when using calico's default CIDR -# https://github.com/kubernetes-sigs/kubespray/blob/master/docs/calico.md#optional--define-the-default-pool-cidr -# or flannel's https://github.com/kubernetes-sigs/kubespray/blob/master/docs/flannel.md#flannel +# CHANGEME-DEMO: Change it according to what has been configured for minikube pod_network_cidr at +# https://github.com/wireapp/wire-server-deploy/blob/master/ansible/inventory/demo/host.yml#L39 # If you override those values, etc., then verify that this CIDR still makes sense -# For all variables the "egos-tech/smtp" image supports see: https://gitlab.com/egos-tech/smtp and https://github.com/ntppool/charts/tree/main/charts/smtp +# https://github.com/wireapp/helm-charts/tree/dev/charts/smtp -service: - type: ClusterIP # for outside of cluster access, change to NodePort and add nodePort to service.yaml -
port: 25 +smtp: + service: + type: ClusterIP # for outside of cluster access, change to NodePort and add nodePort to service.yaml + port: 25 -# for DKIM support -# extraVolumes: -# - name: dkim-key -# secret: -# secretName: dkim-private-key + # for DKIM support + # extraVolumes: + # - name: dkim-key + # secret: + # secretName: dkim-private-key -# extraVolumeMounts: -# - name: dkim-key -# mountPath: /secrets/dkim.key -# subPath: wire-mail.private -# readOnly: true + # extraVolumeMounts: + # - name: dkim-key + # mountPath: /secrets/dkim.key + # subPath: wire-mail.private + # readOnly: true -config: - RELAY_NETWORKS: ":10.233.0.0/16" -# for DKIM support -# MAILNAME: mail.example.com -# DKIM_KEY_PATH: /secrets/dkim.key -# DKIM_DOMAIN: "example.com" -# DKIM_PRIVATE_KEY: "/etc/exim4/dkim.key" + config: + RELAY_NETWORKS: ":10.233.0.0/16" + # for DKIM support + # MAILNAME: mail.example.com + # DKIM_KEY_PATH: /secrets/dkim.key + # DKIM_DOMAIN: "example.com" + # DKIM_PRIVATE_KEY: "/etc/exim4/dkim.key" -# PORT: "25" -# NET_DEV: eth0 -# OTHER_HOSTNAMES: other.example.com -# DISABLE_IPV6: 1 -# BIND_IP: 0.0.0.0 -# BIND_IP6: ::0 -# KEY_PATH: /path/to/key.crt -# CERTIFICATE_PATH: /path/to/certificate.crt -# SMARTHOST_ADDRESS: mail.example.com -# SMARTHOST_PORT: "587" -# SMARTHOST_USER: exampleuser -# SMARTHOST_PASSWORD: secret -# SMARTHOST_ALIASES: "*.example.com" + # PORT: "25" + # NET_DEV: eth0 + # OTHER_HOSTNAMES: other.example.com + # DISABLE_IPV6: 1 + # BIND_IP: 0.0.0.0 + # BIND_IP6: ::0 + # KEY_PATH: /path/to/key.crt + # CERTIFICATE_PATH: /path/to/certificate.crt + # SMARTHOST_ADDRESS: mail.example.com + # SMARTHOST_PORT: "587" + # SMARTHOST_USER: exampleuser + # SMARTHOST_PASSWORD: secret + # SMARTHOST_ALIASES: "*.example.com" diff --git a/values/smtp/prod-values.example.yaml b/values/smtp/prod-values.example.yaml index 61af4315b..099aa4da8 100644 --- a/values/smtp/prod-values.example.yaml +++ b/values/smtp/prod-values.example.yaml @@ -4,40 +4,41 @@ # If you override those 
values, etc., then verify that this CIDR still makes sense # For all variables the "egos-tech/smtp" image supports see: https://gitlab.com/egos-tech/smtp and https://github.com/ntppool/charts/tree/main/charts/smtp -service: - type: ClusterIP # for outside of cluster access, change to NodePort and add nodePort to service.yaml - port: 25 +smtp: + service: + type: ClusterIP # for outside of cluster access, change to NodePort and add nodePort to service.yaml + port: 25 -# for DKIM support -# extraVolumes: -# - name: dkim-key -# secret: -# secretName: dkim-private-key + # for DKIM support + # extraVolumes: + # - name: dkim-key + # secret: + # secretName: dkim-private-key -# extraVolumeMounts: -# - name: dkim-key -# mountPath: /secrets/dkim.key -# subPath: wire-mail.private -# readOnly: true + # extraVolumeMounts: + # - name: dkim-key + # mountPath: /secrets/dkim.key + # subPath: wire-mail.private + # readOnly: true -config: - RELAY_NETWORKS: ":10.233.0.0/16" -# for DKIM support -# MAILNAME: mail.example.com -# DKIM_KEY_PATH: /secrets/dkim.key -# DKIM_DOMAIN: "example.com" -# DKIM_PRIVATE_KEY: "/etc/exim4/dkim.key" + config: + RELAY_NETWORKS: ":10.233.0.0/16" + # for DKIM support + # MAILNAME: mail.example.com + # DKIM_KEY_PATH: /secrets/dkim.key + # DKIM_DOMAIN: "example.com" + # DKIM_PRIVATE_KEY: "/etc/exim4/dkim.key" -# PORT: "25" -# NET_DEV: eth0 -# OTHER_HOSTNAMES: other.example.com -# DISABLE_IPV6: 1 -# BIND_IP: 0.0.0.0 -# BIND_IP6: ::0 -# KEY_PATH: /path/to/key.crt -# CERTIFICATE_PATH: /path/to/certificate.crt -# SMARTHOST_ADDRESS: mail.example.com -# SMARTHOST_PORT: "587" -# SMARTHOST_USER: exampleuser -# SMARTHOST_PASSWORD: secret -# SMARTHOST_ALIASES: "*.example.com" + # PORT: "25" + # NET_DEV: eth0 + # OTHER_HOSTNAMES: other.example.com + # DISABLE_IPV6: 1 + # BIND_IP: 0.0.0.0 + # BIND_IP6: ::0 + # KEY_PATH: /path/to/key.crt + # CERTIFICATE_PATH: /path/to/certificate.crt + # SMARTHOST_ADDRESS: mail.example.com + # SMARTHOST_PORT: "587" + # SMARTHOST_USER: 
exampleuser + # SMARTHOST_PASSWORD: secret + # SMARTHOST_ALIASES: "*.example.com"