|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +source .buildkite/scripts/install_deps.sh |
| 4 | + |
# EXIT-trap handler: remove the scratch report files and propagate the
# script's original exit status.
# The trap is registered BEFORE `set -u` and before WORKSPACE /
# AWS_RESOURCES_FILE / AWS_REDSHIFT_RESOURCES_FILE are defined, so every
# expansion is guarded against being unset to avoid an unbound-variable
# error inside the trap.
cleanup_cloud_stale() {
  # Capture first: any command below would clobber $?.
  local exit_code=$?

  if [ -n "${WORKSPACE:-}" ]; then
    # Best-effort: a missing workspace dir must not mask the real status.
    cd "${WORKSPACE}" || true
  fi
  if [ -n "${AWS_RESOURCES_FILE:-}" ]; then
    rm -f -- "${AWS_RESOURCES_FILE}"
  fi
  if [ -n "${AWS_REDSHIFT_RESOURCES_FILE:-}" ]; then
    rm -f -- "${AWS_REDSHIFT_RESOURCES_FILE}"
  fi

  exit "$exit_code"
}
| 14 | + |
# Register cleanup of the scratch files for every exit path.
# NOTE(review): bash keeps a single EXIT trap per shell, so the later
# `trap cleanup_ephemeral_user EXIT` below replaces this registration;
# cleanup_cloud_stale will not run unless the two handlers are chained
# into one trap — verify the intended trap ordering.
trap cleanup_cloud_stale EXIT

set -euo pipefail

# Scratch files produced by the scans below (removed by cleanup_cloud_stale).
AWS_RESOURCES_FILE="aws.resources.txt"
AWS_REDSHIFT_RESOURCES_FILE="redshift_clusters.json"

# Resources older than this window are candidates for deletion.
RESOURCE_RETENTION_PERIOD="${RESOURCE_RETENTION_PERIOD:-"24 hours"}"
# GNU date: ISO-8601 timestamp of "now minus retention period".
DELETE_RESOURCES_BEFORE_DATE=$(date -Is -d "${RESOURCE_RETENTION_PERIOD} ago")
export DELETE_RESOURCES_BEFORE_DATE

CLOUD_REAPER_IMAGE="${DOCKER_REGISTRY}/observability-ci/cloud-reaper:0.3.0"

# DRY_RUN precedence: Buildkite meta-data, then environment, then "true".
DRY_RUN="$(buildkite-agent meta-data get DRY_RUN --default "${DRY_RUN:-"true"}")"

resources_to_delete=0

# Both branches deliberately pick "plan": actual destruction stays
# disabled until the filters are verified (see TODO).
COMMAND="validate"
if [[ "${DRY_RUN}" != "true" ]]; then
  # TODO: to be changed to "destroy --confirm" once it can be tested
  # that filters work as expected
  COMMAND="plan"
else
  COMMAND="plan"
fi
| 40 | + |
# Succeed (return 0) when the cloud-reaper report in $1 lists at least
# one resource; fail (return 1) otherwise.
# The first three lines of the report are loader banners:
#   ⇒ Loading configuration...
#   ✓ Succeeded to load configuration
#   Scanning resources... ━━━ 100% 0:00:00
# so only lines from the fourth onward count as actual resources.
any_resources_to_delete() {
  local report_file=$1
  local resource_count
  resource_count=$(tail -n +4 "${report_file}" | wc -l)
  [ "${resource_count}" -gt 0 ]
}
| 54 | + |
# As long as cloud reaper does not support OIDC authentication.
# Create a throw-away IAM admin user and export its access keys so the
# dockerized cloud-reaper can authenticate.
# Globals written: EPHEMERAL_USER, creds_json,
#   AWS_ACCESS_KEY_ID_EPHEMERAL, AWS_SECRET_ACCESS_KEY_EPHEMERAL.
# creds_json is intentionally NOT local: cleanup_ephemeral_user reads it
# later to find the access-key id to delete.
create_aws_ephemeral_user() {
  # Generate a unique name for the ephemeral IAM user.
  EPHEMERAL_USER="ephemeral-admin-$(date +%s)"
  echo "Creating IAM user: ${EPHEMERAL_USER}"
  # Tag the user so stale-resource sweeps can identify and reap it.
  aws iam create-user --user-name "${EPHEMERAL_USER}" \
    --tags Key=ephemeral,Value=true Key=division,Value=engineering Key=org,Value=obs Key=environment,Value=ci Key=repo,Value=elastic-package Key=created_at,Value="$(date -Is)"

  echo "Attaching AdministratorAccess policy to ${EPHEMERAL_USER}..."
  aws iam attach-user-policy --user-name "${EPHEMERAL_USER}" --policy-arn arn:aws:iam::aws:policy/AdministratorAccess

  echo "Creating access keys for ${EPHEMERAL_USER}..."
  creds_json=$(aws iam create-access-key --user-name "${EPHEMERAL_USER}")
  AWS_ACCESS_KEY_ID_EPHEMERAL=$(echo "$creds_json" | jq -r '.AccessKey.AccessKeyId')
  AWS_SECRET_ACCESS_KEY_EPHEMERAL=$(echo "$creds_json" | jq -r '.AccessKey.SecretAccessKey')
  export EPHEMERAL_USER AWS_ACCESS_KEY_ID_EPHEMERAL AWS_SECRET_ACCESS_KEY_EPHEMERAL
}
| 72 | + |
# Define cleanup function to delete the ephemeral IAM user regardless of script outcome.
# Hardened for use as an EXIT-trap handler running under `set -eu`:
# - if create_aws_ephemeral_user never ran (or failed before setting
#   EPHEMERAL_USER), there is nothing to clean up — return quietly
#   instead of tripping on an unbound variable;
# - each aws call is best-effort (`|| true`) so one failure cannot abort
#   the remaining teardown and leak the user or its keys.
cleanup_ephemeral_user() {
  if [ -z "${EPHEMERAL_USER:-}" ]; then
    return 0
  fi
  echo "Cleaning up ephemeral IAM user: ${EPHEMERAL_USER}"
  aws iam detach-user-policy --user-name "${EPHEMERAL_USER}" --policy-arn arn:aws:iam::aws:policy/AdministratorAccess || true
  # Access keys must be removed before the user itself can be deleted.
  if [ -n "${creds_json:-}" ]; then
    key_id=$(echo "$creds_json" | jq -r '.AccessKey.AccessKeyId')
    aws iam delete-access-key --user-name "${EPHEMERAL_USER}" --access-key-id "${key_id}" || true
  fi
  aws iam delete-user --user-name "${EPHEMERAL_USER}" || true
  echo "Ephemeral IAM user ${EPHEMERAL_USER} deleted."
  unset EPHEMERAL_USER AWS_ACCESS_KEY_ID_EPHEMERAL AWS_SECRET_ACCESS_KEY_EPHEMERAL
}
# Bash keeps a single EXIT trap per shell, so registering only
# cleanup_ephemeral_user here would silently replace the earlier
# `trap cleanup_cloud_stale EXIT` and the scratch files would never be
# removed. Chain both handlers; the `(exit "$script_rc")` re-seeds $?
# so cleanup_cloud_stale still observes the script's real exit status
# (its first statement captures $?). cleanup_ephemeral_user is
# best-effort so a failing aws call cannot abort the chain under set -e.
trap 'script_rc=$?; cleanup_ephemeral_user || true; (exit "$script_rc"); cleanup_cloud_stale' EXIT
| 84 | + |
# Run the dockerized cloud-reaper against AWS: create ephemeral
# credentials, validate the configuration, then scan for resources
# created before DELETE_RESOURCES_BEFORE_DATE, teeing the report into
# AWS_RESOURCES_FILE for later inspection by any_resources_to_delete.
# Globals read: CLOUD_REAPER_IMAGE, COMMAND, AWS_RESOURCES_FILE,
#   DELETE_RESOURCES_BEFORE_DATE, AWS_*_EPHEMERAL credentials.
cloud_reaper_aws() {
  echo "--- Configuring ephemeral user"
  create_aws_ephemeral_user

  echo "Validating configuration"
  docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \
    -e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID_EPHEMERAL" \
    -e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY_EPHEMERAL" \
    -e ACCOUNT_PROJECT="observability-ci" \
    -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
    "${CLOUD_REAPER_IMAGE}" \
    cloud-reaper \
    --debug \
    --config /etc/cloud-reaper/config.yml \
    validate

  echo "Scanning resources"
  # ${COMMAND} is intentionally unquoted: per the TODO near its
  # definition it may become "destroy --confirm", which must word-split
  # into two arguments.
  # shellcheck disable=SC2086
  docker run --rm -v "$(pwd)/.buildkite/configs/cleanup.aws.yml":/etc/cloud-reaper/config.yml \
    -e AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID_EPHEMERAL" \
    -e AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY_EPHEMERAL" \
    -e ACCOUNT_PROJECT="observability-ci" \
    -e CREATION_DATE="${DELETE_RESOURCES_BEFORE_DATE}" \
    "${CLOUD_REAPER_IMAGE}" \
    cloud-reaper \
    --config /etc/cloud-reaper/config.yml \
    ${COMMAND} | tee "${AWS_RESOURCES_FILE}"
}
| 112 | + |
echo "--- Installing awscli"
with_aws_cli

echo "--- Cleaning up AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}..."
cloud_reaper_aws

# When the reaper report lists anything pending, flag the build and
# annotate it on Buildkite. (Nothing touches resources_to_delete between
# the scan and this check, so the two conditions collapse into one.)
if any_resources_to_delete "${AWS_RESOURCES_FILE}"; then
  echo "Pending AWS resources"
  resources_to_delete=1
  message="There are resources to be deleted"
  echo "${message}"
  if running_on_buildkite; then
    buildkite-agent annotate \
      "${message}" \
      --context "ctx-cloud-reaper-error" \
      --style "error"
  fi
fi
| 134 | + |
echo "--- Cleaning up other AWS resources older than ${DELETE_RESOURCES_BEFORE_DATE}"

export AWS_DEFAULT_REGION=us-east-1
# Keep AWS CLI output out of a pager (non-interactive CI run).
export AWS_PAGER=""

echo "--- Checking if any Redshift cluster still created"
# Dump clusters tagged environment=ci into the scratch JSON file
# (removed by cleanup_cloud_stale).
aws redshift describe-clusters \
  --tag-keys "environment" \
  --tag-values "ci" > "${AWS_REDSHIFT_RESOURCES_FILE}"

clusters_num=$(jq -rc '.Clusters | length' "${AWS_REDSHIFT_RESOURCES_FILE}")

echo "Number of clusters found: ${clusters_num}"
redshift_clusters_to_delete=0

# Epoch milliseconds of "now minus retention period"; hoisted out of the
# loop since it does not depend on the cluster (GNU date, %3N = ms).
thresholdEpoch=$(date -d "${RESOURCE_RETENTION_PERIOD} ago" +"%s%3N")

while read -r i ; do
  # The here-string yields one empty line when no clusters matched.
  if [ -z "$i" ]; then
    continue
  fi

  identifier=$(echo "$i" | jq -rc ".ClusterIdentifier")
  # Tags; jq's select emits nothing when a tag is absent.
  repo=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "repo").Value')
  environment=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "environment").Value')
  # Creation time tag in milliseconds.
  createdAt=$(echo "$i" | jq -rc '.Tags[] | select(.Key == "created_date").Value')

  if [[ ! "${identifier}" =~ ^elastic-package-test- ]]; then
    echo "Skip cluster ${identifier}, do not match required identifiers."
    continue
  fi

  if [[ "${repo}" != "integrations" && "${repo}" != "elastic-package" ]]; then
    echo "Skip cluster ${identifier}, not from the expected repo: ${repo}."
    continue
  fi

  if [[ "${environment}" != "ci" ]]; then
    echo "Skip cluster ${identifier}, not from the expected environment: ${environment}."
    continue
  fi

  # A missing or non-numeric created_date tag would make the -gt test
  # below fail and kill the whole script under `set -e`; flag the
  # cluster for human review instead of crashing.
  if ! [[ "${createdAt}" =~ ^[0-9]+$ ]]; then
    echo "Cluster ${identifier} has no valid created_date tag (${createdAt}); flagging for review."
    redshift_clusters_to_delete=1
    continue
  fi

  if [ "${createdAt}" -gt "${thresholdEpoch}" ]; then
    echo "Skip cluster $identifier. It was created < ${RESOURCE_RETENTION_PERIOD} ago"
    continue
  fi

  echo "To be deleted cluster: $identifier. It was created > ${RESOURCE_RETENTION_PERIOD} ago"
  if [ "${DRY_RUN}" != "false" ]; then
    redshift_clusters_to_delete=1
    continue
  fi

  echo "Deleting: $identifier. It was created > ${RESOURCE_RETENTION_PERIOD} ago"
  if ! aws redshift delete-cluster \
      --cluster-identifier "${identifier}" \
      --skip-final-cluster-snapshot \
      --output json \
      --query "Cluster.{ClusterStatus:ClusterStatus,ClusterIdentifier:ClusterIdentifier}" ; then

    echo "Failed delete-cluster"
    # Annotation message fixed: this branch is a FAILED deletion, the
    # old text wrongly claimed the cluster was deleted.
    buildkite-agent annotate \
      "Failed to delete redshift cluster: ${identifier}" \
      --context "ctx-aws-readshift-deleted-error-${identifier}" \
      --style "error"

    redshift_clusters_to_delete=1
  else
    echo "Done."
    # if deletion works, no need to mark this one as to be deleted
    buildkite-agent annotate \
      "Deleted redshift cluster: ${identifier}" \
      --context "ctx-aws-readshift-deleted-${identifier}" \
      --style "success"
  fi
done <<< "$(jq -c '.Clusters[]' "${AWS_REDSHIFT_RESOURCES_FILE}")"
| 210 | + |
# Fold the Redshift findings into the global failure flag and surface
# them as a Buildkite annotation when available.
if (( redshift_clusters_to_delete == 1 )); then
  resources_to_delete=1
  message="There are redshift resources to be deleted"
  echo "${message}"
  if running_on_buildkite; then
    buildkite-agent annotate \
      "${message}" \
      --context "ctx-aws-readshift-error" \
      --style "error"
  fi
fi

# TODO: List and delete the required resources using aws cli or using cloud-reaper tool
echo "--- TODO: Cleaning up IAM roles"
echo "--- TODO: Cleaning up IAM policies"
echo "--- TODO: Cleaning up Schedulers"

# Fail the step whenever anything is still pending deletion.
if (( resources_to_delete == 1 )); then
  exit 1
fi
0 commit comments