Files
apinto/build/resources/auto-join.sh
2025-06-26 14:52:42 +08:00

121 lines
4.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/sh
# Auto-join helper for an apinto node.
#
# Flow, as implemented in this file:
#   1. Wait for the local service at http://127.0.0.1:9400 to answer.
#   2. If the numeric suffix of HOSTNAME is 0, probe peers 1..MAX_POD_INDEX on
#      port 9401 and join the first one that is reachable and joinable;
#      otherwise wait for peer 0 and join it.
#
# Environment:
#   SERVICE, NAMESPACE  required; used to build peer DNS names of the form
#                       <base>-<index>.<SERVICE>.<NAMESPACE>.svc.cluster.local
#   HOSTNAME            read as "<base>-<index>"
#                       (presumably a Kubernetes StatefulSet pod name; confirm)
#   MAX_POD_INDEX       optional; highest peer index probed by node 0
#
# Both INFO and ERROR lines are appended to the file below.
ERR_LOG=/var/log/apinto/error.log
# Append an INFO-level line to the log file named by ERR_LOG.
# Globals:   ERR_LOG (read) - path of the log file
# Arguments: $1 - message text, written verbatim
# Outputs:   one line "[YYYY-mm-dd HH:MM:SS] [INFO] <message>" appended to ERR_LOG
echo_info() {
  # printf '%s' keeps backslashes in the message literal (some sh builtin
  # echo implementations expand them); the quoted target tolerates
  # whitespace in the log path.
  printf '[%s] [INFO] %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$1" >> "$ERR_LOG"
}
# Append an ERROR-level line to the log file named by ERR_LOG.
# Globals:   ERR_LOG (read) - path of the log file
# Arguments: $1 - message text, written verbatim
# Outputs:   one line "[YYYY-mm-dd HH:MM:SS] [ERROR] <message>" appended to ERR_LOG
echo_error() {
  # printf '%s' keeps backslashes in the message literal (some sh builtin
  # echo implementations expand them); the quoted target tolerates
  # whitespace in the log path.
  printf '[%s] [ERROR] %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$1" >> "$ERR_LOG"
}
# Validate required environment variables.
if [ -z "$SERVICE" ] || [ -z "$NAMESPACE" ]; then
  echo_error "Environment variables SERVICE and NAMESPACE must be set."
  exit 1
fi

# Split HOSTNAME ("<base>-<index>") into this pod's index and base name.
CURRENT_INDEX=${HOSTNAME##*-}
BASE_NAME=${HOSTNAME%-*}
MAX_ATTEMPTS=60 # max readiness probes; about 1 minute of waiting at 1 s per probe
RETRY_INTERVAL=5 # retry interval, in seconds
MAX_JOIN_RETRIES=12 # max retries when joining the cluster fails
MAX_POD_INDEX=${MAX_POD_INDEX:-10} # highest pod index to check; defaults to 10, configurable via environment

# Wait for the local service to come up.
# Readiness is tracked with an explicit flag rather than inferred from the
# counter, so a probe that succeeds is never misreported as a timeout.
attempt=0
local_ready=0
while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do
  if curl --max-time 5 --silent --fail http://127.0.0.1:9400; then
    local_ready=1
    break
  fi
  echo_info "Waiting for localhost to be ready... Attempt $attempt"
  sleep 1
  attempt=$((attempt + 1))
done
if [ "$local_ready" -ne 1 ]; then
  echo_error "Timeout waiting for localhost to be ready after $MAX_ATTEMPTS attempts."
  exit 1
fi
# Check whether this node has successfully joined a cluster.
# Runs "./apinto info" and counts output lines containing "--Peer"; two or
# more such lines count as joined.
# Arguments: none
# Outputs:   logs the verdict and the full info output via echo_info
# Returns:   0 if at least 2 peer lines were found, 1 otherwise
check_cluster_join() {
  local info_output peer_count
  info_output=$(./apinto info 2>&1)
  # printf '%s\n' passes the captured output to grep unmodified.
  peer_count=$(printf '%s\n' "$info_output" | grep -c -- "--Peer")
  # Guard the numeric test below in case grep printed nothing.
  peer_count=${peer_count:-0}
  if [ "$peer_count" -ge 2 ]; then
    echo_info "Successfully joined the cluster with $peer_count peers. Cluster info: $info_output"
    return 0
  else
    echo_info "Failed to join the cluster. Only $peer_count peer(s) found. Info: $info_output"
    return 1
  fi
}
# Try to join the cluster through one peer, with retries.
# Each attempt runs "./apinto join -addr <target>" and, if that command
# succeeds, confirms membership with check_cluster_join.
# Globals:   MAX_JOIN_RETRIES (read), RETRY_INTERVAL (read)
# Arguments: $1 - address of the peer to join through (host:port)
# Outputs:   progress and failures are logged via echo_info / echo_error
# Returns:   0 once a join is validated, 1 after MAX_JOIN_RETRIES failed attempts
try_join_cluster() {
  local target_addr=$1
  local join_retries=0
  local join_output

  while [ "$join_retries" -lt "$MAX_JOIN_RETRIES" ]; do
    echo_info "Attempting to join cluster via $target_addr (Retry $join_retries/$MAX_JOIN_RETRIES)..."
    if join_output=$(./apinto join -addr "$target_addr" 2>&1); then
      if check_cluster_join; then
        return 0
      fi
      echo_info "Join via $target_addr executed but cluster validation failed.Details: $join_output"
    else
      echo_info "Join via $target_addr failed. Details: $join_output"
    fi

    join_retries=$((join_retries + 1))
    # No sleep after the final attempt.
    if [ "$join_retries" -lt "$MAX_JOIN_RETRIES" ]; then
      echo_info "Retrying join in $RETRY_INTERVAL seconds..."
      sleep "$RETRY_INTERVAL"
    fi
  done

  echo_error "Failed to join cluster via $target_addr after $MAX_JOIN_RETRIES retries."
  return 1
}
# Role dispatch: index 0 looks for a running peer to join, every other index
# joins peer 0.
# NOTE(review): if CURRENT_INDEX is not an integer the test below reports an
# error and the else branch runs; confirm HOSTNAME always ends in "-<number>".
if [ "$CURRENT_INDEX" -eq 0 ]; then
  # Index 0: check whether other pods are already running.
  echo_info "This is $HOSTNAME. Checking if other nodes are running..."
  for i in $(seq 1 "$MAX_POD_INDEX"); do
    OTHER_POD="${BASE_NAME}-${i}.${SERVICE}.${NAMESPACE}.svc.cluster.local"
    attempt=0
    while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do
      if curl --max-time 5 --silent --fail "http://${OTHER_POD}:9401"; then
        echo_info "Found a running node: ${OTHER_POD}."
        if try_join_cluster "${OTHER_POD}:9401"; then
          exit 0
        fi
        echo_info "Failed to join via ${OTHER_POD}. Trying next node..."
        break
      fi
      echo_info "${OTHER_POD} is not ready yet. Retrying in $RETRY_INTERVAL seconds..."
      sleep "$RETRY_INTERVAL"
      attempt=$((attempt + 1))
    done
    # Report a timeout only when the probes were actually exhausted; leaving
    # the loop via break (failed join) keeps attempt below the limit.
    if [ "$attempt" -ge "$MAX_ATTEMPTS" ]; then
      echo_info "Timeout waiting for ${OTHER_POD} after $MAX_ATTEMPTS attempts."
    fi
  done
  echo_info "No other nodes are available or joinable. Starting as the first node."
else
  # Any other index: join through pod 0.
  LEADER_POD="${BASE_NAME}-0.${SERVICE}.${NAMESPACE}.svc.cluster.local"
  echo_info "This is $HOSTNAME. Waiting for $LEADER_POD to be ready..."
  # Readiness is tracked with an explicit flag so that a probe which succeeds
  # is never misreported as a timeout.
  attempt=0
  leader_ready=0
  while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do
    if curl --max-time 5 --silent --fail "http://$LEADER_POD:9401"; then
      leader_ready=1
      break
    fi
    echo_info "Waiting for $LEADER_POD to be ready... Attempt $attempt"
    sleep 1
    attempt=$((attempt + 1))
  done
  if [ "$leader_ready" -ne 1 ]; then
    echo_error "Timeout waiting for $LEADER_POD to be ready after $MAX_ATTEMPTS attempts."
    exit 1
  fi
  echo_info "$LEADER_POD is ready."
  if try_join_cluster "$LEADER_POD:9401"; then
    exit 0
  else
    echo_error "Failed to join cluster via $LEADER_POD after $MAX_JOIN_RETRIES retries."
    exit 1
  fi
fi