#!/bin/bash
# cms-cluster   Manages CMS related services across the cluster
#
# Authors:      Hovey Yu <hovey.yu@ericsson.com>
#
# description:  Ericsson / SA Media Watchpoint CMS Cluster Services Management Script

# On interrupt/termination: kill every running background job (the parallel
# per-node service_call workers) plus each job's direct children, then exit 1.
# NOTE(review): ps --ppid only lists one level of children; grandchildren are
# assumed to terminate with their parents.
trap '
    for job in $(jobs -pr); do
        jjob=$(ps h --ppid $job -o pid)
        kill -- $job $jjob
    done
    exit 1' SIGINT SIGTERM

# Base ssh invocation: -n (no stdin), -q (quiet), no host-key prompts.
CMD_SSH="ssh -nq -o StrictHostKeyChecking=no"
# root uses the dedicated cluster key instead of its default identity
if [ "$(whoami)" == "root" ]; then
    CMD_SSH="$CMD_SSH -i /opt/cms/keys/cms_id_rsa"
fi
# curl for the local consul HTTP API: 10s timeout, silent
CMD_CURL="curl -m 10 -s"
CONSUL_URL="localhost:8501"

# Expected (post-upgrade) CMS version; any other value enables pre-upgrade mode
CMS_VERS="6.1"

VERBOSE=false     # -v / --verbose: echo debug messages
USE_CONSUL=true   # query consul for service->node mapping (off with -n or pre-upgrade)
FORCE_FULL=false  # set for shutdown/reboot: always act on the full service list
PRE_UPGRADE=false # true when installed CMS version != $CMS_VERS
NODES=( )         # target nodes: -n argument or DNS discovery
SERVICES=( )      # node_query:consul_service entries to act on

CLUSTER_GROUPS=( cs edb es app pt )

# Default time in seconds before flushing redis
REDIS_FLUSH=120

# Format= ( node_query:consul_service )
# If using consul, script will query consul API for the consul_service
# Node query is only used if not using consul or no consul nodes are found for that service
# Node query can be a cluster group or a specific cname/hostname/IP, or empty
# Empty node query means all nodes
# Node query starting with ~ means all nodes EXCEPT those matching
# Lists are in stop order; cluster_command reverses them for start.
CORE_SRV_LIST=(
    "pt:portaltomcat"
    "app:workflow?tag=process"
    "app:xporttomcat"
    "app:xportmanager"
    "app:cms?tag=process"
    "app:adinameserver"
    "app:tomcat"
    "app:cms-snmp-subagent"
)

# Additional services included with -f (and for shutdown/reboot)
FULL_SRV_LIST=(
    ":collectd"
    "~cs:logstash?tag=shipper"
    "es:elasticsearch?tag=es"
    "edb:ppfm?tag=standby"
    "edb:ppfm?tag=master"
    "~edb:ppfm?tag=witness"
    "cs:kibana"
    "cs:redis"
    "cs:logstash?tag=indexer"
    "cs:elasticsearch?tag=elk"
)

# List of other services that can be called with -s option
OTHER_SRV_LIST=(
    "app:consul-template?tag=app"
    "cs:consul-template?tag=cs"
    "cs:haproxy"
    "cs:keepalived"
    ":consul"
)

# Union of all known services; parse_service matches -s arguments against this
SRV_LIST=(
    ${CORE_SRV_LIST[@]}
    ${FULL_SRV_LIST[@]}
    ${OTHER_SRV_LIST[@]}
)

# Write a message to syslog tagged "cms-cluster".
function log_sys() {
    logger -t cms-cluster "$@"
}

# Echo a message (escape sequences honored) only when verbose mode is on.
log_verbose() {
    if [ "$VERBOSE" = "true" ]; then
        echo -e "$@"
    fi
}

# Write a message to stdout AND to syslog (shown regardless of verbosity).
function log_error() {
    echo -e "$@"
    log_sys "$@"
}

# Write a message to syslog always, and to stdout only in verbose mode.
function log_debug() {
    log_verbose "$@"
    log_sys "$@"
}



# Print command-line usage to stdout.
function usage() {
    cat << EOF
Usage $0 {start|stop|status|restart|shutdown|reboot} [-h][-v] [-n node] [-f|-s service] 

  Commands:
    start                         Start all services
    stop                          Stop all services
    status                        Checks status of all services
    restart                       Restarts all services
    shutdown                      Stops all services and shuts down all servers
    reboot                        Stops all services and reboots all servers

  General Arguments:
    -h | --help                   Displays usage
    -v | --verbose                Prints verbose messages

  Node Arguments:
    -n <node> | --node <node>     Manually specify a single node (cname/IP)

  Service Arguments:
    -f | --full                   Includes all CMS related services
                                  Cannot be used with '-s service'
    -s <srv> | --service <srv>    Manually specify a single service
                                  Cannot be used with '-f' or with commands 'shutdown' or 'reboot'
EOF
}

# Reads product.properties to get CMS version X.X
# Arguments: none
# Runs remotely on the "app" node; prints up to 3 characters following
# "product.version=" (e.g. "6.1"). Prints nothing and exits non-zero if the
# file or key is missing, or if ssh fails.
function get_cms_version() {
    ${CMD_SSH} app "grep -oP '(?<=product.version=).{0,3}' /opt/tandbergtv/cms/product.properties"
}

# Check VM reachability
# Probes every node in $NODES via ssh; unreachable nodes are logged and
# removed from $NODES. Exits 1 if no node remains.
# The remote command always exits 0 (|| echo ...), so a non-zero status here
# means the ssh connection itself failed.
function nodes_status() {
    local reachable=( )
    for node in ${NODES[@]}; do
        ${CMD_SSH} $node "sudo /sbin/service consul status &> /dev/null || echo Consul is not running on $node"
        if [ $? -ne 0 ]; then
            log_error "Node $node is unavailable"
        else
            # Rebuild the list rather than using ${NODES[@]/$node}: pattern
            # substitution removes "$node" as a SUBSTRING of every element,
            # so dropping cs1 would corrupt cs10 -> "0" and cs11 -> "1".
            reachable=( ${reachable[@]} $node )
        fi
    done
    NODES=( ${reachable[@]} )
    
    if [ -z "$NODES" ]; then
        log_error "No nodes available"
        exit 1
    fi
}

# Reregister ppfm services with tags
# ppfm tags: witness, master, standby
# Polls edb1 (every 30s, up to 5 minutes) until "service ppfm-1.1
# cluster-status" reports a healthy cluster, then re-registers each member
# with consul using its role as a tag so the role-ordered stop
# (standby -> master -> witness) still works. Finally waits (up to 60s)
# for all three role tags to resolve through consul DNS.
# Exits the whole script on timeout or registration failure.
function ppfm_tag() {
    # Force verbose logging for the duration of this function
    local VERBOSE=true
    local time=0
    local timeout=300
    while :; do
        ppfm_cluster=$(${CMD_SSH} edb1 "sudo /sbin/service ppfm-1.1 cluster-status")
        if [ $? -eq 0 ]; then
            # Register master, standby, witness
            # Keep only the member table: drop everything from the
            # "Database Status" section onward, then skip the header lines
            cluster_table=$(echo "$ppfm_cluster" | sed -n '/Database Status/q;p' | tail -n +5)
            
            # Each table row: column 1 = role, column 2 = member IP
            while read -r line; do
                ip=$(echo "$line" | awk '{print $2}')
                role=$(echo "$line" | awk '{print tolower($1)}')
                log_debug "Registering $ip as ppfm $role"
                # </dev/null keeps ssh from consuming the loop's stdin
                ${CMD_SSH} $ip "sudo /opt/consul/bin/regconsulsrv -n ppfm -p 7800 -s '/etc/init.d/ppfm-1.1' -t clusterservices -t $role -r &> /dev/null" </dev/null
                if [ $? -ne 0 ]; then
                    log_error "Unable to register ppfm on $ip; please ensure consul is running"
                    exit 1
                fi
            done <<< "$cluster_table"
            break
        elif [ $time -ge $timeout ]; then
            log_error "PPFM cluster is not healthy! Please fix before continuing."
            log_error "$ppfm_cluster"
            exit 1
        else
            log_debug "Waiting for PPFM cluster to be healthy..."
            sleep 30
            (( time += 30 ))
        fi
    done
    
    # Give ppfm time to appear in consul dns
    ppfm_time=0
    ppfm_timeout=60
    while :; do
        # Query consul's DNS interface directly for each role tag
        local witness=$(dig +short @127.0.0.1 -p 8600 witness.ppfm.service.consul. ANY)
        local master=$(dig +short @127.0.0.1 -p 8600 master.ppfm.service.consul. ANY)
        local standby=$(dig +short @127.0.0.1 -p 8600 standby.ppfm.service.consul. ANY)
        if [ -z "$witness" ] || [ -z "$master" ] || [ -z "$standby" ]; then
            if [ $ppfm_time -ge $ppfm_timeout ]; then
                log_error "PPFM nodes address retrieved from consul: \nWITNESS  : $witness \nMASTER   : $master \nSTANDBY  : $standby"
                log_error "PPFM cluster not registered properly with consul! Please fix before continuing."
                log_error "$ppfm_cluster"
                exit 1
            fi
            log_debug "Waiting for PPFM cluster to get registered properly with consul..."
            sleep 10
            (( ppfm_time += 10 ))
        else
            break
        fi
    done
}

# Queries consul for list of nodes that have service <srv> registered
# Arguments: <srv>
# jq errors (consul down / non-JSON reply) are silenced -> empty output.
function get_consul_service_nodes() {
    local reply
    reply=$(${CMD_CURL} "$CONSUL_URL/v1/catalog/service/$1")
    jq -r '.[] | .Node' 2> /dev/null <<< "$reply"
}

# Queries consul for list of services registered with node <node>
# Arguments: <node>
# jq errors (consul down / non-JSON reply) are silenced -> empty output.
function get_consul_node_services() {
    local reply
    reply=$(${CMD_CURL} "$CONSUL_URL/v1/catalog/node/$1")
    jq -r '.Services[] | .Service' 2> /dev/null <<< "$reply"
}

# Queries consul for list of nodes
# Arguments: none
# jq errors (consul down / non-JSON reply) are silenced -> empty output.
function get_consul_nodes() {
    local reply
    reply=$(${CMD_CURL} "$CONSUL_URL/v1/catalog/nodes")
    jq -r '.[] | .Node' 2> /dev/null <<< "$reply"
}

# Queries DNS for list of nodes
# Arguments: none
# For each cluster group, probes <group>1, <group>2, ... until a name no
# longer resolves; prints all resolvable names sorted, one per line.
function get_dns_nodes() {
    local nodes=""
    for cg in ${CLUSTER_GROUPS[@]}; do
        for (( idx=1; ; idx++ )); do
            host $cg$idx &> /dev/null || break
            nodes="$nodes $cg$idx"
        done
    done
    echo $nodes | tr ' ' '\n' | sort
}

# Expands a -s argument into one or more node_query:consul_service entries
# Arguments: <srv> | <clustergroup>:<srv>
# If there is a : in the argument, the caller's query is used verbatim
# If exactly one node is targeted, the service is pinned to that node
# If ppfm, the entries are listed in stop order: standby, master, witness
# All other services are matched against SRV_LIST (tag suffix optional);
# note that matching a bare <srv> selects ALL tagged variants of it.
# If no mapping is found, ":<srv>" is emitted (try on every node).
function parse_service() {
    # Fix: found/s were previously global and leaked out of this function
    local found s
    # If colon was already specified, use provided query
    if [[ "$1" =~ .*:.* ]]; then
        echo "$1"
        return
    # If only one node exists, use that node
    elif [ ${#NODES[@]} -eq 1 ]; then
        echo "${NODES[0]}:$1"
        return
    fi
    
    # For PPFM, treat as special case
    case $1 in
        ppfm | ppfm-1.1 )  echo "edb:ppfm?tag=standby" \
                                "edb:ppfm?tag=master" \
                                "~edb:ppfm?tag=witness"
                           return ;;
    esac
    
    # Try to regex match with an entry from SRV_LIST, where tag is optional
    # NOTE(review): $1 is interpolated unescaped into the regex; regex
    # metacharacters in a service name would change the match.
    found=false
    for s in ${SRV_LIST[@]}; do
        [[ "$s" =~ .*:$1(\?tag=.*)?$ ]] && echo "$s" && found=true
    done
    
    # If no service can be found, this service will be tried on all nodes
    if ! $found; then
        echo ":$1"
    fi
}


# Returns the init.d service corresponding to the consul service
# Arguments: <consul_srv>
# For some consul services, the service name does not match
function map_consul_service() {
    case $1 in
        portaltomcat )     echo "tomcat" ;;
        ppas )             echo "ppas-9.3" ;;
        ppfm* )            echo "ppfm-1.1" ;;
        * )                echo "${1%%\?*}" ;;
    esac
}

# Flush redis after a certain amount of time
# Arguments: <node> [time]
# Checks the logstash list length on <node>; if it is non-empty, waits
# [time] seconds (default $REDIS_FLUSH) so indexers can drain the queue,
# then flushes redis ON <node>.
function flush_redis() {
    local node=$1
    local list_num
    list_num=$(${CMD_SSH} $node "redis-cli llen logstash")
    if [ "$list_num" != "0" ]; then
        local time=0
        local delay=10
        local max_time=${2:-$REDIS_FLUSH}
        while [ $time -le $max_time ]; do
            # Fix: message previously referenced $node while it was unset,
            # so the node name was blank in the log line
            log_verbose "Waiting $max_time seconds for redis on $node before flushing $list_num records"
            sleep $delay
            (( time += delay ))
        done
        log_verbose "Flushing $list_num records from redis"
        # Fix: FLUSHALL previously ran on the local host; the llen probe
        # above targets $node, so the flush must too
        ${CMD_SSH} $node "redis-cli FLUSHALL"
    fi
}

# Use node query for service call
# Arguments: <command> <node_query:consul_service>
# Resolves the node-query prefix against $NODES:
#   "~X"  -> all nodes whose name does NOT contain X
#   "X"   -> all nodes whose name contains X (substring grep match)
#   ""    -> every node
# then runs "service <init_srv> <command>" on each matching node in
# parallel (backgrounded; the caller is expected to `wait`).
function node_query_service_call() {
    local node_query=${2%%:*}
    local init_service=$(map_consul_service ${2##*:})
    
    if [[ "$node_query" =~ ~.* ]]; then
        # Negated query: strip the leading ~ and exclude matching nodes
        query_nodes=$(echo "${NODES[@]}" | tr ' ' '\n' | grep -v ${node_query#~*})
    elif [ ! -z "$node_query" ]; then
        query_nodes=$(echo "${NODES[@]}" | tr ' ' '\n' | grep "$node_query")
    else
        node_query="all"
        query_nodes=$(echo "${NODES[@]}")
    fi
    if [ ! -z "$query_nodes" ]; then
        echo "sudo /sbin/service $init_service $1 on $node_query nodes:" $query_nodes
        for node in $query_nodes; do
            service_call $node $init_service $1 &
        done
    fi
}

# SSH into a node and calls a service command
# Arguments: <node> <srv> <command>
# Executes: ssh <node> "service <srv> <command>"
# If redis, flush all after a grace period for pre-upgrade
# Prints one colored RUNNING/STOPPED/OK/FAILED line per call; failures are
# also mirrored to syslog. After start/stop, chkconfig is toggled so the
# on/off state survives a reboot. Returns 1 if /etc/init.d/<srv> does not
# exist on the node.
# NOTE(review): the trailing "return $?" reports the status of the LAST
# command in the chosen case arm (often the chkconfig ssh), not the status
# of the service command itself.
function service_call() {
    local node=$1
    local srv=$2
    local command=$3
    
    # Only act if the init script exists on the target node
    if ${CMD_SSH} $node "stat /etc/init.d/$srv &> /dev/null"; then
        if [ "$srv" == "redis" ] && [ "$command" == "stop" ] && $PRE_UPGRADE; then
            flush_redis $node
        fi
        ${CMD_SSH} $node "sudo /sbin/service $srv $command" &> /dev/null
        local ret=$?
        # exit_str is non-empty only in verbose mode (log_verbose prints then)
        local exit_str=$(log_verbose "exitcode is $ret")
        case $command in
            status )
                # All services exit 0 if running
                # All services exit 3 if stopped except:
                #  logstash or kibana exit 2
                case $ret in
                    0 )
                        printf "  %-60s %s\e[1;32m%s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "RUNNING" "] $exit_str"
                        ;;
                    2 )
                        case $srv in
                            logstash | kibana )
                                printf "  %-60s %s\e[1;31m%s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "STOPPED" "] $exit_str"
                                ;;
                            * )
                                printf "  %-60s %s\e[1;31m%s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "FAILED" "] $exit_str"
                                log_sys "$node: sudo /sbin/service $srv $command [ FAILED ] exitcode is $ret"
                                ;;
                        esac
                        ;;
                    3 )
                        printf "  %-60s %s\e[1;31m%s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "STOPPED" "] $exit_str"
                        ;;
                    * )
                        printf "  %-60s %s\e[1;31m%s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "FAILED" "] $exit_str"
                        log_sys "$node: sudo /sbin/service $srv $command [ FAILED ] exitcode is $ret"
                        ;;
                esac
                ;;
            stop* )
                # All services exit 0 if stopped successfully
                # If not, call status to explicitly check
                case $ret in
                    0 )
                        printf "  %-60s %s\e[1;31m%s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "STOPPED" "] $exit_str"
                        ;;
                    * )
                        log_sys "$node: sudo /sbin/service $srv $command returned $ret; running status to check"
                        service_call $1 $2 status
                        ;;
                esac
                # Keep the service disabled across reboots
                ${CMD_SSH} $node "sudo chkconfig $srv off"
                ;;
            start )
                # All services exit 0 if started successfully
                # If not, call status to explicitly check
                case $ret in
                    0 )
                        printf "  %-60s %s  \e[1;32m%-4s\e[m%s\n" "$node: sudo /sbin/service $srv $command" "[" "OK" "] $exit_str"
                        ;;
                    * )
                        log_sys "$node: sudo /sbin/service $srv $command returned $ret; running status to check"
                        service_call $1 $2 status
                        ;;
                esac
                # Re-enable the service at boot
                ${CMD_SSH} $node "sudo chkconfig $srv on"
                ;;
        esac
    else
        #log_error "  Error: Could not find service $srv on node $node"
        return 1
    fi
    return $?
}

# This command calls several service_call commands on several nodes and services
# Calls all services in $SERVICES on all nodes in consul or otherwise specified in $NODES
# If a service cannot be found in consul, it is called on all nodes in the node query
# For consul service itself, don't query consul as it only returns servers
# If cluster command is start, reverse the services order
# If cluster command is stop for all nodes, retag PPFM nodes with semi-permanence
# Arguments: <COMMAND>
function cluster_command() {
    local services_list=""
    # If function is start, reverse the service list
    # ($SERVICES is kept in stop order, so starting must run bottom-up)
    if [ "$1" == "start" ]; then
        for (( i=${#SERVICES[@]}-1 ; i>= 0 ; i-- )) ; do
            services_list="$services_list ${SERVICES[${i}]}"
        done
    else
        services_list="${SERVICES[@]}"
        # Before a cluster-wide stop, tag PPFM roles in consul so the
        # standby/master/witness stop ordering still resolves
        if [ "$1" == "stop" ] && [ ${#NODES[@]} -ne 1 ] && [[ "$services_list" =~ .*"ppfm".* ]]; then
            ppfm_tag
        fi
    fi
    
    log_verbose "Running cluster command: $1\n"
    log_verbose "Service List:\n$services_list\n"
    log_verbose "Node List:\n${NODES[@]}\n\n"
    
    for srv in $services_list ; do
        local consul_service=${srv##*:}
        local consul_nodes=$(get_consul_service_nodes "$consul_service")
        if $USE_CONSUL && [ ! -z "$consul_nodes" ] && [ "$consul_service" != "consul" ]; then
            init_service=$(map_consul_service $consul_service)
            echo "sudo /sbin/service $init_service $1 on nodes:" $consul_nodes
            # Fan out one backgrounded call per node; barrier with wait below
            for node in $consul_nodes; do
                service_call $node $init_service $1 &
            done
        else
            # No consul answer (or consul disabled): use the static node query
            node_query_service_call $1 $srv
        fi
        # Finish this service on every node before moving to the next service
        wait
    done
    echo
}

# Runs "init <runlevel>" on every node in $NODES, saving the local machine
# (if it appears in the list) for last so remote nodes go down first.
# Arguments: <runlevel> (0 = shutdown, 6 = reboot)
function cluster_init() {
    local myself=""
    for node in ${NODES[@]}; do
        if [ -z "$myself" ]; then
            # Resolve the node's address and compare against every local one
            ip=$(host $node | tail -n 1 | awk '{print $NF}')
            for my_ip in $(hostname -I); do
                if [ "$my_ip" == "$ip" ]; then
                    myself=$node
                    break
                fi
            done
            if [ -n "$myself" ]; then
                # This entry is the local machine: defer it until the end
                continue
            fi
        fi
        echo "Running init $1 on $node"
        $CMD_SSH $node "init $1"
    done
    if [ -n "$myself" ]; then
        echo "Running init $1 on (myself)"
        init $1
    fi
}

# CLI command wrappers: map each command word onto the cluster operations.

# Start all selected services on all selected nodes.
start() { cluster_command start; }

# Stop all selected services on all selected nodes.
stop() { cluster_command stop; }

# Report the status of all selected services.
status() { cluster_command status; }

# Full stop followed by a full start.
restart() {
    cluster_command stop
    cluster_command start
}

# Stop everything, then power off every node (runlevel 0).
shutdown() {
    cluster_command stop
    cluster_init 0
}

# Stop everything, then reboot every node (runlevel 6).
reboot() {
    cluster_command stop
    cluster_init 6
}

# Parses the CLI flags that follow the command word and derives the final
# NODES / SERVICES lists.
# Side effects: sets VERBOSE, USE_CONSUL, NODES, SERVICES; exits on bad
# usage; ends by pruning unreachable nodes via nodes_status.
function get_arguments() {
    while [ ! -z "$1" ]; do
        case $1 in
            -h | --help )                      usage
                                               exit
                                               ;;
            -v | --verbose )                   VERBOSE=true
                                               ;;
            -n | --node )                      if [ -z "$NODES" ] && [ ! -z "$2" ]; then
                                                   USE_CONSUL=false
                                                   NODES=( "$2" )
                                                   shift
                                               else
                                                   usage
                                                   exit 1
                                               fi
                                               ;;
            -f | --full )                      if [ -z "$SERVICES" ]; then
                                                   SERVICES=( ${CORE_SRV_LIST[@]} ${FULL_SRV_LIST[@]} )
                                               else
                                                   usage
                                                   exit 1
                                               fi
                                               ;;
            -s | --service )                   if $FORCE_FULL; then
                                                   usage
                                                   exit 1
                                               elif [ -z "$SERVICES" ] && [ ! -z "$2" ]; then
                                                   SERVICES=( $(parse_service $2) )
                                                   shift
                                               else
                                                   usage
                                                   exit 1
                                               fi
                                               ;;
            * )                                echo "Invalid argument detected: $1"
                                               usage
                                               exit 1
        esac
        shift
    done
    
    # Set nodes and services
    if [ -z "$SERVICES" ]; then
        SERVICES=( ${CORE_SRV_LIST[@]} )
        if $FORCE_FULL || $PRE_UPGRADE; then
            SERVICES=( ${SERVICES[@]} ${FULL_SRV_LIST[@]} )
        fi
    fi
    
    if [ -z "$NODES" ]; then
        NODES=( $(get_dns_nodes) )
    fi
        
    # If CMS has not been upgraded yet
    # Remove adinameserver since it's part of cms
    if $PRE_UPGRADE; then
        SERVICES=( ${SERVICES[@]/"app:adinameserver"} )
    fi
    
    # If only one node, replace ppfm tags
    # (a single node cannot honor the standby/master/witness ordering)
    if [ ${#NODES[@]} -eq 1 ]; then
        SERVICES=( ${SERVICES[@]/"edb:ppfm?tag=standby"} )
        SERVICES=( ${SERVICES[@]/"edb:ppfm?tag=master"/"edb:ppfm"} )
        SERVICES=( ${SERVICES[@]/"~edb:ppfm?tag=witness"/"~edb:ppfm"} )
    fi
    
    # If less than 2 edb nodes, remove PPFM services
    if [ ${#SERVICES[@]} -ne 1 ] && [[ $(grep -o 'edb' <<< ${NODES[*]} | wc -l) -lt 2 ]]; then
        SERVICES=( ${SERVICES[@]//*ppfm*} )
    fi

    nodes_status
}

# Require at least the command word
if [ $# -lt 1 ]; then
    usage
    exit 1
fi

# If the installed CMS version differs from the expected one, the cluster
# has not been upgraded yet: disable consul-based discovery and enable
# pre-upgrade handling (redis flush on stop, no adinameserver).
if [ "$(get_cms_version)" != "$CMS_VERS" ]; then
    PRE_UPGRADE=true
    USE_CONSUL=false
fi

# Dispatch on the cluster command.
# For shutdown and reboot, must use all services
case $1 in
    start|stop|status|restart )                COMMAND=$1
                                               shift
                                               get_arguments $@
                                               $COMMAND
                                               ;;
    shutdown|reboot )                          COMMAND=$1
                                               shift
                                               FORCE_FULL=true
                                               get_arguments $@
                                               $COMMAND
                                               ;;
    *)
                                               usage
                                               exit 1
esac

