#!/bin/bash

# chkconfig: 345 99 01
# description: Alertmanager Alertmanager

### BEGIN INIT INFO
# Provides:       alertmanager
# Required-Start: 
# Required-Stop:  
# Default-Start:  2 3 5
# Default-Stop:
# Description:    Alertmanager Alertmanager
### END INIT INFO

# Author: Gerhard Lausser <gerhard.lausser@consol.de>

# Notes for OMD init script requirements
# - Must handle omd config options like daemon enabling/disabling
# - When a daemon is disabled by omd config it needs
#   to return an exit code of 5.
# - The init script must output an exit code of 2 when
#   an unknown param is used.
# - In general the exit code for succeeded actions is
#   0 and for failed actions it is 1.
# - There are exceptions for the exit code handling:
#   - When a service is already stopped and should be
#     restarted/stopped, it should result in an exit code of 0.
#   - When a service is already running and should be started
#     this also should result in an exit code of 0.
# - When a restart is requested and the program is still not running
#   the script should only execute a start
# - When a restart is requested and the program can not be stopped the
#   script should terminate without starting the daemon
# - When a reload is requested and the program is not running
#   the init script should execute a start instead

# Enter the site directory. Everything sourced below uses relative paths, so
# a failed cd must abort instead of sourcing files from an arbitrary directory.
cd /omd/sites/monitor || exit 1
. .profile
. lib/omd/init_profile
. etc/omd/site.conf

# Optional site-local settings. Presumably this is where CLUSTER_LISTEN_ADDRESS,
# CLUSTER_PEERS and EXTRA_OPTIONS (read further down) come from -- TODO confirm.
if [ -r etc/alertmanager/vars ] && [ -f etc/alertmanager/vars ]; then
    . etc/alertmanager/vars
fi

# A daemon that is disabled via omd config must answer with exit code 5.
[ "$CONFIG_ALERTMANAGER" = "on" ] || exit 5

mkdir -p /omd/sites/monitor/tmp/alertmanager
mkdir -p /omd/sites/monitor/var/alertmanager
BIN=/omd/sites/monitor/bin/alertmanager
TOOL=/omd/sites/monitor/bin/amtool
CFG_FILE=/omd/sites/monitor/tmp/alertmanager/alertmanager.yml
LOG_DIR=/omd/sites/monitor/var/alertmanager
LOGFILE=/omd/sites/monitor/var/alertmanager/alertmanager.log
PID_FILE=/omd/sites/monitor/tmp/lock/alertmanager.lock
DATA_DIR=/omd/sites/monitor/var/alertmanager/data
USR=monitor
GRP=monitor

# Host name used in the external URL; OMD_EXTERNAL_FQDN wins when it is set.
EXTERNAL_FQDN=${OMD_EXTERNAL_FQDN:-$(hostname --fqdn)}

unset LANG
export LC_ALL=C

# OPTIONS is deliberately a flat string: it is expanded unquoted (word-split)
# on the daemon command line in alertmanager_wait_start.
OPTIONS="--config.file $CFG_FILE --web.listen-address=$CONFIG_ALERTMANAGER_TCP_ADDR:$CONFIG_ALERTMANAGER_TCP_PORT --web.external-url=http://${EXTERNAL_FQDN}/monitor/alertmanager --storage.path=$DATA_DIR"

if [ -n "$CLUSTER_LISTEN_ADDRESS" ]; then
    # Cluster mode active: listen on the given address and add every peer.
    OPTIONS="--cluster.listen-address $CLUSTER_LISTEN_ADDRESS $OPTIONS"
    for peer in "${CLUSTER_PEERS[@]}"; do
        OPTIONS="--cluster.peer $peer $OPTIONS"
    done
else
    # No cluster address configured: pass an empty listen address.
    OPTIONS="--cluster.listen-address= $OPTIONS"
fi

# Extra options are placed in front of everything assembled above.
OPTIONS="$EXTRA_OPTIONS $OPTIONS"

# Helper for rebuild_config: replace everything between the
# "# - start of <section>" and "# - end of <section>" marker lines of
# $CFG_FILE with the fragment files of that section.
#
# Fragments are taken from alertmanager.d/<section>/*.yml first and then from
# alertmanager.d/<section>/*/*.yml. Nothing happens unless both markers exist.
#
# Globals:   CFG_FILE (read, rewritten in place)
# Arguments: $1 - section name (routes, inhibit_rules or receivers)
#            $2 - scratch file used to assemble the new content
_rebuild_config_section() {
    local section=$1 scratch=$2
    local section_dir=/omd/sites/monitor/etc/alertmanager/alertmanager.d/$section
    local start_marker="# - start of $section"
    local end_marker="# - end of $section"
    local entry fragment

    if ! grep -q "$start_marker" "$CFG_FILE" || ! grep -q "$end_marker" "$CFG_FILE"; then
        return 0
    fi

    # Head of the file up to and including the start marker.
    sed -n "1,/$start_marker/p" "$CFG_FILE" > "$scratch"

    # Fragments directly inside the section directory ...
    for fragment in "$section_dir"/*.yml; do
        if [ -f "$fragment" ]; then
            cat "$fragment" >> "$scratch"
        fi
    done
    # ... followed by fragments one directory level deeper.
    for entry in "$section_dir"/*; do
        [ -d "$entry" ] || continue
        for fragment in "$entry"/*.yml; do
            if [ -f "$fragment" ]; then
                cat "$fragment" >> "$scratch"
            fi
        done
    done

    # Tail of the file from the end marker onwards.
    sed -n "/$end_marker/,\$p" "$CFG_FILE" >> "$scratch"
    cp "$scratch" "$CFG_FILE"
}

# Put together fragments and build a new alertmanager.yml in $CFG_FILE.
#
# Steps: copy the base etc/alertmanager/alertmanager.yml, splice the routes,
# inhibit_rules and receivers fragments between their markers, append every
# alertmanager.d/*.yml file and finally substitute the ###...### placeholders.
#
# Globals:   CFG_FILE, CONFIG_ALERTMANAGER_TCP_PORT,
#            CONFIG_ALERTMANAGER_TCP_ADDR (all read)
# Returns:   1 when the base configuration cannot be copied or no scratch
#            file can be created; in the former case $CFG_FILE is left
#            untouched instead of having fragments appended to a stale copy.
rebuild_config() {
    local base_cfg=/omd/sites/monitor/etc/alertmanager/alertmanager.yml
    local conf_d=/omd/sites/monitor/etc/alertmanager/alertmanager.d
    local scratch section fragment

    cp "$base_cfg" "$CFG_FILE" || return 1

    # Unpredictable scratch file next to the generated config.
    scratch=$(mktemp "${CFG_FILE}.XXXXXX") || return 1

    for section in routes inhibit_rules receivers; do
        _rebuild_config_section "$section" "$scratch"
    done

    for fragment in "$conf_d"/*.yml; do
        if [ -f "$fragment" ]; then
            cat "$fragment" >> "$CFG_FILE"
        fi
    done

    # The ROOT/SITE placeholders are spelled as concatenated string pieces.
    # NOTE(review): presumably so that a site-wide placeholder substitution
    # does not rewrite the patterns inside this script itself -- keep as is.
    sed -ri "s&""#""#""#ROOT#""#""#""&/omd/sites/monitor&g" "$CFG_FILE"
    sed -ri "s&""#""#""#SITE#""#""#""&monitor&g" "$CFG_FILE"
    sed -ri "s/###ALERTMANAGER_TCP_PORT###/$CONFIG_ALERTMANAGER_TCP_PORT/g" "$CFG_FILE"
    sed -ri "s/###ALERTMANAGER_TCP_ADDR###/$CONFIG_ALERTMANAGER_TCP_ADDR/g" "$CFG_FILE"

    rm -f "$scratch"
}

# Looks up the running alertmanager process through the PID file.
#
# The PID stored in $PID_FILE is read once; the process must be alive, its
# command name (as reported by ps) must be "alertmanager" and it must be
# owned by $USR.
#
# Globals:   PID_FILE, USR (read)
# Outputs:   the PID on stdout when a matching process is found
# Returns:   0 when a matching process is found, 1 otherwise
alertmanager_proc() {
    local pid comm user

    [[ -f "$PID_FILE" ]] || return 1
    { pid=$(< "$PID_FILE"); } 2>/dev/null || return 1
    [[ -n "$pid" ]] || return 1
    kill -0 "$pid" >/dev/null 2>&1 || return 1

    # Last line of the ps output is the process itself (first is the header).
    read -r comm user < <(ps -p "$pid" -o comm,user 2>/dev/null | tail -n 1)
    [[ "$comm" == "alertmanager" ]] || return 1
    [[ "$user" == "$USR" ]] || return 1

    printf '%s\n' "$pid"
    return 0
}

# Reports the PID of the running alertmanager process.
#
# Thin wrapper that delegates the whole lookup to alertmanager_proc: whatever
# that function prints is printed here and its exit status is passed on
# (0 when a process was found, 1 otherwise).
pidof_alertmanager() {
    local status=0
    alertmanager_proc || status=$?
    return "$status"
}


# Runs "$TOOL check-config" against the generated configuration file.
#
# Globals:   TOOL, CFG_FILE (read); RESULT (written: combined output of the
#            check command)
# Arguments: $1 - pass "quiet" to suppress the progress messages
# Outputs:   progress messages on stdout unless quiet; the tool output on
#            stderr (always on failure, on success only when not quiet)
# Returns:   0 when the check succeeds, 1 otherwise
verify_config() {
    local mode=${1:-}

    if [ "$mode" != "quiet" ]; then
        printf '%s' "Running configuration check... "
    fi

    if RESULT=$("$TOOL" check-config "$CFG_FILE" 2>&1); then
        if [ "$mode" != "quiet" ]; then
            echo "done."
            echo "$RESULT" >&2
        fi
        return 0
    fi

    if [ "$mode" != "quiet" ]; then
        echo "CONFIG ERROR! Aborted. Check your Alertmanager configuration."
    fi
    echo "$RESULT" >&2
    return 1
}

# Makes sure the PID file exists and belongs to $USR:$GRP before the daemon
# is launched.
#
# Globals: PID_FILE, USR, GRP (read)
# Returns: status of touch/chown (non-zero when either of them fails)
prep_start() {
    touch -- "$PID_FILE" && chown -- "$USR:$GRP" "$PID_FILE"
}

# Stops alertmanager and waits until the process is gone.
#
# Each PID reported by pidof_alertmanager gets a single default kill signal
# and is then polled once per second. After a process has disappeared the
# lookup is repeated in case another process is reported. The wait budget is
# 60 seconds in total, shared by all processes.
#
# Globals:   PID_FILE (read; the file is removed after a successful stop)
# Outputs:   "Not running. " when there is nothing to stop, otherwise one
#            "." per second waited
# Returns:   0 when nothing was running or everything stopped,
#            1 when a process is still alive after 60 seconds
alertmanager_wait_stop() {
    local pid waited=0

    pid=$(pidof_alertmanager) || true
    if ! kill -0 "${pid:-}" >/dev/null 2>&1; then
        echo -n 'Not running. '
        return 0
    fi

    while kill -0 "${pid:-}" >/dev/null 2>&1; do
        # Send a single kill per process, then only poll.
        kill "$pid"
        while kill -0 "$pid" >/dev/null 2>&1; do
            if (( waited >= 60 )); then
                return 1
            fi
            echo -n "."
            waited=$((waited + 1))
            sleep 1
        done
        # Look again: is another alertmanager process reported?
        pid=$(pidof_alertmanager | head -n1) || true
    done

    [ -f "$PID_FILE" ] && rm -f "$PID_FILE"
    return 0
}

# Launches alertmanager in the background and checks after two seconds that
# it is still alive.
#
# Globals:   BIN, OPTIONS, LOGFILE, PID_FILE (read)
# Outputs:   daemon stdout/stderr are appended to $LOGFILE; on success the
#            PID of the daemon is written to $PID_FILE
# Returns:   0 when the process survived the grace period, 1 otherwise (the
#            PID file is removed in that case)
alertmanager_wait_start() {
    local apid

    prep_start
    # OPTIONS is a flat option string and has to be word-split here.
    # shellcheck disable=SC2086
    nohup "$BIN" $OPTIONS >>"$LOGFILE" 2>&1 &
    apid=$!

    # Give the daemon a moment to fail on bad options or configuration.
    sleep 2

    # Probe silently, like every other liveness check in this script.
    if kill -0 "$apid" >/dev/null 2>&1; then
        echo "$apid" > "$PID_FILE"
        return 0
    fi

    rm -f "$PID_FILE"
    return 1
}

# Nothing can be done without the alertmanager binary.
if [ ! -f "$BIN" ]; then
    echo "Alertmanager binary $BIN not found. Terminating..."
    exit 1
fi

# Every action that starts the daemon, makes it re-read its configuration or
# checks the configuration works on the generated file, so regenerate it
# first. This includes "check" and "force-reload", which are handled by the
# same branches as "checkconfig" and "reload" in the dispatch below.
case "$1" in
    start|restart|reload|force-reload|check|checkconfig)
        rebuild_config
        if [ ! -f "$CFG_FILE" ]; then
            echo "Alertmanager configuration file $CFG_FILE not found. Terminating..."
            exit 1
        fi
        ;;
esac

# Action dispatch. Exit codes follow the OMD rules listed in the file header:
# 0 for success (including "already running" on start and "already stopped"
# on stop), 1 for failure, 2 for an unknown action.
#
# The __init_hook calls keep their original argument form on purpose; the
# helper comes from the sourced site profile and is not visible here.
__init_hook $0 $1 pre
case "$1" in
    start)
        echo -n "Starting alertmanager..."
        # An already running daemon counts as a successful start.
        if pidof_alertmanager >/dev/null 2>&1; then
            echo 'Already running.'
            exit 0
        fi

        #if ! verify_config quiet; then
        #    exit 1
        #fi

        if alertmanager_wait_start; then
            echo 'OK'
            __init_hook $0 $1 post 0
            exit 0
        else
            echo 'ERROR'
            __init_hook $0 $1 post 1
            exit 1
        fi
    ;;
    stop)
        echo -n "Stopping alertmanager..."
        if alertmanager_wait_stop; then
            echo 'OK'
            __init_hook $0 $1 post 0
            exit 0
        else
            echo 'ERROR'
            __init_hook $0 $1 post 1
            exit 1
        fi
    ;;
    check|checkconfig)
        if ! verify_config; then
            exit 1
        fi
        exit 0
    ;;
    status)
        PID=$(pidof_alertmanager 2>&1) || true
        if kill -0 "${PID:-}" >/dev/null 2>&1; then
            echo "Running ($PID)."
            exit 0
        else
            echo 'Not running. '
            exit 1
        fi
    ;;
    restart)
        # Never take a working daemon down for a broken configuration.
        if ! verify_config quiet; then
            exit 1
        fi

        # Stop through a fresh invocation of this script. The failure branch
        # must run in this shell: an "exit" inside ( ... ) would only leave
        # the subshell and the daemon would be started anyway.
        if ! "$0" stop; then
            echo "Unable to stop Alertmanager. Terminating..."
            exit 1
        fi

        echo -n "Starting Alertmanager..."
        if alertmanager_wait_start; then
            echo 'OK'
            exit 0
        else
            echo 'ERROR'
            exit 1
        fi
    ;;
    reload|force-reload)
        PID=$(pidof_alertmanager 2>&1) || true
        if kill -0 "${PID:-}" >/dev/null 2>&1; then
            # Running: ask the daemon to re-read its configuration.
            if kill -HUP "$PID"; then
                exit 0
            fi
            exit 1
        fi

        # Not running: a reload has to perform a start instead.
        echo -n "Starting alertmanager..."
        if alertmanager_wait_start; then
            echo 'OK'
            exit 0
        else
            echo 'ERROR'
            exit 1
        fi
    ;;
    *)
        echo "Usage: alertmanager {start|stop|restart|reload|status|checkconfig}"
        exit 2
    ;;
esac
 
# EOF
