#!/bin/bash
# Output location for the collected core dumps.
# BASE_COLLECTION_PATH is the root directory of the gathered data;
# CORE_DUMP_PATH is where the per-node core dump copies are written. A
# non-empty OUT environment variable overrides the default location.
BASE_COLLECTION_PATH="must-gather"
if [ -n "${OUT}" ]; then
    CORE_DUMP_PATH="${OUT}"
else
    CORE_DUMP_PATH="${BASE_COLLECTION_PATH}/node_core_dumps"
fi

# Create the destination (and any missing parents) before anything is copied.
mkdir -p "${CORE_DUMP_PATH}/"

#######################################
# Copy the systemd core dumps of a single node into CORE_DUMP_PATH.
#
# A debug pod is started on the node in the "default" namespace, the node's
# /var/lib/systemd/coredump directory (visible as /host/... inside the pod)
# is copied to "${CORE_DUMP_PATH}/<node>_core_dump", and the pod is deleted.
#
# Globals:
#   CORE_DUMP_PATH (read) - local directory the dumps are copied into
# Arguments:
#   $1 - name of the node to collect from
# Outputs:
#   Progress / failure messages on stdout.
# Returns:
#   0 when the dumps were copied and the debug pod was deleted; non-zero when
#   the pod name could not be determined, the pod never became Ready, the
#   copy failed, or the pod could not be deleted.
#######################################
function get_dump_off_node {
    local node="$1"
    local debugPod=""
    local rc=0

    # Get debug pod's name.
    # NOTE(review): this relies on the second `oc debug` call below creating
    # a pod with exactly this name - confirm for the oc version in use.
    debugPod=$(oc debug --to-namespace="default" node/"${node}" -o jsonpath='{.metadata.name}')

    # Without a pod name there is nothing to wait for, copy from or delete,
    # so bail out before a debug pod is started that could never be cleaned up.
    if [ -z "${debugPod}" ]; then
        echo "Debug pod for node ${node} never activated"
        return 1
    fi

    # Start the debug pod in the "default" namespace and keep it alive (for at
    # most 300 seconds) until it is deleted below.
    oc debug --to-namespace="default" node/"${node}" -- /bin/bash -c 'sleep 300' > /dev/null 2>&1 &

    # Mimic a normal oc call, i.e. pause between two successive calls to allow
    # the pod to register before waiting on it.
    sleep 2
    if oc wait -n "default" --for=condition=Ready pod/"${debugPod}" --timeout=30s; then
        # Copy the core dumps out of the node; oc cp output is suppressed, only
        # its exit status is kept.
        echo "Copying core dumps on node ${node}"
        oc cp --loglevel 1 -n "default" "${debugPod}":/host/var/lib/systemd/coredump "${CORE_DUMP_PATH}"/"${node}"_core_dump > /dev/null 2>&1 || rc=$?
    else
        rc=$?
        echo "Debug pod for node ${node} never activated"
    fi

    # Clean up the debug pod whether or not the copy happened, so that no pod
    # is left behind in the "default" namespace.
    oc delete pod "${debugPod}" -n "default" || rc=$?

    return "${rc}"
}

#######################################
# Run the core dump pull function on all nodes in parallel.
#
# Globals:
#   NODES (read)  - whitespace-separated list of node names
#   PIDS  (write) - the PID of every background collector is appended, so the
#                   caller can wait on exactly the jobs started here
# Arguments:
#   None
#######################################
function gather_core_dump_data {
  local node

  # NODES is a whitespace-separated string, so word splitting is intentional.
  # shellcheck disable=SC2086
  for node in ${NODES}; do
    get_dump_off_node "${node}" &
    # $! must be captured here, in the shell that launched the job; an append
    # made from inside the backgrounded function would be lost with its subshell.
    PIDS+=("$!")
  done
}

# ---------------------------------------------------------------------------
# Main flow: choose the target nodes, collect from all of them in parallel,
# wait for the collectors and flush the result to disk.
#
# Arguments: optional node names; with none given, every linux node reported
# by `oc get nodes` is used.
# ---------------------------------------------------------------------------
if [ $# -eq 0 ]; then
    echo "WARNING: Collecting core dumps on ALL linux nodes in your cluster. This could take a long time."
fi

PIDS=()
NODES="${*:-$(oc get nodes -o jsonpath='{.items[?(@.status.nodeInfo.operatingSystem=="linux")].metadata.name}')}"

if [ -z "${NODES}" ]; then
    # Happens when no arguments were given and the node query returned nothing
    # (e.g. it failed); say so instead of reporting a successful collection.
    echo "WARNING: No nodes found to collect core dumps from; nothing was gathered." >&2
else
    gather_core_dump_data

    echo "INFO: Waiting for node core dump collection to complete ..."
    # When PIDS is empty this expands to a bare `wait`, which blocks until
    # every background job of this shell has finished.
    wait "${PIDS[@]}"
    echo "INFO: Node core dump collection complete."
fi

# force disk flush to ensure that all data gathered is accessible in the copy container
sync
