说明:根据 Java 类名定位 Java 进程,按固定间隔循环采样各进程的资源占用以及机器内存使用情况,并写入文件
例如:每 60s 记录 Spark 执行器资源使用情况:
./java_resource_monitor.sh CoarseGrainedExecutorBackend logs 60
结果:在 logs 目录中,为每个执行器生成一个以其 PID 命名的文件记录资源使用情况,并将机器内存使用情况写入 free.log 中。
脚本内容:
#!/bin/bash
# ============================================
# Java process resource monitor with a built-in sampling loop.
# Works on hosts without jps by falling back to ps.
#
# Usage:
#   java_resource_monitor.sh <process_name> <output_directory> \
#       [interval_seconds] [print_to_console]
#
#   process_name      case-insensitive grep pattern matched against the
#                     jps (or ps) listing of Java processes
#   output_directory  where samples are written; relative paths are
#                     resolved against the directory holding this script
#   interval_seconds  pause between samples, default 5
#   print_to_console  "true" to also echo the latest samples to stdout,
#                     default "false"
#
# Output:
#   <output_directory>/<PID>     one line per sample for each process
#   <output_directory>/free.log  one line per sample of `free -g`
# ============================================

# NOTE: pipefail is intentionally NOT enabled. The discovery pipelines end in
# awk, and a grep that matches nothing (no process found) must not abort the
# monitor.
set -eu

readonly PROC_HEADER="TIME USER PID CPU% MEM% MEM_GB MEM_KB VSZ_KB CMD"
readonly FREE_HEADER="TIME TOTAL_GI USED_GI FREE_GI SHARED_GI BUFFERS_GI CACHE_GI SWAP_TOTAL-GI SWAP_USED-GI SWAP_FREE-GI"

#######################################
# Resolve the absolute path of a required command.
# Arguments: $1 - command name
# Outputs:   the resolved path on stdout; an error on stderr if missing
# Returns:   0 if found, 1 otherwise
#######################################
require_cmd() {
  local resolved
  if ! resolved=$(command -v "$1"); then
    echo "Error: required command not found: $1" >&2
    return 1
  fi
  printf '%s\n' "$resolved"
}

#######################################
# Decide whether the console summary is enabled.
# Any value other than empty/false/0/no/off enables it, so values that
# enabled printing before (e.g. "true", "1", "yes") keep working.
# Globals:   PRINT_TO_CONSOLE (read)
# Returns:   0 if enabled, 1 otherwise
#######################################
console_enabled() {
  case "$PRINT_TO_CONSOLE" in
    '' | false | FALSE | False | 0 | no | NO | No | off | OFF | Off) return 1 ;;
    *) return 0 ;;
  esac
}

#######################################
# List PIDs of candidate Java processes matching PROCESS_NAME.
# Uses jps when available, otherwise filters the ps listing.
# Globals:   PROCESS_NAME, PS_CMD, GREP_CMD, AWK_CMD (read)
# Outputs:   one PID per line on stdout (possibly nothing)
#######################################
find_java_pids() {
  # -e keeps a pattern that starts with "-" from being parsed as an option.
  if command -v jps >/dev/null 2>&1; then
    jps | "$GREP_CMD" -i -e "$PROCESS_NAME" | "$AWK_CMD" '{print $1}'
  else
    # '[j]ava' prevents the grep process itself from matching.
    "$PS_CMD" -eo pid,cmd \
      | "$GREP_CMD" '[j]ava' \
      | "$GREP_CMD" -i -e "$PROCESS_NAME" \
      | "$AWK_CMD" '{print $1}'
  fi
}

#######################################
# Check that a PID is a live process launched through a "java" command.
# Arguments: $1 - PID
# Returns:   0 if /proc/<PID>/cmdline contains "java ", 1 otherwise
#######################################
is_java_process() {
  local pid=$1 cmdline
  [ -f "/proc/$pid/cmdline" ] || return 1
  # cmdline is NUL-separated; turn the separators into spaces. The process
  # may exit between the check above and this read, hence the error guard.
  cmdline=$(tr '\0' ' ' 2>/dev/null <"/proc/$pid/cmdline") || return 1
  case "$cmdline" in
    *"java "*) return 0 ;;
    *) return 1 ;;
  esac
}

#######################################
# Append one resource sample for a process to <OUTPUT_DIR>/<PID>,
# creating the file with a header line on first use.
# Globals:   OUTPUT_DIR, PS_CMD, AWK_CMD (read)
# Arguments: $1 - PID, $2 - sample timestamp
#######################################
record_process_sample() {
  local pid=$1 now=$2
  local out_file="$OUTPUT_DIR/$pid"

  if [ ! -f "$out_file" ]; then
    echo "$PROC_HEADER" >"$out_file"
  fi

  # Fields: $1 user, $2 pid, $3 %cpu, $4 %mem, $5 rss (KB), $6 vsz (KB),
  # $7 comm, $8 first word of args (the executable). rss is converted
  # from KB to GB for the MEM_GB column.
  "$PS_CMD" -p "$pid" -o user=,pid=,%cpu=,%mem=,rss=,vsz=,comm=,args= --no-headers \
    | "$AWK_CMD" -v current_time="$now" '
      {
        mem_gb = sprintf("%.2f", $5/1024/1024)
        printf "%-20s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %s\n",
          current_time, $1, $2, $3, $4, mem_gb, $5, $6, $8
      }' >>"$out_file"
}

#######################################
# Append one `free -g` sample (Mem and Swap rows) to free.log,
# creating the file with a header line on first use.
# Globals:   FREE_LOG, FREE_CMD, AWK_CMD (read)
# Arguments: $1 - sample timestamp
#######################################
record_memory_sample() {
  local now=$1 sample

  if [ ! -f "$FREE_LOG" ]; then
    echo "$FREE_HEADER" >"$FREE_LOG"
  fi

  # LC_ALL=C pins the row labels so rows are selected by name rather than by
  # line number; older free versions print an extra "-/+ buffers/cache" row
  # between Mem and Swap.
  sample=$(LC_ALL=C "$FREE_CMD" -g | "$AWK_CMD" '
    $1 == "Mem:" {
      printf "%-10s %-10s %-10s %-10s %-10s %-10s %-10s", $2, $3, $4, $5, $6, $7, $8
    }
    $1 == "Swap:" {
      printf " %-10s %-10s %-10s", $2, $3, $4
    }')
  echo "$now $sample" >>"$FREE_LOG"
}

#######################################
# Print the most recent sample of every listed process and of free.log.
# Globals:   OUTPUT_DIR, FREE_LOG, TAIL_CMD (read)
# Arguments: $1 - sample timestamp, $2 - whitespace-separated PID list
#######################################
print_summary() {
  local now=$1 pid_list=$2 pid last_line

  echo ""
  echo "=== Resource Usage Summary @ $now ==="
  # Word splitting on the PID list is intentional.
  for pid in $pid_list; do
    if [ -f "$OUTPUT_DIR/$pid" ]; then
      last_line=$("$TAIL_CMD" -n 1 "$OUTPUT_DIR/$pid")
      echo "Process $pid:"
      echo " $PROC_HEADER"
      echo " $last_line"
    fi
  done
  if [ -f "$FREE_LOG" ]; then
    last_line=$("$TAIL_CMD" -n 1 "$FREE_LOG")
    echo "Memory Status:"
    echo " $FREE_HEADER"
    echo " $last_line"
  fi
}

#######################################
# Entry point: parse arguments, print the banner, then sample forever
# until interrupted with Ctrl+C.
# Arguments: the script's positional parameters
#######################################
main() {
  cd "$(dirname "$0")"

  if [ $# -lt 2 ]; then
    echo "Usage: $0 <process_name> <output_directory> [interval_seconds] [print_to_console]" >&2
    exit 1
  fi

  PROCESS_NAME=$1
  OUTPUT_DIR=$2
  INTERVAL=${3:-5} # sample every 5 seconds by default
  PRINT_TO_CONSOLE=${4:-false}
  FREE_LOG="$OUTPUT_DIR/free.log"
  # PID of this script; it can match the ps filter itself because the script
  # name contains "java" and its arguments contain the process name.
  CURRENT_PID=$$

  mkdir -p "$OUTPUT_DIR"

  # Resolve command paths up front (PATH may be incomplete in
  # non-interactive environments) and fail loudly if one is missing.
  PS_CMD=$(require_cmd ps)
  GREP_CMD=$(require_cmd grep)
  AWK_CMD=$(require_cmd awk)
  FREE_CMD=$(require_cmd free)
  DATE_CMD=$(require_cmd date)
  TAIL_CMD=$(require_cmd tail)

  echo "========================================="
  echo " Java Resource Watcher"
  echo " Process Name : $PROCESS_NAME"
  echo " Output Dir : $OUTPUT_DIR"
  echo " Interval : ${INTERVAL}s"
  echo " Print Console: $PRINT_TO_CONSOLE"
  echo " CURRENT_PID : $CURRENT_PID"
  echo "========================================="
  echo ""

  # Exit cleanly on Ctrl+C.
  trap 'echo ""; echo "Stopped by user."; exit 0' INT

  local current_time pids pid
  while true; do
    current_time=$("$DATE_CMD" +"%Y-%m-%d %H:%M:%S")

    # ---------- find Java processes ----------
    pids=$(find_java_pids)

    if [ -z "$pids" ]; then
      echo "[$current_time] ⚠️ No Java process found for name: $PROCESS_NAME"
    else
      # ---------- record resources of each process ----------
      # Word splitting on the PID list is intentional.
      for pid in $pids; do
        # Never record the monitor itself.
        if [ "$pid" = "$CURRENT_PID" ]; then
          continue
        fi
        if is_java_process "$pid"; then
          record_process_sample "$pid" "$current_time"
        fi
      done
    fi

    # ---------- system memory ----------
    record_memory_sample "$current_time"

    # ---------- console summary ----------
    if console_enabled; then
      print_summary "$current_time" "$pids"
    fi

    # ---------- wait for the next sample ----------
    sleep "$INTERVAL"
  done
}

main "$@"

198

被折叠的 条评论
为什么被折叠?



