标签归档:bash

一些方便系统诊断的bash函数

这段脚本包含100多个bash函数,是我几年前方便自己调试和诊断问题写的。贴出来给有需要的人,因为比较懒怎么使用这些函数就不写说明了。其中以下划线开头的是表示私有函数,以cf_开头的表示公共函数,可当做命令使用。

# check current os is linux
function cf_is_linux() {
  [[ "$OSTYPE" = *linux* ]] && echo "true" && return 0
  echo "false" && return 1
}

# check current os is mac/darwin
function cf_is_darwin() {
  [[ "$OSTYPE" = *darwin* ]] && echo "true" && return 0
  echo "false" && return 1
}

# check current os is windows/cygwin
function cf_is_cygwin() {
  [[ "$OSTYPE" = *cygwin* ]] && echo "true" && return 0
  echo "false" && return 1
}

function cf_is_gnu_date() {
  date --version >/dev/null 2>&1 && echo "true" && return 0
  echo "false" && return 1
}

function cf_is_gnu_sed() {
  sed --version >/dev/null 2>&1 && echo "true" && return 0
  echo "false" && return 1
}

function cf_is_gnu_awk() {
  awk --version | grep GNU >/dev/null && echo "true" && return 0
  echo "false" && return 1
}

function cf_is_gnu_grep() {
  grep --version | grep GNU >/dev/null && echo "true" && return 0
  echo "false" && return 1
}

# java style startsWith
function cf_starts_with() {
  local str=$1
  local pre=$2
  [[ "$str" ==  ${pre}* ]]
}

# java style substring
function cf_substring() {
  local str=$1
  local begin=$2
  local end=$3
  if [ -z "$end" ]; then
    echo ${str:$begin} 
  else
    local len=`expr $end - $begin`
    echo ${str:$begin:$len}
  fi
}

# get current shell name
function cf_shell_name() {
  local name=$( ps -ocommand= -p $$ | awk '{print $1}')
  if cf_starts_with $name "-"; then
    cf_substring $name 1
  else
    echo $name
  fi
}

# check current shell is bash
function cf_is_bash() {
  [[ `cf_shell_name` = "-bash" || `basename $(cf_shell_name)` = "bash" ]] && echo "true" && return 0
  echo "false" && return 1
}

# check current shell is zsh
function cf_is_zsh() {
  [[ `cf_shell_name` = "-zsh" || `basename $(cf_shell_name)` = "zsh" ]] && echo "true" && return 0
  echo "false" && return 1
}

function _script_dir() {
  if cf_is_bash >/dev/null; then
    cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -P 
  elif cf_is_zsh >/dev/null; then
    cd "$( dirname "${(%):-%N}" )" && pwd -P 
  else
    echo "unsupported shell" && return 1
  fi
}

function _script_file() {
  if cf_is_bash >/dev/null; then
    basename "${BASH_SOURCE[0]}" 
  elif cf_is_zsh >/dev/null; then
    basename "${(%):-%N}" 
  else
    echo "unsupported shell" && return 1
  fi
}

# colorful grep. private function
function _get_colorful_grep() {
  cf_is_gnu_grep >/dev/null && echo "grep --color" && return 0
  export GREP_OPTIONS='--color=always'
  export GREP_COLOR='1;35;40'
  echo "grep" 
}

# list all common functions
function cf_functions() {
  if cf_is_bash >/dev/null; then
    declare -F | awk '{print $NF}' | grep "cf_" | sort
  elif cf_is_zsh >/dev/null; then
    print -l ${(ok)functions} | grep "cf_" | sort
  else
    echo "unsupported shell" && return 1
  fi
}

# get total memory (MB)
function cf_mem_total() {
  if cf_is_linux >/dev/null; then
    free -m | awk '/^Mem/{print $2"M"}'
  elif cf_is_darwin >/dev/null; then
    sysctl hw.memsize | awk '{print $2/1024/1024"M"}'
  else
    echo "unsupported os" && return 1
  fi 
}

# decimal to hexadecimal
function cf_dec2Hex() {
  printf "%x" $1
}

# decimal to octal 
function cf_dec2Oct() {
  printf "%o" $1
}

# decimal to binary
function cf_dec2Bin() {
  echo "obase=2; $1" | bc
}

# hexadecimal to decimal 
function cf_hex2Dec() {
  echo $((16#$1))
}

# octal to decimal 
function cf_oct2Dec() {
  echo $((8#$1))
}

# binary to decimal 
function cf_bin2Dec() {
  echo $((2#$1))
}

function cf_calc() {
  local exp="$1"
  echo "$exp" | bc -l | awk '{printf "%.2f", $0}'
}

# warning and exit, not for interactive shell
function cf_die() {
  local msg="$1"
  local code=${2:-1}
  echo "$msg" && exit $code
}

# highlight key words from file or pipeline
function cf_highlight() {
  local keyword="$1"
  local cgrep="$(_get_colorful_grep)"
  if [ -p /dev/stdin ]; then
    # from pipeline
    while IFS='' read line; do
      echo $line | eval "$cgrep -E \"${keyword}|$\""
    done
  else
    local file="$2"
    eval "$cgrep -E \"${keyword}|$\"" "$file"
  fi
}

function cf_ps_env() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1

  if cf_is_linux >/dev/null; then
    xargs --null --max-args=1 < /proc/$pid/environ
  elif cf_is_darwin >/dev/null; then
    ps -wwE -p $pid
  else
    echo "unsupported os" && return 1
  fi
}

# get bash(current shell) major version
function cf_bash_major_ver() {
  echo ${BASH_VERSINFO[0]}
}

# get bash(current shell) minor version
function cf_bash_minor_ver() {
  echo ${BASH_VERSINFO[1]}
}

# get kernel version
function cf_kernel_ver() {
  if cf_is_linux >/dev/null; then
    uname -r | cut -d'-' -f1
  elif cf_is_darwin >/dev/null; then
    uname -r | cut -d'-' -f1
  else
    echo "unsupported os" && return 1
  fi
}

# get kernel major version
function cf_kernel_major_ver() {
  if cf_is_linux >/dev/null; then
    uname -r | awk -F'.' '{print $1"."$2}' 
  elif cf_is_darwin >/dev/null; then
    uname -r | awk -F'.' '{print $1"."$2}' 
  else
    echo "unsupported os" && return 1
  fi
}

# get kernel minor version
function cf_kernel_minor_ver() {
  if cf_is_linux >/dev/null; then
    uname -r | awk -F'.' '{print $3}'
  elif cf_is_darwin >/dev/null; then
    uname -r | awk -F'.' '{print $3}'
  else
    echo "unsupported os" && return 1
  fi
}

# get value from config file such as app.properties
function cf_get_property() {
  local file="$1"
  local key="$2"
  grep "^${key}=" "$file" | tr -d '\r' | cut -d'=' -f2 | cf_trim
}

# get command path, eg: `cf_command_path ls` output /usr/bin/ls
function cf_command_path() {
  local cmd=$1
  cf_is_bash && builtin type -P $cmd && return $?

  if [ -x /usr/bin/which ]; then
    local p=$( /usr/bin/which $1 | head -1 )
    [ ! -z "$p" ] && echo $p && return 0
    return 1
  else
    local p=$( which $1 | grep "^/" | head -1 )
    [ ! -z "$p" ] && echo $p && return 0
    return 1
  fi
}

# get all ip addresses
function cf_ip_list() {
  if [ -x /sbin/ip ]; then
    local list=$(/sbin/ip -o -4 addr list | awk '{print $4}' | cut -d'/' -f1 | tr '\n' ',')
  else
    local list=$(/sbin/ifconfig | grep "inet " | awk '{print $2}' | sed 's/addr://' | tr '\n' ',')
  fi
  echo ${list%,}
}

function cf_stdio() {
  local pid=$1
  /usr/sbin/lsof -a -p $pid -d 0,1,2
}

function cf_stdout() {
  local pid=$1
  if cf_is_linux >/dev/null; then
    readlink -f /proc/$pid/fd/1
  elif cf_is_darwin >/dev/null; then
    /usr/sbin/lsof -a -p $pid -d 1 | awk 'NR>1{print $NF}'
  else
    echo "unsupported os" && return 1
  fi
}

# get file last modification time
function cf_last_modification() {
  local file="$1"
  if [[ $OSTYPE == *linux* ]];then
    date +%Y%m%d%H%M%S -r $file
  elif [[ $OSTYPE == *darwin* ]];then
    stat -f "%Sm" -t "%Y%m%d%H%M%S" $file
  fi
}

# check current user is root 
function cf_is_root() {
  [ `whoami` = "root" ] && echo "true" && return 0
  echo "false" && return 1
}

# check current shell is interactive
function cf_is_interactive_shell() {
  if cf_is_bash >/dev/null; then
    [[ "$-" = *i* ]] && echo "true" && return 0
  elif cf_is_zsh >/dev/null; then
    [[ -o interactive ]] && echo "true" && return 0
  else
    echo "unsupported shell" && return 1
  fi
  echo "false" && return 1
}

# check current shell is login shell
function cf_is_login_shell() {
  if cf_is_bash >/dev/null; then
    shopt -q login_shell && echo "true" && return 0
  elif cf_is_zsh >/dev/null; then
    [[ -o login ]] && echo "true" && return 0
  else
    echo "unsupported shell" && return 1
  fi
  echo "false" && return 1
}

# check command is exists
function cf_is_command_exists() {
  local cmd=$1
  if [ -x /usr/bin/which ]; then
    /usr/bin/which $cmd >/dev/null 2>&1 && echo "true" && return 0
  else
    which $cmd >/dev/null 2>&1 && echo "true" && return 0 
  fi
  echo "false" && return 1
}

# check file name globbing flag
function cf_is_glob_enabled() {
  if cf_is_bash >/dev/null; then 
    [[ $- != *f* ]] && echo "true" && return 0
  elif cf_is_zsh >/dev/null; then
    [[ -o glob ]] && echo "true" && return 0
  else
    echo "unsupported shell" && return 1
  fi
  echo "false" && return 1
}

# enable file name globbing
function cf_enable_glob() {
  cf_is_bash >/dev/null && set +f && return 0
  cf_is_zsh >/dev/null && set -o glob && return 0
  echo "unsupported shell" && return 1
}

# disable file name globbing
function cf_disable_glob() {
  cf_is_bash >/dev/null && set -f && return 0
  cf_is_zsh >/dev/null && set -o noglob && return 0
  echo "unsupported shell" && return 1
}

# check extglob flag
function cf_is_extglob_enabled() {
  if cf_is_bash >/dev/null; then 
    shopt -q extglob && echo "true" && return 0
  elif cf_is_zsh >/dev/null; then
    [[ -o kshglob ]] && echo "true" && return 0
  else
    echo "unsupported shell" && return 1
  fi
  echo "false" && return 1
}

# enable extglob 
function cf_enable_extglob() {
  cf_is_bash >/dev/null && shopt -s extglob && return 0
  cf_is_zsh >/dev/null && set -o kshglob && return 0
  echo "unsupported shell" && return 1
}

# disable extglob 
function cf_disable_extglob() {
  cf_is_bash >/dev/null && shopt -u extglob && return 0
  cf_is_zsh >/dev/null && unsetopt kshglob && return 0
  echo "unsupported shell" && return 1
}

# check pid is exists
function cf_is_pid_exists() {
  local pid=$1
  [ -z "$pid" ] && echo "false" && return 1
  kill -0 $pid >/dev/null 2>&1 && echo "true" && return 0
  echo "false" && return 1
}

function cf_is_java() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  ps -ocommand= -p$pid | awk '$1~/java$/' > /dev/null && echo "true" && return 0
  echo "false" && return 1
}

function cf_is_available_port() {
  local port=$1
  if [[ "$OSTYPE" = *linux* ]];then
    local r=$( netstat -ant | awk '$6=="LISTEN" && $4~":'$port'$"' )
  elif [[ "$OSTYPE" = *darwin* ]];then
    local r=$( netstat -ant | awk '$6=="LISTEN"' | grep "\.$port " )
  else
    echo "unknown system" && return 1
  fi

  [ -z "$r" ] && echo "true" && return 0;
  echo "false" && return 1 # port has been used
}

function cf_defined() {
  if cf_is_bash >/dev/null; then
    [[ ${!1-X} == ${!1-Y} ]]
  elif cf_is_zsh >/dev/null; then
    [[ ${(P)1-X} == ${(P)1-Y} ]]
  else
    echo "unsupported shell" && return 1
  fi
}

function cf_has_value() {
    cf_defined $1 || return 1
    if cf_is_bash >/dev/null; then
      [[ -n ${!1} ]] && return 0
    elif cf_is_zsh >/dev/null; then
      [[ -n ${(P)1} ]] && return 0
    fi
    return 1
}

function cf_has_sudo_privilege() {
  # do not need password
  sudo -n echo >/dev/null 2>&1
}

function cf_timestamp() {
  date +%F-%T | tr ':-' '_' #2015_12_01_22_15_22
}

function cf_length() {
  echo ${#1}
}

# trim string
function cf_trim() {
  if [ -p /dev/stdin ]; then
    while IFS='' read line; do
      _trim "$line"
    done
  else
    _trim "$1"
  fi
}

# private function
function _trim() {
  local str="$1"
  local extglob=$(cf_is_extglob_enabled)
  if cf_is_bash >/dev/null || cf_is_zsh >/dev/null; then
    [ $extglob = "false" ] && cf_enable_extglob
    str="${str##*( )}"
    str="${str%%*( )}"
    [ $extglob = "false" ] && cf_disable_extglob
  else
    echo "unsupported shell" && return 1
  fi
  echo $str
}

function cf_lower() {
  echo "$1" | tr '[:upper:]' '[:lower:]'
}

function cf_upper() {
  echo "$1" | tr '[:lower:]' '[:upper:]'
}

function cf_ps_name() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  if cf_is_java $pid >/dev/null; then
    local main=$(cf_ps_java_main $pid)
    echo "java($main)"
  else
    ps -ocommand= -p $pid | awk '{print $1}'
  fi
}

function cf_ppid() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  ps -oppid= -p $pid
}

function cf_ps_java_main() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  ps -ocommand= -p $pid | tr ' ' '\n' | awk '/-classpath|-cp/{getline;next};/^-/{next}1' | awk 'NR==2'
}

function cf_ps_time() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1

  local elapsed="$(ps -oetime= -p $pid | cf_trim)"
  local started="$(ps -olstart= -p $pid | cf_trim)"
  if [ `cf_is_gnu_date` = "true" ]; then
    started=$(date +'%Y-%m-%d %H:%M:%S' -d "$started")
  fi
  local cpu_time=$(ps -otime= -p $pid | cf_trim)
  echo "started from: $started, elapsed: $elapsed, cumulative cpu time: $cpu_time"
}

function cf_ps_zombies() {
  ps -opid,state,command -e | awk 'NR==1 || $2=="Z"'
}

function cf_connection_topology() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1

  /usr/sbin/lsof -Pan -iTCP -p $pid > /tmp/.$pid.lsof
  grep -o "[0-9.:]*->[0-9.:]*" /tmp/.$pid.lsof > /tmp/.$pid.conns
  grep "LISTEN" /tmp/.$pid.lsof | awk '$9~/*/{print substr($9,3)}' > /tmp/.$pid.ports

  echo "-------------- downstream -------------"
  for port in $(cat /tmp/.$pid.ports); do
    cf_connection_list_by_port $port | awk '$6=="ESTABLISHED" {print $5}' | cut -d':' -f1 | sort | uniq -c | awk '{print $2"-->localhost:"'$port'" ("$1")"}'
  done

  echo "-------------- upstream ---------------"
  local portsExpr=$(cat /tmp/.$pid.ports | sed -e 's/^/:/' -e 's/$/->/' | xargs | sed 's/ /|/g')
  grep -Ev "$portsExpr" /tmp/.$pid.conns > /tmp/.$pid.out
  awk -F'->' '{print $2}' /tmp/.$pid.out | sort | uniq -c | sort -nrk1 | awk '{print "localhost-->"$2" ("$1")"}'
  rm -f /tmp/.$pid.lsof /tmp/.$pid.conns /tmp/.$pid.ports
}

function cf_connection_list_by_pid() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  /usr/sbin/lsof -Pan -iTCP -p $pid
}

function cf_connection_list_by_port() {
  local port=$1
  netstat -ant| awk '$4~/[:.]'"$port"'$/'
}

function cf_connection_stat_by_pid() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  local interval=${2:-1}
  /usr/sbin/lsof -Pan -iTCP -p $pid -r $interval
}

function cf_connection_stat_by_port() {
  local port=$1
  netstat -ant -c| awk '$1=="Proto"{print "\n"$0};$4~/[:.]'"$port"'$/'
}

function cf_listening_sockets() {
  #lsof -Pnl -i4TCP -sTCP:LISTEN #low version unsupported -sTCP params
  if cf_is_linux >/dev/null || cf_is_darwin >/dev/null; then
    if cf_has_sudo_privilege; then
      sudo /usr/sbin/lsof -Pnl -i4TCP | grep LISTEN
    else
      /usr/sbin/lsof -Pnl -i4TCP | grep LISTEN
    fi
  else
    netstat -plnt 2>/dev/null | grep -v tcp6
  fi
}

function cf_traffic_by_eth() {
  local eth=${1:-"eth0"}
  if cf_is_linux >/dev/null; then
    [ ! -d /sys/class/net/$eth ] && echo "network interface not exists." && return 1
    while true; do
      local r1=`cat /sys/class/net/$eth/statistics/rx_bytes`
      local t1=`cat /sys/class/net/$eth/statistics/tx_bytes`
      sleep 1
      local r2=`cat /sys/class/net/$eth/statistics/rx_bytes`
      local t2=`cat /sys/class/net/$eth/statistics/tx_bytes`
      local rkbps=`cf_calc "( $r2 - $r1 ) / 1024"`
      local tkbps=`cf_calc "( $t2 - $t1 ) / 1024"`
      echo "$eth: RX $rkbps kB/s TX $tkbps kB/s"
    done
  elif cf_is_darwin >/dev/null; then
    # `netstat -I eth0 -w 1` or `nettop -n -m tcp`
    declare -a tuple
    local _i1=0
    cf_is_zsh >/dev/null && _i1=1
    local _i2=1
    cf_is_zsh >/dev/null && _i1=2
    while true; do
      tuple=( $(netstat -nbi -I $eth | tail -1 | awk '{print $7,$10}') )
      local r1=${tuple[$_i1]}
      local t1=${tuple[$_i2]}
      sleep 1
      tuple=( $(netstat -nbi -I $eth | tail -1 | awk '{print $7,$10}') )
      local r2=${tuple[$_i1]}
      local t2=${tuple[$_i2]}
      local rkbps=`cf_calc "( $r2 - $r1 ) / 1024"`
      local tkbps=`cf_calc "( $t2 - $t1 ) / 1024"`
      echo "$eth: RX $rkbps kB/s TX $tkbps kB/s"
    done
  else
    echo "unsupported os" && return 1
  fi
}

function cf_traffic_by_pid() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1

  # kernel 2.6.18 not support, must 2.6.32 or later?
  local pf="/proc/$pid/net/netstat"
  [ ! -f $pf ] && echo "$pf not found!" && return 1

  declare -a tuple
  local _i1=0
  cf_is_zsh >/dev/null && _i1=1
  local _i2=1
  cf_is_zsh >/dev/null && _i1=2
  local pname="$(cf_ps_name $pid)"
  while true; do
    tuple=( $(grep "IpExt: " $pf | awk 'NR==2{print $8,$9}') )
    local r1=${tuple[$_i1]}
    local t1=${tuple[$_i2]}
    sleep 1
    tuple=( $(grep "IpExt: " $pf | awk 'NR==2{print $8,$9}') )
    local r2=${tuple[$_i1]}
    local t2=${tuple[$_i2]}
    local rkbps=`cf_calc "( $r2 - $r1 ) / 1024"`
    local tkbps=`cf_calc "( $t2 - $t1 ) / 1024"`
    echo "$pname: IN $rkbps kB/s OUT $tkbps kB/s"
  done
}

function cf_iotop() {
  sudo iotop -bod1
}

function cf_check_sum() {
  local dir=${1:-$PWD}
  local dirsum=0
  for sum  in $(find ${dir} -type f -print0 | xargs -0 cksum | awk '{print $1}')
  do
    dirsum=$(( ${sum} + ${dirsum} ))
  done
  echo ${dirsum}
}

function cf_java_classpath_check() {
  [ $# -eq 0 ] && echo "please enter classpath dir" && return 1
  [ ! -d "$1" ] && echo "not a directory" && return 1

  local tmpfile="/tmp/.cp$(date +%s)"
  local tmphash="/tmp/.hash$(date +%s)"
  local verbose="/tmp/cp-verbose.log"

  if cf_is_zsh >/dev/null;then
    local -a files
    local begin=1
  elif cf_is_bash >/dev/null;then
    declare -a files
    local begin=0
  else 
    echo "unsupported shell" && return 1
  fi
  files=(`find "$1" -name "*.jar"`)

  for f in $files; do
    jarName=`basename $f`
    list=`unzip -l $f | awk -v fn=$jarName '/\.class$/{print $NF,fn}'`
    size=`echo "$list" | wc -l`
    echo $jarName $size >> $tmphash
    echo "$list"
  done | sort | awk 'NF{ a[$1]++;m[$1]=m[$1]","$2}END{for(i in a) if(a[i] > 1) print i,substr(m[i],2)}' > $tmpfile

  awk '{print $2}' $tmpfile | awk -F',' '{i=1;for(;i<=NF;i++) for(j=i+1;j<=NF;j++) print $i,$j}' | sort | uniq -c | sort -nrk1 | 
  while read line; do
    local dup=${line%% *}
    local jars=${line#* }
    local jar1=${jars% *}
    local jar2=${jars#* }
    local len_jar1=`grep -F "$jar1" $tmphash | grep ^"$jar1" | awk '{print $2}'`
    local len_jar2=`grep -F "$jar2" $tmphash | grep ^"$jar2" | awk '{print $2}'`
    local len=$(($len_jar1 > $len_jar2 ? $len_jar1 : $len_jar2))
    local per=$(echo "scale=2; $dup/$len" | bc -l)
    echo ${per/./} $dup $jar1 $jar2
  done | sort -nr -k1 -k2 | awk 'NR==1{print "Similarity DuplicateClasses File1 File2"}{print "%"$0}'| column -t

  sort $tmpfile | awk '{print $1,"\n\t\t",$2}' > $verbose
  echo "See $verbose for more details."

  rm -f $tmpfile
  rm -f $tmphash
}

function cf_java_class_find() {
  local libdir=$1
  local name=$2
  local glob=$(cf_is_glob_enabled)
  [ $glob = "false" ] && cf_enable_glob
  builtin pushd $libdir >/dev/null
  for j in *.jar; do
    unzip -l $j | grep $name && echo $j;
  done
  builtin popd >/dev/null
  [ $glob = "false" ] && cf_disable_glob
}

function cf_java_pids() {
  ps x | grep "jav[a]" | awk '{print $1}'
}

function cf_java_infos() {
  for p in `cf_java_pids`; do
    echo "java pid: $p"
    info=`ps -opid=,command= -p $p | tr ' ' '\n' | awk '/-classpath|-cp/{getline;next};/-Xmx|-Dcatalina.base/{print};/^-/{next};1' | xargs`
    echo "  $info"
    time=`cf_ps_time $p`
    echo "  $time"
  done
}

function cf_java_threads() {
  local pid=$1
  local vm_threads="GC task|VM |CompilerThread|Finalizer|Reference Handler|Signal Dispatcher"
  "$JAVA_HOME"/bin/jstack $pid | grep "^\"" | grep -Ev "$vm_threads"
}

function cf_java_sysprops() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  "$JAVA_HOME"/bin/jinfo -sysprops $pid
}

function cf_jstack_series() {
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  local count=${2:-5}  # defaults 5 times
  local delay=${3:-0.5} # defaults 0.5 seconds

  local logdir=${LOG_DIR:-"/tmp"}
  while [ $count -gt 0 ]; do
    if cf_is_gnu_date >/dev/null; then 
      local suffix=$(date +%H%M%S.%N)
    else
      local suffix=$(date +%H%M%S)"."$count
    fi
    "$JAVA_HOME"/bin/jstack $pid > $logdir/jstack.$pid.$suffix
    sleep $delay
    let count--
    echo -n "."
  done
}

function cf_dmesg() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1

  dmesg -T "$@" 2>/dev/null
  [ $? -eq 0 ] && return 0

  dmesg "$@" | perl -w -e 'use strict;
  my ($uptime) = do { local @ARGV="/proc/uptime";<>}; ($uptime) = ($uptime =~ /^(\d+)\./);
  foreach my $line (<>) {
    printf( ($line=~/^\[\s*(\d+)\.\d+\](.+)/) ? ( "[%s]%s\n", scalar localtime(time - $uptime + $1), $2 ) : $line )
  }'
}

function cf_trace_http_request() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e read -s 2000 -qftp $pid 2>&1 | grep " HTTP/1[.][01][\]r[\]n" 
}

function cf_trace_http_response() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e write -s 2000 -qftp $pid 2>&1 | grep "HTTP/1[.][01] " 
}

function cf_trace_http_req_header() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e read -s 2000 -qftp $pid 2>&1 | grep " HTTP/1[.][01][\]r[\]n" | sed  's/\\r\\n/\n/g'
}

function cf_trace_http_resp_header() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e write -s 2000 -qftp $pid 2>&1 | grep "HTTP/1[.][01] " | sed 's/\\r\\n/\n/g'
}

function cf_trace_http_invoke() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e sendto -s 2000 -qftp $pid 2>&1 | grep " HTTP/1[.][01][\]r[\]n" 
}

function cf_trace_connect() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e connect -s 2000 -qftp $pid 2>&1 | grep "port"
}

function cf_trace_socket() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e connect,socket,close -s 2000 -qftp $pid 2>&1 | grep "port"
}

function cf_trace_sql_select() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e sendto,write -s 2000 -qftp $pid 2>&1 | grep -i "[\]3select"
}

function cf_trace_sql_update() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e sendto,write -s 2000 -qftp $pid 2>&1 | grep -i "[\]3update"
}

function cf_trace_sql_insert() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e sendto,write -s 2000 -qftp $pid 2>&1 | grep -i "[\]3insert"
}

function cf_trace_redis_command() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  local cmd=$2
  strace -e sendto,write -s 2000 -qftp $pid 2>&1 | grep -i "$cmd[\]r[\]n"
}

function cf_trace_dubbo_request() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e read -s 2000 -qftp $pid 2>&1 | grep -i "[\]tinterface"
}

function cf_trace_dubbo_invoke() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  strace -e write -s 2000 -qftp $pid 2>&1 | grep -i "[\]tinterface"
}

function cf_trace_system_call() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  local pid=$1
  ! cf_is_pid_exists >/dev/null $pid && echo "pid:$pid not exists" && return 1
  local time=${2:-5}

  local outfile="/tmp/.sys-call.$pid"
  strace -cqftp $pid -o $outfile & 
  local spid=$!
  while [ $time -gt 0 ]; do
    sleep 1
    let time--
    echo -n "."
  done
  echo ""
  kill $spid && echo "ok"
  # if strace process still exists
  cf_is_pid_exists $spid >/dev/null 2>&1 && kill -9 $spid
  cat $outfile && rm -f $outfile
}

function cf_random_entropy_stat() {
  ! cf_is_linux >/dev/null && echo "only works in linux" && return 1
  while true; do
    echo "entropy available:" `cat /proc/sys/kernel/random/entropy_avail`
    sleep 1
  done
}

function cf_json_fmt() {
  python -mjson.tool
}

function cf_http_server() {
  local port=${1:-8000}
  python -mSimpleHTTPServer $port 2>/dev/null
}

作业控制与前台进程组

这篇文章是对之前的SIGTTIN信号量的疑惑?的解答,对于为何会有这种奇怪的用法,在另一篇shell下精确的定位一个命令 也介绍过了,这里想讨论的重点不在于怎么变通解决那个问题,而是导致SIGTTIN发生的机制是怎么引起的。我的同事对这个问题也产生了好奇,在stackoverflow上发帖,有人给出了解释,解答的人直接给出了bash的源码jobs.c里的initialize_job_control方法片段,指出SIGTTIN正是那里面的逻辑。不过如果你跟我一样对shell和linux系统调用都懂得很肤浅的话,这段代码并不容易懂,所以在这里更详细的解释一下这个问题的来龙去脉。

刚开始碰到这个问题的时候,通过strace看到了是SIGTTIN信号量所致,因为这个信号量默认的行为是让进程STOP(暂停),即通过ps观察到的状态为T。对于SIGTTIN信号量《Linux/UNIX系统编程手册》上是这么说的:

只有前台作业中的进程才能够从控制终端读取输入。这个限制条件避免了多个作业竞争读取终端输入。如果后台作业尝试从终端读取输入,就会接收到一个SIGTTIN信号。SIGTTIN信号的默认处理动作是停止作业。

但我们的脚本里并没有后台进程,那两个进程也没有读取终端,跟上面的解释对不上。也没有在网上搜到其它引发SIGTTIN信号的情况,在这里困惑了很久。不过凭直觉知道这个问题应该跟作业控制有关,在脚本里显式的开启作业控制,是能够正常运行的:

$ cat sleep.sh
#!/bin/bash
set -m
bash -ic 'sleep 3'
bash -ic 'sleep 2'

所以一定是在进程某个状态上的不一致导致的。上周末的时候阅读了一下strace的log,对出问题的脚本:

#!/bin/bash
bash -ic 'sleep 3'
bash -ic 'sleep 2'

使用strace -f -e verbose=all -t ./sleep.sh 2>log 得到更详细的日志

...
03:39:06 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f997f03ca10) = 9897
...
[pid  9897] 03:39:06 execve("/usr/bin/bash", ["bash", "-ic", "sleep 3"], [/* 30 vars */]) = 0
...
[pid  9897] 03:39:06 open("/dev/tty", O_RDWR|O_NONBLOCK) = 3
[pid  9897] 03:39:06 getrlimit(RLIMIT_NOFILE, {rlim_cur=1024, rlim_max=4*1024}) = 0
[pid  9897] 03:39:06 fcntl(255, F_GETFD) = -1 EBADF (Bad file descriptor)
[pid  9897] 03:39:06 dup2(3, 255)       = 255
[pid  9897] 03:39:06 close(3)           = 0
[pid  9897] 03:39:06 ioctl(255, TIOCGPGRP, [9891]) = 0
[pid  9897] 03:39:06 setpgid(0, 9897)   = 0 //第一个子进程更改了它的进程组ID
...
03:39:09 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f997f03ca10) = 9922
...
[pid  9922] 03:39:09 execve("/usr/bin/bash", ["bash", "-ic", "sleep 2"], [/* 30 vars */]) = 0
...
[pid  9922] 03:39:09 access("/usr/bin/bash", R_OK) = 0
[pid  9922] 03:39:09 getpgrp()          = 9891
[pid  9922] 03:39:09 ioctl(2, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffd356e49c0) = -1 ENOTTY (Inappropriate ioctl for device)
[pid  9922] 03:39:09 open("/dev/tty", O_RDWR|O_NONBLOCK) = 3
[pid  9922] 03:39:09 getrlimit(RLIMIT_NOFILE, {rlim_cur=1024, rlim_max=4*1024}) = 0
[pid  9922] 03:39:09 fcntl(255, F_GETFD) = -1 EBADF (Bad file descriptor)
[pid  9922] 03:39:09 dup2(3, 255)       = 255
[pid  9922] 03:39:09 close(3)           = 0
[pid  9922] 03:39:09 ioctl(255, TIOCGPGRP, [9897]) = 0
[pid  9922] 03:39:09 rt_sigaction(SIGTTIN, {SIG_DFL, [], SA_RESTORER, 0x7f912a22b650}, {SIG_IGN, [], SA_RESTORER, 0x7f912a22b650}, 8) = 0
[pid  9922] 03:39:09 kill(0, SIGTTIN)   = 0
[pid  9896] 03:39:09 <... wait4 resumed> 0x7ffc5b5e6800, 0, NULL) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
[pid  9922] 03:39:09 --- SIGTTIN {si_signo=SIGTTIN, si_code=SI_USER, si_pid=9922, si_uid=1000} ---
[pid  9896] 03:39:09 --- SIGTTIN {si_signo=SIGTTIN, si_code=SI_USER, si_pid=9922, si_uid=1000} ---
[pid  9922] 03:39:09 --- stopped by SIGTTIN ---
[pid  9896] 03:39:09 --- stopped by SIGTTIN ---

确认这个SIGTTIN信号是第二个bash -ic 'sleep 2'进程发出的,kill(0, SIGTTIN)表示它把这个信号发送到自己所在的进程组,整个进程组的进程都接收到这个信号,所以它和它的父进程sleep.sh都变成了stop状态。

脚本里两次执行的bash -ic子进程也都是shell,它们在初始化的时候会有作业控制的逻辑,结合jobs.cinitialize_job_control方法里的代码(shell初始化时调用到这里):

/* We can only have job control if we are interactive. */
if (interactive == 0)
{
  job_control = 0;
  original_pgrp = NO_PID;
  shell_tty = fileno (stderr);
}
else
{
  shell_tty = -1;

  /* If forced_interactive is set, we skip the normal check that stderr
    is attached to a tty, so we need to check here.  If it's not, we
    need to see whether we have a controlling tty by opening /dev/tty,
    since trying to use job control tty pgrp manipulations on a non-tty
    is going to fail. */ 
    // bash "-i" 参数会启用 forced_interactive
  if (forced_interactive && isatty (fileno (stderr)) == 0)
    shell_tty = open ("/dev/tty", O_RDWR|O_NONBLOCK);

  /* Get our controlling terminal.  If job_control is set, or
    interactive is set, then this is an interactive shell no
     matter where fd 2 is directed. */
   if (shell_tty == -1)
    shell_tty = dup (fileno (stderr));/* fd 2 */

  shell_tty = move_to_high_fd (shell_tty, 1, -1);

  /* Compensate for a bug in systems that compiled the BSD
 rlogind with DEBUG defined, like NeXT and Alliant. */
  if (shell_pgrp == 0)
{
  shell_pgrp = getpid ();
  setpgid (0, shell_pgrp);
  tcsetpgrp (shell_tty, shell_pgrp);
}

  while ((terminal_pgrp = tcgetpgrp (shell_tty)) != -1)
{
  if (shell_pgrp != terminal_pgrp)
    {
      SigHandler *ottin;

      ottin = set_signal_handler(SIGTTIN, SIG_DFL);
      kill (0, SIGTTIN); // 第二次执行bash -ic时触发了这里
      set_signal_handler (SIGTTIN, ottin);
      continue;
    }
  break;
}
if (terminal_pgrp == -1)
t_errno = errno;

  /* Make sure that we are using the new line discipline. */
  if (set_new_line_discipline (shell_tty) < 0)
{
  sys_error (_("initialize_job_control: line discipline"));
  job_control = 0;
}
  else
{
  original_pgrp = shell_pgrp;
  shell_pgrp = getpid ();

  // 第一次bash -ic 'sleep 3'触发了这里的 setpgid 修改了当前进程组
  if ((original_pgrp != shell_pgrp) && (setpgid (0, shell_pgrp) < 0))
    {
      sys_error (_("initialize_job_control: setpgid"));
      shell_pgrp = original_pgrp;
    }

  job_control = 1;

  /* If (and only if) we just set our process group to our pid,
     thereby becoming a process group leader, and the terminal
     is not in the same process group as our (new) process group,
     then set the terminal's process group to our (new) process
     group.  If that fails, set our process group back to what it
     was originally (so we can still read from the terminal) and
     turn off job control.  */
  if (shell_pgrp != original_pgrp && shell_pgrp != terminal_pgrp)
    {
      if (give_terminal_to (shell_pgrp, 0) < 0)
    {
      t_errno = errno;
      setpgid (0, original_pgrp);
      shell_pgrp = original_pgrp;
      job_control = 0;
    }
   }
...

关键点就在于shell_pgrpterminal_pgrp这两个变量,shell_pgrp是当前进程组,而terminal_pgrp是占用当前控制终端的进程所在的进程组(前台进程组),这些状态都是可以通过ps观察到的,可以跟踪一下:

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/2/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
12413 12410 12410 12410    -1 S    ?        sshd: hongjiang@pts/0
12414 12413 12414 12414 12580 Ss   pts/0     \_ -bash
12579 12414 12579 12414 12580 S    pts/0         \_ /bin/bash ./sleep.sh
12580 12579 12580 12414 12580 S+   pts/0             \_ sleep 3

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/2/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
12413 12410 12410 12410    -1 S    ?        sshd: hongjiang@pts/0
12414 12413 12414 12414 12414 Ss+  pts/0     \_ -bash
12579 12414 12579 12414 12414 T    pts/0         \_ /bin/bash ./sleep.sh
12607 12579 12579 12414 12414 T    pts/0             \_ bash -ic sleep 2

在第一次执行bash -ic 'sleep 3'的时候,sleep.sh父进程先clone出bash子进程(pid 12580),因为-i参数强制这个bash子进程用交互式运行,它会加载$HOME下的.bashrc等文件,这个过程可能会fork/clone出若干子进程(所以会看到第二次bash -ic sleep 2进程的ID跟第一次不是连续的),等这些配置文件加载完之后,它并不是fork/clone的形式执行sleep 3而是使用当前进程(12580)执行的sleep 3,这里很关键的信息是"PGID"和"TPGID"都是本身进程ID,而非父进程ID,跟第二次的状态不一样。

因为脚本默认是关闭作业控制的,本来每个子进程并不会设置为独立的进程组,比如下面这个脚本:

$ cat a.sh
#!/bin/bash
/usr/bin/sleep 10

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/2/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
12668 12665 12665 12665    -1 S    ?        sshd: hongjiang@pts/2
12669 12668 12669 12669 12736 Ss   pts/2     \_ -bash
12736 12669 12736 12669 12736 S+   pts/2         \_ /bin/bash ./a.sh
12737 12736 12736 12669 12736 S+   pts/2             \_ /usr/bin/sleep 10

上面脚本执行时sleep子进程"PGID"和"TPGID"都是进程父进程a.sh的,并没有被设置为一个独立的进程组。

sleep.sh脚本里之所以会对子进程设置一个独立的进程组,是因为"-i"参数使得bash -ic 'sleep 3'在非交互式脚本里运行时进程被强制设置成了独立的进程组(见initializejobcontroll里的setpgid),同时"TPGID"这个表示前台进程组的状态也被改为了bash -ic 'sleep 3'的进程组ID。

那为什么在接下来的bash -ic 'sleep 2'子进程执行时却不像前面的那样呢?这正是最诡异的地方。它们所在的sleep.sh脚本是非交互式运行的,它本来预期脚本执行过程不应该产生与脚本进程组不一致的前台进程组,所以前台子进程组结束的时候,不会去更新"TPGID",可以用下面脚本来验证:

$ cat wait.sh
#!/bin/bash
bash -ic 'sleep 5'
sleep 4
sleep 3 &  #不让wait.sh进程立即退出
wait

上面wait.sh脚本里第一个子进程强制修改了"TPGID",子进程退出,以及后续再执行前台进程都不会去更新这个状态

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/0/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
13867 13864 13864 13864    -1 S    ?        sshd: hongjiang@pts/0
13868 13867 13868 13868 14138 Ss   pts/0     \_ -bash
14137 13868 14137 13868 14138 S    pts/0         \_ /bin/bash ./wait.sh
14138 14137 14138 13868 14138 S+   pts/0             \_ sleep 5

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/0/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
13867 13864 13864 13864    -1 S    ?        sshd: hongjiang@pts/0
13868 13867 13868 13868 14138 Ss   pts/0     \_ -bash
14137 13868 14137 13868 14138 S    pts/0         \_ /bin/bash ./wait.sh
14165 14137 14137 13868 14138 S    pts/0             \_ sleep 4

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/0/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
13867 13864 13864 13864    -1 S    ?        sshd: hongjiang@pts/0
13868 13867 13868 13868 14138 Ss   pts/0     \_ -bash
14137 13868 14137 13868 14138 S    pts/0         \_ /bin/bash ./wait.sh
14168 14137 14137 13868 14138 S    pts/0             \_ sleep 3

回到sleep.sh脚本里,第二行bash -ic 'sleep 2'子进程初始化时,"TPGID"仍是上个进程bash -ic 'sleep 3'修改过的值。而bash -ic 'sleep 2'子进程也因为"-i"参数让自己以交互式运行,但是在还没有执行到setpgid之前,就先触发了SIGTTIN的逻辑:

if (shell_pgrp != terminal_pgrp)
{
    SigHandler *ottin;

    ottin = set_signal_handler(SIGTTIN, SIG_DFL);
    kill (0, SIGTTIN);
    set_signal_handler (SIGTTIN, ottin);
    continue;
}

因为这段代码会认为终端被其他前台进程占用,对当前进程组发出SIGTTIN信号。在这个场景里,这恰好是一种误会!

当我们显式的对sleep.sh脚本设置开启作业控制时:

$ cat sleep.sh
#!/bin/bash
set -m
bash -ic 'sleep 3'
bash -ic 'sleep 2'

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/2/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
12668 12665 12665 12665    -1 S    ?        sshd: hongjiang@pts/2
12669 12668 12669 12669 12874 Ss   pts/2     \_ -bash
12873 12669 12873 12669 12874 S    pts/2         \_ /bin/bash ./sleep.sh
12874 12873 12874 12669 12874 S+   pts/2             \_ sleep 3

$ ps xfo pid,ppid,pgid,sid,tpgid,stat,tty,command | awk "NR==1||/pts\/2/"
  PID  PPID  PGID   SID TPGID STAT TT       COMMAND
12668 12665 12665 12665    -1 S    ?        sshd: hongjiang@pts/2
12669 12668 12669 12669 12901 Ss   pts/2     \_ -bash
12873 12669 12873 12669 12901 S    pts/2         \_ /bin/bash ./sleep.sh
12901 12873 12901 12669 12901 S+   pts/2             \_ sleep 2

它对每个子进程都设置为独立的进程组,并在每个进程(前台)结束的时候更新"TPGID"为父进程组ID,避免了initialize_job_controll里发送SIGTTIN的逻辑。

有很多shell的问题都是跟作业控制相关的,另一个例子参考tomcat进程意外退出的问题分析;作业控制可以玩出很多高阶花样,但它也大大增加了shell的复杂度,这个例子是一个典型的反面教材,最好不要在非交互式脚本里调用bash -ic来执行命令。

shell下精确的定位一个命令

之前在这篇SIGTTIN? 里提出了问题,但没有交代背景,为什么用交互式shell执行”which mvn”命令,是为了更准确的获取用户当前环境里所用的mvn命令到底是哪个。若用户对mvn做过alias,优先使用alias过的指令,然后再选择$PATH路径下的命令;这个函数的完整内容如下:

function get_mvn_cmd() {
  if [[ "$OSTYPE" == *cygwin* ]];then
    ppid=$( ps -ef -p $$ | awk 'NR==2{print $3}' )
    user_shell=$( ps -p $ppid | awk 'NR==2{print $8}' )
    #has some trouble with cygwin, while Ctrl-c cannot terminal
    set -m
  else
    ppid=$( ps -oppid= $$ )
    user_shell=$( ps -ocomm= -p $ppid )
  fi

  # while as login shell, it's -bash not bash
  if [[ "$user_shell" == "-"* ]];then
    user_shell=${user_shell:1}
  fi

  mvn=$( $user_shell -ic "alias mvn" 2>/dev/null | cut -d'=' -f2 | sed "s/'//g" )

  if [ -z "$mvn" ];then
    $user_shell -ic "which mvn" >/dev/null
    if [ $? -eq 0 ];then
      mvn=$( $user_shell -ic "which mvn" | head -1 )
    fi
  fi

  if [ -z "$mvn" ]; then
    echo "mvn command not found" >&2
    kill -s TERM $TOP_PID
  else
    echo $mvn
  fi
}

函数里先获取用户所使用的shell(即当前脚本的父进程),然后以交互式执行alias mvn看看是否又被用户别名过,没有的话再使用which mvn获取mvn命令。

为什么写的那么啰嗦是想兼容好几种用户环境,如果是linux或者使用的gnu-which,可以利用gnu-which的一些参数一次性按优先级依次从”alias, functions, commands”里查找命令

$ (alias; declare -f) | gwhich --read-alias --read-functions mvn

上面利用gnu-which的--read-alias--read-functions参数优先从前边aliasdeclare -f输出的结果里查找,最后再从$PATH里查找。实际上centos7的bash下which命令就被alias过了:

alias which='alias | /usr/bin/which --tty-only --read-alias --show-dot --show-tilde'    

bash的陷阱(3): return

bash里的return

当函数hook是一个只有一行return的空函数,并且hook函数被另一个test函数用在结尾的时候,test的返回结果很有趣:

#!/bin/bash

function test {
  ls /notexist >/dev/null 2>&1
  hook
}

function hook() {
    return
}

hook
echo "hook execute result "$?

test
echo "test execute result "$?

如果没有给return语句一个显式的code,它返回的是上一次命令的结果。hook函数这里应该用return 0更严谨一下。

bash debug

便于调试bash的输出效果:

#!/usr/bin/env bash

if [ -z "$1" ];then
  echo "usage: bashdebug your_script"
  exit 1
fi

PS4='+[$(date "+%Y-%m-%d %H:%M:%S") +${SECONDS}s ${BASH_SOURCE} ${LINENO}] ${FUNCNAME[0]:+${FUNCNAME[0]}(): }'

ACK=""
which ack &>/dev/null
[ $? -eq 0 ] && ACK=$(which ack)

if [ -z "$ACK" ];then
  bash -x $1
else
  bash -x $1 |& $ACK --passthru "^\+.*:"
fi

sh script.sh与./script.sh的差异

这两种方式差异是进程名称的不同,假设有以下内容的脚本:

$ cat script.sh
#!/bin/bash
sleep 4400

sh script.sh执行,它是明确的启动一个sh命令,把脚本名称做参数传递给它,进程名称是sh:

$ pstree  
 ...
 |-sshd-+...
 |      |-sshd---sshd---bash---sh---sleep

而以./script.sh方式执行,操作系统负责脚本加载(execve系统调用),进程名称是脚本名

$ pstree  
 ...
 |-sshd-+...
 |      |-sshd---sshd---bash---script.sh---sleep

shell前边的连字符含义

在一个脚本里,要获取其父shell时,使用了下面的方式:

#!/bin/bash 
ps -ocomm= -p $(ps -oppid= $$)

它在某些环境下,父shell会显示为 “/usr/local/bin/zsh” 或者 “bash” ,而某些环境下却会显示为”-bash”或”-zsh”;这个开头多出来的连字符是怎么回事?查了一下,原来是表示的是”login shell”。

linux下可以在”/etc/passwd”里看到用户的login shell,而在mac下要确认当前用户的login shell,要通过下面的命令:

$ dscl . read /users/$USER UserShell
UserShell: /bin/bash

在mac下,当打开终端程序(Terminal.app)时,终端shell是login进程的子进程(不管你配置那种command):

$ pstree
 ...
 |-+= 01272 hongjiang /Applications/Utilities/Terminal.app/Contents/MacOS/Terminal
 | \-+= 01275 root login -pf hongjiang
 |   \-+= 01276 hongjiang -bash
 ...

$ ps -ef | grep login
0  1275  1272   0  2:05PM ttys000    0:00.05 login -pf hongjiang 

# 把Terminal的启动命令修改为zsh也一样

 |-+= 01988 hongjiang /Applications/Utilities/Terminal.app/Contents/MacOS/Terminal
     | \-+= 01991 root login -pf hongjiang /bin/zsh  |   \-+= 01992 hongjiang -zsh

而在mac的 iTerm.app 下,不管如果你配置的command是”Login shell”还是修改为其他shell,启动后shell没有再挂在login进程下:

$ pstree
 |-+= 00574 hongjiang /Volumes/Data/program/iTerm.app/Contents/MacOS/iTerm
 | \-+= 02177 hongjiang /usr/local/bin/zsh  

但在iTerm下如果使用的是“Login shell”显示的名称前边却是有连字符的,而其他shell则没有

$ ps $$
  PID   TT  STAT      TIME COMMAND
 2220 s001  Ss     0:00.01 -/bin/bash

# 修改启动shell为zsh

➜  ps $$
  PID   TT  STAT      TIME COMMAND
 2524 s000  Ss     0:00.16 /usr/local/bin/zsh

要想在iTerm下保持跟Terminal一致也用login来启动,应该在配置里修改启动命令为:

login -pf $username /usr/local/bin/zsh   

在linux下,从tty登录shell也是一样由login进程启动的

$ ps -ef | grep bash
 hongjia+  2241   467  0 14:09 tty1     00:00:00 -bash

$ pstree 
systemd─┬─NetworkManager─┬─2*[dhclient]
    │                └─3*[{NetworkManager}]
    ├─...
    ├─login───bash
    ...   

$ ps -ef | grep login
root       467  0.0  0.2  84584  2328 ?        Ss   14:08   0:00 login -- hongjiang    

使用su命令切换到一个用户shell下,默认情况这个shell并不是”login shell”不会去执行/etc/profile和home目录下相关配置:

$ sudo su hongjiang

$ ps -ef | grep $$
hongjia+  2796  2795  0 14:57 pts/0    00:00:00 bash

要以”login shell”方式启动,需要对su指定一个参数“-“

$ sudo su - hongjiang

$ ps -ef | grep $$
hongjia+  3188  3187  0 15:35 pts/0    00:00:00 -bash

从bash文档里可以看到,要以”login shell”方式启动一个shell,要么第一个参数给一个特定的连字符“-”,要么显式的对bash设定”–login”参数。

bash的一般性诊断

如果脚本执行的时候报“找不到文件或目录”,但有没有提示任何文件或名录名称,比如:

$ ./service.sh
: No such file or directory

这种情况下,很可能是脚本里的特殊字符引起的,可以通过开启bash的debug选项来查看,为了输出行号可以设置PS4环境变量:

$ export PS4='+{$LINENO:${FUNCNAME[0]}} ' && bash -x service.sh
set: usage: set [--abefhkmnptuvxBCHP] [-o option-name] [arg ...]
+{3:} $'\r'
: command not found
+{23:} $'\r'
: command not found:
...

可以看到提示在底3行和23行出现了windows下的换行符\r导致脚本没能正确识别。通过cat -A可以看到行尾的windows换行符^M$

解决方法:

$ sed -i 's/\r//g' service.sh

bash的几个细节

1) 使用 /bin/bash 还是 /usr/bin/env bash

bash脚本的开头,通常声明为#!/bin/bash,但一些严谨一些(为了更好的跨平台)的脚本里,声明为#!/usr/bin/env bash

并不是所有的操作系统上bash都是/bin路径下的,有些unix(比如bsd系列),bash不是它们的默认shell,安装路径可能是在/usr/local/bin/bash,所以通过env方式在path下选择bash兼容性更好一些。不过,如果确定脚本只是为linux和mac使用,也无所谓。

2) 标准输入使用”-” ,还是”/dev/fd/0″

相对于使用”-“表示标准输入,”/dev/fd/0″的一个好处是,在脚本里更明确一些,因为有时很难区分”-“到底代表标准输入,还是命令的参数。

/dev/fd/0  标准输入 
/dev/fd/1  标准输出 
/dev/fd/2  标准错误 

3) 获取当前目录

有好几种方式可以获取当前运行的脚本所在的目录,最严谨的一种方式是:

CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -P )"

4) set -e 要不要设置

有些情况下,我们需要判断上一条命令是否执行成功,比如kill -0 pid检查某个进程是否存在,这个时候如果设置了set -e,导致进程不存在的kill -0执行失败脚本退出。如果进程不存在的也有一个分支逻辑的话,那个分支就不可能执行到了,这种情况下显然不能使用set -e

5) exit与fork

exit通常是当某个执行失败或条件不符合时希望真个脚本退出时使用的,但需要注意的一点是,当使用”$()”执行一个函数时,是fork方式,这个函数里定义的exit是让fork的那个进程退出,而不是当前脚本进程,需要当前脚本退出的话,一个变通的方式是定义个类似exit的方法给最顶层的进程发信号:

TOP_PID=$$

trap "exit 1" TERM

quit() {
  local msg=$1
  [ ! -z "$msg" ] && echo $msg >&2
  kill -s TERM $TOP_PID
}

foo() {
    command || quit 
}

result=$(foo)

bash的陷阱(2): 函数返回值问题

一个脚本里遇到的,这个函数的写法是:

inner_stop(){
  local pid=$( get_service_pid )
  [ ! -z "$pid" ] && kill $pid
}

在调用完这个函数的地方需要检查一下它的执行结果,通过 $? 来判断,结果当前一句pid为空时,inner_stop 总会返回1。本来的意图是,判断当pid存在才执行kill,否则pid为空直接返回成功,而不是返回错误。这里忘了bash函数的返回值是上最后一条语句的执行结果 [ xxx ] 测试语句不满足时返回1

在返回值这点上要注意,不同于其他语言,bash属于弱类型语言,并且返回值总是最后一个命令的执行结果,if condition; then doSomething; fi 语句即可以返回if condition 的执行结果,也可以返回doSomething的结果。(注意这里说的结果是指return value,不是标准输出的内容)

#!/bin/bash

foo() {
  [ ! -z "$x" ] && echo "x not empty"
}

foo || echo "foo function return $?"

上面脚本执行时,如果没有全局声明过x变量,foo函数的返回总是1(错误),需要对另一个分支显式的return 0才行。