fork 0k-charms
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

363 lines
11 KiB

  1. #!/bin/bash
  2. ##
  3. ## TODO
  4. ## - don't sleep 1 but wait in flock for 1 second
  5. ## - every waiting process should record at least its PID and priority,
  6. ## so that a live PID with a higher priority takes precedence. (Probably
  7. ## also record the last probing time, and invalidate the entry if it is
  8. ## older than 10s, for example.)
  9. ## - could add the time they waited in the waiting list, and last probe.
  10. ## - should execute "$@", if user needs '-c' it can run ``bash -c ""``
  11. exname="$(basename "$0")"
  12. usage="$exname LOCKLABELS [-k] [FLOCK_OPTIONS] -- [CMD...]"
  13. verb() { [ -z "$verbose" ] || echo "$@" >&2 ; }
  14. err() { echo "$@" >&2; }
  15. die() { echo "$@" >&2; exit 1; }
  16. md5_compat() { md5sum | cut -c -32; true; }
  17. LOCKLABELS=
  18. flock_opts=()
  19. command=()
  20. nonblock=
  21. errcode=1
  22. timeout=
  23. cmd=
  24. priority=1
  25. remove_duplicate=
  26. while [ "$1" ]; do
  27. case "$1" in
  28. -h|--help)
  29. echo "$help"
  30. exit 0
  31. ;;
  32. -V|--version)
  33. echo "$version"
  34. exit 0
  35. ;;
  36. -c)
  37. cmd="$2"
  38. shift
  39. ;;
  40. -p|--priority)
  41. priority=$2
  42. shift
  43. ;;
  44. -D)
  45. remove_duplicate=true
  46. ;;
  47. -k)
  48. kill=yes
  49. ;;
  50. -n|--nb|--nonblock)
  51. nonblock=true
  52. ;;
  53. -w|--wait|--timeout)
  54. timeout=$2 ## will manage this
  55. shift
  56. ;;
  57. -E|--conflict-exit-code)
  58. errcode=$2 ## will manage this
  59. shift
  60. ;;
  61. -v|--verbose)
  62. verbose=true ## will manage this
  63. ;;
  64. -n|--nb|--nonblock)
  65. nonblock=true ## will manage this
  66. ;;
  67. --)
  68. [ "$cmd" ] && die "'--' and '-c' are mutualy exclusive"
  69. shift
  70. command+=("$@")
  71. break 2
  72. ;;
  73. *)
  74. [ -z "$LOCKLABELS" ] && { LOCKLABELS=$1 ; shift ; continue ; }
  75. flock_opts+=("$1")
  76. ;;
  77. esac
  78. shift
  79. done
  80. if [ -z "$LOCKLABELS" ]; then
  81. err "You must provide a lock file as first argument."
  82. err "$usage"
  83. exit 1
  84. fi
  85. if [ "$remove_duplicate" ]; then
  86. md5code=$(
  87. if [ "$cmd" ]; then
  88. echo bash -c "$cmd"
  89. else
  90. echo "${command[@]}"
  91. fi | md5_compat)
  92. fi
  93. function is_int () { [[ "$1" =~ ^-?[0-9]+$ ]] ; }
  94. is_pid_alive() {
  95. local pid="$1"
  96. ps --pid "$pid" >/dev/null 2>&1
  97. }
  98. is_pgid_alive() {
  99. local pgid="$1"
  100. [ "$(ps -e -o pgid,pid= | egrep "^ *$pgid ")" ]
  101. }
  102. pgid_from_pid() {
  103. local pid="$1"
  104. pgid=$(ps -o pgid= "$pid" 2>/dev/null | egrep -o "[0-9]+")
  105. if ! is_int "$pgid"; then
  106. err "Could not retrieve a valid PGID from PID '$pid' (returned '$pgid')."
  107. return 1
  108. fi
  109. echo "$pgid"
  110. }
  111. ensure_kill() {
  112. local pid="$1" timeout=5 start=$SECONDS kill_count=0 pgid
  113. pgid=$(pgid_from_pid "$pid")
  114. while is_pid_alive "$pid"; do
  115. if is_pgid_alive "$pgid"; then
  116. if [ "$kill_count" -gt 4 ]; then
  117. err "FATAL: duplicate command, GPID=$pgid has resisted kill procedure. Aborting."
  118. return 1
  119. elif [ "$kill_count" -gt 2 ]; then
  120. err "duplicate command, PGID wouldn't close itself, force kill PGID: kill -9 -- -$pgid"
  121. kill -9 -- "$pgid"
  122. sleep 1
  123. else
  124. err "duplicate command, Sending SIGKILL to PGID: kill -- -$pgid"
  125. kill -- -"$pgid"
  126. sleep 1
  127. fi
  128. ((kill_count++))
  129. fi
  130. if [ "$((SECONDS - start))" -gt "$timeout" ]; then
  131. err "timeout reached. $pid"
  132. return 1
  133. fi
  134. done
  135. return 0
  136. }
  137. acquire_pid_file() {
  138. local label=$1
  139. lockfile="/var/lock/lockcmd-$label.lock"
  140. mkdir -p /var/run/lockcmd
  141. pidfile="/var/run/lockcmd/$label.pid"
  142. export pidfile
  143. (
  144. verb() { [ -z "$verbose" ] || echo "$exname($label) $pid> $@" >&2 ; }
  145. err() { echo "$exname($label) $pid> $@" >&2; }
  146. start=$SECONDS
  147. kill_count=0
  148. pgid_not_alive_count=0
  149. while true; do
  150. ## ask for lock on $lockfile (fd 200)
  151. if ! flock -n -x 200; then
  152. verb "Couldn't acquire primary lock... (elapsed $((SECONDS - start)))"
  153. else
  154. verb "Acquired lock '$label' on pidfile, inspecting pidfile."
  155. if ! [ -e "$pidfile" ]; then
  156. verb "No pidfile, inscribing my PID"
  157. echo -e "$pid $priority" > "$pidfile"
  158. exit 0
  159. fi
  160. if ! content=$(cat "$pidfile" 2>/dev/null); then
  161. err "Can't read $pidfile"
  162. exit 1
  163. fi
  164. read opid opriority < <(echo "$content" | head -n 1)
  165. opriority=${opriority:-1}
  166. verb "Previous PID is $opid, with priority $opriority"
  167. if ! is_pid_alive "$opid"; then
  168. err "Ignoring stale PID $opid"
  169. echo -e "$pid $priority" > "$pidfile"
  170. exit 0
  171. else
  172. if [ "$remove_duplicate" ]; then ## Add my pid and md5 if not already there.
  173. same_cmd_pids=$(
  174. echo "$content" | tail -n +1 | \
  175. egrep "^[0-9]+ $md5code$" 2>/dev/null | \
  176. cut -f 1 -d " ")
  177. same_pids=()
  178. found_myself=
  179. for spid in $same_cmd_pids; do
  180. if [ "$spid" == "$pid" ]; then
  181. found_myself=true
  182. continue
  183. fi
  184. same_pids+=("$spid")
  185. done
  186. [ "$found_myself" ] || echo "$pid $md5code" >> "$pidfile"
  187. fi
  188. flock -u 200 ## reopen the lock to give a chance to the other process to remove the pidfile.
  189. if [ "$remove_duplicate" ]; then ## Add my pid and md5 if not already there.
  190. for spid in "${same_pids[@]}"; do
  191. if ! ensure_kill "$spid"; then
  192. err "Couldn't kill previous duplicate command."
  193. exit 1
  194. fi
  195. done
  196. fi
  197. pgid=$(pgid_from_pid "$opid")
  198. verb "PGID of previous PID is $pgid"
  199. if is_pgid_alive "$pgid"; then
  200. verb "Previous PGID is still alive"
  201. if [ "$kill" ] && [ "$priority" -ge "$opriority" ]; then
  202. if [ "$kill_count" -gt 4 ]; then
  203. err "$pid>FATAL: GPID=$pgid has resisted kill procedure. Aborting."
  204. exit 1
  205. elif [ "$kill_count" -gt 2 ]; then
  206. err "PGID wouldn't close itself, force kill PGID: kill -9 -- -$pgid" >&2
  207. kill -9 -- "$pgid"
  208. sleep 1
  209. else
  210. err "Sending SIGKILL to PGID: kill -- -$pgid" >&2
  211. kill -- -"$pgid"
  212. sleep 1
  213. fi
  214. ((kill_count++))
  215. else
  216. if [ "$nonblock" ]; then
  217. verb "Nonblock options forces exit."
  218. exit 1
  219. else
  220. verb "Couldn't acquire Lock... (elapsed $((SECONDS - start)))"
  221. fi
  222. fi
  223. else
  224. if [ "$pgid_not_alive_count" -gt 4 ]; then
  225. verb "$pid>A lock exists for label $label, but PGID:$pgid in it isn't alive while child $pid is ?!?."
  226. err "$pid>Can't force seizing the lock." >&2
  227. exit 1
  228. fi
  229. ((pgid_not_alive_count++))
  230. fi
  231. fi
  232. fi
  233. if [ "$timeout" ] && [ "$timeout" -lt "$((SECONDS - start))" ]; then
  234. err "Timeout reached (${timeout}s) while waiting for lock on $label"
  235. exit "$errcode"
  236. fi
  237. sleep 1
  238. done
  239. ) 200> "$lockfile"
  240. }
  241. remove_pid_file() {
  242. local label=$1
  243. lockfile="/var/lock/lockcmd-$label.lock"
  244. mkdir -p /var/run/lockcmd
  245. pidfile="/var/run/lockcmd/$label.pid"
  246. (
  247. verb() { [ -z "$verbose" ] || echo "$exname($label) $pid> $@" >&2 ; }
  248. err() { echo "$exname($label) $pid> $@" >&2; }
  249. verb "Asking lock to delete $pidfile."
  250. timeout=5
  251. start=$SECONDS
  252. while true; do
  253. ## ask for lock on $lockfile (fd 200)
  254. if ! flock -n -x 200; then
  255. verb "Couldn't acquire primary lock... (elapsed $((SECONDS - start)))"
  256. else
  257. verb "Acquired lock '$label' on pidfile."
  258. if ! [ -e "$pidfile" ]; then
  259. verb "No more pidfile, somebody deleted for us ?1?"
  260. exit 1
  261. fi
  262. if ! content=$(cat "$pidfile" 2>/dev/null); then
  263. err "Can't read $pidfile"
  264. exit 1
  265. fi
  266. read opid opriority < <(echo "$content" | head -n 1)
  267. opriority=${opriority:-1}
  268. if [ "$opid" == "$pid" ]; then
  269. verb "Deleted pidfile. Releasing lock."
  270. rm -f "$pidfile"
  271. exit 0
  272. else
  273. verb "Removing duplicates in pidfile. Releasing lock."
  274. [ "$remove_duplicate" ] && sed -ri "/^$pid $md5code$/d" "$pidfile"
  275. exit 0
  276. fi
  277. fi
  278. if [ "$timeout" ] && [ "$timeout" -lt "$((SECONDS - start))" ]; then
  279. err "Timeout reached (${timeout}s) while waiting for lock on $label"
  280. exit "$errcode"
  281. fi
  282. sleep 1
  283. done
  284. ) 200> "$lockfile"
  285. }
  286. ## appends a command to the signal handler functions
  287. #
  288. # example: trap_add EXIT,INT close_ssh "$ip"
  289. trap_add() {
  290. local sigs="$1" sig cmd old_cmd
  291. shift || {
  292. echo "${FUNCNAME} usage error" >&2
  293. return 1
  294. }
  295. cmd="$@"
  296. while IFS="," read -d "," sig; do
  297. prev_cmd="$(trap -p "$sig")"
  298. if [ "$prev_cmd" ]; then
  299. new_cmd="${prev_cmd#trap -- \'}"
  300. new_cmd="${new_cmd%\' "$sig"};$cmd"
  301. else
  302. new_cmd="$cmd"
  303. fi
  304. trap -- "$new_cmd" "$sig" || {
  305. echo "unable to add command '$@' to trap $sig" >&2 ;
  306. return 1
  307. }
  308. done < <(echo "$sigs,")
  309. }
  310. remove_all_pid_file() {
  311. while read -d "," label; do
  312. {
  313. remove_pid_file "$label" || err "Could not delete $label"
  314. } &
  315. done < <(echo "$LOCKLABELS,")
  316. wait
  317. }
  318. ##
  319. ## Code
  320. ##
  321. pid="$$"
  322. trap_add EXIT "remove_all_pid_file"
  323. while read -d "," label; do
  324. acquire_pid_file "$label" || exit "$errcode" &
  325. done < <(echo "$LOCKLABELS,")
  326. wait
  327. if [ "$cmd" ]; then
  328. bash -c "$cmd"
  329. else
  330. "${command[@]}"
  331. fi
  332. errlvl="$?"
  333. exit "$?"