-
Notifications
You must be signed in to change notification settings - Fork 2
/
watchdog-tomcat.go
189 lines (168 loc) · 5.74 KB
/
watchdog-tomcat.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
package main
import (
"bytes"
"errors"
"flag"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/coreos/go-systemd/daemon"
)
// runWatchedApp runs application in its own process group, waits for it to
// exit, and then returns the PID of the first process listed by ps whose
// process-group ID equals the PID of the launched command.
//
// The launcher (for example a start-up script) is expected to leave a child
// behind in its process group; that child is the process reported back.
//
// On any failure (the launcher cannot be run or exits unsuccessfully, ps
// fails, or no process is found in the group) the returned pid is 0 and err
// is non-nil.
func runWatchedApp(application string) (pid int, err error) {
	cmd := exec.Command(application)
	// Setpgid puts the launcher in a new process group whose ID is its own
	// PID, so its descendants can be found by that group ID afterwards.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	// Run starts the command and waits for it; no separate Wait is needed.
	if err = cmd.Run(); err != nil {
		fmt.Println(err)
		return 0, err
	}
	pgid := strconv.Itoa(cmd.Process.Pid)

	// List every process with its PID and PGID and capture the listing.
	ps := exec.Command("ps", "axo", "pid,pgid,comm")
	stdout, err := ps.StdoutPipe()
	if err != nil {
		fmt.Println(err)
		return 0, err
	}
	if err = ps.Start(); err != nil {
		fmt.Println(err)
		return 0, err
	}
	var out bytes.Buffer
	// The pipe must be drained before Wait, which closes it.
	_, copyErr := io.Copy(&out, stdout)
	waitErr := ps.Wait()
	if copyErr != nil {
		fmt.Println(copyErr)
		return 0, copyErr
	}
	if waitErr != nil {
		fmt.Println(waitErr)
		return 0, waitErr
	}

	// Compare the PGID column exactly; a textual grep for the number would
	// also match unrelated PIDs or command names that merely contain it.
	for _, line := range strings.Split(out.String(), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 || fields[1] != pgid {
			continue
		}
		pid, err = strconv.Atoi(fields[0])
		if err != nil {
			err = fmt.Errorf("parsing pid %q from ps output: %w", fields[0], err)
			fmt.Println(err)
			return 0, err
		}
		return pid, nil
	}

	err = fmt.Errorf("no process found in process group %s", pgid)
	fmt.Println(err)
	return 0, err
}
// healthCheck sends a GET request to check_url and reports whether the
// response status was 200.
//
// timeout is treated as a count of seconds: it is multiplied by time.Second
// to form the HTTP client timeout, so callers pass 5 (not 5*time.Second) for
// a five-second limit.
//
// It returns (true, nil) on a 200 response. A transport error is returned
// as-is with false; any other status yields false and a descriptive error.
func healthCheck(check_url string, timeout time.Duration) (bool, error) {
	netClient := &http.Client{
		Timeout: time.Second * timeout,
	}
	resp, err := netClient.Get(check_url)
	if err != nil {
		return false, err
	}
	// Register the close before inspecting the status so the body is
	// released on the non-200 path as well.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return false, errors.New("health check status is not 200")
	}
	return true, nil
}
// fileExists reports whether filename can be stat'ed and is not a directory.
// Any Stat failure (missing path, permission denied, a non-directory path
// component, ...) is reported as false.
func fileExists(filename string) bool {
	info, err := os.Stat(filename)
	if err != nil {
		// info is nil whenever err is non-nil, so it must not be touched.
		return false
	}
	return !info.IsDir()
}
// main parses the command-line flags, launches the watched application,
// blocks until the first health check succeeds, and then keeps sending
// systemd watchdog notifications for as long as the health check has not
// failed more than fail-max times in a row.
//
// Exit status 1 means the -app path is not an existing file; 2 means the
// application could not be launched.
func main() {
	// Command-line flags.
	app := flag.String("app", "", "path to the app should be run")
	scheme := flag.String("scheme", "http", "scheme for health check,\n EXAMPLE: 'scheme://ip:port/path'")
	ip := flag.String("ip", "127.0.0.1", "ip address for health check,\n EXAMPLE: 'scheme://ip:port/path'")
	port := flag.String("port", "80", "port for health check,\n EXAMPLE: 'scheme://ip:port/path'")
	path := flag.String("path", "", "path for health check,\n EXAMPLE: 'scheme://ip:port/path'")
	// NOTE(review): healthCheck multiplies these values by time.Second, so the
	// default of 5 (i.e. 5ns as a Duration) results in a five-second timeout,
	// while a value such as "5s" would be scaled far beyond what was meant.
	// Confirm the intended unit before changing either side.
	healthcheckTimeout := flag.Duration("healthcheck-timeout", 5, "Timeout for healthcheck when service is running")
	initialcheckTimeout := flag.Duration("initialcheck-timeout", 5, "Timeout for initialcheck when service is boot up")
	failMax := flag.Int("fail-max", 20, "max continued failed time")
	flag.Parse()

	if !fileExists(*app) {
		fmt.Printf("app [%s] should exist and is an executable file\n", *app)
		os.Exit(1)
	}
	checkURL := fmt.Sprintf("%s://%s:%s/%s", *scheme, *ip, *port, *path)

	// Launch the application and tell systemd which PID to track.
	pid, err := runWatchedApp(*app)
	if err != nil {
		fmt.Printf("application run error: %v\n", err)
		os.Exit(2)
	}
	daemon.SdNotify(false, fmt.Sprintf("MAINPID=%d", pid))

	// Watchdog initialisation: report readiness only after the first
	// successful health check, polling once per second until then.
	for {
		_, checkErr := healthCheck(checkURL, *initialcheckTimeout)
		if checkErr == nil {
			daemon.SdNotify(false, daemon.SdNotifyReady)
			fmt.Println("WATCHDOG INITIALIZING: program is ok, watchdog is ready")
			break
		}
		fmt.Printf("WATCHDOG INITIALIZING: program is not ok, watchdog is waiting\nINITIAL ERROR: %s\n", checkErr.Error())
		time.Sleep(time.Second)
	}

	// Watchdog loop. It never returns; main blocks on the WaitGroup forever.
	var wg sync.WaitGroup
	wg.Add(1)
	go func(checkURL string) {
		defer wg.Done()

		// Used when WATCHDOG_USEC is missing or unusable; without it the
		// interval would be zero and the loop would spin without sleeping.
		const fallbackWatchdogUsec = 10 * 1000000

		// WATCHDOG_USEC is read as a number of microseconds.
		watchdogUsec, parseErr := strconv.ParseFloat(os.Getenv("WATCHDOG_USEC"), 64)
		if parseErr != nil || !(watchdogUsec > 0) {
			fmt.Printf("WATCHDOG_USEC is unset or invalid, falling back to %d usec\n", fallbackWatchdogUsec)
			watchdogUsec = fallbackWatchdogUsec
		}
		// Notify at half the watchdog timeout, expressed in seconds.
		wdSec := watchdogUsec / (2 * 1000000)

		var (
			wdFailed         bool
			checkTimeSpent   float64
			wdInterval       int
			consecutiveFails int
		)
		for {
			// Keep the watchdog alive unless we are in the failed state.
			if !wdFailed {
				daemon.SdNotify(false, daemon.SdNotifyWatchdog)
				fmt.Printf("WATCHDOG STATUS: activate; LAST_CHECK_TIME_SPENT: %f; LAST_SLEEP_TIME: %d\n", checkTimeSpent, wdInterval)
			}

			// Time the check so its duration can be subtracted from the sleep.
			checkStart := time.Now()
			/* Add alert logic here (e-mail, IM, ...). It belongs inside the
			timed section so that the time it takes is accounted for when the
			sleep interval is calculated. */
			checkOK, checkErr := healthCheck(checkURL, *healthcheckTimeout)
			if checkOK {
				consecutiveFails = 0
			} else {
				consecutiveFails++
			}
			wdFailed = consecutiveFails > *failMax
			checkTimeSpent = time.Since(checkStart).Seconds()

			if wdFailed {
				// Stop notifying and sleep for the full watchdog timeout so
				// that systemd sees the watchdog expire.
				fmt.Printf("Watchdog change to failed state, because continued failed time is exceed fail-max limit: %d\n", *failMax)
				time.Sleep(time.Duration(int(wdSec*2-checkTimeSpent+0.5)) * time.Second)
				continue
			}

			if !checkOK {
				fmt.Printf("CHECK STATUS: failed; ERROR: %s\n", checkErr.Error())
			}
			// Sleep for the rest of the interval, rounded to whole seconds.
			wdInterval = int(wdSec - checkTimeSpent + 0.5)
			time.Sleep(time.Duration(wdInterval) * time.Second)
		}
	}(checkURL)
	wg.Wait()
}