mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
* fn: fnlb: enhancements and new grouper tests *) added healthy threshold (default: 1) *) grouper is now using configured hcEndpoint for version checks *) grouper now logs when servers switch between healthy/unhealthy status *) moved DB code out of grouper *) run health check immediately at start (don't wait until hcInterval) *) optional shutdown timeout (default: 0) & mgmt port (default: 8081) *) hot path List() in grouper now uses atomic ptr Load *) consistent router: moved closure to a new function *) bugfix: version parsing from fn servers should not panic fnlb *) bugfix: servers removed from DB, stayed in healthy list *) bugfix: if DB is down, health checker stopped monitoring *) basic new tests for grouper (add/rm/unhealthy/healthy) server
415 lines
8.6 KiB
Go
415 lines
8.6 KiB
Go
package lb
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"sort"
|
|
"sync"
|
|
"time"
|
|
|
|
"fmt"
|
|
"github.com/coreos/go-semver/semver"
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// NewAllGrouper returns a Grouper that will return the entire list of nodes
|
|
// that are being maintained, regardless of key. An 'AllGrouper' will health
|
|
// check servers at a specified interval, taking them in and out as they
|
|
// pass/fail and exposes endpoints for adding, removing and listing nodes.
|
|
func NewAllGrouper(conf Config, db DBStore) (Grouper, error) {
|
|
a := &allGrouper{
|
|
nodeList: make(map[string]nodeState),
|
|
nodeHealthyList: make([]string, 0),
|
|
db: db,
|
|
|
|
// XXX (reed): need to be reconfigurable at some point
|
|
hcInterval: time.Duration(conf.HealthcheckInterval) * time.Second,
|
|
hcEndpoint: conf.HealthcheckEndpoint,
|
|
hcUnhealthy: int64(conf.HealthcheckUnhealthy),
|
|
hcHealthy: int64(conf.HealthcheckHealthy),
|
|
hcTimeout: time.Duration(conf.HealthcheckTimeout) * time.Second,
|
|
minAPIVersion: *conf.MinAPIVersion,
|
|
|
|
// for health checks
|
|
httpClient: &http.Client{Transport: conf.Transport},
|
|
}
|
|
|
|
for _, n := range conf.Nodes {
|
|
err := a.add(n)
|
|
if err != nil {
|
|
// XXX (reed): could prob ignore these but meh
|
|
logrus.WithError(err).WithFields(logrus.Fields{"node": n}).Error("error adding node")
|
|
}
|
|
}
|
|
go a.healthcheck()
|
|
return a, nil
|
|
}
|
|
|
|
// nodeState is the per-node bookkeeping kept by the health checker: the
// current streaks of check results and the health verdict derived from them.
type nodeState struct {
	// consecutive health check successes and failures; a result of one
	// kind resets the counter of the other kind
	success, fail uint64

	// whether the node is currently considered healthy
	healthy bool
}
|
|
|
|
// allGrouper will return all healthy nodes it is tracking from List.
|
|
// nodes may be added / removed through the HTTP api. each allGrouper will
|
|
// poll its database for the full list of nodes, and then run its own
|
|
// health checks on those nodes to maintain a list of healthy nodes.
|
|
// the list of healthy nodes will be maintained in sorted order so that,
|
|
// without any network partitions, all lbs may consistently hash with the
|
|
// same backing list, such that H(k) -> v for any k->v pair (vs attempting
|
|
// to maintain a list among nodes in the db, which could have thrashing
|
|
// due to network connectivity between any pair).
|
|
type allGrouper struct {
|
|
|
|
// health checker state and lock
|
|
nodeLock sync.RWMutex
|
|
nodeList map[string]nodeState
|
|
nodeHealthyList []string
|
|
|
|
db DBStore
|
|
|
|
httpClient *http.Client
|
|
|
|
hcInterval time.Duration
|
|
hcEndpoint string
|
|
hcUnhealthy int64
|
|
hcHealthy int64
|
|
hcTimeout time.Duration
|
|
minAPIVersion semver.Version
|
|
}
|
|
|
|
func (a *allGrouper) add(newb string) error {
|
|
if newb == "" {
|
|
return nil // we can't really do a lot of validation since hosts could be an ip or domain but we have health checks
|
|
}
|
|
err := a.checkAPIVersion(newb)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return a.db.Add(newb)
|
|
}
|
|
|
|
func (a *allGrouper) remove(ded string) error {
|
|
return a.db.Delete(ded)
|
|
}
|
|
|
|
func (a *allGrouper) publishHealth() {
|
|
|
|
a.nodeLock.Lock()
|
|
|
|
// get a list of healthy nodes
|
|
newList := make([]string, 0, len(a.nodeList))
|
|
for key, value := range a.nodeList {
|
|
if value.healthy {
|
|
newList = append(newList, key)
|
|
}
|
|
}
|
|
|
|
// sort and update healthy List
|
|
sort.Strings(newList)
|
|
a.nodeHealthyList = newList
|
|
|
|
a.nodeLock.Unlock()
|
|
}
|
|
|
|
// return a copy
|
|
func (a *allGrouper) List(string) ([]string, error) {
|
|
|
|
a.nodeLock.RLock()
|
|
ret := make([]string, len(a.nodeHealthyList))
|
|
copy(ret, a.nodeHealthyList)
|
|
a.nodeLock.RUnlock()
|
|
|
|
var err error
|
|
if len(ret) == 0 {
|
|
err = ErrNoNodes
|
|
}
|
|
return ret, err
|
|
}
|
|
|
|
func (a *allGrouper) runHealthCheck() {
|
|
|
|
// fetch a list of nodes from DB
|
|
list, err := a.db.List()
|
|
if err != nil {
|
|
// if DB fails, the show must go on, report it but perform HC
|
|
logrus.WithError(err).Error("error checking db for nodes")
|
|
|
|
// compile a list of nodes to be health checked
|
|
a.nodeLock.RLock()
|
|
list = make([]string, 0, len(a.nodeList))
|
|
for key, _ := range a.nodeList {
|
|
list = append(list, key)
|
|
}
|
|
a.nodeLock.RUnlock()
|
|
|
|
} else {
|
|
|
|
isChanged := false
|
|
|
|
// compile a map of DB nodes for deletion check
|
|
deleteCheck := make(map[string]bool, len(list))
|
|
for _, node := range list {
|
|
deleteCheck[node] = true
|
|
}
|
|
|
|
a.nodeLock.Lock()
|
|
|
|
// handle new nodes
|
|
for _, node := range list {
|
|
_, ok := a.nodeList[node]
|
|
if !ok {
|
|
// add new node
|
|
a.nodeList[node] = nodeState{
|
|
healthy: true,
|
|
}
|
|
isChanged = true
|
|
}
|
|
}
|
|
|
|
// handle deleted nodes: purge unmarked nodes
|
|
for key, _ := range a.nodeList {
|
|
_, ok := deleteCheck[key]
|
|
if !ok {
|
|
delete(a.nodeList, key)
|
|
isChanged = true
|
|
}
|
|
}
|
|
|
|
a.nodeLock.Unlock()
|
|
|
|
// publish if add/deleted nodes
|
|
if isChanged {
|
|
a.publishHealth()
|
|
}
|
|
}
|
|
|
|
// spawn health checkers
|
|
for _, key := range list {
|
|
go a.ping(key)
|
|
}
|
|
}
|
|
|
|
func (a *allGrouper) healthcheck() {
|
|
|
|
// run hc immediately upon startup
|
|
a.runHealthCheck()
|
|
|
|
for range time.Tick(a.hcInterval) {
|
|
a.runHealthCheck()
|
|
}
|
|
}
|
|
|
|
// fnVersion is the JSON document decoded from a node's health check
// response; only its "version" field is read.
type fnVersion struct {
	// Version is the version string reported by the node.
	Version string `json:"version"`
}
|
|
|
|
func (a *allGrouper) getVersion(urlString string) (string, error) {
|
|
req, _ := http.NewRequest(http.MethodGet, urlString, nil)
|
|
ctx, cancel := context.WithTimeout(context.Background(), a.hcTimeout)
|
|
defer cancel()
|
|
req = req.WithContext(ctx)
|
|
|
|
resp, err := a.httpClient.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer func() {
|
|
io.Copy(ioutil.Discard, resp.Body)
|
|
resp.Body.Close()
|
|
}()
|
|
|
|
var v fnVersion
|
|
err = json.NewDecoder(resp.Body).Decode(&v)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return v.Version, nil
|
|
}
|
|
|
|
func (a *allGrouper) checkAPIVersion(node string) error {
|
|
versionURL := "http://" + node + a.hcEndpoint
|
|
|
|
version, err := a.getVersion(versionURL)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
nodeVer, err := semver.NewVersion(version)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if nodeVer.LessThan(a.minAPIVersion) {
|
|
return fmt.Errorf("incompatible API version: %v", nodeVer)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (a *allGrouper) ping(node string) {
|
|
err := a.checkAPIVersion(node)
|
|
if err != nil {
|
|
logrus.WithError(err).WithFields(logrus.Fields{"node": node}).Error("Unable to check API version")
|
|
a.fail(node)
|
|
} else {
|
|
a.alive(node)
|
|
}
|
|
}
|
|
|
|
func (a *allGrouper) fail(key string) {
|
|
|
|
isChanged := false
|
|
|
|
a.nodeLock.Lock()
|
|
|
|
// if deleted, skip
|
|
node, ok := a.nodeList[key]
|
|
if !ok {
|
|
a.nodeLock.Unlock()
|
|
return
|
|
}
|
|
|
|
node.success = 0
|
|
node.fail++
|
|
|
|
// overflow case
|
|
if node.fail == 0 {
|
|
node.fail = uint64(a.hcUnhealthy)
|
|
}
|
|
|
|
if node.healthy && node.fail >= uint64(a.hcUnhealthy) {
|
|
node.healthy = false
|
|
isChanged = true
|
|
}
|
|
|
|
a.nodeList[key] = node
|
|
a.nodeLock.Unlock()
|
|
|
|
if isChanged {
|
|
logrus.WithFields(logrus.Fields{"node": key}).Info("is unhealthy")
|
|
a.publishHealth()
|
|
}
|
|
}
|
|
|
|
func (a *allGrouper) alive(key string) {
|
|
|
|
isChanged := false
|
|
|
|
a.nodeLock.Lock()
|
|
|
|
// if deleted, skip
|
|
node, ok := a.nodeList[key]
|
|
if !ok {
|
|
a.nodeLock.Unlock()
|
|
return
|
|
}
|
|
|
|
node.fail = 0
|
|
node.success++
|
|
|
|
// overflow case
|
|
if node.success == 0 {
|
|
node.success = uint64(a.hcHealthy)
|
|
}
|
|
|
|
if !node.healthy && node.success >= uint64(a.hcHealthy) {
|
|
node.healthy = true
|
|
isChanged = true
|
|
}
|
|
|
|
a.nodeList[key] = node
|
|
a.nodeLock.Unlock()
|
|
|
|
if isChanged {
|
|
logrus.WithFields(logrus.Fields{"node": key}).Info("is healthy")
|
|
a.publishHealth()
|
|
}
|
|
}
|
|
|
|
func (a *allGrouper) Wrap(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
switch r.URL.Path {
|
|
case "/1/lb/nodes":
|
|
switch r.Method {
|
|
case "PUT":
|
|
a.addNode(w, r)
|
|
case "DELETE":
|
|
a.removeNode(w, r)
|
|
case "GET":
|
|
a.listNodes(w, r)
|
|
}
|
|
return
|
|
}
|
|
|
|
next.ServeHTTP(w, r)
|
|
})
|
|
}
|
|
|
|
func (a *allGrouper) addNode(w http.ResponseWriter, r *http.Request) {
|
|
var bod struct {
|
|
Node string `json:"node"`
|
|
}
|
|
err := json.NewDecoder(r.Body).Decode(&bod)
|
|
if err != nil {
|
|
sendError(w, http.StatusBadRequest, err.Error())
|
|
return
|
|
}
|
|
|
|
err = a.add(bod.Node)
|
|
if err != nil {
|
|
sendError(w, 500, err.Error()) // TODO filter ?
|
|
return
|
|
}
|
|
sendSuccess(w, "node added")
|
|
}
|
|
|
|
func (a *allGrouper) removeNode(w http.ResponseWriter, r *http.Request) {
|
|
var bod struct {
|
|
Node string `json:"node"`
|
|
}
|
|
err := json.NewDecoder(r.Body).Decode(&bod)
|
|
if err != nil {
|
|
sendError(w, http.StatusBadRequest, err.Error())
|
|
return
|
|
}
|
|
|
|
err = a.remove(bod.Node)
|
|
if err != nil {
|
|
sendError(w, 500, err.Error()) // TODO filter ?
|
|
return
|
|
}
|
|
sendSuccess(w, "node deleted")
|
|
}
|
|
|
|
func (a *allGrouper) listNodes(w http.ResponseWriter, r *http.Request) {
|
|
|
|
a.nodeLock.RLock()
|
|
|
|
out := make(map[string]string, len(a.nodeList))
|
|
|
|
for key, value := range a.nodeList {
|
|
if value.healthy {
|
|
out[key] = "online"
|
|
} else {
|
|
out[key] = "offline"
|
|
}
|
|
}
|
|
|
|
a.nodeLock.RUnlock()
|
|
|
|
sendValue(w, struct {
|
|
Nodes map[string]string `json:"nodes"`
|
|
}{
|
|
Nodes: out,
|
|
})
|
|
}
|