package agent

import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"runtime"
	"strconv"
	"strings"
	"sync"

	"go.opencensus.io/trace"

	"github.com/sirupsen/logrus"
)

const (
	Mem1MB = 1024 * 1024
	Mem1GB = 1024 * 1024 * 1024

	// Assume 2GB RAM on non-linux systems
	DefaultNonLinuxMemory = 2048 * Mem1MB
)

var CapacityFull = errors.New("max capacity reached")

// ResourceTracker is a simple resource (memory, cpu, disk, etc.) tracker for scheduling.
// TODO: add disk and network IO in the future
type ResourceTracker interface {
	// WaitAsyncResource returns a channel that will send once when there seem to be sufficient
	// resource levels to run an async task; it is up to the implementer to define the policy here.
	WaitAsyncResource(ctx context.Context) chan struct{}

	// GetResourceToken returns a channel to wait for a resource token on. If the provided context is
	// canceled, the channel will never receive anything. If it is not possible to ever fulfill this
	// request, the channel will also never receive anything (use IsResourcePossible). Otherwise, once
	// a resource token becomes available for the provided resource parameters, it is sent exactly once
	// on the returned channel. The channel is never closed.
	// If isNB is set, the resource check is done without blocking, and a token carrying an error is
	// sent when capacity is not currently available.
	// If isAsync is set, the async-specific allocation policy is applied (e.g. the sync-only reserve
	// area is never handed to async requests). Memory is expected to be provided in MB units.
	GetResourceToken(ctx context.Context, memory, cpuQuota uint64, isAsync, isNB bool) <-chan ResourceToken

	// IsResourcePossible returns whether it's possible to fulfill the requested resources on this
	// machine. It must be called before GetResourceToken or GetResourceToken may hang.
	// Memory is expected to be provided in MB units.
	IsResourcePossible(memory, cpuQuota uint64, isAsync bool) bool

	// GetResourceTokenWaiterCount returns the number of waiters blocked on the condition variable
	// waiting for a resource token.
	GetResourceTokenWaiterCount() uint64
}

type resourceTracker struct {
	// cond protects access to the ram/cpu variables below
	cond *sync.Cond
	// ramSyncTotal is the total usable memory for sync functions
	ramSyncTotal uint64
	// ramSyncUsed is ram reserved for running sync containers including hot/idle
	ramSyncUsed uint64
	// ramAsyncTotal is the total usable memory for async + sync functions
	ramAsyncTotal uint64
	// ramAsyncUsed is ram reserved for running async + sync containers including hot/idle
	ramAsyncUsed uint64
	// ramAsyncHWMark is the async-area memory usage at which the agent stops dequeuing async jobs
	ramAsyncHWMark uint64
	// cpuSyncTotal is the total usable cpu for sync functions
	cpuSyncTotal uint64
	// cpuSyncUsed is cpu reserved for running sync containers including hot/idle
	cpuSyncUsed uint64
	// cpuAsyncTotal is the total usable cpu for async + sync functions
	cpuAsyncTotal uint64
	// cpuAsyncUsed is cpu reserved for running async + sync containers including hot/idle
	cpuAsyncUsed uint64
	// cpuAsyncHWMark is the async-area cpu usage at which the agent stops dequeuing async jobs
	cpuAsyncHWMark uint64
	// number of waiters waiting for a token blocked on the condition variable
	tokenWaiterCount uint64
}

func NewResourceTracker(cfg *AgentConfig) ResourceTracker {
	obj := &resourceTracker{
		cond: sync.NewCond(new(sync.Mutex)),
	}
	obj.initializeMemory(cfg)
	obj.initializeCPU(cfg)
	return obj
}

type ResourceToken interface {
	// Close must be called by any thread that receives a token.
	io.Closer
	Error() error
}
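// Illustrative sketch only, not the agent's own flow (there the token is handed off to
// launch, which closes it). It shows the expected call sequence for the interfaces above,
// assuming the caller owns the token; the function name and parameters are hypothetical.
// IsResourcePossible must be consulted first, since GetResourceToken never sends for a
// request that can never fit on this machine.
func exampleAcquireToken(ctx context.Context, rt ResourceTracker, memMB, cpuQuota uint64) error {
	if !rt.IsResourcePossible(memMB, cpuQuota, false) {
		return CapacityFull
	}
	select {
	case tok := <-rt.GetResourceToken(ctx, memMB, cpuQuota, false, false):
		defer tok.Close()
		if err := tok.Error(); err != nil {
			return err
		}
		// ... run the work while holding the token ...
		return nil
	case <-ctx.Done():
		// the token channel never closes; bail out on context cancellation
		return ctx.Err()
	}
}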
type resourceToken struct {
	once      sync.Once
	err       error
	decrement func()
}

func (t *resourceToken) Error() error {
	return t.err
}

func (t *resourceToken) Close() error {
	t.once.Do(func() {
		t.decrement()
	})
	return nil
}

func (a *resourceTracker) isResourceAvailableLocked(memory uint64, cpuQuota uint64, isAsync bool) bool {
	asyncAvailMem := a.ramAsyncTotal - a.ramAsyncUsed
	syncAvailMem := a.ramSyncTotal - a.ramSyncUsed
	asyncAvailCPU := a.cpuAsyncTotal - a.cpuAsyncUsed
	syncAvailCPU := a.cpuSyncTotal - a.cpuSyncUsed

	// Sync functions may steal from the async pool; async functions are restricted to the async pool.
	if isAsync {
		return asyncAvailMem >= memory && asyncAvailCPU >= cpuQuota
	} else {
		return asyncAvailMem+syncAvailMem >= memory && asyncAvailCPU+syncAvailCPU >= cpuQuota
	}
}

// is this request possible to meet at all? If not, fail fast
func (a *resourceTracker) IsResourcePossible(memory uint64, cpuQuota uint64, isAsync bool) bool {
	memory = memory * Mem1MB
	if isAsync {
		return memory <= a.ramAsyncTotal && cpuQuota <= a.cpuAsyncTotal
	} else {
		return memory <= a.ramSyncTotal+a.ramAsyncTotal && cpuQuota <= a.cpuSyncTotal+a.cpuAsyncTotal
	}
}

// GetResourceTokenWaiterCount returns the number of waiters blocked on the condition
// variable waiting for a resource token.
func (a *resourceTracker) GetResourceTokenWaiterCount() uint64 {
	var waiters uint64
	a.cond.L.Lock()
	waiters = a.tokenWaiterCount
	a.cond.L.Unlock()
	return waiters
}

func (a *resourceTracker) allocResourcesLocked(memory, cpuQuota uint64, isAsync bool) ResourceToken {
	var asyncMem, syncMem uint64
	var asyncCPU, syncCPU uint64

	if isAsync {
		// async uses the async pool only
		asyncMem = memory
		asyncCPU = cpuQuota
	} else {
		// sync drains the sync-only reserve first, then spills the remainder into the async pool
		syncMem = minUint64(a.ramSyncTotal-a.ramSyncUsed, memory)
		syncCPU = minUint64(a.cpuSyncTotal-a.cpuSyncUsed, cpuQuota)
		asyncMem = memory - syncMem
		asyncCPU = cpuQuota - syncCPU
	}

	a.ramAsyncUsed += asyncMem
	a.ramSyncUsed += syncMem
	a.cpuAsyncUsed += asyncCPU
	a.cpuSyncUsed += syncCPU

	return &resourceToken{decrement: func() {
		a.cond.L.Lock()
		a.ramAsyncUsed -= asyncMem
		a.ramSyncUsed -= syncMem
		a.cpuAsyncUsed -= asyncCPU
		a.cpuSyncUsed -= syncCPU
		a.cond.L.Unlock()
		// WARNING: yes, we wake up everyone, even async waiters, when only the sync pool has space,
		// but the cost of this spurious wake up is unlikely to impact performance much. Simpler
		// to use one cond variable for the time being.
		a.cond.Broadcast()
	}}
}

func (a *resourceTracker) getResourceTokenNB(memory uint64, cpuQuota uint64, isAsync bool) ResourceToken {
	if !a.IsResourcePossible(memory, cpuQuota, isAsync) {
		return &resourceToken{decrement: func() {}, err: CapacityFull}
	}

	memory = memory * Mem1MB

	var t ResourceToken
	a.cond.L.Lock()
	if !a.isResourceAvailableLocked(memory, cpuQuota, isAsync) {
		t = &resourceToken{decrement: func() {}, err: CapacityFull}
	} else {
		t = a.allocResourcesLocked(memory, cpuQuota, isAsync)
	}
	a.cond.L.Unlock()
	return t
}

func (a *resourceTracker) getResourceTokenNBChan(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken {
	ctx, span := trace.StartSpan(ctx, "agent_get_resource_token_nbio_chan")
	ch := make(chan ResourceToken)

	go func() {
		defer span.End()
		t := a.getResourceTokenNB(memory, cpuQuota, isAsync)
		select {
		case ch <- t:
		case <-ctx.Done():
			// if we can't send b/c nobody is waiting anymore, need to decrement here
			t.Close()
		}
	}()

	return ch
}
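// Worked example of allocResourcesLocked's pool accounting (illustrative numbers, not
// derived from any particular host): suppose the sync-only reserve is ramSyncTotal = 2 GB
// and the shared pool is ramAsyncTotal = 8 GB, both currently unused.
//   - An async request for 3 GB must fit entirely in the async pool:
//     ramAsyncUsed becomes 3 GB and ramSyncUsed stays 0.
//   - A sync request for 3 GB drains the sync reserve first and spills the remainder into
//     the async pool: ramSyncUsed becomes 2 GB and ramAsyncUsed grows by 1 GB.
// Closing the token subtracts exactly the amounts it reserved from each pool and then
// broadcasts to wake any waiters.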
// The received token should be passed directly to launch (unconditionally);
// launch will close this token (i.e. the receiver should not call Close).
func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync, isNB bool) <-chan ResourceToken {
	if isNB {
		return a.getResourceTokenNBChan(ctx, memory, cpuQuota, isAsync)
	}

	ch := make(chan ResourceToken)

	if !a.IsResourcePossible(memory, cpuQuota, isAsync) {
		// return the channel, but never send anything.
		return ch
	}

	c := a.cond
	isWaiting := false
	memory = memory * Mem1MB

	// if we find a resource token, shut down the goroutine waiting on ctx completion.
	// alternatively, if the ctx is done, wake up the cond loop.
	ctx, cancel := context.WithCancel(ctx)
	go func() {
		<-ctx.Done()
		c.L.Lock()
		if isWaiting {
			c.Broadcast()
		}
		c.L.Unlock()
	}()

	ctx, span := trace.StartSpan(ctx, "agent_get_resource_token")

	go func() {
		defer span.End()
		defer cancel()

		c.L.Lock()
		isWaiting = true
		for !a.isResourceAvailableLocked(memory, cpuQuota, isAsync) && ctx.Err() == nil {
			a.tokenWaiterCount++
			c.Wait()
			a.tokenWaiterCount--
		}
		isWaiting = false

		if ctx.Err() != nil {
			c.L.Unlock()
			return
		}

		t := a.allocResourcesLocked(memory, cpuQuota, isAsync)
		c.L.Unlock()

		select {
		case ch <- t:
		case <-ctx.Done():
			// if we can't send b/c nobody is waiting anymore, need to decrement here
			t.Close()
		}
	}()

	return ch
}

// WaitAsyncResource will send a signal on the returned channel when the RAM and CPU in use
// in the async area are below the high water marks.
func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
	ch := make(chan struct{}, 1)
	isWaiting := false
	c := a.cond

	// if we find a resource token, shut down the goroutine waiting on ctx completion.
	// alternatively, if the ctx is done, wake up the cond loop.
	ctx, cancel := context.WithCancel(ctx)
	go func() {
		<-ctx.Done()
		c.L.Lock()
		if isWaiting {
			c.Broadcast()
		}
		c.L.Unlock()
	}()

	ctx, span := trace.StartSpan(ctx, "agent_wait_async_resource")

	go func() {
		defer span.End()
		defer cancel()

		c.L.Lock()
		isWaiting = true
		for (a.ramAsyncUsed >= a.ramAsyncHWMark || a.cpuAsyncUsed >= a.cpuAsyncHWMark) && ctx.Err() == nil {
			c.Wait()
		}
		isWaiting = false
		c.L.Unlock()

		if ctx.Err() == nil {
			ch <- struct{}{}
		}
	}()

	return ch
}

func minUint64(a, b uint64) uint64 {
	if a <= b {
		return a
	}
	return b
}

func maxUint64(a, b uint64) uint64 {
	if a >= b {
		return a
	}
	return b
}

func clampUint64(val, min, max uint64) uint64 {
	val = minUint64(val, max)
	val = maxUint64(val, min)
	return val
}
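// Illustrative sketch of how a caller might drive WaitAsyncResource above (assumed caller
// pattern, not part of this file): an async dequeue loop blocks until the async pool drops
// below its high water marks before pulling the next job. The dequeue parameter is a
// hypothetical callback.
func exampleAsyncDequeueLoop(ctx context.Context, rt ResourceTracker, dequeue func(context.Context) error) {
	for ctx.Err() == nil {
		select {
		case <-rt.WaitAsyncResource(ctx):
			// async pool usage is below the high water marks; safe to pull one job
			if err := dequeue(ctx); err != nil {
				logrus.WithError(err).Error("dequeue failed")
			}
		case <-ctx.Done():
			return
		}
	}
}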
func (a *resourceTracker) initializeCPU(cfg *AgentConfig) {
	var maxSyncCPU, maxAsyncCPU, cpuAsyncHWMark uint64

	// Use all available CPU from the go runtime on non-linux systems. We ignore
	// non-linux container implementations and their CPU limits, if any.
	// (This is also the default if we cannot determine limits from proc or sysfs.)
	totalCPU := uint64(runtime.NumCPU() * 1000)
	availCPU := totalCPU

	if runtime.GOOS == "linux" {
		// Why do we prefer /proc/cpuinfo on Linux instead of just using runtime.NumCPU?
		// Because NumCPU is based on sched_getaffinity, and we prefer to check the cgroup,
		// which is more likely to be the same cgroup used by the container runtime.
		numCPU, err := checkProcCPU()
		if err != nil {
			logrus.WithError(err).Error("Error checking for CPU, falling back to runtime CPU count.")
		} else {
			totalCPU = 1000 * numCPU
			availCPU = totalCPU
		}

		// Clamp further if cgroup CFS quota/period limits are in place
		cgroupCPU := checkCgroupCPU()
		if cgroupCPU > 0 {
			availCPU = minUint64(availCPU, cgroupCPU)
		}

		// TODO: check the cgroup cpuset to clamp this further. We might be restricted to
		// a subset of CPUs. (eg. /sys/fs/cgroup/cpuset/cpuset.effective_cpus)
		// TODO: skip CPU headroom for ourselves for now
	}

	// now, based on cfg, further clamp the calculated values
	if cfg != nil && cfg.MaxTotalCPU != 0 {
		availCPU = minUint64(cfg.MaxTotalCPU, availCPU)
	}

	logrus.WithFields(logrus.Fields{
		"totalCPU": totalCPU,
		"availCPU": availCPU,
	}).Info("available cpu")

	// 20% of CPU for the sync-only reserve
	maxSyncCPU = uint64(availCPU * 2 / 10)
	maxAsyncCPU = availCPU - maxSyncCPU
	cpuAsyncHWMark = maxAsyncCPU * 8 / 10

	logrus.WithFields(logrus.Fields{
		"cpuSync":        maxSyncCPU,
		"cpuAsync":       maxAsyncCPU,
		"cpuAsyncHWMark": cpuAsyncHWMark,
	}).Info("sync and async cpu reservations")

	if maxSyncCPU == 0 || maxAsyncCPU == 0 {
		logrus.Fatal("Cannot get the proper CPU information to size server")
	}
	if maxSyncCPU+maxAsyncCPU < 1000 {
		logrus.Warn("Severely Limited CPU: cpuSync + cpuAsync < 1000m (1 CPU)")
	} else if maxAsyncCPU < 1000 {
		logrus.Warn("Severely Limited CPU: cpuAsync < 1000m (1 CPU)")
	}

	a.cpuAsyncHWMark = cpuAsyncHWMark
	a.cpuSyncTotal = maxSyncCPU
	a.cpuAsyncTotal = maxAsyncCPU
}

func (a *resourceTracker) initializeMemory(cfg *AgentConfig) {
	var maxSyncMemory, maxAsyncMemory, ramAsyncHWMark uint64

	availMemory := uint64(DefaultNonLinuxMemory)

	if runtime.GOOS == "linux" {
		// system wide available memory
		totalMemory, err := checkProcMem()
		if err != nil {
			logrus.WithError(err).Fatal("Cannot get the proper memory information to size server.")
		}
		availMemory = totalMemory

		// cgroup limit restriction on memory usage
		cGroupLimit, err := checkCgroupMem()
		if err != nil {
			logrus.WithError(err).Error("Error checking for cgroup memory limits, falling back to available host memory.")
		} else {
			availMemory = minUint64(cGroupLimit, availMemory)
		}

		// clamp the available memory by head room (for docker, ourselves, other processes)
		headRoom, err := getMemoryHeadRoom(availMemory, cfg)
		if err != nil {
			logrus.WithError(err).Fatal("Out of memory")
		}
		availMemory = availMemory - headRoom

		logrus.WithFields(logrus.Fields{
			"totalMemory": totalMemory,
			"headRoom":    headRoom,
			"cgroupLimit": cGroupLimit,
		}).Info("available memory")
	}

	// now, based on cfg, further clamp the calculated values
	if cfg != nil && cfg.MaxTotalMemory != 0 {
		availMemory = minUint64(cfg.MaxTotalMemory, availMemory)
	}

	// 20% of RAM for the sync-only reserve
	maxSyncMemory = uint64(availMemory * 2 / 10)
	maxAsyncMemory = availMemory - maxSyncMemory
	ramAsyncHWMark = maxAsyncMemory * 8 / 10

	// For non-linux OS, we expect these (or their defaults) to be properly configured from the command line/env
	logrus.WithFields(logrus.Fields{
		"availMemory":    availMemory,
		"ramSync":        maxSyncMemory,
		"ramAsync":       maxAsyncMemory,
		"ramAsyncHWMark": ramAsyncHWMark,
	}).Info("sync and async ram reservations")

	if maxSyncMemory == 0 || maxAsyncMemory == 0 {
		logrus.Fatal("Cannot get the proper memory pool information to size server")
	}
	if maxSyncMemory+maxAsyncMemory < 256*Mem1MB {
		logrus.Warn("Severely Limited memory: ramSync + ramAsync < 256MB")
	} else if maxAsyncMemory < 256*Mem1MB {
		logrus.Warn("Severely Limited memory: ramAsync < 256MB")
	}

	a.ramAsyncHWMark = ramAsyncHWMark
	a.ramSyncTotal = maxSyncMemory
	a.ramAsyncTotal = maxAsyncMemory
}
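// Worked example of the memory sizing above (illustrative numbers, assuming no tighter
// cgroup limit and no pre-fork pool): on a Linux host reporting 16 GB of MemAvailable,
// the headroom computed by getMemoryHeadRoom below is 1.6 GB (10%, within the
// 256 MB - 5 GB clamp), leaving 14.4 GB. The sync-only reserve is then 20% of that
// (roughly 2.9 GB), the async pool is the remaining ~11.5 GB, and the async high water
// mark is 80% of the async pool (~9.2 GB); async dequeuing pauses once async usage
// reaches that mark.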
// headroom estimation so that we do not consume the entire RAM, if possible
func getMemoryHeadRoom(usableMemory uint64, cfg *AgentConfig) (uint64, error) {
	// take 10% of the RAM
	headRoom := uint64(usableMemory / 10)

	// TODO: improve this pre-fork calculation, we should fetch/query this
	// instead of the estimate below.
	// if the pre-fork pool is enabled, add 1 MB per pool item
	if cfg != nil && cfg.PreForkPoolSize != 0 {
		headRoom += Mem1MB * cfg.PreForkPoolSize
	}

	// TODO: improve these calculations.
	// clamp this with 256MB min -- 5GB max
	maxHeadRoom := uint64(5 * Mem1GB)
	minHeadRoom := uint64(256 * Mem1MB)

	if minHeadRoom >= usableMemory {
		return 0, fmt.Errorf("Not enough memory: %v", usableMemory)
	}

	headRoom = clampUint64(headRoom, minHeadRoom, maxHeadRoom)
	return headRoom, nil
}

func readString(fileName string) (string, error) {
	b, err := ioutil.ReadFile(fileName)
	if err != nil {
		return "", err
	}
	value := string(b)
	return strings.TrimSpace(value), nil
}

func checkCgroupMem() (uint64, error) {
	value, err := readString("/sys/fs/cgroup/memory/memory.limit_in_bytes")
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(value, 10, 64)
}

func checkCgroupCPU() uint64 {
	periodStr, err := readString("/sys/fs/cgroup/cpu/cpu.cfs_period_us")
	if err != nil {
		return 0
	}
	quotaStr, err := readString("/sys/fs/cgroup/cpu/cpu.cfs_quota_us")
	if err != nil {
		return 0
	}

	period, err := strconv.ParseUint(periodStr, 10, 64)
	if err != nil {
		logrus.Warn("Cannot parse CFS period", err)
		return 0
	}

	quota, err := strconv.ParseInt(quotaStr, 10, 64)
	if err != nil {
		logrus.Warn("Cannot parse CFS quota", err)
		return 0
	}

	if quota <= 0 || period <= 0 {
		return 0
	}

	return uint64(quota) * 1000 / period
}

var errCantReadMemInfo = errors.New("Didn't find MemAvailable in /proc/meminfo, kernel is probably < 3.14")

func checkProcMem() (uint64, error) {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		b := scanner.Text()
		if !strings.HasPrefix(b, "MemAvailable") {
			continue
		}

		// expect form:
		// MemAvailable:   1234567890 kB
		tri := strings.Fields(b)
		if len(tri) != 3 {
			return 0, fmt.Errorf("MemAvailable line has unexpected format: %v", b)
		}

		c, err := strconv.ParseUint(tri[1], 10, 64)
		if err != nil {
			return 0, fmt.Errorf("Could not parse MemAvailable: %v", b)
		}
		switch tri[2] { // convert units to bytes
		case "kB":
			c *= 1024
		case "MB":
			c *= 1024 * 1024
		default:
			return 0, fmt.Errorf("Unexpected units for MemAvailable in /proc/meminfo, need kB or MB, got: %v", tri[2])
		}

		return c, nil
	}

	return 0, errCantReadMemInfo
}

func checkProcCPU() (uint64, error) {
	f, err := os.Open("/proc/cpuinfo")
	if err != nil {
		return 0, err
	}
	defer f.Close()

	total := uint64(0)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		b := scanner.Text()
		// processor : 0
		toks := strings.Fields(b)
		if len(toks) == 3 && toks[0] == "processor" && toks[1] == ":" {
			total += 1
		}
	}

	if total == 0 {
		return 0, errors.New("Could not parse cpuinfo")
	}

	return total, nil
}
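// Worked example for the CFS conversion in checkCgroupCPU above (illustrative values):
// cpu.cfs_quota_us = 50000 with cpu.cfs_period_us = 100000 allows half a CPU per period,
// which maps to 50000 * 1000 / 100000 = 500 millicores. A quota of -1 (no limit) or an
// unreadable file yields 0, in which case initializeCPU keeps the /proc/cpuinfo based
// count unclamped.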