1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Scavenging free pages. 6 // 7 // This file implements scavenging (the release of physical pages backing mapped 8 // memory) of free and unused pages in the heap as a way to deal with page-level 9 // fragmentation and reduce the RSS of Go applications. 10 // 11 // Scavenging in Go happens on two fronts: there's the background 12 // (asynchronous) scavenger and the heap-growth (synchronous) scavenger. 13 // 14 // The former happens on a goroutine much like the background sweeper which is 15 // soft-capped at using scavengePercent of the mutator's time, based on 16 // order-of-magnitude estimates of the costs of scavenging. The background 17 // scavenger's primary goal is to bring the estimated heap RSS of the 18 // application down to a goal. 19 // 20 // That goal is defined as: 21 // (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse 22 // 23 // Essentially, we wish to have the application's RSS track the heap goal, but 24 // the heap goal is defined in terms of bytes of objects, rather than pages like 25 // RSS. As a result, we need to take into account for fragmentation internal to 26 // spans. next_gc / last_next_gc defines the ratio between the current heap goal 27 // and the last heap goal, which tells us by how much the heap is growing and 28 // shrinking. We estimate what the heap will grow to in terms of pages by taking 29 // this ratio and multiplying it by heap_inuse at the end of the last GC, which 30 // allows us to account for this additional fragmentation. Note that this 31 // procedure makes the assumption that the degree of fragmentation won't change 32 // dramatically over the next GC cycle. Overestimating the amount of 33 // fragmentation simply results in higher memory use, which will be accounted 34 // for by the next pacing up date. 
Underestimating the fragmentation however 35 // could lead to performance degradation. Handling this case is not within the 36 // scope of the scavenger. Situations where the amount of fragmentation balloons 37 // over the course of a single GC cycle should be considered pathologies, 38 // flagged as bugs, and fixed appropriately. 39 // 40 // An additional factor of retainExtraPercent is added as a buffer to help ensure 41 // that there's more unscavenged memory to allocate out of, since each allocation 42 // out of scavenged memory incurs a potentially expensive page fault. 43 // 44 // The goal is updated after each GC and the scavenger's pacing parameters 45 // (which live in mheap_) are updated to match. The pacing parameters work much 46 // like the background sweeping parameters. The parameters define a line whose 47 // horizontal axis is time and vertical axis is estimated heap RSS, and the 48 // scavenger attempts to stay below that line at all times. 49 // 50 // The synchronous heap-growth scavenging happens whenever the heap grows in 51 // size, for some definition of heap-growth. The intuition behind this is that 52 // the application had to grow the heap because existing fragments were 53 // not sufficiently large to satisfy a page-level memory allocation, so we 54 // scavenge those fragments eagerly to offset the growth in RSS that results. 55 56 package runtime 57 58 const ( 59 // The background scavenger is paced according to these parameters. 60 // 61 // scavengePercent represents the portion of mutator time we're willing 62 // to spend on scavenging in percent. 63 // 64 // scavengePageLatency is a worst-case estimate (order-of-magnitude) of 65 // the time it takes to scavenge one (regular-sized) page of memory. 66 // scavengeHugePageLatency is the same but for huge pages. 
67 // 68 // scavengePagePeriod is derived from scavengePercent and scavengePageLatency, 69 // and represents the average time between scavenging one page that we're 70 // aiming for. scavengeHugePagePeriod is the same but for huge pages. 71 // These constants are core to the scavenge pacing algorithm. 72 scavengePercent = 1 // 1% 73 scavengePageLatency = 10e3 // 10µs 74 scavengeHugePageLatency = 10e3 // 10µs 75 scavengePagePeriod = scavengePageLatency / (scavengePercent / 100.0) 76 scavengeHugePagePeriod = scavengePageLatency / (scavengePercent / 100.0) 77 78 // retainExtraPercent represents the amount of memory over the heap goal 79 // that the scavenger should keep as a buffer space for the allocator. 80 // 81 // The purpose of maintaining this overhead is to have a greater pool of 82 // unscavenged memory available for allocation (since using scavenged memory 83 // incurs an additional cost), to account for heap fragmentation and 84 // the ever-changing layout of the heap. 85 retainExtraPercent = 10 86 ) 87 88 // heapRetained returns an estimate of the current heap RSS. 89 // 90 // mheap_.lock must be held or the world must be stopped. 91 func heapRetained() uint64 { 92 return memstats.heap_sys - memstats.heap_released 93 } 94 95 // gcPaceScavenger updates the scavenger's pacing, particularly 96 // its rate and RSS goal. 97 // 98 // The RSS goal is based on the current heap goal with a small overhead 99 // to accomodate non-determinism in the allocator. 100 // 101 // The pacing is based on scavengePageRate, which applies to both regular and 102 // huge pages. See that constant for more information. 103 // 104 // mheap_.lock must be held or the world must be stopped. 105 func gcPaceScavenger() { 106 // If we're called before the first GC completed, disable scavenging. 107 // We never scavenge before the 2nd GC cycle anyway (we don't have enough 108 // information about the heap yet) so this is fine, and avoids a fault 109 // or garbage data later. 
110 if memstats.last_next_gc == 0 { 111 mheap_.scavengeBytesPerNS = 0 112 return 113 } 114 // Compute our scavenging goal. 115 goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc) 116 retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio) 117 // Add retainExtraPercent overhead to retainedGoal. This calculation 118 // looks strange but the purpose is to arrive at an integer division 119 // (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8) 120 // that also avoids the overflow from a multiplication. 121 retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0)) 122 // Align it to a physical page boundary to make the following calculations 123 // a bit more exact. 124 retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1) 125 126 // Represents where we are now in the heap's contribution to RSS in bytes. 127 // 128 // Guaranteed to always be a multiple of physPageSize on systems where 129 // physPageSize <= pageSize since we map heap_sys at a rate larger than 130 // any physPageSize and released memory in multiples of the physPageSize. 131 // 132 // However, certain functions recategorize heap_sys as other stats (e.g. 133 // stack_sys) and this happens in multiples of pageSize, so on systems 134 // where physPageSize > pageSize the calculations below will not be exact. 135 // Generally this is OK since we'll be off by at most one regular 136 // physical page. 137 retainedNow := heapRetained() 138 139 // If we're already below our goal, publish the goal in case it changed 140 // then disable the background scavenger. 141 if retainedNow <= retainedGoal { 142 mheap_.scavengeRetainedGoal = retainedGoal 143 mheap_.scavengeBytesPerNS = 0 144 return 145 } 146 147 // Now we start to compute the total amount of work necessary and the total 148 // amount of time we're willing to give the scavenger to complete this work. 
149 // This will involve calculating how much of the work consists of huge pages 150 // and how much consists of regular pages since the former can let us scavenge 151 // more memory in the same time. 152 totalWork := retainedNow - retainedGoal 153 154 // On systems without huge page support, all work is regular work. 155 regularWork := totalWork 156 hugeTime := uint64(0) 157 158 // On systems where we have huge pages, we want to do as much of the 159 // scavenging work as possible on huge pages, because the costs are the 160 // same per page, but we can give back more more memory in a shorter 161 // period of time. 162 if physHugePageSize != 0 { 163 // Start by computing the amount of free memory we have in huge pages 164 // in total. Trivially, this is all the huge page work we need to do. 165 hugeWork := uint64(mheap_.free.unscavHugePages) << physHugePageShift 166 167 // ...but it could turn out that there's more huge work to do than 168 // total work, so cap it at total work. This might happen for very large 169 // heaps where the additional factor of retainExtraPercent can make it so 170 // that there are free chunks of memory larger than a huge page that we don't want 171 // to scavenge. 172 if hugeWork >= totalWork { 173 hugePages := totalWork >> physHugePageShift 174 hugeWork = hugePages << physHugePageShift 175 } 176 // Everything that's not huge work is regular work. At this point we 177 // know huge work so we can calculate how much time that will take 178 // based on scavengePageRate (which applies to pages of any size). 179 regularWork = totalWork - hugeWork 180 hugeTime = (hugeWork >> physHugePageShift) * scavengeHugePagePeriod 181 } 182 // Finally, we can compute how much time it'll take to do the regular work 183 // and the total time to do all the work. 
184 regularTime := regularWork / uint64(physPageSize) * scavengePagePeriod 185 totalTime := hugeTime + regularTime 186 187 now := nanotime() 188 189 // Update all the pacing parameters in mheap with scavenge.lock held, 190 // so that scavenge.gen is kept in sync with the updated values. 191 mheap_.scavengeRetainedGoal = retainedGoal 192 mheap_.scavengeRetainedBasis = retainedNow 193 mheap_.scavengeTimeBasis = now 194 mheap_.scavengeBytesPerNS = float64(totalWork) / float64(totalTime) 195 mheap_.scavengeGen++ // increase scavenge generation 196 } 197 198 // Sleep/wait state of the background scavenger. 199 var scavenge struct { 200 lock mutex 201 g *g 202 parked bool 203 timer *timer 204 205 // Generation counter. 206 // 207 // It represents the last generation count (as defined by 208 // mheap_.scavengeGen) checked by the scavenger and is updated 209 // each time the scavenger checks whether it is on-pace. 210 // 211 // Skew between this field and mheap_.scavengeGen is used to 212 // determine whether a new update is available. 213 // 214 // Protected by mheap_.lock. 215 gen uint64 216 } 217 218 // wakeScavenger unparks the scavenger if necessary. It must be called 219 // after any pacing update. 220 // 221 // mheap_.lock and scavenge.lock must not be held. 222 func wakeScavenger() { 223 lock(&scavenge.lock) 224 if scavenge.parked { 225 // Try to stop the timer but we don't really care if we succeed. 226 // It's possible that either a timer was never started, or that 227 // we're racing with it. 228 // In the case that we're racing with there's the low chance that 229 // we experience a spurious wake-up of the scavenger, but that's 230 // totally safe. 231 stopTimer(scavenge.timer) 232 233 // Unpark the goroutine and tell it that there may have been a pacing 234 // change. 235 scavenge.parked = false 236 goready(scavenge.g, 0) 237 } 238 unlock(&scavenge.lock) 239 } 240 241 // scavengeSleep attempts to put the scavenger to sleep for ns. 
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
//
// Returns false if awoken early (i.e. true means a complete sleep).
func scavengeSleep(ns int64) bool {
	lock(&scavenge.lock)

	// First check if there's a pending update.
	// If there is one, don't bother sleeping.
	var hasUpdate bool
	systemstack(func() {
		// scavenge.gen is protected by mheap_.lock, so take it before
		// comparing against the published mheap_.scavengeGen.
		lock(&mheap_.lock)
		hasUpdate = mheap_.scavengeGen != scavenge.gen
		unlock(&mheap_.lock)
	})
	if hasUpdate {
		unlock(&scavenge.lock)
		return false
	}

	// Set the timer.
	//
	// This must happen here instead of inside gopark
	// because we can't close over any variables without
	// failing escape analysis.
	now := nanotime()
	scavenge.timer.when = now + ns
	startTimer(scavenge.timer)

	// Mark ourselves as asleep and go to sleep. The timer's callback
	// (set in bgscavenge) calls wakeScavenger, which unparks us.
	scavenge.parked = true
	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)

	// Return true if we completed the full sleep.
	return (nanotime() - now) >= ns
}

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	scavenge.g = getg()

	lock(&scavenge.lock)
	scavenge.parked = true

	scavenge.timer = new(timer)
	scavenge.timer.f = func(_ interface{}, _ uintptr) {
		wakeScavenger()
	}

	// Signal the creator that setup is done, then park until the first
	// pacing update wakes us via wakeScavenger.
	c <- 1
	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)

	// Parameters for sleeping.
	//
	// If we end up doing more work than we need, we should avoid spinning
	// until we have more work to do: instead, we know exactly how much time
	// until more work will need to be done, so we sleep.
	//
	// We should avoid sleeping for less than minSleepNS because Gosched()
	// overheads among other things will work out better in that case.
	//
	// There's no reason to set a maximum on sleep time because we'll always
	// get woken up earlier if there's any kind of update that could change
	// the scavenger's pacing.
	//
	// retryDelayNS tracks how much to sleep next time we fail to do any
	// useful work.
	const minSleepNS = int64(100 * 1000) // 100 µs

	retryDelayNS := minSleepNS

	for {
		released := uintptr(0) // bytes released this iteration
		park := false          // true if pacing says there's nothing to do
		ttnext := int64(0)     // ns until the pacing line next demands work

		// Run on the system stack since we grab the heap lock,
		// and a stack growth with the heap lock means a deadlock.
		systemstack(func() {
			lock(&mheap_.lock)

			// Update the last generation count that the scavenger has handled.
			scavenge.gen = mheap_.scavengeGen

			// If background scavenging is disabled or if there's no work to do just park.
			retained := heapRetained()
			if mheap_.scavengeBytesPerNS == 0 || retained <= mheap_.scavengeRetainedGoal {
				unlock(&mheap_.lock)
				park = true
				return
			}

			// Calculate how big we want the retained heap to be
			// at this point in time.
			//
			// The formula is for that of a line, y = b - mx
			// We want y (want),
			//   m = scavengeBytesPerNS (> 0)
			//   x = time between scavengeTimeBasis and now
			//   b = scavengeRetainedBasis
			rate := mheap_.scavengeBytesPerNS
			tdist := nanotime() - mheap_.scavengeTimeBasis
			rdist := uint64(rate * float64(tdist))
			// NOTE(review): this subtraction is unsigned; if rdist ever
			// exceeds scavengeRetainedBasis (a long gap since the last
			// pacing update), want wraps to a huge value and the
			// "retained > want" check below goes false, presumably making
			// the scavenger idle until the next pacing update — confirm
			// this is the intended degradation mode.
			want := mheap_.scavengeRetainedBasis - rdist

			// If we're above the line, scavenge to get below the
			// line.
			if retained > want {
				released = mheap_.scavengeLocked(uintptr(retained - want))
			}
			unlock(&mheap_.lock)

			// If we over-scavenged a bit, calculate how much time it'll
			// take at the current rate for us to make that up. We definitely
			// won't have any work to do until at least that amount of time
			// passes.
			if released > uintptr(retained-want) {
				extra := released - uintptr(retained-want)
				ttnext = int64(float64(extra) / rate)
			}
		})

		if park {
			lock(&scavenge.lock)
			scavenge.parked = true
			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
			continue
		}

		if debug.gctrace > 0 {
			if released > 0 {
				print("scvg: ", released>>20, " MB released\n")
			}
			print("scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
		}

		if released == 0 {
			// If we were unable to release anything this may be because there's
			// no free memory available to scavenge. Go to sleep and try again.
			if scavengeSleep(retryDelayNS) {
				// If we successfully slept through the delay, back off exponentially.
				retryDelayNS *= 2
			}
			continue
		}
		// We made progress; reset the retry backoff.
		retryDelayNS = minSleepNS

		if ttnext > 0 && ttnext > minSleepNS {
			// If there's an appreciable amount of time until the next scavenging
			// goal, just sleep. We'll get woken up if anything changes and this
			// way we avoid spinning.
			scavengeSleep(ttnext)
			continue
		}

		// Give something else a chance to run, no locks are held.
		Gosched()
	}
}