1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Scavenging free pages. 6 // 7 // This file implements scavenging (the release of physical pages backing mapped 8 // memory) of free and unused pages in the heap as a way to deal with page-level 9 // fragmentation and reduce the RSS of Go applications. 10 // 11 // Scavenging in Go happens on two fronts: there's the background 12 // (asynchronous) scavenger and the heap-growth (synchronous) scavenger. 13 // 14 // The former happens on a goroutine much like the background sweeper which is 15 // soft-capped at using scavengePercent of the mutator's time, based on 16 // order-of-magnitude estimates of the costs of scavenging. The background 17 // scavenger's primary goal is to bring the estimated heap RSS of the 18 // application down to a goal. 19 // 20 // That goal is defined as: 21 // (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse 22 // 23 // Essentially, we wish to have the application's RSS track the heap goal, but 24 // the heap goal is defined in terms of bytes of objects, rather than pages like 25 // RSS. As a result, we need to take into account for fragmentation internal to 26 // spans. next_gc / last_next_gc defines the ratio between the current heap goal 27 // and the last heap goal, which tells us by how much the heap is growing and 28 // shrinking. We estimate what the heap will grow to in terms of pages by taking 29 // this ratio and multiplying it by heap_inuse at the end of the last GC, which 30 // allows us to account for this additional fragmentation. Note that this 31 // procedure makes the assumption that the degree of fragmentation won't change 32 // dramatically over the next GC cycle. Overestimating the amount of 33 // fragmentation simply results in higher memory use, which will be accounted 34 // for by the next pacing up date. 
Underestimating the fragmentation however 35 // could lead to performance degradation. Handling this case is not within the 36 // scope of the scavenger. Situations where the amount of fragmentation balloons 37 // over the course of a single GC cycle should be considered pathologies, 38 // flagged as bugs, and fixed appropriately. 39 // 40 // An additional factor of retainExtraPercent is added as a buffer to help ensure 41 // that there's more unscavenged memory to allocate out of, since each allocation 42 // out of scavenged memory incurs a potentially expensive page fault. 43 // 44 // The goal is updated after each GC and the scavenger's pacing parameters 45 // (which live in mheap_) are updated to match. The pacing parameters work much 46 // like the background sweeping parameters. The parameters define a line whose 47 // horizontal axis is time and vertical axis is estimated heap RSS, and the 48 // scavenger attempts to stay below that line at all times. 49 // 50 // The synchronous heap-growth scavenging happens whenever the heap grows in 51 // size, for some definition of heap-growth. The intuition behind this is that 52 // the application had to grow the heap because existing fragments were 53 // not sufficiently large to satisfy a page-level memory allocation, so we 54 // scavenge those fragments eagerly to offset the growth in RSS that results. 55 56 package runtime 57 58 const ( 59 // The background scavenger is paced according to these parameters. 60 // 61 // scavengePercent represents the portion of mutator time we're willing 62 // to spend on scavenging in percent. 63 // 64 // scavengePageLatency is a worst-case estimate (order-of-magnitude) of 65 // the time it takes to scavenge one (regular-sized) page of memory. 66 // scavengeHugePageLatency is the same but for huge pages. 
67 // 68 // scavengePagePeriod is derived from scavengePercent and scavengePageLatency, 69 // and represents the average time between scavenging one page that we're 70 // aiming for. scavengeHugePagePeriod is the same but for huge pages. 71 // These constants are core to the scavenge pacing algorithm. 72 scavengePercent = 1 // 1% 73 scavengePageLatency = 10e3 // 10µs 74 scavengeHugePageLatency = 10e3 // 10µs 75 scavengePagePeriod = scavengePageLatency / (scavengePercent / 100.0) 76 scavengeHugePagePeriod = scavengePageLatency / (scavengePercent / 100.0) 77 78 // retainExtraPercent represents the amount of memory over the heap goal 79 // that the scavenger should keep as a buffer space for the allocator. 80 // 81 // The purpose of maintaining this overhead is to have a greater pool of 82 // unscavenged memory available for allocation (since using scavenged memory 83 // incurs an additional cost), to account for heap fragmentation and 84 // the ever-changing layout of the heap. 85 retainExtraPercent = 10 86 ) 87 88 // heapRetained returns an estimate of the current heap RSS. 89 // 90 // mheap_.lock must be held or the world must be stopped. 91 func heapRetained() uint64 { 92 return memstats.heap_sys - memstats.heap_released 93 } 94 95 // gcPaceScavenger updates the scavenger's pacing, particularly 96 // its rate and RSS goal. 97 // 98 // The RSS goal is based on the current heap goal with a small overhead 99 // to accomodate non-determinism in the allocator. 100 // 101 // The pacing is based on scavengePageRate, which applies to both regular and 102 // huge pages. See that constant for more information. 103 // 104 // mheap_.lock must be held or the world must be stopped. 105 func gcPaceScavenger() { 106 // If we're called before the first GC completed, disable scavenging. 107 // We never scavenge before the 2nd GC cycle anyway (we don't have enough 108 // information about the heap yet) so this is fine, and avoids a fault 109 // or garbage data later. 
110 if memstats.last_next_gc == 0 { 111 mheap_.scavengeBytesPerNS = 0 112 return 113 } 114 // Compute our scavenging goal. 115 goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc) 116 retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio) 117 // Add retainExtraPercent overhead to retainedGoal. This calculation 118 // looks strange but the purpose is to arrive at an integer division 119 // (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8) 120 // that also avoids the overflow from a multiplication. 121 retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0)) 122 // Align it to a physical page boundary to make the following calculations 123 // a bit more exact. 124 retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1) 125 126 // Represents where we are now in the heap's contribution to RSS in bytes. 127 // 128 // Guaranteed to always be a multiple of physPageSize on systems where 129 // physPageSize <= pageSize since we map heap_sys at a rate larger than 130 // any physPageSize and released memory in multiples of the physPageSize. 131 // 132 // However, certain functions recategorize heap_sys as other stats (e.g. 133 // stack_sys) and this happens in multiples of pageSize, so on systems 134 // where physPageSize > pageSize the calculations below will not be exact. 135 // Generally this is OK since we'll be off by at most one regular 136 // physical page. 137 retainedNow := heapRetained() 138 139 // If we're already below our goal, publish the goal in case it changed 140 // then disable the background scavenger. 141 if retainedNow <= retainedGoal { 142 mheap_.scavengeRetainedGoal = retainedGoal 143 mheap_.scavengeBytesPerNS = 0 144 return 145 } 146 147 // Now we start to compute the total amount of work necessary and the total 148 // amount of time we're willing to give the scavenger to complete this work. 
149 // This will involve calculating how much of the work consists of huge pages 150 // and how much consists of regular pages since the former can let us scavenge 151 // more memory in the same time. 152 totalWork := retainedNow - retainedGoal 153 154 // On systems without huge page support, all work is regular work. 155 regularWork := totalWork 156 hugeTime := uint64(0) 157 158 // On systems where we have huge pages, we want to do as much of the 159 // scavenging work as possible on huge pages, because the costs are the 160 // same per page, but we can give back more more memory in a shorter 161 // period of time. 162 if physHugePageSize != 0 { 163 // Start by computing the amount of free memory we have in huge pages 164 // in total. Trivially, this is all the huge page work we need to do. 165 hugeWork := uint64(mheap_.free.unscavHugePages) << physHugePageShift 166 167 // ...but it could turn out that there's more huge work to do than 168 // total work, so cap it at total work. This might happen for very large 169 // heaps where the additional factor of retainExtraPercent can make it so 170 // that there are free chunks of memory larger than a huge page that we don't want 171 // to scavenge. 172 if hugeWork >= totalWork { 173 hugePages := totalWork >> physHugePageShift 174 hugeWork = hugePages << physHugePageShift 175 } 176 // Everything that's not huge work is regular work. At this point we 177 // know huge work so we can calculate how much time that will take 178 // based on scavengePageRate (which applies to pages of any size). 179 regularWork = totalWork - hugeWork 180 hugeTime = (hugeWork >> physHugePageShift) * scavengeHugePagePeriod 181 } 182 // Finally, we can compute how much time it'll take to do the regular work 183 // and the total time to do all the work. 
184 regularTime := regularWork / uint64(physPageSize) * scavengePagePeriod 185 totalTime := hugeTime + regularTime 186 187 now := nanotime() 188 189 // Update all the pacing parameters in mheap with scavenge.lock held, 190 // so that scavenge.gen is kept in sync with the updated values. 191 mheap_.scavengeRetainedGoal = retainedGoal 192 mheap_.scavengeRetainedBasis = retainedNow 193 mheap_.scavengeTimeBasis = now 194 mheap_.scavengeBytesPerNS = float64(totalWork) / float64(totalTime) 195 mheap_.scavengeGen++ // increase scavenge generation 196 } 197 198 // Sleep/wait state of the background scavenger. 199 var scavenge struct { 200 lock mutex 201 g *g 202 parked bool 203 timer *timer 204 205 // Generation counter. 206 // 207 // It represents the last generation count (as defined by 208 // mheap_.scavengeGen) checked by the scavenger and is updated 209 // each time the scavenger checks whether it is on-pace. 210 // 211 // Skew between this field and mheap_.scavengeGen is used to 212 // determine whether a new update is available. 213 // 214 // Protected by mheap_.lock. 215 gen uint64 216 } 217 218 // wakeScavenger unparks the scavenger if necessary. It must be called 219 // after any pacing update. 220 // 221 // mheap_.lock and scavenge.lock must not be held. 222 func wakeScavenger() { 223 lock(&scavenge.lock) 224 if scavenge.parked { 225 // Try to stop the timer but we don't really care if we succeed. 226 // It's possible that either a timer was never started, or that 227 // we're racing with it. 228 // In the case that we're racing with there's the low chance that 229 // we experience a spurious wake-up of the scavenger, but that's 230 // totally safe. 231 stopTimer(scavenge.timer) 232 233 // Unpark the goroutine and tell it that there may have been a pacing 234 // change. 235 scavenge.parked = false 236 goready(scavenge.g, 0) 237 } 238 unlock(&scavenge.lock) 239 } 240 241 // scavengeSleep attempts to put the scavenger to sleep for ns. 
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
//
// Returns false if awoken early (i.e. true means a complete sleep).
func scavengeSleep(ns int64) bool {
	lock(&scavenge.lock)

	// First check if there's a pending update.
	// If there is one, don't bother sleeping.
	var hasUpdate bool
	systemstack(func() {
		// scavenge.gen is protected by mheap_.lock, so take it before
		// comparing against the published mheap_.scavengeGen.
		lock(&mheap_.lock)
		hasUpdate = mheap_.scavengeGen != scavenge.gen
		unlock(&mheap_.lock)
	})
	if hasUpdate {
		unlock(&scavenge.lock)
		return false
	}

	// Set the timer.
	//
	// This must happen here instead of inside gopark
	// because we can't close over any variables without
	// failing escape analysis.
	now := nanotime()
	scavenge.timer.when = now + ns
	startTimer(scavenge.timer)

	// Mark ourselves as asleep and go to sleep. The timer's callback
	// (set in bgscavenge) calls wakeScavenger, which unparks us.
	scavenge.parked = true
	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)

	// Return true if we completed the full sleep.
	return (nanotime() - now) >= ns
}

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	scavenge.g = getg()

	lock(&scavenge.lock)
	scavenge.parked = true

	scavenge.timer = new(timer)
	scavenge.timer.f = func(_ interface{}, _ uintptr) {
		wakeScavenger()
	}

	// Signal the creator that setup is done, then park until the first
	// pacing update wakes us via wakeScavenger.
	c <- 1
	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)

	// Parameters for sleeping.
	//
	// If we end up doing more work than we need, we should avoid spinning
	// until we have more work to do: instead, we know exactly how much time
	// until more work will need to be done, so we sleep.
	//
	// We should avoid sleeping for less than minSleepNS because Gosched()
	// overheads among other things will work out better in that case.
	//
	// There's no reason to set a maximum on sleep time because we'll always
	// get woken up earlier if there's any kind of update that could change
	// the scavenger's pacing.
	//
	// retryDelayNS tracks how much to sleep next time we fail to do any
	// useful work.
	const minSleepNS = int64(100 * 1000) // 100 µs

	retryDelayNS := minSleepNS

	for {
		released := uintptr(0) // bytes released this iteration
		park := false          // true if pacing says there's nothing to do
		ttnext := int64(0)     // ns until the pacing line next demands work

		// Run on the system stack since we grab the heap lock,
		// and a stack growth with the heap lock means a deadlock.
		systemstack(func() {
			lock(&mheap_.lock)

			// Update the last generation count that the scavenger has handled.
			scavenge.gen = mheap_.scavengeGen

			// If background scavenging is disabled or if there's no work to do just park.
			retained := heapRetained()
			if mheap_.scavengeBytesPerNS == 0 || retained <= mheap_.scavengeRetainedGoal {
				unlock(&mheap_.lock)
				park = true
				return
			}

			// Calculate how big we want the retained heap to be
			// at this point in time.
			//
			// The formula is for that of a line, y = b - mx
			// We want y (want),
			//   m = scavengeBytesPerNS (> 0)
			//   x = time between scavengeTimeBasis and now
			//   b = scavengeRetainedBasis
			rate := mheap_.scavengeBytesPerNS
			tdist := nanotime() - mheap_.scavengeTimeBasis
			rdist := uint64(rate * float64(tdist))
			// NOTE(review): this subtraction is unsigned; if rdist ever
			// exceeds scavengeRetainedBasis (a long gap since the last
			// pacing update), want wraps to a huge value and the
			// "retained > want" check below goes false, presumably making
			// the scavenger idle until the next pacing update — confirm
			// this is the intended degradation mode.
			want := mheap_.scavengeRetainedBasis - rdist

			// If we're above the line, scavenge to get below the
			// line.
			if retained > want {
				released = mheap_.scavengeLocked(uintptr(retained - want))
			}
			unlock(&mheap_.lock)

			// If we over-scavenged a bit, calculate how much time it'll
			// take at the current rate for us to make that up. We definitely
			// won't have any work to do until at least that amount of time
			// passes.
			if released > uintptr(retained-want) {
				extra := released - uintptr(retained-want)
				ttnext = int64(float64(extra) / rate)
			}
		})

		if park {
			lock(&scavenge.lock)
			scavenge.parked = true
			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
			continue
		}

		if debug.gctrace > 0 {
			if released > 0 {
				print("scvg: ", released>>20, " MB released\n")
			}
			print("scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
		}

		if released == 0 {
			// If we were unable to release anything this may be because there's
			// no free memory available to scavenge. Go to sleep and try again.
			if scavengeSleep(retryDelayNS) {
				// If we successfully slept through the delay, back off exponentially.
				retryDelayNS *= 2
			}
			continue
		}
		// We made progress; reset the retry backoff.
		retryDelayNS = minSleepNS

		if ttnext > 0 && ttnext > minSleepNS {
			// If there's an appreciable amount of time until the next scavenging
			// goal, just sleep. We'll get woken up if anything changes and this
			// way we avoid spinning.
			scavengeSleep(ttnext)
			continue
		}

		// Give something else a chance to run, no locks are held.
		Gosched()
	}
}