...

Source file src/runtime/os_linux.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package runtime
     6	
     7	import (
     8		"runtime/internal/sys"
     9		"unsafe"
    10	)
    11	
// mOS is declared with no fields in this file.
// NOTE(review): presumably this is the OS-specific part of m and Linux
// needs no extra per-thread state — confirm against the definition of m.
type mOS struct{}
    13	
// futex is declared without a body; its implementation lives outside this
// file. In this file it is called by futexsleep with _FUTEX_WAIT_PRIVATE
// (ts is nil or points at a timespec) and by futexwakeup with
// _FUTEX_WAKE_PRIVATE; addr2 and val3 are always nil and 0.
// futexwakeup treats a negative result as a failure.
// NOTE(review): presumably a thin wrapper around the Linux futex system
// call — confirm in the per-architecture implementation.
//go:noescape
func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32
    16	
    17	// Linux futex.
    18	//
    19	//	futexsleep(uint32 *addr, uint32 val)
    20	//	futexwakeup(uint32 *addr)
    21	//
    22	// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
    23	// Futexwakeup wakes up threads sleeping on addr.
    24	// Futexsleep is allowed to wake up spuriously.
    25	
// Values passed as the op argument of futex. The wait (0) and wake (1)
// operation codes are each OR'ed with _FUTEX_PRIVATE_FLAG.
const (
	_FUTEX_PRIVATE_FLAG = 128
	_FUTEX_WAIT_PRIVATE = 0 | _FUTEX_PRIVATE_FLAG
	_FUTEX_WAKE_PRIVATE = 1 | _FUTEX_PRIVATE_FLAG
)
    31	
    32	// Atomically,
    33	//	if(*addr == val) sleep
    34	// Might be woken up spuriously; that's allowed.
    35	// Don't sleep longer than ns; ns < 0 means forever.
    36	//go:nosplit
    37	func futexsleep(addr *uint32, val uint32, ns int64) {
    38		// Some Linux kernels have a bug where futex of
    39		// FUTEX_WAIT returns an internal error code
    40		// as an errno. Libpthread ignores the return value
    41		// here, and so can we: as it says a few lines up,
    42		// spurious wakeups are allowed.
    43		if ns < 0 {
    44			futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, nil, nil, 0)
    45			return
    46		}
    47	
    48		var ts timespec
    49		ts.setNsec(ns)
    50		futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, unsafe.Pointer(&ts), nil, 0)
    51	}
    52	
    53	// If any procs are sleeping on addr, wake up at most cnt.
    54	//go:nosplit
    55	func futexwakeup(addr *uint32, cnt uint32) {
    56		ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE_PRIVATE, cnt, nil, nil, 0)
    57		if ret >= 0 {
    58			return
    59		}
    60	
    61		// I don't know that futex wakeup can return
    62		// EAGAIN or EINTR, but if it does, it would be
    63		// safe to loop and call futex again.
    64		systemstack(func() {
    65			print("futexwakeup addr=", addr, " returned ", ret, "\n")
    66		})
    67	
    68		*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
    69	}
    70	
    71	func getproccount() int32 {
    72		// This buffer is huge (8 kB) but we are on the system stack
    73		// and there should be plenty of space (64 kB).
    74		// Also this is a leaf, so we're not holding up the memory for long.
    75		// See golang.org/issue/11823.
    76		// The suggested behavior here is to keep trying with ever-larger
    77		// buffers, but we don't have a dynamic memory allocator at the
    78		// moment, so that's a bit tricky and seems like overkill.
    79		const maxCPUs = 64 * 1024
    80		var buf [maxCPUs / 8]byte
    81		r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
    82		if r < 0 {
    83			return 1
    84		}
    85		n := int32(0)
    86		for _, v := range buf[:r] {
    87			for v != 0 {
    88				n += int32(v & 1)
    89				v >>= 1
    90			}
    91		}
    92		if n == 0 {
    93			n = 1
    94		}
    95		return n
    96	}
    97	
// Clone, the Linux rfork.
//
// The _CLONE_* constants are flag bits for the flags argument of clone.
// cloneFlags is the combination that newosproc and newosproc0 pass to
// clone when creating a thread.
const (
	_CLONE_VM             = 0x100
	_CLONE_FS             = 0x200
	_CLONE_FILES          = 0x400
	_CLONE_SIGHAND        = 0x800
	_CLONE_PTRACE         = 0x2000
	_CLONE_VFORK          = 0x4000
	_CLONE_PARENT         = 0x8000
	_CLONE_THREAD         = 0x10000
	_CLONE_NEWNS          = 0x20000
	_CLONE_SYSVSEM        = 0x40000
	_CLONE_SETTLS         = 0x80000
	_CLONE_PARENT_SETTID  = 0x100000
	_CLONE_CHILD_CLEARTID = 0x200000
	_CLONE_UNTRACED       = 0x800000
	_CLONE_CHILD_SETTID   = 0x1000000
	_CLONE_STOPPED        = 0x2000000
	_CLONE_NEWUTS         = 0x4000000
	_CLONE_NEWIPC         = 0x8000000

	cloneFlags = _CLONE_VM | /* share memory */
		_CLONE_FS | /* share cwd, etc */
		_CLONE_FILES | /* share fd table */
		_CLONE_SIGHAND | /* share sig handler table */
		_CLONE_SYSVSEM | /* share SysV semaphore undo lists (see issue #20763) */
		_CLONE_THREAD /* revisit - okay for now */
)
   126	
// clone is declared without a body; its implementation lives outside this
// file. Callers in this file pass cloneFlags as flags, the high end of the
// new thread's stack as stk, the m and g for the thread (or nil for both)
// as mp and gp, and the entry function as fn. A negative result is
// treated as failure; newosproc reports its negation as the errno.
//go:noescape
func clone(flags int32, stk, mp, gp, fn unsafe.Pointer) int32
   129	
   130	// May run with m.p==nil, so write barriers are not allowed.
   131	//go:nowritebarrier
   132	func newosproc(mp *m) {
   133		stk := unsafe.Pointer(mp.g0.stack.hi)
   134		/*
   135		 * note: strace gets confused if we use CLONE_PTRACE here.
   136		 */
   137		if false {
   138			print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, " ostk=", &mp, "\n")
   139		}
   140	
   141		// Disable signals during clone, so that the new thread starts
   142		// with signals disabled. It will enable them in minit.
   143		var oset sigset
   144		sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
   145		ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
   146		sigprocmask(_SIG_SETMASK, &oset, nil)
   147	
   148		if ret < 0 {
   149			print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
   150			if ret == -_EAGAIN {
   151				println("runtime: may need to increase max user processes (ulimit -u)")
   152			}
   153			throw("newosproc")
   154		}
   155	}
   156	
   157	// Version of newosproc that doesn't require a valid G.
   158	//go:nosplit
   159	func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
   160		stack := sysAlloc(stacksize, &memstats.stacks_sys)
   161		if stack == nil {
   162			write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
   163			exit(1)
   164		}
   165		ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
   166		if ret < 0 {
   167			write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
   168			exit(1)
   169		}
   170	}
   171	
// Messages that newosproc0 writes to file descriptor 2 when stack
// allocation or thread creation fails.
// NOTE(review): presumably kept as package-level byte slices so they can
// be written without allocating — confirm.
var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
   174	
// Tags identifying entries of the auxiliary vector. sysauxv stops at
// _AT_NULL and handles _AT_PAGESZ and _AT_RANDOM itself; every tag is also
// forwarded to archauxv and vdsoauxv.
const (
	_AT_NULL   = 0  // End of vector
	_AT_PAGESZ = 6  // System physical page size
	_AT_HWCAP  = 16 // hardware capability bit vector
	_AT_RANDOM = 25 // introduced in 2.6.29
	_AT_HWCAP2 = 26 // hardware capability bit vector 2
)
   182	
// procAuxv is the NUL-terminated path that sysargs opens when no entries
// are found in the in-memory auxiliary vector.
var procAuxv = []byte("/proc/self/auxv\x00")

// addrspace_vec is the one-byte destination buffer that sysargs hands to
// mincore; its contents are not read anywhere in this file.
var addrspace_vec [1]byte

// mincore is declared without a body; its implementation lives outside
// this file. sysargs only checks whether the result is zero.
// NOTE(review): presumably wraps the mincore system call and returns a
// negative errno on failure — confirm.
func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
   188	
// sysargs locates the auxiliary vector and feeds it to sysauxv.
//
// It tries three sources in order:
//  1. the memory directly after argv and the environment pointers;
//  2. the contents of /proc/self/auxv, if sysauxv found no entries in 1;
//  3. if that file cannot be opened, a mincore-based probe that sets only
//     physPageSize.
func sysargs(argc int32, argv **byte) {
	// Start just past the argc argument pointers and the NULL that ends argv.
	n := argc + 1

	// skip over argv, envp to get to auxv
	for argv_index(argv, n) != nil {
		n++
	}

	// skip NULL separator
	n++

	// now argv+n is auxv
	// The array type is only a way to index the memory; sysauxv stops at
	// the _AT_NULL entry.
	auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
	if sysauxv(auxv[:]) != 0 {
		return
	}
	// In some situations we don't get a loader-provided
	// auxv, such as when loaded as a library on Android.
	// Fall back to /proc/self/auxv.
	fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0)
	if fd < 0 {
		// On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to
		// try using mincore to detect the physical page size.
		// mincore should return EINVAL when address is not a multiple of system page size.
		const size = 256 << 10 // size of memory region to allocate
		p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
		if err != 0 {
			return
		}
		// This n (a byte offset into the mapping) shadows the outer n.
		// Offsets are tried in doubling steps from 4 KiB; the first one
		// that mincore accepts is taken as the page size.
		var n uintptr
		for n = 4 << 10; n < size; n <<= 1 {
			err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0])
			if err == 0 {
				physPageSize = n
				break
			}
		}
		// No offset below size was accepted: use size itself.
		if physPageSize == 0 {
			physPageSize = size
		}
		munmap(p, size)
		return
	}
	// Read up to 128 words (64 tag/value pairs) of the file; the outer n
	// is reused to hold the result of read.
	var buf [128]uintptr
	n = read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf)))
	closefd(fd)
	if n < 0 {
		return
	}
	// Make sure buf is terminated, even if we didn't read
	// the whole file.
	// The terminator goes in the tag slot of the last pair.
	buf[len(buf)-2] = _AT_NULL
	sysauxv(buf[:])
}
   243	
   244	func sysauxv(auxv []uintptr) int {
   245		var i int
   246		for ; auxv[i] != _AT_NULL; i += 2 {
   247			tag, val := auxv[i], auxv[i+1]
   248			switch tag {
   249			case _AT_RANDOM:
   250				// The kernel provides a pointer to 16-bytes
   251				// worth of random data.
   252				startupRandomData = (*[16]byte)(unsafe.Pointer(val))[:]
   253	
   254			case _AT_PAGESZ:
   255				physPageSize = val
   256			}
   257	
   258			archauxv(tag, val)
   259			vdsoauxv(tag, val)
   260		}
   261		return i / 2
   262	}
   263	
// sysTHPSizePath is the NUL-terminated path that getHugePageSize opens and
// parses.
var sysTHPSizePath = []byte("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size\x00")
   265	
   266	func getHugePageSize() uintptr {
   267		var numbuf [20]byte
   268		fd := open(&sysTHPSizePath[0], 0 /* O_RDONLY */, 0)
   269		if fd < 0 {
   270			return 0
   271		}
   272		n := read(fd, noescape(unsafe.Pointer(&numbuf[0])), int32(len(numbuf)))
   273		closefd(fd)
   274		if n <= 0 {
   275			return 0
   276		}
   277		l := n - 1 // remove trailing newline
   278		v, ok := atoi(slicebytetostringtmp(numbuf[:l]))
   279		if !ok || v < 0 {
   280			v = 0
   281		}
   282		if v&(v-1) != 0 {
   283			// v is not a power of 2
   284			return 0
   285		}
   286		return uintptr(v)
   287	}
   288	
// osinit sets ncpu from getproccount and physHugePageSize from
// getHugePageSize.
func osinit() {
	ncpu = getproccount()
	physHugePageSize = getHugePageSize()
}
   293	
// urandom_dev is the NUL-terminated path that getRandomData reads from
// when startupRandomData is not set.
var urandom_dev = []byte("/dev/urandom\x00")
   295	
   296	func getRandomData(r []byte) {
   297		if startupRandomData != nil {
   298			n := copy(r, startupRandomData)
   299			extendRandom(r, n)
   300			return
   301		}
   302		fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
   303		n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
   304		closefd(fd)
   305		extendRandom(r, int(n))
   306	}
   307	
// goenvs delegates to goenvs_unix.
func goenvs() {
	goenvs_unix()
}
   311	
// Called to do synchronous initialization of Go code built with
// -buildmode=c-archive or -buildmode=c-shared.
// None of the Go runtime is initialized.
// The only work done here is the call initsig(true).
//go:nosplit
//go:nowritebarrierrec
func libpreinit() {
	initsig(true)
}
   320	
// Called to initialize a new m (including the bootstrap m).
// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
// It creates mp.gsignal with malg(32 * 1024) and points that g back at mp.
func mpreinit(mp *m) {
	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
	mp.gsignal.m = mp
}
   327	
// gettid is declared without a body; its implementation lives outside this
// file. minit stores its result in the current m's procid.
// NOTE(review): presumably returns the calling thread's kernel thread id
// — confirm.
func gettid() uint32
   329	
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, cannot allocate memory.
// It runs minitSignals and then stores the result of gettid in the
// current m's procid.
func minit() {
	minitSignals()

	// for debuggers, in case cgo created the thread
	getg().m.procid = uint64(gettid())
}
   338	
// Called from dropm to undo the effect of an minit.
// Only the signal setup is undone, via unminitSignals; procid is left as is.
//go:nosplit
func unminit() {
	unminitSignals()
}
   344	
   345	//#ifdef GOARCH_386
   346	//#define sa_handler k_sa_handler
   347	//#endif
   348	
// sigreturn, sigtramp and cgoSigtramp are declared without bodies; their
// implementations live outside this file. Here they are only used through
// funcPC in setsig: sigreturn becomes sa_restorer on 386 and amd64, and
// sigtramp (or cgoSigtramp when iscgo is set) is installed as the handler
// in place of sighandler.
func sigreturn()
func sigtramp(sig uint32, info *siginfo, ctx unsafe.Pointer)
func cgoSigtramp()
   352	
// sigaltstack is declared without a body and is not called in this file.
// NOTE(review): presumably wraps the sigaltstack system call — confirm.
//go:noescape
func sigaltstack(new, old *stackt)

// setitimer is declared without a body and is not called in this file.
// NOTE(review): presumably wraps the setitimer system call — confirm.
//go:noescape
func setitimer(mode int32, new, old *itimerval)

// rtsigprocmask is declared without a body. In this file it is called only
// by sigprocmask, which supplies the byte size of a sigset as size.
//go:noescape
func rtsigprocmask(how int32, new, old *sigset, size int32)
   361	
// sigprocmask forwards how, new and old to rtsigprocmask, adding the byte
// size of a sigset as the last argument. The size comes from the type of
// *new, so new is not dereferenced here.
//go:nosplit
//go:nowritebarrierrec
func sigprocmask(how int32, new, old *sigset) {
	rtsigprocmask(how, new, old, int32(unsafe.Sizeof(*new)))
}
   367	
// raise and raiseproc are declared without bodies and are not called in
// this file.
// NOTE(review): presumably they deliver sig to the current thread and to
// the whole process respectively — confirm.
func raise(sig uint32)
func raiseproc(sig uint32)

// sched_getaffinity is declared without a body. getproccount calls it with
// pid 0 and the byte size of buf, treats a negative result as failure, and
// otherwise uses the result as the number of bytes of buf to examine.
//go:noescape
func sched_getaffinity(pid, len uintptr, buf *byte) int32
// osyield is declared without a body and is not called in this file.
func osyield()
   374	
   375	//go:nosplit
   376	//go:nowritebarrierrec
   377	func setsig(i uint32, fn uintptr) {
   378		var sa sigactiont
   379		sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER | _SA_RESTART
   380		sigfillset(&sa.sa_mask)
   381		// Although Linux manpage says "sa_restorer element is obsolete and
   382		// should not be used". x86_64 kernel requires it. Only use it on
   383		// x86.
   384		if GOARCH == "386" || GOARCH == "amd64" {
   385			sa.sa_restorer = funcPC(sigreturn)
   386		}
   387		if fn == funcPC(sighandler) {
   388			if iscgo {
   389				fn = funcPC(cgoSigtramp)
   390			} else {
   391				fn = funcPC(sigtramp)
   392			}
   393		}
   394		sa.sa_handler = fn
   395		sigaction(i, &sa, nil)
   396	}
   397	
   398	//go:nosplit
   399	//go:nowritebarrierrec
   400	func setsigstack(i uint32) {
   401		var sa sigactiont
   402		sigaction(i, nil, &sa)
   403		if sa.sa_flags&_SA_ONSTACK != 0 {
   404			return
   405		}
   406		sa.sa_flags |= _SA_ONSTACK
   407		sigaction(i, &sa, nil)
   408	}
   409	
   410	//go:nosplit
   411	//go:nowritebarrierrec
   412	func getsig(i uint32) uintptr {
   413		var sa sigactiont
   414		sigaction(i, nil, &sa)
   415		return sa.sa_handler
   416	}
   417	
// setSignalstackSP sets the ss_sp field of a stackt.
// The store goes through a *uintptr cast of the field's address, so sp is
// written regardless of the declared type of ss_sp.
//go:nosplit
func setSignalstackSP(s *stackt, sp uintptr) {
	*(*uintptr)(unsafe.Pointer(&s.ss_sp)) = sp
}
   423	
// fixsigcode has an empty body in this file; sig is ignored.
//go:nosplit
func (c *sigctxt) fixsigcode(sig uint32) {
}
   427	
   428	// sysSigaction calls the rt_sigaction system call.
   429	//go:nosplit
   430	func sysSigaction(sig uint32, new, old *sigactiont) {
   431		if rt_sigaction(uintptr(sig), new, old, unsafe.Sizeof(sigactiont{}.sa_mask)) != 0 {
   432			// Workaround for bugs in QEMU user mode emulation.
   433			//
   434			// QEMU turns calls to the sigaction system call into
   435			// calls to the C library sigaction call; the C
   436			// library call rejects attempts to call sigaction for
   437			// SIGCANCEL (32) or SIGSETXID (33).
   438			//
   439			// QEMU rejects calling sigaction on SIGRTMAX (64).
   440			//
   441			// Just ignore the error in these case. There isn't
   442			// anything we can do about it anyhow.
   443			if sig != 32 && sig != 33 && sig != 64 {
   444				// Use system stack to avoid split stack overflow on ppc64/ppc64le.
   445				systemstack(func() {
   446					throw("sigaction failed")
   447				})
   448			}
   449		}
   450	}
   451	
// rt_sigaction is implemented in assembly.
// sysSigaction calls it with the byte size of sigactiont's sa_mask field
// as size and treats any nonzero result as failure.
//go:noescape
func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32
   455	

View as plain text