Source file src/pkg/pkg/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "runtime"
11 "unsafe"
12 )
13
14
15
// SysProcIDMap describes one entry of a user-namespace ID mapping.
// formatIDMappings renders each entry as the line
// "ContainerID HostID Size\n", which is what gets written to the child's
// uid_map / gid_map file.
type SysProcIDMap struct {
	ContainerID int // First field of the mapping line (ID as seen inside the namespace, per the field name).
	HostID      int // Second field of the mapping line (ID as seen outside the namespace, per the field name).
	Size        int // Third field of the mapping line; presumably the length of the mapped range — confirm against user_namespaces(7).
}
21
// SysProcAttr holds the Linux-specific settings that forkAndExecInChild1
// applies to the child process between the clone and the execve.
type SysProcAttr struct {
	// Chroot names a directory to chroot into.
	// NOTE(review): this field is not read in this section; the child
	// receives the path through the separate chroot argument — confirm
	// where the conversion happens.
	Chroot string
	// Credential, when non-nil, makes the child call setgroups (unless
	// suppressed), setgid and setuid with the given values.
	Credential *Credential
	// Ptrace makes the child issue ptrace(PTRACE_TRACEME) right before
	// execve. The caller is presumably expected to stay on the same OS
	// thread to act as tracer — confirm.
	Ptrace  bool
	Setsid  bool // Call setsid in the child.
	Setpgid bool // Call setpgid(0, Pgid) in the child.
	Setctty bool // Issue ioctl(Ctty, TIOCSCTTY, 1) in the child.
	Noctty  bool // Issue ioctl(0, TIOCNOTTY) in the child.
	Ctty    int  // File descriptor used by the Setctty and Foreground ioctls.
	// Foreground implies the setpgid call and additionally issues
	// ioctl(Ctty, TIOCSPGRP) with Pgid, or with the child's own pid when
	// Pgid is 0.
	Foreground bool
	Pgid       int    // Process group ID passed to setpgid when Setpgid or Foreground is set.
	Pdeathsig  Signal // If non-zero, installed via prctl(PR_SET_PDEATHSIG) in the child.
	// Cloneflags is OR'ed into the flags of the clone call.
	Cloneflags uintptr
	// Unshareflags, if non-zero, is passed to unshare in the child.
	Unshareflags uintptr
	UidMappings  []SysProcIDMap // If non-nil, written to the child's uid_map.
	GidMappings  []SysProcIDMap // If non-nil, written to the child's gid_map.
	// GidMappingsEnableSetgroups selects what is written to the child's
	// setgroups file before gid_map: "allow" when true, "deny" when false.
	// When it is false, GidMappings is non-nil and Credential.Groups is
	// empty, the child also skips its own setgroups call.
	// It has an effect only when GidMappings is non-nil.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capability numbers that the child adds to its
	// permitted and inheritable sets and then raises as ambient
	// capabilities before execve.
	AmbientCaps []uintptr
}
48
// NUL-terminated byte strings handed to mount(2) as the source ("none") and
// target ("/") when the child remounts / as private after unsharing the
// mount namespace.
var (
	none  = [5]byte{'n', 'o', 'n', 'e', 0}
	slash = [2]byte{'/', 0}
)
53
54
// Fork hooks invoked around the clone call: runtime_BeforeFork immediately
// before it, runtime_AfterFork in the parent once the clone has returned, and
// runtime_AfterForkInChild as the first thing the child does.
// They are declared without bodies, so their implementations live outside
// this file (presumably in the runtime package, going by the names — confirm).
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
58
59
60
61
62
63
64
65
66
67
68
69 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
70
71
72 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
73 if locked {
74 runtime_AfterFork()
75 }
76 if err1 != 0 {
77 return 0, err1
78 }
79
80
81 pid = int(r1)
82
83 if sys.UidMappings != nil || sys.GidMappings != nil {
84 Close(p[0])
85 var err2 Errno
86
87
88 if sys.Unshareflags&CLONE_NEWUSER == 0 {
89 if err := writeUidGidMappings(pid, sys); err != nil {
90 err2 = err.(Errno)
91 }
92 }
93 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
94 Close(p[1])
95 }
96
97 return pid, 0
98 }
99
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version before
// the capget/capset calls made for ambient capabilities.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
101
// capHeader is the header argument passed (by pointer) as the first argument
// of the capget/capset system calls.
type capHeader struct {
	version uint32 // Set to _LINUX_CAPABILITY_VERSION_3 by the caller.
	pid     int32  // Left at zero by the code in this file.
}
106
// capData holds one 32-bit word of each capability set as exchanged with
// capget/capset. A capability number selects the word via capToIndex and the
// bit within it via capToMask.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
// caps bundles the header and the two data words passed to capget/capset;
// two 32-bit words cover capability numbers 0 through 63.
type caps struct {
	hdr  capHeader
	data [2]capData
}
116
117
// capToIndex returns the index of the 32-bit capData word that holds the bit
// for capability number cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
119
120
// capToMask returns the single-bit mask for capability number cap within its
// 32-bit capData word.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := uint(cap % bitsPerWord)
	return uint32(1) << bit
}
122
123
124
125
126
127
128
129
130
131
132
133 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
134
135 const (
136 PR_CAP_AMBIENT = 0x2f
137 PR_CAP_AMBIENT_RAISE = 0x2
138 )
139
140
141
142
143
144
145
146
147 var (
148 err2 Errno
149 nextfd int
150 i int
151 caps caps
152 fd1 uintptr
153 puid, psetgroups, pgid []byte
154 uidmap, setgroups, gidmap []byte
155 )
156
157 if sys.UidMappings != nil {
158 puid = []byte("/proc/self/uid_map\000")
159 uidmap = formatIDMappings(sys.UidMappings)
160 }
161
162 if sys.GidMappings != nil {
163 psetgroups = []byte("/proc/self/setgroups\000")
164 pgid = []byte("/proc/self/gid_map\000")
165
166 if sys.GidMappingsEnableSetgroups {
167 setgroups = []byte("allow\000")
168 } else {
169 setgroups = []byte("deny\000")
170 }
171 gidmap = formatIDMappings(sys.GidMappings)
172 }
173
174
175 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
176
177
178
179
180 fd := make([]int, len(attr.Files))
181 nextfd = len(attr.Files)
182 for i, ufd := range attr.Files {
183 if nextfd < int(ufd) {
184 nextfd = int(ufd)
185 }
186 fd[i] = int(ufd)
187 }
188 nextfd++
189
190
191
192 if sys.UidMappings != nil || sys.GidMappings != nil {
193 if err := forkExecPipe(p[:]); err != nil {
194 err1 = err.(Errno)
195 return
196 }
197 }
198
199 hasRawVforkSyscall := runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "s390x"
200
201
202
203 runtime_BeforeFork()
204 locked = true
205 switch {
206 case hasRawVforkSyscall && (sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0):
207 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
208 case runtime.GOARCH == "s390x":
209 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
210 default:
211 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
212 }
213 if err1 != 0 || r1 != 0 {
214
215
216
217
218
219
220 return
221 }
222
223
224
225 runtime_AfterForkInChild()
226
227
228 if len(sys.AmbientCaps) > 0 {
229 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
230 if err1 != 0 {
231 goto childerror
232 }
233 }
234
235
236 if sys.UidMappings != nil || sys.GidMappings != nil {
237 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
238 goto childerror
239 }
240 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
241 if err1 != 0 {
242 goto childerror
243 }
244 if r1 != unsafe.Sizeof(err2) {
245 err1 = EINVAL
246 goto childerror
247 }
248 if err2 != 0 {
249 err1 = err2
250 goto childerror
251 }
252 }
253
254
255 if sys.Setsid {
256 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
257 if err1 != 0 {
258 goto childerror
259 }
260 }
261
262
263 if sys.Setpgid || sys.Foreground {
264
265 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
266 if err1 != 0 {
267 goto childerror
268 }
269 }
270
271 if sys.Foreground {
272 pgrp := int32(sys.Pgid)
273 if pgrp == 0 {
274 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
275
276 pgrp = int32(r1)
277 }
278
279
280 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
281 if err1 != 0 {
282 goto childerror
283 }
284 }
285
286
287 if sys.Unshareflags != 0 {
288 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
289 if err1 != 0 {
290 goto childerror
291 }
292
293 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
294 dirfd := int(_AT_FDCWD)
295 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
296 goto childerror
297 }
298 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
299 if err1 != 0 {
300 goto childerror
301 }
302 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
303 goto childerror
304 }
305
306 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
307 goto childerror
308 }
309 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
310 if err1 != 0 {
311 goto childerror
312 }
313 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
314 goto childerror
315 }
316 }
317
318 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
319 dirfd := int(_AT_FDCWD)
320 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
321 goto childerror
322 }
323 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
324 if err1 != 0 {
325 goto childerror
326 }
327 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
328 goto childerror
329 }
330 }
331
332
333
334
335
336
337
338
339 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
340 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
341 if err1 != 0 {
342 goto childerror
343 }
344 }
345 }
346
347
348 if chroot != nil {
349 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
350 if err1 != 0 {
351 goto childerror
352 }
353 }
354
355
356 if cred := sys.Credential; cred != nil {
357 ngroups := uintptr(len(cred.Groups))
358 groups := uintptr(0)
359 if ngroups > 0 {
360 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
361 }
362 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
363 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
364 if err1 != 0 {
365 goto childerror
366 }
367 }
368 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
369 if err1 != 0 {
370 goto childerror
371 }
372 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
373 if err1 != 0 {
374 goto childerror
375 }
376 }
377
378 if len(sys.AmbientCaps) != 0 {
379
380
381 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
382
383 if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
384 goto childerror
385 }
386
387 for _, c := range sys.AmbientCaps {
388
389
390 caps.data[capToIndex(c)].permitted |= capToMask(c)
391 caps.data[capToIndex(c)].inheritable |= capToMask(c)
392 }
393
394 if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
395 goto childerror
396 }
397
398 for _, c := range sys.AmbientCaps {
399 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
400 if err1 != 0 {
401 goto childerror
402 }
403 }
404 }
405
406
407 if dir != nil {
408 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
409 if err1 != 0 {
410 goto childerror
411 }
412 }
413
414
415 if sys.Pdeathsig != 0 {
416 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
417 if err1 != 0 {
418 goto childerror
419 }
420
421
422
423
424 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
425 if r1 != ppid {
426 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
427 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
428 if err1 != 0 {
429 goto childerror
430 }
431 }
432 }
433
434
435
436 if pipe < nextfd {
437 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
438 if err1 != 0 {
439 goto childerror
440 }
441 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
442 pipe = nextfd
443 nextfd++
444 }
445 for i = 0; i < len(fd); i++ {
446 if fd[i] >= 0 && fd[i] < int(i) {
447 if nextfd == pipe {
448 nextfd++
449 }
450 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
451 if err1 != 0 {
452 goto childerror
453 }
454 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
455 fd[i] = nextfd
456 nextfd++
457 }
458 }
459
460
461 for i = 0; i < len(fd); i++ {
462 if fd[i] == -1 {
463 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
464 continue
465 }
466 if fd[i] == int(i) {
467
468
469 _, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0)
470 if err1 != 0 {
471 goto childerror
472 }
473 continue
474 }
475
476
477 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
478 if err1 != 0 {
479 goto childerror
480 }
481 }
482
483
484
485
486
487 for i = len(fd); i < 3; i++ {
488 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
489 }
490
491
492 if sys.Noctty {
493 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
494 if err1 != 0 {
495 goto childerror
496 }
497 }
498
499
500 if sys.Setctty {
501 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
502 if err1 != 0 {
503 goto childerror
504 }
505 }
506
507
508
509
510 if sys.Ptrace {
511 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
512 if err1 != 0 {
513 goto childerror
514 }
515 }
516
517
518 _, _, err1 = RawSyscall(SYS_EXECVE,
519 uintptr(unsafe.Pointer(argv0)),
520 uintptr(unsafe.Pointer(&argv[0])),
521 uintptr(unsafe.Pointer(&envv[0])))
522
523 childerror:
524
525 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
526 for {
527 RawSyscall(SYS_EXIT, 253, 0, 0)
528 }
529 }
530
531
532 func forkExecPipe(p []int) (err error) {
533 err = Pipe2(p, O_CLOEXEC)
534
535
536 if err == ENOSYS {
537 if err = Pipe(p); err != nil {
538 return
539 }
540 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
541 return
542 }
543 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
544 }
545 return
546 }
547
548 func formatIDMappings(idMap []SysProcIDMap) []byte {
549 var data []byte
550 for _, im := range idMap {
551 data = append(data, []byte(itoa(im.ContainerID)+" "+itoa(im.HostID)+" "+itoa(im.Size)+"\n")...)
552 }
553 return data
554 }
555
556
557 func writeIDMappings(path string, idMap []SysProcIDMap) error {
558 fd, err := Open(path, O_RDWR, 0)
559 if err != nil {
560 return err
561 }
562
563 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
564 Close(fd)
565 return err
566 }
567
568 if err := Close(fd); err != nil {
569 return err
570 }
571
572 return nil
573 }
574
575
576
577
578
579 func writeSetgroups(pid int, enable bool) error {
580 sgf := "/proc/" + itoa(pid) + "/setgroups"
581 fd, err := Open(sgf, O_RDWR, 0)
582 if err != nil {
583 return err
584 }
585
586 var data []byte
587 if enable {
588 data = []byte("allow")
589 } else {
590 data = []byte("deny")
591 }
592
593 if _, err := Write(fd, data); err != nil {
594 Close(fd)
595 return err
596 }
597
598 return Close(fd)
599 }
600
601
602
603 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
604 if sys.UidMappings != nil {
605 uidf := "/proc/" + itoa(pid) + "/uid_map"
606 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
607 return err
608 }
609 }
610
611 if sys.GidMappings != nil {
612
613 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
614 return err
615 }
616 gidf := "/proc/" + itoa(pid) + "/gid_map"
617 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
618 return err
619 }
620 }
621
622 return nil
623 }
624
View as plain text