Add an option to skip remounting / as MS_PRIVATE.

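Also update the minijail0.1 man page: document -k and -K, move -t into
alphabetical order, and fix minor typos and markup. While here, tidy up
some whitespace and comment wrapping in libminijail.c.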

Bug: 27304928

Change-Id: Id5c03fef3c7906e6fe53bad130d74c895f03f730
diff --git a/libminijail.c b/libminijail.c
index bcbaed5..118e61f 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -96,6 +96,7 @@
 		int capbset_drop:1;
 		int vfs:1;
 		int enter_vfs:1;
+		int skip_remount_private:1;
 		int pids:1;
 		int ipc:1;
 		int net:1;
@@ -398,6 +399,11 @@
 	j->flags.enter_vfs = 1;
 }
 
+void API minijail_skip_remount_private(struct minijail *j)
+{
+	j->flags.skip_remount_private = 1;
+}
+
 void API minijail_namespace_pids(struct minijail *j)
 {
 	j->flags.vfs = 1;
@@ -682,16 +688,15 @@
 	char *buf;
 };
 
-void marshal_state_init(struct marshal_state *state,
-			char *buf, size_t available)
+void marshal_state_init(struct marshal_state *state, char *buf,
+			size_t available)
 {
 	state->available = available;
 	state->buf = buf;
 	state->total = 0;
 }
 
-void marshal_append(struct marshal_state *state,
-		    void *src, size_t length)
+void marshal_append(struct marshal_state *state, void *src, size_t length)
 {
 	size_t copy_len = MIN(state->available, length);
 
@@ -727,7 +732,7 @@
 	if (j->flags.seccomp_filter && j->filter_prog) {
 		struct sock_fprog *fp = j->filter_prog;
 		marshal_append(state, (char *)fp->filter,
-				fp->len * sizeof(struct sock_filter));
+			       fp->len * sizeof(struct sock_filter));
 	}
 	for (m = j->mounts_head; m; m = m->next) {
 		marshal_append(state, m->src, strlen(m->src) + 1);
@@ -1372,12 +1377,15 @@
 		if (unshare(CLONE_NEWNS))
 			pdie("unshare(vfs)");
 		/*
-		 * Remount all filesystems as private. If they are shared
-		 * new bind mounts will creep out of our namespace.
+		 * Unless asked not to, remount all filesystems as private.
+		 * If they are shared, new bind mounts will creep out of our
+		 * namespace.
 		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
 		 */
-		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
-			pdie("mount(/, private)");
+		if (!j->flags.skip_remount_private) {
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+				pdie("mount(/, private)");
+		}
 	}
 
 	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
@@ -1650,7 +1658,8 @@
 					  char *const argv[],
 					  pid_t *pchild_pid,
 					  int *pstdin_fd, int *pstdout_fd,
-					  int *pstderr_fd) {
+					  int *pstderr_fd)
+{
 	return minijail_run_internal(j, filename, argv, pchild_pid,
 				     pstdin_fd, pstdout_fd, pstderr_fd, false);
 }
@@ -1780,13 +1789,13 @@
 	 * We might hack around this by having the clone()d child (init of the
 	 * pid namespace) return directly, rather than leaving the clone()d
 	 * process hanging around to be init for the new namespace (and having
-	 * its fork()ed child return in turn), but that process would be crippled
-	 * with its libc locks potentially broken. We might try fork()ing in the
-	 * parent before we clone() to ensure that we own all the locks, but
-	 * then we have to have the forked child hanging around consuming
-	 * resources (and possibly having file descriptors / shared memory
-	 * regions / etc attached). We'd need to keep the child around to avoid
-	 * having its children get reparented to init.
+	 * its fork()ed child return in turn), but that process would be
+	 * crippled with its libc locks potentially broken. We might try
+	 * fork()ing in the parent before we clone() to ensure that we own all
+	 * the locks, but then we have to have the forked child hanging around
+	 * consuming resources (and possibly having file descriptors / shared
+	 * memory regions / etc attached). We'd need to keep the child around to
+	 * avoid having its children get reparented to init.
 	 *
 	 * TODO(ellyjones): figure out if the "forked child hanging around"
 	 * problem is fixable or not. It would be nice if we worked in this
diff --git a/libminijail.h b/libminijail.h
index 8bd8b39..49f7786 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -58,6 +58,11 @@
 void minijail_reset_signal_mask(struct minijail *j);
 void minijail_namespace_vfs(struct minijail *j);
 void minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path);
+/*
+ * This option is *dangerous* as it negates most of the functionality of
+ * minijail_namespace_vfs(). You very likely don't need this.
+ */
+void minijail_skip_remount_private(struct minijail *j);
 void minijail_namespace_ipc(struct minijail *j);
 void minijail_namespace_net(struct minijail *j);
 void minijail_namespace_enter_net(struct minijail *j, const char *ns_path);
diff --git a/minijail0.1 b/minijail0.1
index ae53ce0..8d7e188 100644
--- a/minijail0.1
+++ b/minijail0.1
@@ -1,4 +1,4 @@
-.TH MINIJAIL0 "1" "January 2012" "Chromium OS" "User Commands"
+.TH MINIJAIL0 "1" "March 2016" "Chromium OS" "User Commands"
 .SH NAME
 minijail0 \- sandbox a process
 .SH SYNOPSIS
@@ -9,11 +9,11 @@
 Runs PROGRAM inside a sandbox.
 .TP
 \fB-a <table>\fR
-Run using the alternate syscall table named <table>.  Only available on kernels
+Run using the alternate syscall table named \fItable\fR. Only available on kernels
 and architectures that support the PR_ALT_SYSCALL option of prctl(2).
 .TP
 \fB-b <src>,<dest>[,<writeable>]
-Bind-mount <src> into the chroot directory at <dest>, optionally writeable.
+Bind-mount \fIsrc\fR into the chroot directory at \fIdest\fR, optionally writeable.
 .TP
 \fB-c <caps>\fR
 Restrict capabilities to \fIcaps\fR. When used in conjunction with \fB-u\fR and
@@ -24,21 +24,16 @@
 \fBcapabilities\fR(7).
 .TP
 \fB-C <dir>\fR
-Change root (using chroot(2)) to <dir>.
+Change root (using chroot(2)) to \fIdir\fR.
 .TP
 \fB-e[file]\fR
-Enter a new network namespace, or if \fIfile\fR is specified, Enter an existing
+Enter a new network namespace, or if \fIfile\fR is specified, enter an existing
 network namespace specified by \fIfile\fR which is typically of the form
 /proc/<pid>/ns/net.
 .TP
 \fB-f <file>\fR
 Write the pid of the jailed process to \fIfile\fR.
 .TP
-\fB-t\fR
-Mounts a tmpfs filesystem on /tmp. /tmp must exist in the chroot.
-This must be used with -C. The default filesystem has a max size of 128M
-and has standard /tmp permissions (777).
-.TP
 \fB-G\fR
 Inherit all the supplementary groups of the user specified with \fB-u\fR. It
 is an error to use this option without having specified a \fBuser name\fR to
@@ -56,17 +51,25 @@
 (Other direct numbers may be specified if minijail0 is not in sync with the
  host kernel or something like 32/64-bit compatibility issues exist.)
 .TP
+\fB-k <src>,<dest>,<type>[,<flags>]\fR
+Mount \fIsrc\fR, a \fItype\fR filesystem, into the chroot directory at \fIdest\fR, with optional \fIflags\fR.
+.TP
+\fB-K\fR
+Don't mark all existing mounts as MS_PRIVATE.
+This option is \fBdangerous\fR as it negates most of the functionality of \fB-v\fR.
+You very likely don't need this.
+.TP
 \fB-l\fR
 Run inside a new IPC namespace. This option makes the program's System V IPC
 namespace independent.
 .TP
 \fB-m "<uid> <loweruid> <count>[,<uid> <loweruid> <count>]"\fR
-Set the uid mapping of a user namespace (implies \fB-pU\fR).  Same arguments as
-\fBnewuidmap(1)\fR.  Multiple mappings should be separated by ','.
+Set the uid mapping of a user namespace (implies \fB-pU\fR). Same arguments as
+\fBnewuidmap(1)\fR. Multiple mappings should be separated by ','.
 .TP
 \fB-M "<uid> <loweruid> <count>[,<uid> <loweruid> <count>]"\fR
-Set the gid mapping of a user namespace (implies \fB-pU\fR).  Same arguments as
-\fBnewgidmap(1)\fR.  Multiple mappings should be separated by ','.
+Set the gid mapping of a user namespace (implies \fB-pU\fR). Same arguments as
+\fBnewgidmap(1)\fR. Multiple mappings should be separated by ','.
 .TP
 \fB-p\fR
 Run inside a new PID namespace. This option will make it impossible for the
@@ -89,9 +92,14 @@
 .TP
 \fB-S <arch-specific seccomp_filter policy file>\fR
 Enable seccomp(2) in mode 13 which restricts the child process to a set of
-system calls defined in the policy file.  Note that system calls often change
+system calls defined in the policy file. Note that system calls often change
 names based on the architecture or mode. (uname -m is your friend.)
 .TP
+\fB-t\fR
+Mounts a tmpfs filesystem on /tmp. /tmp must exist in the chroot.
+This must be used with \fB-C\fR. The default filesystem has a max size of 128M
+and has standard /tmp permissions (777).
+.TP
 \fB-T <type>\fR
 Assume program's ELF linkage type is \fItype\fR,
 which should be either 'static' or 'dynamic'.
@@ -112,15 +120,16 @@
 the process to which they will actually apply - specifically capability use
 (since capabilities are not inherited to an exec'd process unless the exec'd
 process has POSIX file capabilities), seccomp (since we can't exec() once we're
-seccomp'd), and ptrace-disable (which is always cleared on exec().
+seccomp'd), and ptrace-disable (which is always cleared on exec()).
 
 To this end, \fBlibminijailpreload\fR is forcibly loaded into all
 dynamically-linked target programs if any of these restrictions are in effect;
 we pass the specific restrictions in an environment variable which the preloaded
 library looks for. The forcibly-loaded library then applies the restrictions
 to the newly-loaded program.
+
 .SH AUTHOR
-Written by Elly Jones (ellyjones@chromium.org)
+The Chromium OS Authors <chromiumos-dev@chromium.org>
 .SH COPYRIGHT
 Copyright \(co 2011 The Chromium OS Authors
 License BSD-like.
diff --git a/minijail0.c b/minijail0.c
index 3d648e3..f3caeac 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -101,9 +101,9 @@
 	       "  [-M \"<gid> <lowergid> <count>[,<uid> <loweruid> <count>]\"]\n"
 	       "  <program> [args...]\n"
 	       "  -a <table>: Use alternate syscall table <table>.\n"
-	       "  -b:         Binds <src> to <dest> in chroot.\n"
+	       "  -b:         Bind <src> to <dest> in chroot.\n"
 	       "              Multiple instances allowed.\n"
-	       "  -k:         Mount <src> to <dest> in chroot.\n"
+	       "  -k:         Mount <src> at <dest> in chroot.\n"
 	       "              Multiple instances allowed, flags are passed to mount(2).\n"
 	       "  -c <caps>:  Restrict caps to <caps>.\n"
 	       "  -C <dir>:   chroot(2) to <dir>.\n"
@@ -117,6 +117,7 @@
 	       "  -i:         Exit immediately after fork (do not act as init).\n"
 	       "              Not compatible with -p.\n"
 	       "  -I:         Run <program> as init (pid 1) inside a new pid namespace (implies -p).\n"
+	       "  -K:         Don't mark all existing mounts as MS_PRIVATE.\n"
 	       "  -l:         Enter new IPC namespace.\n"
 	       "  -L:         Report blocked syscalls to syslog when using seccomp filter.\n"
 	       "              Forces the following syscalls to be allowed:\n"
@@ -166,12 +167,13 @@
 	int use_seccomp_filter = 0;
 	int binding = 0;
 	int pivot_root = 0, chroot = 0;
+	int mount_ns = 0, skip_remount = 0;
 	const size_t path_max = 4096;
 	const char *filter_path;
 	if (argc > 1 && argv[1][0] != '-')
 		return 1;
 	while ((opt = getopt(argc, argv,
-			     "u:g:sS:c:C:P:b:V:f:m:M:k:a:e::T:vrGhHinplLtIU"))
+			     "u:g:sS:c:C:P:b:V:f:m:M:k:a:e::T:vrGhHinplLtIUK"))
 	       != -1) {
 		switch (opt) {
 		case 'u':
@@ -228,6 +230,10 @@
 		case 'k':
 			add_mount(j, optarg);
 			break;
+		case 'K':
+			minijail_skip_remount_private(j);
+			skip_remount = 1;
+			break;
 		case 'P':
 			if (chroot) {
 				fprintf(stderr,
@@ -254,6 +260,7 @@
 			break;
 		case 'v':
 			minijail_namespace_vfs(j);
+			mount_ns = 1;
 			break;
 		case 'V':
 			minijail_namespace_enter_vfs(j, optarg);
@@ -337,6 +344,16 @@
 	}
 
 	/*
+	 * Remounting / as MS_PRIVATE only happens when entering a new mount
+	 * namespace, so skipping it only applies in that case.
+	 */
+	if (skip_remount && !mount_ns) {
+		fprintf(stderr, "Can't skip marking mounts as MS_PRIVATE"
+				" without mount namespaces.\n");
+		exit(1);
+	}
+
+	/*
 	 * We parse seccomp filters here to make sure we've collected all
 	 * cmdline options.
 	 */