minijail: Support setting syscall table with PR_ALT_SYSCALL

Add support for setting the syscall table for a jailed process using
prctl(PR_ALT_SYSCALL).  This adds the option '-a <table>', which
changes the jailed process's syscall table to the alt_syscall
table named <table>.  alt_syscall tables must be registered in the
kernel (see crosreview.com/312137 for an example of how this is done).

Bug: 25649436
TEST=Create a test blacklist that blocks write(2) and observe that
'minijail0 -a test -- /bin/echo hello' prints nothing to stdout.

Change-Id: Idddafa1d0b81483a594e05d9d3390d4f9ad849c6
Signed-off-by: Andrew Bresticker <abrestic@chromium.org>
diff --git a/libminijail.c b/libminijail.c
index e9dfc89..544651a 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -53,6 +53,10 @@
 # define PR_SET_SECCOMP 22
 #endif
 
+#ifndef PR_ALT_SYSCALL
+# define PR_ALT_SYSCALL 0x43724f53
+#endif
+
 /* For seccomp_filter using BPF. */
 #ifndef PR_SET_NO_NEW_PRIVS
 # define PR_SET_NO_NEW_PRIVS 38
@@ -102,6 +106,7 @@
 		int mount_tmp:1;
 		int do_init:1;
 		int pid_file:1;
+		int alt_syscall:1;
 	} flags;
 	uid_t uid;
 	gid_t gid;
@@ -116,6 +121,7 @@
 	char *pid_file_path;
 	char *uidmap;
 	char *gidmap;
+	char *alt_syscall_table;
 	struct sock_fprog *filter_prog;
 	struct mountpoint *mounts_head;
 	struct mountpoint *mounts_tail;
@@ -539,6 +545,15 @@
 	fclose(file);
 }
 
+int API minijail_use_alt_syscall(struct minijail *j, const char *table)
+{	/* NOTE(review): a repeated call leaks the previous table copy */
+	j->alt_syscall_table = strdup(table);	/* private heap copy */
+	if (!j->alt_syscall_table)
+		return -ENOMEM;	/* strdup() could not allocate the copy */
+	j->flags.alt_syscall = 1;	/* gates prctl(PR_ALT_SYSCALL) */
+	return 0;
+}
+
 struct marshal_state {
 	size_t available;
 	size_t total;
@@ -577,6 +592,10 @@
 		marshal_append(state, j->user, strlen(j->user) + 1);
 	if (j->chrootdir)
 		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
+	if (j->alt_syscall_table) {
+		marshal_append(state, j->alt_syscall_table,
+			       strlen(j->alt_syscall_table) + 1);
+	}
 	if (j->flags.seccomp_filter && j->filter_prog) {
 		struct sock_fprog *fp = j->filter_prog;
 		marshal_append(state, (char *)fp->filter,
@@ -673,6 +692,15 @@
 			goto bad_chrootdir;
 	}
 
+	if (j->alt_syscall_table) {	/* stale pointer */
+		char *alt_syscall_table = consumestr(&serialized, &length);
+		if (!alt_syscall_table)
+			goto bad_syscall_table;
+		j->alt_syscall_table = strdup(alt_syscall_table);
+		if (!j->alt_syscall_table)
+			goto bad_syscall_table;
+	}
+
 	if (j->flags.seccomp_filter && j->filter_len > 0) {
 		size_t ninstrs = j->filter_len;
 		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
@@ -720,6 +748,9 @@
 		free(j->filter_prog);
 	}
 bad_filters:
+	if (j->alt_syscall_table)
+		free(j->alt_syscall_table);
+bad_syscall_table:
 	if (j->chrootdir)
 		free(j->chrootdir);
 bad_chrootdir:
@@ -728,6 +759,7 @@
 clear_pointers:
 	j->user = NULL;
 	j->chrootdir = NULL;
+	j->alt_syscall_table = NULL;
 out:
 	return ret;
 }
@@ -1153,6 +1185,15 @@
 	}
 
 	/*
+	 * Select the specified alternate syscall table.  The table must not
+	 * block prctl(2) if we're using seccomp as well.
+	 */
+	if (j->flags.alt_syscall) {
+		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
+			pdie("prctl(PR_ALT_SYSCALL)");
+	}
+
+	/*
 	 * seccomp has to come last since it cuts off all the other
 	 * privilege-dropping syscalls :)
 	 */
@@ -1701,5 +1742,7 @@
 		free(j->user);
 	if (j->chrootdir)
 		free(j->chrootdir);
+	if (j->alt_syscall_table)
+		free(j->alt_syscall_table);
 	free(j);
 }
diff --git a/libminijail.h b/libminijail.h
index 3f00a5e..e7f24ce 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -64,6 +64,10 @@
 int minijail_write_pid_file(struct minijail *j, const char *path);
 void minijail_inherit_usergroups(struct minijail *j);
 void minijail_disable_ptrace(struct minijail *j);
+/* Changes the jailed process's syscall table to the alt_syscall table
+ * named |table|.  Returns 0 on success, -ENOMEM on allocation failure.
+ */
+int minijail_use_alt_syscall(struct minijail *j, const char *table);
 
 /* minijail_enter_chroot: enables chroot() restriction for @j
  * @j   minijail to apply restriction to
diff --git a/minijail0.1 b/minijail0.1
index 1d85385..1f3b126 100644
--- a/minijail0.1
+++ b/minijail0.1
@@ -8,6 +8,10 @@
 .PP
 Runs PROGRAM inside a sandbox.
 .TP
+\fB-a <table>\fR
+Run using the alternate syscall table named <table>.  Only available on kernels
+and architectures that support the PR_ALT_SYSCALL option of prctl(2).
+.TP
 \fB-b <src>,<dest>[,<writeable>]
 Bind-mount <src> into the chroot directory at <dest>, optionally writeable.
 .TP
diff --git a/minijail0.c b/minijail0.c
index 2d6ac1c..68c9478 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -99,6 +99,7 @@
 	       "[-m \"<uid> <loweruid> <count>[,<uid> <loweruid> <count>]\"] "
 	       "[-M \"<gid> <lowergid> <count>[,<uid> <loweruid> <count>]\"] "
 	       "<program> [args...]\n"
+	       "  -a <table>: use alternate syscall table <table>\n"
 	       "  -b:         binds <src> to <dest> in chroot. Multiple "
 	       "instances allowed\n"
 	       "  -k:         mount <src> to <dest> in chroot. Multiple "
@@ -165,7 +166,7 @@
 	if (argc > 1 && argv[1][0] != '-')
 		return 1;
 	while ((opt = getopt(argc, argv,
-			     "u:g:sS:c:C:P:b:V:f:m:M:k:e::vrGhHinpLtIU"))
+			     "u:g:sS:c:C:P:b:V:f:m:M:k:a:e::vrGhHinpLtIU"))
 	       != -1) {
 		switch (opt) {
 		case 'u':
@@ -292,6 +293,12 @@
 				exit(1);
 			}
 			break;
+		case 'a':
+			if (0 != minijail_use_alt_syscall(j, optarg)) {
+				fprintf(stderr, "Could not set alt-syscall table\n");
+				exit(1);
+			}
+			break;
 		default:
 			usage(argv[0]);
 			exit(1);