Merge upstream code as of chromiumos/platform2 #676219.

Merged source code from https://chromium.googlesource.com/chromiumos/platform2/+/master/minijail/
as of 67621965ac2609fb1b8ff9d98159633747890f00.

Bug: 22487289
diff --git a/CPPLINT.cfg b/CPPLINT.cfg
new file mode 100644
index 0000000..51ff339
--- /dev/null
+++ b/CPPLINT.cfg
@@ -0,0 +1 @@
+exclude_files=.*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..75ea871
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,62 @@
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+include common.mk
+
+LIBDIR = lib
+PRELOADNAME = libminijailpreload.so
+PRELOADPATH = \"/$(LIBDIR)/$(PRELOADNAME)\"
+CPPFLAGS += -DPRELOADPATH="$(PRELOADPATH)"
+
+ifneq ($(HAVE_SECUREBITS_H),no)
+CPPFLAGS += -DHAVE_SECUREBITS_H
+endif
+ifneq ($(USE_seccomp),yes)
+CPPFLAGS += -DUSE_SECCOMP_SOFTFAIL
+endif
+
+all: CC_BINARY(minijail0) CC_LIBRARY(libminijail.so) \
+		CC_LIBRARY(libminijailpreload.so)
+
+# TODO(jorgelo): convert to TEST().
+tests: CC_BINARY(libminijail_unittest) CC_BINARY(syscall_filter_unittest)
+
+CC_BINARY(minijail0): LDLIBS += -lcap -ldl
+CC_BINARY(minijail0): libsyscalls.gen.o libminijail.o syscall_filter.o \
+		signal.o bpf.o util.o elfparse.o minijail0.o
+clean: CLEAN(minijail0)
+
+CC_LIBRARY(libminijail.so): LDLIBS += -lcap
+CC_LIBRARY(libminijail.so): libminijail.o syscall_filter.o signal.o bpf.o \
+		util.o libsyscalls.gen.o
+clean: CLEAN(libminijail.so)
+
+CC_BINARY(libminijail_unittest): LDLIBS += -lcap
+CC_BINARY(libminijail_unittest): libminijail_unittest.o libminijail.o \
+		syscall_filter.o signal.o bpf.o util.o libsyscalls.gen.o
+clean: CLEAN(libminijail_unittest)
+
+CC_LIBRARY(libminijailpreload.so): LDLIBS += -lcap -ldl
+CC_LIBRARY(libminijailpreload.so): libminijailpreload.o libminijail.o \
+		libsyscalls.gen.o syscall_filter.o signal.o bpf.o util.o
+clean: CLEAN(libminijailpreload.so)
+
+CC_BINARY(syscall_filter_unittest): syscall_filter_unittest.o syscall_filter.o \
+		bpf.o util.o libsyscalls.gen.o
+clean: CLEAN(syscall_filter_unittest)
+
+libsyscalls.gen.o: CPPFLAGS += -I$(SRC)
+
+libsyscalls.gen.o.depends: libsyscalls.gen.c
+
+# Only regenerate libsyscalls.gen.c if the Makefile or header changes.
+# NOTE! This will not detect if the file is not appropriate for the target.
+# TODO(jorgelo): fix generation when 'CC' env variable is not set.
+libsyscalls.gen.c: $(SRC)/Makefile $(SRC)/libsyscalls.h
+	@printf "Generating target-arch specific $@... "
+	$(QUIET)$(SRC)/gen_syscalls.sh $@
+	@printf "done.\n"
+clean: CLEAN(libsyscalls.gen.c)
+
+$(eval $(call add_object_rules,libsyscalls.gen.o,CC,c,CFLAGS))
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..1b8effe
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,3 @@
+set noparent
+jorgelo@chromium.org
+wad@chromium.org
diff --git a/PRESUBMIT.cfg b/PRESUBMIT.cfg
new file mode 100644
index 0000000..17f9b03
--- /dev/null
+++ b/PRESUBMIT.cfg
@@ -0,0 +1,4 @@
+[Hook Overrides]
+
+# We are using Linux-style indentation with tabs
+tab_check: false
diff --git a/arch.h b/arch.h
new file mode 100644
index 0000000..cfe6122
--- /dev/null
+++ b/arch.h
@@ -0,0 +1,63 @@
+/* arch.h
+ * Copyright 2014 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * ARCH_NR #define's.
+ */
+
+#ifndef ARCH_H
+#define ARCH_H
+
+#include <linux/audit.h>
+
+#if defined(__i386__)
+#  define ARCH_NR AUDIT_ARCH_I386
+#elif defined(__x86_64__)
+#  define ARCH_NR AUDIT_ARCH_X86_64
+#elif defined(__arm__)
+/*
+ * <linux/audit.h> includes <linux/elf-em.h>, which does not define EM_ARM.
+ * <linux/elf.h> only includes <asm/elf.h> if we're in the kernel.
+ */
+#  ifndef EM_ARM
+#    define EM_ARM 40
+#  endif
+#  define ARCH_NR AUDIT_ARCH_ARM
+#elif defined(__hppa__)
+#  define ARCH_NR AUDIT_ARCH_PARISC
+#elif defined(__ia64__)
+#  define ARCH_NR AUDIT_ARCH_IA64
+#elif defined(__mips__)
+#  if defined(__mips64)
+#    if defined(__MIPSEB__)
+#      define ARCH_NR AUDIT_ARCH_MIPS64
+#    else
+#      define ARCH_NR AUDIT_ARCH_MIPSEL64
+#    endif
+#  else
+#    if defined(__MIPSEB__)
+#      define ARCH_NR AUDIT_ARCH_MIPS
+#    else
+#      define ARCH_NR AUDIT_ARCH_MIPSEL
+#    endif
+#  endif
+#elif defined(__powerpc64__)
+#  define ARCH_NR AUDIT_ARCH_PPC64
+#elif defined(__powerpc__)
+#  define ARCH_NR AUDIT_ARCH_PPC
+#elif defined(__s390x__)
+#  define ARCH_NR AUDIT_ARCH_S390X
+#elif defined(__s390__)
+#  define ARCH_NR AUDIT_ARCH_S390
+#elif defined(__sparc__) /* like every other branch, must define ARCH_NR */
+#  if defined(__arch64__)
+#    define ARCH_NR AUDIT_ARCH_SPARC64
+#  else
+#    define ARCH_NR AUDIT_ARCH_SPARC
+#  endif
+#else
+#  error "AUDIT_ARCH value unavailable"
+#endif
+
+#endif /* ARCH_H */
diff --git a/bpf.c b/bpf.c
new file mode 100644
index 0000000..8eacab8
--- /dev/null
+++ b/bpf.c
@@ -0,0 +1,271 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bpf.h"
+
+/* Architecture validation: load arch, skip the kill if it is ARCH_NR. */
+size_t bpf_validate_arch(struct sock_filter *filter)
+{
+	size_t n = 0;
+
+	n += set_bpf_stmt(&filter[n], BPF_LD+BPF_W+BPF_ABS, arch_nr);
+	n += set_bpf_jump(&filter[n], BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, SKIP, NEXT);
+	n += set_bpf_ret_kill(&filter[n]);
+	return n;
+}
+
+/* Syscall number eval: A == |nr| falls into ALLOW, otherwise skips it. */
+size_t bpf_allow_syscall(struct sock_filter *filter, int nr)
+{
+	size_t n = 0;
+	n += set_bpf_jump(&filter[n], BPF_JMP+BPF_JEQ+BPF_K, nr, NEXT, SKIP);
+	n += set_bpf_ret_allow(&filter[n]);
+	return n;
+}
+
+/* A == |nr| falls into a jump to label |id|; else that jump is skipped. */
+size_t bpf_allow_syscall_args(struct sock_filter *filter,
+		int nr, unsigned int id)
+{
+	size_t n = set_bpf_jump(filter, BPF_JMP+BPF_JEQ+BPF_K, nr, NEXT, SKIP);
+	n += set_bpf_jump_lbl(&filter[n], id);
+	return n;
+}
+
+/* Size-aware arg loaders: 1 insn on 32 bits, 4 insns on 64 bits. */
+#if defined(BITS32)
+size_t bpf_load_arg(struct sock_filter *filter, int argidx)
+{
+	/* A single load of the LO_ARG word is all that is emitted. */
+	return set_bpf_stmt(filter, BPF_LD+BPF_W+BPF_ABS, LO_ARG(argidx));
+}
+#elif defined(BITS64)
+size_t bpf_load_arg(struct sock_filter *filter, int argidx)
+{
+	/* Each half is loaded and then stored to its own scratch slot. */
+	set_bpf_stmt(&filter[0], BPF_LD+BPF_W+BPF_ABS, LO_ARG(argidx));
+	set_bpf_stmt(&filter[1], BPF_ST, 0); /* lo -> M[0] */
+	set_bpf_stmt(&filter[2], BPF_LD+BPF_W+BPF_ABS, HI_ARG(argidx));
+	set_bpf_stmt(&filter[3], BPF_ST, 1); /* hi -> M[1] */
+	return BPF_LOAD_ARG_LEN;
+}
+#endif
+
+/* Size-aware equality comparison: one JEQ against the low 32 bits of |c|. */
+size_t bpf_comp_jeq32(struct sock_filter *filter, unsigned long c,
+		unsigned char jt, unsigned char jf)
+{
+	const unsigned int k = (unsigned int)(c & 0xFFFFFFFF);
+
+	return set_bpf_jump(filter, BPF_JMP+BPF_JEQ+BPF_K, k, jt, jf);
+}
+
+/*
+ * On 64 bits, we have to do two 32-bit comparisons.
+ * We jump true only when *both* halves match; a mismatch in the high half
+ * jumps over the two remaining instructions to the false target.
+ */
+#if defined(BITS64)
+size_t bpf_comp_jeq64(struct sock_filter *filter, uint64_t c,
+		unsigned char jt, unsigned char jf)
+{
+	const unsigned int hi_word = (unsigned int)(c >> 32);
+	const unsigned int lo_word = (unsigned int)(c & 0xFFFFFFFF);
+	size_t n = 0;
+
+	/* bpf_load_arg leaves |hi| in A */
+	n += bpf_comp_jeq32(&filter[n], hi_word, NEXT, SKIPN(2) + jf);
+	n += set_bpf_stmt(&filter[n], BPF_LD+BPF_MEM, 0); /* swap in |lo| */
+	n += bpf_comp_jeq32(&filter[n], lo_word, jt, jf);
+
+	return n;
+}
+#endif
+
+/* Size-aware bitwise AND: one JSET against the low 32 bits of |mask|. */
+size_t bpf_comp_jset32(struct sock_filter *filter, unsigned long mask,
+		unsigned char jt, unsigned char jf)
+{
+	const unsigned int k = (unsigned int)(mask & 0xFFFFFFFF);
+
+	return set_bpf_jump(filter, BPF_JMP+BPF_JSET+BPF_K, k, jt, jf);
+}
+
+/*
+ * On 64 bits, we have to do two 32-bit bitwise ANDs.
+ * We jump true when *either* bitwise AND is true (non-zero); a hit in the
+ * high half jumps over the two remaining instructions to the true target.
+ */
+#if defined(BITS64)
+size_t bpf_comp_jset64(struct sock_filter *filter, uint64_t mask,
+		unsigned char jt, unsigned char jf)
+{
+	const unsigned int hi_bits = (unsigned int)(mask >> 32);
+	const unsigned int lo_bits = (unsigned int)(mask & 0xFFFFFFFF);
+	size_t n = 0;
+
+	/* bpf_load_arg leaves |hi| in A */
+	n += bpf_comp_jset32(&filter[n], hi_bits, SKIPN(2) + jt, NEXT);
+	n += set_bpf_stmt(&filter[n], BPF_LD+BPF_MEM, 0); /* swap in |lo| */
+	n += bpf_comp_jset32(&filter[n], lo_bits, jt, jf);
+
+	return n;
+}
+#endif
+
+/*
+ * Allocates a block that loads arg |argidx|, compares it to |c| using |op|
+ * and jumps to |label_id|. Returns its length and stores it in |pfilter|;
+ * on failure returns 0 with *pfilter NULL. The block is not freed here.
+ */
+size_t bpf_arg_comp(struct sock_filter **pfilter,
+		int op, int argidx, unsigned long c, unsigned int label_id)
+{
+	struct sock_filter *filter, *curr_block;
+	size_t (*comp_function)(struct sock_filter *filter, unsigned long k,
+				unsigned char jt, unsigned char jf);
+	int flip = 0;
+
+	*pfilter = NULL;
+	/* Pick the jump type before allocating so a bad |op| cannot leak. */
+	switch (op) {
+	case EQ:
+		comp_function = bpf_comp_jeq;
+		break;
+	case NE:
+		comp_function = bpf_comp_jeq;
+		flip = 1;
+		break;
+	case SET:
+		comp_function = bpf_comp_jset;
+		break;
+	default:
+		return 0;
+	}
+
+	filter = calloc(BPF_ARG_COMP_LEN + 1, sizeof(struct sock_filter));
+	if (!filter)
+		return 0;
+	curr_block = filter + bpf_load_arg(filter, argidx);
+
+	/* The true branch skips, the false one falls through (NE flips). */
+	unsigned char jt = flip ? NEXT : SKIP;
+	unsigned char jf = flip ? SKIP : NEXT;
+	curr_block += comp_function(curr_block, c, jt, jf);
+	curr_block += set_bpf_jump_lbl(curr_block, label_id);
+
+	*pfilter = filter;
+	return curr_block - filter;
+}
+
+/* Prints |len|, then one line per instruction of |filter|, to stdout. */
+void dump_bpf_filter(struct sock_filter *filter, unsigned short len)
+{
+	const struct sock_filter *instr = filter;
+	int idx;
+	printf("len == %d\n", len);
+	printf("filter:\n");
+	for (idx = 0; idx < len; idx++, instr++)
+		printf("%d: \t{ code=%#x, jt=%u, jf=%u, k=%#x \t}\n",
+			idx, instr->code, instr->jt, instr->jf, instr->k);
+}
+
+/* Debug helper: dumps the filter array and length carried by |fprog|. */
+void dump_bpf_prog(struct sock_fprog *fprog)
+{
+	/* Same output as dump_bpf_filter(), unpacked from the sock_fprog. */
+	dump_bpf_filter(fprog->filter, fprog->len);
+}
+
+/* Resolves label jumps in place: 0 on success, -1 if empty, 1 on bad label. */
+int bpf_resolve_jumps(struct bpf_labels *labels,
+		struct sock_filter *filter, size_t count)
+{
+	size_t insn = count; /* not __u8: filters may exceed 256 instructions */
+	if (count < 1)
+		return -1;
+	/*
+	 * Walk it once, backwards, to build the label table and do fixups.
+	 * Since backward jumps are disallowed by BPF, this is easy.
+	 */
+	while (insn-- > 0) {
+		struct sock_filter *instr = &filter[insn];
+		if (instr->code != (BPF_JMP+BPF_JA))
+			continue;
+		switch ((instr->jt<<8)|instr->jf) {
+		case (JUMP_JT<<8)|JUMP_JF:
+			if (labels->labels[instr->k].location == 0xffffffff) {
+				fprintf(stderr, "Unresolved label: '%s'\n",
+					labels->labels[instr->k].label);
+				return 1;
+			}
+			instr->k = labels->labels[instr->k].location -
+					(insn + 1);
+			instr->jt = 0;
+			instr->jf = 0;
+			continue;
+		case (LABEL_JT<<8)|LABEL_JF:
+			if (labels->labels[instr->k].location != 0xffffffff) {
+				fprintf(stderr, "Duplicate label use: '%s'\n",
+					labels->labels[instr->k].label);
+				return 1;
+			}
+			labels->labels[instr->k].location = insn;
+			instr->k = 0; /* fall through */
+			instr->jt = 0;
+			instr->jf = 0;
+			continue;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Simple lookup table for labels: returns the id of |label|, adding it to
+ * |labels| on first use. Returns -1 if the table is full or strndup() fails.
+ */
+int bpf_label_id(struct bpf_labels *labels, const char *label)
+{
+	struct __bpf_label *begin = labels->labels, *end;
+	int id;
+
+	/* An empty table skips the loop and falls through to id 0. */
+	end = begin + labels->count;
+	for (id = 0; begin < end; ++begin, ++id) {
+		if (!strcmp(label, begin->label))
+			return id;
+	}
+
+	/* labels[] has BPF_LABELS_MAX slots; never write past the last one. */
+	if (labels->count >= BPF_LABELS_MAX)
+		return -1;
+
+	begin->label = strndup(label, MAX_BPF_LABEL_LEN);
+	if (!begin->label)
+		return -1;
+	begin->location = 0xffffffff;
+	labels->count++;
+	return id;
+}
+
+/*
+ * Free label strings.
+ * Only the first |labels->count| entries are visited; the array itself
+ * and the count are left untouched.
+ */
+void free_label_strings(struct bpf_labels *labels)
+{
+	int idx;
+
+	for (idx = 0; idx < labels->count; idx++) {
+		/* free(NULL) is a no-op, so unset entries need no guard. */
+		free((void *)labels->labels[idx].label);
+	}
+}
diff --git a/bpf.h b/bpf.h
new file mode 100644
index 0000000..7cbc5dd
--- /dev/null
+++ b/bpf.h
@@ -0,0 +1,194 @@
+/* bpf.h
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Berkeley Packet Filter functions.
+ */
+
+#ifndef BPF_H
+#define BPF_H
+
+#include <asm/bitsperlong.h>   /* for __BITS_PER_LONG */
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <stddef.h>
+#include <sys/user.h>
+
+#include "arch.h"
+
+#if __BITS_PER_LONG == 32 || defined(__ILP32__)
+#define BITS32
+#elif __BITS_PER_LONG == 64
+#define BITS64
+#endif
+
+/* Constants for comparison operators. */
+#define MIN_OPERATOR 128
+enum operator {
+	EQ = MIN_OPERATOR,
+	NE,
+	LT,
+	LE,
+	GT,
+	GE,
+	SET
+};
+
+/*
+ * BPF return values and data structures,
+ * since they're not yet in the kernel.
+ */
+#define SECCOMP_RET_KILL	0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_TRAP	0x00030000U /* return SIGSYS */
+#define SECCOMP_RET_ERRNO	0x00050000U /* return -1 and set errno */
+#define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
+
+#define SECCOMP_RET_DATA	0x0000ffffU /* mask for return value */
+
+struct seccomp_data {
+	int nr;
+	__u32 arch;
+	__u64 instruction_pointer;
+	__u64 args[6];
+};
+
+#define syscall_nr (offsetof(struct seccomp_data, nr))
+#define arch_nr (offsetof(struct seccomp_data, arch))
+
+/* Size-dependent defines. */
+#if defined(BITS32)
+/*
+ * On 32 bits, comparisons take 2 instructions: 1 for loading the argument,
+ * 1 for the actual comparison.
+ */
+#define BPF_LOAD_ARG_LEN	1U
+#define BPF_COMP_LEN		1U
+#define BPF_ARG_COMP_LEN (BPF_LOAD_ARG_LEN + BPF_COMP_LEN)
+
+#define bpf_comp_jeq bpf_comp_jeq32
+#define bpf_comp_jset bpf_comp_jset32
+
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+
+#elif defined(BITS64)
+/*
+ * On 64 bits, comparisons take 7 instructions: 4 for loading the argument,
+ * and 3 for the actual comparison.
+ */
+#define BPF_LOAD_ARG_LEN	4U
+#define BPF_COMP_LEN		3U
+#define BPF_ARG_COMP_LEN (BPF_LOAD_ARG_LEN + BPF_COMP_LEN)
+
+#define bpf_comp_jeq bpf_comp_jeq64
+#define bpf_comp_jset bpf_comp_jset64
+
+/* Load the logically correct half; libc headers may define both *_ENDIAN. */
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define LO_ARG(idx) (offsetof(struct seccomp_data, args[(idx)]))
+#define HI_ARG(idx) (offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32))
+#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define LO_ARG(idx) (offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32))
+#define HI_ARG(idx) (offsetof(struct seccomp_data, args[(idx)]))
+#else
+#error "Unknown endianness"
+#endif
+
+#else
+#error "Unknown bit width"
+
+#endif
+
+/* Common jump targets. */
+#define NEXT 0
+#define SKIP 1
+#define SKIPN(_n) (_n)
+
+/* Support for labels in BPF programs. */
+#define JUMP_JT 0xff
+#define JUMP_JF 0xff
+#define LABEL_JT 0xfe
+#define LABEL_JF 0xfe
+
+#define MAX_BPF_LABEL_LEN 32
+
+#define BPF_LABELS_MAX 256
+struct bpf_labels {
+	int count;
+	struct __bpf_label {
+		const char *label;
+		unsigned int location;
+	} labels[BPF_LABELS_MAX];
+};
+
+/* BPF instruction manipulation functions and macros. */
+static inline size_t set_bpf_instr(struct sock_filter *instr,
+	unsigned short code, unsigned int k,
+	unsigned char jt, unsigned char jf)
+{
+	/* One instruction is written, so the reported length is always 1. */
+	*instr = (struct sock_filter){
+		.code = code, .jt = jt, .jf = jf, .k = k,
+	};
+	return 1U;
+}
+
+#define set_bpf_stmt(_block, _code, _k) \
+	set_bpf_instr((_block), (_code), (_k), 0, 0)
+
+#define set_bpf_jump(_block, _code, _k, _jt, _jf) \
+	set_bpf_instr((_block), (_code), (_k), (_jt), (_jf))
+
+#define set_bpf_lbl(_block, _lbl_id) \
+	set_bpf_jump((_block), BPF_JMP+BPF_JA, (_lbl_id), \
+			LABEL_JT, LABEL_JF)
+
+#define set_bpf_jump_lbl(_block, _lbl_id) \
+	set_bpf_jump((_block), BPF_JMP+BPF_JA, (_lbl_id), \
+			JUMP_JT, JUMP_JF)
+
+#define set_bpf_ret_kill(_block) \
+	set_bpf_stmt((_block), BPF_RET+BPF_K, SECCOMP_RET_KILL)
+
+#define set_bpf_ret_trap(_block) \
+	set_bpf_stmt((_block), BPF_RET+BPF_K, SECCOMP_RET_TRAP)
+
+#define set_bpf_ret_errno(_block, _errno) \
+	set_bpf_stmt((_block), BPF_RET+BPF_K, \
+		SECCOMP_RET_ERRNO | ((_errno) & SECCOMP_RET_DATA))
+
+#define set_bpf_ret_allow(_block) \
+	set_bpf_stmt((_block), BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+
+#define bpf_load_syscall_nr(_filter) \
+	set_bpf_stmt((_filter), BPF_LD+BPF_W+BPF_ABS, syscall_nr)
+
+/* BPF label functions. */
+int bpf_resolve_jumps(struct bpf_labels *labels,
+		struct sock_filter *filter, size_t count);
+int bpf_label_id(struct bpf_labels *labels, const char *label);
+void free_label_strings(struct bpf_labels *labels);
+
+/* BPF helper functions. */
+size_t bpf_load_arg(struct sock_filter *filter, int argidx);
+size_t bpf_comp_jeq(struct sock_filter *filter, unsigned long c,
+		unsigned char jt, unsigned char jf);
+size_t bpf_comp_jset(struct sock_filter *filter, unsigned long mask,
+		unsigned char jt, unsigned char jf);
+
+/* Functions called by syscall_filter.c */
+#define ARCH_VALIDATION_LEN 3U
+#define ALLOW_SYSCALL_LEN 2U
+
+size_t bpf_arg_comp(struct sock_filter **pfilter,
+		int op, int argidx, unsigned long c, unsigned int label_id);
+size_t bpf_validate_arch(struct sock_filter *filter);
+size_t bpf_allow_syscall(struct sock_filter *filter, int nr);
+size_t bpf_allow_syscall_args(struct sock_filter *filter,
+		int nr, unsigned int id);
+
+/* Debug functions. */
+void dump_bpf_prog(struct sock_fprog *fprog);
+void dump_bpf_filter(struct sock_filter *filter, unsigned short len);
+
+#endif /* BPF_H */
diff --git a/common.mk b/common.mk
new file mode 120000
index 0000000..126e5b8
--- /dev/null
+++ b/common.mk
@@ -0,0 +1 @@
+../common-mk/common.mk
\ No newline at end of file
diff --git a/elfparse.c b/elfparse.c
new file mode 100644
index 0000000..b2b5891
--- /dev/null
+++ b/elfparse.c
@@ -0,0 +1,111 @@
+/* Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "elfparse.h"
+
+/* Returns non-zero when |buf| begins with the four ELF magic bytes. */
+int is_elf_magic(const uint8_t *buf)
+{
+	static const uint8_t magic[] = { ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3 };
+
+	return memcmp(buf, magic, sizeof(magic)) == 0;
+}
+
+/* Generates parseElf<bit>(): ELFDYNAMIC if a PT_INTERP program header exists,
+ * ELFSTATIC if none does, ELFERROR on bad input or a failed seek/read. */
+#define parseElftemplate(bit)                                                \
+ElfType parseElf ## bit(FILE *elf_file, uint8_t *pHead, int little_endian)   \
+{                                                                            \
+	Minijail_Elf ## bit ## _Ehdr *pHeader     = NULL;                    \
+	Minijail_Elf ## bit ## _Phdr pheader      = { 0 };                   \
+	uint32_t                     i            = 0;                       \
+	uint32_t                     p_type       = 0;                       \
+	                                                                     \
+	if (!elf_file || !pHead)                                             \
+		return ELFERROR;                                             \
+	                                                                     \
+	pHeader = (Minijail_Elf ## bit ## _Ehdr *)pHead;                     \
+	if (little_endian) {                                                 \
+		pHeader->e_phoff = le ## bit ## toh(pHeader->e_phoff);       \
+		pHeader->e_phentsize = le16toh(pHeader->e_phentsize);        \
+		pHeader->e_phnum = le16toh(pHeader->e_phnum);                \
+	} else {                                                             \
+		pHeader->e_phoff = be ## bit ## toh(pHeader->e_phoff);       \
+		pHeader->e_phentsize = be16toh(pHeader->e_phentsize);        \
+		pHeader->e_phnum = be16toh(pHeader->e_phnum);                \
+	}                                                                    \
+	if (pHeader->e_phentsize != sizeof(Minijail_Elf ## bit ## _Phdr))    \
+		return ELFERROR;                                             \
+	                                                                     \
+	if (fseek(elf_file, pHeader->e_phoff, SEEK_SET) != 0)                \
+		return ELFERROR;                                             \
+	                                                                     \
+	for (i = 0; i < pHeader->e_phnum; i++) {                             \
+		if (fread(&pheader, sizeof(pheader), 1, elf_file) != 1)      \
+			return ELFERROR;                                     \
+		/* p_type is in file byte order, like the e_* fields. */     \
+		p_type = little_endian ? le32toh(pheader.p_type)             \
+				       : be32toh(pheader.p_type);            \
+		if (p_type == PT_INTERP)                                     \
+			return ELFDYNAMIC;                                   \
+	}                                                                    \
+	return ELFSTATIC;                                                    \
+}
+parseElftemplate(64)
+parseElftemplate(32)
+
+/*
+ * Public function to determine the linkage of an ELF.
+ *
+ * Returns ELFERROR when |path| cannot be opened or is an ELF whose class or
+ * data encoding is not recognised, ELFDYNAMIC for anything that is not an
+ * ELF (assumed to be a script), and otherwise whatever parseElf32() or
+ * parseElf64() reports.
+ */
+ElfType get_elf_linkage(const char *path)
+{
+	ElfType linkage = ELFERROR;
+	uint8_t header[HEADERSIZE] = "";
+	FILE *fp = fopen(path, "r");
+	int lsb, msb;
+
+	/* A file that cannot be opened is reported as ELFERROR. */
+	if (!fp)
+		return linkage;
+
+	if (fread(header, 1, HEADERSIZE, fp) != HEADERSIZE) {
+		/*
+		 * The file is smaller than |HEADERSIZE| bytes.
+		 * We assume it's a short script. See below for
+		 * reasoning on scripts.
+		 */
+		linkage = ELFDYNAMIC;
+	} else if (!is_elf_magic(header)) {
+		/*
+		 * The binary is not an ELF. We assume it's a script.
+		 * We should parse the #! line and check the interpreter
+		 * to guard against static interpreters escaping the
+		 * sandbox. As minijail is only called from rootfs it was
+		 * deemed not necessary to check this. So we will just let
+		 * execve decide if this is valid.
+		 */
+		linkage = ELFDYNAMIC;
+	} else {
+		/* The parsers take 1 for little endian, 0 for big endian. */
+		lsb = (header[EI_DATA] == ELFDATA2LSB);
+		msb = (header[EI_DATA] == ELFDATA2MSB);
+		if (!lsb && !msb) {
+			/* Unknown data encoding: leave ELFERROR. */
+		} else if (header[EI_CLASS] == ELFCLASS64) {
+			linkage = parseElf64(fp, header, lsb);
+		} else if (header[EI_CLASS] == ELFCLASS32) {
+			linkage = parseElf32(fp, header, lsb);
+		}
+		/* An unrecognised EI_CLASS also leaves ELFERROR. */
+	}
+
+	fclose(fp);
+	return linkage;
+}
diff --git a/elfparse.h b/elfparse.h
new file mode 100644
index 0000000..6b65eae
--- /dev/null
+++ b/elfparse.h
@@ -0,0 +1,103 @@
+/* elfparse.h
+ * Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Elf parsing.
+ */
+
+#ifndef _ELFPARSE_H_
+#define _ELFPARSE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <elf.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <endian.h>
+#include <string.h>
+
+/*
+ * These structs come from elf.h.
+ * The versions in elf.h do not pack these structs, so
+ * portability could be an issue.
+ * The compiler could mess with alignment depending on arch,
+ * so I'm redefining them here and packing them to 1-byte alignment.
+ */
+#define EI_NIDENT (16)
+#pragma pack(push)
+#pragma pack(1)
+typedef struct
+{
+	unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+	Elf32_Half    e_type;             /* Object file type */
+	Elf32_Half    e_machine;          /* Architecture */
+	Elf32_Word    e_version;          /* Object file version */
+	Elf32_Addr    e_entry;            /* Entry point virtual address */
+	Elf32_Off     e_phoff;            /* Program header table file offset */
+	Elf32_Off     e_shoff;            /* Section header table file offset */
+	Elf32_Word    e_flags;            /* Processor-specific flags */
+	Elf32_Half    e_ehsize;           /* ELF header size in bytes */
+	Elf32_Half    e_phentsize;        /* Program header table entry size */
+	Elf32_Half    e_phnum;            /* Program header table entry count */
+	Elf32_Half    e_shentsize;        /* Section header table entry size */
+	Elf32_Half    e_shnum;            /* Section header table entry count */
+	Elf32_Half    e_shstrndx;         /* Section header string table index */
+} Minijail_Elf32_Ehdr;
+
+typedef struct
+{
+	unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+	Elf64_Half    e_type;             /* Object file type */
+	Elf64_Half    e_machine;          /* Architecture */
+	Elf64_Word    e_version;          /* Object file version */
+	Elf64_Addr    e_entry;            /* Entry point virtual address */
+	Elf64_Off     e_phoff;            /* Program header table file offset */
+	Elf64_Off     e_shoff;            /* Section header table file offset */
+	Elf64_Word    e_flags;            /* Processor-specific flags */
+	Elf64_Half    e_ehsize;           /* ELF header size in bytes */
+	Elf64_Half    e_phentsize;        /* Program header table entry size */
+	Elf64_Half    e_phnum;            /* Program header table entry count */
+	Elf64_Half    e_shentsize;        /* Section header table entry size */
+	Elf64_Half    e_shnum;            /* Section header table entry count */
+	Elf64_Half    e_shstrndx;         /* Section header string table index */
+} Minijail_Elf64_Ehdr;
+
+typedef struct
+{
+	Elf32_Word      p_type;           /* Segment type */
+	Elf32_Off       p_offset;         /* Segment file offset */
+	Elf32_Addr      p_vaddr;          /* Segment virtual address */
+	Elf32_Addr      p_paddr;          /* Segment physical address */
+	Elf32_Word      p_filesz;         /* Segment size in file */
+	Elf32_Word      p_memsz;          /* Segment size in memory */
+	Elf32_Word      p_flags;          /* Segment flags */
+	Elf32_Word      p_align;          /* Segment alignment */
+} Minijail_Elf32_Phdr;
+
+typedef struct
+{
+	Elf64_Word      p_type;           /* Segment type */
+	Elf64_Word      p_flags;          /* Segment flags */
+	Elf64_Off       p_offset;         /* Segment file offset */
+	Elf64_Addr      p_vaddr;          /* Segment virtual address */
+	Elf64_Addr      p_paddr;          /* Segment physical address */
+	Elf64_Xword     p_filesz;         /* Segment size in file */
+	Elf64_Xword     p_memsz;          /* Segment size in memory */
+	Elf64_Xword     p_align;          /* Segment alignment */
+} Minijail_Elf64_Phdr;
+#pragma pack(pop)
+/* End of definitions from elf.h */
+
+enum ElfTypeEnum { ELFERROR=0, ELFSTATIC=1, ELFDYNAMIC=2 };
+typedef enum ElfTypeEnum ElfType;
+
+/*
+ * This is the initial amount of the ELF file we try and read.
+ * It is the same value that the kernel uses (BINPRM_BUF_SIZE).
+ */
+#define HEADERSIZE  128
+
+ElfType get_elf_linkage(const char *path);
+
+#endif /* _ELFPARSE_H_ */
diff --git a/gen_syscalls.sh b/gen_syscalls.sh
new file mode 100755
index 0000000..3121b42
--- /dev/null
+++ b/gen_syscalls.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Generates a header file with a system call table made up of "name",
+# syscall_nr entries by including the build target <asm/unistd.h> and
+# emitting the list of defines.  Use of the compiler is needed to
+# dereference the actual provider of syscall definitions.
+#   E.g., asm/unistd_32.h or asm/unistd_64.h, etc.
+
+set -e
+
+if [ $# -ne 1 ] && [ $# -ne 3 ]; then
+  echo "Usage: $(basename "$0") OUTFILE"
+  echo "Usage: $(basename "$0") CC CFLAGS OUTFILE"
+  exit 1
+fi
+
+if [ $# -eq 3 ]; then
+  CC="$1"
+  shift
+  CFLAGS="$1"
+  shift
+fi
+OUTFILE="$1"
+
+# sed expression which extracts system calls that are
+# defined via asm/unistd.h.  It converts them from:
+#  #define __NR_read foo
+# to:
+# #ifdef __NR_read
+#  { "read", __NR_read },
+# #endif
+SED_MULTILINE='s/#define __(ARM_)?(NR_)([a-z0-9_]*) (.*)$/#ifdef __\1\2\3\
+{ "\1\3", __\1\2\3 },\n#endif/g p;'
+
+cat <<-EOF > "${OUTFILE}"
+/* GENERATED BY MAKEFILE */
+#include <stddef.h>
+#include <asm/unistd.h>
+#include "libsyscalls.h"
+const struct syscall_entry syscall_table[] = {
+$(echo '#include <asm/unistd.h>' | \
+  ${CC} ${CFLAGS} -dD - -E | sed -rne "${SED_MULTILINE}")
+  { NULL, -1 },
+};
+EOF
diff --git a/libminijail-private.h b/libminijail-private.h
new file mode 100644
index 0000000..eafcdf5
--- /dev/null
+++ b/libminijail-private.h
@@ -0,0 +1,86 @@
+/* libminijail-private.h
+ * Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Values shared between libminijailpreload and libminijail, but not visible to
+ * the outside world.
+ */
+
+#ifndef LIBMINIJAIL_PRIVATE_H
+#define LIBMINIJAIL_PRIVATE_H
+
+/* Explicitly declare exported functions so that -fvisibility tricks
+ * can be used for testing and minimal symbol leakage occurs.
+ */
+#define API __attribute__ ((visibility("default")))
+
+static const char *kFdEnvVar = "__MINIJAIL_FD";
+static const char *kLdPreloadEnvVar = "LD_PRELOAD";
+
+struct minijail;
+
+/* minijail_size: returns the size (in bytes) of @j if marshalled
+ * @j jail to compute size of
+ *
+ * Returns 0 on error.
+ */
+extern size_t minijail_size(const struct minijail *j);
+
+/* minijail_marshal: serializes @j to @buf
+ * @j    minijail to serialize
+ * @buf  buffer to serialize to
+ * @size size of @buf
+ *
+ * Returns 0 on success.
+ *
+ * Writes |j| to |buf| such that it can be reparsed by the same
+ * library on the same architecture.  This is meant to be used
+ * by minijail0.c and libminijailpreload.c.  minijail flags that
+ * require minijail_run() will be excluded.
+ *
+ * The marshalled data is not robust to differences between the child
+ * and parent process (personality, etc).
+ */
+extern int minijail_marshal(const struct minijail *j,
+                            char *buf,
+                            size_t size);
+
+/* minijail_unmarshal: initializes @j from @serialized
+ * @j          minijail to initialize
+ * @serialized serialized jail buffer
+ * @length     length of buffer
+ *
+ * Returns 0 on success.
+ */
+extern int minijail_unmarshal(struct minijail *j,
+                              char *serialized,
+                              size_t length);
+
+/* minijail_from_fd: builds @j from @fd
+ * @j  minijail to initialize
+ * @fd fd to initialize from
+ *
+ * Returns 0 on success.
+ */
+extern int minijail_from_fd(int fd, struct minijail *j);
+
+/* minijail_to_fd: sends @j over @fd
+ * @j  minijail to send
+ * @fd fd to send over
+ *
+ * Returns 0 on success.
+ */
+extern int minijail_to_fd(struct minijail *j, int fd);
+
+/* minijail_preexec: strips @j of all options handled by minijail_enter()
+ * @j jail to strip
+ */
+extern void minijail_preexec(struct minijail *j);
+
+/* minijail_preenter: strips @j of all options handled by minijail_run()
+ * @j jail to strip
+ */
+extern void minijail_preenter(struct minijail *j);
+
+#endif /* !LIBMINIJAIL_PRIVATE_H */
diff --git a/libminijail.c b/libminijail.c
new file mode 100644
index 0000000..0dab24d
--- /dev/null
+++ b/libminijail.c
@@ -0,0 +1,1374 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#define _BSD_SOURCE
+#define _GNU_SOURCE
+
+#include <asm/unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/capability.h>
+#include <pwd.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "libminijail.h"
+#include "libminijail-private.h"
+
+#include "signal.h"
+#include "syscall_filter.h"
+#include "util.h"
+
+#ifdef HAVE_SECUREBITS_H
+#include <linux/securebits.h>
+#else
+#define SECURE_ALL_BITS         0x15
+#define SECURE_ALL_LOCKS        (SECURE_ALL_BITS << 1)
+#endif
+
+/* Until these are reliably available in linux/prctl.h */
+#ifndef PR_SET_SECCOMP
+# define PR_SET_SECCOMP 22
+#endif
+
+/* For seccomp_filter using BPF. */
+#ifndef PR_SET_NO_NEW_PRIVS
+# define PR_SET_NO_NEW_PRIVS 38
+#endif
+#ifndef SECCOMP_MODE_FILTER
+# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
+#endif
+
+#ifdef USE_SECCOMP_SOFTFAIL
+# define SECCOMP_SOFTFAIL 1
+#else
+# define SECCOMP_SOFTFAIL 0
+#endif
+
+struct binding {
+	char *src;		/* strdup()ed source path of the bind mount */
+	char *dest;		/* strdup()ed destination; must begin with '/' */
+	int writeable;		/* zero: remounted read-only after binding */
+	struct binding *next;	/* next binding, NULL at the tail */
+};
+
+struct minijail {
+	/*
+	 * WARNING: if you add a flag here you need to make sure it's
+	 * accounted for in minijail_pre{enter|exec}() below.
+	 */
+	struct {
+		int uid:1;
+		int gid:1;
+		int caps:1;
+		int vfs:1;
+		int enter_vfs:1;
+		int pids:1;
+		int net:1;
+		int seccomp:1;
+		int readonly:1;
+		int usergroups:1;
+		int ptrace:1;
+		int no_new_privs:1;
+		int seccomp_filter:1;
+		int log_seccomp_filter:1;
+		int chroot:1;
+		int mount_tmp:1;
+	} flags;
+	uid_t uid;		/* uid to switch to when flags.uid is set */
+	gid_t gid;		/* gid to switch to when flags.gid is set */
+	gid_t usergid;		/* passwd gid of |user|, given to initgroups() */
+	char *user;		/* strdup()ed user name, or NULL */
+	uint64_t caps;		/* bit N set: capability N is kept */
+	pid_t initpid;		/* pid of the child started by minijail_run*() */
+	int mountns_fd;		/* fd opened by minijail_namespace_enter_vfs() */
+	int filter_len;		/* number of instructions in |filter_prog| */
+	int binding_count;	/* number of entries in the bindings list */
+	char *chrootdir;	/* strdup()ed chroot directory, or NULL */
+	struct sock_fprog *filter_prog;	/* compiled seccomp filter, or NULL */
+	struct binding *bindings_head;	/* first bind mount, applied first */
+	struct binding *bindings_tail;	/* last bind mount, for appending */
+};
+
+/*
+ * Clear the flags that are meant for the parent side.
+ * What stays set is either not inherited across execve(2) (e.g. capabilities)
+ * or easier to apply after execve(2) (e.g. seccomp filters).
+ */
+void minijail_preenter(struct minijail *j)
+{
+	j->flags.pids = 0;
+	j->flags.readonly = 0;
+	j->flags.enter_vfs = 0;
+	j->flags.vfs = 0;
+}
+
+/*
+ * Clear the flags that are meant for the child side.
+ * Only the flags whose effects are inherited across execve(2) are kept.
+ */
+void minijail_preexec(struct minijail *j)
+{
+	/* Saved before the wipe; |pids| was already used before this call. */
+	const int keep_vfs = j->flags.vfs;
+	const int keep_enter_vfs = j->flags.enter_vfs;
+	const int keep_readonly = j->flags.readonly;
+	/* free(NULL) is a no-op, so |user| needs no guard. */
+	free(j->user);
+	j->user = NULL;
+
+	memset(&j->flags, 0, sizeof(j->flags));
+	j->flags.vfs = keep_vfs;
+	j->flags.enter_vfs = keep_enter_vfs;
+	j->flags.readonly = keep_readonly;
+}
+
+/* Minijail API. */
+
+struct minijail API *minijail_new(void)
+{
+	return calloc(1, sizeof(struct minijail));	/* zeroed; NULL on OOM */
+}
+
+void API minijail_change_uid(struct minijail *j, uid_t uid)
+{
+	if (!uid)	/* uid 0 is rejected outright */
+		die("useless change to uid 0");
+	j->flags.uid = 1;
+	j->uid = uid;
+}
+
+void API minijail_change_gid(struct minijail *j, gid_t gid)
+{
+	if (!gid)	/* gid 0 is rejected outright */
+		die("useless change to gid 0");
+	j->flags.gid = 1;
+	j->gid = gid;
+}
+
+/* Looks up @user with getpwnam_r(3) and stores its uid, primary gid and a
+ * private copy of its name in @j.  Returns 0 on success, -ENOMEM on allocation
+ * failure, or -1 if @user is unknown (errno is *not* set in that case).
+ */
+int API minijail_change_user(struct minijail *j, const char *user)
+{
+	char *buf = NULL;
+	char *name = NULL;
+	struct passwd pw;
+	struct passwd *ppw = NULL;
+	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
+	if (sz == -1)
+		sz = 65536;	/* your guess is as good as mine... */
+
+	/* Under glibc this sysconf(3) value is the maximum buffer needed. */
+	buf = malloc(sz);
+	if (!buf)
+		return -ENOMEM;
+	getpwnam_r(user, &pw, buf, sz, &ppw);
+	/* pw's strings point into buf, but only numeric fields are read below. */
+	free(buf);
+	if (!ppw)
+		return -1;
+	/* Copy the name first so that an allocation failure leaves @j as is. */
+	name = strdup(user);
+	if (!name)
+		return -ENOMEM;
+	minijail_change_uid(j, ppw->pw_uid);
+	free(j->user);	/* drop the name stored by an earlier call, if any */
+	j->user = name;
+	j->usergid = ppw->pw_gid;
+	return 0;
+}
+
+/* Resolves @group with getgrnam_r(3) and makes @j switch to its gid.
+ * Returns 0 on success, -ENOMEM if the lookup buffer cannot be allocated,
+ * or -1 if no such group is found (errno is *not* set in that case).
+ */
+int API minijail_change_group(struct minijail *j, const char *group)
+{
+	struct group entry;
+	struct group *found = NULL;
+	char *scratch;
+	ssize_t scratch_len = sysconf(_SC_GETGR_R_SIZE_MAX);
+
+	/* glibc documents this sysconf(3) value as the maximum size needed. */
+	if (scratch_len == -1)
+		scratch_len = 65536;	/* arbitrary fallback size */
+	scratch = malloc(scratch_len);
+	if (scratch == NULL)
+		return -ENOMEM;
+	getgrnam_r(group, &entry, scratch, scratch_len, &found);
+	/*
+	 * The strings in |entry| point into |scratch| and dangle after this
+	 * free(), but only the numeric gid is read, and |found| (when set)
+	 * points at |entry| itself.
+	 */
+	free(scratch);
+	if (found == NULL)
+		return -1;
+	minijail_change_gid(j, found->gr_gid);
+	return 0;
+}
+
+void API minijail_use_seccomp(struct minijail *j)
+{
+	j->flags.seccomp = 1;	/* minijail_enter(): prctl(PR_SET_SECCOMP, 1) */
+}
+
+void API minijail_no_new_privs(struct minijail *j)
+{
+	j->flags.no_new_privs = 1;	/* PR_SET_NO_NEW_PRIVS at enter time */
+}
+
+void API minijail_use_seccomp_filter(struct minijail *j)
+{
+	j->flags.seccomp_filter = 1;	/* install |filter_prog| at enter time */
+}
+
+void API minijail_log_seccomp_filter_failures(struct minijail *j)
+{
+	j->flags.log_seccomp_filter = 1;	/* SIGSYS handler gets installed */
+}
+
+void API minijail_use_caps(struct minijail *j, uint64_t capmask)
+{
+	j->flags.caps = 1;
+	j->caps = capmask;	/* bit N set: capability N is kept */
+}
+
+void API minijail_namespace_vfs(struct minijail *j)
+{
+	j->flags.vfs = 1;	/* minijail_enter(): unshare(CLONE_NEWNS) */
+}
+
+void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
+{
+	/* O_CLOEXEC: only setns(2) needs this fd; keep it out of execve(2). */
+	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
+	if (ns_fd < 0)
+		pdie("failed to open namespace '%s'", ns_path);
+	j->mountns_fd = ns_fd;
+	j->flags.enter_vfs = 1;
+}
+
+void API minijail_namespace_pids(struct minijail *j)
+{
+	j->flags.pids = 1;
+	j->flags.vfs = 1;	/* implied: a mount namespace of our own... */
+	j->flags.readonly = 1;	/* ...in which /proc is remounted read-only */
+}
+
+void API minijail_namespace_net(struct minijail *j)
+{
+	j->flags.net = 1;	/* minijail_enter(): unshare(CLONE_NEWNET) */
+}
+
+void API minijail_remount_readonly(struct minijail *j)
+{
+	j->flags.readonly = 1;
+	j->flags.vfs = 1;	/* the remount happens in our own mount namespace */
+}
+
+void API minijail_inherit_usergroups(struct minijail *j)
+{
+	j->flags.usergroups = 1;	/* drop_ugid(): initgroups() for |user| */
+}
+
+void API minijail_disable_ptrace(struct minijail *j)
+{
+	j->flags.ptrace = 1;	/* only recorded; nothing in this file reads it */
+}
+
+int API minijail_enter_chroot(struct minijail *j, const char *dir)
+{
+	if (j->chrootdir != NULL)	/* a chroot directory is already set */
+		return -EINVAL;
+	j->chrootdir = strdup(dir);
+	if (j->chrootdir == NULL)
+		return -ENOMEM;
+	j->flags.chroot = 1;
+	return 0;
+}
+
+void API minijail_mount_tmp(struct minijail *j)
+{
+	j->flags.mount_tmp = 1;	/* minijail_enter(): tmpfs mounted on /tmp */
+}
+
+/* Queues a bind mount of @src onto @dest (which must be absolute) for @j.
+ * Returns 0 on success, -EINVAL if @dest is not absolute, -ENOMEM on OOM.
+ */
+int API minijail_bind(struct minijail *j, const char *src, const char *dest,
+		      int writeable)
+{
+	struct binding *entry;
+
+	if (dest[0] != '/')
+		return -EINVAL;
+
+	entry = calloc(1, sizeof(*entry));
+	if (entry == NULL)
+		return -ENOMEM;
+	entry->writeable = writeable;
+	entry->dest = strdup(dest);
+	entry->src = entry->dest ? strdup(src) : NULL;
+	if (entry->src == NULL) {
+		/* At most |dest| is allocated here; free(NULL) is harmless. */
+		free(entry->dest);
+		free(entry);
+		return -ENOMEM;
+	}
+
+	info("bind %s -> %s", src, dest);
+
+	/*
+	 * Bind mounts must not leak into the containing vfs namespace, so
+	 * requesting one always turns on vfs namespacing as well.
+	 */
+	minijail_namespace_vfs(j);
+
+	/* Append at the tail: bind_one() walks the list from the head. */
+	if (j->bindings_tail == NULL)
+		j->bindings_head = entry;
+	else
+		j->bindings_tail->next = entry;
+	j->bindings_tail = entry;
+	j->binding_count++;
+	return 0;
+}
+
+/* Compiles the seccomp policy file @path into a BPF program stored in @j.
+ * Dies on failure; warns and returns if seccomp is missing and soft-fail. */
+void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
+{
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
+		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
+			warn("not loading seccomp filter, seccomp not supported");
+			return;
+		}
+	}
+	FILE *file = fopen(path, "r");
+	if (!file)
+		pdie("failed to open seccomp filter file '%s'", path);
+	struct sock_fprog *fprog = malloc(sizeof(*fprog));
+	if (!fprog)
+		pdie("failed to allocate seccomp filter program");
+	if (compile_filter(file, fprog, j->flags.log_seccomp_filter))
+		die("failed to compile seccomp filter BPF program in '%s'",
+		    path);
+
+	j->filter_len = fprog->len;
+	j->filter_prog = fprog;
+	fclose(file);
+}
+
+struct marshal_state {
+	size_t available;	/* bytes still writable at |buf| */
+	size_t total;		/* bytes requested so far, written or not */
+	char *buf;		/* current write position */
+};
+
+void marshal_state_init(struct marshal_state *state,
+			char *buf, size_t available)
+{
+	state->total = 0;	/* nothing appended yet */
+	state->buf = buf;
+	state->available = available;
+}
+
+void marshal_append(struct marshal_state *state,
+		    char *src, size_t length)
+{
+	size_t writable = length;
+	if (writable > state->available)	/* clamp to the space left */
+		writable = state->available;
+	if (writable > 0) {
+		memcpy(state->buf, src, writable);
+		state->buf += writable;
+		state->available -= writable;
+	}
+	/* |total| always grows by the full length, i.e. the size required. */
+	state->total += length;
+}
+
+void minijail_marshal_helper(struct marshal_state *state,
+			     const struct minijail *j)
+{
+	struct binding *cur;
+	/* Raw struct first; minijail_unmarshal() reads the rest in this order. */
+	marshal_append(state, (char *)j, sizeof(*j));
+	if (j->user)
+		marshal_append(state, j->user, strlen(j->user) + 1);
+	if (j->chrootdir)
+		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
+	if (j->flags.seccomp_filter && j->filter_prog)
+		marshal_append(state, (char *)j->filter_prog->filter,
+			       j->filter_prog->len * sizeof(struct sock_filter));
+	/* Each binding is written as: src string, dest string, writeable. */
+	for (cur = j->bindings_head; cur != NULL; cur = cur->next) {
+		marshal_append(state, cur->src, strlen(cur->src) + 1);
+		marshal_append(state, cur->dest, strlen(cur->dest) + 1);
+		marshal_append(state, (char *)&cur->writeable,
+			       sizeof(cur->writeable));
+	}
+}
+
+size_t API minijail_size(const struct minijail *j)
+{
+	struct marshal_state dry_run;
+	marshal_state_init(&dry_run, NULL, 0);	/* count only, write nothing */
+	minijail_marshal_helper(&dry_run, j);
+	return dry_run.total;
+}
+
+int minijail_marshal(const struct minijail *j, char *buf, size_t available)
+{
+	struct marshal_state st;
+	marshal_state_init(&st, buf, available);
+	minijail_marshal_helper(&st, j);
+	return st.total > available ? 1 : 0;	/* 1: @buf was too small */
+}
+
+/* consumebytes: takes @length bytes off the front of @buf
+ * @length    Number of bytes to take
+ * @buf       In/out cursor into the buffer; advanced on success
+ * @buflength In/out count of bytes left at @buf; reduced on success
+ *
+ * Returns the start of the consumed bytes, or NULL if fewer are left.
+ */
+void *consumebytes(size_t length, char **buf, size_t *buflength)
+{
+	char *start = *buf;
+	if (*buflength < length)
+		return NULL;
+	*buf = start + length;
+	*buflength -= length;
+	return start;
+}
+
+/* consumestr: takes one NUL-terminated string off the front of @buf
+ * @buf       In/out cursor into the buffer
+ * @buflength In/out count of bytes left at @buf
+ *
+ * Returns the start of the string, or NULL if no terminator is in range.
+ */
+char *consumestr(char **buf, size_t *buflength)
+{
+	const size_t remaining = *buflength;
+	const size_t len = strnlen(*buf, remaining);
+	if (len >= remaining)	/* ran off the end without finding a NUL */
+		return NULL;
+	return consumebytes(len + 1, buf, buflength);
+}
+
+/* Rebuilds @j from the @length bytes at @serialized (see minijail_marshal()).
+ * Returns 0 on success; -EINVAL if the data is truncated/malformed or memory
+ * runs out, in which case @j is left owning no heap data. */
+int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
+{
+	int i, count, has_user, has_chrootdir;
+	struct binding *b;
+
+	if (length < sizeof(*j))
+		return -EINVAL;
+	memcpy((void *)j, serialized, sizeof(*j));
+	serialized += sizeof(*j);
+	length -= sizeof(*j);
+	/* Copied pointers are stale: keep only "was set", then clear them. */
+	has_user = (j->user != NULL);
+	has_chrootdir = (j->chrootdir != NULL);
+	j->user = j->chrootdir = NULL;
+	j->filter_prog = NULL;
+	j->bindings_head = j->bindings_tail = NULL;
+	count = j->binding_count;
+	j->binding_count = 0;
+
+	if (has_user) {
+		const char *user = consumestr(&serialized, &length);
+		j->user = user ? strdup(user) : NULL;
+		if (!j->user)
+			goto fail;
+	}
+
+	if (has_chrootdir) {
+		const char *chrootdir = consumestr(&serialized, &length);
+		j->chrootdir = chrootdir ? strdup(chrootdir) : NULL;
+		if (!j->chrootdir)
+			goto fail;
+	}
+
+	if (j->flags.seccomp_filter && j->filter_len > 0) {
+		size_t ninstrs = j->filter_len;
+		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
+		    ninstrs > USHRT_MAX)
+			goto fail;
+		size_t program_len = ninstrs * sizeof(struct sock_filter);
+		void *program = consumebytes(program_len, &serialized, &length);
+		if (!program)
+			goto fail;
+		j->filter_prog = calloc(1, sizeof(*j->filter_prog));
+		if (!j->filter_prog)
+			goto fail;
+		j->filter_prog->filter = malloc(program_len);
+		if (!j->filter_prog->filter)
+			goto fail;
+		j->filter_prog->len = ninstrs;
+		memcpy(j->filter_prog->filter, program, program_len);
+	}
+
+	for (i = 0; i < count; ++i) {
+		int writeable;
+		const void *raw;	/* may be unaligned: copy out, don't cast */
+		const char *src = consumestr(&serialized, &length);
+		const char *dest = src ? consumestr(&serialized, &length) : NULL;
+		if (!dest)
+			goto fail;
+		raw = consumebytes(sizeof(writeable), &serialized, &length);
+		if (!raw)
+			goto fail;
+		memcpy(&writeable, raw, sizeof(writeable));
+		if (minijail_bind(j, src, dest, writeable))
+			goto fail;
+	}
+	return 0;
+
+fail:
+	while ((b = j->bindings_head) != NULL) {
+		j->bindings_head = b->next;
+		free(b->dest);
+		free(b->src);
+		free(b);
+	}
+	if (j->filter_prog)
+		free(j->filter_prog->filter);
+	free(j->filter_prog);
+	free(j->chrootdir);
+	free(j->user);
+	j->user = j->chrootdir = NULL;
+	j->filter_prog = NULL;
+	j->bindings_tail = NULL;
+	j->binding_count = 0;
+	return -EINVAL;
+}
+
+/* bind_one: Applies the bindings starting at @b for @j, in list order.
+ * @j Minijail these bindings are for
+ * @b Head of list of bindings
+ *
+ * Returns 0 for success, -ENOMEM if a path cannot be built; a failing
+ * mount(2) goes through pdie().
+ */
+int bind_one(const struct minijail *j, struct binding *b)
+{
+	for (; b != NULL; b = b->next) {
+		char *dest = NULL;
+
+		/* |b->dest| has a leading "/", so plain concatenation works. */
+		if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
+			return -ENOMEM;
+		if (mount(b->src, dest, NULL, MS_BIND, NULL))
+			pdie("bind: %s -> %s", b->src, dest);
+		/*
+		 * Read-only bindings are bound first, then remounted MS_RDONLY.
+		 */
+		if (!b->writeable &&
+		    mount(b->src, dest, NULL,
+			  MS_BIND | MS_REMOUNT | MS_RDONLY, NULL))
+			pdie("bind ro: %s -> %s", b->src, dest);
+		free(dest);
+	}
+
+	return 0;
+}
+
+int enter_chroot(const struct minijail *j)
+{
+	/* Bind mounts go under |chrootdir| before we chroot(2) into it. */
+	if (j->bindings_head) {
+		int ret = bind_one(j, j->bindings_head);
+		if (ret)
+			return ret;
+	}
+	if (chroot(j->chrootdir))
+		return -errno;
+	if (chdir("/"))	/* move the cwd to the new root */
+		return -errno;
+	return 0;
+}
+
+int mount_tmp(void)	/* 64M tmpfs on /tmp (NOTE: mode lacks sticky bit) */
+{
+	return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777");
+}
+
+int remount_readonly(void)
+{
+	const char *proc = "/proc";
+	const unsigned int ro_flags =
+		MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;
+	/*
+	 * The /proc inherited here is still our parent's mount, so an
+	 * MS_REMOUNT would change it for the parent too, VFS namespace or
+	 * not (!).  Detach that mount from our namespace and mount a fresh,
+	 * read-only proc of our own instead.
+	 */
+	if (umount(proc) != 0)
+		return -errno;
+	if (mount("", proc, "proc", ro_flags, "") != 0)
+		return -errno;
+	return 0;
+}
+
+void drop_ugid(const struct minijail *j)
+{
+	/* Applied in this order: supplementary groups, then gid, then uid. */
+	if (!j->flags.usergroups) {
+		/* Supplementary groups are only cleared when a uid or gid
+		 * change was requested. */
+		if ((j->uid || j->gid) && setgroups(0, NULL))
+			pdie("setgroups");
+	} else if (initgroups(j->user, j->usergid)) {
+		pdie("initgroups");
+	}
+
+	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
+		pdie("setresgid");
+
+	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
+		pdie("setresuid");
+}
+
+/*
+ * Returns non-zero if @cap is a capability the *running* kernel knows about.
+ * cap_valid() is deliberately not used: it only reflects the kernel headers
+ * we were compiled against, and the running kernel may be older or newer.
+ * The real limit is read once from /proc and cached.
+ */
+static int run_cap_valid(unsigned int cap)
+{
+	static unsigned int last_cap;
+
+	if (!last_cap) {
+		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
+		FILE *fp = fopen(cap_file, "re");
+		if (!fp)
+			pdie("fopen(%s)", cap_file);
+		if (fscanf(fp, "%u", &last_cap) != 1)
+			pdie("fscanf(%s)", cap_file);
+		fclose(fp);
+	}
+	return cap <= last_cap;
+}
+
+void drop_caps(const struct minijail *j)
+{
+	cap_t caps = cap_get_proc();
+	cap_value_t flag[1];	/* one-element list for cap_set_flag() */
+	const uint64_t one = 1;	/* 64-bit, so (one << i) reaches every bit */
+	unsigned int i;
+	if (!caps)
+		die("can't get process caps");
+	if (cap_clear_flag(caps, CAP_INHERITABLE))
+		die("can't clear inheritable caps");
+	if (cap_clear_flag(caps, CAP_EFFECTIVE))
+		die("can't clear effective caps");
+	if (cap_clear_flag(caps, CAP_PERMITTED))
+		die("can't clear permitted caps");
+	for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
+		/* Keep CAP_SETPCAP for dropping bounding set bits. */
+		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
+			continue;
+		flag[0] = i;
+		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
+			die("can't add effective cap");
+		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
+			die("can't add permitted cap");
+		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
+			die("can't add inheritable cap");
+	}
+	if (cap_set_proc(caps))
+		die("can't apply initial cleaned capset");
+
+	/*
+	 * Instead of dropping bounding set first, do it here in case
+	 * the caller had a more permissive bounding set which could
+	 * have been used above to raise a capability that wasn't already
+	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
+	 */
+	for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
+		if (j->caps & (one << i))
+			continue;
+		if (prctl(PR_CAPBSET_DROP, i))
+			pdie("prctl(PR_CAPBSET_DROP)");
+	}
+
+	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
+	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
+		flag[0] = CAP_SETPCAP;
+		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
+			die("can't clear effective cap");
+		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
+			die("can't clear permitted cap");
+		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
+			die("can't clear inheritable cap");
+	}
+
+	if (cap_set_proc(caps))
+		die("can't apply final cleaned capset");
+
+	cap_free(caps);
+}
+
+void set_seccomp_filter(const struct minijail *j)
+{
+	/*
+	 * no_new_privs is handled whether or not a filter is requested.  The
+	 * prctl(2) arguments are explained in </kernel/seccomp.c> and
+	 * </kernel/sys.c> in the kernel source tree.
+	 */
+	if (j->flags.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
+		pdie("prctl(PR_SET_NO_NEW_PRIVS)");
+
+	/* Everything below only applies when a filter was requested. */
+	if (!j->flags.seccomp_filter)
+		return;
+
+	/*
+	 * When failures are to be logged, the SIGSYS handler has to be in
+	 * place before the filter is.
+	 */
+	if (j->flags.log_seccomp_filter) {
+		if (install_sigsys_handler())
+			pdie("install SIGSYS handler");
+		warn("logging seccomp filter failures");
+	}
+
+	/* Install the syscall filter; a missing kernel feature may soft-fail. */
+	if (!prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog))
+		return;
+
+	if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
+		warn("seccomp not supported");
+		return;
+	}
+	pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+}
+
+void API minijail_enter(const struct minijail *j)
+{
+	if (j->flags.pids)
+		die("tried to enter a pid-namespaced jail;"
+		    " try minijail_run()?");
+
+	if (j->flags.usergroups && !j->user)
+		die("usergroup inheritance without username");
+
+	/*
+	 * Order below: namespaces, chroot and mounts, then ids/caps/filters.
+	 * We can't recover from failures if we've dropped privileges partially,
+	 * so we don't even try: any failing operation aborts the process.
+	 */
+	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
+		pdie("setns(CLONE_NEWNS)");
+
+	if (j->flags.vfs && unshare(CLONE_NEWNS))
+		pdie("unshare(vfs)");
+
+	if (j->flags.net && unshare(CLONE_NEWNET))
+		pdie("unshare(net)");
+
+	if (j->flags.chroot && enter_chroot(j))
+		pdie("chroot");
+
+	if (j->flags.mount_tmp && mount_tmp())
+		pdie("mount_tmp");
+
+	if (j->flags.readonly && remount_readonly())
+		pdie("remount");
+
+	if (j->flags.caps) {
+		/*
+		 * POSIX capabilities are a bit tricky. If we drop our
+		 * capability to change uids, our attempt to use setuid()
+		 * below will fail. Hang on to root caps across setuid(), then
+		 * lock securebits.
+		 */
+		if (prctl(PR_SET_KEEPCAPS, 1))
+			pdie("prctl(PR_SET_KEEPCAPS)");
+		if (prctl
+		    (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
+			pdie("prctl(PR_SET_SECUREBITS)");
+	}
+
+	/*
+	 * If we're setting no_new_privs, we can drop privileges
+	 * before setting seccomp filter. This way filter policies
+	 * don't need to allow privilege-dropping syscalls.
+	 */
+	if (j->flags.no_new_privs) {
+		drop_ugid(j);
+		if (j->flags.caps)
+			drop_caps(j);
+
+		set_seccomp_filter(j);
+	} else {
+		/*
+		 * If we're not setting no_new_privs,
+		 * we need to set seccomp filter *before* dropping privileges.
+		 * WARNING: this means that filter policies *must* allow
+		 * setgroups()/setresgid()/setresuid() for dropping root and
+		 * capget()/capset()/prctl() for dropping caps.
+		 */
+		set_seccomp_filter(j);
+
+		drop_ugid(j);
+		if (j->flags.caps)
+			drop_caps(j);
+	}
+
+	/*
+	 * seccomp mode 1 (flags.seccomp) has to come last since it cuts off
+	 * all the other privilege-dropping syscalls :)
+	 */
+	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
+		if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
+			warn("seccomp not supported");
+			return;
+		}
+		pdie("prctl(PR_SET_SECCOMP)");
+	}
+}
+
+/* TODO(wad) will visibility affect this variable? */
+static volatile sig_atomic_t init_exitstatus = 0;	/* shared with handler */
+
+void init_term(int __attribute__ ((unused)) sig)
+{
+	_exit(init_exitstatus);	/* async-signal-safe, unlike exit(3) */
+}
+
+int init(pid_t rootpid)
+{
+	int status;
+	pid_t reaped;
+	/* so that we exit with the right status */
+	signal(SIGTERM, init_term);
+	/* TODO(wad) self jail with seccomp_filters here. */
+	/*
+	 * Reap children until wait(2) stops returning a pid, remembering the
+	 * status of the root child only.
+	 */
+	for (reaped = wait(&status); reaped > 0; reaped = wait(&status)) {
+		if (reaped == rootpid)
+			init_exitstatus = status;
+	}
+	if (WIFEXITED(init_exitstatus))
+		_exit(WEXITSTATUS(init_exitstatus));
+	_exit(MINIJAIL_ERR_INIT);
+}
+
+/* Reads [size][marshalled jail] from @fd into @j; 0 or a negative errno. */
+int API minijail_from_fd(int fd, struct minijail *j)
+{
+	size_t sz = 0, done = 0;
+	ssize_t bytes = read(fd, &sz, sizeof(sz));
+	char *buf;
+	int r = -EINVAL;
+	if (bytes < 0 || (size_t)bytes != sizeof(sz))
+		return -EINVAL;
+	if (sz > USHRT_MAX)	/* Arbitrary sanity check */
+		return -E2BIG;
+	buf = malloc(sz);
+	if (!buf)
+		return -ENOMEM;
+	/* A pipe may deliver the payload in pieces; read until complete. */
+	while (done < sz && (bytes = read(fd, buf + done, sz - done)) > 0)
+		done += bytes;
+	if (done == sz)
+		r = minijail_unmarshal(j, buf, sz);
+	free(buf);
+	return r;
+}
+
+/* Writes @j to @fd as [size][marshalled jail]; returns 0 or an error code. */
+int API minijail_to_fd(struct minijail *j, int fd)
+{
+	char *buf;
+	size_t sz = minijail_size(j), done = 0;
+	ssize_t written;
+	int r;
+
+	if (!sz)
+		return -EINVAL;
+	buf = malloc(sz);
+	if (!buf)
+		return -ENOMEM;
+	r = minijail_marshal(j, buf, sz);
+	if (r)
+		goto out;
+	r = -EFAULT;	/* reported for any write failure below */
+	written = write(fd, &sz, sizeof(sz));
+	if (written != sizeof(sz))
+		goto out;
+	/* write(2) may be partial (e.g. a full pipe); loop until done. */
+	while (done < sz && (written = write(fd, buf + done, sz - done)) > 0)
+		done += written;
+	if (done == sz)
+		r = 0;
+out:
+	free(buf);
+	return r;
+}
+
+/* Appends PRELOADPATH to the value of the kLdPreloadEnvVar variable.
+ * Returns 0 on success, -ENOMEM or -errno if it cannot be built or set. */
+int setup_preload(void)
+{
+	int ret;
+	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
+	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
+	if (!newenv)
+		return -ENOMEM;
+	/* Only insert a separating space if we have something to separate... */
+	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
+		PRELOADPATH);
+	ret = setenv(kLdPreloadEnvVar, newenv, 1) ? -errno : 0;	/* copies */
+	free(newenv);
+	return ret;
+}
+
+int setup_pipe(int fds[2])
+{
+	char fd_buf[11];	/* room for a 10-digit int plus the NUL */
+	int r = pipe(fds);
+	if (r)
+		return r;
+	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
+	if (r <= 0 || (size_t)r >= sizeof(fd_buf))
+		return -EINVAL;
+	/* Export the read end's number; a failed setenv() is an error too. */
+	return setenv(kFdEnvVar, fd_buf, 1) ? -EINVAL : 0;
+}
+
+int setup_pipe_end(int fds[2], size_t index)
+{
+	/* Keep end |index| (0 = read, 1 = write) and close the other one. */
+	if (index >= 2)
+		return -1;
+	close(fds[index == 0 ? 1 : 0]);
+	return fds[index];
+}
+
+int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
+{
+	if (index >= 2)
+		return -1;
+
+	/* Close the unused end, then dup2(2) the kept end onto |fd|. */
+	close(fds[index == 0 ? 1 : 0]);
+	return dup2(fds[index], fd);
+}
+
+int API minijail_run(struct minijail *j, const char *filename,
+		     char *const argv[])	/* no pid, no stdio pipes */
+{
+	return minijail_run_pid_pipes(j, filename, argv,
+				      NULL, NULL, NULL, NULL);
+}
+
+int API minijail_run_pid(struct minijail *j, const char *filename,
+			 char *const argv[], pid_t *pchild_pid)	/* pid only */
+{
+	return minijail_run_pid_pipes(j, filename, argv, pchild_pid,
+				      NULL, NULL, NULL);
+}
+
+int API minijail_run_pipe(struct minijail *j, const char *filename,
+			  char *const argv[], int *pstdin_fd)	/* stdin only */
+{
+	return minijail_run_pid_pipes(j, filename, argv, NULL, pstdin_fd,
+				      NULL, NULL);
+}
+
+int API minijail_run_pid_pipe(struct minijail *j, const char *filename,
+			      char *const argv[], pid_t *pchild_pid,
+			      int *pstdin_fd)	/* pid and stdin pipe only */
+{
+	return minijail_run_pid_pipes(j, filename, argv, pchild_pid, pstdin_fd,
+				      NULL, NULL);
+}
+
+/* Runs @filename with @argv inside @j in a child process, sending the
+ * marshalled jail to the child over a pipe.  Optionally reports the child's
+ * pid and the parent's ends of pipes wired to its stdin/stdout/stderr.
+ * Returns 0 in the parent on success and a negative errno-style value if the
+ * setup fails; failures after that go through die()/pdie().
+ */
+int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
+			       char *const argv[], pid_t *pchild_pid,
+			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
+{
+	char *oldenv, *oldenv_copy = NULL;
+	pid_t child_pid;
+	int pipe_fds[2];
+	int stdin_fds[2];
+	int stdout_fds[2];
+	int stderr_fds[2];
+	int ret;
+	/* We need to remember this across the minijail_preexec() call. */
+	int pid_namespace = j->flags.pids;
+
+	oldenv = getenv(kLdPreloadEnvVar);
+	if (oldenv) {
+		oldenv_copy = strdup(oldenv);
+		if (!oldenv_copy)
+			return -ENOMEM;
+	}
+
+	if (setup_preload())
+		goto fail;
+
+	/*
+	 * Make the process group ID of this process equal to its PID, so that
+	 * both the Minijail process and the jailed process can be killed
+	 * together.
+	 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
+	 * the process is already a process group leader.
+	 */
+	if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
+		if (errno != EPERM) {
+			pdie("setpgid(0, 0)");
+		}
+	}
+
+	/*
+	 * Before we fork(2) and execve(2) the child process, we need to open
+	 * a pipe(2) to send the minijail configuration over.
+	 */
+	if (setup_pipe(pipe_fds))
+		goto fail;
+
+	/*
+	 * Create now whichever of the pipe(2)s for the child's standard
+	 * input, output and error the caller asked for.
+	 */
+	if (pstdin_fd && pipe(stdin_fds))
+		goto fail;
+	if (pstdout_fd && pipe(stdout_fds))
+		goto fail;
+	if (pstderr_fd && pipe(stderr_fds))
+		goto fail;
+
+	/* Use sys_clone() if and only if we're creating a pid namespace.
+	 *
+	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
+	 *
+	 * In multithreaded programs, there are a bunch of locks inside libc,
+	 * some of which may be held by other threads at the time that we call
+	 * minijail_run_pid(). If we call fork(), glibc does its level best to
+	 * ensure that we hold all of these locks before it calls clone()
+	 * internally and drop them after clone() returns, but when we call
+	 * sys_clone(2) directly, all that gets bypassed and we end up with a
+	 * child address space where some of libc's important locks are held by
+	 * other threads (which did not get cloned, and hence will never release
+	 * those locks). This is okay so long as we call exec() immediately
+	 * after, but a bunch of seemingly-innocent libc functions like setenv()
+	 * take locks.
+	 *
+	 * Hence, only call sys_clone() if we need to, in order to get at pid
+	 * namespacing. If we follow this path, the child's address space might
+	 * have broken locks; you may only call functions that do not acquire
+	 * any locks.
+	 *
+	 * Unfortunately, fork() acquires every lock it can get its hands on, as
+	 * previously detailed, so this function is highly likely to deadlock
+	 * later on (see "deadlock here") if we're multithreaded.
+	 *
+	 * We might hack around this by having the clone()d child (init of the
+	 * pid namespace) return directly, rather than leaving the clone()d
+	 * process hanging around to be init for the new namespace (and having
+	 * its fork()ed child return in turn), but that process would be crippled
+	 * with its libc locks potentially broken. We might try fork()ing in the
+	 * parent before we clone() to ensure that we own all the locks, but
+	 * then we have to have the forked child hanging around consuming
+	 * resources (and possibly having file descriptors / shared memory
+	 * regions / etc attached). We'd need to keep the child around to avoid
+	 * having its children get reparented to init.
+	 *
+	 * TODO(ellyjones): figure out if the "forked child hanging around"
+	 * problem is fixable or not. It would be nice if we worked in this
+	 * case.
+	 */
+	if (pid_namespace)
+		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
+	else
+		child_pid = fork();
+
+	if (child_pid < 0) {
+		free(oldenv_copy);
+		die("failed to fork child");
+	}
+
+	if (child_pid) {
+		/* Restore parent's LD_PRELOAD. */
+		if (oldenv_copy) {
+			setenv(kLdPreloadEnvVar, oldenv_copy, 1);
+			free(oldenv_copy);
+		} else {
+			unsetenv(kLdPreloadEnvVar);
+		}
+		unsetenv(kFdEnvVar);
+
+		j->initpid = child_pid;
+
+		/* Send marshalled minijail. */
+		close(pipe_fds[0]);	/* read endpoint */
+		ret = minijail_to_fd(j, pipe_fds[1]);
+		close(pipe_fds[1]);	/* write endpoint */
+		if (ret) {
+			kill(j->initpid, SIGKILL);
+			die("failed to send marshalled minijail");
+		}
+
+		if (pchild_pid)
+			*pchild_pid = child_pid;
+
+		/*
+		 * If we want to write to the child process' standard input,
+		 * set up the write end of the pipe.
+		 */
+		if (pstdin_fd)
+			*pstdin_fd = setup_pipe_end(stdin_fds,
+						    1	/* write end */);
+
+		/*
+		 * If we want to read from the child process' standard output,
+		 * set up the read end of the pipe.
+		 */
+		if (pstdout_fd)
+			*pstdout_fd = setup_pipe_end(stdout_fds,
+						     0	/* read end */);
+
+		/*
+		 * If we want to read from the child process' standard error,
+		 * set up the read end of the pipe.
+		 */
+		if (pstderr_fd)
+			*pstderr_fd = setup_pipe_end(stderr_fds,
+						     0	/* read end */);
+
+		return 0;
+	}
+	free(oldenv_copy);
+
+	/*
+	 * If we want to write to the jailed process' standard input,
+	 * set up the read end of the pipe.
+	 */
+	if (pstdin_fd) {
+		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
+					    STDIN_FILENO) < 0)
+			die("failed to set up stdin pipe");
+	}
+
+	/*
+	 * If we want to read from the jailed process' standard output,
+	 * set up the write end of the pipe.
+	 */
+	if (pstdout_fd) {
+		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
+					    STDOUT_FILENO) < 0)
+			die("failed to set up stdout pipe");
+	}
+
+	/*
+	 * If we want to read from the jailed process' standard error,
+	 * set up the write end of the pipe.
+	 */
+	if (pstderr_fd) {
+		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
+					    STDERR_FILENO) < 0)
+			die("failed to set up stderr pipe");
+	}
+
+	/* Strip out flags that cannot be inherited across execve. */
+	minijail_preexec(j);
+	/* Jail this process and its descendants... */
+	minijail_enter(j);
+
+	if (pid_namespace) {
+		/*
+		 * pid namespace: this process will become init inside the new
+		 * namespace, so fork off a child to actually run the program
+		 * (we don't want all programs we might exec to have to know
+		 * how to be init).
+		 *
+		 * If we're multithreaded, we'll probably deadlock here. See
+		 * WARNING above.
+		 */
+		child_pid = fork();
+		if (child_pid < 0)
+			_exit(child_pid);
+		else if (child_pid > 0)
+			init(child_pid);	/* never returns */
+	}
+
+	/*
+	 * If we aren't pid-namespaced:
+	 *   calling process
+	 *   -> execve()-ing process
+	 * If we are:
+	 *   calling process
+	 *   -> init()-ing process
+	 *      -> execve()-ing process
+	 */
+	_exit(execve(filename, argv, environ));
+
+fail:
+	/* Put the parent's environment back the way we found it. */
+	if (oldenv_copy)
+		setenv(kLdPreloadEnvVar, oldenv_copy, 1);
+	else
+		unsetenv(kLdPreloadEnvVar);
+	unsetenv(kFdEnvVar);
+	free(oldenv_copy);
+	return -EFAULT;
+}
+
+int API minijail_run_static(struct minijail *j, const char *filename,
+			    char *const argv[])
+{
+	pid_t child_pid;
+	const int pid_namespace = j->flags.pids;
+
+	if (j->flags.caps)
+		die("caps not supported with static targets");
+
+	/* clone(2) is only called directly when a pid namespace is wanted. */
+	child_pid = pid_namespace
+		? syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL)
+		: fork();
+
+	if (child_pid < 0)
+		die("failed to fork child");
+	if (child_pid > 0) {
+		/* Parent: remember the child and return right away. */
+		j->initpid = child_pid;
+		return 0;
+	}
+
+	/*
+	 * Child: drop into the sandbox, then execve the target.  |pids| is
+	 * cleared first because minijail_enter() refuses pid-namespaced jails
+	 * and the namespace was already created by the clone(2) above.
+	 */
+	j->flags.pids = 0;
+	minijail_enter(j);
+
+	if (pid_namespace) {
+		/*
+		 * pid namespace: this process will become init inside the new
+		 * namespace, so fork off a child to actually run the program
+		 * (we don't want all programs we might exec to have to know
+		 * how to be init).
+		 *
+		 * If we're multithreaded, we'll probably deadlock here. See
+		 * WARNING above.
+		 */
+		child_pid = fork();
+		if (child_pid < 0)
+			_exit(child_pid);
+		if (child_pid > 0)
+			init(child_pid);	/* never returns */
+	}
+
+	_exit(execve(filename, argv, environ));
+}
+
+int API minijail_kill(struct minijail *j)
+{
+	int status;
+	/* SIGTERM the process recorded in |initpid|, then reap it. */
+	if (kill(j->initpid, SIGTERM) != 0 ||
+	    waitpid(j->initpid, &status, 0) < 0)
+		return -errno;
+	return status;	/* raw wait status, not decoded */
+}
+
+/* Waits for the process recorded in @j's |initpid| and decodes its status:
+ * the exit code, MINIJAIL_ERR_JAIL for SIGSYS, 128 + signum for other
+ * terminating signals, or -errno if waitpid(2) itself fails.
+ */
+int API minijail_wait(struct minijail *j)
+{
+	int st;
+	int code;
+
+	if (waitpid(j->initpid, &st, 0) < 0)
+		return -errno;
+
+	if (WIFEXITED(st)) {
+		code = WEXITSTATUS(st);
+		if (code != 0)
+			info("child process %d exited with status %d",
+			     j->initpid, code);
+		return code;
+	}
+
+	/* Neither exited nor killed by a signal: hand back the raw status. */
+	if (!WIFSIGNALED(st))
+		return st;
+
+	code = WTERMSIG(st);
+	warn("child process %d received signal %d",
+	     j->initpid, code);
+
+	/*
+	 * SIGSYS is what a syscall blocked by seccomp filters raises, so it
+	 * maps to MINIJAIL_ERR_JAIL.  Anything else follows bash(1):
+	 * $? = 128 + signum
+	 */
+	return code == SIGSYS ? MINIJAIL_ERR_JAIL : 128 + code;
+}
+
+/* Frees @j and everything it owns: filter program, bindings and strings. */
+void API minijail_destroy(struct minijail *j)
+{
+	/* Not keyed on flags.seccomp_filter: minijail_preexec() clears flags. */
+	if (j->filter_prog) {
+		free(j->filter_prog->filter);
+		free(j->filter_prog);
+	}
+	while (j->bindings_head) {
+		struct binding *b = j->bindings_head;
+		j->bindings_head = j->bindings_head->next;
+		free(b->dest);
+		free(b->src);
+		free(b);
+	}
+	j->bindings_tail = NULL;
+	free(j->user);		/* free(NULL) is a no-op */
+	free(j->chrootdir);
+	free(j);
+}
diff --git a/libminijail.h b/libminijail.h
new file mode 100644
index 0000000..6738a32
--- /dev/null
+++ b/libminijail.h
@@ -0,0 +1,165 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/* The general pattern of use here:
+ * 1) Construct a minijail with minijail_new()
+ * 2) Apply the desired restrictions to it
+ * 3) Enter it, which locks the current process inside it, or:
+ * 3) Run a process inside it
+ * 4) Destroy it.
+ */
+
+#ifndef _LIBMINIJAIL_H_
+#define _LIBMINIJAIL_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+	MINIJAIL_ERR_PRELOAD = 252,	/* returned by the preload library's fake main() when its startup checks fail */
+	MINIJAIL_ERR_JAIL = 253,	/* returned by minijail_wait() when the child was killed by SIGSYS */
+	MINIJAIL_ERR_INIT = 254,	/* NOTE(review): presumably an init-process failure; not used in the code visible here - confirm */
+};
+
+struct minijail;
+
+/* Allocates a new minijail with no restrictions. */
+struct minijail *minijail_new(void);
+
+/* These functions add restrictions to the minijail. They are not applied until
+ * minijail_enter() is called. See the documentation in minijail0.1 for
+ * explanations in detail of what the restrictions do.
+ */
+void minijail_change_uid(struct minijail *j, uid_t uid);
+void minijail_change_gid(struct minijail *j, gid_t gid);
+/* Stores user to change to and copies |user| for internal consistency. */
+int minijail_change_user(struct minijail *j, const char *user);
+/* Does not take ownership of |group|. */
+int minijail_change_group(struct minijail *j, const char *group);
+void minijail_use_seccomp(struct minijail *j);
+void minijail_no_new_privs(struct minijail *j);
+void minijail_use_seccomp_filter(struct minijail *j);
+void minijail_parse_seccomp_filters(struct minijail *j, const char *path);
+void minijail_log_seccomp_filter_failures(struct minijail *j);
+void minijail_use_caps(struct minijail *j, uint64_t capmask);
+void minijail_namespace_vfs(struct minijail *j);
+void minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path);
+void minijail_namespace_net(struct minijail *j);
+/* Implies namespace_vfs and remount_readonly.
+ * WARNING: this is NOT THREAD SAFE. See the block comment in </libminijail.c>.
+ */
+void minijail_namespace_pids(struct minijail *j);
+void minijail_remount_readonly(struct minijail *j);
+void minijail_inherit_usergroups(struct minijail *j);
+void minijail_disable_ptrace(struct minijail *j);
+
+/* minijail_enter_chroot: enables chroot() restriction for @j
+ * @j   minijail to apply restriction to
+ * @dir directory to chroot() to. Owned by caller.
+ *
+ * Enters @dir, binding all bind mounts specified with minijail_bind() into
+ * place. Requires @dir to contain all necessary directories for bind mounts
+ * (i.e., if you have requested a bind mount at /etc, /etc must exist in @dir.)
+ *
+ * Returns 0 on success.
+ */
+int minijail_enter_chroot(struct minijail *j, const char *dir);
+
+/* minijail_mount_tmp: enables mounting of a tmpfs filesystem on /tmp.
+ * As per the rules for bind mounts, /tmp must exist in the chroot.
+ */
+void minijail_mount_tmp(struct minijail *j);
+
+/* minijail_bind: bind-mounts @src into @j as @dest, optionally writeable
+ * @j         minijail to bind inside
+ * @src       source to bind
+ * @dest      location to bind (inside chroot)
+ * @writeable 1 if the bind mount should be writeable
+ *
+ * This may be called multiple times; all bindings will be applied in the order
+ * of minijail_bind() calls.
+ */
+int minijail_bind(struct minijail *j, const char *src, const char *dest,
+		  int writeable);
+
+/* Lock this process into the given minijail. Note that this procedure cannot fail,
+ * since there is no way to undo privilege-dropping; therefore, if any part of
+ * the privilege-drop fails, minijail_enter() will abort the entire process.
+ *
+ * Some restrictions cannot be enabled this way (pid namespaces) and attempting
+ * to do so will cause an abort.
+ */
+void minijail_enter(const struct minijail *j);
+
+/* Run the specified command in the given minijail, execve(3)-style. This is
+ * required if minijail_namespace_pids() was used.
+ */
+int minijail_run(struct minijail *j, const char *filename,
+		 char *const argv[]);
+
+/* Run the specified command in the given minijail, execve(3)-style.
+ * Used with static binaries.
+ */
+int minijail_run_static(struct minijail *j, const char *filename,
+			char *const argv[]);
+
+/* Run the specified command in the given minijail, execve(3)-style.
+ * Update |*pchild_pid| with the pid of the child.
+ */
+int minijail_run_pid(struct minijail *j, const char *filename,
+		     char *const argv[], pid_t *pchild_pid);
+
+/* Run the specified command in the given minijail, execve(3)-style.
+ * Update |*pstdin_fd| with a fd that allows writing to the child's
+ * standard input.
+ */
+int minijail_run_pipe(struct minijail *j, const char *filename,
+		      char *const argv[], int *pstdin_fd);
+
+/* Run the specified command in the given minijail, execve(3)-style.
+ * Update |*pchild_pid| with the pid of the child.
+ * Update |*pstdin_fd| with a fd that allows writing to the child's
+ * standard input.
+ */
+int minijail_run_pid_pipe(struct minijail *j, const char *filename,
+			  char *const argv[], pid_t *pchild_pid,
+			  int *pstdin_fd);
+
+/* Run the specified command in the given minijail, execve(3)-style.
+ * Update |*pchild_pid| with the pid of the child.
+ * Update |*pstdin_fd| with a fd that allows writing to the child's
+ * standard input.
+ * Update |*pstdout_fd| with a fd that allows reading from the child's
+ * standard output.
+ * Update |*pstderr_fd| with a fd that allows reading from the child's
+ * standard error.
+ */
+int minijail_run_pid_pipes(struct minijail *j, const char *filename,
+			   char *const argv[], pid_t *pchild_pid,
+			   int *pstdin_fd, int *pstdout_fd, int *pstderr_fd);
+
+/* Kill the specified minijail. The minijail must have been created with pid
+ * namespacing; if it was, all processes inside it are atomically killed.
+ */
+int minijail_kill(struct minijail *j);
+
+/* Wait for all processes in the specified minijail to exit. Returns the exit
+ * status of the _first_ process spawned in the jail.
+ */
+int minijail_wait(struct minijail *j);
+
+/* Frees the given minijail. It does not matter if the process is inside the minijail or
+ * not. */
+void minijail_destroy(struct minijail *j);
+
+#ifdef __cplusplus
+}; /* extern "C" */
+#endif
+
+#endif /* !_LIBMINIJAIL_H_ */
diff --git a/libminijail_unittest.c b/libminijail_unittest.c
new file mode 100644
index 0000000..011ce85
--- /dev/null
+++ b/libminijail_unittest.c
@@ -0,0 +1,225 @@
+/* libminijail_unittest.c
+ * Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Test platform independent logic of minijail.
+ */
+
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "test_harness.h"
+
+#include "libminijail.h"
+#include "libminijail-private.h"
+
+/* Prototypes needed only by test. */
+void *consumebytes(size_t length, char **buf, size_t *buflength);
+char *consumestr(char **buf, size_t *buflength);
+
+/*
+ * References the kLdPreloadEnvVar and kFdEnvVar constants so the compiler
+ * does not warn about them being unused in this translation unit.
+ */
+TEST(silence_unused) {
+  EXPECT_STRNE(kFdEnvVar, kLdPreloadEnvVar);
+  EXPECT_STREQ(kFdEnvVar, kFdEnvVar);
+  EXPECT_STREQ(kLdPreloadEnvVar, kLdPreloadEnvVar);
+}
+
+/* Consuming zero bytes succeeds and moves neither the cursor nor the count. */
+TEST(consumebytes_zero) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = sizeof(storage);
+  EXPECT_NE(NULL, consumebytes(0, &cursor, &remaining));
+  EXPECT_EQ(&storage[0], cursor);
+  EXPECT_EQ(sizeof(storage), remaining);
+}
+
+/* Consuming exactly the whole buffer succeeds and leaves nothing behind. */
+TEST(consumebytes_exact) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = sizeof(storage);
+  /* The cursor should end up one past the last element. */
+  char *expected = &storage[sizeof(storage)];
+  EXPECT_NE(NULL, consumebytes(remaining, &cursor, &remaining));
+  EXPECT_EQ((size_t)0, remaining);
+  EXPECT_EQ(expected, cursor);
+}
+
+/* Consuming half the buffer advances the cursor to the midpoint. */
+TEST(consumebytes_half) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = sizeof(storage);
+  /* Midpoint of the buffer: where the cursor should land. */
+  char *expected = &storage[sizeof(storage) / 2];
+  EXPECT_NE(NULL, consumebytes(remaining / 2, &cursor, &remaining));
+  EXPECT_EQ(sizeof(storage) / 2, remaining);
+  EXPECT_EQ(expected, cursor);
+}
+
+/* Asking for more bytes than remain fails and changes nothing. */
+TEST(consumebytes_toolong) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = sizeof(storage);
+  /* One byte more than the buffer holds, so the request must be refused. */
+  EXPECT_EQ(NULL, consumebytes(remaining + 1, &cursor, &remaining));
+  EXPECT_EQ(sizeof(storage), remaining);
+  EXPECT_EQ(&storage[0], cursor);
+}
+
+/* With zero bytes available there is no string to consume. */
+TEST(consumestr_zero) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = 0;
+  memset(storage, 0xff, sizeof(storage));
+  EXPECT_EQ(NULL, consumestr(&cursor, &remaining));
+  EXPECT_EQ((size_t)0, remaining);
+  EXPECT_EQ(&storage[0], cursor);
+}
+
+/* A buffer that contains no NUL byte yields no string and is left untouched. */
+TEST(consumestr_nonul) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = sizeof(storage);
+  memset(storage, 0xff, sizeof(storage));
+  EXPECT_EQ(NULL, consumestr(&cursor, &remaining));
+  EXPECT_EQ(sizeof(storage), remaining);
+  EXPECT_EQ(&storage[0], cursor);
+}
+
+/* A string whose NUL is the very last byte consumes the entire buffer. */
+TEST(consumestr_full) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  size_t remaining = sizeof(storage);
+  memset(storage, 0xff, sizeof(storage));
+  storage[sizeof(storage)-1] = '\0';
+  EXPECT_EQ((void *)storage, consumestr(&cursor, &remaining));
+  EXPECT_EQ((size_t)0, remaining);
+  EXPECT_EQ(&storage[sizeof(storage)], cursor);
+}
+
+/* A NUL that sits just beyond the advertised length must not be found. */
+TEST(consumestr_trailing_nul) {
+  char storage[1024];
+  char *cursor = &storage[0];
+  /* Advertise one byte fewer than the buffer, hiding the terminator. */
+  size_t remaining = sizeof(storage) - 1;
+  memset(storage, 0xff, sizeof(storage));
+  storage[sizeof(storage)-1] = '\0';
+  EXPECT_EQ(NULL, consumestr(&cursor, &remaining));
+  EXPECT_EQ(sizeof(storage) - 1, remaining);
+  EXPECT_EQ(&storage[0], cursor);
+}
+
+FIXTURE(marshal) {
+  char buf[4096];  /* scratch buffer the tests marshal into and unmarshal from */
+  struct minijail *m;  /* jail passed to minijail_marshal() (the source) */
+  struct minijail *j;  /* jail passed to minijail_unmarshal() (the destination) */
+  size_t size;  /* minijail_size(m), recorded by the fixture setup */
+};
+
+FIXTURE_SETUP(marshal) {
+  self->m = minijail_new();  /* source jail for marshalling */
+  self->j = minijail_new();  /* destination jail for unmarshalling */
+  ASSERT_TRUE(self->m && self->j) TH_LOG("allocation failed");
+  self->size = minijail_size(self->m);  /* reused as the length argument by the "empty" test */
+  ASSERT_GT(sizeof(self->buf), self->size) {  /* buf must be strictly larger than the reported size */
+    TH_LOG("static buffer too small for test");
+  }
+}
+
+FIXTURE_TEARDOWN(marshal) {
+  minijail_destroy(self->m);  /* release both jails allocated by the fixture setup */
+  minijail_destroy(self->j);
+}
+
+TEST_F(marshal, empty) {
+  ASSERT_EQ(0, minijail_marshal(self->m, self->buf, sizeof(self->buf)));  /* serialize the untouched jail m into buf */
+  EXPECT_EQ(0, minijail_unmarshal(self->j, self->buf, self->size));  /* read it back into j using the recorded size */
+}
+
+TEST_F(marshal, 0xff) {
+  memset(self->buf, 0xff, sizeof(self->buf));  /* input contains no NUL byte anywhere */
+  /* Should fail on the first consumestr since a NUL will never be found. */
+  EXPECT_EQ(-EINVAL, minijail_unmarshal(self->j, self->buf, sizeof(self->buf)));
+}
+
+/*
+ * TODO(jorgelo): rewrite these tests to not depend on libminijailpreload.so.
+TEST(test_minijail_run_pid_pipe) {
+  pid_t pid;
+  int child_stdin;
+  int mj_run_ret;
+  ssize_t write_ret;
+  int status;
+  char filename[] = "test/read_stdin";
+  char *argv[2];
+  argv[0] = filename;
+  argv[1] = NULL;
+
+  struct minijail *j = minijail_new();
+  mj_run_ret = minijail_run_pid_pipe(j, argv[0], argv, &pid, &child_stdin);
+  EXPECT_EQ(mj_run_ret, 0);
+  write_ret = write(child_stdin, "test\n", strlen("test\n"));
+  EXPECT_GT(write_ret, -1);
+
+  waitpid(pid, &status, 0);
+  ASSERT_TRUE(WIFEXITED(status));
+  EXPECT_EQ(WEXITSTATUS(status), 0);
+
+  minijail_destroy(j);
+}
+
+TEST(test_minijail_run_pid_pipes) {
+  pid_t pid;
+  int child_stdin, child_stdout, child_stderr;
+  int mj_run_ret;
+  ssize_t write_ret, read_ret;
+  const size_t buf_len = 128;
+  char buf[buf_len];
+  int status;
+  char filename[] = "/bin/cat";
+  char teststr[] = "test\n";
+  size_t teststr_len = strlen(teststr);
+  char *argv[4];
+
+  struct minijail *j = minijail_new();
+
+  argv[0] = filename;
+  argv[1] = NULL;
+  mj_run_ret = minijail_run_pid_pipes(j, argv[0], argv,
+                                      &pid, &child_stdin, &child_stdout, NULL);
+  EXPECT_EQ(mj_run_ret, 0);
+
+  write_ret = write(child_stdin, teststr, teststr_len);
+  EXPECT_EQ(write_ret, (int)teststr_len);
+
+  read_ret = read(child_stdout, buf, 8);
+  EXPECT_EQ(read_ret, (int)teststr_len);
+  buf[teststr_len] = 0;
+  EXPECT_EQ(strcmp(buf, teststr), 0);
+
+  EXPECT_EQ(kill(pid, SIGTERM), 0);
+  waitpid(pid, &status, 0);
+  ASSERT_TRUE(WIFSIGNALED(status));
+  EXPECT_EQ(WTERMSIG(status), SIGTERM);
+
+  argv[0] = "/bin/sh";
+  argv[1] = "-c";
+  argv[2] = "echo test >&2";
+  argv[3] = NULL;
+  mj_run_ret = minijail_run_pid_pipes(j, argv[0], argv, &pid, &child_stdin,
+                                      &child_stdout, &child_stderr);
+  EXPECT_EQ(mj_run_ret, 0);
+
+  read_ret = read(child_stderr, buf, buf_len);
+  EXPECT_GE(read_ret, (int)teststr_len);
+
+  waitpid(pid, &status, 0);
+  ASSERT_TRUE(WIFEXITED(status));
+  EXPECT_EQ(WEXITSTATUS(status), 0);
+
+  minijail_destroy(j);
+}
+*/
+
+TEST_HARNESS_MAIN
diff --git a/libminijailpreload.c b/libminijailpreload.c
new file mode 100644
index 0000000..90f8733
--- /dev/null
+++ b/libminijailpreload.c
@@ -0,0 +1,143 @@
+/* libminijailpreload.c - preload hack library
+ * Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * This library is preloaded into every program launched by minijail_run().
+ * DO NOT EXPORT ANY SYMBOLS FROM THIS LIBRARY. They will replace other symbols
+ * in the programs it is preloaded into and cause impossible-to-debug failures.
+ * See the minijail0.1 for a design explanation.
+ */
+
+#include "libminijail.h"
+#include "libminijail-private.h"
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <unistd.h>
+
+static int (*real_main) (int, char **, char **);
+static void *libc_handle;
+
+static void die(const char *failed)	/* logs |failed| to syslog at LOG_ERR, then abort()s */
+{
+	syslog(LOG_ERR, "libminijail: %s", failed);
+	abort();
+}
+
+/*
+ * Neutralizes the variable |name| in the environment array |envp|.
+ *
+ * Every entry of the form "name=..." has its first byte overwritten with NUL,
+ * turning it into an empty string; the array itself is not compacted.
+ * Only an exact match on the variable name counts: the character after the
+ * name must be '=', so e.g. "LD_PRELOAD_FOO=1" is left alone when |name| is
+ * "LD_PRELOAD".
+ */
+static void unset_in_env(char **envp, const char *name)
+{
+	int i;
+	/* Computed once rather than on every iteration. */
+	size_t name_len = strlen(name);
+	for (i = 0; envp[i]; i++)
+		if (!strncmp(envp[i], name, name_len) &&
+		    envp[i][name_len] == '=')
+			envp[i][0] = '\0';
+}
+
+/** @brief Fake main(), spliced in before the real call to main() by
+ *         __libc_start_main (see below).
+ *  We get serialized commands from our invoking process over an fd specified
+ *  by an environment variable (kFdEnvVar). The environment variable is a list
+ *  of key=value pairs (see move_commands_to_env); we use them to construct a
+ *  jail, then enter it.
+ *
+ *  Returns MINIJAIL_ERR_PRELOAD without entering a jail when the process is
+ *  running with differing real/effective ids, when kFdEnvVar is unset, or
+ *  when its value is not a non-negative decimal integer that fits in an int.
+ *  Aborts via die() if the jail cannot be allocated or read from the fd.
+ *  Otherwise returns whatever the real main() returns.
+ */
+static int fake_main(int argc, char **argv, char **envp)
+{
+	char *fd_name = getenv(kFdEnvVar);
+	char *end = NULL;
+	long fd_val;
+	int fd = -1;
+	struct minijail *j;
+	if (geteuid() != getuid() || getegid() != getgid())
+		/* If we didn't do this check, an attacker could set kFdEnvVar
+		 * for any setuid program that uses libminijail to cause it to
+		 * get capabilities or a uid it did not expect.
+		 */
+		/* TODO(wad) why would libminijail interact here? */
+		return MINIJAIL_ERR_PRELOAD;
+	if (!fd_name)
+		return MINIJAIL_ERR_PRELOAD;
+	/* atoi() would silently turn garbage into fd 0 (stdin); validate the
+	 * whole string instead and reject anything that is not a plain
+	 * non-negative number representable as an int.
+	 */
+	fd_val = strtol(fd_name, &end, 10);
+	if (end == fd_name || *end != '\0' || fd_val < 0 ||
+	    (long)(int)fd_val != fd_val)
+		return MINIJAIL_ERR_PRELOAD;
+	fd = (int)fd_val;
+
+	j = minijail_new();
+	if (!j)
+		die("preload: out of memory");
+	if (minijail_from_fd(fd, j))
+		die("preload: failed to parse minijail from parent");
+	close(fd);
+
+	/* TODO(ellyjones): this trashes existing preloads, so one can't do:
+	 * LD_PRELOAD="/tmp/test.so libminijailpreload.so" prog; the
+	 * descendants of prog will have no LD_PRELOAD set at all.
+	 */
+	unset_in_env(envp, kLdPreloadEnvVar);
+	/* Strip out flags meant for the parent. */
+	minijail_preenter(j);
+	minijail_enter(j);
+	minijail_destroy(j);
+	dlclose(libc_handle);
+	return real_main(argc, argv, envp);
+}
+
+/** @brief LD_PRELOAD override of __libc_start_main.
+ *
+ *  It is really best if you do not look too closely at this function.  We need
+ *  to ensure that some of our code runs before the target program (see the
+ *  minijail0.1 file in this directory for high-level details about this), and
+ *  the only available place to hook is this function, which is normally
+ *  responsible for calling main(). Our LD_PRELOAD will overwrite the real
+ *  __libc_start_main with this one, so we have to look up the real one from
+ *  libc and invoke it with a pointer to the fake main() we'd like to run before
+ *  the real main(). We can't just run our setup code *here* because
+ *  __libc_start_main is responsible for setting up the C runtime environment,
+ *  so we can't rely on things like malloc() being available yet.
+ *
+ *  Every argument other than |main| is forwarded unchanged to the real
+ *  __libc_start_main, and that function's return value is returned as-is.
+ *  As side effects, the caller's |main| is stored in real_main and the
+ *  dlopen() handle in libc_handle. If libc cannot be opened or the real
+ *  symbol cannot be found, the process exits with status 1.
+ */
+
+int API __libc_start_main(int (*main) (int, char **, char **),
+		      int argc, char **ubp_av, void (*init) (void),
+		      void (*fini) (void), void (*rtld_fini) (void),
+		      void (*stack_end))
+{
+	void *sym;
+	/* This hack is unfortunately required by C99 - casting directly from
+	 * void* to function pointers is left undefined. See POSIX.1-2003, the
+	 * Rationale for the specification of dlsym(), and dlsym(3). This
+	 * deliberately violates strict-aliasing rules, but gcc can't tell.
+	 */
+	union {
+		int (*fn) (int (*main) (int, char **, char **), int argc,
+			   char **ubp_av, void (*init) (void),
+			   void (*fini) (void), void (*rtld_fini) (void),
+			   void (*stack_end));
+		void *symval;
+	} real_libc_start_main;
+
+	/* We hold this handle for the duration of the real __libc_start_main()
+	 * and drop it just before calling the real main().
+	 */
+	libc_handle = dlopen("libc.so.6", RTLD_NOW);
+
+	if (!libc_handle) {
+		syslog(LOG_ERR, "can't dlopen() libc");
+		/* We dare not use abort() here because it will run atexit()
+		 * handlers and try to flush stdio.
+		 */
+		_exit(1);
+	}
+	sym = dlsym(libc_handle, "__libc_start_main");
+	if (!sym) {
+		syslog(LOG_ERR, "can't find the real __libc_start_main()");
+		_exit(1);
+	}
+	real_libc_start_main.symval = sym;
+	real_main = main;
+
+	/* Note that we swap fake_main in for main - fake_main knows that it
+	 * should call real_main after it's done.
+	 */
+	return real_libc_start_main.fn(fake_main, argc, ubp_av, init, fini,
+				       rtld_fini, stack_end);
+}
diff --git a/libsyscalls.h b/libsyscalls.h
new file mode 100644
index 0000000..a2eb43c
--- /dev/null
+++ b/libsyscalls.h
@@ -0,0 +1,16 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef MINIJAIL_LIBSYSCALLS_H_
+#define MINIJAIL_LIBSYSCALLS_H_
+#include <sys/types.h>
+
+struct syscall_entry {
+  const char *name;  /* syscall name; minijail0's walk over syscall_table stops at a NULL name */
+  int nr;  /* syscall number; the same walk also stops at a negative nr */
+};
+
+extern const struct syscall_entry syscall_table[];
+
+#endif  /* MINIJAIL_LIBSYSCALLS_H_ */
diff --git a/minijail0.1 b/minijail0.1
new file mode 100644
index 0000000..1d85385
--- /dev/null
+++ b/minijail0.1
@@ -0,0 +1,95 @@
+.TH MINIJAIL0 "1" "January 2012" "Chromium OS" "User Commands"
+.SH NAME
+minijail0 \- sandbox a process
+.SH SYNOPSIS
+.B minijail0
+[\fIOPTION\fR]... <\fIprogram\fR> [\fIargs\fR]...
+.SH DESCRIPTION
+.PP
+Runs PROGRAM inside a sandbox.
+.TP
+\fB-b <src>,<dest>[,<writeable>]\fR
+Bind-mount <src> into the chroot directory at <dest>, optionally writeable.
+.TP
+\fB-c <caps>\fR
+Restrict capabilities to \fIcaps\fR. When used in conjunction with \fB-u\fR and
+\fB-g\fR, this allows a program to have access to only certain parts of root's
+default privileges while running as another user and group ID altogether. Note
+that these capabilities are not inherited by subprocesses of the process given
+capabilities unless those subprocesses have POSIX file capabilities. See
+\fBcapabilities\fR(7).
+.TP
+\fB-C <dir>\fR
+Change root (using chroot(2)) to <dir>.
+.TP
+\fB-t\fR
+Mounts a tmpfs filesystem on /tmp. /tmp must exist in the chroot.
+This must be used with -C. The default filesystem has a max size of 128M
+and has standard /tmp permissions (777).
+.TP
+\fB-G\fR
+Inherit all the supplementary groups of the user specified with \fB-u\fR. It
+is an error to use this option without having specified a \fBuser name\fR to
+\fB-u\fR.
+.TP
+\fB-g <group>\fR
+Change groups to \fIgroup\fR, which may be either a group name or a numeric
+group ID.
+.TP
+\fB-h\fR
+Print a help message.
+.TP
+\fB-H\fR
+Print a help message detailing supported system call names for seccomp_filter.
+(Other direct numbers may be specified if minijail0 is not in sync with the
+ host kernel or something like 32/64-bit compatibility issues exist.)
+.TP
+\fB-p\fR
+Run inside a new PID namespace. This option will make it impossible for the
+program to see or affect processes that are not its descendants. This implies
+\fB-v\fR and \fB-r\fR, since otherwise the process can see outside its namespace
+by inspecting /proc.
+.TP
+\fB-r\fR
+Remount certain filesystems readonly. Currently this only remounts /proc. This
+implies \fB-v\fR. Remounting /proc readonly means that even if the process has
+write access to a system config knob in /proc (e.g., in /sys/kernel), it cannot
+change the value.
+.TP
+\fB-s\fR
+Enable seccomp(2) in mode 1, which restricts the child process to a very small
+set of system calls.
+.TP
+\fB-S <arch-specific seccomp_filter policy file>\fR
+Enable seccomp(2) in mode 13 which restricts the child process to a set of
+system calls defined in the policy file.  Note that system calls often change
+names based on the architecture or mode. (uname -m is your friend.)
+.TP
+\fB-u <user>\fR
+Change users to \fIuser\fR, which may be either a user name or a numeric user
+ID.
+.TP
+\fB-v\fR
+Run inside a new VFS namespace. This option makes the program's mountpoints
+independent of the rest of the system's.
+.SH IMPLEMENTATION
+This program is broken up into two parts: \fBminijail0\fR (the frontend) and a helper
+library called \fBlibminijailpreload\fR. Some jailings can only be achieved from
+the process to which they will actually apply - specifically capability use
+(since capabilities are not inherited to an exec'd process unless the exec'd
+process has POSIX file capabilities), seccomp (since we can't exec() once we're
+seccomp'd), and ptrace-disable (which is always cleared on exec()).
+
+To this end, \fBlibminijailpreload\fR is forcibly loaded into all
+dynamically-linked target programs if any of these restrictions are in effect;
+we pass the specific restrictions in an environment variable which the preloaded
+library looks for. The forcibly-loaded library then applies the restrictions
+to the newly-loaded program.
+.SH AUTHOR
+Written by Elly Jones (ellyjones@chromium.org)
+.SH COPYRIGHT
+Copyright \(co 2011 The Chromium OS Authors
+License BSD-like.
+.SH "SEE ALSO"
+\fBlibminijail.h\fR \fBminijail0(5)\fR
diff --git a/minijail0.5 b/minijail0.5
new file mode 100644
index 0000000..b9036b9
--- /dev/null
+++ b/minijail0.5
@@ -0,0 +1,85 @@
+.TH MINIJAIL0 "5" "July 2011" "Chromium OS" "User Commands"
+.SH NAME
+minijail0 \- sandbox a process
+.SH DESCRIPTION
+.PP
+Runs PROGRAM inside a sandbox. See minijail0(1) for details.
+.SH EXAMPLES
+
+Safely switch from root to nobody while dropping all capabilities and
+inheriting any groups from nobody:
+
+  # minijail0 -c 0 -G -u nobody /usr/bin/whoami
+  nobody
+
+Run in a PID and VFS namespace without superuser capabilities (but still
+as root) and with a private view of /proc:
+
+  # minijail0 -p -v -r -c 0 /bin/ps
+    PID TTY           TIME CMD
+      1 pts/0     00:00:00 minijail0
+      2 pts/0     00:00:00 ps
+
+Running a process with a seccomp filter policy at reduced privileges:
+
+  # minijail0 -S /usr/share/minijail0/$(uname -m)/cat.policy -- \\
+              /bin/cat /proc/self/seccomp_filter
+  ...
+
+.SH SECCOMP_FILTER POLICY
+The policy file supplied to the \fB-S\fR argument supports the following syntax:
+
+  \fB<syscall_name>\fR:\fB<ftrace filter policy>\fR
+  \fB<syscall_number>\fR:\fB<ftrace filter policy>\fR
+  \fB<empty line>\fR
+  \fB# any single line comment\fR
+
+A policy that emulates seccomp(2) in mode 1 may look like:
+  read: 1
+  write: 1
+  sigreturn: 1
+  exit: 1
+
+The "1" acts as a wildcard and allows any use of the mentioned system
+call.  More advanced filtering is possible if your kernel supports
+CONFIG_FTRACE_SYSCALLS.  For example, we can allow a process to open any
+file read only and mmap PROT_READ only:
+
+  # open with O_LARGEFILE|O_RDONLY|O_NONBLOCK or some combination
+  open: flags == 32768 || flags == 0 || flags == 34816 || flags == 2048
+  mmap2: prot == 0x0
+  munmap: 1
+  close: 1
+
+The supported arguments may be found by reviewing the system call
+prototypes in the Linux kernel source code.  Be aware that any
+non-numeric comparison may be subject to time-of-check-time-of-use
+attacks and cannot be considered safe.
+
+\fBexecve\fR may only be used when invoking with CAP_SYS_ADMIN privileges.
+
+.SH SECCOMP_FILTER POLICY WRITING
+
+Determining policy for seccomp_filter can be time consuming.  System
+calls are often named in arch-specific, or legacy tainted, ways.  E.g.,
+geteuid versus geteuid32.  On process death due to a seccomp filter
+rule, the offending system call number will be supplied with a best
+guess of the ABI defined name.  This information may be used to produce
+working baseline policies.  However, if the process being contained has
+a fairly tight working domain, using \fBstrace -e raw=all <program>\fR
+can generate the list of system calls that are needed. Note that when
+using libminijail or minijail with preloading, supporting initial
+process setup calls will not be required.  Be conservative.
+
+It's also possible to analyze the binary checking for all non-dead
+functions and determining if any of them issue system calls.  There is
+no active implementation for this, but something like
+code.google.com/p/seccompsandbox is one possible runtime variant.
+
+.SH AUTHOR
+The Chromium OS Authors <chromiumos-dev@chromium.org>
+.SH COPYRIGHT
+Copyright \(co 2011 The Chromium OS Authors
+License BSD-like.
+.SH "SEE ALSO"
+\fBminijail0\fR(1)
diff --git a/minijail0.c b/minijail0.c
new file mode 100644
index 0000000..22b828c
--- /dev/null
+++ b/minijail0.c
@@ -0,0 +1,275 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libminijail.h"
+#include "libsyscalls.h"
+
+#include "elfparse.h"
+#include "util.h"
+
+/*
+ * Handles the -u option: |arg| is either a decimal uid or a user name.
+ *
+ * A string consisting solely of decimal digits whose value fits in uid_t is
+ * applied with minijail_change_uid(). Anything else is passed to
+ * minijail_change_user() as a name; if that call reports failure, an error is
+ * printed and the program exits with status 1.
+ */
+static void set_user(struct minijail *j, const char *arg)
+{
+	char *end = NULL;
+	unsigned long uid;
+
+	/*
+	 * Parse as a base-10 integer. strtod() would also accept input such
+	 * as "1e3", "0x10" or "1.5" and silently truncate it. Requiring a
+	 * leading digit stops strtoul() from skipping whitespace or accepting
+	 * a sign.
+	 */
+	if (arg[0] >= '0' && arg[0] <= '9') {
+		uid = strtoul(arg, &end, 10);
+		/*
+		 * Reject values that do not fit in uid_t, as well as
+		 * (uid_t)-1, which is what an out-of-range strtoul() result
+		 * (ULONG_MAX) can narrow to.
+		 */
+		if (!*end && (uid_t)uid == uid && (uid_t)uid != (uid_t)-1) {
+			minijail_change_uid(j, (uid_t)uid);
+			return;
+		}
+	}
+
+	if (minijail_change_user(j, arg)) {
+		fprintf(stderr, "Bad user: '%s'\n", arg);
+		exit(1);
+	}
+}
+
+/*
+ * Handles the -g option: |arg| is either a decimal gid or a group name.
+ *
+ * A string consisting solely of decimal digits whose value fits in gid_t is
+ * applied with minijail_change_gid(). Anything else is passed to
+ * minijail_change_group() as a name; if that call reports failure, an error
+ * is printed and the program exits with status 1.
+ */
+static void set_group(struct minijail *j, const char *arg)
+{
+	char *end = NULL;
+	unsigned long gid;
+
+	/*
+	 * Parse as a base-10 integer. strtod() would also accept input such
+	 * as "1e3", "0x10" or "1.5" and silently truncate it. Requiring a
+	 * leading digit stops strtoul() from skipping whitespace or accepting
+	 * a sign.
+	 */
+	if (arg[0] >= '0' && arg[0] <= '9') {
+		gid = strtoul(arg, &end, 10);
+		/*
+		 * Reject values that do not fit in gid_t, as well as
+		 * (gid_t)-1, which is what an out-of-range strtoul() result
+		 * (ULONG_MAX) can narrow to.
+		 */
+		if (!*end && (gid_t)gid == gid && (gid_t)gid != (gid_t)-1) {
+			minijail_change_gid(j, (gid_t)gid);
+			return;
+		}
+	}
+
+	if (minijail_change_group(j, arg)) {
+		fprintf(stderr, "Bad group: '%s'\n", arg);
+		exit(1);
+	}
+}
+
+/*
+ * Handles the -c option: parses |arg| as a hexadecimal capability mask and
+ * hands it to minijail_use_caps().
+ *
+ * Prints an error and exits with status 1 if |arg| is empty or has trailing
+ * characters that are not part of a hexadecimal number.
+ */
+static void use_caps(struct minijail *j, const char *arg)
+{
+	uint64_t caps;
+	char *end = NULL;
+	caps = strtoull(arg, &end, 16);
+	/*
+	 * An empty string leaves |end| pointing at the terminator too, so it
+	 * has to be rejected explicitly (set_user()/set_group() do the same).
+	 */
+	if (!*arg || *end) {
+		fprintf(stderr, "Invalid cap set: '%s'\n", arg);
+		exit(1);
+	}
+	minijail_use_caps(j, caps);
+}
+
+/*
+ * Handles the -b option: |arg| has the form "<src>,<dest>[,<writeable>]".
+ *
+ * |arg| is split in place with strtok(3), so it is modified. The optional
+ * third field is converted with atoi(3) and defaults to 0 (read-only).
+ * Prints an error and exits with status 1 if <src> or <dest> is missing or
+ * if minijail_bind() reports failure.
+ */
+static void add_binding(struct minijail *j, char *arg)
+{
+	char *src = strtok(arg, ",");
+	char *dest = strtok(NULL, ",");
+	char *flags = strtok(NULL, ",");
+	if (!src || !dest) {
+		/* Passing NULL to %s is undefined; substitute a placeholder. */
+		fprintf(stderr, "Bad binding: %s %s\n",
+			src ? src : "(null)", dest ? dest : "(null)");
+		exit(1);
+	}
+	if (minijail_bind(j, src, dest, flags ? atoi(flags) : 0)) {
+		fprintf(stderr, "Bind failure.\n");
+		exit(1);
+	}
+}
+
+/*
+ * Prints the option summary to stdout. |progn| is the program name shown in
+ * the first line. The syscalls that -L forces to be allowed are listed from
+ * log_syscalls[] in the middle of the text.
+ */
+static void usage(const char *progn)
+{
+	size_t idx = 0;
+
+	printf("Usage: %s [-Ghinprsvt] [-b <src>,<dest>[,<writeable>]] "
+	       "[-c <caps>] [-C <dir>] [-g <group>] [-S <file>] [-u <user>] "
+	       "<program> [args...]\n"
+	       "  -b:         binds <src> to <dest> in chroot. Multiple "
+	       "instances allowed\n"
+	       "  -c <caps>:  restrict caps to <caps>\n"
+	       "  -C <dir>:   chroot to <dir>\n"
+	       "  -e:         enter new network namespace\n"
+	       "  -G:         inherit secondary groups from uid\n"
+	       "  -g <group>: change gid to <group>\n"
+	       "  -h:         help (this message)\n"
+	       "  -H:         seccomp filter help message\n"
+	       "  -i:         exit immediately after fork (do not act as init)\n"
+	       "              Not compatible with -p\n"
+	       "  -L:         report blocked syscalls to syslog when using seccomp filter.\n"
+	       "              Forces the following syscalls to be allowed:\n"
+	       "                  ", progn);
+
+	/* Space-separated list, continuing the line started above. */
+	while (idx < log_syscalls_len) {
+		printf("%s ", log_syscalls[idx]);
+		idx++;
+	}
+
+	/* No conversions in the remainder, so it is written out verbatim. */
+	fputs("\n"
+	      "  -n:         set no_new_privs\n"
+	      "  -p:         enter new pid namespace (implies -vr)\n"
+	      "  -r:         remount /proc read-only (implies -v)\n"
+	      "  -s:         use seccomp\n"
+	      "  -S <file>:  set seccomp filter using <file>\n"
+	      "              E.g., -S /usr/share/filters/<prog>.$(uname -m)\n"
+	      "              Requires -n when not running as root\n"
+	      "  -t:         mount tmpfs at /tmp inside chroot\n"
+	      "  -u <user>:  change uid to <user>\n"
+	      "  -v:         enter new mount namespace\n"
+	      "  -V <file>:  enter specified mount namespace\n", stdout);
+}
+
+/*
+ * Prints the seccomp-filter help text to stdout: a usage line naming
+ * |progn|, followed by every syscall name and number in syscall_table.
+ */
+static void seccomp_filter_usage(const char *progn)
+{
+	const struct syscall_entry *cur;
+
+	printf("Usage: %s -S <policy.file> <program> [args...]\n\n"
+	       "System call names supported:\n", progn);
+
+	/* The walk stops at the first entry with a NULL name or negative nr. */
+	cur = syscall_table;
+	while (cur->name && cur->nr >= 0) {
+		printf("  %s [%d]\n", cur->name, cur->nr);
+		++cur;
+	}
+
+	printf("\nSee minijail0(5) for example policies.\n");
+}
+
+/*
+ * Parses minijail0's own command-line options and applies them to |j|.
+ *
+ * @j                 jail to configure
+ * @argc, @argv       the program's arguments, as passed to main()
+ * @exit_immediately  set to 1 when -i is given; otherwise left untouched
+ *
+ * Option parsing stops at the first argument that does not start with '-',
+ * which is taken to be the target program. Returns the index in |argv| of
+ * that argument. Invalid options, -h and -H, bad option values and a missing
+ * program all print a message and exit with status 1.
+ */
+static int parse_args(struct minijail *j, int argc, char *argv[],
+		      int *exit_immediately)
+{
+	int opt;
+	const size_t path_max = 4096;
+	/* Owned copy of the most recent -S argument; NULL until -S is seen. */
+	char *filter_path = NULL;
+	if (argc > 1 && argv[1][0] != '-')
+		return 1;
+	while ((opt = getopt(argc, argv, "u:g:sS:c:C:b:V:vrGhHinpLet")) != -1) {
+		switch (opt) {
+		case 'u':
+			set_user(j, optarg);
+			break;
+		case 'g':
+			set_group(j, optarg);
+			break;
+		case 'n':
+			minijail_no_new_privs(j);
+			break;
+		case 's':
+			minijail_use_seccomp(j);
+			break;
+		case 'S':
+			minijail_use_seccomp_filter(j);
+			if (strlen(optarg) >= path_max) {
+				fprintf(stderr,
+					"Filter path is too long.\n");
+				exit(1);
+			}
+			/*
+			 * A repeated -S replaces the earlier path; release it
+			 * instead of leaking it.
+			 */
+			free(filter_path);
+			filter_path = strndup(optarg, path_max);
+			if (!filter_path) {
+				fprintf(stderr,
+					"Could not strndup(3) filter path.\n");
+				exit(1);
+			}
+			break;
+		case 'L':
+			minijail_log_seccomp_filter_failures(j);
+			break;
+		case 'b':
+			add_binding(j, optarg);
+			break;
+		case 'c':
+			use_caps(j, optarg);
+			break;
+		case 'C':
+			if (0 != minijail_enter_chroot(j, optarg)) {
+				fprintf(stderr, "Could not set chroot.\n");
+				exit(1);
+			}
+			break;
+		case 't':
+			minijail_mount_tmp(j);
+			break;
+		case 'v':
+			minijail_namespace_vfs(j);
+			break;
+		case 'V':
+			minijail_namespace_enter_vfs(j, optarg);
+			break;
+		case 'r':
+			minijail_remount_readonly(j);
+			break;
+		case 'G':
+			minijail_inherit_usergroups(j);
+			break;
+		case 'p':
+			minijail_namespace_pids(j);
+			break;
+		case 'e':
+			minijail_namespace_net(j);
+			break;
+		case 'i':
+			*exit_immediately = 1;
+			break;
+		case 'H':
+			seccomp_filter_usage(argv[0]);
+			exit(1);
+		default:
+			usage(argv[0]);
+			exit(1);
+		}
+		/* Stop at the target program so its own options are not parsed. */
+		if (optind < argc && argv[optind][0] != '-')
+			break;
+	}
+
+	/*
+	 * We parse seccomp filters here to make sure we've collected all
+	 * cmdline options.
+	 */
+	if (filter_path) {
+		minijail_parse_seccomp_filters(j, filter_path);
+		free(filter_path);
+	}
+
+	if (argc == optind) {
+		usage(argv[0]);
+		exit(1);
+	}
+
+	return optind;
+}
+
+/*
+ * Entry point of the minijail0 frontend.
+ *
+ * Builds a jail from the command-line options, checks that the target program
+ * is executable and is an ELF file, and launches it: minijail_run_static()
+ * for statically linked targets, minijail_run() (after checking that the
+ * preload library can be dlopen()ed) for dynamically linked ones.
+ *
+ * Returns 1 on any setup failure, 0 when -i was given, and otherwise the
+ * value of minijail_wait().
+ */
+int main(int argc, char *argv[])
+{
+	struct minijail *j = minijail_new();
+	char *dl_mesg = NULL;
+	int exit_immediately = 0;
+	int consumed;
+	int run_ret;
+	ElfType elftype = ELFERROR;
+
+	/* minijail_new() can fail to allocate; other callers check for NULL. */
+	if (!j) {
+		fprintf(stderr, "Could not allocate a minijail.\n");
+		return 1;
+	}
+
+	consumed = parse_args(j, argc, argv, &exit_immediately);
+	argc -= consumed;
+	argv += consumed;
+
+	/* Check that we can access the target program. */
+	if (access(argv[0], X_OK)) {
+		fprintf(stderr, "Target program '%s' is not accessible.\n",
+			argv[0]);
+		return 1;
+	}
+
+	/* Check if target is statically or dynamically linked. */
+	elftype = get_elf_linkage(argv[0]);
+	if (elftype == ELFSTATIC) {
+		/* Target binary is static. */
+		run_ret = minijail_run_static(j, argv[0], argv);
+	} else if (elftype == ELFDYNAMIC) {
+		/*
+		 * Target binary is dynamically linked so we can
+		 * inject libminijailpreload.so into it.
+		 */
+
+		/* Check that we can dlopen() libminijailpreload.so. */
+		if (!dlopen(PRELOADPATH, RTLD_LAZY | RTLD_LOCAL)) {
+			    dl_mesg = dlerror();
+			    fprintf(stderr, "dlopen(): %s\n", dl_mesg);
+			    return 1;
+		}
+		run_ret = minijail_run(j, argv[0], argv);
+	} else {
+		fprintf(stderr,
+			"Target program '%s' is not a valid ELF file.\n",
+			argv[0]);
+		return 1;
+	}
+
+	/*
+	 * Do not wait on a child that was never started.
+	 * NOTE(review): assumes the minijail_run*() functions report failure
+	 * with a negative value - confirm against libminijail.c.
+	 */
+	if (run_ret < 0) {
+		fprintf(stderr, "Failed to run '%s' in the jail: %d\n",
+			argv[0], run_ret);
+		return 1;
+	}
+
+	if (exit_immediately) {
+		info("not running init loop, exiting immediately");
+		return 0;
+	}
+	return minijail_wait(j);
+}
diff --git a/signal.c b/signal.c
new file mode 100644
index 0000000..7342e04
--- /dev/null
+++ b/signal.c
@@ -0,0 +1,73 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/* These header files need to be included before asm/siginfo.h such that
+ * pid_t, timer_t, and clock_t are defined. */
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <asm/siginfo.h>
+#define __have_siginfo_t 1
+#define __have_sigval_t 1
+#define __have_sigevent_t 1
+
+#include <signal.h>
+#include <string.h>
+
+#include "signal.h"
+
+#include "util.h"
+
+struct local_sigsys {	/* filled by memcpy() from siginfo_t's _sifields */
+	void		*ip;	/* presumably the trapping address - confirm */
+	int		nr;	/* value handed to lookup_syscall_name() */
+	unsigned int	arch;	/* presumably the audit arch - TODO confirm */
+};
+
+/* SIGSYS handler: logs which syscall was blocked, then exits with status 1. */
+void log_sigsys_handler(int nr, siginfo_t *info, void *void_context)
+{
+	struct local_sigsys sigsys;
+	const char *syscall_name;
+
+	/* |nr| is the signal number; the syscall is in the siginfo payload. */
+	(void) nr;
+	(void) void_context;
+	memcpy(&sigsys, &info->_sifields, sizeof(sigsys));
+	syscall_name = lookup_syscall_name(sigsys.nr);
+
+	if (syscall_name)
+		warn("blocked syscall: %s", syscall_name);
+	else
+		warn("blocked syscall: %d", sigsys.nr);
+
+	/* The filter wanted this process dead: never return, be paranoid. */
+	for (;;)
+		_exit(1);
+}
+
+/*
+ * Installs log_sigsys_handler() for SIGSYS and unblocks that signal.
+ * Returns 0 on success, or the negative result of the failing call.
+ */
+int install_sigsys_handler()
+{
+	struct sigaction sigsys_action;
+	sigset_t unblock_set;
+	int rc;
+
+	memset(&sigsys_action, 0, sizeof(sigsys_action));
+	sigsys_action.sa_flags = SA_SIGINFO;
+	sigsys_action.sa_sigaction = &log_sigsys_handler;
+	rc = sigaction(SIGSYS, &sigsys_action, NULL);
+	if (rc < 0)
+		return rc;
+
+	/* Remove SIGSYS from the set of blocked signals. */
+	sigemptyset(&unblock_set);
+	sigaddset(&unblock_set, SIGSYS);
+	rc = sigprocmask(SIG_UNBLOCK, &unblock_set, NULL);
+	return rc < 0 ? rc : 0;
+}
diff --git a/signal.h b/signal.h
new file mode 100644
index 0000000..d68bbb2
--- /dev/null
+++ b/signal.h
@@ -0,0 +1,14 @@
+/* signal.h
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Signal handling functions.
+ */
+
+#ifndef SIGNAL_H
+#define SIGNAL_H
+
+int install_sigsys_handler();	/* 0 on success, negative on failure */
+
+#endif /* SIGNAL_H */
diff --git a/syscall_filter.c b/syscall_filter.c
new file mode 100644
index 0000000..9ea5dca
--- /dev/null
+++ b/syscall_filter.c
@@ -0,0 +1,527 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "syscall_filter.h"
+
+#include "util.h"
+
+#define MAX_LINE_LENGTH		1024
+#define MAX_POLICY_LINE_LENGTH	1024
+
+#define ONE_INSTR	1
+#define TWO_INSTRS	2
+
+/* Maps an operator string ("==", "!=" or "&") to its op code; 0 if unknown. */
+int str_to_op(const char *op_str)
+{
+	if (strcmp(op_str, "==") == 0)
+		return EQ;
+	if (strcmp(op_str, "!=") == 0)
+		return NE;
+	if (strcmp(op_str, "&") == 0)
+		return SET;
+	/* Not an operator we know about. */
+	return 0;
+}
+
+/* Returns |count| zeroed BPF instructions; calls die() if calloc() fails. */
+struct sock_filter *new_instr_buf(size_t count)
+{
+	struct sock_filter *instrs = calloc(count, sizeof(*instrs));
+	if (instrs == NULL)
+		die("could not allocate BPF instruction buffer");
+	return instrs;
+}
+
+/* Allocates an empty, unlinked filter block; calls die() on failure. */
+struct filter_block *new_filter_block()
+{
+	struct filter_block *node = calloc(1, sizeof(*node));
+	if (node == NULL)
+		die("could not allocate BPF filter block");
+	/* calloc() zeroed the lengths; make the pointers explicitly NULL. */
+	node->next = node->last = NULL;
+	node->instrs = NULL;
+	return node;
+}
+
+/*
+ * Appends the |len| instructions in |instrs| to the list headed by |head|.
+ * An empty |head| (no instructions yet) stores them itself; otherwise a
+ * new node is linked at the tail and |head|'s total_len grows by |len|.
+ */
+void append_filter_block(struct filter_block *head,
+		struct sock_filter *instrs, size_t len)
+{
+	struct filter_block *tail = head;
+
+	if (head->instrs != NULL) {
+		tail = new_filter_block();
+		/* Link after the current tail, or directly after |head|. */
+		if (head->next == NULL)
+			head->next = tail;
+		else
+			head->last->next = tail;
+		head->last = tail;
+		head->total_len += len;
+	}
+
+	tail->instrs = instrs;
+	tail->len = len;
+	tail->total_len = len;
+	tail->next = tail->last = NULL;
+}
+
+/*
+ * Splices list |another| onto the end of |list| and adds up total_len.
+ * A single-node |another| has a NULL |last|, so it is its own tail.
+ */
+void extend_filter_block_list(struct filter_block *list,
+		struct filter_block *another)
+{
+	struct filter_block *tail = list->last ? list->last : list;
+	tail->next = another;
+	list->last = another->last ? another->last : another;
+	list->total_len += another->total_len;
+}
+
+void append_ret_kill(struct filter_block *head)
+{
+	struct sock_filter *kill_instr = new_instr_buf(ONE_INSTR);
+	set_bpf_ret_kill(kill_instr);	/* the block's only instruction */
+	append_filter_block(head, kill_instr, ONE_INSTR);
+}
+
+void append_ret_trap(struct filter_block *head)
+{
+	struct sock_filter *trap_instr = new_instr_buf(ONE_INSTR);
+	set_bpf_ret_trap(trap_instr);	/* the block's only instruction */
+	append_filter_block(head, trap_instr, ONE_INSTR);
+}
+
+void append_ret_errno(struct filter_block *head, int errno_val)
+{
+	struct sock_filter *errno_instr = new_instr_buf(ONE_INSTR);
+	set_bpf_ret_errno(errno_instr, errno_val);	/* carries |errno_val| */
+	append_filter_block(head, errno_instr, ONE_INSTR);
+}
+
+/* Appends "if the syscall number is |nr|, ALLOW" to |head|. */
+void append_allow_syscall(struct filter_block *head, int nr)
+{
+	struct sock_filter *allow_instrs = new_instr_buf(ALLOW_SYSCALL_LEN);
+	size_t written = bpf_allow_syscall(allow_instrs, nr);
+	if (written != ALLOW_SYSCALL_LEN)
+		die("error building syscall number comparison");
+	append_filter_block(head, allow_instrs, written);
+}
+
+void allow_log_syscalls(struct filter_block *head)
+{
+	unsigned int idx = 0;	/* walks log_syscalls[] */
+	while (idx < log_syscalls_len) {
+		warn("allowing syscall: %s", log_syscalls[idx]);
+		append_allow_syscall(head, lookup_syscall(log_syscalls[idx++]));
+	}
+}
+
+unsigned int get_label_id(struct bpf_labels *labels, const char *label_str)
+{
+	int id = bpf_label_id(labels, label_str);
+	if (id < 0)	/* a negative id is treated as a fatal failure */
+		die("could not allocate BPF label string");
+	return id;
+}
+
+unsigned int group_end_lbl(struct bpf_labels *labels, int nr, int idx)
+{
+	char name[MAX_BPF_LABEL_LEN];	/* "<nr>_<idx>_end" */
+	snprintf(name, MAX_BPF_LABEL_LEN, "%d_%d_end", nr, idx);
+	return get_label_id(labels, name);
+}
+
+unsigned int success_lbl(struct bpf_labels *labels, int nr)
+{
+	char name[MAX_BPF_LABEL_LEN];	/* "<nr>_success" */
+	snprintf(name, MAX_BPF_LABEL_LEN, "%d_success", nr);
+	return get_label_id(labels, name);
+}
+
+/*
+ * Compiles one atom ("arg{DNUM} {OP} {NUM}") into a BPF comparison block
+ * appended to |head|.  |nr| and |group_idx| pick the label of the enclosing
+ * AND group, which is where a failed comparison jumps to.
+ * |atom| is split in place on spaces by strtok_r().
+ * Returns 0 on success, -1 if the atom cannot be parsed or compiled.
+ */
+int compile_atom(struct filter_block *head, char *atom,
+		struct bpf_labels *labels, int nr, int group_idx)
+{
+	/* Splits the atom. */
+	char *atom_ptr;
+	char *argidx_str = strtok_r(atom, " ", &atom_ptr);
+	char *operator_str = strtok_r(NULL, " ", &atom_ptr);
+	char *constant_str = strtok_r(NULL, " ", &atom_ptr);
+
+	if (argidx_str == NULL || operator_str == NULL || constant_str == NULL)
+		return -1;
+
+	int op = str_to_op(operator_str);
+	if (op < MIN_OPERATOR)
+		return -1;
+
+	if (strncmp(argidx_str, "arg", 3))
+		return -1;
+
+	/* Checks to see if an actual argument index was parsed. */
+	char *argidx_ptr;
+	long int argidx = strtol(argidx_str + 3, &argidx_ptr, 10);
+	if (argidx_ptr == argidx_str + 3)
+		return -1;
+
+	/* Likewise for the constant: no digits is an error, not a 0. */
+	char *constant_ptr;
+	long int c = strtol(constant_str, &constant_ptr, 0);
+	if (constant_ptr == constant_str)
+		return -1;
+
+	/*
+	 * Builds a BPF comparison between a syscall argument and a constant.
+	 * The comparison lives inside an AND statement: if it succeeds we
+	 * continue to the next comparison; if it fails the whole AND
+	 * statement fails, so we jump to the label at its end.
+	 */
+	unsigned int id = group_end_lbl(labels, nr, group_idx);
+	struct sock_filter *comp_block;
+	size_t len = bpf_arg_comp(&comp_block, op, argidx, c, id);
+	if (len == 0)
+		return -1;
+
+	append_filter_block(head, comp_block, len);
+	return 0;
+}
+
+/* Appends RET_ERRNO for "return <errno>", or RET_KILL for a bare "return". */
+int compile_errno(struct filter_block *head, char *ret_errno)
+{
+	char *errno_ptr;
+
+	/* Splits the 'return' keyword and the actual errno value. */
+	char *ret_str = strtok_r(ret_errno, " ", &errno_ptr);
+	/* A blank clause yields no token at all: reject it, don't crash. */
+	if (!ret_str || strncmp(ret_str, "return", strlen("return")))
+		return -1;
+
+	char *errno_val_str = strtok_r(NULL, " ", &errno_ptr);
+	if (!errno_val_str) {
+		append_ret_kill(head);
+		return 0;
+	}
+
+	/* Checks to see if we parsed an actual errno. */
+	char *errno_val_ptr;
+	int errno_val = strtol(errno_val_str, &errno_val_ptr, 0);
+	if (errno_val_ptr == errno_val_str)
+		return -1;
+	append_ret_errno(head, errno_val);
+	return 0;
+}
+
+struct filter_block *compile_section(int nr, const char *policy_line,
+		unsigned int entry_lbl_id, struct bpf_labels *labels)
+{
+	/*
+	 * Compiles |policy_line| into the list of filter blocks for syscall
+	 * |nr|, beginning with the label |entry_lbl_id|.  Returns the list
+	 * (to be freed with free_block_list()), or NULL on any error.
+	 *
+	 * |policy_line| should be an expression of the form:
+	 * "arg0 == 3 && arg1 == 5 || arg0 == 0x8"
+	 *
+	 * This is, an expression in DNF (disjunctive normal form);
+	 * a disjunction ('||') of one or more conjunctions ('&&')
+	 * of one or more atoms.
+	 *
+	 * Atoms are of the form "arg{DNUM} {OP} {NUM}" where:
+	 *   - DNUM is a decimal number.
+	 *   - OP is an operator: ==, !=, or & (flags set).
+	 *   - NUM is an octal, decimal, or hexadecimal number.
+	 *
+	 * When the syscall arguments make the expression true,
+	 * the syscall is allowed. If not, the process is killed.
+	 *
+	 * To block a syscall without killing the process, |policy_line|
+	 * can be of the form "return <errno>": this blocks the syscall,
+	 * makes it return -1 and sets |errno| to the given number.
+	 *
+	 * A regular policy line can also include a "return <errno>" clause,
+	 * separated by a semicolon (';'):
+	 * "arg0 == 3 && arg1 == 5 || arg0 == 0x8; return {NUM}"
+	 *
+	 * If the syscall arguments don't make the expression true,
+	 * the syscall will be blocked as above instead of killing the process.
+	 */
+
+	size_t len = 0;
+	int group_idx = 0;
+
+	/* Checks for overly long policy lines. */
+	if (strlen(policy_line) >= MAX_POLICY_LINE_LENGTH)
+		return NULL;
+
+	/* We will modify |policy_line|, so let's make a copy. */
+	char *line = strndup(policy_line, MAX_POLICY_LINE_LENGTH);
+	if (!line)
+		return NULL;
+
+	/* The section is built as a singly-linked list of "filter blocks". */
+	struct filter_block *head = new_filter_block();
+
+	/* Sections begin with the label the main filter jumps to. */
+	struct sock_filter *entry_label = new_instr_buf(ONE_INSTR);
+	set_bpf_lbl(entry_label, entry_lbl_id);
+	append_filter_block(head, entry_label, ONE_INSTR);
+
+	/* Checks whether we're unconditionally blocking this syscall. */
+	if (strncmp(line, "return", strlen("return")) == 0) {
+		if (compile_errno(head, line) < 0)
+			goto fail;
+		free(line);
+		return head;
+	}
+
+	/* Splits the optional "return <errno>" part. */
+	char *line_ptr;
+	char *arg_filter = strtok_r(line, ";", &line_ptr);
+	char *ret_errno = strtok_r(NULL, ";", &line_ptr);
+
+	/*
+	 * Splits the policy line by '||' into conjunctions and each conjunction
+	 * by '&&' into atoms.
+	 */
+	char *arg_filter_str = arg_filter;
+	char *group;
+	while ((group = tokenize(&arg_filter_str, "||")) != NULL) {
+		char *group_str = group;
+		char *comp;
+		while ((comp = tokenize(&group_str, "&&")) != NULL) {
+			/* Compiles each atom into a BPF block. */
+			if (compile_atom(head, comp, labels, nr, group_idx) < 0)
+				goto fail;
+		}
+		/*
+		 * If the AND statement succeeds, we're done,
+		 * so jump to SUCCESS line.
+		 */
+		unsigned int id = success_lbl(labels, nr);
+		struct sock_filter *group_end_block = new_instr_buf(TWO_INSTRS);
+		len = set_bpf_jump_lbl(group_end_block, id);
+		/*
+		 * The end of each AND statement falls after the
+		 * jump to SUCCESS.
+		 */
+		id = group_end_lbl(labels, nr, group_idx++);
+		len += set_bpf_lbl(group_end_block + len, id);
+		append_filter_block(head, group_end_block, len);
+	}
+
+	/*
+	 * If no AND statements succeed, we end up here,
+	 * because we never jumped to SUCCESS.
+	 * If we have to return an errno, do it,
+	 * otherwise just kill the task.
+	 */
+	if (ret_errno) {
+		if (compile_errno(head, ret_errno) < 0)
+			goto fail;
+	} else {
+		append_ret_kill(head);
+	}
+
+	/*
+	 * Every time the filter succeeds we jump to a predefined SUCCESS
+	 * label. Add that label and BPF RET_ALLOW code now.
+	 */
+	unsigned int id = success_lbl(labels, nr);
+	struct sock_filter *success_block = new_instr_buf(TWO_INSTRS);
+	len = set_bpf_lbl(success_block, id);
+	len += set_bpf_ret_allow(success_block + len);
+	append_filter_block(head, success_block, len);
+
+	free(line);
+	return head;
+
+fail:
+	/* Error exits release everything built so far instead of leaking it. */
+	free_block_list(head);
+	free(line);
+	return NULL;
+}
+
+/*
+ * Compiles the policy read from |policy_file| into the BPF program |prog|.
+ * When |log_failures| is set, the syscalls needed for logging are allowed
+ * and the fallback action is TRAP rather than KILL.
+ * Returns 0 on success (the caller then owns prog->filter), -1 on error.
+ */
+int compile_filter(FILE *policy_file, struct sock_fprog *prog,
+		int log_failures)
+{
+	char line[MAX_LINE_LENGTH];
+	int ret = -1;
+	struct sock_filter *final_filter = NULL;
+
+	struct bpf_labels labels;
+	labels.count = 0;
+
+	if (!policy_file)
+		return -1;
+
+	struct filter_block *head = new_filter_block();
+	struct filter_block *arg_blocks = NULL;
+
+	/* Start filter by validating arch. */
+	struct sock_filter *valid_arch = new_instr_buf(ARCH_VALIDATION_LEN);
+	size_t len = bpf_validate_arch(valid_arch);
+	append_filter_block(head, valid_arch, len);
+
+	/* Load syscall number. */
+	struct sock_filter *load_nr = new_instr_buf(ONE_INSTR);
+	len = bpf_load_syscall_nr(load_nr);
+	append_filter_block(head, load_nr, len);
+
+	/* If we're logging failures, allow the necessary syscalls first. */
+	if (log_failures)
+		allow_log_syscalls(head);
+
+	/*
+	 * Loop through all the lines in the policy file, building a jump
+	 * table for the syscall number plus an arg filter for each line that
+	 * has one.  The sections are chained and flattened at the end.
+	 */
+	while (fgets(line, sizeof(line), policy_file)) {
+		char *policy_line = line;
+		char *syscall_name = strsep(&policy_line, ":");
+		int nr = -1;
+
+		syscall_name = strip(syscall_name);
+
+		/* Allow comments and empty lines. */
+		if (*syscall_name == '#' || *syscall_name == '\0')
+			continue;
+
+		if (!policy_line)
+			goto out;
+
+		nr = lookup_syscall(syscall_name);
+		if (nr < 0) {
+			warn("compile_filter: nonexistent syscall '%s'",
+			     syscall_name);
+			goto out;
+		}
+
+		policy_line = strip(policy_line);
+
+		/* Add either a simple ALLOW, or an arg filter block. */
+		if (strcmp(policy_line, "1") == 0) {
+			/* Add simple ALLOW. */
+			append_allow_syscall(head, nr);
+		} else {
+			/* Jump to the label that will hold the arg filter. */
+			int id = bpf_label_id(&labels, syscall_name);
+			if (id < 0)
+				goto out;
+			struct sock_filter *nr_comp =
+					new_instr_buf(ALLOW_SYSCALL_LEN);
+			bpf_allow_syscall_args(nr_comp, nr, id);
+			append_filter_block(head, nr_comp, ALLOW_SYSCALL_LEN);
+
+			/* Build the arg filter block. */
+			struct filter_block *block =
+				compile_section(nr, policy_line, id, &labels);
+
+			if (!block)
+				goto out;
+
+			if (arg_blocks)
+				extend_filter_block_list(arg_blocks, block);
+			else
+				arg_blocks = block;
+		}
+	}
+
+	/* If none of the syscalls match, fall back to KILL or TRAP. */
+	if (!log_failures)
+		append_ret_kill(head);
+	else
+		append_ret_trap(head);
+
+	/* Allocate the final buffer, now that we know its size. */
+	size_t final_filter_len = head->total_len +
+		(arg_blocks? arg_blocks->total_len : 0);
+	if (final_filter_len > BPF_MAXINSNS)
+		goto out;
+
+	final_filter = calloc(final_filter_len, sizeof(struct sock_filter));
+	if (!final_filter)
+		goto out;
+
+	if (flatten_block_list(head, final_filter, 0, final_filter_len) < 0)
+		goto out;
+
+	if (flatten_block_list(arg_blocks, final_filter,
+			head->total_len, final_filter_len) < 0)
+		goto out;
+
+	bpf_resolve_jumps(&labels, final_filter, final_filter_len);
+
+	prog->filter = final_filter;
+	prog->len = final_filter_len;
+	final_filter = NULL;	/* Ownership moved to |prog|. */
+	ret = 0;
+
+out:
+	/* Success or failure, the intermediate lists and labels are done. */
+	free(final_filter);
+	free_block_list(head);
+	free_block_list(arg_blocks);
+	free_label_strings(&labels);
+	return ret;
+}
+
+/* Copies the list at |head| into |filter| from |index|; -1 on overflow. */
+int flatten_block_list(struct filter_block *head, struct sock_filter *filter,
+		size_t index, size_t cap)
+{
+	const struct filter_block *blk;
+	size_t out = index;
+	size_t n;
+
+	for (blk = head; blk != NULL; blk = blk->next)
+		for (n = 0; n < blk->len; n++) {
+			/* Never write at or beyond |cap|. */
+			if (out >= cap)
+				return -1;
+			filter[out++] = blk->instrs[n];
+		}
+	return 0;
+}
+
+/* Frees every node of the list at |head| together with its instructions. */
+void free_block_list(struct filter_block *head)
+{
+	struct filter_block *node = head;
+	while (node != NULL) {
+		/* Grab the successor before the node itself goes away. */
+		struct filter_block *following = node->next;
+		free(node->instrs);
+		free(node);
+		node = following;
+	}
+}
diff --git a/syscall_filter.h b/syscall_filter.h
new file mode 100644
index 0000000..4790d9f
--- /dev/null
+++ b/syscall_filter.h
@@ -0,0 +1,37 @@
+/* syscall_filter.h
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Syscall filter functions.
+ */
+
+#ifndef SYSCALL_FILTER_H
+#define SYSCALL_FILTER_H
+
+#include "bpf.h"
+
+#define NO_LOGGING  0
+#define USE_LOGGING 1
+
+struct filter_block {
+	struct sock_filter *instrs;	/* this node's instructions */
+	size_t len;			/* number of entries in |instrs| */
+
+	struct filter_block *next;	/* following node, or NULL */
+	struct filter_block *last;	/* list tail, tracked on the head */
+	size_t total_len;		/* on the head: sum of |len|s */
+};
+
+struct bpf_labels;
+
+struct filter_block *compile_section(int nr, const char *policy_line,
+		unsigned int label_id, struct bpf_labels *labels);
+int compile_filter(FILE *policy_file, struct sock_fprog *prog,
+		int log_failures);
+
+int flatten_block_list(struct filter_block *head, struct sock_filter *filter,
+		size_t index, size_t cap);
+void free_block_list(struct filter_block *head);
+
+#endif /* SYSCALL_FILTER_H */
diff --git a/syscall_filter_unittest.c b/syscall_filter_unittest.c
new file mode 100644
index 0000000..fb903c5
--- /dev/null
+++ b/syscall_filter_unittest.c
@@ -0,0 +1,622 @@
+/* syscall_filter_unittest.c
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Test syscall filtering.
+ */
+
+#include <asm/unistd.h>
+#include <errno.h>
+#include <fcntl.h>	/* For O_WRONLY */
+
+#include "test_harness.h"
+
+#include "bpf.h"
+#include "syscall_filter.h"
+
+#include "util.h"
+
+/* BPF testing macros. */
+#define EXPECT_EQ_BLOCK(_block, _code, _k, _jt, _jf)	\
+do {	\
+	EXPECT_EQ((_block)->code, _code);		\
+	EXPECT_EQ((_block)->k, (unsigned int)(_k));	\
+	EXPECT_EQ((_block)->jt, _jt);			\
+	EXPECT_EQ((_block)->jf, _jf);			\
+} while (0)
+
+#define EXPECT_EQ_STMT(_block, _code, _k) \
+	EXPECT_EQ_BLOCK(_block, _code, _k, 0, 0)
+
+#define EXPECT_COMP(_block) \
+do {	\
+	EXPECT_EQ((_block)->len, BPF_ARG_COMP_LEN + 1);			\
+	EXPECT_EQ((_block)->instrs->code, BPF_LD+BPF_W+BPF_ABS);	\
+} while (0)
+
+#define EXPECT_LBL(_block) \
+	do {	\
+	EXPECT_EQ((_block)->code, BPF_JMP+BPF_JA);	\
+	EXPECT_EQ((_block)->jt, LABEL_JT);		\
+	EXPECT_EQ((_block)->jf, LABEL_JF);		\
+} while (0)
+
+#define EXPECT_JUMP_LBL(_block) \
+do {	\
+	EXPECT_EQ((_block)->code, BPF_JMP+BPF_JA);	\
+	EXPECT_EQ((_block)->jt, JUMP_JT);		\
+	EXPECT_EQ((_block)->jf, JUMP_JF);		\
+} while (0)
+
+#define EXPECT_GROUP_END(_block) \
+do {	\
+	EXPECT_EQ((_block)->len, 2U);			\
+	EXPECT_JUMP_LBL(&(_block)->instrs[0]);		\
+	EXPECT_LBL(&(_block)->instrs[1]);		\
+} while (0)
+
+#define EXPECT_KILL(_block) \
+do {	\
+	EXPECT_EQ((_block)->len, 1U);				\
+	EXPECT_EQ_STMT(_block->instrs,				\
+			BPF_RET+BPF_K, SECCOMP_RET_KILL);	\
+} while (0)
+
+#define EXPECT_ALLOW(_block) \
+do {	\
+	EXPECT_EQ((_block)->len, 2U);				\
+	EXPECT_LBL(&(_block)->instrs[0]);			\
+	EXPECT_EQ_STMT(&(_block)->instrs[1],			\
+			BPF_RET+BPF_K, SECCOMP_RET_ALLOW);	\
+} while (0)
+
+#define EXPECT_ARCH_VALIDATION(_filter) \
+do {	\
+	EXPECT_EQ_STMT(&(_filter)[0], BPF_LD+BPF_W+BPF_ABS, arch_nr);	\
+	EXPECT_EQ_BLOCK(&(_filter)[1],					\
+			BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, SKIP, NEXT);	\
+	EXPECT_EQ_STMT(&(_filter)[2], BPF_RET+BPF_K, SECCOMP_RET_KILL);	\
+} while (0)
+
+#define EXPECT_ALLOW_SYSCALL(_filter, _nr) \
+do {	\
+	EXPECT_EQ_BLOCK(&(_filter)[0],					\
+			BPF_JMP+BPF_JEQ+BPF_K, (_nr), NEXT, SKIP);	\
+	EXPECT_EQ_STMT(&(_filter)[1],					\
+			BPF_RET+BPF_K, SECCOMP_RET_ALLOW);		\
+} while (0)
+
+#define EXPECT_ALLOW_SYSCALL_ARGS(_filter, _nr, _id, _jt, _jf) \
+do {	\
+	EXPECT_EQ_BLOCK(&(_filter)[0],					\
+			BPF_JMP+BPF_JEQ+BPF_K, (_nr), NEXT, SKIP);	\
+	EXPECT_EQ_BLOCK(&(_filter)[1],					\
+			BPF_JMP+BPF_JA, (_id), (_jt), (_jf));		\
+} while (0)
+
+
+FIXTURE(bpf) {};
+
+FIXTURE_SETUP(bpf) {}
+FIXTURE_TEARDOWN(bpf) {}
+
+/* Test that setting one BPF instruction works. */
+TEST_F(bpf, set_bpf_instr) {
+	struct sock_filter instr;
+	unsigned char code = BPF_LD+BPF_W+BPF_ABS;
+	unsigned int k = 4;
+	unsigned char jt = 1, jf = 2;
+
+	size_t len = set_bpf_instr(&instr, code, k, jt, jf);
+
+	EXPECT_EQ(len, 1U);
+	EXPECT_EQ_BLOCK(&instr, code, k, jt, jf);
+}
+
+TEST_F(bpf, bpf_load_arg) {
+	struct sock_filter load_arg[BPF_LOAD_ARG_LEN];
+	int argidx = 1;
+	size_t len = bpf_load_arg(load_arg, argidx);
+
+	EXPECT_EQ(len, BPF_LOAD_ARG_LEN);
+
+#if defined(BITS32)
+	EXPECT_EQ_STMT(&load_arg[0], BPF_LD+BPF_W+BPF_ABS, LO_ARG(argidx));
+#elif defined(BITS64)
+	EXPECT_EQ_STMT(&load_arg[0], BPF_LD+BPF_W+BPF_ABS, LO_ARG(argidx));
+	EXPECT_EQ_STMT(&load_arg[1], BPF_ST, 0);
+	EXPECT_EQ_STMT(&load_arg[2], BPF_LD+BPF_W+BPF_ABS, HI_ARG(argidx));
+	EXPECT_EQ_STMT(&load_arg[3], BPF_ST, 1);
+#endif
+}
+
+TEST_F(bpf, bpf_comp_jeq) {
+	struct sock_filter comp_jeq[BPF_COMP_LEN];
+	unsigned long c = 1;
+	unsigned char jt = 1;
+	unsigned char jf = 2;
+
+	size_t len = bpf_comp_jeq(comp_jeq, c, jt, jf);
+
+	EXPECT_EQ(len, BPF_COMP_LEN);
+
+#if defined(BITS32)
+	EXPECT_EQ_BLOCK(&comp_jeq[0],
+			BPF_JMP+BPF_JEQ+BPF_K, c, jt, jf);
+#elif defined(BITS64)
+	EXPECT_EQ_BLOCK(&comp_jeq[0],
+			BPF_JMP+BPF_JEQ+BPF_K, 0, 0, jf + 2);
+	EXPECT_EQ_STMT(&comp_jeq[1], BPF_LD+BPF_MEM, 0);
+	EXPECT_EQ_BLOCK(&comp_jeq[2],
+			BPF_JMP+BPF_JEQ+BPF_K, c, jt, jf);
+#endif
+}
+
+TEST_F(bpf, bpf_comp_jset) {
+	struct sock_filter comp_jset[BPF_COMP_LEN];
+	unsigned long mask = O_WRONLY;
+	unsigned char jt = 1;
+	unsigned char jf = 2;
+
+	size_t len = bpf_comp_jset(comp_jset, mask, jt, jf);
+
+	EXPECT_EQ(len, BPF_COMP_LEN);
+
+#if defined(BITS32)
+	EXPECT_EQ_BLOCK(&comp_jset[0],
+			BPF_JMP+BPF_JSET+BPF_K, mask, jt, jf);
+#elif defined(BITS64)
+	EXPECT_EQ_BLOCK(&comp_jset[0],
+			BPF_JMP+BPF_JSET+BPF_K, 0, jt + 2, 0);
+	EXPECT_EQ_STMT(&comp_jset[1], BPF_LD+BPF_MEM, 0);
+	EXPECT_EQ_BLOCK(&comp_jset[2],
+			BPF_JMP+BPF_JSET+BPF_K, mask, jt, jf);
+#endif
+}
+
+TEST_F(bpf, bpf_arg_comp) {
+	struct sock_filter *arg_comp;
+	int op = EQ;
+	int argidx = 1;
+	unsigned long c = 3;
+	unsigned int label_id = 0;
+
+	size_t len = bpf_arg_comp(&arg_comp, op, argidx, c, label_id);
+
+	EXPECT_EQ(len, BPF_ARG_COMP_LEN + 1);
+
+#if defined(BITS32)
+	EXPECT_EQ_STMT(&arg_comp[0],
+			BPF_LD+BPF_W+BPF_ABS, LO_ARG(argidx));
+	EXPECT_EQ_BLOCK(&arg_comp[1],
+			BPF_JMP+BPF_JEQ+BPF_K, c, 1, 0);
+	EXPECT_JUMP_LBL(&arg_comp[2]);
+#elif defined(BITS64)
+	EXPECT_EQ_STMT(&arg_comp[0],
+			BPF_LD+BPF_W+BPF_ABS, LO_ARG(argidx));
+	EXPECT_EQ_STMT(&arg_comp[1], BPF_ST, 0);
+	EXPECT_EQ_STMT(&arg_comp[2],
+			BPF_LD+BPF_W+BPF_ABS, HI_ARG(argidx));
+	EXPECT_EQ_STMT(&arg_comp[3], BPF_ST, 1);
+
+	EXPECT_EQ_BLOCK(&arg_comp[4],
+			BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 2);
+	EXPECT_EQ_STMT(&arg_comp[5], BPF_LD+BPF_MEM, 0);
+	EXPECT_EQ_BLOCK(&arg_comp[6],
+			BPF_JMP+BPF_JEQ+BPF_K, c, 1, 0);
+	EXPECT_JUMP_LBL(&arg_comp[7]);
+#endif
+	free(arg_comp);
+}
+
+TEST_F(bpf, bpf_validate_arch) {
+	struct sock_filter validate_arch[ARCH_VALIDATION_LEN];
+
+	size_t len = bpf_validate_arch(validate_arch);
+
+	EXPECT_EQ(len, ARCH_VALIDATION_LEN);
+	EXPECT_ARCH_VALIDATION(validate_arch);
+}
+
+TEST_F(bpf, bpf_allow_syscall) {
+	struct sock_filter allow_syscall[ALLOW_SYSCALL_LEN];
+	int nr = 1;
+
+	size_t len = bpf_allow_syscall(allow_syscall, nr);
+
+	EXPECT_EQ(len, ALLOW_SYSCALL_LEN);
+	EXPECT_ALLOW_SYSCALL(allow_syscall, nr);
+}
+
+TEST_F(bpf, bpf_allow_syscall_args) {
+	struct sock_filter allow_syscall[ALLOW_SYSCALL_LEN];
+	int nr = 1;
+	unsigned int id = 1024;
+
+	size_t len = bpf_allow_syscall_args(allow_syscall, nr, id);
+
+	EXPECT_EQ(len, ALLOW_SYSCALL_LEN);
+	EXPECT_ALLOW_SYSCALL_ARGS(allow_syscall, nr, id, JUMP_JT, JUMP_JF);
+}
+
+FIXTURE(arg_filter) {
+	struct bpf_labels labels;
+};
+
+FIXTURE_SETUP(arg_filter) {}
+FIXTURE_TEARDOWN(arg_filter) {}
+
+TEST_F(arg_filter, arg0_equals) {
+	const char *fragment = "arg0 == 0";
+	int nr = 1;
+	unsigned int id = 0;
+	struct filter_block *block =
+		compile_section(nr, fragment, id, &self->labels);
+
+	ASSERT_NE(block, NULL);
+	size_t exp_total_len = 1 + (BPF_ARG_COMP_LEN + 1) + 2 + 1 + 2;
+	EXPECT_EQ(block->total_len, exp_total_len);
+
+	/* First block is a label. */
+	struct filter_block *curr_block = block;
+	ASSERT_NE(curr_block, NULL);
+	EXPECT_EQ(block->len, 1U);
+	EXPECT_LBL(curr_block->instrs);
+
+	/* Second block is a comparison. */
+	curr_block = block->next;
+	EXPECT_COMP(curr_block);
+
+	/* Third block is a jump and a label (end of AND group). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_GROUP_END(curr_block);
+
+	/* Fourth block is SECCOMP_RET_KILL */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_KILL(curr_block);
+
+	/* Fifth block is "SUCCESS" label and SECCOMP_RET_ALLOW */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_ALLOW(curr_block);
+
+	EXPECT_EQ(curr_block->next, NULL);
+
+	free_block_list(block);
+	free_label_strings(&self->labels);
+}
+
+TEST_F(arg_filter, arg0_mask) {
+	const char *fragment = "arg1 & 02";	/* O_RDWR */
+	int nr = 1;
+	unsigned int id = 0;
+	struct filter_block *block =
+		compile_section(nr, fragment, id, &self->labels);
+
+	ASSERT_NE(block, NULL);
+	size_t exp_total_len = 1 + (BPF_ARG_COMP_LEN + 1) + 2 + 1 + 2;
+	EXPECT_EQ(block->total_len, exp_total_len);
+
+	/* First block is a label. */
+	struct filter_block *curr_block = block;
+	ASSERT_NE(curr_block, NULL);
+	EXPECT_EQ(block->len, 1U);
+	EXPECT_LBL(curr_block->instrs);
+
+	/* Second block is a comparison. */
+	curr_block = block->next;
+	EXPECT_COMP(curr_block);
+
+	/* Third block is a jump and a label (end of AND group). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_GROUP_END(curr_block);
+
+	/* Fourth block is SECCOMP_RET_KILL */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_KILL(curr_block);
+
+	/* Fifth block is "SUCCESS" label and SECCOMP_RET_ALLOW */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_ALLOW(curr_block);
+
+	EXPECT_EQ(curr_block->next, NULL);
+
+	free_block_list(block);
+	free_label_strings(&self->labels);
+}
+
+TEST_F(arg_filter, and_or) {
+	const char *fragment = "arg0 == 0 && arg1 == 0 || arg0 == 1";
+	int nr = 1;
+	unsigned int id = 0;
+
+	struct filter_block *block =
+		compile_section(nr, fragment, id, &self->labels);
+	ASSERT_NE(block, NULL);
+	size_t exp_total_len = 1 + 3 * (BPF_ARG_COMP_LEN + 1) + 2 + 2 + 1 + 2;
+	EXPECT_EQ(block->total_len, exp_total_len);
+
+	/* First block is a label. */
+	struct filter_block *curr_block = block;
+	ASSERT_NE(curr_block, NULL);
+	EXPECT_EQ(block->len, 1U);
+	EXPECT_LBL(curr_block->instrs);
+
+	/* Second block is a comparison ("arg0 == 0"). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_COMP(curr_block);
+
+	/* Third block is a comparison ("arg1 == 0"). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_COMP(curr_block);
+
+	/* Fourth block is a jump and a label (end of AND group). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_GROUP_END(curr_block);
+
+	/* Fifth block is a comparison ("arg0 == 1"). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_COMP(curr_block);
+
+	/* Sixth block is a jump and a label (end of AND group). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_GROUP_END(curr_block);
+
+	/* Seventh block is SECCOMP_RET_KILL */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_KILL(curr_block);
+
+	/* Eighth block is "SUCCESS" label and SECCOMP_RET_ALLOW */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_ALLOW(curr_block);
+
+	EXPECT_EQ(curr_block->next, NULL);
+
+	free_block_list(block);
+	free_label_strings(&self->labels);
+}
+
+TEST_F(arg_filter, ret_errno) {
+	const char *fragment = "arg0 == 0 || arg0 == 1; return 1";
+	int nr = 1;
+	unsigned int id = 0;
+
+	struct filter_block *block =
+		compile_section(nr, fragment, id, &self->labels);
+	ASSERT_NE(block, NULL);
+	size_t exp_total_len = 1 + 2 * (BPF_ARG_COMP_LEN + 1) + 2 + 2 + 1 + 2;
+	EXPECT_EQ(block->total_len, exp_total_len);
+
+	/* First block is a label. */
+	struct filter_block *curr_block = block;
+	ASSERT_NE(curr_block, NULL);
+	EXPECT_EQ(block->len, 1U);
+	EXPECT_LBL(curr_block->instrs);
+
+	/* Second block is a comparison ("arg0 == 0"). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_COMP(curr_block);
+
+	/* Third block is a jump and a label (end of AND group). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_GROUP_END(curr_block);
+
+	/* Fourth block is a comparison ("arg0 == 1"). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_COMP(curr_block);
+
+	/* Fifth block is a jump and a label (end of AND group). */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_GROUP_END(curr_block);
+
+	/* Sixth block is SECCOMP_RET_ERRNO */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_EQ(curr_block->len, 1U);
+	EXPECT_EQ_STMT(curr_block->instrs,
+			BPF_RET+BPF_K,
+			SECCOMP_RET_ERRNO | (1 & SECCOMP_RET_DATA));
+
+	/* Seventh block is "SUCCESS" label and SECCOMP_RET_ALLOW */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_ALLOW(curr_block);
+
+	EXPECT_EQ(curr_block->next, NULL);
+
+	free_block_list(block);
+	free_label_strings(&self->labels);
+}
+
+TEST_F(arg_filter, unconditional_errno) {
+	const char *fragment = "return 1";
+	int nr = 1;
+	unsigned int id = 0;
+
+	struct filter_block *block =
+		compile_section(nr, fragment, id, &self->labels);
+	ASSERT_NE(block, NULL);
+	size_t exp_total_len = 2;
+	EXPECT_EQ(block->total_len, exp_total_len);
+
+	/* First block is a label. */
+	struct filter_block *curr_block = block;
+	ASSERT_NE(curr_block, NULL);
+	EXPECT_EQ(block->len, 1U);
+	EXPECT_LBL(curr_block->instrs);
+
+	/* Second block is SECCOMP_RET_ERRNO */
+	curr_block = curr_block->next;
+	EXPECT_NE(curr_block, NULL);
+	EXPECT_EQ(curr_block->len, 1U);
+	EXPECT_EQ_STMT(curr_block->instrs,
+			BPF_RET+BPF_K,
+			SECCOMP_RET_ERRNO | (1 & SECCOMP_RET_DATA));
+
+	EXPECT_EQ(curr_block->next, NULL);
+
+	free_block_list(block);
+	free_label_strings(&self->labels);
+}
+
+TEST_F(arg_filter, invalid) {
+	const char *fragment = "argnn == 0";
+	int nr = 1;
+	unsigned int id = 0;
+
+	struct filter_block *block =
+			compile_section(nr, fragment, id, &self->labels);
+	ASSERT_EQ(block, NULL);
+
+	fragment = "arg0 == 0 && arg1 == 1; return errno";
+	block = compile_section(nr, fragment, id, &self->labels);
+	ASSERT_EQ(block, NULL);
+}
+
+FIXTURE(filter) {};
+
+FIXTURE_SETUP(filter) {}
+FIXTURE_TEARDOWN(filter) {}
+
+TEST_F(filter, seccomp_mode1) {
+	struct sock_fprog actual;
+	FILE *policy = fopen("test/seccomp.policy", "r");
+	int res = compile_filter(policy, &actual, NO_LOGGING);
+
+	/*
+	 * Checks return value, filter length, and that the filter
+	 * validates arch, loads syscall number, and
+	 * only allows expected syscalls.
+	 */
+	ASSERT_EQ(res, 0);
+	EXPECT_EQ(actual.len, 13);
+	EXPECT_ARCH_VALIDATION(actual.filter);
+	EXPECT_EQ_STMT(actual.filter + ARCH_VALIDATION_LEN,
+			BPF_LD+BPF_W+BPF_ABS, syscall_nr);
+	EXPECT_ALLOW_SYSCALL(actual.filter + ARCH_VALIDATION_LEN + 1,
+			__NR_read);
+	EXPECT_ALLOW_SYSCALL(actual.filter + ARCH_VALIDATION_LEN + 3,
+			__NR_write);
+	EXPECT_ALLOW_SYSCALL(actual.filter + ARCH_VALIDATION_LEN + 5,
+			__NR_rt_sigreturn);
+	EXPECT_ALLOW_SYSCALL(actual.filter + ARCH_VALIDATION_LEN + 7,
+			__NR_exit);
+	EXPECT_EQ_STMT(actual.filter + ARCH_VALIDATION_LEN + 9, BPF_RET+BPF_K,
+			SECCOMP_RET_KILL);
+
+	free(actual.filter);
+	fclose(policy);
+}
+
+TEST_F(filter, seccomp_read_write) {
+	struct sock_fprog actual;
+	FILE *policy = fopen("test/stdin_stdout.policy", "r");
+	int res = compile_filter(policy, &actual, NO_LOGGING);
+
+	/*
+	 * Checks return value, filter length, and that the filter
+	 * validates arch, loads syscall number, and
+	 * only allows expected syscalls, jumping to correct arg filter
+	 * offsets.
+	 */
+	ASSERT_EQ(res, 0);
+	size_t exp_total_len = 27 + 3 * (BPF_ARG_COMP_LEN + 1);
+	EXPECT_EQ(actual.len, exp_total_len);
+
+	EXPECT_ARCH_VALIDATION(actual.filter);
+	EXPECT_EQ_STMT(actual.filter + ARCH_VALIDATION_LEN,
+			BPF_LD+BPF_W+BPF_ABS, syscall_nr);
+	EXPECT_ALLOW_SYSCALL_ARGS(actual.filter + ARCH_VALIDATION_LEN + 1,
+			__NR_read, 7, 0, 0);
+	EXPECT_ALLOW_SYSCALL_ARGS(actual.filter + ARCH_VALIDATION_LEN + 3,
+			__NR_write, 12 + BPF_ARG_COMP_LEN, 0, 0);
+	EXPECT_ALLOW_SYSCALL(actual.filter + ARCH_VALIDATION_LEN + 5,
+			__NR_rt_sigreturn);
+	EXPECT_ALLOW_SYSCALL(actual.filter + ARCH_VALIDATION_LEN + 7,
+			__NR_exit);
+	EXPECT_EQ_STMT(actual.filter + ARCH_VALIDATION_LEN + 9, BPF_RET+BPF_K,
+			SECCOMP_RET_KILL);
+
+	free(actual.filter);
+	fclose(policy);
+}
+
+TEST_F(filter, invalid) {
+	struct sock_fprog actual;
+	FILE *policy = fopen("test/invalid_syscall_name.policy", "r");
+	ASSERT_NE(policy, NULL);	/* else the test passes vacuously */
+	int res = compile_filter(policy, &actual, NO_LOGGING);
+	ASSERT_NE(res, 0);
+	fclose(policy);
+	policy = fopen("test/invalid_arg_filter.policy", "r");
+	ASSERT_NE(policy, NULL);	/* and fclose(NULL) is undefined */
+	res = compile_filter(policy, &actual, NO_LOGGING);
+	ASSERT_NE(res, 0);
+	fclose(policy);
+}
+
+TEST_F(filter, nonexistent) {
+	struct sock_fprog actual;
+
+	FILE *policy = fopen("test/nonexistent-file.policy", "r");
+	int res = compile_filter(policy, &actual, NO_LOGGING);
+	ASSERT_NE(res, 0);
+}
+
+TEST_F(filter, log) {
+	struct sock_fprog actual;
+
+	FILE *policy = fopen("test/seccomp.policy", "r");
+	int res = compile_filter(policy, &actual, USE_LOGGING);
+
+	size_t i;
+	size_t index = 0;
+	/*
+	 * Checks return value, filter length, and that the filter
+	 * validates arch, loads syscall number, only allows expected syscalls,
+	 * and returns TRAP on failure.
+	 * NOTE(jorgelo): the filter is longer since we add the syscalls needed
+	 * for logging.
+	 */
+	ASSERT_EQ(res, 0);
+	EXPECT_EQ(actual.len, 13 + 2 * log_syscalls_len);
+	EXPECT_ARCH_VALIDATION(actual.filter);
+	EXPECT_EQ_STMT(actual.filter + ARCH_VALIDATION_LEN,
+			BPF_LD+BPF_W+BPF_ABS, syscall_nr);
+
+	index = ARCH_VALIDATION_LEN + 1;
+	for (i = 0; i < log_syscalls_len; i++)
+		EXPECT_ALLOW_SYSCALL(actual.filter + (index + 2 * i),
+				     lookup_syscall(log_syscalls[i]));
+
+	index += 2 * log_syscalls_len;
+
+	EXPECT_ALLOW_SYSCALL(actual.filter + index, __NR_read);
+	EXPECT_ALLOW_SYSCALL(actual.filter + index + 2, __NR_write);
+	EXPECT_ALLOW_SYSCALL(actual.filter + index + 4, __NR_rt_sigreturn);
+	EXPECT_ALLOW_SYSCALL(actual.filter + index + 6, __NR_exit);
+	EXPECT_EQ_STMT(actual.filter + index + 8, BPF_RET+BPF_K,
+			SECCOMP_RET_TRAP);
+
+	free(actual.filter);
+	fclose(policy);
+}
+
+TEST_HARNESS_MAIN
diff --git a/test/invalid_arg_filter.policy b/test/invalid_arg_filter.policy
new file mode 100644
index 0000000..b79b31a
--- /dev/null
+++ b/test/invalid_arg_filter.policy
@@ -0,0 +1 @@
+open: argnn ==
diff --git a/test/invalid_syscall_name.policy b/test/invalid_syscall_name.policy
new file mode 100644
index 0000000..3e6d403
--- /dev/null
+++ b/test/invalid_syscall_name.policy
@@ -0,0 +1 @@
+notasyscall: 1
diff --git a/test/read_stdin b/test/read_stdin
new file mode 100755
index 0000000..29578a6
--- /dev/null
+++ b/test/read_stdin
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Test helper: exits 0 if the first line read from stdin is "test", else 1.
+# -r keeps backslashes literal, so input like 'te\st' is not accepted.
+read -r line
+if [ "$line" == "test" ]; then
+	exit 0
+fi
+exit 1
diff --git a/test/seccomp.policy b/test/seccomp.policy
new file mode 100644
index 0000000..b983631
--- /dev/null
+++ b/test/seccomp.policy
@@ -0,0 +1,4 @@
+read: 1
+write: 1
+rt_sigreturn: 1
+exit: 1
diff --git a/test/stdin_stdout.policy b/test/stdin_stdout.policy
new file mode 100644
index 0000000..874a5ca
--- /dev/null
+++ b/test/stdin_stdout.policy
@@ -0,0 +1,4 @@
+read: arg0 == 0
+write: arg0 == 1 || arg0 == 2
+rt_sigreturn: 1
+exit: 1
diff --git a/test_harness.h b/test_harness.h
new file mode 100644
index 0000000..9bca7f9
--- /dev/null
+++ b/test_harness.h
@@ -0,0 +1,438 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * test_harness.h: simple C unit test helper.
+ *
+ * Usage:
+ *   #include "test_harness.h"
+ *   TEST(standalone_test) {
+ *     do_some_stuff;
+ *     EXPECT_GT(10, stuff) {
+ *        stuff_state_t state;
+ *        enumerate_stuff_state(&state);
+ *        TH_LOG("expectation failed with state: %s", state.msg);
+ *     }
+ *     more_stuff;
+ *     ASSERT_NE(some_stuff, NULL) TH_LOG("how did it happen?!");
+ *     last_stuff;
+ *     EXPECT_EQ(0, last_stuff);
+ *   }
+ *
+ *   FIXTURE(my_fixture) {
+ *     mytype_t *data;
+ *     int awesomeness_level;
+ *   };
+ *   FIXTURE_SETUP(my_fixture) {
+ *     self->data = mytype_new();
+ *     ASSERT_NE(NULL, self->data);
+ *   }
+ *   FIXTURE_TEARDOWN(my_fixture) {
+ *     mytype_free(self->data);
+ *   }
+ *   TEST_F(my_fixture, data_is_good) {
+ *     EXPECT_EQ(1, is_my_data_good(self->data));
+ *   }
+ *
+ *   TEST_HARNESS_MAIN
+ *
+ * API inspired by code.google.com/p/googletest
+ */
+#ifndef TEST_HARNESS_H_
+#define TEST_HARNESS_H_
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* All exported functionality should be declared through this macro. */
+#define TEST_API(x) _##x
+
+/*
+ * Exported APIs
+ */
+
+/* TEST(name) { implementation }
+ * Defines a test by name.
+ * Names must be unique and tests must not be run in parallel.  The
+ * implementation containing block is a function and scoping should be treated
+ * as such.  Returning early may be performed with a bare "return;" statement.
+ *
+ * EXPECT_* and ASSERT_* are valid in a TEST() { } context.
+ */
+#define TEST TEST_API(TEST)
+
+/* FIXTURE(datatype name) {
+ *   type property1;
+ *   ...
+ * };
+ * Defines the data provided to TEST_F()-defined tests as |self|.  It should be
+ * populated and cleaned up using FIXTURE_SETUP and FIXTURE_TEARDOWN.
+ */
+#define FIXTURE TEST_API(FIXTURE)
+
+/* FIXTURE_DATA(datatype name)
+ * This call may be used when the type of the fixture data
+ * is needed.  In general, this should not be needed unless
+ * the |self| is being passed to a helper directly.
+ */
+#define FIXTURE_DATA TEST_API(FIXTURE_DATA)
+
+/* FIXTURE_SETUP(fixture name) { implementation }
+ * Populates the required "setup" function for a fixture.  An instance of the
+ * datatype defined with _FIXTURE_DATA will be exposed as |self| for the
+ * implementation.
+ *
+ * ASSERT_* are valid for use in this context and will preempt the execution
+ * of any dependent fixture tests.
+ *
+ * A bare "return;" statement may be used to return early.
+ */
+#define FIXTURE_SETUP TEST_API(FIXTURE_SETUP)
+
+/* FIXTURE_TEARDOWN(fixture name) { implementation }
+ * Populates the required "teardown" function for a fixture.  An instance of the
+ * datatype defined with _FIXTURE_DATA will be exposed as |self| for the
+ * implementation to clean up.
+ *
+ * A bare "return;" statement may be used to return early.
+ */
+#define FIXTURE_TEARDOWN TEST_API(FIXTURE_TEARDOWN)
+
+/* TEST_F(fixture, name) { implementation }
+ * Defines a test that depends on a fixture (e.g., is part of a test case).
+ * Very similar to TEST() except that |self| is the setup instance of fixture's
+ * datatype exposed for use by the implementation.
+ */
+#define TEST_F TEST_API(TEST_F)
+
+/* Use once to append a main() to the test file. E.g.,
+ *   TEST_HARNESS_MAIN
+ */
+#define TEST_HARNESS_MAIN TEST_API(TEST_HARNESS_MAIN)
+
+/*
+ * Operators for use in TEST and TEST_F.
+ * ASSERT_* calls will stop test execution immediately.
+ * EXPECT_* calls will emit a failure warning, note it, and continue.
+ */
+
+/* ASSERT_EQ(expected, measured): expected == measured */
+#define ASSERT_EQ TEST_API(ASSERT_EQ)
+/* ASSERT_NE(expected, measured): expected != measured */
+#define ASSERT_NE TEST_API(ASSERT_NE)
+/* ASSERT_LT(expected, measured): expected < measured */
+#define ASSERT_LT TEST_API(ASSERT_LT)
+/* ASSERT_LE(expected, measured): expected <= measured */
+#define ASSERT_LE TEST_API(ASSERT_LE)
+/* ASSERT_GT(expected, measured): expected > measured */
+#define ASSERT_GT TEST_API(ASSERT_GT)
+/* ASSERT_GE(expected, measured): expected >= measured */
+#define ASSERT_GE TEST_API(ASSERT_GE)
+/* ASSERT_NULL(measured): NULL == measured */
+#define ASSERT_NULL TEST_API(ASSERT_NULL)
+/* ASSERT_TRUE(measured): measured != 0 */
+#define ASSERT_TRUE TEST_API(ASSERT_TRUE)
+/* ASSERT_FALSE(measured): measured == 0 */
+#define ASSERT_FALSE TEST_API(ASSERT_FALSE)
+/* ASSERT_STREQ(expected, measured): !strcmp(expected, measured) */
+#define ASSERT_STREQ TEST_API(ASSERT_STREQ)
+/* ASSERT_STRNE(expected, measured): strcmp(expected, measured) */
+#define ASSERT_STRNE TEST_API(ASSERT_STRNE)
+/* EXPECT_EQ(expected, measured): expected == measured */
+#define EXPECT_EQ TEST_API(EXPECT_EQ)
+/* EXPECT_NE(expected, measured): expected != measured */
+#define EXPECT_NE TEST_API(EXPECT_NE)
+/* EXPECT_LT(expected, measured): expected < measured */
+#define EXPECT_LT TEST_API(EXPECT_LT)
+/* EXPECT_LE(expected, measured): expected <= measured */
+#define EXPECT_LE TEST_API(EXPECT_LE)
+/* EXPECT_GT(expected, measured): expected > measured */
+#define EXPECT_GT TEST_API(EXPECT_GT)
+/* EXPECT_GE(expected, measured): expected >= measured */
+#define EXPECT_GE TEST_API(EXPECT_GE)
+/* EXPECT_NULL(measured): NULL == measured */
+#define EXPECT_NULL TEST_API(EXPECT_NULL)
+/* EXPECT_TRUE(measured): 0 != measured */
+#define EXPECT_TRUE TEST_API(EXPECT_TRUE)
+/* EXPECT_FALSE(measured): 0 == measured */
+#define EXPECT_FALSE TEST_API(EXPECT_FALSE)
+/* EXPECT_STREQ(expected, measured): !strcmp(expected, measured) */
+#define EXPECT_STREQ TEST_API(EXPECT_STREQ)
+/* EXPECT_STRNE(expected, measured): strcmp(expected, measured) */
+#define EXPECT_STRNE TEST_API(EXPECT_STRNE)
+
+/* TH_LOG(format, ...)
+ * Optional debug logging function available for use in tests.
+ * Logging may be enabled or disabled by defining TH_LOG_ENABLED.
+ * E.g., #define TH_LOG_ENABLED 1
+ * If no definition is provided, logging is enabled by default.
+ */
+#define TH_LOG  TEST_API(TH_LOG)
+
+/*
+ * Internal implementation.
+ *
+ */
+
+/* Utilities exposed to the test definitions */
+#ifndef TH_LOG_STREAM
+#  define TH_LOG_STREAM stderr
+#endif
+
+#ifndef TH_LOG_ENABLED
+#  define TH_LOG_ENABLED 1
+#endif
+
+#define _TH_LOG(fmt, ...) do { \
+  if (TH_LOG_ENABLED) \
+    __TH_LOG(fmt, ##__VA_ARGS__); \
+} while (0)
+
+/* Unconditional logger for internal use. */
+#define __TH_LOG(fmt, ...) \
+    fprintf(TH_LOG_STREAM, "%s:%d:%s:" fmt "\n", \
+            __FILE__, __LINE__, _metadata->name, ##__VA_ARGS__)
+
+/* Defines the test function and creates the registration stub. */
+#define _TEST(test_name) \
+  static void test_name(struct __test_metadata *_metadata); \
+  static struct __test_metadata _##test_name##_object = \
+    { .name= "global." #test_name, .fn= &test_name }; \
+  static void __attribute__((constructor)) _register_##test_name(void) { \
+    __register_test(&_##test_name##_object); \
+  } \
+  static void test_name( \
+    struct __test_metadata __attribute__((unused)) *_metadata)
+
+/* Wraps the struct name so we have one less argument to pass around. */
+#define _FIXTURE_DATA(fixture_name) struct _test_data_##fixture_name
+
+/* Called once per fixture to setup the data and register. */
+#define _FIXTURE(fixture_name) \
+  static void __attribute__((constructor)) \
+      _register_##fixture_name##_data(void) { \
+    __fixture_count++; \
+  } \
+  _FIXTURE_DATA(fixture_name)
+
+/* Prepares the setup function for the fixture.  |_metadata| is included
+ * so that ASSERT_* work as a convenience.
+ */
+#define _FIXTURE_SETUP(fixture_name) \
+  void fixture_name##_setup( \
+    struct __test_metadata __attribute__((unused)) *_metadata, \
+    _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+#define _FIXTURE_TEARDOWN(fixture_name) \
+  void fixture_name##_teardown( \
+    struct __test_metadata __attribute__((unused)) *_metadata, \
+    _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+
+/* Emits test registration and helpers for fixture-based test
+ * cases.
+ * TODO(wad) register fixtures on dedicated test lists.
+ */
+#define _TEST_F(fixture_name, test_name) \
+  static void fixture_name##_##test_name( \
+    struct __test_metadata *_metadata, \
+    _FIXTURE_DATA(fixture_name) *self); \
+  static inline void wrapper_##fixture_name##_##test_name( \
+    struct __test_metadata *_metadata) { \
+    /* fixture data is allocated, setup, and torn down per call. */ \
+    _FIXTURE_DATA(fixture_name) self; \
+    memset(&self, 0, sizeof(_FIXTURE_DATA(fixture_name))); \
+    fixture_name##_setup(_metadata, &self); \
+    /* Let setup failure terminate early. */ \
+    if (!_metadata->passed) return; \
+    fixture_name##_##test_name(_metadata, &self); \
+    fixture_name##_teardown(_metadata, &self); \
+  } \
+  static struct __test_metadata _##fixture_name##_##test_name##_object = { \
+    .name= #fixture_name "." #test_name, \
+    .fn= &wrapper_##fixture_name##_##test_name, \
+   }; \
+  static void __attribute__((constructor)) \
+      _register_##fixture_name##_##test_name(void) { \
+    __register_test(&_##fixture_name##_##test_name##_object); \
+  } \
+  static void fixture_name##_##test_name( \
+    struct __test_metadata __attribute__((unused)) *_metadata, \
+    _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+
+/* Exports a simple wrapper to run the test harness. */
+#define _TEST_HARNESS_MAIN \
+  int main(int argc, char **argv) { return test_harness_run(argc, argv); }
+
+#define _ASSERT_EQ(_expected, _seen) \
+  __EXPECT(_expected, _seen, ==, 1)
+#define _ASSERT_NE(_expected, _seen) \
+  __EXPECT(_expected, _seen, !=, 1)
+#define _ASSERT_LT(_expected, _seen) \
+  __EXPECT(_expected, _seen, <, 1)
+#define _ASSERT_LE(_expected, _seen) \
+  __EXPECT(_expected, _seen, <=, 1)
+#define _ASSERT_GT(_expected, _seen) \
+  __EXPECT(_expected, _seen, >, 1)
+#define _ASSERT_GE(_expected, _seen) \
+  __EXPECT(_expected, _seen, >=, 1)
+#define _ASSERT_NULL(_seen) \
+  __EXPECT(NULL, _seen, ==, 1)
+
+#define _ASSERT_TRUE(_seen) \
+  _ASSERT_NE(0, _seen)
+#define _ASSERT_FALSE(_seen) \
+  _ASSERT_EQ(0, _seen)
+#define _ASSERT_STREQ(_expected, _seen) \
+  __EXPECT_STR(_expected, _seen, ==, 1)
+#define _ASSERT_STRNE(_expected, _seen) \
+  __EXPECT_STR(_expected, _seen, !=, 1)
+
+#define _EXPECT_EQ(_expected, _seen) \
+  __EXPECT(_expected, _seen, ==, 0)
+#define _EXPECT_NE(_expected, _seen) \
+  __EXPECT(_expected, _seen, !=, 0)
+#define _EXPECT_LT(_expected, _seen) \
+  __EXPECT(_expected, _seen, <, 0)
+#define _EXPECT_LE(_expected, _seen) \
+  __EXPECT(_expected, _seen, <=, 0)
+#define _EXPECT_GT(_expected, _seen) \
+  __EXPECT(_expected, _seen, >, 0)
+#define _EXPECT_GE(_expected, _seen) \
+  __EXPECT(_expected, _seen, >=, 0)
+
+#define _EXPECT_NULL(_seen) \
+  __EXPECT(NULL, _seen, ==, 0)
+#define _EXPECT_TRUE(_seen) \
+  _EXPECT_NE(0, _seen)
+#define _EXPECT_FALSE(_seen) \
+  _EXPECT_EQ(0, _seen)
+
+#define _EXPECT_STREQ(_expected, _seen) \
+  __EXPECT_STR(_expected, _seen, ==, 0)
+#define _EXPECT_STRNE(_expected, _seen) \
+  __EXPECT_STR(_expected, _seen, !=, 0)
+
+/* Support an optional handler after an ASSERT_* or EXPECT_*.  The approach is
+ * not thread-safe, but it should be fine in most sane test scenarios.
+ *
+ * Using __bail(), which optionally abort()s, is the easiest way to early
+ * return while still providing an optional block to the API consumer.
+ */
+#define OPTIONAL_HANDLER(_assert) \
+  for (; _metadata->trigger;  _metadata->trigger = __bail(_assert))
+
+#define __EXPECT(_expected, _seen, _t, _assert) do { \
+  /* Core of the ASSERT_ and EXPECT_ comparisons; operands evaluated once. */ \
+  __typeof__(_expected) __exp = (_expected); \
+  __typeof__(_seen) __seen = (_seen); \
+  if (!(__exp _t __seen)) { \
+    unsigned long long __exp_print = 0, __seen_print = 0; \
+    /* Print raw bytes, clamped so wide operands cannot overflow. */ \
+    memcpy(&__exp_print, &__exp, sizeof(__exp) < sizeof(__exp_print) ? \
+           sizeof(__exp) : sizeof(__exp_print)); \
+    memcpy(&__seen_print, &__seen, sizeof(__seen) < sizeof(__seen_print) ? \
+           sizeof(__seen) : sizeof(__seen_print)); \
+    __TH_LOG("Expected %s (%llu) %s %s (%llu)", \
+            #_expected, __exp_print, #_t, #_seen, __seen_print); \
+    _metadata->passed = 0; \
+    /* Ensure the optional handler is triggered */ \
+    _metadata->trigger = 1; \
+  } \
+} while (0); OPTIONAL_HANDLER(_assert)
+
+#define __EXPECT_STR(_expected, _seen, _t, _assert) do { \
+  const char *__exp = (_expected);  /* each operand is evaluated once */ \
+  const char *__seen = (_seen); \
+  if (!(strcmp(__exp, __seen) _t 0))  {  /* strcmp() is not NULL-guarded */ \
+    __TH_LOG("Expected '%s' %s '%s'.", __exp, #_t, __seen); \
+    _metadata->passed = 0;  /* record the failure for the harness */ \
+    _metadata->trigger = 1;  /* arm the optional handler that follows */ \
+  } \
+} while (0); OPTIONAL_HANDLER(_assert)
+
+/* Contains all the information for test execution and status checking. */
+struct __test_metadata {
+  const char *name;  /* "fixture.test", or "global.test" for a plain TEST() */
+  void (*fn)(struct __test_metadata *);  /* entry point the harness calls */
+  int passed;  /* set to 1 before the run; a failed check clears it to 0 */
+  int trigger; /* extra handler after the evaluation */
+  struct __test_metadata *prev, *next;  /* next ends in NULL; head's prev is the tail */
+};
+
+/* Storage for the (global) tests to be run. */
+static struct __test_metadata *__test_list = NULL;
+static unsigned int __test_count = 0;
+static unsigned int __fixture_count = 0;
+
+static inline void __register_test(struct __test_metadata *t) {
+  /* Appends |t| at the tail; only prev wraps around (head->prev is the tail). */
+  struct __test_metadata *head = __test_list;
+  __test_count++;
+  t->next = NULL;
+  if (head == NULL) {
+    __test_list = t;
+    t->prev = t;  /* A lone node is its own tail. */
+    return;
+  }
+  t->prev = head->prev;
+  head->prev->next = t;
+  head->prev = t;
+}
+
+static inline int __bail(int for_realz) {
+  if (!for_realz)
+    return 0;  /* Lets the OPTIONAL_HANDLER loop clear the trigger. */
+  abort();     /* Failed ASSERT_ check: end this test process right here. */
+}
+
+static int test_harness_run(int __attribute__((unused)) argc,
+                            char __attribute__((unused)) **argv) {
+  /* Forks one child per registered test; returns 0 iff every test passed. */
+  struct __test_metadata *t;
+  int ret = 0;
+  unsigned int count = 0;
+  /* TODO(wad) add optional arguments similar to gtest. */
+  printf("[==========] Running %u tests from %u test cases.\n",
+          __test_count, __fixture_count + 1);
+  for (t = __test_list; t; t = t->next) {
+    pid_t child_pid;
+    int status = 0;
+    count++;
+    t->passed = 1;
+    t->trigger = 0;
+    printf("[ RUN      ] %s\n", t->name);
+    /* TODO(wad) add timeout support. */
+    child_pid = fork();
+    if (child_pid < 0) {
+      printf("ERROR SPAWNING TEST CHILD\n");
+      t->passed = 0;
+    } else if (child_pid == 0) {
+      t->fn(t);
+      _exit(t->passed);  /* The exit status is the verdict: 1 = passed. */
+    } else if (waitpid(child_pid, &status, 0) != child_pid) {
+      /* No status was collected, so the test must not count as passed. */
+      printf("ERROR WAITING FOR TEST CHILD\n");
+      t->passed = 0;
+    } else if (WIFSIGNALED(status)) {
+      t->passed = 0;
+      fprintf(TH_LOG_STREAM,
+              "%s: Test terminated unexpectedly by signal %d\n",
+              t->name, WTERMSIG(status));
+    } else if (WIFEXITED(status)) {
+      t->passed = WEXITSTATUS(status);
+    }
+    printf("[     %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name);
+    if (!t->passed)
+      ret = 1;
+  }
+  /* TODO(wad) organize by fixtures since ordering is not guaranteed now. */
+  printf("[==========] %u tests ran.\n", count);
+  printf("[  %s  ]\n", (ret ? "FAILED" : "PASSED"));
+  return ret;
+}
+
+#endif  /* TEST_HARNESS_H_ */
diff --git a/util.c b/util.c
new file mode 100644
index 0000000..f05bf07
--- /dev/null
+++ b/util.c
@@ -0,0 +1,119 @@
+/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <ctype.h>
+#include <string.h>
+
+#include "util.h"
+
+#include "libsyscalls.h"
+
+/*
+ * These are syscalls used by the syslog() C library call.  You can find them
+ * by running a simple test program.  See below for x86_64 behavior:
+ * $ cat test.c
+ * main() { syslog(0, "foo"); }
+ * $ gcc test.c -static
+ * $ strace ./a.out
+ * ...
+ * socket(PF_FILE, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 3 <- look for socket connection
+ * connect(...)                                    <- important
+ * sendto(...)                                     <- important
+ * exit_group(0)                                   <- finish!
+ */
+#if defined(__x86_64__)
+const char *log_syscalls[] = { "connect", "sendto" };
+#elif defined(__i386__)
+const char *log_syscalls[] = { "socketcall", "time" };
+#elif defined(__arm__)
+const char *log_syscalls[] = { "connect", "gettimeofday", "send" };
+#elif defined(__powerpc__) || defined(__ia64__) || defined(__hppa__) || \
+      defined(__sparc__) || defined(__mips__)
+const char *log_syscalls[] = { "connect", "send" };
+#else
+#error "Unsupported platform"
+#endif
+
+const size_t log_syscalls_len = sizeof(log_syscalls)/sizeof(log_syscalls[0]);
+
+int lookup_syscall(const char *name)
+{
+	const struct syscall_entry *e;	/* cursor into syscall_table */
+	for (e = syscall_table; e->name != NULL && e->nr >= 0; e++)
+		if (strcmp(e->name, name) == 0)
+			return e->nr;
+	return -1;	/* terminator reached: |name| is not in the table */
+}
+
+const char *lookup_syscall_name(int nr)
+{
+	const struct syscall_entry *e;	/* cursor into syscall_table */
+	for (e = syscall_table; e->name != NULL && e->nr >= 0; e++)
+		if (nr == e->nr)
+			return e->name;
+	return NULL;	/* terminator reached: no entry has number |nr| */
+}
+
+/* Strips leading blanks and trailing blanks/newlines from |s|, in place. */
+char *strip(char *s)
+{
+	while (isblank((unsigned char)*s))
+		s++;
+	char *end = s + strlen(s);	/* one past the text; never below |s| */
+	while (end > s && (isblank((unsigned char)end[-1]) || end[-1] == '\n'))
+		end--;
+	*end = '\0';
+	return s;
+}
+
+char *tokenize(char **stringp, const char *delim)
+{
+	/*
+	 * Returns the next non-empty token of |*stringp| and advances
+	 * |*stringp| past it.  Tokens are separated by the whole string
+	 * |delim| (strstr() match), and the buffer is modified in place.
+	 * Returns NULL when no token is left.
+	 */
+	if (stringp == NULL || *stringp == NULL || **stringp == '\0')
+		return NULL;
+
+	char *cursor = *stringp;
+
+	/* Without a usable delimiter the whole string is the only token. */
+	if (delim == NULL || *delim == '\0') {
+		*stringp = NULL;
+		return cursor;
+	}
+
+	const size_t delim_len = strlen(delim);
+	for (; *cursor != '\0'; cursor += delim_len) {
+		char *hit = strstr(cursor, delim);
+
+		if (hit == NULL) {
+			/* No delimiter ahead: the rest is the last token. */
+			*stringp = NULL;
+			return cursor;
+		}
+
+		if (hit != cursor) {
+			/*
+			 * Cut the token off at the delimiter and resume
+			 * right behind it on the next call.
+			 */
+			*hit = '\0';
+			*stringp = hit + delim_len;
+			return cursor;
+		}
+
+		/* Delimiter at the very start: the loop step skips it. */
+	}
+
+	/*
+	 * Only delimiters were left.  Leave |*stringp| at the terminating
+	 * NUL so that later calls return NULL as well.
+	 */
+	*stringp = cursor;
+	return NULL;
+}
diff --git a/util.h b/util.h
new file mode 100644
index 0000000..d07e5b1
--- /dev/null
+++ b/util.h
@@ -0,0 +1,37 @@
+/* util.h
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Logging and other utility functions.
+ */
+
+#ifndef _UTIL_H_
+#define _UTIL_H_
+
+#include <stdlib.h>
+#include <syslog.h>
+
+#define die(_msg, ...) do { \
+	syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \
+	abort(); \
+} while (0)
+
+#define pdie(_msg, ...) \
+	die(_msg ": %m", ## __VA_ARGS__)
+
+#define warn(_msg, ...) \
+	syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
+
+#define info(_msg, ...) \
+	syslog(LOG_INFO, "libminijail: " _msg, ## __VA_ARGS__)
+
+extern const char *log_syscalls[];
+extern const size_t log_syscalls_len;
+
+int lookup_syscall(const char *name); /* syscall number, or -1 if unknown */
+const char *lookup_syscall_name(int nr); /* syscall name, or NULL if unknown */
+char *strip(char *s); /* trims blanks and trailing newlines, in place */
+char *tokenize(char **stringp, const char *delim); /* next non-empty token */
+
+#endif /* _UTIL_H_ */