__alloc_pages_slowpath: Give up timeslice if under memory pressure.

If we aren't able to get any memory despite going through the slowpath once,
put the current task to sleep for a bit over a second before retrying.
Previously, the kernel would just call cond_resched() in a few places, but
that doesn't work well if the current task has realtime priority: it keeps
the CPU and prevents other critical tasks like kswapd, nfsiod, or mtdblock##
from running.  Those tasks, in turn, might need to run in order to actually
free up memory, which is what the __alloc_pages_slowpath() loop is waiting
for in the first place.
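
For illustration only (this is not part of the change), here is a minimal
userspace sketch of the same starvation pattern: a SCHED_FIFO thread that
only ever yields keeps a SCHED_OTHER thread (standing in for kswapd) off the
CPU both are pinned to.  The thread and variable names are made up for the
example, and setting SCHED_FIFO needs root or CAP_SYS_NICE:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile int helper_ran;		/* set by the kswapd stand-in */

static void *normal_thread(void *arg)	/* SCHED_OTHER, pinned to CPU 0 */
{
	(void)arg;
	helper_ran = 1;
	return NULL;
}

static void *rt_spinner(void *arg)	/* SCHED_FIFO, pinned to CPU 0 */
{
	int i;

	(void)arg;
	/* Like the allocator loop: yield repeatedly, but never sleep. */
	for (i = 0; i < 5000000 && !helper_ran; i++)
		sched_yield();
	printf("helper %s while the RT thread was spinning\n",
	       helper_ran ? "ran" : "never ran");
	return NULL;
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	pthread_t rt, normal;
	pthread_attr_t attr;
	cpu_set_t cpu0;
	int err;

	CPU_ZERO(&cpu0);
	CPU_SET(0, &cpu0);

	pthread_attr_init(&attr);
	pthread_attr_setaffinity_np(&attr, sizeof(cpu0), &cpu0);
	pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
	pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
	pthread_attr_setschedparam(&attr, &sp);
	err = pthread_create(&rt, &attr, rt_spinner, NULL);
	if (err) {
		fprintf(stderr, "SCHED_FIFO: %s\n", strerror(err));
		return 1;
	}

	usleep(100000);		/* let the RT thread start spinning first */

	pthread_attr_init(&attr);	/* back to SCHED_OTHER defaults */
	pthread_attr_setaffinity_np(&attr, sizeof(cpu0), &cpu0);
	pthread_create(&normal, &attr, normal_thread, NULL);

	pthread_join(rt, NULL);
	pthread_join(normal, NULL);
	return 0;
}

On a machine with more than one CPU this prints "helper never ran ...":
yielding alone never lets the lower-priority thread in, which is exactly
what happens to kswapd when a realtime task spins in the allocator.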

This fix is probably not ideal (it slows down recovery in the "lucky" cases
where a single pass through the loop is enough to clear things up), but it
seems relatively harmless because this code path is so rarely taken, and it
definitely beats hanging the whole kernel.

This condition triggered very frequently when the OOM killer kicked in, but
note that the OOM killer is not actually necessary to cause the problem: it
can occur any time the kernel has reclaimed pages from a realtime process
and needs to swap them back in.

I added printk()s to both cases and confirmed that neither of them shows up
(at least on a TV box) under normal conditions.

b/8522311 b/7885817

Change-Id: I335cbc5dd3bba2f9bc9d6bc2a955ebebfc025bdc
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8e4ed88..7fa9f39 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -6,6 +6,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mm.h>
+#include <linux/ratelimit.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/writeback.h>
@@ -802,6 +803,7 @@
  */
 long wait_iff_congested(struct zone *zone, int sync, long timeout)
 {
+	static DEFINE_RATELIMIT_STATE(rs, HZ * 10, 50);
 	long ret;
 	unsigned long start = jiffies;
 	DEFINE_WAIT(wait);
@@ -812,6 +814,7 @@
 	 * encountered in the current zone, yield if necessary instead
 	 * of sleeping on the congestion queue
 	 */
+#if 0
 	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
 			!zone_is_reclaim_congested(zone)) {
 		cond_resched();
@@ -823,8 +826,11 @@
 
 		goto out;
 	}
-
+#endif
 	/* Sleep until uncongested or a write happens */
+	if (__ratelimit(&rs))
+		printk(KERN_INFO "Congested: pid %d (%s) sleeping.\n",
+			task_pid_nr(current), current->comm);
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 92e89a0..14c2bbf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <linux/oom.h>
 #include <linux/notifier.h>
+#include <linux/ratelimit.h>
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
@@ -1992,6 +1993,7 @@
 	nodemask_t *nodemask, struct zone *preferred_zone,
 	int migratetype)
 {
+	static DEFINE_RATELIMIT_STATE(rs, HZ * 10, 50);
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	struct page *page = NULL;
 	int alloc_flags;
@@ -2111,6 +2113,13 @@
 					goto nopage;
 			}
 
+			/* If we're a realtime task, we need to share. */
+			if (__ratelimit(&rs))
+				printk(KERN_INFO "Mem pressure: pid %d (%s) sleeping.\n",
+					task_pid_nr(current), current->comm);
+			struct timespec ts = { 1, 50*1000000 };	/* 1 s + 50 ms */
+			hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL,
+						CLOCK_MONOTONIC);
 			goto restart;
 		}
 	}
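
For reference, the timespec above requests a 1.05 second back-off (1 s plus
50*1000000 ns).  A rough equivalent, sketched here only to show the intent
and not part of the patch, is an uninterruptible msleep(); the
hrtimer_nanosleep() call used above differs mainly in that it can be
interrupted by signals:

#include <linux/delay.h>

/*
 * Hypothetical helper, sketch only: back off long enough for
 * lower-priority reclaim threads (kswapd and friends) to run.
 */
static void mem_pressure_backoff(void)
{
	msleep(1050);	/* 1 s + 50 ms, matching the timespec above */
}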