__alloc_pages_slowpath: Give up timeslice if under memory pressure.

If we aren't able to get any memory despite going through the slowpath once,
put the current task to sleep for a bit over a second before retrying.
Previously, the kernel would just call cond_resched() in a few places, but
that doesn't work well if the current task has realtime priority: it keeps
the CPU and prevents other critical tasks like kswapd, nfsiod, or mtdblock##
from running.  Those tasks, in turn, might need to run in order to actually
free up memory, which is what the __alloc_pages_slowpath() loop is waiting
for in the first place.
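
For illustration only (this is not part of the change), here is a minimal
userspace sketch of the same starvation pattern: a SCHED_FIFO thread that
only ever yields keeps a SCHED_OTHER thread (standing in for kswapd) off the
CPU both are pinned to.  The thread and variable names are made up for the
example, and setting SCHED_FIFO needs root or CAP_SYS_NICE:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile int helper_ran;		/* set by the kswapd stand-in */

static void *normal_thread(void *arg)	/* SCHED_OTHER, pinned to CPU 0 */
{
	(void)arg;
	helper_ran = 1;
	return NULL;
}

static void *rt_spinner(void *arg)	/* SCHED_FIFO, pinned to CPU 0 */
{
	int i;

	(void)arg;
	/* Like the allocator loop: yield repeatedly, but never sleep. */
	for (i = 0; i < 5000000 && !helper_ran; i++)
		sched_yield();
	printf("helper %s while the RT thread was spinning\n",
	       helper_ran ? "ran" : "never ran");
	return NULL;
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	pthread_t rt, normal;
	pthread_attr_t attr;
	cpu_set_t cpu0;
	int err;

	CPU_ZERO(&cpu0);
	CPU_SET(0, &cpu0);

	pthread_attr_init(&attr);
	pthread_attr_setaffinity_np(&attr, sizeof(cpu0), &cpu0);
	pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
	pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
	pthread_attr_setschedparam(&attr, &sp);
	err = pthread_create(&rt, &attr, rt_spinner, NULL);
	if (err) {
		fprintf(stderr, "SCHED_FIFO: %s\n", strerror(err));
		return 1;
	}

	usleep(100000);		/* let the RT thread start spinning first */

	pthread_attr_init(&attr);	/* back to SCHED_OTHER defaults */
	pthread_attr_setaffinity_np(&attr, sizeof(cpu0), &cpu0);
	pthread_create(&normal, &attr, normal_thread, NULL);

	pthread_join(rt, NULL);
	pthread_join(normal, NULL);
	return 0;
}

On a machine with more than one CPU this prints "helper never ran ...":
yielding alone never lets the lower-priority thread in, which is exactly
what happens to kswapd when a realtime task spins in the allocator.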

This fix is probably not ideal (it slows down recovery in the "lucky" cases
where a single pass through the loop is enough to clear things up), but it
seems relatively harmless because this code path is so rarely taken, and it
definitely beats hanging the whole kernel.

This condition triggered very frequently when the OOM killer kicked in, but
note that the OOM killer is not actually necessary to cause the problem: it
can occur any time the kernel has reclaimed pages from a realtime process
and needs to swap them back in.

I added printk()s to both cases and confirmed that neither of them shows up
(at least on a TV box) under normal conditions.

b/8522311 b/7885817

Change-Id: I335cbc5dd3bba2f9bc9d6bc2a955ebebfc025bdc
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8e4ed88..7fa9f39 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -6,6 +6,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mm.h>
+#include <linux/ratelimit.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/writeback.h>
@@ -802,6 +803,7 @@
  */
 long wait_iff_congested(struct zone *zone, int sync, long timeout)
 {
+	static DEFINE_RATELIMIT_STATE(rs, HZ * 10, 50);
 	long ret;
 	unsigned long start = jiffies;
 	DEFINE_WAIT(wait);
@@ -812,6 +814,7 @@
 	 * encountered in the current zone, yield if necessary instead
 	 * of sleeping on the congestion queue
 	 */
+#if 0
 	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
 			!zone_is_reclaim_congested(zone)) {
 		cond_resched();
@@ -823,8 +826,11 @@
 
 		goto out;
 	}
-
+#endif
 	/* Sleep until uncongested or a write happens */
+	if (__ratelimit(&rs))
+		printk(KERN_INFO "Congested: pid %d (%s) sleeping.\n",
+			task_pid_nr(current), current->comm);
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 92e89a0..14c2bbf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <linux/oom.h>
 #include <linux/notifier.h>
+#include <linux/ratelimit.h>
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
@@ -1992,6 +1993,7 @@
 	nodemask_t *nodemask, struct zone *preferred_zone,
 	int migratetype)
 {
+	static DEFINE_RATELIMIT_STATE(rs, HZ * 10, 50);
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	struct page *page = NULL;
 	int alloc_flags;
@@ -2111,6 +2113,13 @@
 					goto nopage;
 			}
 
+			/* If we're a realtime task, we need to share. */
+			if (__ratelimit(&rs))
+				printk(KERN_INFO "Mem pressure: pid %d (%s) sleeping.\n",
+					task_pid_nr(current), current->comm);
+			struct timespec ts = { 1, 50*1000000 };	/* 1 s + 50 ms */
+			hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL,
+						CLOCK_MONOTONIC);
 			goto restart;
 		}
 	}
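
For reference, the timespec above requests a 1.05 second back-off (1 s plus
50*1000000 ns).  A rough equivalent, sketched here only to show the intent
and not part of the patch, is an uninterruptible msleep(); the
hrtimer_nanosleep() call used above differs mainly in that it can be
interrupted by signals:

#include <linux/delay.h>

/*
 * Hypothetical helper, sketch only: back off long enough for
 * lower-priority reclaim threads (kswapd and friends) to run.
 */
static void mem_pressure_backoff(void)
{
	msleep(1050);	/* 1 s + 50 ms, matching the timespec above */
}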