Track bits corrected per subpage.

Note for future kernel updates: upstream Linux handles this
differently, by reading each subpage individually and tracking
the corrected bits of each subpage as it is read. Because upstream
already covers this case, this patch should not need to be ported
to a later kernel version.

Change-Id: If7f1be1162ea5e9b0b04faf9855a5b464e5bdf62
diff --git a/drivers/mtd/nand/comcerto_nand.c b/drivers/mtd/nand/comcerto_nand.c
index 4904789..d5ca078 100644
--- a/drivers/mtd/nand/comcerto_nand.c
+++ b/drivers/mtd/nand/comcerto_nand.c
@@ -422,6 +422,7 @@
 	uint8_t *oob = nand_device->oob_poi;
 
 	for (; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+
 		chip->ecc.hwctl(mtd, NAND_ECC_READ);
 		chip->read_buf(mtd, p, eccsize);
 		chip->read_buf(mtd, ecc_code, ecc_bytes);
@@ -429,8 +430,15 @@
 		stat = chip->ecc.correct(mtd, p, oob, NULL);
 		if (stat < 0)
 			mtd->ecc_stats.failed++;
-		else
+		else {
+			int idx = eccsteps - 1;	/* eccsteps runs N..1 -> slots N-1..0 */
+			if (idx >= MTD_ECC_STAT_SUBPAGES) {
+				idx = MTD_ECC_STAT_SUBPAGES - 1;	/* overflow steps share the last slot */
+			}
+
 			mtd->ecc_stats.corrected += stat;
+			mtd->ecc_subpage_stats.subpage_corrected[idx] += stat;
+		}
 
 		comcerto_ecc_shift(ECC_SHIFT_DISABLE);
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 137e578..1b3d432 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -1422,46 +1422,73 @@
 }
 
 /*
- * NOTE(apenwarr): Newer kernels do this much better.
- *  Among other things, they report a max_flips value that's the largest
- *  number of flips in any 1024-byte ECC calculation, as opposed to the total
- *  flips in the whole 4096-byte page.  The latter is dangerous because
- *  you could see 24 flips in a single 1024-byte region, which is the edge
- *  of disaster, even though it's only 1/4 of the maximum 96 flips we could
- *  handle if averaged across 4 pages.  So where we'd like to set a threshold
- *  per 1024-byte region, we instead have to set a threshold per
- *  4096-byte region that *still* must be well under 24.
+ * NOTE(dgentry): Newer kernels do this in a different, and much better, way.
+ *  The upstream mtd APIs to NAND drivers know about subpages and allow errors
+ *  to be reported on a per-subpage level.
+ *
+ *  Here, we judge errors in two ways:
+ *  1. If the underlying NAND driver reported errors per sub-page
+ *     via mtd_ecc_subpage_stats, we check that the number of corrected
+ *     bits is within a safe distance from the maximum number of bits
+ *     we can correct. At the time of this writing only comcerto_nand.c
+ *     reports per-subpage errors.
+ *  2. We check the number of bits corrected on the entire page. For
+ *     example, we might allow up to 72 bits to be corrected on a 4096
+ *     byte page. This is dangerous because there is a big difference between
+ *     having 18 bits corrected on each 1024 byte sub-page versus having
+ *     72 bits corrected all on one subpage.
+ *     Nonetheless if the NAND driver only reports stats using struct
+ *     mtd_ecc_stats, this is the best we can do.
  *
  *  Anyway, this code can go away someday when we use a newer kernel.
  */
 static int unclean_if_too_many_flips(struct mtd_info *mtd,
-		struct mtd_ecc_stats *stats) {
+		struct mtd_ecc_stats *stats,
+		struct mtd_ecc_subpage_stats *subpage_stats) {
 	uint32_t flips = mtd->ecc_stats.corrected - stats->corrected;
-	uint32_t threshold;
+	uint32_t threshold, subpage_threshold;
+	int i, rc = 0;
+
 	switch (mtd->oobsize) {
 	case 8:
 	case 16:
 	case 64:
 		threshold = 0;
+		subpage_threshold = 0;
 		break;
 	case 128:
 		threshold = 4;
+		subpage_threshold = 2;
 		break;
 	case 224:
 		threshold = 72;
+		subpage_threshold = 18;
 		break;
 	default:
 		threshold = 0;
+		subpage_threshold = 0;
 		break;
 	}
 	if (flips > threshold / 2) {
-		// This should be very rare, bu we want to know as we
+		// This should be very rare, but we want to know as we
 		// approach our threshold, which should be even more rare.
 		printk_ratelimited(KERN_WARNING
 			"ECC: corrected %d bits (threshold=%d)\n",
 			flips, threshold);
 	}
-	return  flips > threshold ? -EUCLEAN : 0;
+	if (flips > threshold) rc = -EUCLEAN;
+	for (i = 0; i < MTD_ECC_STAT_SUBPAGES; i++) {
+		flips = mtd->ecc_subpage_stats.subpage_corrected[i] -
+		    subpage_stats->subpage_corrected[i];	/* delta since the caller's snapshot */
+		if (flips > subpage_threshold / 2) {
+			printk_ratelimited(KERN_WARNING
+				"ECC: corrected %u bits in one subpage "
+				"(threshold=%u)\n", flips, subpage_threshold);
+		}
+		if (flips > subpage_threshold) rc = -EUCLEAN;
+	}
+
+	return rc;
 }
 
 /**
@@ -1478,6 +1505,7 @@
 	int chipnr, page, realpage, col, bytes, aligned;
 	struct nand_chip *chip = mtd->priv;
 	struct mtd_ecc_stats stats;
+	struct mtd_ecc_subpage_stats subpage_stats;
 	int blkcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
 	int sndcmd = 1;
 	int ret = 0;
@@ -1489,6 +1517,7 @@
 	uint8_t *bufpoi, *oob, *buf;
 
 	stats = mtd->ecc_stats;
+	subpage_stats = mtd->ecc_subpage_stats;
 
 	chipnr = (int)(from >> chip->chip_shift);
 	chip->select_chip(mtd, chipnr);
@@ -1610,7 +1639,7 @@
 	if (mtd->ecc_stats.failed - stats.failed)
 		return -EBADMSG;
 
-	return unclean_if_too_many_flips(mtd, &stats);
+	return unclean_if_too_many_flips(mtd, &stats, &subpage_stats);
 }
 
 /**
@@ -1805,6 +1834,7 @@
 	int page, realpage, chipnr, sndcmd = 1;
 	struct nand_chip *chip = mtd->priv;
 	struct mtd_ecc_stats stats;
+	struct mtd_ecc_subpage_stats subpage_stats;
 	int blkcheck = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1;
 	int readlen = ops->ooblen;
 	int len;
@@ -1814,6 +1844,7 @@
 			__func__, (unsigned long long)from, readlen);
 
 	stats = mtd->ecc_stats;
+	subpage_stats = mtd->ecc_subpage_stats;
 
 	if (ops->mode == MTD_OPS_AUTO_OOB)
 		len = chip->ecc.layout->oobavail;
@@ -1892,7 +1923,7 @@
 	if (mtd->ecc_stats.failed - stats.failed)
 		return -EBADMSG;
 
-	return unclean_if_too_many_flips(mtd, &stats);
+	return unclean_if_too_many_flips(mtd, &stats, &subpage_stats);
 }
 
 /**
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 629401a..86d8ebb 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -262,6 +262,8 @@
 
 	/* ECC status information */
 	struct mtd_ecc_stats ecc_stats;
+	struct mtd_ecc_subpage_stats ecc_subpage_stats;
+
 	/* Subpage shift (NAND) */
 	int subpage_sft;
 
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index ec5aeaa..013f032 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -251,6 +251,11 @@
 	__u32 bbtblocks;
 };
 
+struct mtd_ecc_subpage_stats {
+#define MTD_ECC_STAT_SUBPAGES 8	/* number of counters in subpage_corrected[] */
+	__u32 subpage_corrected[MTD_ECC_STAT_SUBPAGES];	/* running total of corrected bits, one counter per slot */
+};
+
 /*
  * MTD file modes - for read/write access to MTD
  *