From 1c1452be2e9ae282a7316c3b23987811bd7acda6 Mon Sep 17 00:00:00 2001
From: Jonas Larsson <jonas.larsson@martinsson.se>
Date: Tue, 31 Mar 2009 11:16:48 +0200
Subject: [PATCH 001/162] atmel-mci: Add support for inverted detect pin

Same patch as before, modified to use bool. Also adds description of
the new field in struct atmel_mci that I missed in the first patch.

This patch adds Atmel MCI support for inverted detect pins.

Signed-off-by: Jonas Larsson <jonas.larsson@martinsson.se>
Acked-by: Pierre Ossman <pierre@ossman.eu>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 drivers/mmc/host/atmel-mci.c | 12 +++++++++---
 include/linux/atmel-mci.h    |  2 ++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c
index cf6a100bb38f..7b603e4b41db 100644
--- a/drivers/mmc/host/atmel-mci.c
+++ b/drivers/mmc/host/atmel-mci.c
@@ -177,6 +177,7 @@ struct atmel_mci {
  *	available.
  * @wp_pin: GPIO pin used for card write protect sending, or negative
  *	if not available.
+ * @detect_is_active_high: The state of the detect pin when it is active.
  * @detect_timer: Timer used for debouncing @detect_pin interrupts.
  */
 struct atmel_mci_slot {
@@ -196,6 +197,7 @@ struct atmel_mci_slot {
 
 	int			detect_pin;
 	int			wp_pin;
+	bool			detect_is_active_high;
 
 	struct timer_list	detect_timer;
 };
@@ -924,7 +926,8 @@ static int atmci_get_cd(struct mmc_host *mmc)
 	struct atmel_mci_slot	*slot = mmc_priv(mmc);
 
 	if (gpio_is_valid(slot->detect_pin)) {
-		present = !gpio_get_value(slot->detect_pin);
+		present = !(gpio_get_value(slot->detect_pin) ^
+			    slot->detect_is_active_high);
 		dev_dbg(&mmc->class_dev, "card is %spresent\n",
 				present ? "" : "not ");
 	}
@@ -1028,7 +1031,8 @@ static void atmci_detect_change(unsigned long data)
 		return;
 
 	enable_irq(gpio_to_irq(slot->detect_pin));
-	present = !gpio_get_value(slot->detect_pin);
+	present = !(gpio_get_value(slot->detect_pin) ^
+		    slot->detect_is_active_high);
 	present_old = test_bit(ATMCI_CARD_PRESENT, &slot->flags);
 
 	dev_vdbg(&slot->mmc->class_dev, "detect change: %d (was %d)\n",
@@ -1456,6 +1460,7 @@ static int __init atmci_init_slot(struct atmel_mci *host,
 	slot->host = host;
 	slot->detect_pin = slot_data->detect_pin;
 	slot->wp_pin = slot_data->wp_pin;
+	slot->detect_is_active_high = slot_data->detect_is_active_high;
 	slot->sdc_reg = sdc_reg;
 
 	mmc->ops = &atmci_ops;
@@ -1477,7 +1482,8 @@ static int __init atmci_init_slot(struct atmel_mci *host,
 		if (gpio_request(slot->detect_pin, "mmc_detect")) {
 			dev_dbg(&mmc->class_dev, "no detect pin available\n");
 			slot->detect_pin = -EBUSY;
-		} else if (gpio_get_value(slot->detect_pin)) {
+		} else if (gpio_get_value(slot->detect_pin) ^
+				slot->detect_is_active_high) {
 			clear_bit(ATMCI_CARD_PRESENT, &slot->flags);
 		}
 	}
diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h
index 2f1f95737acb..57b1846a3c87 100644
--- a/include/linux/atmel-mci.h
+++ b/include/linux/atmel-mci.h
@@ -10,6 +10,7 @@
  * @bus_width: Number of data lines wired up the slot
  * @detect_pin: GPIO pin wired to the card detect switch
  * @wp_pin: GPIO pin wired to the write protect sensor
+ * @detect_is_active_high: The state of the detect pin when it is active
  *
  * If a given slot is not present on the board, @bus_width should be
  * set to 0. The other fields are ignored in this case.
@@ -24,6 +25,7 @@ struct mci_slot_pdata {
 	unsigned int		bus_width;
 	int			detect_pin;
 	int			wp_pin;
+	bool			detect_is_active_high;
 };
 
 /**

From 7ebcfcf19723254ebe90cdf8718436b9955a9a01 Mon Sep 17 00:00:00 2001
From: Jonas Larsson <jonas.larsson@martinsson.se>
Date: Tue, 31 Mar 2009 11:16:52 +0200
Subject: [PATCH 002/162] avr32: Solves problem with inverted MCI detect pin on
 Merisc board

Same patch as before, modified to use bool.

This patch solves the problem with the inverted mci detect pin on Merisc
boards.

Signed-off-by: Jonas Larsson <jonas.larsson@martinsson.se>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/boards/merisc/setup.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c
index 20b300cf105a..623b077594fc 100644
--- a/arch/avr32/boards/merisc/setup.c
+++ b/arch/avr32/boards/merisc/setup.c
@@ -94,9 +94,10 @@ static struct spi_board_info __initdata spi0_board_info[] = {
 
 static struct mci_platform_data __initdata mci0_data = {
 	.slot[0] = {
-		.bus_width	= 4,
-		.detect_pin	= GPIO_PIN_PE(19),
-		.wp_pin		= GPIO_PIN_PE(20),
+		.bus_width		= 4,
+		.detect_pin		= GPIO_PIN_PE(19),
+		.wp_pin			= GPIO_PIN_PE(20),
+		.detect_is_active_high	= true,
 	},
 };
 

From f25181f598cf4a8ccc40a51d8b74f8b555ecddee Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lmcilroy@redhat.com>
Date: Thu, 23 Apr 2009 22:18:00 -0400
Subject: [PATCH 003/162] xfs_file_last_byte() needs to acquire ilock

We had some systems crash with this stack:

[<a00000010000cb20>] ia64_leave_kernel+0x0/0x280
[<a00000021291ca00>] xfs_bmbt_get_startoff+0x0/0x20 [xfs]
[<a0000002129080b0>] xfs_bmap_last_offset+0x210/0x280 [xfs]
[<a00000021295b010>] xfs_file_last_byte+0x70/0x1a0 [xfs]
[<a00000021295b200>] xfs_itruncate_start+0xc0/0x1a0 [xfs]
[<a0000002129935f0>] xfs_inactive_free_eofblocks+0x290/0x460 [xfs]
[<a000000212998fb0>] xfs_release+0x1b0/0x240 [xfs]
[<a0000002129ad930>] xfs_file_release+0x70/0xa0 [xfs]
[<a000000100162ea0>] __fput+0x1a0/0x420
[<a000000100163160>] fput+0x40/0x60

The problem here is that xfs_file_last_byte() does not acquire the
inode lock and can therefore race with another thread that is modifying
the extext list.  While xfs_bmap_last_offset() is trying to lookup
what was the last extent some extents were merged and the extent list
shrunk so the index we lookup is now beyond the end of the extent list
and potentially in a freed buffer.

Signed-off-by: Lachlan McIlroy <lmcilroy@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_inode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e7ae08d1df48..123b20c8cbf2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1258,8 +1258,10 @@ xfs_file_last_byte(
 	 * necessary.
 	 */
 	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		error = xfs_bmap_last_offset(NULL, ip, &last_block,
 			XFS_DATA_FORK);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		if (error) {
 			last_block = 0;
 		}

From 2ac00af7a6d2e65013e6f28bd1f37c0cd98ba134 Mon Sep 17 00:00:00 2001
From: Olaf Weber <olaf@sgi.com>
Date: Fri, 17 Apr 2009 16:12:45 -0500
Subject: [PATCH 004/162] xfs: add more checks to superblock validation

There had been reports where xfs filesystem was randomly
corrupted with fsfuzzer, and xfs failed to handle it
gracefully. This patch fixes couple of reported problem
by providing additional checks in the superblock
validation routine.

Signed-off-by: Olaf Weber <olaf@sgi.com>
Reviewed-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_mount.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b101990df027..65a99725d0cc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -291,14 +291,17 @@ xfs_mount_validate_sb(
 	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE			||
 	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG			||
 	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG			||
+	    sbp->sb_sectsize != (1 << sbp->sb_sectlog)			||
 	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE			||
 	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE			||
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
+	    sbp->sb_inodesize != (1 << sbp->sb_inodelog)		||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||

From 4be4a00fb55879ef44b5914c189aecffa828deb4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 29 Apr 2009 10:50:48 -0400
Subject: [PATCH 005/162] xfs: a couple getbmap cleanups

 - reshuffle various conditionals for data vs attr fork to make the code
   more readable
 - do fine-grainded goto-based error handling
 - exit early from conditionals instead of keeping a long else branch around
 - allow kmem_alloc to fail

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_bmap.c | 164 ++++++++++++++++++++++------------------------
 1 file changed, 80 insertions(+), 84 deletions(-)

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3a6ed426327a..abe42448b1c0 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5880,7 +5880,7 @@ xfs_getbmap(
 	void			*arg)		/* formatter arg */
 {
 	__int64_t		bmvend;		/* last block requested */
-	int			error;		/* return value */
+	int			error = 0;	/* return value */
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
 	int			lock;		/* lock state */
@@ -5899,30 +5899,8 @@ xfs_getbmap(
 
 	mp = ip->i_mount;
 	iflags = bmv->bmv_iflags;
-
 	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
-	/*	If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
-	 *	generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
-	 *	bit is set for the file, generate a read event in order
-	 *	that the DMAPI application may do its thing before we return
-	 *	the extents.  Usually this means restoring user file data to
-	 *	regions of the file that look like holes.
-	 *
-	 *	The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-	 *	BMV_IF_NO_DMAPI_READ so that read events are generated.
-	 *	If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
-	 *	could misinterpret holes in a DMAPI file as true holes,
-	 *	when in fact they may represent offline user data.
-	 */
-	if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-	    whichfork == XFS_DATA_FORK) {
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
-		if (error)
-			return XFS_ERROR(error);
-	}
-
 	if (whichfork == XFS_ATTR_FORK) {
 		if (XFS_IFORK_Q(ip)) {
 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
@@ -5936,11 +5914,37 @@ xfs_getbmap(
 					 ip->i_mount);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
-	} else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
-		   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
-		   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-		return XFS_ERROR(EINVAL);
-	if (whichfork == XFS_DATA_FORK) {
+
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	} else {
+		/*
+		 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
+		 * not generate a DMAPI read event.  Otherwise, if the
+		 * DM_EVENT_READ bit is set for the file, generate a read
+		 * event in order that the DMAPI application may do its thing
+		 * before we return the extents.  Usually this means restoring
+		 * user file data to regions of the file that look like holes.
+		 *
+		 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
+		 * BMV_IF_NO_DMAPI_READ so that read events are generated.
+		 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
+		 * could misinterpret holes in a DMAPI file as true holes,
+		 * when in fact they may represent offline user data.
+		 */
+		if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
+		    !(iflags & BMV_IF_NO_DMAPI_READ)) {
+			error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
+					      0, 0, 0, NULL);
+			if (error)
+				return XFS_ERROR(error);
+		}
+
+		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+			return XFS_ERROR(EINVAL);
+
 		if (xfs_get_extsz_hint(ip) ||
 		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
 			prealloced = 1;
@@ -5949,42 +5953,34 @@ xfs_getbmap(
 			prealloced = 0;
 			fixlen = ip->i_size;
 		}
-	} else {
-		prealloced = 0;
-		fixlen = 1LL << 32;
 	}
 
 	if (bmv->bmv_length == -1) {
 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
-		bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset),
-					(__int64_t)0);
-	} else if (bmv->bmv_length < 0)
-		return XFS_ERROR(EINVAL);
-	if (bmv->bmv_length == 0) {
+		bmv->bmv_length =
+			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+	} else if (bmv->bmv_length == 0) {
 		bmv->bmv_entries = 0;
 		return 0;
+	} else if (bmv->bmv_length < 0) {
+		return XFS_ERROR(EINVAL);
 	}
+
 	nex = bmv->bmv_count - 1;
 	if (nex <= 0)
 		return XFS_ERROR(EINVAL);
 	bmvend = bmv->bmv_offset + bmv->bmv_length;
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
-	if (((iflags & BMV_IF_DELALLOC) == 0) &&
-	    (whichfork == XFS_DATA_FORK) &&
-	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
-		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
-		error = xfs_flush_pages(ip, (xfs_off_t)0,
-					       -1, 0, FI_REMAPF);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-		return error;
+	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
+		if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+			error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+			if (error)
+				goto out_unlock_iolock;
 		}
-	}
 
-	ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
-	       ip->i_delayed_blks == 0);
+		ASSERT(ip->i_delayed_blks == 0);
+	}
 
 	lock = xfs_ilock_map_shared(ip);
 
@@ -5995,23 +5991,25 @@ xfs_getbmap(
 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 
-	bmapi_flags = xfs_bmapi_aflag(whichfork) |
-			((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
+	bmapi_flags = xfs_bmapi_aflag(whichfork);
+	if (!(iflags & BMV_IF_PREALLOC))
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
 	/*
 	 * Allocate enough space to handle "subnex" maps at a time.
 	 */
+	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	if (!map)
+		goto out_unlock_ilock;
 
 	bmv->bmv_entries = 0;
 
-	if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
-		if (((iflags & BMV_IF_DELALLOC) == 0) ||
-		    whichfork == XFS_ATTR_FORK) {
-			error = 0;
-			goto unlock_and_return;
-		}
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+		error = 0;
+		goto out_free_map;
 	}
 
 	nexleft = nex;
@@ -6023,10 +6021,12 @@ xfs_getbmap(
 				  bmapi_flags, NULL, 0, map, &nmap,
 				  NULL, NULL);
 		if (error)
-			goto unlock_and_return;
+			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
 		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+			int full = 0;	/* user array is full */
+
 			out.bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
 				out.bmv_oflags |= BMV_OF_PREALLOC;
@@ -6041,36 +6041,32 @@ xfs_getbmap(
 			    whichfork == XFS_ATTR_FORK) {
 				/* came to the end of attribute fork */
 				out.bmv_oflags |= BMV_OF_LAST;
-				goto unlock_and_return;
-			} else {
-				int full = 0;	/* user array is full */
-
-				if (!xfs_getbmapx_fix_eof_hole(ip, &out,
-							prealloced, bmvend,
-							map[i].br_startblock)) {
-					goto unlock_and_return;
-				}
-
-				/* format results & advance arg */
-				error = formatter(&arg, &out, &full);
-				if (error || full)
-					goto unlock_and_return;
-				nexleft--;
-				bmv->bmv_offset =
-					out.bmv_offset + out.bmv_length;
-				bmv->bmv_length = MAX((__int64_t)0,
-					(__int64_t)(bmvend - bmv->bmv_offset));
-				bmv->bmv_entries++;
+				goto out_free_map;
 			}
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced,
+					bmvend, map[i].br_startblock))
+				goto out_free_map;
+
+			/* format results & advance arg */
+			error = formatter(&arg, &out, &full);
+			if (error || full)
+				goto out_free_map;
+			nexleft--;
+			bmv->bmv_offset =
+				out.bmv_offset + out.bmv_length;
+			bmv->bmv_length =
+				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+			bmv->bmv_entries++;
 		}
 	} while (nmap && nexleft && bmv->bmv_length);
 
-unlock_and_return:
-	xfs_iunlock_map_shared(ip, lock);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
+ out_free_map:
 	kmem_free(map);
-
+ out_unlock_ilock:
+	xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return error;
 }
 

From 6321e3ed2acf3ee9643cdd403e1c88605d7944ba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 24 Feb 2009 08:39:02 -0500
Subject: [PATCH 006/162] xfs: fix getbmap vs mmap deadlock

xfs_getbmap (or rather the formatters called by it) copy out the getbmap
structures under the ilock, which can deadlock against mmap.  This has
been reported via bugzilla a while ago (#717) and has recently also
shown up via lockdep.

So allocate a temporary buffer to format the kernel getbmap structures
into and then copy them out after dropping the locks.

A little problem with this is that we limit the number of extents we
can copy out by the maximum allocation size, but I see no real way
around that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_bmap.c | 52 +++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index abe42448b1c0..ca7c6005a487 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5890,12 +5890,13 @@ xfs_getbmap(
 	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
-	struct getbmapx		out;		/* output structure */
+	struct getbmapx		*out;		/* output structure */
 	int			whichfork;	/* data or attr fork */
 	int			prealloced;	/* this is a file with
 						 * preallocated data space */
 	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
+	int			cur_ext = 0;
 
 	mp = ip->i_mount;
 	iflags = bmv->bmv_iflags;
@@ -5971,6 +5972,13 @@ xfs_getbmap(
 		return XFS_ERROR(EINVAL);
 	bmvend = bmv->bmv_offset + bmv->bmv_length;
 
+
+	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+		return XFS_ERROR(ENOMEM);
+	out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
+	if (!out)
+		return XFS_ERROR(ENOMEM);
+
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
 		if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
@@ -6025,39 +6033,39 @@ xfs_getbmap(
 		ASSERT(nmap <= subnex);
 
 		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
-			int full = 0;	/* user array is full */
-
-			out.bmv_oflags = 0;
+			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
-				out.bmv_oflags |= BMV_OF_PREALLOC;
+				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
 			else if (map[i].br_startblock == DELAYSTARTBLOCK)
-				out.bmv_oflags |= BMV_OF_DELALLOC;
-			out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
-			out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-			out.bmv_unused1 = out.bmv_unused2 = 0;
+				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+			out[cur_ext].bmv_offset =
+				XFS_FSB_TO_BB(mp, map[i].br_startoff);
+			out[cur_ext].bmv_length =
+				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			out[cur_ext].bmv_unused1 = 0;
+			out[cur_ext].bmv_unused2 = 0;
 			ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
 			      (map[i].br_startblock != DELAYSTARTBLOCK));
                         if (map[i].br_startblock == HOLESTARTBLOCK &&
 			    whichfork == XFS_ATTR_FORK) {
 				/* came to the end of attribute fork */
-				out.bmv_oflags |= BMV_OF_LAST;
+				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
 				goto out_free_map;
 			}
 
-			if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced,
-					bmvend, map[i].br_startblock))
+			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+					prealloced, bmvend,
+					map[i].br_startblock))
 				goto out_free_map;
 
-			/* format results & advance arg */
-			error = formatter(&arg, &out, &full);
-			if (error || full)
-				goto out_free_map;
 			nexleft--;
 			bmv->bmv_offset =
-				out.bmv_offset + out.bmv_length;
+				out[cur_ext].bmv_offset +
+				out[cur_ext].bmv_length;
 			bmv->bmv_length =
 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
 			bmv->bmv_entries++;
+			cur_ext++;
 		}
 	} while (nmap && nexleft && bmv->bmv_length);
 
@@ -6067,6 +6075,16 @@ xfs_getbmap(
 	xfs_iunlock_map_shared(ip, lock);
  out_unlock_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	for (i = 0; i < cur_ext; i++) {
+		int full = 0;	/* user array is full */
+
+		/* format results & advance arg */
+		error = formatter(&arg, &out[i], &full);
+		if (error || full)
+			break;
+	}
+
 	return error;
 }
 

From ec91d1335f478c5cd089d82ffbf936075c5f24c8 Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Thu, 7 May 2009 19:49:45 -0500
Subject: [PATCH 007/162] xfs: fix double unlock in xfs_swap_extents()

Regreesion from commit ef8f7fc, which rearranged the code in
xfs_swap_extents() leading to double unlock of xfs inode ilock.
That resulted in xfs_fsr deadlocking itself on platforms, which
don't handle double unlock of rw_semaphore nicely. It caused the
count go negative, which represents the write holder, without
really having one. ia64 is one of the platforms where deadlock
was easily reproduced and the fix was tested.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_dfrag.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e6d839bddbf0..7465f9ee125f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -347,13 +347,15 @@ xfs_swap_extents(
 
 	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
 
-out_unlock:
-	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 out:
 	kmem_free(tempifp);
 	return error;
 
+out_unlock:
+	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	goto out;
+
 out_trans_cancel:
 	xfs_trans_cancel(tp, 0);
 	goto out_unlock;

From 334034c132017fa662716b70591b2297ed7f238c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 May 2009 13:37:11 -0700
Subject: [PATCH 008/162] avr32: remove obsolete hw_interrupt_type

The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have
been kept around for migration reasons.  After more than two years it's
time to remove them finally.

This patch cleans up one of the remaining users.  When all such patches
hit mainline we can remove the defines and typedefs finally.

Impact: cleanup

Convert the last remaining users to struct irq_chip and remove the
define.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/include/asm/hw_irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/avr32/include/asm/hw_irq.h b/arch/avr32/include/asm/hw_irq.h
index 218b0a6bfd1b..a36f9fcb8fcd 100644
--- a/arch/avr32/include/asm/hw_irq.h
+++ b/arch/avr32/include/asm/hw_irq.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_AVR32_HW_IRQ_H
 #define __ASM_AVR32_HW_IRQ_H
 
-static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
+static inline void hw_resend_irq(struct irq_chip *h, unsigned int i)
 {
 	/* Nothing to do */
 }

From 096324873f9c7172a17aff9db1356f4f01b77afe Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sat, 23 May 2009 14:30:12 -0500
Subject: [PATCH 009/162] xfs: fix overflow in xfs_growfs_data_private

In the case where growing a filesystem would leave the last AG
too small, the fixup code has an overflow in the calculation
of the new size with one fewer ag, because "nagcount" is a 32
bit number.  If the new filesystem has > 2^32 blocks in it
this causes a problem resulting in an EINVAL return from growfs:

 # xfs_io -f -c "truncate 19998630180864" fsfile
 # mkfs.xfs -f -bsize=4096 -dagsize=76288719b,size=3905982455b fsfile
 # mount -o loop fsfile /mnt
 # xfs_growfs /mnt

meta-data=/dev/loop0             isize=256    agcount=52,
agsize=76288719 blks
         =                       sectsz=512   attr=2
data     =                       bsize=4096   blocks=3905982455, imaxpct=5
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0
log      =internal               bsize=4096   blocks=32768, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=0
realtime =none                   extsz=4096   blocks=0, rtextents=0
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Invalid argument

Reported-by: richard.ems@cape-horn-eng.com
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_fsops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 8379e3bca26c..cbd451bb4848 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -160,7 +160,7 @@ xfs_growfs_data_private(
 	nagcount = new + (nb_mod != 0);
 	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
 		nagcount--;
-		nb = nagcount * mp->m_sb.sb_agblocks;
+		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
 		if (nb < mp->m_sb.sb_dblocks)
 			return XFS_ERROR(EINVAL);
 	}

From 13503fa9137d9708d52214e9506c671dbf2fbdce Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Mar 2009 17:39:20 +0900
Subject: [PATCH 010/162] x86, mce: Cleanup param parser

- Fix the comment formatting.

- The error path does not return 0, and printk lacks level and "\n".

- Move __setup("nomce") next to mcheck_disable().

- Improve readability etc.

[ Impact: cleanup ]

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
LKML-Reference: <49CB3F38.7090703@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 6fb0b359d2a5..77effb55afe7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -839,25 +839,29 @@ static int __init mcheck_disable(char *str)
 	mce_dont_init = 1;
 	return 1;
 }
+__setup("nomce", mcheck_disable);
 
-/* mce=off disables machine check.
-   mce=TOLERANCELEVEL (number, see above)
-   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
-   mce=nobootlog Don't log MCEs from before booting. */
+/*
+ * mce=off disables machine check
+ * mce=TOLERANCELEVEL (number, see above)
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ * mce=nobootlog Don't log MCEs from before booting.
+ */
 static int __init mcheck_enable(char *str)
 {
 	if (!strcmp(str, "off"))
 		mce_dont_init = 1;
-	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
-		mce_bootlog = str[0] == 'b';
+	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+		mce_bootlog = (str[0] == 'b');
 	else if (isdigit(str[0]))
 		get_option(&str, &tolerant);
-	else
-		printk("mce= argument %s ignored. Please use /sys", str);
+	else {
+		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
+		       str);
+		return 0;
+	}
 	return 1;
 }
-
-__setup("nomce", mcheck_disable);
 __setup("mce=", mcheck_enable);
 
 /*

From e9eee03e99d519599eb615c3e251d5f6cc4be57d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:17 +0200
Subject: [PATCH 011/162] x86, mce: clean up mce_64.c

This file has been modified many times along the years, by multiple
authors, so the general style and structure has diverged in a number
of areas making this file hard to read.

So fix the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 261 +++++++++++++++++-----------
 1 file changed, 156 insertions(+), 105 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 77effb55afe7..1491246c4d69 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -1,46 +1,47 @@
 /*
  * Machine check handler.
+ *
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
  * Copyright 2008 Intel Corporation
  * Author: Andi Kleen
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/string.h>
-#include <linux/rcupdate.h>
-#include <linux/kallsyms.h>
-#include <linux/sysdev.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/percpu.h>
-#include <linux/poll.h>
 #include <linux/thread_info.h>
-#include <linux/ctype.h>
-#include <linux/kmod.h>
-#include <linux/kdebug.h>
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
 #include <linux/ratelimit.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/idle.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
+#include <linux/smp_lock.h>
+#include <linux/kobject.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/sysdev.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/cpu.h>
+#include <linux/fs.h>
 
-#define MISC_MCELOG_MINOR 227
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
+
+#define MISC_MCELOG_MINOR	227
 
 atomic_t mce_entry;
 
-static int mce_dont_init;
+static int			mce_dont_init;
 
 /*
  * Tolerant levels:
@@ -49,16 +50,16 @@ static int mce_dont_init;
  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  *   3: never panic or SIGBUS, log all errors (for testing only)
  */
-static int tolerant = 1;
-static int banks;
-static u64 *bank;
-static unsigned long notify_user;
-static int rip_msr;
-static int mce_bootlog = -1;
-static atomic_t mce_events;
+static int			tolerant = 1;
+static int			banks;
+static u64			*bank;
+static unsigned long		notify_user;
+static int			rip_msr;
+static int			mce_bootlog = -1;
+static atomic_t			mce_events;
 
-static char trigger[128];
-static char *trigger_argv[2] = { trigger, NULL };
+static char			trigger[128];
+static char			*trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
@@ -89,19 +90,23 @@ static struct mce_log mcelog = {
 void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
+
 	atomic_inc(&mce_events);
 	mce->finished = 0;
 	wmb();
 	for (;;) {
 		entry = rcu_dereference(mcelog.next);
 		for (;;) {
-			/* When the buffer fills up discard new entries. Assume
-			   that the earlier errors are the more interesting. */
+			/*
+			 * When the buffer fills up discard new entries.
+			 * Assume that the earlier errors are the more
+			 * interesting ones:
+			 */
 			if (entry >= MCE_LOG_LEN) {
 				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
 				return;
 			}
-			/* Old left over entry. Skip. */
+			/* Old left over entry. Skip: */
 			if (mcelog.entry[entry].finished) {
 				entry++;
 				continue;
@@ -264,12 +269,12 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
  * implies that most kernel services cannot be safely used. Don't even
  * think about putting a printk in there!
  */
-void do_machine_check(struct pt_regs * regs, long error_code)
+void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mce m, panicm;
+	int panicm_found = 0;
 	u64 mcestart = 0;
 	int i;
-	int panicm_found = 0;
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
 	 * MCE.  If tolerant is cranked up, we'll try anyway.
@@ -293,6 +298,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	mce_setup(&m);
 
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		no_way_out = 1;
@@ -356,23 +362,29 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
-		/* Did this bank cause the exception? */
-		/* Assume that the bank with uncorrectable errors did it,
-		   and that there is only a single one. */
-		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
+		/*
+		 * Did this bank cause the exception?
+		 *
+		 * Assume that the bank with uncorrectable errors did it,
+		 * and that there is only a single one:
+		 */
+		if ((m.status & MCI_STATUS_UC) &&
+					(m.status & MCI_STATUS_EN)) {
 			panicm = m;
 			panicm_found = 1;
 		}
 	}
 
-	/* If we didn't find an uncorrectable error, pick
-	   the last one (shouldn't happen, just being safe). */
+	/*
+	 * If we didn't find an uncorrectable error, pick
+	 * the last one (shouldn't happen, just being safe).
+	 */
 	if (!panicm_found)
 		panicm = m;
 
 	/*
 	 * If we have decided that we just CAN'T continue, and the user
-	 *  has not set tolerant to an insane level, give up and die.
+	 * has not set tolerant to an insane level, give up and die.
 	 */
 	if (no_way_out && tolerant < 3)
 		mce_panic("Machine check", &panicm, mcestart);
@@ -451,10 +463,9 @@ void mce_log_therm_throt_event(__u64 status)
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
  */
-
 static int check_interval = 5 * 60; /* 5 minutes */
+
 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
-static void mcheck_timer(unsigned long);
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
 static void mcheck_timer(unsigned long data)
@@ -464,9 +475,10 @@ static void mcheck_timer(unsigned long data)
 
 	WARN_ON(smp_processor_id() != data);
 
-	if (mce_available(&current_cpu_data))
+	if (mce_available(&current_cpu_data)) {
 		machine_check_poll(MCP_TIMESTAMP,
 				&__get_cpu_var(mce_poll_banks));
+	}
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -501,6 +513,7 @@ int mce_notify_user(void)
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
 	clear_thread_flag(TIF_MCE_NOTIFY);
+
 	if (test_and_clear_bit(0, &notify_user)) {
 		wake_up_interruptible(&mce_wait);
 
@@ -520,9 +533,10 @@ int mce_notify_user(void)
 	return 0;
 }
 
-/* see if the idle task needs to notify userspace */
+/* see if the idle task needs to notify userspace: */
 static int
-mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
+mce_idle_callback(struct notifier_block *nfb, unsigned long action,
+		  void *unused)
 {
 	/* IDLE_END should be safe - interrupts are back on */
 	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
@@ -532,7 +546,7 @@ mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
 }
 
 static struct notifier_block mce_idle_notifier = {
-	.notifier_call = mce_idle_callback,
+	.notifier_call		= mce_idle_callback,
 };
 
 static __init int periodic_mcheck_init(void)
@@ -547,8 +561,8 @@ __initcall(periodic_mcheck_init);
  */
 static int mce_cap_init(void)
 {
-	u64 cap;
 	unsigned b;
+	u64 cap;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	b = cap & 0xff;
@@ -578,9 +592,9 @@ static int mce_cap_init(void)
 
 static void mce_init(void *dummy)
 {
+	mce_banks_t all_banks;
 	u64 cap;
 	int i;
-	mce_banks_t all_banks;
 
 	/*
 	 * Log the machine checks left over from the previous reset.
@@ -605,14 +619,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if (c->x86 == 15 && banks > 4)
-			/* disable GART TBL walk error reporting, which trips off
-			   incorrectly with the IOMMU & 3ware & Cerberus. */
+		if (c->x86 == 15 && banks > 4) {
+			/*
+			 * disable GART TBL walk error reporting, which
+			 * trips off incorrectly with the IOMMU & 3ware
+			 * & Cerberus:
+			 */
 			clear_bit(10, (unsigned long *)&bank[4]);
-		if(c->x86 <= 17 && mce_bootlog < 0)
-			/* Lots of broken BIOS around that don't clear them
-			   by default and leave crap in there. Don't log. */
+		}
+		if (c->x86 <= 17 && mce_bootlog < 0) {
+			/*
+			 * Lots of broken BIOS around that don't clear them
+			 * by default and leave crap in there. Don't log:
+			 */
 			mce_bootlog = 0;
+		}
 	}
 
 }
@@ -646,7 +667,7 @@ static void mce_init_timer(void)
 
 /*
  * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off.
+ * Must be called with preempt off:
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
@@ -669,8 +690,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
  */
 
 static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count;	/* #times opened */
-static int open_exclu;	/* already open exclusive? */
+static int		open_count;		/* #times opened */
+static int		open_exclu;		/* already open exclusive? */
 
 static int mce_open(struct inode *inode, struct file *file)
 {
@@ -680,6 +701,7 @@ static int mce_open(struct inode *inode, struct file *file)
 	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 		spin_unlock(&mce_state_lock);
 		unlock_kernel();
+
 		return -EBUSY;
 	}
 
@@ -712,13 +734,14 @@ static void collect_tscs(void *data)
 	rdtscll(cpu_tsc[smp_processor_id()]);
 }
 
+static DEFINE_MUTEX(mce_read_mutex);
+
 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 			loff_t *off)
 {
-	unsigned long *cpu_tsc;
-	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned prev, next;
 	char __user *buf = ubuf;
+	unsigned long *cpu_tsc;
+	unsigned prev, next;
 	int i, err;
 
 	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
@@ -732,6 +755,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 		mutex_unlock(&mce_read_mutex);
 		kfree(cpu_tsc);
+
 		return -EINVAL;
 	}
 
@@ -770,6 +794,7 @@ timeout:
 	 * synchronize.
 	 */
 	on_each_cpu(collect_tscs, cpu_tsc, 1);
+
 	for (i = next; i < MCE_LOG_LEN; i++) {
 		if (mcelog.entry[i].finished &&
 		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
@@ -782,6 +807,7 @@ timeout:
 	}
 	mutex_unlock(&mce_read_mutex);
 	kfree(cpu_tsc);
+
 	return err ? -EFAULT : buf - ubuf;
 }
 
@@ -799,6 +825,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
 	switch (cmd) {
 	case MCE_GET_RECORD_LEN:
 		return put_user(sizeof(struct mce), p);
@@ -810,6 +837,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		do {
 			flags = mcelog.flags;
 		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
 		return put_user(flags, p);
 	}
 	default:
@@ -818,11 +846,11 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 }
 
 static const struct file_operations mce_chrdev_ops = {
-	.open = mce_open,
-	.release = mce_release,
-	.read = mce_read,
-	.poll = mce_poll,
-	.unlocked_ioctl = mce_ioctl,
+	.open			= mce_open,
+	.release		= mce_release,
+	.read			= mce_read,
+	.poll			= mce_poll,
+	.unlocked_ioctl		= mce_ioctl,
 };
 
 static struct miscdevice mce_log_device = {
@@ -891,13 +919,16 @@ static int mce_shutdown(struct sys_device *dev)
 	return mce_disable();
 }
 
-/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
-   Only one CPU is active at this time, the others get readded later using
-   CPU hotplug. */
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
 static int mce_resume(struct sys_device *dev)
 {
 	mce_init(NULL);
 	mce_cpu_features(&current_cpu_data);
+
 	return 0;
 }
 
@@ -916,14 +947,16 @@ static void mce_restart(void)
 }
 
 static struct sysdev_class mce_sysclass = {
-	.suspend = mce_suspend,
-	.shutdown = mce_shutdown,
-	.resume = mce_resume,
-	.name = "machinecheck",
+	.suspend	= mce_suspend,
+	.shutdown	= mce_shutdown,
+	.resume		= mce_resume,
+	.name		= "machinecheck",
 };
 
 DEFINE_PER_CPU(struct sys_device, device_mce);
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
+
+__cpuinitdata
+void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 /* Why are there no generic functions for this? */
 #define ACCESSOR(name, var, start) \
@@ -937,9 +970,12 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 				    const char *buf, size_t siz) {	\
 		char *end;						\
 		unsigned long new = simple_strtoul(buf, &end, 0);	\
-		if (end == buf) return -EINVAL;				\
+									\
+		if (end == buf)						\
+			return -EINVAL;					\
 		var = new;						\
 		start;							\
+									\
 		return end-buf;						\
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
@@ -950,6 +986,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
 			 char *buf)
 {
 	u64 b = bank[attr - bank_attrs];
+
 	return sprintf(buf, "%llx\n", b);
 }
 
@@ -958,15 +995,18 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
 {
 	char *end;
 	u64 new = simple_strtoull(buf, &end, 0);
+
 	if (end == buf)
 		return -EINVAL;
+
 	bank[attr - bank_attrs] = new;
 	mce_restart();
+
 	return end-buf;
 }
 
-static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
-				char *buf)
+static ssize_t
+show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
 {
 	strcpy(buf, trigger);
 	strcat(buf, "\n");
@@ -974,21 +1014,27 @@ static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 }
 
 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
-				const char *buf,size_t siz)
+				const char *buf, size_t siz)
 {
 	char *p;
 	int len;
+
 	strncpy(trigger, buf, sizeof(trigger));
 	trigger[sizeof(trigger)-1] = 0;
 	len = strlen(trigger);
 	p = strchr(trigger, '\n');
-	if (*p) *p = 0;
+
+	if (*p)
+		*p = 0;
+
 	return len;
 }
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-ACCESSOR(check_interval,check_interval,mce_restart())
+
+ACCESSOR(check_interval, check_interval, mce_restart())
+
 static struct sysdev_attribute *mce_attributes[] = {
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
@@ -996,7 +1042,7 @@ static struct sysdev_attribute *mce_attributes[] = {
 
 static cpumask_var_t mce_device_initialized;
 
-/* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
+/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
 static __cpuinit int mce_create_device(unsigned int cpu)
 {
 	int err;
@@ -1006,15 +1052,15 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		return -EIO;
 
 	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
-	per_cpu(device_mce,cpu).id = cpu;
-	per_cpu(device_mce,cpu).cls = &mce_sysclass;
+	per_cpu(device_mce, cpu).id	= cpu;
+	per_cpu(device_mce, cpu).cls	= &mce_sysclass;
 
-	err = sysdev_register(&per_cpu(device_mce,cpu));
+	err = sysdev_register(&per_cpu(device_mce, cpu));
 	if (err)
 		return err;
 
 	for (i = 0; mce_attributes[i]; i++) {
-		err = sysdev_create_file(&per_cpu(device_mce,cpu),
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
 					 mce_attributes[i]);
 		if (err)
 			goto error;
@@ -1035,10 +1081,10 @@ error2:
 	}
 error:
 	while (--i >= 0) {
-		sysdev_remove_file(&per_cpu(device_mce,cpu),
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
 				   mce_attributes[i]);
 	}
-	sysdev_unregister(&per_cpu(device_mce,cpu));
+	sysdev_unregister(&per_cpu(device_mce, cpu));
 
 	return err;
 }
@@ -1051,12 +1097,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 		return;
 
 	for (i = 0; mce_attributes[i]; i++)
-		sysdev_remove_file(&per_cpu(device_mce,cpu),
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
 			mce_attributes[i]);
 	for (i = 0; i < banks; i++)
 		sysdev_remove_file(&per_cpu(device_mce, cpu),
 			&bank_attrs[i]);
-	sysdev_unregister(&per_cpu(device_mce,cpu));
+	sysdev_unregister(&per_cpu(device_mce, cpu));
 	cpumask_clear_cpu(cpu, mce_device_initialized);
 }
 
@@ -1076,11 +1122,12 @@ static void mce_disable_cpu(void *h)
 
 static void mce_reenable_cpu(void *h)
 {
-	int i;
 	unsigned long action = *(unsigned long *)h;
+	int i;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+
 	if (!(action & CPU_TASKS_FROZEN))
 		cmci_reenable();
 	for (i = 0; i < banks; i++)
@@ -1088,8 +1135,8 @@ static void mce_reenable_cpu(void *h)
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
-				      unsigned long action, void *hcpu)
+static int __cpuinit
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct timer_list *t = &per_cpu(mce_timer, cpu);
@@ -1142,12 +1189,14 @@ static __init int mce_init_banks(void)
 
 	for (i = 0; i < banks; i++) {
 		struct sysdev_attribute *a = &bank_attrs[i];
-		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+
+		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
 		if (!a->attr.name)
 			goto nomem;
-		a->attr.mode = 0644;
-		a->show = show_bank;
-		a->store = set_bank;
+
+		a->attr.mode	= 0644;
+		a->show		= show_bank;
+		a->store	= set_bank;
 	}
 	return 0;
 
@@ -1156,6 +1205,7 @@ nomem:
 		kfree(bank_attrs[i].attr.name);
 	kfree(bank_attrs);
 	bank_attrs = NULL;
+
 	return -ENOMEM;
 }
 
@@ -1185,6 +1235,7 @@ static __init int mce_init_device(void)
 
 	register_hotcpu_notifier(&mce_cpu_notifier);
 	misc_register(&mce_log_device);
+
 	return err;
 }
 

From 3b58dfd04bdfa52e717ead8f3c7622610eb7f950 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:21 +0200
Subject: [PATCH 012/162] x86, mce: clean up mce_32.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_32.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 3552119b091d..05979e7eff12 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -2,13 +2,12 @@
  * mce.c - x86 Machine Check Exception Reporting
  * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
+#include <linux/thread_info.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
-#include <linux/thread_info.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -17,18 +16,20 @@
 #include "mce.h"
 
 int mce_disabled;
-int nr_mce_banks;
 
+int nr_mce_banks;
 EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
 
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 {
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
+	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+	       smp_processor_id());
 }
 
 /* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+						unexpected_machine_check;
 
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)

From c5aaf0e0702513637278ca4e27a156caa9392817 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:18 +0200
Subject: [PATCH 013/162] x86, mce: clean up p4.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/p4.c | 73 +++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..cb344aba479c 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
  * P4 specific Machine Check Exception Reporting
  */
 
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
+#include <asm/therm_throt.h>
 #include <asm/processor.h>
 #include <asm/system.h>
-#include <asm/msr.h>
 #include <asm/apic.h>
-
-#include <asm/therm_throt.h>
+#include <asm/msr.h>
 
 #include "mce.h"
 
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
+
 static void unexpected_thermal_interrupt(struct pt_regs *regs)
 {
 	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
 	add_taint(TAINT_MACHINE_CHECK);
 }
 
-/* P4/Xeon Thermal transition interrupt handler */
+/* P4/Xeon Thermal transition interrupt handler: */
 static void intel_thermal_interrupt(struct pt_regs *regs)
 {
 	__u64 msr_val;
@@ -54,8 +54,9 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
 	therm_throt_process(msr_val & 0x1);
 }
 
-/* Thermal interrupt handler for this CPU setup */
-static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
+/* Thermal interrupt handler for this CPU setup: */
+static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
+						unexpected_thermal_interrupt;
 
 void smp_thermal_interrupt(struct pt_regs *regs)
 {
@@ -65,67 +66,76 @@ void smp_thermal_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-/* P4/Xeon Thermal regulation detect and init */
+/* P4/Xeon Thermal regulation detect and init: */
 static void intel_init_thermal(struct cpuinfo_x86 *c)
 {
-	u32 l, h;
 	unsigned int cpu = smp_processor_id();
+	u32 l, h;
 
-	/* Thermal monitoring */
+	/* Thermal monitoring: */
 	if (!cpu_has(c, X86_FEATURE_ACPI))
 		return;	/* -ENODEV */
 
-	/* Clock modulation */
+	/* Clock modulation: */
 	if (!cpu_has(c, X86_FEATURE_ACC))
 		return;	/* -ENODEV */
 
-	/* first check if its enabled already, in which case there might
+	/*
+	 * First check if its enabled already, in which case there might
 	 * be some SMM goo which handles it, so we can't even put a handler
-	 * since it might be delivered via SMI already -zwanem.
+	 * since it might be delivered via SMI already:
 	 */
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	h = apic_read(APIC_LVTTHMR);
 	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
-				cpu);
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+
 		return; /* -EBUSY */
 	}
 
-	/* check whether a vector already exists, temporarily masked? */
+	/* Check whether a vector already exists, temporarily masked? */
 	if (h & APIC_VECTOR_MASK) {
-		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
-				"installed\n",
-			cpu, (h & APIC_VECTOR_MASK));
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
+		       cpu, (h & APIC_VECTOR_MASK));
+
 		return; /* -EBUSY */
 	}
 
-	/* The temperature transition interrupt handler setup */
-	h = THERMAL_APIC_VECTOR;		/* our delivery vector */
-	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
+	/*
+	 * The temperature transition interrupt handler setup:
+	 */
+
+	/* Our delivery vector: */
+	h = THERMAL_APIC_VECTOR;
+
+	/* We'll mask the thermal vector in the lapic till we're ready: */
+	h |= APIC_DM_FIXED | APIC_LVT_MASKED;
 	apic_write(APIC_LVTTHMR, h);
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
 	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
 
-	/* ok we're good to go... */
+	/* Ok, we're good to go... */
 	vendor_thermal_interrupt = intel_thermal_interrupt;
 
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
 
+	/* Unmask the thermal vector: */
 	l = apic_read(APIC_LVTTHMR);
 	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
 	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 
 	/* enable thermal throttle processing */
 	atomic_set(&therm_throt_en, 1);
-	return;
 }
 #endif /* CONFIG_X86_MCE_P4THERMAL */
 
-
 /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
+static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 {
 	u32 h;
 
@@ -143,9 +153,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 
 static void intel_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +167,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 
 	if (mce_num_extended_msrs > 0) {
 		struct intel_mce_extended_msrs dbg;
+
 		intel_get_extended_msrs(&dbg);
+
 		printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
 			"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
 			"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +183,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
+
 			misc[0] = addr[0] = '\0';
 			if (high & (1<<29))
 				recover |= 1;
@@ -196,6 +209,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 		panic("Unable to continue");
 
 	printk(KERN_EMERG "Attempting to continue.\n");
+
 	/*
 	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 	 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +231,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
-
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;

From ed8bc7ed9a2ad875617b24d2ba09e49ee886638c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:21 +0200
Subject: [PATCH 014/162] x86, mce: clean up p5.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/p5.c | 43 +++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..8812f5441830 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
  * P5 specific Machine Check Exception Reporting
  * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -15,39 +14,53 @@
 
 #include "mce.h"
 
-/* Machine check handler for Pentium class Intel */
+/* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
 	u32 loaddr, hi, lotype;
+
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
-	printk(KERN_EMERG "CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
-	if (lotype&(1<<5))
-		printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
+
+	printk(KERN_EMERG
+		"CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
+		smp_processor_id(), loaddr, lotype);
+
+	if (lotype & (1<<5)) {
+		printk(KERN_EMERG
+			"CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+			smp_processor_id());
+	}
+
 	add_taint(TAINT_MACHINE_CHECK);
 }
 
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 
-	/*Check for MCE support */
+	/* Check for MCE support: */
 	if (!cpu_has(c, X86_FEATURE_MCE))
 		return;
 
-	/* Default P5 to off as its often misconnected */
+	/* Default P5 to off as its often misconnected: */
 	if (mce_disabled != -1)
 		return;
+
 	machine_check_vector = pentium_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
-	/* Read registers before enabling */
+	/* Read registers before enabling: */
 	rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
 	rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
-	printk(KERN_INFO "Intel old style machine check architecture supported.\n");
+	printk(KERN_INFO
+	       "Intel old style machine check architecture supported.\n");
 
-	/* Enable MCE */
+	/* Enable MCE: */
 	set_in_cr4(X86_CR4_MCE);
-	printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+	printk(KERN_INFO
+	       "Intel old style machine check reporting enabled on CPU#%d.\n",
+	       smp_processor_id());
 }

From ea2566ff80e096eeecc0918fb5d5a4612d8f62ef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:19 +0200
Subject: [PATCH 015/162] x86, mce: clean up p6.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/p6.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..43c24e667457 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
  * P6 specific Machine Check Exception Reporting
  * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -18,9 +17,9 @@
 /* Machine Check Handler For PII/PIII */
 static void intel_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
-			misc[0] = addr[0] = '\0';
+
+			misc[0] = '\0';
+			addr[0] = '\0';
+
 			if (high & (1<<29))
 				recover |= 1;
 			if (high & (1<<25))
 				recover |= 2;
 			high &= ~(1<<31);
+
 			if (high & (1<<27)) {
 				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
 				snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
 				snprintf(addr, 24, " at %08x%08x", ahigh, alow);
 			}
+
 			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
 				smp_processor_id(), i, high, low, misc, addr);
 		}
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	/*
 	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 	 * recoverable/continuable.This will allow BIOS to look at the MSRs
-	 * for errors if the OS could not log the error.
+	 * for errors if the OS could not log the error:
 	 */
 	for (i = 0; i < nr_mce_banks; i++) {
 		unsigned int msr;
+
 		msr = MSR_IA32_MC0_STATUS+i*4;
 		rdmsr(msr, low, high);
 		if (high & (1<<31)) {
-			/* Clear it */
+			/* Clear it: */
 			wrmsr(msr, 0UL, 0UL);
-			/* Serialize */
+			/* Serialize: */
 			wmb();
 			add_taint(TAINT_MACHINE_CHECK);
 		}
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
 
 	/* Ok machine check is available */
 	machine_check_vector = intel_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
 	printk(KERN_INFO "Intel machine check architecture supported.\n");

From efee4ca80980f97b60e91e3322c3342f19623eff Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:20 +0200
Subject: [PATCH 016/162] x86, mce: clean up k7.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/k7.c | 42 +++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..89e510424152 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
  * Athlon specific Machine Check Exception Reporting
  * (C) Copyright 2002 Dave Jones <davej@redhat.com>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -15,12 +14,12 @@
 
 #include "mce.h"
 
-/* Machine Check Handler For AMD Athlon/Duron */
+/* Machine Check Handler For AMD Athlon/Duron: */
 static void k7_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 
 	for (i = 1; i < nr_mce_banks; i++) {
 		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-		if (high&(1<<31)) {
+		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
-			misc[0] = addr[0] = '\0';
+
+			misc[0] = '\0';
+			addr[0] = '\0';
+
 			if (high & (1<<29))
 				recover |= 1;
 			if (high & (1<<25))
 				recover |= 2;
 			high &= ~(1<<31);
+
 			if (high & (1<<27)) {
 				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
 				snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
 				snprintf(addr, 24, " at %08x%08x", ahigh, alow);
 			}
+
 			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
 				smp_processor_id(), i, high, low, misc, addr);
-			/* Clear it */
+
+			/* Clear it: */
 			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize */
+			/* Serialize: */
 			wmb();
 			add_taint(TAINT_MACHINE_CHECK);
 		}
 	}
 
-	if (recover&2)
+	if (recover & 2)
 		panic("CPU context corrupt");
-	if (recover&1)
+	if (recover & 1)
 		panic("Unable to continue");
+
 	printk(KERN_EMERG "Attempting to continue.\n");
+
 	mcgstl &= ~(1<<2);
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
-/* AMD K7 machine check is Intel like */
+/* AMD K7 machine check is Intel like: */
 void amd_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 		return;
 
 	machine_check_vector = k7_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
 	printk(KERN_INFO "Intel machine check architecture supported.\n");
+
 	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
-	/* Clear status for MC index 0 separately, we don't touch CTL,
-	 * as some K7 Athlons cause spurious MCEs when its enabled. */
+	/*
+	 * Clear status for MC index 0 separately, we don't touch CTL,
+	 * as some K7 Athlons cause spurious MCEs when its enabled:
+	 */
 	if (boot_cpu_data.x86 == 6) {
 		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
 		i = 1;
 	} else
 		i = 0;
+
 	for (; i < nr_mce_banks; i++) {
 		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
 		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);

From 91425084f74c0ad087b3fb6bdad79a825f952720 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:22 +0200
Subject: [PATCH 017/162] x86, mce: clean up winchip.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/winchip.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..81b02487090b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
  * IDT Winchip specific Machine Check Exception Reporting
  * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -14,7 +13,7 @@
 
 #include "mce.h"
 
-/* Machine check handler for WinChip C6 */
+/* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
 void winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 lo, hi;
+
 	machine_check_vector = winchip_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
+
 	rdmsr(MSR_IDT_FCR1, lo, hi);
 	lo |= (1<<2);	/* Enable EIERRINT (int 18 MCE) */
 	lo &= ~(1<<4);	/* Enable MCE */
 	wrmsr(MSR_IDT_FCR1, lo, hi);
+
 	set_in_cr4(X86_CR4_MCE);
-	printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+
+	printk(KERN_INFO
+	       "Winchip machine check reporting enabled on CPU#0.\n");
 }

From bdbfbdd5e8f0efb9bfef2e597f8ac673c36317ab Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:20 +0200
Subject: [PATCH 018/162] x86, mce: clean up non-fatal.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/non-fatal.c | 53 +++++++++++++-------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..70b710420f74 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
  * This file contains routines to check for non-fatal MCEs every 15s
  *
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/workqueue.h>
 #include <linux/interrupt.h>
-#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -22,9 +21,9 @@
 
 #include "mce.h"
 
-static int firstbank;
+static int		firstbank;
 
-#define MCE_RATE	15*HZ	/* timer rate is 15s */
+#define MCE_RATE	(15*HZ)	/* timer rate is 15s */
 
 static void mce_checkregs(void *info)
 {
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
 	for (i = firstbank; i < nr_mce_banks; i++) {
 		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
 
-		if (high & (1<<31)) {
-			printk(KERN_INFO "MCE: The hardware reports a non "
-				"fatal, correctable incident occurred on "
-				"CPU %d.\n",
+		if (!(high & (1<<31)))
+			continue;
+
+		printk(KERN_INFO "MCE: The hardware reports a non fatal, "
+			"correctable incident occurred on CPU %d.\n",
 				smp_processor_id());
-			printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
 
-			/*
-			 * Scrub the error so we don't pick it up in MCE_RATE
-			 * seconds time.
-			 */
-			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+		printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
 
-			/* Serialize */
-			wmb();
-			add_taint(TAINT_MACHINE_CHECK);
-		}
+		/*
+		 * Scrub the error so we don't pick it up in MCE_RATE
+		 * seconds time:
+		 */
+		wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+
+		/* Serialize: */
+		wmb();
+		add_taint(TAINT_MACHINE_CHECK);
 	}
 }
 
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
 
 	/* Some Athlons misbehave when we frob bank 0 */
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		boot_cpu_data.x86 == 6)
-			firstbank = 1;
+						boot_cpu_data.x86 == 6)
+		firstbank = 1;
 	else
-			firstbank = 0;
+		firstbank = 0;
 
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
 	schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
 	printk(KERN_INFO "Machine check exception polling timer started.\n");
+
 	return 0;
 }
 module_init(init_nonfatal_mce_checker);

From cb6f3c155b0afabc48667efb9e7b1ce92ccfcab4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:19 +0200
Subject: [PATCH 019/162] x86, mce: clean up therm_throt.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 72 +++++++++++++-----------
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..a2b5d7ddb19f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
 /*
- *
  * Thermal throttle event support code (such as syslog messaging and rate
  * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
  * This allows consistent reporting of CPU thermal throttle events.
  *
  * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,44 @@
  * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
  *          Inspired by Ross Biro's and Al Borchers' counter code.
  */
-
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
 #include <linux/percpu.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
-#include <asm/cpu.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
+
 #include <asm/therm_throt.h>
+#include <asm/cpu.h>
 
 /* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL              (300 * HZ)
+#define CHECK_INTERVAL		(300 * HZ)
 
 static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
-atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+atomic_t therm_throt_en		= ATOMIC_INIT(0);
 
 #ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name)                              \
-        static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+#define define_therm_throt_sysdev_one_ro(_name)				\
+	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
 
-#define define_therm_throt_sysdev_show_func(name)                            \
-static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \
-					struct sysdev_attribute *attr,	     \
-                                              char *buf)                     \
-{                                                                            \
-	unsigned int cpu = dev->id;                                          \
-	ssize_t ret;                                                         \
-                                                                             \
-	preempt_disable();              /* CPU hotplug */                    \
-	if (cpu_online(cpu))                                                 \
-		ret = sprintf(buf, "%lu\n",                                  \
-			      per_cpu(thermal_throttle_##name, cpu));        \
-	else                                                                 \
-		ret = 0;                                                     \
-	preempt_enable();                                                    \
-                                                                             \
-	return ret;                                                          \
+#define define_therm_throt_sysdev_show_func(name)			\
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,	\
+					struct sysdev_attribute *attr,	\
+					      char *buf)		\
+{									\
+	unsigned int cpu = dev->id;					\
+	ssize_t ret;							\
+									\
+	preempt_disable();	/* CPU hotplug */			\
+	if (cpu_online(cpu))						\
+		ret = sprintf(buf, "%lu\n",				\
+			      per_cpu(thermal_throttle_##name, cpu));	\
+	else								\
+		ret = 0;						\
+	preempt_enable();						\
+									\
+	return ret;							\
 }
 
 define_therm_throt_sysdev_show_func(count);
@@ -61,8 +62,8 @@ static struct attribute *thermal_throttle_attrs[] = {
 };
 
 static struct attribute_group thermal_throttle_attr_group = {
-	.attrs = thermal_throttle_attrs,
-	.name = "thermal_throttle"
+	.attrs	= thermal_throttle_attrs,
+	.name	= "thermal_throttle"
 };
 #endif /* CONFIG_SYSFS */
 
@@ -110,10 +111,11 @@ int therm_throt_process(int curr)
 }
 
 #ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device */
+/* Add/Remove thermal_throttle interface for CPU device: */
 static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
 {
-	return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	return sysfs_create_group(&sys_dev->kobj,
+				  &thermal_throttle_attr_group);
 }
 
 static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +123,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
 	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
 }
 
-/* Mutex protecting device creation against CPU hotplug */
+/* Mutex protecting device creation against CPU hotplug: */
 static DEFINE_MUTEX(therm_cpu_lock);
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
-						   unsigned long action,
-						   void *hcpu)
+static __cpuinit int
+thermal_throttle_cpu_callback(struct notifier_block *nfb,
+			      unsigned long action,
+			      void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct sys_device *sys_dev;
 	int err = 0;
 
 	sys_dev = get_cpu_sysdev(cpu);
+
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:

From 1cb2a8e1767ab60370ecce90654c0f281c602d95 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:18 +0200
Subject: [PATCH 020/162] x86, mce: clean up mce_amd_64.c

Make the coding style match that of the rest of the x86 arch code.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 188 +++++++++++++-----------
 1 file changed, 103 insertions(+), 85 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc96..4d90ec3eb51d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
  *
  *  All MC4_MISCi registers are shared between multi-cores
  */
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
-#include <linux/kobject.h>
 #include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
+#include <linux/kobject.h>
 #include <linux/sysdev.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
 #include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+
+#include <asm/percpu.h>
 #include <asm/apic.h>
+#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
-#include <asm/percpu.h>
-#include <asm/idle.h>
 
 #define PFX               "mce_threshold: "
 #define VERSION           "version 1.1.1"
@@ -48,26 +48,26 @@
 #define MCG_XBLK_ADDR     0xC0000400
 
 struct threshold_block {
-	unsigned int block;
-	unsigned int bank;
-	unsigned int cpu;
-	u32 address;
-	u16 interrupt_enable;
-	u16 threshold_limit;
-	struct kobject kobj;
-	struct list_head miscj;
+	unsigned int		block;
+	unsigned int		bank;
+	unsigned int		cpu;
+	u32			address;
+	u16			interrupt_enable;
+	u16			threshold_limit;
+	struct kobject		kobj;
+	struct list_head	miscj;
 };
 
 /* defaults used early on boot */
 static struct threshold_block threshold_defaults = {
-	.interrupt_enable = 0,
-	.threshold_limit = THRESHOLD_MAX,
+	.interrupt_enable	= 0,
+	.threshold_limit	= THRESHOLD_MAX,
 };
 
 struct threshold_bank {
-	struct kobject *kobj;
-	struct threshold_block *blocks;
-	cpumask_var_t cpus;
+	struct kobject		*kobj;
+	struct threshold_block	*blocks;
+	cpumask_var_t		cpus;
 };
 static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
 
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
  */
 
 struct thresh_restart {
-	struct threshold_block *b;
-	int reset;
-	u16 old_limit;
+	struct threshold_block	*b;
+	int			reset;
+	u16			old_limit;
 };
 
 /* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
 	} else if (tr->old_limit) {	/* change limit w/o reset */
 		int new_count = (mci_misc_hi & THRESHOLD_MAX) +
 		    (tr->old_limit - tr->b->threshold_limit);
+
 		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
 		    (new_count & THRESHOLD_MAX);
 	}
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
-	unsigned int bank, block;
 	unsigned int cpu = smp_processor_id();
-	u8 lvt_off;
 	u32 low = 0, high = 0, address = 0;
+	unsigned int bank, block;
 	struct thresh_restart tr;
+	u8 lvt_off;
 
 	for (bank = 0; bank < NR_BANKS; ++bank) {
 		for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 				if (!address)
 					break;
 				address += MCG_XBLK_ADDR;
-			}
-			else
+			} else
 				++address;
 
 			if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
  */
 static void amd_threshold_interrupt(void)
 {
+	u32 low = 0, high = 0, address = 0;
 	unsigned int bank, block;
 	struct mce m;
-	u32 low = 0, high = 0, address = 0;
 
 	mce_setup(&m);
 
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
 		if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
 			continue;
 		for (block = 0; block < NR_BLOCKS; ++block) {
-			if (block == 0)
+			if (block == 0) {
 				address = MSR_IA32_MC0_MISC + bank * 4;
-			else if (block == 1) {
+			} else if (block == 1) {
 				address = (low & MASK_BLKPTR_LO) >> 21;
 				if (!address)
 					break;
 				address += MCG_XBLK_ADDR;
-			}
-			else
+			} else {
 				++address;
+			}
 
 			if (rdmsr_safe(address, &low, &high))
 				break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
 			     (high & MASK_LOCKED_HI))
 				continue;
 
-			/* Log the machine check that caused the threshold
-			   event. */
+			/*
+			 * Log the machine check that caused the threshold
+			 * event.
+			 */
 			machine_check_poll(MCP_TIMESTAMP,
 					&__get_cpu_var(mce_poll_banks));
 
@@ -254,48 +256,56 @@ static void amd_threshold_interrupt(void)
 
 struct threshold_attr {
 	struct attribute attr;
-	ssize_t(*show) (struct threshold_block *, char *);
-	ssize_t(*store) (struct threshold_block *, const char *, size_t count);
+	ssize_t (*show) (struct threshold_block *, char *);
+	ssize_t (*store) (struct threshold_block *, const char *, size_t count);
 };
 
-#define SHOW_FIELDS(name)                                           \
-static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
-{                                                                   \
-        return sprintf(buf, "%lx\n", (unsigned long) b->name);      \
+#define SHOW_FIELDS(name)						\
+static ssize_t show_ ## name(struct threshold_block *b, char *buf)	\
+{									\
+	return sprintf(buf, "%lx\n", (unsigned long) b->name);		\
 }
 SHOW_FIELDS(interrupt_enable)
 SHOW_FIELDS(threshold_limit)
 
-static ssize_t store_interrupt_enable(struct threshold_block *b,
-				      const char *buf, size_t count)
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count)
 {
-	char *end;
 	struct thresh_restart tr;
-	unsigned long new = simple_strtoul(buf, &end, 0);
+	unsigned long new;
+	char *end;
+
+	new = simple_strtoul(buf, &end, 0);
 	if (end == buf)
 		return -EINVAL;
+
 	b->interrupt_enable = !!new;
 
-	tr.b = b;
-	tr.reset = 0;
-	tr.old_limit = 0;
+	tr.b		= b;
+	tr.reset	= 0;
+	tr.old_limit	= 0;
+
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
 	return end - buf;
 }
 
-static ssize_t store_threshold_limit(struct threshold_block *b,
-				     const char *buf, size_t count)
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t count)
 {
-	char *end;
 	struct thresh_restart tr;
-	unsigned long new = simple_strtoul(buf, &end, 0);
+	unsigned long new;
+	char *end;
+
+	new = simple_strtoul(buf, &end, 0);
 	if (end == buf)
 		return -EINVAL;
+
 	if (new > THRESHOLD_MAX)
 		new = THRESHOLD_MAX;
 	if (new < 1)
 		new = 1;
+
 	tr.old_limit = b->threshold_limit;
 	b->threshold_limit = new;
 	tr.b = b;
@@ -307,8 +317,8 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
 }
 
 struct threshold_block_cross_cpu {
-	struct threshold_block *tb;
-	long retval;
+	struct threshold_block	*tb;
+	long			retval;
 };
 
 static void local_error_count_handler(void *_tbcc)
@@ -338,15 +348,16 @@ static ssize_t store_error_count(struct threshold_block *b,
 	return 1;
 }
 
-#define THRESHOLD_ATTR(_name,_mode,_show,_store) {            \
-        .attr = {.name = __stringify(_name), .mode = _mode }, \
-        .show = _show,                                        \
-        .store = _store,                                      \
+#define THRESHOLD_ATTR(_name, _mode, _show, _store)			\
+{									\
+	.attr	= {.name = __stringify(_name), .mode = _mode },		\
+	.show	= _show,						\
+	.store	= _store,						\
 };
 
-#define RW_ATTR(name)                                           \
-static struct threshold_attr name =                             \
-        THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
+#define RW_ATTR(name)							\
+static struct threshold_attr name =					\
+	THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
 
 RW_ATTR(interrupt_enable);
 RW_ATTR(threshold_limit);
@@ -359,15 +370,17 @@ static struct attribute *default_attrs[] = {
 	NULL
 };
 
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
+#define to_block(k)	container_of(k, struct threshold_block, kobj)
+#define to_attr(a)	container_of(a, struct threshold_attr, attr)
 
 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct threshold_block *b = to_block(kobj);
 	struct threshold_attr *a = to_attr(attr);
 	ssize_t ret;
+
 	ret = a->show ? a->show(b, buf) : -EIO;
+
 	return ret;
 }
 
@@ -377,18 +390,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
 	struct threshold_block *b = to_block(kobj);
 	struct threshold_attr *a = to_attr(attr);
 	ssize_t ret;
+
 	ret = a->store ? a->store(b, buf, count) : -EIO;
+
 	return ret;
 }
 
 static struct sysfs_ops threshold_ops = {
-	.show = show,
-	.store = store,
+	.show			= show,
+	.store			= store,
 };
 
 static struct kobj_type threshold_ktype = {
-	.sysfs_ops = &threshold_ops,
-	.default_attrs = default_attrs,
+	.sysfs_ops		= &threshold_ops,
+	.default_attrs		= default_attrs,
 };
 
 static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +411,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 					       unsigned int block,
 					       u32 address)
 {
-	int err;
-	u32 low, high;
 	struct threshold_block *b = NULL;
+	u32 low, high;
+	int err;
 
 	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
 		return 0;
@@ -421,20 +436,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 	if (!b)
 		return -ENOMEM;
 
-	b->block = block;
-	b->bank = bank;
-	b->cpu = cpu;
-	b->address = address;
-	b->interrupt_enable = 0;
-	b->threshold_limit = THRESHOLD_MAX;
+	b->block		= block;
+	b->bank			= bank;
+	b->cpu			= cpu;
+	b->address		= address;
+	b->interrupt_enable	= 0;
+	b->threshold_limit	= THRESHOLD_MAX;
 
 	INIT_LIST_HEAD(&b->miscj);
 
-	if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
 		list_add(&b->miscj,
 			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
-	else
+	} else {
 		per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+	}
 
 	err = kobject_init_and_add(&b->kobj, &threshold_ktype,
 				   per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +463,9 @@ recurse:
 		if (!address)
 			return 0;
 		address += MCG_XBLK_ADDR;
-	} else
+	} else {
 		++address;
+	}
 
 	err = allocate_threshold_blocks(cpu, bank, ++block, address);
 	if (err)
@@ -507,6 +524,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 		cpumask_copy(b->cpus, cpu_core_mask(cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
+
 		goto out;
 	}
 #endif
@@ -605,15 +623,13 @@ static void deallocate_threshold_block(unsigned int cpu,
 
 static void threshold_remove_bank(unsigned int cpu, int bank)
 {
-	int i = 0;
 	struct threshold_bank *b;
 	char name[32];
+	int i = 0;
 
 	b = per_cpu(threshold_banks, cpu)[bank];
-
 	if (!b)
 		return;
-
 	if (!b->blocks)
 		goto free_out;
 
@@ -624,6 +640,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
 		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
+
 		return;
 	}
 #endif
@@ -659,8 +676,8 @@ static void threshold_remove_device(unsigned int cpu)
 }
 
 /* get notified when a cpu comes on/off */
-static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
-						     unsigned int cpu)
+static void __cpuinit
+amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
 {
 	if (cpu >= NR_CPUS)
 		return;
@@ -686,11 +703,12 @@ static __init int threshold_init_device(void)
 	/* to hit CPUs online before the notifier is up */
 	for_each_online_cpu(lcpu) {
 		int err = threshold_create_device(lcpu);
+
 		if (err)
 			return err;
 	}
 	threshold_cpu_callback = amd_64_threshold_cpu_callback;
+
 	return 0;
 }
-
 device_initcall(threshold_init_device);

From 6cc6f3ebd19fea722c19630af5ad68af7f51d493 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Apr 2009 12:31:23 +0200
Subject: [PATCH 021/162] x86, mce: unify Intel thermal init, prepare

Prepare for unification, make two intel_init_thermal equal.

[ Impact: cleanup ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 35 ++++++++++--------
 arch/x86/kernel/cpu/mcheck/p4.c           | 44 +++++++++++------------
 2 files changed, 40 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index cef3ee30744b..b85d0c107c84 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -34,21 +34,22 @@ asmlinkage void smp_thermal_interrupt(void)
 	irq_exit();
 }
 
+static inline void intel_set_thermal_handler(void) { }
+
 static void intel_init_thermal(struct cpuinfo_x86 *c)
 {
-	u32 l, h;
-	int tm2 = 0;
 	unsigned int cpu = smp_processor_id();
+	int tm2 = 0;
+	u32 l, h;
 
-	if (!cpu_has(c, X86_FEATURE_ACPI))
+	/* Thermal monitoring depends on ACPI and clock modulation*/
+	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
 		return;
 
-	if (!cpu_has(c, X86_FEATURE_ACC))
-		return;
-
-	/* first check if TM1 is already enabled by the BIOS, in which
-	 * case there might be some SMM goo which handles it, so we can't even
-	 * put a handler since it might be delivered via SMI already.
+	/*
+	 * First check if its enabled already, in which case there might
+	 * be some SMM goo which handles it, so we can't even put a handler
+	 * since it might be delivered via SMI already:
 	 */
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	h = apic_read(APIC_LVTTHMR);
@@ -61,31 +62,35 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
 		tm2 = 1;
 
+	/* Check whether a vector already exists */
 	if (h & APIC_VECTOR_MASK) {
 		printk(KERN_DEBUG
-		       "CPU%d: Thermal LVT vector (%#x) already "
-		       "installed\n", cpu, (h & APIC_VECTOR_MASK));
+		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
+		       cpu, (h & APIC_VECTOR_MASK));
 		return;
 	}
 
-	h = THERMAL_APIC_VECTOR;
-	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+	/* We'll mask the thermal vector in the lapic till we're ready: */
+	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
 	apic_write(APIC_LVTTHMR, h);
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
 	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
 
+	intel_set_thermal_handler();
+
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
 
+	/* Unmask the thermal vector: */
 	l = apic_read(APIC_LVTTHMR);
 	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
 	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
-		cpu, tm2 ? "TM2" : "TM1");
+	       cpu, tm2 ? "TM2" : "TM1");
 
 	/* enable thermal throttle processing */
 	atomic_set(&therm_throt_en, 1);
-	return;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb344aba479c..f70753a443bb 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -66,19 +66,21 @@ void smp_thermal_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+static void intel_set_thermal_handler(void)
+{
+	vendor_thermal_interrupt = intel_thermal_interrupt;
+}
+
 /* P4/Xeon Thermal regulation detect and init: */
 static void intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	unsigned int cpu = smp_processor_id();
+	int tm2 = 0;
 	u32 l, h;
 
-	/* Thermal monitoring: */
-	if (!cpu_has(c, X86_FEATURE_ACPI))
-		return;	/* -ENODEV */
-
-	/* Clock modulation: */
-	if (!cpu_has(c, X86_FEATURE_ACC))
-		return;	/* -ENODEV */
+	/* Thermal monitoring depends on ACPI and clock modulation*/
+	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+		return;
 
 	/*
 	 * First check if its enabled already, in which case there might
@@ -90,35 +92,28 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG
 		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
-
-		return; /* -EBUSY */
+		return;
 	}
 
-	/* Check whether a vector already exists, temporarily masked? */
+	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
+		tm2 = 1;
+
+	/* Check whether a vector already exists */
 	if (h & APIC_VECTOR_MASK) {
 		printk(KERN_DEBUG
 		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
 		       cpu, (h & APIC_VECTOR_MASK));
-
-		return; /* -EBUSY */
+		return;
 	}
 
-	/*
-	 * The temperature transition interrupt handler setup:
-	 */
-
-	/* Our delivery vector: */
-	h = THERMAL_APIC_VECTOR;
-
 	/* We'll mask the thermal vector in the lapic till we're ready: */
-	h |= APIC_DM_FIXED | APIC_LVT_MASKED;
+	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
 	apic_write(APIC_LVTTHMR, h);
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
 
-	/* Ok, we're good to go... */
-	vendor_thermal_interrupt = intel_thermal_interrupt;
+	intel_set_thermal_handler();
 
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
@@ -127,7 +122,8 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	l = apic_read(APIC_LVTTHMR);
 	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 
-	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+	       cpu, tm2 ? "TM2" : "TM1");
 
 	/* enable thermal throttle processing */
 	atomic_set(&therm_throt_en, 1);

From a65d086235208a3b3546e209d2210048549099b2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Apr 2009 12:31:23 +0200
Subject: [PATCH 022/162] x86, mce: unify Intel thermal init

Mechanic unification. No change in code.

[ Impact: cleanup, 32-bit / 64-bit unification ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/Makefile       |  3 +-
 arch/x86/kernel/cpu/mcheck/mce.h          | 11 ++++
 arch/x86/kernel/cpu/mcheck/mce_intel.c    | 73 +++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 61 +------------------
 arch/x86/kernel/cpu/mcheck/p4.c           | 59 +-----------------
 5 files changed, 89 insertions(+), 118 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c

diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..6def76942bf2 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,8 @@
 obj-y				=  mce_$(BITS).o therm_throt.o
 
 obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
-obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o
+obj-$(CONFIG_X86_MCE_P4THERMAL)	+= mce_intel.o
+obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
 obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f1..2d1a54bdadfc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,6 +1,8 @@
 #include <linux/init.h>
 #include <asm/mce.h>
 
+#ifdef CONFIG_X86_32
+
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
@@ -12,3 +14,12 @@ extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
 extern int nr_mce_banks;
 
+void intel_set_thermal_handler(void);
+
+#else
+
+static inline void intel_set_thermal_handler(void) { }
+
+#endif
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 000000000000..bad3cbb0e566
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,73 @@
+/*
+ * Common code for Intel machine checks
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/therm_throt.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+	unsigned int cpu = smp_processor_id();
+	int tm2 = 0;
+	u32 l, h;
+
+	/* Thermal monitoring depends on ACPI and clock modulation*/
+	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+		return;
+
+	/*
+	 * First check if its enabled already, in which case there might
+	 * be some SMM goo which handles it, so we can't even put a handler
+	 * since it might be delivered via SMI already:
+	 */
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	h = apic_read(APIC_LVTTHMR);
+	if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+		return;
+	}
+
+	if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+		tm2 = 1;
+
+	/* Check whether a vector already exists */
+	if (h & APIC_VECTOR_MASK) {
+		printk(KERN_DEBUG
+		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
+		       cpu, (h & APIC_VECTOR_MASK));
+		return;
+	}
+
+	/* We'll mask the thermal vector in the lapic till we're ready: */
+	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+	apic_write(APIC_LVTTHMR, h);
+
+	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
+
+	intel_set_thermal_handler();
+
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
+
+	/* Unmask the thermal vector: */
+	l = apic_read(APIC_LVTTHMR);
+	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+	       cpu, tm2 ? "TM2" : "TM1");
+
+	/* enable thermal throttle processing */
+	atomic_set(&therm_throt_en, 1);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index b85d0c107c84..38f9632306fa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -17,6 +17,8 @@
 #include <asm/therm_throt.h>
 #include <asm/apic.h>
 
+#include "mce.h"
+
 asmlinkage void smp_thermal_interrupt(void)
 {
 	__u64 msr_val;
@@ -34,65 +36,6 @@ asmlinkage void smp_thermal_interrupt(void)
 	irq_exit();
 }
 
-static inline void intel_set_thermal_handler(void) { }
-
-static void intel_init_thermal(struct cpuinfo_x86 *c)
-{
-	unsigned int cpu = smp_processor_id();
-	int tm2 = 0;
-	u32 l, h;
-
-	/* Thermal monitoring depends on ACPI and clock modulation*/
-	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
-		return;
-
-	/*
-	 * First check if its enabled already, in which case there might
-	 * be some SMM goo which handles it, so we can't even put a handler
-	 * since it might be delivered via SMI already:
-	 */
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	h = apic_read(APIC_LVTTHMR);
-	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-		printk(KERN_DEBUG
-		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
-		return;
-	}
-
-	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
-		tm2 = 1;
-
-	/* Check whether a vector already exists */
-	if (h & APIC_VECTOR_MASK) {
-		printk(KERN_DEBUG
-		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
-		       cpu, (h & APIC_VECTOR_MASK));
-		return;
-	}
-
-	/* We'll mask the thermal vector in the lapic till we're ready: */
-	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
-	apic_write(APIC_LVTTHMR, h);
-
-	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
-
-	intel_set_thermal_handler();
-
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
-	/* Unmask the thermal vector: */
-	l = apic_read(APIC_LVTTHMR);
-	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
-	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
-	       cpu, tm2 ? "TM2" : "TM1");
-
-	/* enable thermal throttle processing */
-	atomic_set(&therm_throt_en, 1);
-}
-
 /*
  * Support for Intel Correct Machine Check Interrupts. This allows
  * the CPU to raise an interrupt when a corrected machine check happened.
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f70753a443bb..f979ffea330b 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -66,68 +66,11 @@ void smp_thermal_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-static void intel_set_thermal_handler(void)
+void intel_set_thermal_handler(void)
 {
 	vendor_thermal_interrupt = intel_thermal_interrupt;
 }
 
-/* P4/Xeon Thermal regulation detect and init: */
-static void intel_init_thermal(struct cpuinfo_x86 *c)
-{
-	unsigned int cpu = smp_processor_id();
-	int tm2 = 0;
-	u32 l, h;
-
-	/* Thermal monitoring depends on ACPI and clock modulation*/
-	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
-		return;
-
-	/*
-	 * First check if its enabled already, in which case there might
-	 * be some SMM goo which handles it, so we can't even put a handler
-	 * since it might be delivered via SMI already:
-	 */
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	h = apic_read(APIC_LVTTHMR);
-	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-		printk(KERN_DEBUG
-		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
-		return;
-	}
-
-	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
-		tm2 = 1;
-
-	/* Check whether a vector already exists */
-	if (h & APIC_VECTOR_MASK) {
-		printk(KERN_DEBUG
-		       "CPU%d: Thermal LVT vector (%#x) already installed\n",
-		       cpu, (h & APIC_VECTOR_MASK));
-		return;
-	}
-
-	/* We'll mask the thermal vector in the lapic till we're ready: */
-	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
-	apic_write(APIC_LVTTHMR, h);
-
-	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
-
-	intel_set_thermal_handler();
-
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
-	/* Unmask the thermal vector: */
-	l = apic_read(APIC_LVTTHMR);
-	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
-	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
-	       cpu, tm2 ? "TM2" : "TM1");
-
-	/* enable thermal throttle processing */
-	atomic_set(&therm_throt_en, 1);
-}
 #endif /* CONFIG_X86_MCE_P4THERMAL */
 
 /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */

From 06b851d98266b812b2fa23d007cdf53f41194bbb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:25 +0200
Subject: [PATCH 023/162] x86, mce: unify, prepare 64bit in mce.h

Prepare mce.h for unification, so that it will build on 32-bit x86
kernels too.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e7..8488210b866f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_MCE_H
 #define _ASM_X86_MCE_H
 
-#ifdef __x86_64__
-
 #include <linux/types.h>
 #include <asm/ioctls.h>
 
@@ -10,21 +8,21 @@
  * Machine Check support for x86
  */
 
-#define MCG_CTL_P	 (1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	 (1ULL<<8)   /* MCG_CAP register available */
 #define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
 #define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
 
-#define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
-#define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
-#define MCG_STATUS_MCIP  (1UL<<2)   /* machine check in progress */
+#define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
+#define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
 
-#define MCI_STATUS_VAL   (1UL<<63)  /* valid error */
-#define MCI_STATUS_OVER  (1UL<<62)  /* previous errors lost */
-#define MCI_STATUS_UC    (1UL<<61)  /* uncorrected error */
-#define MCI_STATUS_EN    (1UL<<60)  /* error enabled */
-#define MCI_STATUS_MISCV (1UL<<59)  /* misc error reg. valid */
-#define MCI_STATUS_ADDRV (1UL<<58)  /* addr reg. valid */
-#define MCI_STATUS_PCC   (1UL<<57)  /* processor context corrupt */
+#define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
+#define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
+#define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN    (1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
+#define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
 
 /* Fields are zero when not available */
 struct mce {
@@ -82,13 +80,11 @@ struct mce_log {
 #define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9)
 #define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0)
 
-#endif /* __x86_64__ */
-
 #ifdef __KERNEL__
 
 #ifdef CONFIG_X86_32
 extern int mce_disabled;
-#else /* CONFIG_X86_32 */
+#endif
 
 #include <asm/atomic.h>
 
@@ -143,8 +139,6 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 extern int mce_notify_user(void);
 
-#endif /* !CONFIG_X86_32 */
-
 #ifdef CONFIG_X86_MCE
 extern void mcheck_init(struct cpuinfo_x86 *c);
 #else

From a988d334ae8213c0e0e62327222f6e5e6e52bcf1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:25 +0200
Subject: [PATCH 024/162] x86, mce: unify, prepare codes

Move current 32-bit mce_32.c code into mce_64.c.

[ Remove unused artifact stop/restart_mce pointed by Andi Kleen ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@firstfloor.org>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 65 +++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 1491246c4d69..ce48ae75e1dc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -1240,3 +1240,68 @@ static __init int mce_init_device(void)
 }
 
 device_initcall(mce_init_device);
+
+#ifdef CONFIG_X86_32
+
+int mce_disabled;
+
+int nr_mce_banks;
+EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
+
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+	       smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+						unexpected_machine_check;
+
+/* This has to be run for each processor */
+void mcheck_init(struct cpuinfo_x86 *c)
+{
+	if (mce_disabled == 1)
+		return;
+
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		amd_mcheck_init(c);
+		break;
+
+	case X86_VENDOR_INTEL:
+		if (c->x86 == 5)
+			intel_p5_mcheck_init(c);
+		if (c->x86 == 6)
+			intel_p6_mcheck_init(c);
+		if (c->x86 == 15)
+			intel_p4_mcheck_init(c);
+		break;
+
+	case X86_VENDOR_CENTAUR:
+		if (c->x86 == 5)
+			winchip_mcheck_init(c);
+		break;
+
+	default:
+		break;
+	}
+}
+
+static int __init mcheck_disable(char *str)
+{
+	mce_disabled = 1;
+	return 1;
+}
+
+static int __init mcheck_enable(char *str)
+{
+	mce_disabled = -1;
+	return 1;
+}
+
+__setup("nomce", mcheck_disable);
+__setup("mce", mcheck_enable);
+
+#endif /* CONFIG_X86_32 */

From 711c2e481c9d1113650d09de10f61ee88ab56fda Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:26 +0200
Subject: [PATCH 025/162] x86, mce: unify, prepare for 32-bit v2

Prepare the 64-bit mce_64.c code side to be built on 32-bit.

[ includes ifdef relocation by Andi Kleen ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@firstfloor.org>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.h    | 4 ++--
 arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index 2d1a54bdadfc..cd6cffcc2de0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,14 @@
 #include <linux/init.h>
 #include <asm/mce.h>
 
-#ifdef CONFIG_X86_32
-
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
+#ifdef CONFIG_X86_32
+
 /* Call the installed machine check handler for this CPU setup. */
 extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index ce48ae75e1dc..2e2c3d2e9586 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -37,6 +37,10 @@
 #include <asm/msr.h>
 #include <asm/smp.h>
 
+#include "mce.h"
+
+#ifdef CONFIG_X86_64
+
 #define MISC_MCELOG_MINOR	227
 
 atomic_t mce_entry;
@@ -1241,7 +1245,7 @@ static __init int mce_init_device(void)
 
 device_initcall(mce_init_device);
 
-#ifdef CONFIG_X86_32
+#else /* CONFIG_X86_32: */
 
 int mce_disabled;
 

From dba3725d44f5dfb5711fd509fca10b5b828c43b7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:26 +0200
Subject: [PATCH 026/162] x86, mce: unify

move mce_64.c => mce.c and glue it up in the Makefile.
Remove mce_32.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/Makefile           |  2 +-
 .../x86/kernel/cpu/mcheck/{mce_64.c => mce.c} |  0
 arch/x86/kernel/cpu/mcheck/mce_32.c           | 77 -------------------
 3 files changed, 1 insertion(+), 78 deletions(-)
 rename arch/x86/kernel/cpu/mcheck/{mce_64.c => mce.c} (100%)
 delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_32.c

diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 6def76942bf2..55f01b39a105 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,4 +1,4 @@
-obj-y				=  mce_$(BITS).o therm_throt.o
+obj-y				=  mce.o therm_throt.o
 
 obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_P4THERMAL)	+= mce_intel.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce.c
similarity index 100%
rename from arch/x86/kernel/cpu/mcheck/mce_64.c
rename to arch/x86/kernel/cpu/mcheck/mce.c
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 05979e7eff12..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
- */
-#include <linux/thread_info.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-
-#include "mce.h"
-
-int mce_disabled;
-
-int nr_mce_banks;
-EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
-	       smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
-						unexpected_machine_check;
-
-/* This has to be run for each processor */
-void mcheck_init(struct cpuinfo_x86 *c)
-{
-	if (mce_disabled == 1)
-		return;
-
-	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		amd_mcheck_init(c);
-		break;
-
-	case X86_VENDOR_INTEL:
-		if (c->x86 == 5)
-			intel_p5_mcheck_init(c);
-		if (c->x86 == 6)
-			intel_p6_mcheck_init(c);
-		if (c->x86 == 15)
-			intel_p4_mcheck_init(c);
-		break;
-
-	case X86_VENDOR_CENTAUR:
-		if (c->x86 == 5)
-			winchip_mcheck_init(c);
-		break;
-
-	default:
-		break;
-	}
-}
-
-static int __init mcheck_disable(char *str)
-{
-	mce_disabled = 1;
-	return 1;
-}
-
-static int __init mcheck_enable(char *str)
-{
-	mce_disabled = -1;
-	return 1;
-}
-
-__setup("nomce", mcheck_disable);
-__setup("mce", mcheck_enable);

From cb491fca55e5282f0a95ef39c55352e00d6ca75e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:17 +0200
Subject: [PATCH 027/162] x86, mce: Rename sysfs variables

Shorten variable names. This also compacts the code a bit.

	device_mce		=> mce_dev
	mce_device_initialized	=> mce_dev_initialized
	mce_attribute		=> mce_attrs

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h              |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c        | 58 ++++++++++++-------------
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 10 ++---
 3 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8488210b866f..b9972a6bc2a1 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -90,7 +90,7 @@ extern int mce_disabled;
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, device_mce);
+DECLARE_PER_CPU(struct sys_device, mce_dev);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2e2c3d2e9586..ba8dd41a10dc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -957,7 +957,7 @@ static struct sysdev_class mce_sysclass = {
 	.name		= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct sys_device, device_mce);
+DEFINE_PER_CPU(struct sys_device, mce_dev);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1039,12 +1039,12 @@ static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 
 ACCESSOR(check_interval, check_interval, mce_restart())
 
-static struct sysdev_attribute *mce_attributes[] = {
+static struct sysdev_attribute *mce_attrs[] = {
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
 
-static cpumask_var_t mce_device_initialized;
+static cpumask_var_t mce_dev_initialized;
 
 /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
 static __cpuinit int mce_create_device(unsigned int cpu)
@@ -1055,40 +1055,36 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
-	per_cpu(device_mce, cpu).id	= cpu;
-	per_cpu(device_mce, cpu).cls	= &mce_sysclass;
+	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
+	per_cpu(mce_dev, cpu).id	= cpu;
+	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
 
-	err = sysdev_register(&per_cpu(device_mce, cpu));
+	err = sysdev_register(&per_cpu(mce_dev, cpu));
 	if (err)
 		return err;
 
-	for (i = 0; mce_attributes[i]; i++) {
-		err = sysdev_create_file(&per_cpu(device_mce, cpu),
-					 mce_attributes[i]);
+	for (i = 0; mce_attrs[i]; i++) {
+		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
 		if (err)
 			goto error;
 	}
 	for (i = 0; i < banks; i++) {
-		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
 					&bank_attrs[i]);
 		if (err)
 			goto error2;
 	}
-	cpumask_set_cpu(cpu, mce_device_initialized);
+	cpumask_set_cpu(cpu, mce_dev_initialized);
 
 	return 0;
 error2:
-	while (--i >= 0) {
-		sysdev_remove_file(&per_cpu(device_mce, cpu),
-					&bank_attrs[i]);
-	}
+	while (--i >= 0)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
 error:
-	while (--i >= 0) {
-		sysdev_remove_file(&per_cpu(device_mce, cpu),
-				   mce_attributes[i]);
-	}
-	sysdev_unregister(&per_cpu(device_mce, cpu));
+	while (--i >= 0)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+	sysdev_unregister(&per_cpu(mce_dev, cpu));
 
 	return err;
 }
@@ -1097,24 +1093,24 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
-	if (!cpumask_test_cpu(cpu, mce_device_initialized))
+	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
 		return;
 
-	for (i = 0; mce_attributes[i]; i++)
-		sysdev_remove_file(&per_cpu(device_mce, cpu),
-			mce_attributes[i]);
+	for (i = 0; mce_attrs[i]; i++)
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
 	for (i = 0; i < banks; i++)
-		sysdev_remove_file(&per_cpu(device_mce, cpu),
-			&bank_attrs[i]);
-	sysdev_unregister(&per_cpu(device_mce, cpu));
-	cpumask_clear_cpu(cpu, mce_device_initialized);
+		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
+
+	sysdev_unregister(&per_cpu(mce_dev, cpu));
+	cpumask_clear_cpu(cpu, mce_dev_initialized);
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
 static void mce_disable_cpu(void *h)
 {
-	int i;
 	unsigned long action = *(unsigned long *)h;
+	int i;
 
 	if (!mce_available(&current_cpu_data))
 		return;
@@ -1221,7 +1217,7 @@ static __init int mce_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+	alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
 
 	err = mce_init_banks();
 	if (err)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 4d90ec3eb51d..083f270251fa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -517,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -540,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -560,7 +560,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -638,7 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -650,7 +650,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 

From b659294b779565c60f5e12ef505328e2b974eb62 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 12:31:27 +0200
Subject: [PATCH 028/162] x86, mce: print number of MCE banks

The number of MCE banks supported by a CPU is a useful number to know,
so print it out during CPU initialization.

[ Impact: add printout ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ba8dd41a10dc..49c74222359d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -570,6 +570,8 @@ static int mce_cap_init(void)
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	b = cap & 0xff;
+	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+
 	if (b > MAX_NR_BANKS) {
 		printk(KERN_WARNING
 		       "MCE: Using only %u machine check banks out of %u\n",
@@ -1287,6 +1289,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	default:
 		break;
 	}
+	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
 }
 
 static int __init mcheck_disable(char *str)

From ba2d0f2b0c56d7174a0208f7c463271f39040728 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Apr 2009 12:31:24 +0200
Subject: [PATCH 029/162] x86, mce: Cleanup symbols in intel thermal codes

Decode magic constants and turn them into symbols.

[ Cleanup to use symbols already exists - HS ]

[ Impact: cleanup ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/msr-index.h          | 7 +++++++
 arch/x86/kernel/cpu/mcheck/mce_intel.c    | 9 +++++----
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +-
 arch/x86/kernel/cpu/mcheck/p4.c           | 2 +-
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c167..c86404695083 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -208,7 +208,14 @@
 
 #define MSR_IA32_THERM_CONTROL		0x0000019a
 #define MSR_IA32_THERM_INTERRUPT	0x0000019b
+
+#define THERM_INT_LOW_ENABLE		(1 << 0)
+#define THERM_INT_HIGH_ENABLE		(1 << 1)
+
 #define MSR_IA32_THERM_STATUS		0x0000019c
+
+#define THERM_STATUS_PROCHOT		(1 << 0)
+
 #define MSR_IA32_MISC_ENABLE		0x000001a0
 
 /* MISC_ENABLE bits: architectural */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index bad3cbb0e566..2b011d2d8579 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -32,13 +32,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	 */
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	h = apic_read(APIC_LVTTHMR);
-	if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
+	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG
 		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
 		return;
 	}
 
-	if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
 		tm2 = 1;
 
 	/* Check whether a vector already exists */
@@ -54,12 +54,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	apic_write(APIC_LVTTHMR, h);
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT,
+		l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
 
 	intel_set_thermal_handler();
 
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
 
 	/* Unmask the thermal vector: */
 	l = apic_read(APIC_LVTTHMR);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 38f9632306fa..13abafcb72e4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -29,7 +29,7 @@ asmlinkage void smp_thermal_interrupt(void)
 	irq_enter();
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-	if (therm_throt_process(msr_val & 1))
+	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
 		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f979ffea330b..82cee108a2d3 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -51,7 +51,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
 	ack_APIC_irq();
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-	therm_throt_process(msr_val & 0x1);
+	therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
 }
 
 /* Thermal interrupt handler for this CPU setup: */

From 01c6680a547a3ee8dd170c269ea8e037b3191b71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Apr 2009 12:31:24 +0200
Subject: [PATCH 030/162] x86, mce: Cleanup MCG definitions

Decode more magic constants and turn them into symbols.

[ Sort definitions bitwise, introduce MCG_EXT_CNT - HS ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       | 10 +++++++---
 arch/x86/kernel/cpu/mcheck/mce.c |  5 +++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b9972a6bc2a1..94aedaf6327f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -8,9 +8,13 @@
  * Machine Check support for x86
  */
 
-#define MCG_CTL_P	 (1ULL<<8)   /* MCG_CAP register available */
-#define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
-#define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
+#define MCG_BANKCNT_MASK	0xff         /* Number of Banks */
+#define MCG_CTL_P		(1ULL<<8)    /* MCG_CAP register available */
+#define MCG_EXT_P		(1ULL<<9)    /* Extended registers available */
+#define MCG_CMCI_P		(1ULL<<10)   /* CMCI supported */
+#define MCG_EXT_CNT_MASK	0xff0000     /* Number of Extended registers */
+#define MCG_EXT_CNT_SHIFT	16
+#define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
 
 #define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 49c74222359d..147333627416 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -569,7 +569,8 @@ static int mce_cap_init(void)
 	u64 cap;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	b = cap & 0xff;
+
+	b = cap & MCG_BANKCNT_MASK;
 	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
 
 	if (b > MAX_NR_BANKS) {
@@ -590,7 +591,7 @@ static int mce_cap_init(void)
 	}
 
 	/* Use accurate RIP reporting if available. */
-	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
+	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
 	return 0;

From 3cde5c8c839bf46a7be799ed0e1d0b4780aaf794 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 27 Apr 2009 18:01:31 +0200
Subject: [PATCH 031/162] x86, mce: initial steps to make 64bit mce code 32bit
 clean

Replace unsigned long with u64s if they need to contain 64bit values.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 147333627416..cd1313b47506 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -156,15 +156,15 @@ static void print_mce(struct mce *m)
 	       "and contact your hardware vendor\n");
 }
 
-static void mce_panic(char *msg, struct mce *backup, unsigned long start)
+static void mce_panic(char *msg, struct mce *backup, u64 start)
 {
 	int i;
 
 	oops_begin();
 	for (i = 0; i < MCE_LOG_LEN; i++) {
-		unsigned long tsc = mcelog.entry[i].tsc;
+		u64 tsc = mcelog.entry[i].tsc;
 
-		if (time_before(tsc, start))
+		if ((s64)(tsc - start) < 0)
 			continue;
 		print_mce(&mcelog.entry[i]);
 		if (backup && mcelog.entry[i].tsc == backup->tsc)
@@ -970,13 +970,13 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 	static ssize_t show_ ## name(struct sys_device *s,		\
 				     struct sysdev_attribute *attr,	\
 				     char *buf) {			\
-		return sprintf(buf, "%lx\n", (unsigned long)var);	\
+		return sprintf(buf, "%Lx\n", (u64)var);			\
 	}								\
 	static ssize_t set_ ## name(struct sys_device *s,		\
 				    struct sysdev_attribute *attr,	\
 				    const char *buf, size_t siz) {	\
 		char *end;						\
-		unsigned long new = simple_strtoul(buf, &end, 0);	\
+		u64 new = simple_strtoull(buf, &end, 0);		\
 									\
 		if (end == buf)						\
 			return -EINVAL;					\

From 06b7a7a5ec917761969444fee967c43868a76468 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 27 Apr 2009 18:37:43 +0200
Subject: [PATCH 032/162] x86, mce: implement the PPro bank 0 quirk in the
 64bit machine check code

Quoting the comment:

* SDM documents that on family 6 bank 0 should not be written
* because it aliases to another special BIOS controlled
* register.
* But it's not aliased anymore on model 0x1a+
* Don't ignore bank 0 completely because there could be a valid
* event later, merely don't write CTL0.

This is mostly a port on the 32bit code, except that 32bit
always didn't write it and didn't have the 0x1a heuristic. I checked
with the CPU designers that the quirk is not required starting with
this model.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 40 +++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index cd1313b47506..1dcd3be03328 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -65,6 +65,8 @@ static atomic_t			mce_events;
 static char			trigger[128];
 static char			*trigger_argv[2] = { trigger, NULL };
 
+static unsigned long		dont_init_banks;
+
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
 /* MCA banks polled by the period polling timer for corrected events */
@@ -72,6 +74,11 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 };
 
+static inline int skip_bank_init(int i)
+{
+	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
+}
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -616,6 +623,8 @@ static void mce_init(void *dummy)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
+		if (skip_bank_init(i))
+			continue;
 		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
@@ -643,6 +652,19 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 		}
 	}
 
+	if (c->x86_vendor == X86_VENDOR_INTEL) {
+		/*
+		 * SDM documents that on family 6 bank 0 should not be written
+		 * because it aliases to another special BIOS controlled
+		 * register.
+		 * But it's not aliased anymore on model 0x1a+
+		 * Don't ignore bank 0 completely because there could be a
+		 * valid event later, merely don't write CTL0.
+		 */
+
+		if (c->x86 == 6 && c->x86_model < 0x1A)
+			__set_bit(0, &dont_init_banks);
+	}
 }
 
 static void mce_cpu_features(struct cpuinfo_x86 *c)
@@ -911,8 +933,10 @@ static int mce_disable(void)
 {
 	int i;
 
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	for (i = 0; i < banks; i++) {
+		if (!skip_bank_init(i))
+			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	}
 	return 0;
 }
 
@@ -1119,8 +1143,10 @@ static void mce_disable_cpu(void *h)
 		return;
 	if (!(action & CPU_TASKS_FROZEN))
 		cmci_clear();
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	for (i = 0; i < banks; i++) {
+		if (!skip_bank_init(i))
+			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	}
 }
 
 static void mce_reenable_cpu(void *h)
@@ -1133,8 +1159,10 @@ static void mce_reenable_cpu(void *h)
 
 	if (!(action & CPU_TASKS_FROZEN))
 		cmci_reenable();
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+	for (i = 0; i < banks; i++) {
+		if (!skip_bank_init(i))
+			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+	}
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */

From 2e6f694fde0a7158590e121962ca2e3c06633528 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 27 Apr 2009 18:42:48 +0200
Subject: [PATCH 033/162] x86, mce: port K7 bank 0 quirk to 64bit mce code

Various K7 have broken bank 0s. Don't enable it by default

Port from the 32bit code.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1dcd3be03328..1336280edcc2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -650,6 +650,12 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 			 */
 			mce_bootlog = 0;
 		}
+		/*
+		 * Various K7s with broken bank 0 around. Always disable
+		 * by default.
+		 */
+		 if (c->x86 == 6)
+			bank[0] = 0;
 	}
 
 	if (c->x86_vendor == X86_VENDOR_INTEL) {

From 5d7279268b654d1f8ac43b0eb6cd9598d9cf55fd Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 27 Apr 2009 19:25:48 +0200
Subject: [PATCH 034/162] x86, mce: use a call vector to call the 64bit mce
 handler

Allows to call different machine check handlers from the low
level machine check entry vector.

This is needed for later when it will be used for 32bit too.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 23 ++++++++++++-----------
 arch/x86/kernel/cpu/mcheck/mce.h |  3 ++-
 arch/x86/kernel/entry_64.S       |  2 +-
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1336280edcc2..d99318b470d8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -39,6 +39,16 @@
 
 #include "mce.h"
 
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+	       smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+						unexpected_machine_check;
 #ifdef CONFIG_X86_64
 
 #define MISC_MCELOG_MINOR	227
@@ -715,6 +725,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 	}
 	mce_cpu_quirks(c);
 
+	machine_check_vector = do_machine_check;
+
 	mce_init(NULL);
 	mce_cpu_features(c);
 	mce_init_timer();
@@ -1285,17 +1297,6 @@ int mce_disabled;
 int nr_mce_banks;
 EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
 
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
-	       smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
-						unexpected_machine_check;
-
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index cd6cffcc2de0..966ae3c5cb11 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -7,11 +7,12 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
-#ifdef CONFIG_X86_32
 
 /* Call the installed machine check handler for this CPU setup. */
 extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
+#ifdef CONFIG_X86_32
+
 extern int nr_mce_banks;
 
 void intel_set_thermal_handler(void);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..63276c45bffa 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1382,7 +1382,7 @@ paranoiderrorentry stack_segment do_stack_segment
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
 #ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check do_machine_check
+paranoidzeroentry machine_check *machine_check_vector(%rip)
 #endif
 
 	/*

From 04b2b1a4df6cd0fdaa598f3c623a19c2d93cb48a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 22:50:19 +0200
Subject: [PATCH 035/162] x86, mce: rename 64bit mce_dont_init to mce_disabled

Give it the same name as on 32bit. This makes further merging easier.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       |  2 --
 arch/x86/kernel/cpu/mcheck/mce.c | 15 +++++++--------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 94aedaf6327f..c3c7ee701753 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -86,9 +86,7 @@ struct mce_log {
 
 #ifdef __KERNEL__
 
-#ifdef CONFIG_X86_32
 extern int mce_disabled;
-#endif
 
 #include <asm/atomic.h>
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d99318b470d8..6ab477060f56 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -49,14 +49,15 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 /* Call the installed machine check handler for this CPU setup. */
 void (*machine_check_vector)(struct pt_regs *, long error_code) =
 						unexpected_machine_check;
+
+int				mce_disabled;
+
 #ifdef CONFIG_X86_64
 
 #define MISC_MCELOG_MINOR	227
 
 atomic_t mce_entry;
 
-static int			mce_dont_init;
-
 /*
  * Tolerant levels:
  *   0: always panic on uncorrected errors, log corrected errors
@@ -194,7 +195,7 @@ static void mce_panic(char *msg, struct mce *backup, u64 start)
 
 int mce_available(struct cpuinfo_x86 *c)
 {
-	if (mce_dont_init)
+	if (mce_disabled)
 		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
@@ -720,7 +721,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 		return;
 
 	if (mce_cap_init() < 0) {
-		mce_dont_init = 1;
+		mce_disabled = 1;
 		return;
 	}
 	mce_cpu_quirks(c);
@@ -911,7 +912,7 @@ static struct miscdevice mce_log_device = {
  */
 static int __init mcheck_disable(char *str)
 {
-	mce_dont_init = 1;
+	mce_disabled = 1;
 	return 1;
 }
 __setup("nomce", mcheck_disable);
@@ -925,7 +926,7 @@ __setup("nomce", mcheck_disable);
 static int __init mcheck_enable(char *str)
 {
 	if (!strcmp(str, "off"))
-		mce_dont_init = 1;
+		mce_disabled = 1;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 		mce_bootlog = (str[0] == 'b');
 	else if (isdigit(str[0]))
@@ -1292,8 +1293,6 @@ device_initcall(mce_init_device);
 
 #else /* CONFIG_X86_32: */
 
-int mce_disabled;
-
 int nr_mce_banks;
 EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
 

From d7c3c9a609563868d8a70e220399d06a25aba095 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 23:07:25 +0200
Subject: [PATCH 036/162] x86, mce: move mce_disabled option into common
 32bit/64bit code

It's the same function, so let's share it.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6ab477060f56..5395200dc9d9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -907,16 +907,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
-	mce_disabled = 1;
-	return 1;
-}
-__setup("nomce", mcheck_disable);
-
 /*
  * mce=off disables machine check
  * mce=TOLERANCELEVEL (number, see above)
@@ -1327,19 +1317,22 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
 }
 
-static int __init mcheck_disable(char *str)
-{
-	mce_disabled = 1;
-	return 1;
-}
-
 static int __init mcheck_enable(char *str)
 {
 	mce_disabled = -1;
 	return 1;
 }
 
-__setup("nomce", mcheck_disable);
 __setup("mce", mcheck_enable);
 
-#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_X86_OLD_MCE */
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+	mce_disabled = 1;
+	return 1;
+}
+__setup("nomce", mcheck_disable);

From 8e97aef5f43ec715f394bc15015ff263b80c3ad6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 14:23:18 +0200
Subject: [PATCH 037/162] x86, mce: remove machine check handler idle notify on
 64bit

i386 has no idle notifiers, but the 64bit machine check
code uses them to wake up mcelog from a fatal machine check
exception.

For corrected machine checks found by the poller or
threshold interrupts going through an idle notifier is not needed
because the wake_up can is just done directly and doesn't
need the idle notifier. It is only needed for logging
exceptions.

To be honest I never liked the idle notifier even though I signed
off on it. On closer investigation the code actually turned out
to be nearly. Right now machine check exceptions on x86 are always
unrecoverable (lead to panic due to PCC), which means we never execute
the idle notifier path.

The only exception is the somewhat weird tolerant==3 case, which
ignores PCC. I'll fix this in a future patch in a much cleaner way.

So remove the "mcelog wakeup through idle notifier" code
from 64bit.

This allows to compile the 64bit machine check handler on 32bit
which doesn't have idle notifiers.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5395200dc9d9..7562c1f674f3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -555,29 +555,6 @@ int mce_notify_user(void)
 	return 0;
 }
 
-/* see if the idle task needs to notify userspace: */
-static int
-mce_idle_callback(struct notifier_block *nfb, unsigned long action,
-		  void *unused)
-{
-	/* IDLE_END should be safe - interrupts are back on */
-	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
-		mce_notify_user();
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block mce_idle_notifier = {
-	.notifier_call		= mce_idle_callback,
-};
-
-static __init int periodic_mcheck_init(void)
-{
-       idle_notifier_register(&mce_idle_notifier);
-       return 0;
-}
-__initcall(periodic_mcheck_init);
-
 /*
  * Initialize Machine Checks for a CPU.
  */

From d896a940ef4f12a0a6bc432853b249dcfbacabf0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 14:25:18 +0200
Subject: [PATCH 038/162] x86, mce: remove oops_begin() use in 64bit machine
 check

First 32bit doesn't have oops_begin, so it's a barrier of using
this code on 32bit.

On closer examination it turns out oops_begin is not
a good idea in a machine check panic anyways. All oops_begin
does it so check for recursive/parallel oopses and implement the
"wait on oops" heuristic. But there's actually no good reason
to lock machine checks against oopses or prevent them
from recursion. Also "wait on oops" does not really make
sense for a machine check too.

Replace it with a manual bust_spinlocks/console_verbose.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7562c1f674f3..f4d6841d2bdf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -178,7 +178,8 @@ static void mce_panic(char *msg, struct mce *backup, u64 start)
 {
 	int i;
 
-	oops_begin();
+	bust_spinlocks(1);
+	console_verbose();
 	for (i = 0; i < MCE_LOG_LEN; i++) {
 		u64 tsc = mcelog.entry[i].tsc;
 

From 4efc0670baf4b14bc95502e54a83ccf639146125 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 19:07:31 +0200
Subject: [PATCH 039/162] x86, mce: use 64bit machine check code on 32bit

The 64bit machine check code is in many ways much better than
the 32bit machine check code: it is more specification compliant,
is cleaner, only has a single code base versus one per CPU,
has better infrastructure for recovery, has a cleaner way to communicate
with user space etc. etc.

Use the 64bit code for 32bit too.

This is the second attempt to do this. There was one a couple of years
ago to unify this code for 32bit and 64bit.  Back then this ran into some
trouble with K7s and was reverted.

I believe this time the K7 problems (and some others) are addressed.
I went over the old handlers and was very careful to retain
all quirks.

But of course this needs a lot of testing on old systems. On newer
64bit capable systems I don't expect much problems because they have been
already tested with the 64bit kernel.

I made this a CONFIG for now that still allows to select the old
machine check code. This is mostly to make testing easier,
if someone runs into a problem we can ask them to try
with the CONFIG switched.

The new code is default y for more coverage.

Once there is confidence the 64bit code works well on older hardware
too the CONFIG_X86_OLD_MCE and the associated code can be easily
removed.

This causes a behaviour change for 32bit installations. They now
have to install the mcelog package to be able to log
corrected machine checks.

The 64bit machine check code only handles CPUs which support the
standard Intel machine check architecture described in the IA32 SDM.
The 32bit code has special support for some older CPUs which
have non standard machine check architectures, in particular
WinChip C3 and Intel P5.  I made those a separate CONFIG option
and kept them for now. The WinChip variant could be probably
removed without too much pain, it doesn't really do anything
interesting. P5 is also disabled by default (like it
was before) because many motherboards have it miswired, but
according to Alan Cox a few embedded setups use that one.

Forward ported/heavily changed version of old patch, original patch
included review/fixes from Thomas Gleixner, Bert Wesarg.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                    | 33 +++++++++++++++++++++++++++--
 arch/x86/include/asm/entry_arch.h   |  2 +-
 arch/x86/kernel/apic/apic.c         |  4 ++--
 arch/x86/kernel/apic/nmi.c          |  2 +-
 arch/x86/kernel/cpu/mcheck/Makefile |  3 ++-
 arch/x86/kernel/cpu/mcheck/mce.c    | 32 ++++++++++++++++++++++++----
 arch/x86/kernel/cpu/mcheck/mce.h    | 18 +++++++++++++---
 arch/x86/kernel/cpu/mcheck/p5.c     |  5 +++++
 arch/x86/kernel/irq.c               |  4 ++--
 arch/x86/kernel/irqinit_32.c        |  2 +-
 arch/x86/kernel/signal.c            |  4 ++--
 arch/x86/kernel/traps.c             |  4 ++--
 12 files changed, 92 insertions(+), 21 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a6efe0a2e9ae..c1c5ccd1937f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -789,6 +789,22 @@ config X86_MCE
 	  to disable it.  MCE support simply ignores non-MCE processors like
 	  the 386 and 486, so nearly everyone can say Y here.
 
+config X86_OLD_MCE
+	depends on X86_32 && X86_MCE
+	bool "Use legacy machine check code (will go away)"
+	default n
+	select X86_ANCIENT_MCE
+	---help---
+	  Use the old i386 machine check code. This is merely intended for
+	  testing in a transition period. Try this if you run into any machine
+	  check related software problems, but report the problem to
+	  linux-kernel.  When in doubt say no.
+
+config X86_NEW_MCE
+	depends on X86_MCE
+	bool
+	default y if (!X86_OLD_MCE && X86_32) || X86_64
+
 config X86_MCE_INTEL
 	def_bool y
 	prompt "Intel MCE features"
@@ -805,6 +821,15 @@ config X86_MCE_AMD
 	   Additional support for AMD specific MCE features such as
 	   the DRAM Error Threshold.
 
+config X86_ANCIENT_MCE
+       def_bool n
+       depends on X86_32
+       prompt "Support for old Pentium 5 / WinChip machine checks"
+       ---help---
+	 Include support for machine check handling on old Pentium 5 or WinChip
+	 systems. These typically need to be enabled explicitely on the command
+	 line.
+
 config X86_MCE_THRESHOLD
 	depends on X86_MCE_AMD || X86_MCE_INTEL
 	bool
@@ -812,7 +837,7 @@ config X86_MCE_THRESHOLD
 
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
-	depends on X86_32 && X86_MCE
+	depends on X86_OLD_MCE
 	---help---
 	  Enabling this feature starts a timer that triggers every 5 seconds which
 	  will look at the machine check registers to see if anything happened.
@@ -825,11 +850,15 @@ config X86_MCE_NONFATAL
 
 config X86_MCE_P4THERMAL
 	bool "check for P4 thermal throttling interrupt."
-	depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
+	depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
 	---help---
 	  Enabling this feature will cause a message to be printed when the P4
 	  enters thermal throttling.
 
+config X86_THERMAL_VECTOR
+	def_bool y
+	depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
+
 config VM86
 	bool "Enable VM86 support" if EMBEDDED
 	default y
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..486c9e946f5c 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -52,7 +52,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
 #endif
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_THERMAL_VECTOR
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..ad532289ef2e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -843,7 +843,7 @@ void clear_local_APIC(void)
 	}
 
 	/* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -1962,7 +1962,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	if (maxlvt >= 5)
 		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a1..c4762276c17e 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static inline int mce_in_progress(void)
 {
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#if defined(CONFIG_X86_NEW_MCE)
 	return atomic_read(&mce_entry) > 0;
 #endif
 	return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 55f01b39a105..5f8b09425d39 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,6 +1,7 @@
 obj-y				=  mce.o therm_throt.o
 
-obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
+obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o
+obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o
 obj-$(CONFIG_X86_MCE_P4THERMAL)	+= mce_intel.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index f4d6841d2bdf..e193de44ef19 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,7 +52,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
 
 int				mce_disabled;
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_NEW_MCE
 
 #define MISC_MCELOG_MINOR	227
 
@@ -662,6 +662,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+{
+	if (c->x86 != 5)
+		return;
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		if (mce_p5_enabled())
+			intel_p5_mcheck_init(c);
+		break;
+	case X86_VENDOR_CENTAUR:
+		winchip_mcheck_init(c);
+		break;
+	}
+}
+
 static void mce_cpu_features(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
@@ -695,6 +710,11 @@ static void mce_init_timer(void)
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
+	if (mce_disabled)
+		return;
+
+	mce_ancient_init(c);
+
 	if (!mce_available(c))
 		return;
 
@@ -893,6 +913,10 @@ static struct miscdevice mce_log_device = {
  */
 static int __init mcheck_enable(char *str)
 {
+	if (*str == 0)
+		enable_p5_mce();
+	if (*str == '=')
+		str++;
 	if (!strcmp(str, "off"))
 		mce_disabled = 1;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
@@ -900,13 +924,13 @@ static int __init mcheck_enable(char *str)
 	else if (isdigit(str[0]))
 		get_option(&str, &tolerant);
 	else {
-		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
+		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
 		       str);
 		return 0;
 	}
 	return 1;
 }
-__setup("mce=", mcheck_enable);
+__setup("mce", mcheck_enable);
 
 /*
  * Sysfs support
@@ -1259,7 +1283,7 @@ static __init int mce_init_device(void)
 
 device_initcall(mce_init_device);
 
-#else /* CONFIG_X86_32: */
+#else /* CONFIG_X86_OLD_MCE: */
 
 int nr_mce_banks;
 EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index 966ae3c5cb11..84a552b458c8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,17 +1,29 @@
 #include <linux/init.h>
 #include <asm/mce.h>
 
+#ifdef CONFIG_X86_OLD_MCE
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
+#endif
 
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+extern int mce_p5_enable;
+static inline int mce_p5_enabled(void) { return mce_p5_enable; }
+static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mce_p5_enabled(void) { return 0; }
+static inline void enable_p5_mce(void) { }
+#endif
 
 /* Call the installed machine check handler for this CPU setup. */
 extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_OLD_MCE
 
 extern int nr_mce_banks;
 
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 8812f5441830..015f481ab1b0 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -14,6 +14,9 @@
 
 #include "mce.h"
 
+/* By default disabled */
+int		mce_p5_enable;
+
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
@@ -44,9 +47,11 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_MCE))
 		return;
 
+#ifdef CONFIG_X86_OLD_MCE
 	/* Default P5 to off as its often misconnected: */
 	if (mce_disabled != -1)
 		return;
+#endif
 
 	machine_check_vector = pentium_machine_check;
 	/* Make sure the vector pointer is visible before we enable MCEs: */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..35eddc9ec99e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -89,7 +89,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
 	seq_printf(p, "  Thermal event interrupts\n");
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	seq_printf(p, "%*s: ", prec, "THR");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
@@ -176,7 +176,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #endif
 #ifdef CONFIG_X86_MCE
 	sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	sum += irq_stats(cpu)->irq_threshold_count;
 #endif
 #endif
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8836f9..98846e03211e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -181,7 +181,7 @@ void __init native_init_IRQ(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 #endif
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..d0851e3f77eb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -25,11 +25,11 @@
 #include <asm/ucontext.h>
 #include <asm/i387.h>
 #include <asm/vdso.h>
+#include <asm/mce.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/mce.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
@@ -857,7 +857,7 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#ifdef CONFIG_X86_NEW_MCE
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
 		mce_notify_user();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..ad771f15bddd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -798,7 +798,8 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
 
 	return new_kesp;
 }
-#else
+#endif
+
 asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
 {
 }
@@ -806,7 +807,6 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
 asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
 {
 }
-#endif
 
 /*
  * 'math_state_restore()' saves the current math information in the

From 45f458e9a8a216b02b76fe61d9e8bc40d659fbe8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 23:18:26 +0200
Subject: [PATCH 040/162] x86, mce: deprecate old 32bit machine check code

Schedule for removal in 2.6.32

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/feature-removal-schedule.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index de491a3e2313..ec9ef5d0d7b3 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -437,3 +437,13 @@ Why:	Superseded by tdfxfb. I2C/DDC support used to live in a separate
 	driver but this caused driver conflicts.
 Who:	Jean Delvare <khali@linux-fr.org>
 	Krzysztof Helt <krzysztof.h1@wp.pl>
+
+----------------------------
+
+What:	CONFIG_X86_OLD_MCE
+When:	2.6.32
+Why:	Remove the old legacy 32bit machine check code. This has been
+	superseded by the newer machine check code from the 64bit port,
+	but the old version has been kept around for easier testing. Note this
+	doesn't impact the old P5 and WinChip machine check handlers.
+Who:	Andi Kleen <andi@firstfloor.org>

From 7856f6cce4a8cda8c1f94b99605c07d16b8d8dec Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 23:32:56 +0200
Subject: [PATCH 041/162] x86, mce: enable MCE_INTEL for 32bit new MCE

Enable the 64bit MCE_INTEL code (CMCI, thermal interrupts) for 32bit NEW_MCE.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                       | 2 +-
 arch/x86/include/asm/entry_arch.h      | 4 ++++
 arch/x86/include/asm/hardirq.h         | 2 +-
 arch/x86/include/asm/irq_vectors.h     | 5 +++--
 arch/x86/kernel/cpu/mcheck/threshold.c | 2 +-
 arch/x86/kernel/entry_64.S             | 2 +-
 arch/x86/kernel/irqinit_32.c           | 4 ++++
 arch/x86/kernel/traps.c                | 2 +-
 8 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c1c5ccd1937f..e1c9f77f69ec 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -808,7 +808,7 @@ config X86_NEW_MCE
 config X86_MCE_INTEL
 	def_bool y
 	prompt "Intel MCE features"
-	depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+	depends on X86_NEW_MCE && X86_LOCAL_APIC
 	---help---
 	   Additional support for intel specific MCE features such as
 	   the thermal monitor.
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 486c9e946f5c..b2eb9c066843 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -56,4 +56,8 @@ BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
+BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
+#endif
+
 #endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f980..922ee7c29693 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -20,7 +20,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_X86_MCE
 	unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
 	unsigned int irq_threshold_count;
 # endif
 #endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..451e24d18050 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,10 +87,11 @@
 #define CALL_FUNCTION_SINGLE_VECTOR	0xfb
 #define THERMAL_APIC_VECTOR		0xfa
 
+#define THRESHOLD_APIC_VECTOR		0xf9
+
 #ifdef CONFIG_X86_32
-/* 0xf8 - 0xf9 : free */
+/* 0xf9 : free */
 #else
-# define THRESHOLD_APIC_VECTOR		0xf9
 # define UV_BAU_MESSAGE			0xf8
 #endif
 
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
-asmlinkage void mce_threshold_interrupt(void)
+asmlinkage void smp_threshold_interrupt(void)
 {
 	exit_idle();
 	irq_enter();
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 63276c45bffa..a31a7f29cffe 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1007,7 +1007,7 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
 #endif
 
 apicinterrupt THRESHOLD_APIC_VECTOR \
-	threshold_interrupt mce_threshold_interrupt
+	threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 98846e03211e..2512ad93dabf 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -186,6 +186,10 @@ void __init native_init_IRQ(void)
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ad771f15bddd..0d358c884b35 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -804,7 +804,7 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
 {
 }
 
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 

From de5619dfef76ddb403eb7c6de39c0130166c5dc3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 23:34:40 +0200
Subject: [PATCH 042/162] x86, mce: enable MCE_AMD for 32bit NEW_MCE

That's very easy using the infrastructure enabled earlier for MCE_INTEL

Untested.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e1c9f77f69ec..a148e7ac0d83 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -816,7 +816,7 @@ config X86_MCE_INTEL
 config X86_MCE_AMD
 	def_bool y
 	prompt "AMD MCE features"
-	depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+	depends on X86_NEW_MCE && X86_LOCAL_APIC
 	---help---
 	   Additional support for AMD specific MCE features such as
 	   the DRAM Error Threshold.

From 172d899db4bf0beb7766d583379e5ed552130e4a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 23:37:02 +0200
Subject: [PATCH 043/162] x86, mce: document new 32bit mcelog requirement in
 Documentation/Changes

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/Changes | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/Changes b/Documentation/Changes
index b95082be4d5e..d21b3b5aa543 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -48,6 +48,7 @@ o  procps                 3.2.0                   # ps --version
 o  oprofile               0.9                     # oprofiled --version
 o  udev                   081                     # udevinfo -V
 o  grub                   0.93                    # grub --version
+o  mcelog		  0.6
 
 Kernel compilation
 ==================
@@ -276,6 +277,16 @@ before running exportfs or mountd.  It is recommended that all NFS
 services be protected from the internet-at-large by a firewall where
 that is possible.
 
+mcelog
+------
+
+In Linux 2.6.31+ the i386 kernel needs to run the mcelog utility
+as a regular cronjob similar to the x86-64 kernel to process and log
+machine check events when CONFIG_X86_NEW_MCE is enabled. Machine check
+events are errors reported by the CPU. Processing them is strongly encouraged.
+All x86-64 kernels since 2.6.4 require the mcelog utility to
+process machine checks.
+
 Getting updated software
 ========================
 
@@ -365,6 +376,10 @@ FUSE
 ----
 o <http://sourceforge.net/projects/fuse>
 
+mcelog
+------
+o <ftp://ftp.kernel.org/pub/linux/utils/cpu/mce/mcelog/>
+
 Networking
 **********
 

From a9862e0560866eadbc59b84867492004da436516 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 19 May 2009 22:49:07 +0200
Subject: [PATCH 044/162] Export add_timer_on for modules

Needed in followon patch.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..e2c47b82ac36 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -756,6 +756,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	wake_up_idle_cpu(cpu);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
+EXPORT_SYMBOL_GPL(add_timer_on);
 
 /**
  * del_timer - deactive a timer.

From 5f8c1a54cab6f449fe04d42d0661bc796fa4e73e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 29 Apr 2009 19:29:12 +0200
Subject: [PATCH 045/162] x86, mce: add MSR read wrappers for easier error
 injection

This will be used by future patches to allow machine check error injection.
Right now it's a nop, except for adding some wrappers around the MSR reads.

This is early in the sequence to avoid too many conflicts.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 37 +++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e193de44ef19..e755c95674dc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -194,6 +194,19 @@ static void mce_panic(char *msg, struct mce *backup, u64 start)
 	panic(msg);
 }
 
+/* MSR access wrappers used for error injection */
+static u64 mce_rdmsrl(u32 msr)
+{
+	u64 v;
+	rdmsrl(msr, v);
+	return v;
+}
+
+static void mce_wrmsrl(u32 msr, u64 v)
+{
+	wrmsrl(msr, v);
+}
+
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_disabled)
@@ -213,7 +226,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 	if (rip_msr) {
 		/* Assume the RIP in the MSR is exact. Is this true? */
 		m->mcgstatus |= MCG_STATUS_EIPV;
-		rdmsrl(rip_msr, m->ip);
+		m->ip = mce_rdmsrl(rip_msr);
 		m->cs = 0;
 	}
 }
@@ -231,7 +244,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	mce_setup(&m);
 
-	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	for (i = 0; i < banks; i++) {
 		if (!bank[i] || !test_bit(i, *b))
 			continue;
@@ -242,7 +255,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		m.tsc = 0;
 
 		barrier();
-		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
@@ -257,9 +270,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 			continue;
 
 		if (m.status & MCI_STATUS_MISCV)
-			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
 		if (m.status & MCI_STATUS_ADDRV)
-			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -275,7 +288,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		/*
 		 * Clear state for this bank.
 		 */
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 
 	/*
@@ -320,7 +333,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	mce_setup(&m);
 
-	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -338,7 +351,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		m.addr = 0;
 		m.bank = i;
 
-		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
@@ -378,9 +391,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
-			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
 		if (m.status & MCI_STATUS_ADDRV)
-			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 
 		mce_get_rip(&m, regs);
 		mce_log(&m);
@@ -449,9 +462,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	/* the last thing we do is clear state */
 	for (i = 0; i < banks; i++) {
 		if (test_bit(i, toclear))
-			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
-	wrmsrl(MSR_IA32_MCG_STATUS, 0);
+	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
 }

From ea149b36c7f511d17dd89fee734cb09778a91fa0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 29 Apr 2009 19:31:00 +0200
Subject: [PATCH 046/162] x86, mce: add basic error injection infrastructure

Allow user programs to write mce records into /dev/mcelog. When they do
that a fake machine check is triggered to test the machine check code.

This uses the MCE MSR wrappers added earlier.

The implementation is straight forward. There is a struct mce record
per CPU and the MCE MSR accesses get data from there if there is valid
data injected there. This allows to test the machine check code
relatively realistically because only the lowest layer of hardware
access is intercepted.

The test suite and injector are available at
git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                        |   8 ++
 arch/x86/include/asm/mce.h              |   3 +
 arch/x86/kernel/cpu/mcheck/Makefile     |   1 +
 arch/x86/kernel/cpu/mcheck/mce-inject.c | 126 ++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c        |  39 +++++++-
 5 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce-inject.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a148e7ac0d83..e25b6358fbe3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -835,6 +835,14 @@ config X86_MCE_THRESHOLD
 	bool
 	default y
 
+config X86_MCE_INJECT
+	depends on X86_NEW_MCE
+	tristate "Machine check injector support"
+	---help---
+	  Provide support for injecting machine checks for testing purposes.
+	  If you don't know what a machine check is and you don't do kernel
+	  QA it is safe to say n.
+
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
 	depends on X86_OLD_MCE
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c3c7ee701753..e7d2372301ef 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -141,6 +141,9 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 extern int mce_notify_user(void);
 
+DECLARE_PER_CPU(struct mce, injectm);
+extern struct file_operations mce_chrdev_ops;
+
 #ifdef CONFIG_X86_MCE
 extern void mcheck_init(struct cpuinfo_x86 *c);
 #else
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 5f8b09425d39..60ee182c6c52 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
 obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..58afac4b5df5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,126 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ */
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <asm/uaccess.h>
+#include <asm/mce.h>
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+	struct mce *i = &per_cpu(injectm, m->cpu);
+
+	/* Make sure noone reads partially written injectm */
+	i->finished = 0;
+	mb();
+	m->finished = 0;
+	/* First set the fields after finished */
+	i->cpu = m->cpu;
+	mb();
+	/* Now write record in order, finished last (except above) */
+	memcpy(i, m, sizeof(struct mce));
+	/* Finally activate it */
+	mb();
+	i->finished = 1;
+}
+
+struct delayed_mce {
+	struct timer_list timer;
+	struct mce m;
+};
+
+/* Inject mce on current CPU */
+static void raise_mce(unsigned long data)
+{
+	struct delayed_mce *dm = (struct delayed_mce *)data;
+	struct mce *m = &dm->m;
+	int cpu = m->cpu;
+
+	inject_mce(m);
+	if (m->status & MCI_STATUS_UC) {
+		struct pt_regs regs;
+		memset(&regs, 0, sizeof(struct pt_regs));
+		regs.ip = m->ip;
+		regs.cs = m->cs;
+		printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+		do_machine_check(&regs, 0);
+		printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+	} else {
+		mce_banks_t b;
+		memset(&b, 0xff, sizeof(mce_banks_t));
+		printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+		machine_check_poll(0, &b);
+		mce_notify_user();
+		printk(KERN_INFO "Finished machine check poll on CPU %d\n",
+		       cpu);
+	}
+	kfree(dm);
+}
+
+/* Error injection interface */
+static ssize_t mce_write(struct file *filp, const char __user *ubuf,
+			 size_t usize, loff_t *off)
+{
+	struct delayed_mce *dm;
+	struct mce m;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/*
+	 * There are some cases where real MSR reads could slip
+	 * through.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+		return -EIO;
+
+	if ((unsigned long)usize > sizeof(struct mce))
+		usize = sizeof(struct mce);
+	if (copy_from_user(&m, ubuf, usize))
+		return -EFAULT;
+
+	if (m.cpu >= NR_CPUS || !cpu_online(m.cpu))
+		return -EINVAL;
+
+	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
+	if (!dm)
+		return -ENOMEM;
+
+	/*
+	 * Need to give user space some time to set everything up,
+	 * so do it a jiffie or two later everywhere.
+	 * Should we use a hrtimer here for better synchronization?
+	 */
+	memcpy(&dm->m, &m, sizeof(struct mce));
+	setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
+	dm->timer.expires = jiffies + 2;
+	add_timer_on(&dm->timer, m.cpu);
+	return usize;
+}
+
+static int inject_init(void)
+{
+	printk(KERN_INFO "Machine check injector initialized\n");
+	mce_chrdev_ops.write = mce_write;
+	return 0;
+}
+
+module_init(inject_init);
+/* Cannot tolerate unloading currently because we cannot
+ * guarantee all openers of mce_chrdev will get a reference to us.
+ */
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e755c95674dc..fe216bd10f43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -98,6 +98,9 @@ void mce_setup(struct mce *m)
 	rdtscll(m->tsc);
 }
 
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -194,16 +197,46 @@ static void mce_panic(char *msg, struct mce *backup, u64 start)
 	panic(msg);
 }
 
+/* Support code for software error injection */
+
+static int msr_to_offset(u32 msr)
+{
+	unsigned bank = __get_cpu_var(injectm.bank);
+	if (msr == rip_msr)
+		return offsetof(struct mce, ip);
+	if (msr == MSR_IA32_MC0_STATUS + bank*4)
+		return offsetof(struct mce, status);
+	if (msr == MSR_IA32_MC0_ADDR + bank*4)
+		return offsetof(struct mce, addr);
+	if (msr == MSR_IA32_MC0_MISC + bank*4)
+		return offsetof(struct mce, misc);
+	if (msr == MSR_IA32_MCG_STATUS)
+		return offsetof(struct mce, mcgstatus);
+	return -1;
+}
+
 /* MSR access wrappers used for error injection */
 static u64 mce_rdmsrl(u32 msr)
 {
 	u64 v;
+	if (__get_cpu_var(injectm).finished) {
+		int offset = msr_to_offset(msr);
+		if (offset < 0)
+			return 0;
+		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
+	}
 	rdmsrl(msr, v);
 	return v;
 }
 
 static void mce_wrmsrl(u32 msr, u64 v)
 {
+	if (__get_cpu_var(injectm).finished) {
+		int offset = msr_to_offset(msr);
+		if (offset >= 0)
+			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
+		return;
+	}
 	wrmsrl(msr, v);
 }
 
@@ -296,6 +329,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	 * exceptions.
 	 */
 }
+EXPORT_SYMBOL_GPL(machine_check_poll);
 
 /*
  * The actual machine check handler. This only handles real
@@ -468,6 +502,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
  out2:
 	atomic_dec(&mce_entry);
 }
+EXPORT_SYMBOL_GPL(do_machine_check);
 
 #ifdef CONFIG_X86_MCE_INTEL
 /***
@@ -568,6 +603,7 @@ int mce_notify_user(void)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(mce_notify_user);
 
 /*
  * Initialize Machine Checks for a CPU.
@@ -904,13 +940,14 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 	}
 }
 
-static const struct file_operations mce_chrdev_ops = {
+struct file_operations mce_chrdev_ops = {
 	.open			= mce_open,
 	.release		= mce_release,
 	.read			= mce_read,
 	.poll			= mce_poll,
 	.unlocked_ioctl		= mce_ioctl,
 };
+EXPORT_SYMBOL_GPL(mce_chrdev_ops);
 
 static struct miscdevice mce_log_device = {
 	MISC_MCELOG_MINOR,

From a1ff41bfc1bb7a6d19cf958f89a9b539678781e5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 25 May 2009 22:16:14 -0700
Subject: [PATCH 047/162] x86, mce: add comment about mce_chrdev_ops being
 writable

Add a comment explaining that mce_chrdev_ops is intentionally
writable.

[ Impact: comment only ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index fe216bd10f43..156cdf6d9181 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -940,6 +940,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 	}
 }
 
+/* Modified in mce-inject.c, so not static or const */
 struct file_operations mce_chrdev_ops = {
 	.open			= mce_open,
 	.release		= mce_release,

From 5706001aacba5d3db5f224ca135e5e91a30be39c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 25 May 2009 22:18:17 -0700
Subject: [PATCH 048/162] x86, mce: fix comment style in mce-inject.c

Fix style of winged comment in mce-inject.c.

[ Impact: comment only ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce-inject.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 58afac4b5df5..673c72855022 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -120,7 +120,8 @@ static int inject_init(void)
 }
 
 module_init(inject_init);
-/* Cannot tolerate unloading currently because we cannot
+/*
+ * Cannot tolerate unloading currently because we cannot
  * guarantee all openers of mce_chrdev will get a reference to us.
  */
 MODULE_LICENSE("GPL");

From 88921be30296e126896ee4d30758f989d1c4ddfb Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:51 +0200
Subject: [PATCH 049/162] x86, mce: synchronize core after machine check
 handling

The example code in the IA32 SDM recommends to synchronize the CPU
after machine check handling. So do that here.

[ Impact: Spec compliance ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 156cdf6d9181..495c96808668 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -328,6 +328,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	 * Don't clear MCG_STATUS here because it's only defined for
 	 * exceptions.
 	 */
+
+	sync_core();
 }
 EXPORT_SYMBOL_GPL(machine_check_poll);
 
@@ -501,6 +503,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
+	sync_core();
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 

From b56f642d2bf8c1f7c6499c1e55b23311a33cc796 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:52 +0200
Subject: [PATCH 050/162] x86, mce: use extended sysattrs for the
 check_interval attribute.

Instead of using own callbacks use the generic ones provided by
the sysdev later.

This finally allows to get rid of the ugly ACCESSOR macros. Should
also save some text size.

[ Impact: cleanup ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 39 ++++++++++++--------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 495c96808668..3cac9da7ce24 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1054,28 +1054,6 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
-/* Why are there no generic functions for this? */
-#define ACCESSOR(name, var, start) \
-	static ssize_t show_ ## name(struct sys_device *s,		\
-				     struct sysdev_attribute *attr,	\
-				     char *buf) {			\
-		return sprintf(buf, "%Lx\n", (u64)var);			\
-	}								\
-	static ssize_t set_ ## name(struct sys_device *s,		\
-				    struct sysdev_attribute *attr,	\
-				    const char *buf, size_t siz) {	\
-		char *end;						\
-		u64 new = simple_strtoull(buf, &end, 0);		\
-									\
-		if (end == buf)						\
-			return -EINVAL;					\
-		var = new;						\
-		start;							\
-									\
-		return end-buf;						\
-	}								\
-	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-
 static struct sysdev_attribute *bank_attrs;
 
 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1126,13 +1104,26 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 	return len;
 }
 
+static ssize_t store_int_with_restart(struct sys_device *s,
+				      struct sysdev_attribute *attr,
+				      const char *buf, size_t size)
+{
+	ssize_t ret = sysdev_store_int(s, attr, buf, size);
+	mce_restart();
+	return ret;
+}
+
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 
-ACCESSOR(check_interval, check_interval, mce_restart())
+static struct sysdev_ext_attribute attr_check_interval = {
+	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
+		     store_int_with_restart),
+	&check_interval
+};
 
 static struct sysdev_attribute *mce_attrs[] = {
-	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
+	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
 	NULL
 };
 

From fc016a49c2d92f2efbe22c1fb66eb7a5d2a06ed1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:53 +0200
Subject: [PATCH 051/162] x86, mce: remove unused mce_events variable

Remove unused mce_events static variable.

[ Impact: cleanup ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3cac9da7ce24..69aad7e96a53 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -71,7 +71,6 @@ static u64			*bank;
 static unsigned long		notify_user;
 static int			rip_msr;
 static int			mce_bootlog = -1;
-static atomic_t			mce_events;
 
 static char			trigger[128];
 static char			*trigger_argv[2] = { trigger, NULL };
@@ -116,7 +115,6 @@ void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
 
-	atomic_inc(&mce_events);
 	mce->finished = 0;
 	wmb();
 	for (;;) {

From 8be9110569aec1f65d86b08aef7ec49659137bf9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 May 2009 21:56:53 +0200
Subject: [PATCH 052/162] x86, mce: remove mce_init unused argument

Remove unused mce_init argument.

[ Impact: cleanup ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 69aad7e96a53..20c7e7c669d2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -643,7 +643,7 @@ static int mce_cap_init(void)
 	return 0;
 }
 
-static void mce_init(void *dummy)
+static void mce_init(void)
 {
 	mce_banks_t all_banks;
 	u64 cap;
@@ -776,7 +776,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 
 	machine_check_vector = do_machine_check;
 
-	mce_init(NULL);
+	mce_init();
 	mce_cpu_features(c);
 	mce_init_timer();
 }
@@ -1020,7 +1020,7 @@ static int mce_shutdown(struct sys_device *dev)
  */
 static int mce_resume(struct sys_device *dev)
 {
-	mce_init(NULL);
+	mce_init();
 	mce_cpu_features(&current_cpu_data);
 
 	return 0;
@@ -1030,7 +1030,7 @@ static void mce_cpu_restart(void *data)
 {
 	del_timer_sync(&__get_cpu_var(mce_timer));
 	if (mce_available(&current_cpu_data))
-		mce_init(NULL);
+		mce_init();
 	mce_init_timer();
 }
 

From 32561696c23028596f24b353d98f2e23b58f91f7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:53 +0200
Subject: [PATCH 053/162] x86, mce: rename and align out2 label

There's only a single out path in do_machine_check now, so rename the
label from out2 to out.  Also align it at the first column.

[ Impact: minor cleanup, no functional changes ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 20c7e7c669d2..18d505d80221 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -361,9 +361,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-		goto out2;
+		goto out;
 	if (!banks)
-		goto out2;
+		goto out;
 
 	mce_setup(&m);
 
@@ -499,7 +499,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
- out2:
+out:
 	atomic_dec(&mce_entry);
 	sync_core();
 }

From b170204ddb7844ffff62d2d537b20c0eeb97725e Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:54 +0200
Subject: [PATCH 054/162] x86, mce: drop BKL in mce_open

BKL is not needed for anything in mce_open because it has
an own spinlock. Remove it.

[ Impact: cleanup ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18d505d80221..8ab28368bb92 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -13,7 +13,6 @@
 #include <linux/ratelimit.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
-#include <linux/smp_lock.h>
 #include <linux/kobject.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
@@ -791,12 +790,10 @@ static int		open_exclu;		/* already open exclusive? */
 
 static int mce_open(struct inode *inode, struct file *file)
 {
-	lock_kernel();
 	spin_lock(&mce_state_lock);
 
 	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 		spin_unlock(&mce_state_lock);
-		unlock_kernel();
 
 		return -EBUSY;
 	}
@@ -806,7 +803,6 @@ static int mce_open(struct inode *inode, struct file *file)
 	open_count++;
 
 	spin_unlock(&mce_state_lock);
-	unlock_kernel();
 
 	return nonseekable_open(inode, file);
 }

From 8780e8e0f6b34862cdf2c62d4d2674d6bc3207db Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:56 +0200
Subject: [PATCH 055/162] x86, mce: improve documentation

Document that check_interval set to 0 means no polling.
Noticed by Hidetoshi Seto

Also add a reference from boot options to the sysfs tunables

Acked-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/x86/x86_64/boot-options.txt | 2 ++
 Documentation/x86/x86_64/machinecheck     | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c13040a718..63fca718256e 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -5,6 +5,8 @@ only the AMD64 specific ones are listed here.
 
 Machine check
 
+   Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables.
+
    mce=off disable machine check
    mce=bootlog Enable logging of machine checks left over from booting.
                Disabled by default on AMD because some BIOS leave bogus ones.
diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck
index a05e58e7b159..a4fdb25446e0 100644
--- a/Documentation/x86/x86_64/machinecheck
+++ b/Documentation/x86/x86_64/machinecheck
@@ -41,7 +41,9 @@ check_interval
 	the polling interval.  When the poller stops finding MCEs, it
 	triggers an exponential backoff (poll less often) on the polling
 	interval. The check_interval variable is both the initial and
-	maximum polling interval.
+	maximum polling interval. 0 means no polling for corrected machine
+	check errors (but some corrected errors might be still reported
+	in other ways)
 
 tolerant
 	Tolerance level. When a machine check exception occurs for a non

From 9319cec8c185e84fc5281afb6ac5d4c47a234841 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Tue, 14 Apr 2009 17:26:30 +0900
Subject: [PATCH 056/162] x86, mce: use strict_strtoull

Use strict_strtoull instead of simple_strtoull.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c        |  9 ++++-----
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 16 ++++++----------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8ab28368bb92..4375ffb5459f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1059,18 +1059,17 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
 }
 
 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
-			const char *buf, size_t siz)
+			const char *buf, size_t size)
 {
-	char *end;
-	u64 new = simple_strtoull(buf, &end, 0);
+	u64 new;
 
-	if (end == buf)
+	if (strict_strtoull(buf, 0, &new) < 0)
 		return -EINVAL;
 
 	bank[attr - bank_attrs] = new;
 	mce_restart();
 
-	return end-buf;
+	return size;
 }
 
 static ssize_t
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 083f270251fa..0c563432e25c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -269,14 +269,12 @@ SHOW_FIELDS(interrupt_enable)
 SHOW_FIELDS(threshold_limit)
 
 static ssize_t
-store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count)
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
 {
 	struct thresh_restart tr;
 	unsigned long new;
-	char *end;
 
-	new = simple_strtoul(buf, &end, 0);
-	if (end == buf)
+	if (strict_strtoul(buf, 0, &new) < 0)
 		return -EINVAL;
 
 	b->interrupt_enable = !!new;
@@ -287,18 +285,16 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count)
 
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
-	return end - buf;
+	return size;
 }
 
 static ssize_t
-store_threshold_limit(struct threshold_block *b, const char *buf, size_t count)
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
 {
 	struct thresh_restart tr;
 	unsigned long new;
-	char *end;
 
-	new = simple_strtoul(buf, &end, 0);
-	if (end == buf)
+	if (strict_strtoul(buf, 0, &new) < 0)
 		return -EINVAL;
 
 	if (new > THRESHOLD_MAX)
@@ -313,7 +309,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t count)
 
 	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
-	return end - buf;
+	return size;
 }
 
 struct threshold_block_cross_cpu {

From cc3aec52ab8e013984270a79d1aa51f691d239b0 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 30 Apr 2009 15:58:22 +0900
Subject: [PATCH 057/162] x86, mce: trivial clean up for therm_throt.c

This patch removes following checkpatch warning:

WARNING: Use #include <linux/cpu.h> instead of <asm/cpu.h>
+#include <asm/cpu.h>

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index a2b5d7ddb19f..7b1ae2e20ba5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -20,7 +20,6 @@
 #include <linux/cpu.h>
 
 #include <asm/therm_throt.h>
-#include <asm/cpu.h>
 
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)

From 14a02530e2239f753a0f3f089847e723adbdaa47 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 30 Apr 2009 16:04:51 +0900
Subject: [PATCH 058/162] x86, mce: trivial clean up for mce.c

This fixs following checkpatch warnings:

WARNING: Use #include <linux/uaccess.h> instead of <asm/uaccess.h>
+#include <asm/uaccess.h>

WARNING: Use #include <linux/smp.h> instead of <asm/smp.h>
+#include <asm/smp.h>

WARNING: line over 80 characters
+                               set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);

WARNING: braces {} are not necessary for any arm of this statement
+       if (mce_notify_user()) {
[...]
+       } else {
[...]

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4375ffb5459f..1d0aa9c4e15b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -14,6 +14,7 @@
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
 #include <linux/kobject.h>
+#include <linux/uaccess.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
 #include <linux/percpu.h>
@@ -27,14 +28,13 @@
 #include <linux/kmod.h>
 #include <linux/poll.h>
 #include <linux/cpu.h>
+#include <linux/smp.h>
 #include <linux/fs.h>
 
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 #include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
-#include <asm/smp.h>
 
 #include "mce.h"
 
@@ -125,7 +125,8 @@ void mce_log(struct mce *mce)
 			 * interesting ones:
 			 */
 			if (entry >= MCE_LOG_LEN) {
-				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+				set_bit(MCE_OVERFLOW,
+					(unsigned long *)&mcelog.flags);
 				return;
 			}
 			/* Old left over entry. Skip: */
@@ -556,11 +557,10 @@ static void mcheck_timer(unsigned long data)
 	 * polling interval, otherwise increase the polling interval.
 	 */
 	n = &__get_cpu_var(next_interval);
-	if (mce_notify_user()) {
+	if (mce_notify_user())
 		*n = max(*n/2, HZ/100);
-	} else {
+	else
 		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
-	}
 
 	t->expires = jiffies + *n;
 	add_timer(t);

From 34fa1967aa0827776e37feb5666df0327575a0f2 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 8 Apr 2009 12:31:18 +0200
Subject: [PATCH 059/162] x86, mce: trivial clean up for mce_amd_64.c

Fix for followings:

WARNING: Use #include <linux/percpu.h> instead of <asm/percpu.h>
+#include <asm/percpu.h>

ERROR: Macros with multiple statements should be enclosed in a do - while
loop
+#define THRESHOLD_ATTR(_name, _mode, _show, _store)                    \
+{                                                                      \
+       .attr   = {.name = __stringify(_name), .mode = _mode },         \
+       .show   = _show,                                                \
+       .store  = _store,                                               \
+};

WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(),
num_possible_cpus(), for_each_possible_cpu(), etc
+       if (cpu >= NR_CPUS)

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 0c563432e25c..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 #include <linux/kobject.h>
+#include <linux/percpu.h>
 #include <linux/sysdev.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -24,7 +25,6 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 
-#include <asm/percpu.h>
 #include <asm/apic.h>
 #include <asm/idle.h>
 #include <asm/mce.h>
@@ -344,17 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
 	return 1;
 }
 
-#define THRESHOLD_ATTR(_name, _mode, _show, _store)			\
-{									\
-	.attr	= {.name = __stringify(_name), .mode = _mode },		\
-	.show	= _show,						\
-	.store	= _store,						\
+#define RW_ATTR(val)							\
+static struct threshold_attr val = {					\
+	.attr	= {.name = __stringify(val), .mode = 0644 },		\
+	.show	= show_## val,						\
+	.store	= store_## val,						\
 };
 
-#define RW_ATTR(name)							\
-static struct threshold_attr name =					\
-	THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
-
 RW_ATTR(interrupt_enable);
 RW_ATTR(threshold_limit);
 RW_ATTR(error_count);
@@ -675,9 +671,6 @@ static void threshold_remove_device(unsigned int cpu)
 static void __cpuinit
 amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
 {
-	if (cpu >= NR_CPUS)
-		return;
-
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:

From 61a021a0700c22ee527d73d92f9acb109ff478f8 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Tue, 14 Apr 2009 17:09:04 +0900
Subject: [PATCH 060/162] x86, mce: trivial clean up for mce_intel_64.c

Fix for:

WARNING: space prohibited between function name and open parenthesis '('
+       for_each_online_cpu (cpu) {

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 13abafcb72e4..eff3740501a3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -196,7 +196,7 @@ void cmci_rediscover(int dying)
 		return;
 	cpumask_copy(old, &current->cpus_allowed);
 
-	for_each_online_cpu (cpu) {
+	for_each_online_cpu(cpu) {
 		if (cpu == dying)
 			continue;
 		if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))

From 98a9c8c3ba13dfc3df8e6d2a126d2fa4e4621e9c Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 28 May 2009 11:41:01 +0900
Subject: [PATCH 061/162] x86, mce: trivial clean up for mce-inject.c

Fix for:

WARNING: Use #include <linux/uaccess.h> instead of <asm/uaccess.h>
+#include <asm/uaccess.h>

WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc
+       if (m.cpu >= NR_CPUS || !cpu_online(m.cpu))

ERROR: trailing whitespace
+/* $

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 673c72855022..7b3a5428396a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -11,13 +11,13 @@
  * Andi Kleen
  * Ying Huang
  */
+#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/smp.h>
-#include <asm/uaccess.h>
 #include <asm/mce.h>
 
 /* Update fake mce registers on current CPU. */
@@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
 	if (copy_from_user(&m, ubuf, usize))
 		return -EFAULT;
 
-	if (m.cpu >= NR_CPUS || !cpu_online(m.cpu))
+	if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu))
 		return -EINVAL;
 
 	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);

From eb2a6ab729ac40a553797703a5a5dba3a74de004 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 28 Apr 2009 23:32:56 +0200
Subject: [PATCH 062/162] x86: trivial clean up for irq_vectors.h

Fix a wrong comment.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/irq_vectors.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 451e24d18050..233006c4e365 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -86,11 +86,10 @@
 #define CALL_FUNCTION_VECTOR		0xfc
 #define CALL_FUNCTION_SINGLE_VECTOR	0xfb
 #define THERMAL_APIC_VECTOR		0xfa
-
 #define THRESHOLD_APIC_VECTOR		0xf9
 
 #ifdef CONFIG_X86_32
-/* 0xf9 : free */
+/* 0xf8 : free */
 #else
 # define UV_BAU_MESSAGE			0xf8
 #endif

From cd13adcc823aa421efa4efd995fa7004a58cf38d Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 27 May 2009 16:57:31 +0900
Subject: [PATCH 063/162] x86: trivial clean up for arch/x86/Kconfig

Use tab.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e25b6358fbe3..8c0fff0860bd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -822,13 +822,13 @@ config X86_MCE_AMD
 	   the DRAM Error Threshold.
 
 config X86_ANCIENT_MCE
-       def_bool n
-       depends on X86_32
-       prompt "Support for old Pentium 5 / WinChip machine checks"
-       ---help---
-	 Include support for machine check handling on old Pentium 5 or WinChip
-	 systems. These typically need to be enabled explicitely on the command
-	 line.
+	def_bool n
+	depends on X86_32
+	prompt "Support for old Pentium 5 / WinChip machine checks"
+	---help---
+	  Include support for machine check handling on old Pentium 5 or WinChip
+	  systems. These typically need to be enabled explicitely on the command
+	  line.
 
 config X86_MCE_THRESHOLD
 	depends on X86_MCE_AMD || X86_MCE_INTEL

From 38736072d45488fd45f076388b6570419bbbc682 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 28 May 2009 10:05:33 -0700
Subject: [PATCH 064/162] x86, mce: drop "extern" from function prototypes in
 asm/mce.h

Function prototypes don't need to be prefixed by "extern".

[ Impact: cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index e7d2372301ef..ac6e0303bf21 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -121,13 +121,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
-extern int mce_available(struct cpuinfo_x86 *c);
+int mce_available(struct cpuinfo_x86 *c);
 
 void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
 
-extern void do_machine_check(struct pt_regs *, long);
+void do_machine_check(struct pt_regs *, long);
 
 typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
@@ -137,15 +137,15 @@ enum mcp_flags {
 	MCP_UC = (1 << 1),		/* log uncorrected errors */
 	MCP_DONTLOG = (1 << 2),		/* only clear, don't log */
 };
-extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
-extern int mce_notify_user(void);
+int mce_notify_user(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 extern struct file_operations mce_chrdev_ops;
 
 #ifdef CONFIG_X86_MCE
-extern void mcheck_init(struct cpuinfo_x86 *c);
+void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif

From 4156e735d3abde8e9243b5d22f7999dd3fffab2e Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Mon, 1 Jun 2009 13:13:24 -0500
Subject: [PATCH 065/162] xfs: prevent deadlock in xfs_qm_shake()

It's possible to recurse into filesystem from the memory
allocation, which deadlocks in xfs_qm_shake(). Add check
for __GFP_FS, and bail out if it is not set.

Signed-off-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Hedi Berriche <hedi@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/kmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index af6843c7ee4b..179cbd630f69 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
 static inline int
 kmem_shake_allow(gfp_t gfp_mask)
 {
-	return (gfp_mask & __GFP_WAIT) != 0;
+	return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
 }
 
 #endif /* __XFS_SUPPORT_KMEM_H__ */

From 376bacb0a26bca722981d1610ffb76f951572bb1 Mon Sep 17 00:00:00 2001
From: Frank Seidel <frank@f-seidel.de>
Date: Sun, 29 Mar 2009 15:18:39 +0800
Subject: [PATCH 066/162] crypto: tcrypt - Reduce stack size

Applying kernel janitors todos (printk calls need KERN_*
constants on linebeginnings, reduce stack footprint where
possible) to tcrypts test_hash_speed (where stacks
memory footprint was very high (on i386 1184 bytes to
160 now).

Signed-off-by: Frank Seidel <frank@f-seidel.de>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index c3c9124209a1..50d1e35bbff4 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -396,16 +396,16 @@ static void test_hash_speed(const char *algo, unsigned int sec,
 	struct scatterlist sg[TVMEMSIZE];
 	struct crypto_hash *tfm;
 	struct hash_desc desc;
-	char output[1024];
+	static char output[1024];
 	int i;
 	int ret;
 
-	printk("\ntesting speed of %s\n", algo);
+	printk(KERN_INFO "\ntesting speed of %s\n", algo);
 
 	tfm = crypto_alloc_hash(algo, 0, CRYPTO_ALG_ASYNC);
 
 	if (IS_ERR(tfm)) {
-		printk("failed to load transform for %s: %ld\n", algo,
+		printk(KERN_ERR "failed to load transform for %s: %ld\n", algo,
 		       PTR_ERR(tfm));
 		return;
 	}
@@ -414,7 +414,7 @@ static void test_hash_speed(const char *algo, unsigned int sec,
 	desc.flags = 0;
 
 	if (crypto_hash_digestsize(tfm) > sizeof(output)) {
-		printk("digestsize(%u) > outputbuffer(%zu)\n",
+		printk(KERN_ERR "digestsize(%u) > outputbuffer(%zu)\n",
 		       crypto_hash_digestsize(tfm), sizeof(output));
 		goto out;
 	}
@@ -427,12 +427,14 @@ static void test_hash_speed(const char *algo, unsigned int sec,
 
 	for (i = 0; speed[i].blen != 0; i++) {
 		if (speed[i].blen > TVMEMSIZE * PAGE_SIZE) {
-			printk("template (%u) too big for tvmem (%lu)\n",
+			printk(KERN_ERR
+			       "template (%u) too big for tvmem (%lu)\n",
 			       speed[i].blen, TVMEMSIZE * PAGE_SIZE);
 			goto out;
 		}
 
-		printk("test%3u (%5u byte blocks,%5u bytes per update,%4u updates): ",
+		printk(KERN_INFO "test%3u "
+		       "(%5u byte blocks,%5u bytes per update,%4u updates): ",
 		       i, speed[i].blen, speed[i].plen, speed[i].blen / speed[i].plen);
 
 		if (sec)
@@ -443,7 +445,7 @@ static void test_hash_speed(const char *algo, unsigned int sec,
 					       speed[i].plen, output);
 
 		if (ret) {
-			printk("hashing failed ret=%d\n", ret);
+			printk(KERN_ERR "hashing failed ret=%d\n", ret);
 			break;
 		}
 	}

From 811d8f062668077e268a7292202bb923fe2ae896 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Sun, 29 Mar 2009 15:20:48 +0800
Subject: [PATCH 067/162] crypto: api - Use kzfree

Use kzfree() instead of memset() + kfree().

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/crypto/api.c b/crypto/api.c
index fd2545decb28..f500fb840be9 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -580,20 +580,17 @@ EXPORT_SYMBOL_GPL(crypto_alloc_tfm);
 void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
 {
 	struct crypto_alg *alg;
-	int size;
 
 	if (unlikely(!mem))
 		return;
 
 	alg = tfm->__crt_alg;
-	size = ksize(mem);
 
 	if (!tfm->exit && alg->cra_exit)
 		alg->cra_exit(tfm);
 	crypto_exit_ops(tfm);
 	crypto_mod_put(alg);
-	memset(mem, 0, size);
-	kfree(mem);
+	kzfree(mem);
 }
 EXPORT_SYMBOL_GPL(crypto_destroy_tfm);
 

From 505fd21d6138545aa5e96aa738975e6a9deb98a9 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Sun, 29 Mar 2009 15:33:53 +0800
Subject: [PATCH 068/162] crypto: cryptd - Use nivcipher in
 cryptd_alloc_ablkcipher

Use crypto_alloc_base() instead of crypto_alloc_ablkcipher() to
allocate underlying tfm in cryptd_alloc_ablkcipher. Because
crypto_alloc_ablkcipher() prefer GENIV encapsulated crypto instead of
raw one, while cryptd_alloc_ablkcipher needed the raw one.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/cryptd.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index d14b22658d7a..ae5fa99d5d36 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -586,20 +586,24 @@ struct cryptd_ablkcipher *cryptd_alloc_ablkcipher(const char *alg_name,
 						  u32 type, u32 mask)
 {
 	char cryptd_alg_name[CRYPTO_MAX_ALG_NAME];
-	struct crypto_ablkcipher *tfm;
+	struct crypto_tfm *tfm;
 
 	if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME,
 		     "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME)
 		return ERR_PTR(-EINVAL);
-	tfm = crypto_alloc_ablkcipher(cryptd_alg_name, type, mask);
+	type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+	type |= CRYPTO_ALG_TYPE_BLKCIPHER;
+	mask &= ~CRYPTO_ALG_TYPE_MASK;
+	mask |= (CRYPTO_ALG_GENIV | CRYPTO_ALG_TYPE_BLKCIPHER_MASK);
+	tfm = crypto_alloc_base(cryptd_alg_name, type, mask);
 	if (IS_ERR(tfm))
 		return ERR_CAST(tfm);
-	if (crypto_ablkcipher_tfm(tfm)->__crt_alg->cra_module != THIS_MODULE) {
-		crypto_free_ablkcipher(tfm);
+	if (tfm->__crt_alg->cra_module != THIS_MODULE) {
+		crypto_free_tfm(tfm);
 		return ERR_PTR(-EINVAL);
 	}
 
-	return __cryptd_ablkcipher_cast(tfm);
+	return __cryptd_ablkcipher_cast(__crypto_ablkcipher_cast(tfm));
 }
 EXPORT_SYMBOL_GPL(cryptd_alloc_ablkcipher);
 

From 150c7e85526e80474b87004f4b420e8834fdeb43 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Sun, 29 Mar 2009 15:39:02 +0800
Subject: [PATCH 069/162] crypto: fpu - Add template for blkcipher touching FPU

Blkcipher touching FPU need to be enclosed by kernel_fpu_begin() and
kernel_fpu_end(). If they are invoked in cipher algorithm
implementation, they will be invoked for each block, so that
performance will be hurt, because they are "slow" operations. This
patch implements "fpu" template, which makes these operations to be
invoked for each request.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile |   2 +
 arch/x86/crypto/fpu.c    | 166 +++++++++++++++++++++++++++++++++++++++
 crypto/Kconfig           |   5 ++
 3 files changed, 173 insertions(+)
 create mode 100644 arch/x86/crypto/fpu.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b4..cfb0010fa940 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
 # Arch-specific CryptoAPI modules.
 #
 
+obj-$(CONFIG_CRYPTO_FPU) += fpu.o
+
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 000000000000..5f9781a3815f
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
+/*
+ * FPU: Wrapper for blkcipher touching fpu
+ *
+ * Copyright (c) Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/i387.h>
+
+struct crypto_fpu_ctx {
+	struct crypto_blkcipher *child;
+};
+
+static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
+			     unsigned int keylen)
+{
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
+	struct crypto_blkcipher *child = ctx->child;
+	int err;
+
+	crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
+				   CRYPTO_TFM_REQ_MASK);
+	err = crypto_blkcipher_setkey(child, key, keylen);
+	crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
+				     CRYPTO_TFM_RES_MASK);
+	return err;
+}
+
+static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
+			      struct scatterlist *dst, struct scatterlist *src,
+			      unsigned int nbytes)
+{
+	int err;
+	struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+	struct crypto_blkcipher *child = ctx->child;
+	struct blkcipher_desc desc = {
+		.tfm = child,
+		.info = desc_in->info,
+		.flags = desc_in->flags,
+	};
+
+	kernel_fpu_begin();
+	err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
+	kernel_fpu_end();
+	return err;
+}
+
+static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
+			      struct scatterlist *dst, struct scatterlist *src,
+			      unsigned int nbytes)
+{
+	int err;
+	struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+	struct crypto_blkcipher *child = ctx->child;
+	struct blkcipher_desc desc = {
+		.tfm = child,
+		.info = desc_in->info,
+		.flags = desc_in->flags,
+	};
+
+	kernel_fpu_begin();
+	err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
+	kernel_fpu_end();
+	return err;
+}
+
+static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_blkcipher *cipher;
+
+	cipher = crypto_spawn_blkcipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+	return 0;
+}
+
+static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_blkcipher(ctx->child);
+}
+
+static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
+{
+	struct crypto_instance *inst;
+	struct crypto_alg *alg;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
+	if (err)
+		return ERR_PTR(err);
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return ERR_CAST(alg);
+
+	inst = crypto_alloc_instance("fpu", alg);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	inst->alg.cra_flags = alg->cra_flags;
+	inst->alg.cra_priority = alg->cra_priority;
+	inst->alg.cra_blocksize = alg->cra_blocksize;
+	inst->alg.cra_alignmask = alg->cra_alignmask;
+	inst->alg.cra_type = alg->cra_type;
+	inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
+	inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
+	inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
+	inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
+	inst->alg.cra_init = crypto_fpu_init_tfm;
+	inst->alg.cra_exit = crypto_fpu_exit_tfm;
+	inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
+	inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
+	inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return inst;
+}
+
+static void crypto_fpu_free(struct crypto_instance *inst)
+{
+	crypto_drop_spawn(crypto_instance_ctx(inst));
+	kfree(inst);
+}
+
+static struct crypto_template crypto_fpu_tmpl = {
+	.name = "fpu",
+	.alloc = crypto_fpu_alloc,
+	.free = crypto_fpu_free,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_fpu_module_init(void)
+{
+	return crypto_register_template(&crypto_fpu_tmpl);
+}
+
+static void __exit crypto_fpu_module_exit(void)
+{
+	crypto_unregister_template(&crypto_fpu_tmpl);
+}
+
+module_init(crypto_fpu_module_init);
+module_exit(crypto_fpu_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 74d0e622a515..66ff22a36ed9 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -241,6 +241,11 @@ config CRYPTO_XTS
 	  key size 256, 384 or 512 bits. This implementation currently
 	  can't handle a sectorsize which is not a multiple of 16 bytes.
 
+config CRYPTO_FPU
+	tristate
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_MANAGER
+
 comment "Hash modes"
 
 config CRYPTO_HMAC

From 2cf4ac8beb9dc50a315a6155b7b70e754d511958 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Sun, 29 Mar 2009 15:41:20 +0800
Subject: [PATCH 070/162] crypto: aes-ni - Add support for more modes

Because kernel_fpu_begin() and kernel_fpu_end() operations are too
slow, the performance gain of general mode implementation + aes-aesni
is almost all compensated.

The AES-NI support for more modes are implemented as follow:

- Add a new AES algorithm implementation named __aes-aesni without
  kernel_fpu_begin/end()

- Use fpu(<mode>(AES)) to provide kenrel_fpu_begin/end() invoking

- Add <mode>(AES) ablkcipher, which uses cryptd(fpu(<mode>(AES))) to
  defer cryption to cryptd context in soft_irq context.

Now the ctr, lrw, pcbc and xts support are added.

Performance testing based on dm-crypt shows that cryption time can be
reduced to 50% of general mode implementation + aes-aesni implementation.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 267 ++++++++++++++++++++++++++++-
 crypto/Kconfig                     |   5 +
 2 files changed, 271 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af65497..4e663398f77f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
 #include <asm/i387.h>
 #include <asm/aes.h>
 
+#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
+#define HAS_CTR
+#endif
+
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
+
+#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
+#define HAS_PCBC
+#endif
+
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
 	}
 };
 
+static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	aesni_enc(ctx, dst, src);
+}
+
+static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	aesni_dec(ctx, dst, src);
+}
+
+static struct crypto_alg __aesni_alg = {
+	.cra_name		= "__aes-aesni",
+	.cra_driver_name	= "__driver-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__aesni_alg.cra_list),
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= __aes_encrypt,
+			.cia_decrypt		= __aes_decrypt
+		}
+	}
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc,
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes)
@@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
 	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
 
-	return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len);
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
 }
 
 static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
+#ifdef HAS_CTR
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_init		= ablk_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+			.geniv		= "chainiv",
+		},
+	},
+};
+#endif
+
+#ifdef HAS_LRW
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+	.cra_name		= "lrw(aes)",
+	.cra_driver_name	= "lrw-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+	.cra_init		= ablk_lrw_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+#endif
+
+#ifdef HAS_PCBC
+static int ablk_pcbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_pcbc_alg = {
+	.cra_name		= "pcbc(aes)",
+	.cra_driver_name	= "pcbc-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
+	.cra_init		= ablk_pcbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+#endif
+
+#ifdef HAS_XTS
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
+					     0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+	.cra_name		= "xts(aes)",
+	.cra_driver_name	= "xts-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+	.cra_init		= ablk_xts_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+			.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+#endif
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -421,6 +637,8 @@ static int __init aesni_init(void)
 	}
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
+	if ((err = crypto_register_alg(&__aesni_alg)))
+		goto __aes_err;
 	if ((err = crypto_register_alg(&blk_ecb_alg)))
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +647,41 @@ static int __init aesni_init(void)
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
+#ifdef HAS_CTR
+	if ((err = crypto_register_alg(&ablk_ctr_alg)))
+		goto ablk_ctr_err;
+#endif
+#ifdef HAS_LRW
+	if ((err = crypto_register_alg(&ablk_lrw_alg)))
+		goto ablk_lrw_err;
+#endif
+#ifdef HAS_PCBC
+	if ((err = crypto_register_alg(&ablk_pcbc_alg)))
+		goto ablk_pcbc_err;
+#endif
+#ifdef HAS_XTS
+	if ((err = crypto_register_alg(&ablk_xts_alg)))
+		goto ablk_xts_err;
+#endif
 
 	return err;
 
+#ifdef HAS_XTS
+ablk_xts_err:
+#endif
+#ifdef HAS_PCBC
+	crypto_unregister_alg(&ablk_pcbc_alg);
+ablk_pcbc_err:
+#endif
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+ablk_lrw_err:
+#endif
+#ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_ctr_alg);
+ablk_ctr_err:
+#endif
+	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
@@ -439,6 +689,8 @@ ablk_ecb_err:
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
 blk_ecb_err:
+	crypto_unregister_alg(&__aesni_alg);
+__aes_err:
 	crypto_unregister_alg(&aesni_alg);
 aes_err:
 	return err;
@@ -446,10 +698,23 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+#endif
+#ifdef HAS_PCBC
+	crypto_unregister_alg(&ablk_pcbc_alg);
+#endif
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+#endif
+#ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_ctr_alg);
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
+	crypto_unregister_alg(&__aesni_alg);
 	crypto_unregister_alg(&aesni_alg);
 }
 
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 66ff22a36ed9..4dfdd03e708f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -491,6 +491,7 @@ config CRYPTO_AES_NI_INTEL
 	select CRYPTO_AES_X86_64
 	select CRYPTO_CRYPTD
 	select CRYPTO_ALGAPI
+	select CRYPTO_FPU
 	help
 	  Use Intel AES-NI instructions for AES algorithm.
 
@@ -510,6 +511,10 @@ config CRYPTO_AES_NI_INTEL
 
 	  See <http://csrc.nist.gov/encryption/aes/> for more information.
 
+	  In addition to AES cipher algorithm support, the
+	  acceleration for some popular block cipher mode is supported
+	  too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+
 config CRYPTO_ANUBIS
 	tristate "Anubis cipher algorithm"
 	select CRYPTO_ALGAPI

From c79cf91006f03adb603879013b6710b6062c8445 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Sun, 29 Mar 2009 15:44:19 +0800
Subject: [PATCH 071/162] crypto: testmgr - Kill test_comp() sparse warnings

make C=1:
| crypto/testmgr.c:846:45: warning: incorrect type in argument 5 (different signedness)
| crypto/testmgr.c:846:45:    expected unsigned int *dlen
| crypto/testmgr.c:846:45:    got int *<noident>
| crypto/testmgr.c:878:47: warning: incorrect type in argument 5 (different signedness)
| crypto/testmgr.c:878:47:    expected unsigned int *dlen
| crypto/testmgr.c:878:47:    got int *<noident>

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index b50c3c6b17a2..bfee6e9f642d 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -837,7 +837,8 @@ static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate,
 	int ret;
 
 	for (i = 0; i < ctcount; i++) {
-		int ilen, dlen = COMP_BUF_SIZE;
+		int ilen;
+		unsigned int dlen = COMP_BUF_SIZE;
 
 		memset(result, 0, sizeof (result));
 
@@ -869,7 +870,8 @@ static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate,
 	}
 
 	for (i = 0; i < dtcount; i++) {
-		int ilen, dlen = COMP_BUF_SIZE;
+		int ilen;
+		unsigned int dlen = COMP_BUF_SIZE;
 
 		memset(result, 0, sizeof (result));
 

From 2f6ceb7933f52f238519a9c2f65c628eecdac962 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Sun, 29 Mar 2009 15:45:30 +0800
Subject: [PATCH 072/162] crypto: pcomp - pcompress.c should include
 crypto/internal/compress.h

make C=1:
| crypto/pcompress.c:77:5: warning: symbol 'crypto_register_pcomp' was not declared. Should it be static?
| crypto/pcompress.c:89:5: warning: symbol 'crypto_unregister_pcomp' was not declared. Should it be static?

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcompress.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crypto/pcompress.c b/crypto/pcompress.c
index ca9a4af91efe..bcadc03726b7 100644
--- a/crypto/pcompress.c
+++ b/crypto/pcompress.c
@@ -26,6 +26,7 @@
 #include <linux/string.h>
 
 #include <crypto/compress.h>
+#include <crypto/internal/compress.h>
 
 #include "internal.h"
 

From 9f171adc192fc3c8ffbb691cfdcc70259d75c6ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Sun, 29 Mar 2009 15:47:06 +0800
Subject: [PATCH 073/162] hwrng: omap - Move probe function to .devinit.text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A pointer to omap_rng_probe is passed to the core via
platform_driver_register and so the function must not disappear when the
.init sections are discarded.  Otherwise (if also having HOTPLUG=y)
unbinding and binding a device to the driver via sysfs will result in an
oops as does a device being registered late.

An alternative to this patch is using platform_driver_probe instead of
platform_driver_register plus removing the pointer to the probe function
from the struct platform_driver.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Jan Engelhardt <jengelh@gmx.de>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/omap-rng.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c
index 538313f9e7ac..00dd3de1be51 100644
--- a/drivers/char/hw_random/omap-rng.c
+++ b/drivers/char/hw_random/omap-rng.c
@@ -89,7 +89,7 @@ static struct hwrng omap_rng_ops = {
 	.data_read	= omap_rng_data_read,
 };
 
-static int __init omap_rng_probe(struct platform_device *pdev)
+static int __devinit omap_rng_probe(struct platform_device *pdev)
 {
 	struct resource *res, *mem;
 	int ret;

From 56af8cd44b05bd9649103b76a6e1e575682990e4 Mon Sep 17 00:00:00 2001
From: Lee Nipper <lee.nipper@gmail.com>
Date: Sun, 29 Mar 2009 15:50:50 +0800
Subject: [PATCH 074/162] crypto: talitos - scaffolding for new algorithm types

This patch is preparation for adding new algorithm types.

Some elements which are AEAD specific were renamed.
The algorithm template structure was changed to
use crypto_alg, and talitos_alg_alloc was made
more general with respect to algorithm types.
ipsec_esp_edesc is renamed to talitos_edesc
to use it in the upcoming ablkcipher routines.

Signed-off-by: Lee Nipper <lee.nipper@gmail.com>
Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/talitos.c | 245 +++++++++++++++++++++------------------
 1 file changed, 129 insertions(+), 116 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index a3918c16b3db..9833961a247e 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -684,8 +684,8 @@ struct talitos_ctx {
 	unsigned int authsize;
 };
 
-static int aead_authenc_setauthsize(struct crypto_aead *authenc,
-						 unsigned int authsize)
+static int aead_setauthsize(struct crypto_aead *authenc,
+			    unsigned int authsize)
 {
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
 
@@ -694,8 +694,8 @@ static int aead_authenc_setauthsize(struct crypto_aead *authenc,
 	return 0;
 }
 
-static int aead_authenc_setkey(struct crypto_aead *authenc,
-					    const u8 *key, unsigned int keylen)
+static int aead_setkey(struct crypto_aead *authenc,
+		       const u8 *key, unsigned int keylen)
 {
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
 	struct rtattr *rta = (void *)key;
@@ -740,7 +740,7 @@ badkey:
 }
 
 /*
- * ipsec_esp_edesc - s/w-extended ipsec_esp descriptor
+ * talitos_edesc - s/w-extended descriptor
  * @src_nents: number of segments in input scatterlist
  * @dst_nents: number of segments in output scatterlist
  * @dma_len: length of dma mapped link_tbl space
@@ -752,7 +752,7 @@ badkey:
  * is greater than 1, an integrity check value is concatenated to the end
  * of link_tbl data
  */
-struct ipsec_esp_edesc {
+struct talitos_edesc {
 	int src_nents;
 	int dst_nents;
 	int dma_len;
@@ -762,7 +762,7 @@ struct ipsec_esp_edesc {
 };
 
 static void ipsec_esp_unmap(struct device *dev,
-			    struct ipsec_esp_edesc *edesc,
+			    struct talitos_edesc *edesc,
 			    struct aead_request *areq)
 {
 	unmap_single_talitos_ptr(dev, &edesc->desc.ptr[6], DMA_FROM_DEVICE);
@@ -795,8 +795,8 @@ static void ipsec_esp_encrypt_done(struct device *dev,
 				   int err)
 {
 	struct aead_request *areq = context;
-	struct ipsec_esp_edesc *edesc =
-		 container_of(desc, struct ipsec_esp_edesc, desc);
+	struct talitos_edesc *edesc =
+		 container_of(desc, struct talitos_edesc, desc);
 	struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
 	struct scatterlist *sg;
@@ -823,8 +823,8 @@ static void ipsec_esp_decrypt_swauth_done(struct device *dev,
 				   int err)
 {
 	struct aead_request *req = context;
-	struct ipsec_esp_edesc *edesc =
-		 container_of(desc, struct ipsec_esp_edesc, desc);
+	struct talitos_edesc *edesc =
+		 container_of(desc, struct talitos_edesc, desc);
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
 	struct scatterlist *sg;
@@ -855,8 +855,8 @@ static void ipsec_esp_decrypt_hwauth_done(struct device *dev,
 				   int err)
 {
 	struct aead_request *req = context;
-	struct ipsec_esp_edesc *edesc =
-		 container_of(desc, struct ipsec_esp_edesc, desc);
+	struct talitos_edesc *edesc =
+		 container_of(desc, struct talitos_edesc, desc);
 
 	ipsec_esp_unmap(dev, edesc, req);
 
@@ -910,7 +910,7 @@ static int sg_to_link_tbl(struct scatterlist *sg, int sg_count,
 /*
  * fill in and submit ipsec_esp descriptor
  */
-static int ipsec_esp(struct ipsec_esp_edesc *edesc, struct aead_request *areq,
+static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 		     u8 *giv, u64 seq,
 		     void (*callback) (struct device *dev,
 				       struct talitos_desc *desc,
@@ -1052,14 +1052,14 @@ static int sg_count(struct scatterlist *sg_list, int nbytes)
 }
 
 /*
- * allocate and map the ipsec_esp extended descriptor
+ * allocate and map the extended descriptor
  */
-static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
+static struct talitos_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
 						     int icv_stashing)
 {
 	struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
-	struct ipsec_esp_edesc *edesc;
+	struct talitos_edesc *edesc;
 	int src_nents, dst_nents, alloc_len, dma_len;
 	gfp_t flags = areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		      GFP_ATOMIC;
@@ -1084,7 +1084,7 @@ static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
 	 * allowing for two separate entries for ICV and generated ICV (+ 2),
 	 * and the ICV data itself
 	 */
-	alloc_len = sizeof(struct ipsec_esp_edesc);
+	alloc_len = sizeof(struct talitos_edesc);
 	if (src_nents || dst_nents) {
 		dma_len = (src_nents + dst_nents + 2) *
 				 sizeof(struct talitos_ptr) + ctx->authsize;
@@ -1109,11 +1109,11 @@ static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
 	return edesc;
 }
 
-static int aead_authenc_encrypt(struct aead_request *req)
+static int aead_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
-	struct ipsec_esp_edesc *edesc;
+	struct talitos_edesc *edesc;
 
 	/* allocate extended descriptor */
 	edesc = ipsec_esp_edesc_alloc(req, 0);
@@ -1128,13 +1128,13 @@ static int aead_authenc_encrypt(struct aead_request *req)
 
 
 
-static int aead_authenc_decrypt(struct aead_request *req)
+static int aead_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
 	unsigned int authsize = ctx->authsize;
 	struct talitos_private *priv = dev_get_drvdata(ctx->dev);
-	struct ipsec_esp_edesc *edesc;
+	struct talitos_edesc *edesc;
 	struct scatterlist *sg;
 	void *icvdata;
 
@@ -1180,13 +1180,12 @@ static int aead_authenc_decrypt(struct aead_request *req)
 	}
 }
 
-static int aead_authenc_givencrypt(
-	struct aead_givcrypt_request *req)
+static int aead_givencrypt(struct aead_givcrypt_request *req)
 {
 	struct aead_request *areq = &req->areq;
 	struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
-	struct ipsec_esp_edesc *edesc;
+	struct talitos_edesc *edesc;
 
 	/* allocate extended descriptor */
 	edesc = ipsec_esp_edesc_alloc(areq, 0);
@@ -1205,30 +1204,30 @@ static int aead_authenc_givencrypt(
 }
 
 struct talitos_alg_template {
-	char name[CRYPTO_MAX_ALG_NAME];
-	char driver_name[CRYPTO_MAX_ALG_NAME];
-	unsigned int blocksize;
-	struct aead_alg aead;
-	struct device *dev;
+	struct crypto_alg alg;
 	__be32 desc_hdr_template;
 };
 
 static struct talitos_alg_template driver_algs[] = {
-	/* single-pass ipsec_esp descriptor */
+	/* AEAD algorithms.  These use a single-pass ipsec_esp descriptor */
 	{
-		.name = "authenc(hmac(sha1),cbc(aes))",
-		.driver_name = "authenc-hmac-sha1-cbc-aes-talitos",
-		.blocksize = AES_BLOCK_SIZE,
-		.aead = {
-			.setkey = aead_authenc_setkey,
-			.setauthsize = aead_authenc_setauthsize,
-			.encrypt = aead_authenc_encrypt,
-			.decrypt = aead_authenc_decrypt,
-			.givencrypt = aead_authenc_givencrypt,
-			.geniv = "<built-in>",
-			.ivsize = AES_BLOCK_SIZE,
-			.maxauthsize = SHA1_DIGEST_SIZE,
-			},
+		.alg = {
+			.cra_name = "authenc(hmac(sha1),cbc(aes))",
+			.cra_driver_name = "authenc-hmac-sha1-cbc-aes-talitos",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_aead_type,
+			.cra_aead = {
+				.setkey = aead_setkey,
+				.setauthsize = aead_setauthsize,
+				.encrypt = aead_encrypt,
+				.decrypt = aead_decrypt,
+				.givencrypt = aead_givencrypt,
+				.geniv = "<built-in>",
+				.ivsize = AES_BLOCK_SIZE,
+				.maxauthsize = SHA1_DIGEST_SIZE,
+			}
+		},
 		.desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP |
 			             DESC_HDR_SEL0_AESU |
 		                     DESC_HDR_MODE0_AESU_CBC |
@@ -1238,19 +1237,23 @@ static struct talitos_alg_template driver_algs[] = {
 		                     DESC_HDR_MODE1_MDEU_SHA1_HMAC,
 	},
 	{
-		.name = "authenc(hmac(sha1),cbc(des3_ede))",
-		.driver_name = "authenc-hmac-sha1-cbc-3des-talitos",
-		.blocksize = DES3_EDE_BLOCK_SIZE,
-		.aead = {
-			.setkey = aead_authenc_setkey,
-			.setauthsize = aead_authenc_setauthsize,
-			.encrypt = aead_authenc_encrypt,
-			.decrypt = aead_authenc_decrypt,
-			.givencrypt = aead_authenc_givencrypt,
-			.geniv = "<built-in>",
-			.ivsize = DES3_EDE_BLOCK_SIZE,
-			.maxauthsize = SHA1_DIGEST_SIZE,
-			},
+		.alg = {
+			.cra_name = "authenc(hmac(sha1),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha1-cbc-3des-talitos",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_aead_type,
+			.cra_aead = {
+				.setkey = aead_setkey,
+				.setauthsize = aead_setauthsize,
+				.encrypt = aead_encrypt,
+				.decrypt = aead_decrypt,
+				.givencrypt = aead_givencrypt,
+				.geniv = "<built-in>",
+				.ivsize = DES3_EDE_BLOCK_SIZE,
+				.maxauthsize = SHA1_DIGEST_SIZE,
+			}
+		},
 		.desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP |
 			             DESC_HDR_SEL0_DEU |
 		                     DESC_HDR_MODE0_DEU_CBC |
@@ -1261,19 +1264,23 @@ static struct talitos_alg_template driver_algs[] = {
 		                     DESC_HDR_MODE1_MDEU_SHA1_HMAC,
 	},
 	{
-		.name = "authenc(hmac(sha256),cbc(aes))",
-		.driver_name = "authenc-hmac-sha256-cbc-aes-talitos",
-		.blocksize = AES_BLOCK_SIZE,
-		.aead = {
-			.setkey = aead_authenc_setkey,
-			.setauthsize = aead_authenc_setauthsize,
-			.encrypt = aead_authenc_encrypt,
-			.decrypt = aead_authenc_decrypt,
-			.givencrypt = aead_authenc_givencrypt,
-			.geniv = "<built-in>",
-			.ivsize = AES_BLOCK_SIZE,
-			.maxauthsize = SHA256_DIGEST_SIZE,
-			},
+		.alg = {
+			.cra_name = "authenc(hmac(sha256),cbc(aes))",
+			.cra_driver_name = "authenc-hmac-sha256-cbc-aes-talitos",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_aead_type,
+			.cra_aead = {
+				.setkey = aead_setkey,
+				.setauthsize = aead_setauthsize,
+				.encrypt = aead_encrypt,
+				.decrypt = aead_decrypt,
+				.givencrypt = aead_givencrypt,
+				.geniv = "<built-in>",
+				.ivsize = AES_BLOCK_SIZE,
+				.maxauthsize = SHA256_DIGEST_SIZE,
+			}
+		},
 		.desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP |
 			             DESC_HDR_SEL0_AESU |
 		                     DESC_HDR_MODE0_AESU_CBC |
@@ -1283,19 +1290,23 @@ static struct talitos_alg_template driver_algs[] = {
 		                     DESC_HDR_MODE1_MDEU_SHA256_HMAC,
 	},
 	{
-		.name = "authenc(hmac(sha256),cbc(des3_ede))",
-		.driver_name = "authenc-hmac-sha256-cbc-3des-talitos",
-		.blocksize = DES3_EDE_BLOCK_SIZE,
-		.aead = {
-			.setkey = aead_authenc_setkey,
-			.setauthsize = aead_authenc_setauthsize,
-			.encrypt = aead_authenc_encrypt,
-			.decrypt = aead_authenc_decrypt,
-			.givencrypt = aead_authenc_givencrypt,
-			.geniv = "<built-in>",
-			.ivsize = DES3_EDE_BLOCK_SIZE,
-			.maxauthsize = SHA256_DIGEST_SIZE,
-			},
+		.alg = {
+			.cra_name = "authenc(hmac(sha256),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha256-cbc-3des-talitos",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_aead_type,
+			.cra_aead = {
+				.setkey = aead_setkey,
+				.setauthsize = aead_setauthsize,
+				.encrypt = aead_encrypt,
+				.decrypt = aead_decrypt,
+				.givencrypt = aead_givencrypt,
+				.geniv = "<built-in>",
+				.ivsize = DES3_EDE_BLOCK_SIZE,
+				.maxauthsize = SHA256_DIGEST_SIZE,
+			}
+		},
 		.desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP |
 			             DESC_HDR_SEL0_DEU |
 		                     DESC_HDR_MODE0_DEU_CBC |
@@ -1306,19 +1317,23 @@ static struct talitos_alg_template driver_algs[] = {
 		                     DESC_HDR_MODE1_MDEU_SHA256_HMAC,
 	},
 	{
-		.name = "authenc(hmac(md5),cbc(aes))",
-		.driver_name = "authenc-hmac-md5-cbc-aes-talitos",
-		.blocksize = AES_BLOCK_SIZE,
-		.aead = {
-			.setkey = aead_authenc_setkey,
-			.setauthsize = aead_authenc_setauthsize,
-			.encrypt = aead_authenc_encrypt,
-			.decrypt = aead_authenc_decrypt,
-			.givencrypt = aead_authenc_givencrypt,
-			.geniv = "<built-in>",
-			.ivsize = AES_BLOCK_SIZE,
-			.maxauthsize = MD5_DIGEST_SIZE,
-			},
+		.alg = {
+			.cra_name = "authenc(hmac(md5),cbc(aes))",
+			.cra_driver_name = "authenc-hmac-md5-cbc-aes-talitos",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_aead_type,
+			.cra_aead = {
+				.setkey = aead_setkey,
+				.setauthsize = aead_setauthsize,
+				.encrypt = aead_encrypt,
+				.decrypt = aead_decrypt,
+				.givencrypt = aead_givencrypt,
+				.geniv = "<built-in>",
+				.ivsize = AES_BLOCK_SIZE,
+				.maxauthsize = MD5_DIGEST_SIZE,
+			}
+		},
 		.desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP |
 			             DESC_HDR_SEL0_AESU |
 		                     DESC_HDR_MODE0_AESU_CBC |
@@ -1328,19 +1343,23 @@ static struct talitos_alg_template driver_algs[] = {
 		                     DESC_HDR_MODE1_MDEU_MD5_HMAC,
 	},
 	{
-		.name = "authenc(hmac(md5),cbc(des3_ede))",
-		.driver_name = "authenc-hmac-md5-cbc-3des-talitos",
-		.blocksize = DES3_EDE_BLOCK_SIZE,
-		.aead = {
-			.setkey = aead_authenc_setkey,
-			.setauthsize = aead_authenc_setauthsize,
-			.encrypt = aead_authenc_encrypt,
-			.decrypt = aead_authenc_decrypt,
-			.givencrypt = aead_authenc_givencrypt,
-			.geniv = "<built-in>",
-			.ivsize = DES3_EDE_BLOCK_SIZE,
-			.maxauthsize = MD5_DIGEST_SIZE,
-			},
+		.alg = {
+			.cra_name = "authenc(hmac(md5),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-md5-cbc-3des-talitos",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_aead_type,
+			.cra_aead = {
+				.setkey = aead_setkey,
+				.setauthsize = aead_setauthsize,
+				.encrypt = aead_encrypt,
+				.decrypt = aead_decrypt,
+				.givencrypt = aead_givencrypt,
+				.geniv = "<built-in>",
+				.ivsize = DES3_EDE_BLOCK_SIZE,
+				.maxauthsize = MD5_DIGEST_SIZE,
+			}
+		},
 		.desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP |
 			             DESC_HDR_SEL0_DEU |
 		                     DESC_HDR_MODE0_DEU_CBC |
@@ -1453,19 +1472,13 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 		return ERR_PTR(-ENOMEM);
 
 	alg = &t_alg->crypto_alg;
+	*alg = template->alg;
 
-	snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s", template->name);
-	snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
-		 template->driver_name);
 	alg->cra_module = THIS_MODULE;
 	alg->cra_init = talitos_cra_init;
 	alg->cra_priority = TALITOS_CRA_PRIORITY;
-	alg->cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC;
-	alg->cra_blocksize = template->blocksize;
 	alg->cra_alignmask = 0;
-	alg->cra_type = &crypto_aead_type;
 	alg->cra_ctxsize = sizeof(struct talitos_ctx);
-	alg->cra_u.aead = template->aead;
 
 	t_alg->desc_hdr_template = template->desc_hdr_template;
 	t_alg->dev = dev;

From 4de9d0b547b97e40c93a885ac6246c2c5fef05cb Mon Sep 17 00:00:00 2001
From: Lee Nipper <lee.nipper@gmail.com>
Date: Sun, 29 Mar 2009 15:52:32 +0800
Subject: [PATCH 075/162] crypto: talitos - Add ablkcipher algorithms

Add these ablkcipher algorithms:
cbc(aes),
cbc(des3_ede).

Added handling of chained scatterlists with zero length entry
because eseqiv uses it.
Added new map and unmap routines.

Signed-off-by: Lee Nipper <lee.nipper@gmail.com>
Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/talitos.c | 380 +++++++++++++++++++++++++++++++++++----
 1 file changed, 342 insertions(+), 38 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 9833961a247e..a0b0a6319088 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -44,6 +44,8 @@
 #include <crypto/sha.h>
 #include <crypto/aead.h>
 #include <crypto/authenc.h>
+#include <crypto/skcipher.h>
+#include <crypto/scatterwalk.h>
 
 #include "talitos.h"
 
@@ -755,12 +757,62 @@ badkey:
 struct talitos_edesc {
 	int src_nents;
 	int dst_nents;
+	int src_is_chained;
+	int dst_is_chained;
 	int dma_len;
 	dma_addr_t dma_link_tbl;
 	struct talitos_desc desc;
 	struct talitos_ptr link_tbl[0];
 };
 
+static int talitos_map_sg(struct device *dev, struct scatterlist *sg,
+			  unsigned int nents, enum dma_data_direction dir,
+			  int chained)
+{
+	if (unlikely(chained))
+		while (sg) {
+			dma_map_sg(dev, sg, 1, dir);
+			sg = scatterwalk_sg_next(sg);
+		}
+	else
+		dma_map_sg(dev, sg, nents, dir);
+	return nents;
+}
+
+static void talitos_unmap_sg_chain(struct device *dev, struct scatterlist *sg,
+				   enum dma_data_direction dir)
+{
+	while (sg) {
+		dma_unmap_sg(dev, sg, 1, dir);
+		sg = scatterwalk_sg_next(sg);
+	}
+}
+
+static void talitos_sg_unmap(struct device *dev,
+			     struct talitos_edesc *edesc,
+			     struct scatterlist *src,
+			     struct scatterlist *dst)
+{
+	unsigned int src_nents = edesc->src_nents ? : 1;
+	unsigned int dst_nents = edesc->dst_nents ? : 1;
+
+	if (src != dst) {
+		if (edesc->src_is_chained)
+			talitos_unmap_sg_chain(dev, src, DMA_TO_DEVICE);
+		else
+			dma_unmap_sg(dev, src, src_nents, DMA_TO_DEVICE);
+
+		if (edesc->dst_is_chained)
+			talitos_unmap_sg_chain(dev, dst, DMA_FROM_DEVICE);
+		else
+			dma_unmap_sg(dev, dst, dst_nents, DMA_FROM_DEVICE);
+	} else
+		if (edesc->src_is_chained)
+			talitos_unmap_sg_chain(dev, src, DMA_BIDIRECTIONAL);
+		else
+			dma_unmap_sg(dev, src, src_nents, DMA_BIDIRECTIONAL);
+}
+
 static void ipsec_esp_unmap(struct device *dev,
 			    struct talitos_edesc *edesc,
 			    struct aead_request *areq)
@@ -772,15 +824,7 @@ static void ipsec_esp_unmap(struct device *dev,
 
 	dma_unmap_sg(dev, areq->assoc, 1, DMA_TO_DEVICE);
 
-	if (areq->src != areq->dst) {
-		dma_unmap_sg(dev, areq->src, edesc->src_nents ? : 1,
-			     DMA_TO_DEVICE);
-		dma_unmap_sg(dev, areq->dst, edesc->dst_nents ? : 1,
-			     DMA_FROM_DEVICE);
-	} else {
-		dma_unmap_sg(dev, areq->src, edesc->src_nents ? : 1,
-			     DMA_BIDIRECTIONAL);
-	}
+	talitos_sg_unmap(dev, edesc, areq->src, areq->dst);
 
 	if (edesc->dma_len)
 		dma_unmap_single(dev, edesc->dma_link_tbl, edesc->dma_len,
@@ -886,7 +930,7 @@ static int sg_to_link_tbl(struct scatterlist *sg, int sg_count,
 		link_tbl_ptr->j_extent = 0;
 		link_tbl_ptr++;
 		cryptlen -= sg_dma_len(sg);
-		sg = sg_next(sg);
+		sg = scatterwalk_sg_next(sg);
 	}
 
 	/* adjust (decrease) last one (or two) entry's len to cryptlen */
@@ -952,12 +996,11 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 	desc->ptr[4].len = cpu_to_be16(cryptlen);
 	desc->ptr[4].j_extent = authsize;
 
-	if (areq->src == areq->dst)
-		sg_count = dma_map_sg(dev, areq->src, edesc->src_nents ? : 1,
-				      DMA_BIDIRECTIONAL);
-	else
-		sg_count = dma_map_sg(dev, areq->src, edesc->src_nents ? : 1,
-				      DMA_TO_DEVICE);
+	sg_count = talitos_map_sg(dev, areq->src,
+				  edesc->src_nents ? : 1,
+				  (areq->src == areq->dst) ? DMA_BIDIRECTIONAL :
+				      DMA_TO_DEVICE,
+				  edesc->src_is_chained);
 
 	if (sg_count == 1) {
 		desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->src));
@@ -986,8 +1029,10 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 	desc->ptr[5].j_extent = authsize;
 
 	if (areq->src != areq->dst) {
-		sg_count = dma_map_sg(dev, areq->dst, edesc->dst_nents ? : 1,
-				      DMA_FROM_DEVICE);
+		sg_count = talitos_map_sg(dev, areq->dst,
+					  edesc->dst_nents ? : 1,
+					  DMA_FROM_DEVICE,
+					  edesc->dst_is_chained);
 	}
 
 	if (sg_count == 1) {
@@ -1037,15 +1082,18 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 /*
  * derive number of elements in scatterlist
  */
-static int sg_count(struct scatterlist *sg_list, int nbytes)
+static int sg_count(struct scatterlist *sg_list, int nbytes, int *chained)
 {
 	struct scatterlist *sg = sg_list;
 	int sg_nents = 0;
 
-	while (nbytes) {
+	*chained = 0;
+	while (nbytes > 0) {
 		sg_nents++;
 		nbytes -= sg->length;
-		sg = sg_next(sg);
+		if (!sg_is_last(sg) && (sg + 1)->length == 0)
+			*chained = 1;
+		sg = scatterwalk_sg_next(sg);
 	}
 
 	return sg_nents;
@@ -1054,28 +1102,32 @@ static int sg_count(struct scatterlist *sg_list, int nbytes)
 /*
  * allocate and map the extended descriptor
  */
-static struct talitos_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
-						     int icv_stashing)
+static struct talitos_edesc *talitos_edesc_alloc(struct device *dev,
+						 struct scatterlist *src,
+						 struct scatterlist *dst,
+						 unsigned int cryptlen,
+						 unsigned int authsize,
+						 int icv_stashing,
+						 u32 cryptoflags)
 {
-	struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
-	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
 	struct talitos_edesc *edesc;
 	int src_nents, dst_nents, alloc_len, dma_len;
-	gfp_t flags = areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
+	int src_chained, dst_chained = 0;
+	gfp_t flags = cryptoflags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		      GFP_ATOMIC;
 
-	if (areq->cryptlen + ctx->authsize > TALITOS_MAX_DATA_LEN) {
-		dev_err(ctx->dev, "cryptlen exceeds h/w max limit\n");
+	if (cryptlen + authsize > TALITOS_MAX_DATA_LEN) {
+		dev_err(dev, "length exceeds h/w max limit\n");
 		return ERR_PTR(-EINVAL);
 	}
 
-	src_nents = sg_count(areq->src, areq->cryptlen + ctx->authsize);
+	src_nents = sg_count(src, cryptlen + authsize, &src_chained);
 	src_nents = (src_nents == 1) ? 0 : src_nents;
 
-	if (areq->dst == areq->src) {
+	if (dst == src) {
 		dst_nents = src_nents;
 	} else {
-		dst_nents = sg_count(areq->dst, areq->cryptlen + ctx->authsize);
+		dst_nents = sg_count(dst, cryptlen + authsize, &dst_chained);
 		dst_nents = (dst_nents == 1) ? 0 : dst_nents;
 	}
 
@@ -1087,28 +1139,41 @@ static struct talitos_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq,
 	alloc_len = sizeof(struct talitos_edesc);
 	if (src_nents || dst_nents) {
 		dma_len = (src_nents + dst_nents + 2) *
-				 sizeof(struct talitos_ptr) + ctx->authsize;
+				 sizeof(struct talitos_ptr) + authsize;
 		alloc_len += dma_len;
 	} else {
 		dma_len = 0;
-		alloc_len += icv_stashing ? ctx->authsize : 0;
+		alloc_len += icv_stashing ? authsize : 0;
 	}
 
 	edesc = kmalloc(alloc_len, GFP_DMA | flags);
 	if (!edesc) {
-		dev_err(ctx->dev, "could not allocate edescriptor\n");
+		dev_err(dev, "could not allocate edescriptor\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
 	edesc->src_nents = src_nents;
 	edesc->dst_nents = dst_nents;
+	edesc->src_is_chained = src_chained;
+	edesc->dst_is_chained = dst_chained;
 	edesc->dma_len = dma_len;
-	edesc->dma_link_tbl = dma_map_single(ctx->dev, &edesc->link_tbl[0],
+	edesc->dma_link_tbl = dma_map_single(dev, &edesc->link_tbl[0],
 					     edesc->dma_len, DMA_BIDIRECTIONAL);
 
 	return edesc;
 }
 
+static struct talitos_edesc *aead_edesc_alloc(struct aead_request *areq,
+					      int icv_stashing)
+{
+	struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
+	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
+
+	return talitos_edesc_alloc(ctx->dev, areq->src, areq->dst,
+				   areq->cryptlen, ctx->authsize, icv_stashing,
+				   areq->base.flags);
+}
+
 static int aead_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
@@ -1116,7 +1181,7 @@ static int aead_encrypt(struct aead_request *req)
 	struct talitos_edesc *edesc;
 
 	/* allocate extended descriptor */
-	edesc = ipsec_esp_edesc_alloc(req, 0);
+	edesc = aead_edesc_alloc(req, 0);
 	if (IS_ERR(edesc))
 		return PTR_ERR(edesc);
 
@@ -1141,7 +1206,7 @@ static int aead_decrypt(struct aead_request *req)
 	req->cryptlen -= authsize;
 
 	/* allocate extended descriptor */
-	edesc = ipsec_esp_edesc_alloc(req, 1);
+	edesc = aead_edesc_alloc(req, 1);
 	if (IS_ERR(edesc))
 		return PTR_ERR(edesc);
 
@@ -1188,7 +1253,7 @@ static int aead_givencrypt(struct aead_givcrypt_request *req)
 	struct talitos_edesc *edesc;
 
 	/* allocate extended descriptor */
-	edesc = ipsec_esp_edesc_alloc(areq, 0);
+	edesc = aead_edesc_alloc(areq, 0);
 	if (IS_ERR(edesc))
 		return PTR_ERR(edesc);
 
@@ -1203,6 +1268,199 @@ static int aead_givencrypt(struct aead_givcrypt_request *req)
 			 ipsec_esp_encrypt_done);
 }
 
+static int ablkcipher_setkey(struct crypto_ablkcipher *cipher,
+			     const u8 *key, unsigned int keylen)
+{
+	struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+	struct ablkcipher_alg *alg = crypto_ablkcipher_alg(cipher);
+
+	if (keylen > TALITOS_MAX_KEY_SIZE)
+		goto badkey;
+
+	if (keylen < alg->min_keysize || keylen > alg->max_keysize)
+		goto badkey;
+
+	memcpy(&ctx->key, key, keylen);
+	ctx->keylen = keylen;
+
+	return 0;
+
+badkey:
+	crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+static void common_nonsnoop_unmap(struct device *dev,
+				  struct talitos_edesc *edesc,
+				  struct ablkcipher_request *areq)
+{
+	unmap_single_talitos_ptr(dev, &edesc->desc.ptr[5], DMA_FROM_DEVICE);
+	unmap_single_talitos_ptr(dev, &edesc->desc.ptr[2], DMA_TO_DEVICE);
+	unmap_single_talitos_ptr(dev, &edesc->desc.ptr[1], DMA_TO_DEVICE);
+
+	talitos_sg_unmap(dev, edesc, areq->src, areq->dst);
+
+	if (edesc->dma_len)
+		dma_unmap_single(dev, edesc->dma_link_tbl, edesc->dma_len,
+				 DMA_BIDIRECTIONAL);
+}
+
+static void ablkcipher_done(struct device *dev,
+			    struct talitos_desc *desc, void *context,
+			    int err)
+{
+	struct ablkcipher_request *areq = context;
+	struct talitos_edesc *edesc =
+		 container_of(desc, struct talitos_edesc, desc);
+
+	common_nonsnoop_unmap(dev, edesc, areq);
+
+	kfree(edesc);
+
+	areq->base.complete(&areq->base, err);
+}
+
+static int common_nonsnoop(struct talitos_edesc *edesc,
+			   struct ablkcipher_request *areq,
+			   u8 *giv,
+			   void (*callback) (struct device *dev,
+					     struct talitos_desc *desc,
+					     void *context, int error))
+{
+	struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq);
+	struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+	struct device *dev = ctx->dev;
+	struct talitos_desc *desc = &edesc->desc;
+	unsigned int cryptlen = areq->nbytes;
+	unsigned int ivsize;
+	int sg_count, ret;
+
+	/* first DWORD empty */
+	desc->ptr[0].len = 0;
+	desc->ptr[0].ptr = 0;
+	desc->ptr[0].j_extent = 0;
+
+	/* cipher iv */
+	ivsize = crypto_ablkcipher_ivsize(cipher);
+	map_single_talitos_ptr(dev, &desc->ptr[1], ivsize, giv ?: areq->info, 0,
+			       DMA_TO_DEVICE);
+
+	/* cipher key */
+	map_single_talitos_ptr(dev, &desc->ptr[2], ctx->keylen,
+			       (char *)&ctx->key, 0, DMA_TO_DEVICE);
+
+	/*
+	 * cipher in
+	 */
+	desc->ptr[3].len = cpu_to_be16(cryptlen);
+	desc->ptr[3].j_extent = 0;
+
+	sg_count = talitos_map_sg(dev, areq->src, edesc->src_nents ? : 1,
+				  (areq->src == areq->dst) ? DMA_BIDIRECTIONAL
+							   : DMA_TO_DEVICE,
+				  edesc->src_is_chained);
+
+	if (sg_count == 1) {
+		desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->src));
+	} else {
+		sg_count = sg_to_link_tbl(areq->src, sg_count, cryptlen,
+					  &edesc->link_tbl[0]);
+		if (sg_count > 1) {
+			desc->ptr[3].j_extent |= DESC_PTR_LNKTBL_JUMP;
+			desc->ptr[3].ptr = cpu_to_be32(edesc->dma_link_tbl);
+			dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl,
+						   edesc->dma_len, DMA_BIDIRECTIONAL);
+		} else {
+			/* Only one segment now, so no link tbl needed */
+			desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->src));
+		}
+	}
+
+	/* cipher out */
+	desc->ptr[4].len = cpu_to_be16(cryptlen);
+	desc->ptr[4].j_extent = 0;
+
+	if (areq->src != areq->dst)
+		sg_count = talitos_map_sg(dev, areq->dst,
+					  edesc->dst_nents ? : 1,
+					  DMA_FROM_DEVICE,
+					  edesc->dst_is_chained);
+
+	if (sg_count == 1) {
+		desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->dst));
+	} else {
+		struct talitos_ptr *link_tbl_ptr =
+			&edesc->link_tbl[edesc->src_nents + 1];
+
+		desc->ptr[4].j_extent |= DESC_PTR_LNKTBL_JUMP;
+		desc->ptr[4].ptr = cpu_to_be32((struct talitos_ptr *)
+					       edesc->dma_link_tbl +
+					       edesc->src_nents + 1);
+		sg_count = sg_to_link_tbl(areq->dst, sg_count, cryptlen,
+					  link_tbl_ptr);
+		dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl,
+					   edesc->dma_len, DMA_BIDIRECTIONAL);
+	}
+
+	/* iv out */
+	map_single_talitos_ptr(dev, &desc->ptr[5], ivsize, ctx->iv, 0,
+			       DMA_FROM_DEVICE);
+
+	/* last DWORD empty */
+	desc->ptr[6].len = 0;
+	desc->ptr[6].ptr = 0;
+	desc->ptr[6].j_extent = 0;
+
+	ret = talitos_submit(dev, desc, callback, areq);
+	if (ret != -EINPROGRESS) {
+		common_nonsnoop_unmap(dev, edesc, areq);
+		kfree(edesc);
+	}
+	return ret;
+}
+
+static struct talitos_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request *areq)
+{
+	struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq);
+	struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+
+	return talitos_edesc_alloc(ctx->dev, areq->src, areq->dst, areq->nbytes,
+				   0, 0, areq->base.flags);
+}
+
+static int ablkcipher_encrypt(struct ablkcipher_request *areq)
+{
+	struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq);
+	struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+	struct talitos_edesc *edesc;
+
+	/* allocate extended descriptor */
+	edesc = ablkcipher_edesc_alloc(areq);
+	if (IS_ERR(edesc))
+		return PTR_ERR(edesc);
+
+	/* set encrypt */
+	edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_MODE0_ENCRYPT;
+
+	return common_nonsnoop(edesc, areq, NULL, ablkcipher_done);
+}
+
+static int ablkcipher_decrypt(struct ablkcipher_request *areq)
+{
+	struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq);
+	struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+	struct talitos_edesc *edesc;
+
+	/* allocate extended descriptor */
+	edesc = ablkcipher_edesc_alloc(areq);
+	if (IS_ERR(edesc))
+		return PTR_ERR(edesc);
+
+	edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND;
+
+	return common_nonsnoop(edesc, areq, NULL, ablkcipher_done);
+}
+
 struct talitos_alg_template {
 	struct crypto_alg alg;
 	__be32 desc_hdr_template;
@@ -1368,6 +1626,52 @@ static struct talitos_alg_template driver_algs[] = {
 		                     DESC_HDR_MODE1_MDEU_INIT |
 		                     DESC_HDR_MODE1_MDEU_PAD |
 		                     DESC_HDR_MODE1_MDEU_MD5_HMAC,
+	},
+	/* ABLKCIPHER algorithms. */
+	{
+		.alg = {
+			.cra_name = "cbc(aes)",
+			.cra_driver_name = "cbc-aes-talitos",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER |
+                                     CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_ablkcipher_type,
+			.cra_ablkcipher = {
+				.setkey = ablkcipher_setkey,
+				.encrypt = ablkcipher_encrypt,
+				.decrypt = ablkcipher_decrypt,
+				.geniv = "eseqiv",
+				.min_keysize = AES_MIN_KEY_SIZE,
+				.max_keysize = AES_MAX_KEY_SIZE,
+				.ivsize = AES_BLOCK_SIZE,
+			}
+		},
+		.desc_hdr_template = DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU |
+				     DESC_HDR_SEL0_AESU |
+				     DESC_HDR_MODE0_AESU_CBC,
+	},
+	{
+		.alg = {
+			.cra_name = "cbc(des3_ede)",
+			.cra_driver_name = "cbc-3des-talitos",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER |
+                                     CRYPTO_ALG_ASYNC,
+			.cra_type = &crypto_ablkcipher_type,
+			.cra_ablkcipher = {
+				.setkey = ablkcipher_setkey,
+				.encrypt = ablkcipher_encrypt,
+				.decrypt = ablkcipher_decrypt,
+				.geniv = "eseqiv",
+				.min_keysize = DES3_EDE_KEY_SIZE,
+				.max_keysize = DES3_EDE_KEY_SIZE,
+				.ivsize = DES3_EDE_BLOCK_SIZE,
+			}
+		},
+		.desc_hdr_template = DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU |
+			             DESC_HDR_SEL0_DEU |
+		                     DESC_HDR_MODE0_DEU_CBC |
+		                     DESC_HDR_MODE0_DEU_3DES,
 	}
 };
 

From e938e4656b3ee32e046ee8293411a07be9d72eb8 Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips@freescale.com>
Date: Sun, 29 Mar 2009 15:53:23 +0800
Subject: [PATCH 076/162] crypto: talitos - Whitespace/codingstyle/overrun
 lines cleanup

no functional changes.

Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/talitos.c | 103 ++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 51 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index a0b0a6319088..a073e6b6a3c8 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -341,7 +341,8 @@ static void flush_channel(struct device *dev, int ch, int error, int reset_ch)
 				status = error;
 
 		dma_unmap_single(dev, request->dma_desc,
-			sizeof(struct talitos_desc), DMA_BIDIRECTIONAL);
+				 sizeof(struct talitos_desc),
+				 DMA_BIDIRECTIONAL);
 
 		/* copy entries so we can call callback outside lock */
 		saved_req.desc = request->desc;
@@ -415,7 +416,8 @@ static struct talitos_desc *current_desc(struct device *dev, int ch)
 /*
  * user diagnostics; report root cause of error based on execution unit status
  */
-static void report_eu_error(struct device *dev, int ch, struct talitos_desc *desc)
+static void report_eu_error(struct device *dev, int ch,
+			    struct talitos_desc *desc)
 {
 	struct talitos_private *priv = dev_get_drvdata(dev);
 	int i;
@@ -863,8 +865,8 @@ static void ipsec_esp_encrypt_done(struct device *dev,
 }
 
 static void ipsec_esp_decrypt_swauth_done(struct device *dev,
-				   struct talitos_desc *desc, void *context,
-				   int err)
+					  struct talitos_desc *desc,
+					  void *context, int err)
 {
 	struct aead_request *req = context;
 	struct talitos_edesc *edesc =
@@ -895,8 +897,8 @@ static void ipsec_esp_decrypt_swauth_done(struct device *dev,
 }
 
 static void ipsec_esp_decrypt_hwauth_done(struct device *dev,
-				   struct talitos_desc *desc, void *context,
-				   int err)
+					  struct talitos_desc *desc,
+					  void *context, int err)
 {
 	struct aead_request *req = context;
 	struct talitos_edesc *edesc =
@@ -905,10 +907,9 @@ static void ipsec_esp_decrypt_hwauth_done(struct device *dev,
 	ipsec_esp_unmap(dev, edesc, req);
 
 	/* check ICV auth status */
-	if (!err)
-		if ((desc->hdr_lo & DESC_HDR_LO_ICCR1_MASK) !=
-		    DESC_HDR_LO_ICCR1_PASS)
-			err = -EBADMSG;
+	if (!err && ((desc->hdr_lo & DESC_HDR_LO_ICCR1_MASK) !=
+		     DESC_HDR_LO_ICCR1_PASS))
+		err = -EBADMSG;
 
 	kfree(edesc);
 
@@ -996,10 +997,9 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 	desc->ptr[4].len = cpu_to_be16(cryptlen);
 	desc->ptr[4].j_extent = authsize;
 
-	sg_count = talitos_map_sg(dev, areq->src,
-				  edesc->src_nents ? : 1,
-				  (areq->src == areq->dst) ? DMA_BIDIRECTIONAL :
-				      DMA_TO_DEVICE,
+	sg_count = talitos_map_sg(dev, areq->src, edesc->src_nents ? : 1,
+				  (areq->src == areq->dst) ? DMA_BIDIRECTIONAL
+							   : DMA_TO_DEVICE,
 				  edesc->src_is_chained);
 
 	if (sg_count == 1) {
@@ -1008,19 +1008,21 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 		sg_link_tbl_len = cryptlen;
 
 		if ((edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV) &&
-			(edesc->desc.hdr & DESC_HDR_MODE0_ENCRYPT) == 0) {
+			(edesc->desc.hdr & DESC_HDR_MODE0_ENCRYPT) == 0)
 			sg_link_tbl_len = cryptlen + authsize;
-		}
+
 		sg_count = sg_to_link_tbl(areq->src, sg_count, sg_link_tbl_len,
 					  &edesc->link_tbl[0]);
 		if (sg_count > 1) {
 			desc->ptr[4].j_extent |= DESC_PTR_LNKTBL_JUMP;
 			desc->ptr[4].ptr = cpu_to_be32(edesc->dma_link_tbl);
-			dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl,
-						   edesc->dma_len, DMA_BIDIRECTIONAL);
+			dma_sync_single_for_device(dev, edesc->dma_link_tbl,
+						   edesc->dma_len,
+						   DMA_BIDIRECTIONAL);
 		} else {
 			/* Only one segment now, so no link tbl needed */
-			desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->src));
+			desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->
+								      src));
 		}
 	}
 
@@ -1028,12 +1030,11 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 	desc->ptr[5].len = cpu_to_be16(cryptlen);
 	desc->ptr[5].j_extent = authsize;
 
-	if (areq->src != areq->dst) {
+	if (areq->src != areq->dst)
 		sg_count = talitos_map_sg(dev, areq->dst,
 					  edesc->dst_nents ? : 1,
 					  DMA_FROM_DEVICE,
 					  edesc->dst_is_chained);
-	}
 
 	if (sg_count == 1) {
 		desc->ptr[5].ptr = cpu_to_be32(sg_dma_address(areq->dst));
@@ -1078,7 +1079,6 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 	return ret;
 }
 
-
 /*
  * derive number of elements in scatterlist
  */
@@ -1191,8 +1191,6 @@ static int aead_encrypt(struct aead_request *req)
 	return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_encrypt_done);
 }
 
-
-
 static int aead_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
@@ -1211,38 +1209,38 @@ static int aead_decrypt(struct aead_request *req)
 		return PTR_ERR(edesc);
 
 	if ((priv->features & TALITOS_FTR_HW_AUTH_CHECK) &&
-	    (((!edesc->src_nents && !edesc->dst_nents) ||
-		priv->features & TALITOS_FTR_SRC_LINK_TBL_LEN_INCLUDES_EXTENT))) {
+	    ((!edesc->src_nents && !edesc->dst_nents) ||
+	     priv->features & TALITOS_FTR_SRC_LINK_TBL_LEN_INCLUDES_EXTENT)) {
 
 		/* decrypt and check the ICV */
-		edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND |
+		edesc->desc.hdr = ctx->desc_hdr_template |
+				  DESC_HDR_DIR_INBOUND |
 				  DESC_HDR_MODE1_MDEU_CICV;
 
 		/* reset integrity check result bits */
 		edesc->desc.hdr_lo = 0;
 
-		return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_decrypt_hwauth_done);
+		return ipsec_esp(edesc, req, NULL, 0,
+				 ipsec_esp_decrypt_hwauth_done);
 
-	} else {
-
-		/* Have to check the ICV with software */
-
-		edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND;
-
-		/* stash incoming ICV for later cmp with ICV generated by the h/w */
-		if (edesc->dma_len)
-			icvdata = &edesc->link_tbl[edesc->src_nents +
-						   edesc->dst_nents + 2];
-		else
-			icvdata = &edesc->link_tbl[0];
-
-		sg = sg_last(req->src, edesc->src_nents ? : 1);
-
-		memcpy(icvdata, (char *)sg_virt(sg) + sg->length - ctx->authsize,
-		       ctx->authsize);
-
-		return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_decrypt_swauth_done);
 	}
+
+	/* Have to check the ICV with software */
+	edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND;
+
+	/* stash incoming ICV for later cmp with ICV generated by the h/w */
+	if (edesc->dma_len)
+		icvdata = &edesc->link_tbl[edesc->src_nents +
+					   edesc->dst_nents + 2];
+	else
+		icvdata = &edesc->link_tbl[0];
+
+	sg = sg_last(req->src, edesc->src_nents ? : 1);
+
+	memcpy(icvdata, (char *)sg_virt(sg) + sg->length - ctx->authsize,
+	       ctx->authsize);
+
+	return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_decrypt_swauth_done);
 }
 
 static int aead_givencrypt(struct aead_givcrypt_request *req)
@@ -1368,11 +1366,13 @@ static int common_nonsnoop(struct talitos_edesc *edesc,
 		if (sg_count > 1) {
 			desc->ptr[3].j_extent |= DESC_PTR_LNKTBL_JUMP;
 			desc->ptr[3].ptr = cpu_to_be32(edesc->dma_link_tbl);
-			dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl,
-						   edesc->dma_len, DMA_BIDIRECTIONAL);
+			dma_sync_single_for_device(dev, edesc->dma_link_tbl,
+						   edesc->dma_len,
+						   DMA_BIDIRECTIONAL);
 		} else {
 			/* Only one segment now, so no link tbl needed */
-			desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->src));
+			desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->
+								      src));
 		}
 	}
 
@@ -1419,7 +1419,8 @@ static int common_nonsnoop(struct talitos_edesc *edesc,
 	return ret;
 }
 
-static struct talitos_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request *areq)
+static struct talitos_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request *
+						    areq)
 {
 	struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq);
 	struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);

From 19bbbc635523703ece28409e59694d5b512b819e Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips@freescale.com>
Date: Sun, 29 Mar 2009 15:53:59 +0800
Subject: [PATCH 077/162] crypto: talitos - containerof related codingstyle

no functional changes.

Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/talitos.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index a073e6b6a3c8..1cc1c411e551 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -841,13 +841,14 @@ static void ipsec_esp_encrypt_done(struct device *dev,
 				   int err)
 {
 	struct aead_request *areq = context;
-	struct talitos_edesc *edesc =
-		 container_of(desc, struct talitos_edesc, desc);
 	struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
+	struct talitos_edesc *edesc;
 	struct scatterlist *sg;
 	void *icvdata;
 
+	edesc = container_of(desc, struct talitos_edesc, desc);
+
 	ipsec_esp_unmap(dev, edesc, areq);
 
 	/* copy the generated ICV to dst */
@@ -869,13 +870,14 @@ static void ipsec_esp_decrypt_swauth_done(struct device *dev,
 					  void *context, int err)
 {
 	struct aead_request *req = context;
-	struct talitos_edesc *edesc =
-		 container_of(desc, struct talitos_edesc, desc);
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
 	struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
+	struct talitos_edesc *edesc;
 	struct scatterlist *sg;
 	void *icvdata;
 
+	edesc = container_of(desc, struct talitos_edesc, desc);
+
 	ipsec_esp_unmap(dev, edesc, req);
 
 	if (!err) {
@@ -901,8 +903,9 @@ static void ipsec_esp_decrypt_hwauth_done(struct device *dev,
 					  void *context, int err)
 {
 	struct aead_request *req = context;
-	struct talitos_edesc *edesc =
-		 container_of(desc, struct talitos_edesc, desc);
+	struct talitos_edesc *edesc;
+
+	edesc = container_of(desc, struct talitos_edesc, desc);
 
 	ipsec_esp_unmap(dev, edesc, req);
 
@@ -1308,8 +1311,9 @@ static void ablkcipher_done(struct device *dev,
 			    int err)
 {
 	struct ablkcipher_request *areq = context;
-	struct talitos_edesc *edesc =
-		 container_of(desc, struct talitos_edesc, desc);
+	struct talitos_edesc *edesc;
+
+	edesc = container_of(desc, struct talitos_edesc, desc);
 
 	common_nonsnoop_unmap(dev, edesc, areq);
 
@@ -1686,12 +1690,14 @@ struct talitos_crypto_alg {
 static int talitos_cra_init(struct crypto_tfm *tfm)
 {
 	struct crypto_alg *alg = tfm->__crt_alg;
-	struct talitos_crypto_alg *talitos_alg =
-		 container_of(alg, struct talitos_crypto_alg, crypto_alg);
+	struct talitos_crypto_alg *talitos_alg;
 	struct talitos_ctx *ctx = crypto_tfm_ctx(tfm);
 
+	talitos_alg =  container_of(alg, struct talitos_crypto_alg, crypto_alg);
+
 	/* update context with ptr to dev */
 	ctx->dev = talitos_alg->dev;
+
 	/* copy descriptor header template value */
 	ctx->desc_hdr_template = talitos_alg->desc_hdr_template;
 

From 962a9c99496f98041d14d64a9fdcf58050fefb4d Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips@freescale.com>
Date: Sun, 29 Mar 2009 15:54:30 +0800
Subject: [PATCH 078/162] crypto: talitos - Avoid unnecessary decrypt check

the ICV check bit only gets set in decrypt entry points

Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/talitos.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 1cc1c411e551..c70775fd3ce2 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -1010,8 +1010,7 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq,
 	} else {
 		sg_link_tbl_len = cryptlen;
 
-		if ((edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV) &&
-			(edesc->desc.hdr & DESC_HDR_MODE0_ENCRYPT) == 0)
+		if (edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV)
 			sg_link_tbl_len = cryptlen + authsize;
 
 		sg_count = sg_to_link_tbl(areq->src, sg_count, sg_link_tbl_len,

From d1c8b0a7692e81b46550bcc493465ed10510cd33 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Date: Tue, 21 Apr 2009 14:14:37 +0800
Subject: [PATCH 079/162] crypto: padlock - Enable on x86_64

Almost everything stays the same, we need just to use the extended registers
on the bit variant.

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/Kconfig       |  2 +-
 drivers/crypto/padlock-aes.c | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 01afd758072f..39eedd431413 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -12,7 +12,7 @@ if CRYPTO_HW
 
 config CRYPTO_DEV_PADLOCK
 	tristate "Support for VIA PadLock ACE"
-	depends on X86_32 && !UML
+	depends on !UML
 	select CRYPTO_ALGAPI
 	help
 	  Some VIA processors come with an integrated crypto engine
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 856b3cc25583..87f92c39b5f0 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -154,7 +154,11 @@ static inline void padlock_reset_key(struct cword *cword)
 	int cpu = raw_smp_processor_id();
 
 	if (cword != per_cpu(last_cword, cpu))
+#ifndef CONFIG_X86_64
 		asm volatile ("pushfl; popfl");
+#else
+		asm volatile ("pushfq; popfq");
+#endif
 }
 
 static inline void padlock_store_cword(struct cword *cword)
@@ -208,10 +212,19 @@ static inline void padlock_xcrypt_ecb(const u8 *input, u8 *output, void *key,
 
 	asm volatile ("test $1, %%cl;"
 		      "je 1f;"
+#ifndef CONFIG_X86_64
 		      "lea -1(%%ecx), %%eax;"
 		      "mov $1, %%ecx;"
+#else
+		      "lea -1(%%rcx), %%rax;"
+		      "mov $1, %%rcx;"
+#endif
 		      ".byte 0xf3,0x0f,0xa7,0xc8;"	/* rep xcryptecb */
+#ifndef CONFIG_X86_64
 		      "mov %%eax, %%ecx;"
+#else
+		      "mov %%rax, %%rcx;"
+#endif
 		      "1:"
 		      ".byte 0xf3,0x0f,0xa7,0xc8"	/* rep xcryptecb */
 		      : "+S"(input), "+D"(output)

From 2f8174187f409213e63c3589af163c627e8a182a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 22 Apr 2009 13:00:15 +0800
Subject: [PATCH 080/162] crypto: padlock - Restore dependency on x86

When we added 64-bit support to padlock the dependency on x86
was lost.  This causes build failures on non-x86 architectures.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 39eedd431413..e748e55bd86b 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -12,7 +12,7 @@ if CRYPTO_HW
 
 config CRYPTO_DEV_PADLOCK
 	tristate "Support for VIA PadLock ACE"
-	depends on !UML
+	depends on X86 && !UML
 	select CRYPTO_ALGAPI
 	help
 	  Some VIA processors come with an integrated crypto engine

From e44a1b44c3a9794236fe038b89a0fbef5adcd523 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 4 May 2009 19:22:11 +0800
Subject: [PATCH 081/162] crypto: testmgr - Handle AEAD test vectors expected
 to fail verification

Add infrastructure to tcrypt/testmgr to support handling ccm decryption
test vectors that are expected to fail verification.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 28 ++++++++++++++++++++++++++++
 crypto/testmgr.h |  1 +
 2 files changed, 29 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index bfee6e9f642d..84f96401b29a 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -363,6 +363,16 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 
 			switch (ret) {
 			case 0:
+				if (template[i].novrfy) {
+					/* verification was supposed to fail */
+					printk(KERN_ERR "alg: aead: %s failed "
+					       "on test %d for %s: ret was 0, "
+					       "expected -EBADMSG\n",
+					       e, j, algo);
+					/* so really, we got a bad message */
+					ret = -EBADMSG;
+					goto out;
+				}
 				break;
 			case -EINPROGRESS:
 			case -EBUSY:
@@ -372,6 +382,10 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 					INIT_COMPLETION(result.completion);
 					break;
 				}
+			case -EBADMSG:
+				if (template[i].novrfy)
+					/* verification failure was expected */
+					continue;
 				/* fall through */
 			default:
 				printk(KERN_ERR "alg: aead: %s failed on test "
@@ -481,6 +495,16 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 
 			switch (ret) {
 			case 0:
+				if (template[i].novrfy) {
+					/* verification was supposed to fail */
+					printk(KERN_ERR "alg: aead: %s failed "
+					       "on chunk test %d for %s: ret "
+					       "was 0, expected -EBADMSG\n",
+					       e, j, algo);
+					/* so really, we got a bad message */
+					ret = -EBADMSG;
+					goto out;
+				}
 				break;
 			case -EINPROGRESS:
 			case -EBUSY:
@@ -490,6 +514,10 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 					INIT_COMPLETION(result.completion);
 					break;
 				}
+			case -EBADMSG:
+				if (template[i].novrfy)
+					/* verification failure was expected */
+					continue;
 				/* fall through */
 			default:
 				printk(KERN_ERR "alg: aead: %s failed on "
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 526f00a9c72f..b77b61dad262 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -62,6 +62,7 @@ struct aead_testvec {
 	int np;
 	int anp;
 	unsigned char fail;
+	unsigned char novrfy;	/* ccm dec verification failure expected */
 	unsigned char wk; /* weak key flag */
 	unsigned char klen;
 	unsigned short ilen;

From 5d667322a25ab4ecb91176db118fd663fee4da35 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 4 May 2009 19:23:40 +0800
Subject: [PATCH 082/162] crypto: testmgr - Add self-tests for
 rfc4309(ccm(aes))

Add an array of encryption and decryption + verification self-tests
for rfc4309(ccm(aes)).

Test vectors all come from sample FIPS CAVS files provided to
Red Hat by a testing lab. Unfortunately, all the published sample
vectors in RFC 3610 and NIST Special Publication 800-38C contain nonce
lengths that the kernel's rfc4309 implementation doesn't support, so
while using some public domain vectors would have been preferred, its
not possible at this time.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c  |   4 +
 crypto/testmgr.c |  15 ++
 crypto/testmgr.h | 370 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 389 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 50d1e35bbff4..0452036b1d45 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -667,6 +667,10 @@ static void do_test(int m)
 		tcrypt_test("zlib");
 		break;
 
+	case 45:
+		tcrypt_test("rfc4309(ccm(aes))");
+		break;
+
 	case 100:
 		tcrypt_test("hmac(md5)");
 		break;
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 84f96401b29a..40c10789f7f3 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1878,6 +1878,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "rfc4309(ccm(aes))",
+		.test = alg_test_aead,
+		.suite = {
+			.aead = {
+				.enc = {
+					.vecs = aes_ccm_rfc4309_enc_tv_template,
+					.count = AES_CCM_4309_ENC_TEST_VECTORS
+				},
+				.dec = {
+					.vecs = aes_ccm_rfc4309_dec_tv_template,
+					.count = AES_CCM_4309_DEC_TEST_VECTORS
+				}
+			}
+		}
 	}, {
 		.alg = "rmd128",
 		.test = alg_test_hash,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index b77b61dad262..5add65196a98 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -2848,6 +2848,8 @@ static struct cipher_testvec cast6_dec_tv_template[] = {
 #define AES_GCM_DEC_TEST_VECTORS 8
 #define AES_CCM_ENC_TEST_VECTORS 7
 #define AES_CCM_DEC_TEST_VECTORS 7
+#define AES_CCM_4309_ENC_TEST_VECTORS 7
+#define AES_CCM_4309_DEC_TEST_VECTORS 10
 
 static struct cipher_testvec aes_enc_tv_template[] = {
 	{ /* From FIPS-197 */
@@ -5826,6 +5828,374 @@ static struct aead_testvec aes_ccm_dec_tv_template[] = {
 	},
 };
 
+/*
+ * rfc4309 refers to section 8 of rfc3610 for test vectors, but they all
+ * use a 13-byte nonce, we only support an 11-byte nonce. Similarly, all of
+ * Special Publication 800-38C's test vectors also use nonce lengths our
+ * implementation doesn't support. The following are taken from fips cavs
+ * fax files on hand at Red Hat.
+ *
+ * nb: actual key lengths are (klen - 3), the last 3 bytes are actually
+ * part of the nonce which combine w/the iv, but need to be input this way.
+ */
+static struct aead_testvec aes_ccm_rfc4309_enc_tv_template[] = {
+	{
+		.key	= "\x83\xac\x54\x66\xc2\xeb\xe5\x05"
+			  "\x2e\x01\xd1\xfc\x5d\x82\x66\x2e"
+			  "\x96\xac\x59",
+		.klen	= 19,
+		.iv	= "\x30\x07\xa1\xe2\xa2\xc7\x55\x24",
+		.alen	= 0,
+		.input	= "\x19\xc8\x81\xf6\xe9\x86\xff\x93"
+			  "\x0b\x78\x67\xe5\xbb\xb7\xfc\x6e"
+			  "\x83\x77\xb3\xa6\x0c\x8c\x9f\x9c"
+			  "\x35\x2e\xad\xe0\x62\xf9\x91\xa1",
+		.ilen	= 32,
+		.result	= "\xab\x6f\xe1\x69\x1d\x19\x99\xa8"
+			  "\x92\xa0\xc4\x6f\x7e\xe2\x8b\xb1"
+			  "\x70\xbb\x8c\xa6\x4c\x6e\x97\x8a"
+			  "\x57\x2b\xbe\x5d\x98\xa6\xb1\x32"
+			  "\xda\x24\xea\xd9\xa1\x39\x98\xfd"
+			  "\xa4\xbe\xd9\xf2\x1a\x6d\x22\xa8",
+		.rlen	= 48,
+	}, {
+		.key	= "\x1e\x2c\x7e\x01\x41\x9a\xef\xc0"
+			  "\x0d\x58\x96\x6e\x5c\xa2\x4b\xd3"
+			  "\x4f\xa3\x19",
+		.klen	= 19,
+		.iv	= "\xd3\x01\x5a\xd8\x30\x60\x15\x56",
+		.assoc	= "\xda\xe6\x28\x9c\x45\x2d\xfd\x63"
+			  "\x5e\xda\x4c\xb6\xe6\xfc\xf9\xb7"
+			  "\x0c\x56\xcb\xe4\xe0\x05\x7a\xe1"
+			  "\x0a\x63\x09\x78\xbc\x2c\x55\xde",
+		.alen	= 32,
+		.input	= "\x87\xa3\x36\xfd\x96\xb3\x93\x78"
+			  "\xa9\x28\x63\xba\x12\xa3\x14\x85"
+			  "\x57\x1e\x06\xc9\x7b\x21\xef\x76"
+			  "\x7f\x38\x7e\x8e\x29\xa4\x3e\x7e",
+		.ilen	= 32,
+		.result	= "\x8a\x1e\x11\xf0\x02\x6b\xe2\x19"
+			  "\xfc\x70\xc4\x6d\x8e\xb7\x99\xab"
+			  "\xc5\x4b\xa2\xac\xd3\xf3\x48\xff"
+			  "\x3b\xb5\xce\x53\xef\xde\xbb\x02"
+			  "\xa9\x86\x15\x6c\x13\xfe\xda\x0a"
+			  "\x22\xb8\x29\x3d\xd8\x39\x9a\x23",
+		.rlen	= 48,
+	}, {
+		.key	= "\xf4\x6b\xc2\x75\x62\xfe\xb4\xe1"
+			  "\xa3\xf0\xff\xdd\x4e\x4b\x12\x75"
+			  "\x53\x14\x73\x66\x8d\x88\xf6\x80"
+			  "\xa0\x20\x35",
+		.klen	= 27,
+		.iv	= "\x26\xf2\x21\x8d\x50\x20\xda\xe2",
+		.assoc	= "\x5b\x9e\x13\x67\x02\x5e\xef\xc1"
+			  "\x6c\xf9\xd7\x1e\x52\x8f\x7a\x47"
+			  "\xe9\xd4\xcf\x20\x14\x6e\xf0\x2d"
+			  "\xd8\x9e\x2b\x56\x10\x23\x56\xe7",
+		.alen	= 32,
+		.ilen	= 0,
+		.result	= "\x36\xea\x7a\x70\x08\xdc\x6a\xbc"
+			  "\xad\x0c\x7a\x63\xf6\x61\xfd\x9b",
+		.rlen	= 16,
+	}, {
+		.key	= "\x56\xdf\x5c\x8f\x26\x3f\x0e\x42"
+			  "\xef\x7a\xd3\xce\xfc\x84\x60\x62"
+			  "\xca\xb4\x40\xaf\x5f\xc9\xc9\x01"
+			  "\xd6\x3c\x8c",
+		.klen	= 27,
+		.iv	= "\x86\x84\xb6\xcd\xef\x09\x2e\x94",
+		.assoc	= "\x02\x65\x78\x3c\xe9\x21\x30\x91"
+			  "\xb1\xb9\xda\x76\x9a\x78\x6d\x95"
+			  "\xf2\x88\x32\xa3\xf2\x50\xcb\x4c"
+			  "\xe3\x00\x73\x69\x84\x69\x87\x79",
+		.alen	= 32,
+		.input	= "\x9f\xd2\x02\x4b\x52\x49\x31\x3c"
+			  "\x43\x69\x3a\x2d\x8e\x70\xad\x7e"
+			  "\xe0\xe5\x46\x09\x80\x89\x13\xb2"
+			  "\x8c\x8b\xd9\x3f\x86\xfb\xb5\x6b",
+		.ilen	= 32,
+		.result	= "\x39\xdf\x7c\x3c\x5a\x29\xb9\x62"
+			  "\x5d\x51\xc2\x16\xd8\xbd\x06\x9f"
+			  "\x9b\x6a\x09\x70\xc1\x51\x83\xc2"
+			  "\x66\x88\x1d\x4f\x9a\xda\xe0\x1e"
+			  "\xc7\x79\x11\x58\xe5\x6b\x20\x40"
+			  "\x7a\xea\x46\x42\x8b\xe4\x6f\xe1",
+		.rlen	= 48,
+	}, {
+		.key	= "\xe0\x8d\x99\x71\x60\xd7\x97\x1a"
+			  "\xbd\x01\x99\xd5\x8a\xdf\x71\x3a"
+			  "\xd3\xdf\x24\x4b\x5e\x3d\x4b\x4e"
+			  "\x30\x7a\xb9\xd8\x53\x0a\x5e\x2b"
+			  "\x1e\x29\x91",
+		.klen	= 35,
+		.iv	= "\xad\x8e\xc1\x53\x0a\xcf\x2d\xbe",
+		.assoc	= "\x19\xb6\x1f\x57\xc4\xf3\xf0\x8b"
+			  "\x78\x2b\x94\x02\x29\x0f\x42\x27"
+			  "\x6b\x75\xcb\x98\x34\x08\x7e\x79"
+			  "\xe4\x3e\x49\x0d\x84\x8b\x22\x87",
+		.alen	= 32,
+		.input	= "\xe1\xd9\xd8\x13\xeb\x3a\x75\x3f"
+			  "\x9d\xbd\x5f\x66\xbe\xdc\xbb\x66"
+			  "\xbf\x17\x99\x62\x4a\x39\x27\x1f"
+			  "\x1d\xdc\x24\xae\x19\x2f\x98\x4c",
+		.ilen	= 32,
+		.result	= "\x19\xb8\x61\x33\x45\x2b\x43\x96"
+			  "\x6f\x51\xd0\x20\x30\x7d\x9b\xc6"
+			  "\x26\x3d\xf8\xc9\x65\x16\xa8\x9f"
+			  "\xf0\x62\x17\x34\xf2\x1e\x8d\x75"
+			  "\x4e\x13\xcc\xc0\xc3\x2a\x54\x2d",
+		.rlen	= 40,
+	}, {
+		.key	= "\x7c\xc8\x18\x3b\x8d\x99\xe0\x7c"
+			  "\x45\x41\xb8\xbd\x5c\xa7\xc2\x32"
+			  "\x8a\xb8\x02\x59\xa4\xfe\xa9\x2c"
+			  "\x09\x75\x9a\x9b\x3c\x9b\x27\x39"
+			  "\xf9\xd9\x4e",
+		.klen	= 35,
+		.iv	= "\x63\xb5\x3d\x9d\x43\xf6\x1e\x50",
+		.assoc	= "\x57\xf5\x6b\x8b\x57\x5c\x3d\x3b"
+			  "\x13\x02\x01\x0c\x83\x4c\x96\x35"
+			  "\x8e\xd6\x39\xcf\x7d\x14\x9b\x94"
+			  "\xb0\x39\x36\xe6\x8f\x57\xe0\x13",
+		.alen	= 32,
+		.input	= "\x3b\x6c\x29\x36\xb6\xef\x07\xa6"
+			  "\x83\x72\x07\x4f\xcf\xfa\x66\x89"
+			  "\x5f\xca\xb1\xba\xd5\x8f\x2c\x27"
+			  "\x30\xdb\x75\x09\x93\xd4\x65\xe4",
+		.ilen	= 32,
+		.result	= "\xb0\x88\x5a\x33\xaa\xe5\xc7\x1d"
+			  "\x85\x23\xc7\xc6\x2f\xf4\x1e\x3d"
+			  "\xcc\x63\x44\x25\x07\x78\x4f\x9e"
+			  "\x96\xb8\x88\xeb\xbc\x48\x1f\x06"
+			  "\x39\xaf\x39\xac\xd8\x4a\x80\x39"
+			  "\x7b\x72\x8a\xf7",
+		.rlen	= 44,
+	}, {
+		.key	= "\xab\xd0\xe9\x33\x07\x26\xe5\x83"
+			  "\x8c\x76\x95\xd4\xb6\xdc\xf3\x46"
+			  "\xf9\x8f\xad\xe3\x02\x13\x83\x77"
+			  "\x3f\xb0\xf1\xa1\xa1\x22\x0f\x2b"
+			  "\x24\xa7\x8b",
+		.klen	= 35,
+		.iv	= "\x07\xcb\xcc\x0e\xe6\x33\xbf\xf5",
+		.assoc	= "\xd4\xdb\x30\x1d\x03\xfe\xfd\x5f"
+			  "\x87\xd4\x8c\xb6\xb6\xf1\x7a\x5d"
+			  "\xab\x90\x65\x8d\x8e\xca\x4d\x4f"
+			  "\x16\x0c\x40\x90\x4b\xc7\x36\x73",
+		.alen	= 32,
+		.input	= "\xf5\xc6\x7d\x48\xc1\xb7\xe6\x92"
+			  "\x97\x5a\xca\xc4\xa9\x6d\xf9\x3d"
+			  "\x6c\xde\xbc\xf1\x90\xea\x6a\xb2"
+			  "\x35\x86\x36\xaf\x5c\xfe\x4b\x3a",
+		.ilen	= 32,
+		.result	= "\x83\x6f\x40\x87\x72\xcf\xc1\x13"
+			  "\xef\xbb\x80\x21\x04\x6c\x58\x09"
+			  "\x07\x1b\xfc\xdf\xc0\x3f\x5b\xc7"
+			  "\xe0\x79\xa8\x6e\x71\x7c\x3f\xcf"
+			  "\x5c\xda\xb2\x33\xe5\x13\xe2\x0d"
+			  "\x74\xd1\xef\xb5\x0f\x3a\xb5\xf8",
+		.rlen	= 48,
+	},
+};
+
+static struct aead_testvec aes_ccm_rfc4309_dec_tv_template[] = {
+	{
+		.key	= "\xab\x2f\x8a\x74\xb7\x1c\xd2\xb1"
+			  "\xff\x80\x2e\x48\x7d\x82\xf8\xb9"
+			  "\xc6\xfb\x7d",
+		.klen	= 19,
+		.iv	= "\x80\x0d\x13\xab\xd8\xa6\xb2\xd8",
+		.alen	= 0,
+		.input	= "\xd5\xe8\x93\x9f\xc7\x89\x2e\x2b",
+		.ilen	= 8,
+		.result	= "\x00",
+		.rlen	= 0,
+		.novrfy	= 1,
+	}, {
+		.key	= "\xab\x2f\x8a\x74\xb7\x1c\xd2\xb1"
+			  "\xff\x80\x2e\x48\x7d\x82\xf8\xb9"
+			  "\xaf\x94\x87",
+		.klen	= 19,
+		.iv	= "\x78\x35\x82\x81\x7f\x88\x94\x68",
+		.alen	= 0,
+		.input	= "\x41\x3c\xb8\x87\x73\xcb\xf3\xf3",
+		.ilen	= 8,
+		.result	= "\x00",
+		.rlen	= 0,
+	}, {
+		.key	= "\x61\x0e\x8c\xae\xe3\x23\xb6\x38"
+			  "\x76\x1c\xf6\x3a\x67\xa3\x9c\xd8"
+			  "\xc6\xfb\x7d",
+		.klen	= 19,
+		.iv	= "\x80\x0d\x13\xab\xd8\xa6\xb2\xd8",
+		.assoc	= "\xf3\x94\x87\x78\x35\x82\x81\x7f"
+			  "\x88\x94\x68\xb1\x78\x6b\x2b\xd6"
+			  "\x04\x1f\x4e\xed\x78\xd5\x33\x66"
+			  "\xd8\x94\x99\x91\x81\x54\x62\x57",
+		.alen	= 32,
+		.input	= "\xf0\x7c\x29\x02\xae\x1c\x2f\x55"
+			  "\xd0\xd1\x3d\x1a\xa3\x6d\xe4\x0a"
+			  "\x86\xb0\x87\x6b\x62\x33\x8c\x34"
+			  "\xce\xab\x57\xcc\x79\x0b\xe0\x6f"
+			  "\x5c\x3e\x48\x1f\x6c\x46\xf7\x51"
+			  "\x8b\x84\x83\x2a\xc1\x05\xb8\xc5",
+		.ilen	= 48,
+		.result	= "\x50\x82\x3e\x07\xe2\x1e\xb6\xfb"
+			  "\x33\xe4\x73\xce\xd2\xfb\x95\x79"
+			  "\xe8\xb4\xb5\x77\x11\x10\x62\x6f"
+			  "\x6a\x82\xd1\x13\xec\xf5\xd0\x48",
+		.rlen	= 32,
+		.novrfy	= 1,
+	}, {
+		.key	= "\x61\x0e\x8c\xae\xe3\x23\xb6\x38"
+			  "\x76\x1c\xf6\x3a\x67\xa3\x9c\xd8"
+			  "\x05\xe0\xc9",
+		.klen	= 19,
+		.iv	= "\x0f\xed\x34\xea\x97\xd4\x3b\xdf",
+		.assoc	= "\x49\x5c\x50\x1f\x1d\x94\xcc\x81"
+			  "\xba\xb7\xb6\x03\xaf\xa5\xc1\xa1"
+			  "\xd8\x5c\x42\x68\xe0\x6c\xda\x89"
+			  "\x05\xac\x56\xac\x1b\x2a\xd3\x86",
+		.alen	= 32,
+		.input	= "\x39\xbe\x7d\x15\x62\x77\xf3\x3c"
+			  "\xad\x83\x52\x6d\x71\x03\x25\x1c"
+			  "\xed\x81\x3a\x9a\x16\x7d\x19\x80"
+			  "\x72\x04\x72\xd0\xf6\xff\x05\x0f"
+			  "\xb7\x14\x30\x00\x32\x9e\xa0\xa6"
+			  "\x9e\x5a\x18\xa1\xb8\xfe\xdb\xd3",
+		.ilen	= 48,
+		.result	= "\x75\x05\xbe\xc2\xd9\x1e\xde\x60"
+			  "\x47\x3d\x8c\x7d\xbd\xb5\xd9\xb7"
+			  "\xf2\xae\x61\x05\x8f\x82\x24\x3f"
+			  "\x9c\x67\x91\xe1\x38\x4f\xe4\x0c",
+		.rlen	= 32,
+	}, {
+		.key	= "\x39\xbb\xa7\xbe\x59\x97\x9e\x73"
+			  "\xa2\xbc\x6b\x98\xd7\x75\x7f\xe3"
+			  "\xa4\x48\x93\x39\x26\x71\x4a\xc6"
+			  "\xee\x49\x83",
+		.klen	= 27,
+		.iv	= "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e",
+		.assoc	= "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1"
+			  "\x58\x7c\xf2\x5c\x6d\x39\x0a\x64"
+			  "\xa4\xf0\x13\x05\xd1\x77\x99\x67"
+			  "\x11\xc4\xc6\xdb\x00\x56\x36\x61",
+		.alen	= 32,
+		.input	= "\x71\x99\xfa\xf4\x44\x12\x68\x9b",
+		.ilen	= 8,
+		.result	= "\x00",
+		.rlen	= 0,
+	}, {
+		.key	= "\x58\x5d\xa0\x96\x65\x1a\x04\xd7"
+			  "\x96\xe5\xc5\x68\xaa\x95\x35\xe0"
+			  "\x29\xa0\xba\x9e\x48\x78\xd1\xba"
+			  "\xee\x49\x83",
+		.klen	= 27,
+		.iv	= "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e",
+		.assoc	= "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1"
+			  "\x58\x7c\xf2\x5c\x6d\x39\x0a\x64"
+			  "\xa4\xf0\x13\x05\xd1\x77\x99\x67"
+			  "\x11\xc4\xc6\xdb\x00\x56\x36\x61",
+		.alen	= 32,
+		.input	= "\xfb\xe5\x5d\x34\xbe\xe5\xe8\xe7"
+			  "\x5a\xef\x2f\xbf\x1f\x7f\xd4\xb2"
+			  "\x66\xca\x61\x1e\x96\x7a\x61\xb3"
+			  "\x1c\x16\x45\x52\xba\x04\x9c\x9f"
+			  "\xb1\xd2\x40\xbc\x52\x7c\x6f\xb1",
+		.ilen	= 40,
+		.result	= "\x85\x34\x66\x42\xc8\x92\x0f\x36"
+			  "\x58\xe0\x6b\x91\x3c\x98\x5c\xbb"
+			  "\x0a\x85\xcc\x02\xad\x7a\x96\xe9"
+			  "\x65\x43\xa4\xc3\x0f\xdc\x55\x81",
+		.rlen	= 32,
+	}, {
+		.key	= "\x58\x5d\xa0\x96\x65\x1a\x04\xd7"
+			  "\x96\xe5\xc5\x68\xaa\x95\x35\xe0"
+			  "\x29\xa0\xba\x9e\x48\x78\xd1\xba"
+			  "\xd1\xfc\x57",
+		.klen	= 27,
+		.iv	= "\x9c\xfe\xb8\x9c\xad\x71\xaa\x1f",
+		.assoc	= "\x86\x67\xa5\xa9\x14\x5f\x0d\xc6"
+			  "\xff\x14\xc7\x44\xbf\x6c\x3a\xc3"
+			  "\xff\xb6\x81\xbd\xe2\xd5\x06\xc7"
+			  "\x3c\xa1\x52\x13\x03\x8a\x23\x3a",
+		.alen	= 32,
+		.input	= "\x3f\x66\xb0\x9d\xe5\x4b\x38\x00"
+			  "\xc6\x0e\x6e\xe5\xd6\x98\xa6\x37"
+			  "\x8c\x26\x33\xc6\xb2\xa2\x17\xfa"
+			  "\x64\x19\xc0\x30\xd7\xfc\x14\x6b"
+			  "\xe3\x33\xc2\x04\xb0\x37\xbe\x3f"
+			  "\xa9\xb4\x2d\x68\x03\xa3\x44\xef",
+		.ilen	= 48,
+		.result	= "\x02\x87\x4d\x28\x80\x6e\xb2\xed"
+			  "\x99\x2a\xa8\xca\x04\x25\x45\x90"
+			  "\x1d\xdd\x5a\xd9\xe4\xdb\x9c\x9c"
+			  "\x49\xe9\x01\xfe\xa7\x80\x6d\x6b",
+		.rlen	= 32,
+		.novrfy	= 1,
+	}, {
+		.key	= "\xa4\x4b\x54\x29\x0a\xb8\x6d\x01"
+			  "\x5b\x80\x2a\xcf\x25\xc4\xb7\x5c"
+			  "\x20\x2c\xad\x30\xc2\x2b\x41\xfb"
+			  "\x0e\x85\xbc\x33\xad\x0f\x2b\xff"
+			  "\xee\x49\x83",
+		.klen	= 35,
+		.iv	= "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e",
+		.alen	= 0,
+		.input	= "\x1f\xb8\x8f\xa3\xdd\x54\x00\xf2",
+		.ilen	= 8,
+		.result	= "\x00",
+		.rlen	= 0,
+	}, {
+		.key	= "\x39\xbb\xa7\xbe\x59\x97\x9e\x73"
+			  "\xa2\xbc\x6b\x98\xd7\x75\x7f\xe3"
+			  "\xa4\x48\x93\x39\x26\x71\x4a\xc6"
+			  "\xae\x8f\x11\x4c\xc2\x9c\x4a\xbb"
+			  "\x85\x34\x66",
+		.klen	= 35,
+		.iv	= "\x42\xc8\x92\x0f\x36\x58\xe0\x6b",
+		.alen	= 0,
+		.input	= "\x48\x01\x5e\x02\x24\x04\x66\x47"
+			  "\xa1\xea\x6f\xaf\xe8\xfc\xfb\xdd"
+			  "\xa5\xa9\x87\x8d\x84\xee\x2e\x77"
+			  "\xbb\x86\xb9\xf5\x5c\x6c\xff\xf6"
+			  "\x72\xc3\x8e\xf7\x70\xb1\xb2\x07"
+			  "\xbc\xa8\xa3\xbd\x83\x7c\x1d\x2a",
+		.ilen	= 48,
+		.result	= "\xdc\x56\xf2\x71\xb0\xb1\xa0\x6c"
+			  "\xf0\x97\x3a\xfb\x6d\xe7\x32\x99"
+			  "\x3e\xaf\x70\x5e\xb2\x4d\xea\x39"
+			  "\x89\xd4\x75\x7a\x63\xb1\xda\x93",
+		.rlen	= 32,
+		.novrfy	= 1,
+	}, {
+		.key	= "\x58\x5d\xa0\x96\x65\x1a\x04\xd7"
+			  "\x96\xe5\xc5\x68\xaa\x95\x35\xe0"
+			  "\x29\xa0\xba\x9e\x48\x78\xd1\xba"
+			  "\x0d\x1a\x53\x3b\xb5\xe3\xf8\x8b"
+			  "\xcf\x76\x3f",
+		.klen	= 35,
+		.iv	= "\xd9\x95\x75\x8f\x44\x89\x40\x7b",
+		.assoc	= "\x8f\x86\x6c\x4d\x1d\xc5\x39\x88"
+			  "\xc8\xf3\x5c\x52\x10\x63\x6f\x2b"
+			  "\x8a\x2a\xc5\x6f\x30\x23\x58\x7b"
+			  "\xfb\x36\x03\x11\xb4\xd9\xf2\xfe",
+		.alen	= 32,
+		.input	= "\x48\x58\xd6\xf3\xad\x63\x58\xbf"
+			  "\xae\xc7\x5e\xae\x83\x8f\x7b\xe4"
+			  "\x78\x5c\x4c\x67\x71\x89\x94\xbf"
+			  "\x47\xf1\x63\x7e\x1c\x59\xbd\xc5"
+			  "\x7f\x44\x0a\x0c\x01\x18\x07\x92"
+			  "\xe1\xd3\x51\xce\x32\x6d\x0c\x5b",
+		.ilen	= 48,
+		.result	= "\xc2\x54\xc8\xde\x78\x87\x77\x40"
+			  "\x49\x71\xe4\xb7\xe7\xcb\x76\x61"
+			  "\x0a\x41\xb9\xe9\xc0\x76\x54\xab"
+			  "\x04\x49\x3b\x19\x93\x57\x25\x5d",
+		.rlen	= 32,
+	},
+};
+
 /* Cast5 test vectors from RFC 2144 */
 #define CAST5_ENC_TEST_VECTORS	3
 #define CAST5_DEC_TEST_VECTORS	3

From 7647d6ce2077d9e1c3d72359f6b4492be129cfe8 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 4 May 2009 19:44:50 +0800
Subject: [PATCH 083/162] crypto: testmgr - Add infrastructure for ansi_cprng
 self-tests

Add some necessary infrastructure to make it possible to run
self-tests for ansi_cprng. The bits are likely very specific
to the ANSI X9.31 CPRNG in AES mode, and thus perhaps should
be named more specifically if/when we grow additional CPRNG
support...

Successfully tested against the cryptodev-2.6 tree and a
Red Hat Enterprise Linux 5.x kernel with the follow-on
patch that adds the actual test vectors.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 crypto/testmgr.h | 12 +++++++
 2 files changed, 101 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 40c10789f7f3..adc54cfd39df 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -19,6 +19,7 @@
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <crypto/rng.h>
 
 #include "internal.h"
 #include "testmgr.h"
@@ -84,6 +85,11 @@ struct hash_test_suite {
 	unsigned int count;
 };
 
+struct cprng_test_suite {
+	struct cprng_testvec *vecs;
+	unsigned int count;
+};
+
 struct alg_test_desc {
 	const char *alg;
 	int (*test)(const struct alg_test_desc *desc, const char *driver,
@@ -95,6 +101,7 @@ struct alg_test_desc {
 		struct comp_test_suite comp;
 		struct pcomp_test_suite pcomp;
 		struct hash_test_suite hash;
+		struct cprng_test_suite cprng;
 	} suite;
 };
 
@@ -1089,6 +1096,68 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 	return 0;
 }
 
+
+static int test_cprng(struct crypto_rng *tfm, struct cprng_testvec *template,
+		      unsigned int tcount)
+{
+	const char *algo = crypto_tfm_alg_driver_name(crypto_rng_tfm(tfm));
+	int err, i, j, seedsize;
+	u8 *seed;
+	char result[32];
+
+	seedsize = crypto_rng_seedsize(tfm);
+
+	seed = kmalloc(seedsize, GFP_KERNEL);
+	if (!seed) {
+		printk(KERN_ERR "alg: cprng: Failed to allocate seed space "
+		       "for %s\n", algo);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < tcount; i++) {
+		memset(result, 0, 32);
+
+		memcpy(seed, template[i].v, template[i].vlen);
+		memcpy(seed + template[i].vlen, template[i].key,
+		       template[i].klen);
+		memcpy(seed + template[i].vlen + template[i].klen,
+		       template[i].dt, template[i].dtlen);
+
+		err = crypto_rng_reset(tfm, seed, seedsize);
+		if (err) {
+			printk(KERN_ERR "alg: cprng: Failed to reset rng "
+			       "for %s\n", algo);
+			goto out;
+		}
+
+		for (j = 0; j < template[i].loops; j++) {
+			err = crypto_rng_get_bytes(tfm, result,
+						   template[i].rlen);
+			if (err != template[i].rlen) {
+				printk(KERN_ERR "alg: cprng: Failed to obtain "
+				       "the correct amount of random data for "
+				       "%s (requested %d, got %d)\n", algo,
+				       template[i].rlen, err);
+				goto out;
+			}
+		}
+
+		err = memcmp(result, template[i].result,
+			     template[i].rlen);
+		if (err) {
+			printk(KERN_ERR "alg: cprng: Test %d failed for %s\n",
+			       i, algo);
+			hexdump(result, template[i].rlen);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	kfree(seed);
+	return err;
+}
+
 static int alg_test_aead(const struct alg_test_desc *desc, const char *driver,
 			 u32 type, u32 mask)
 {
@@ -1288,6 +1357,26 @@ out:
 	return err;
 }
 
+static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver,
+			  u32 type, u32 mask)
+{
+	struct crypto_rng *rng;
+	int err;
+
+	rng = crypto_alloc_rng(driver, type, mask);
+	if (IS_ERR(rng)) {
+		printk(KERN_ERR "alg: cprng: Failed to load transform for %s: "
+		       "%ld\n", driver, PTR_ERR(rng));
+		return PTR_ERR(rng);
+	}
+
+	err = test_cprng(rng, desc->suite.cprng.vecs, desc->suite.cprng.count);
+
+	crypto_free_rng(rng);
+
+	return err;
+}
+
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 5add65196a98..13d5a61d0e77 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -70,6 +70,18 @@ struct aead_testvec {
 	unsigned short rlen;
 };
 
+struct cprng_testvec {
+	char *key;
+	char *dt;
+	char *v;
+	char *result;
+	unsigned char klen;
+	unsigned short dtlen;
+	unsigned short vlen;
+	unsigned short rlen;
+	unsigned short loops;
+};
+
 static char zeroed_string[48];
 
 /*

From e08ca2da39db22da569dc23578103cdc942fe3ac Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 4 May 2009 19:46:29 +0800
Subject: [PATCH 084/162] crypto: testmgr - Add ansi_cprng test vectors

Add ANSI X9.31 Continuous Pseudo-Random Number Generator (AES mode),
aka 'ansi_cprng' test vectors, taken from Appendix B.2.9 and B.2.10
of the NIST RNGVS document, found here:
    http://csrc.nist.gov/groups/STM/cavp/documents/rng/RNGVS.pdf

Successfully tested against both the cryptodev-2.6 tree and a Red
Hat Enterprise Linux 5.4 kernel, via 'modprobe tcrypt mode=150'.

The selection of 150 was semi-arbitrary, didn't seem like it should
go any place in particular, so I started a new range for rng tests.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c  |  4 ++
 crypto/testmgr.c |  9 +++++
 crypto/testmgr.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 0452036b1d45..ea3b8a8db721 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -707,6 +707,10 @@ static void do_test(int m)
 		tcrypt_test("hmac(rmd160)");
 		break;
 
+	case 150:
+		tcrypt_test("ansi_cprng");
+		break;
+
 	case 200:
 		test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index adc54cfd39df..5183ec5a4517 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1380,6 +1380,15 @@ static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver,
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
+		.alg = "ansi_cprng",
+		.test = alg_test_cprng,
+		.suite = {
+			.cprng = {
+				.vecs = ansi_cprng_aes_tv_template,
+				.count = ANSI_CPRNG_AES_TEST_VECTORS
+			}
+		}
+	}, {
 		.alg = "cbc(aes)",
 		.test = alg_test_skcipher,
 		.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 13d5a61d0e77..c1c709b57ddb 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -6208,6 +6208,102 @@ static struct aead_testvec aes_ccm_rfc4309_dec_tv_template[] = {
 	},
 };
 
+/*
+ * ANSI X9.31 Continuous Pseudo-Random Number Generator (AES mode)
+ * test vectors, taken from Appendix B.2.9 and B.2.10:
+ *     http://csrc.nist.gov/groups/STM/cavp/documents/rng/RNGVS.pdf
+ * Only AES-128 is supported at this time.
+ */
+#define ANSI_CPRNG_AES_TEST_VECTORS	6
+
+static struct cprng_testvec ansi_cprng_aes_tv_template[] = {
+	{
+		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
+			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
+		.klen	= 16,
+		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
+			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xf9",
+		.dtlen	= 16,
+		.v	= "\x80\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.vlen	= 16,
+		.result	= "\x59\x53\x1e\xd1\x3b\xb0\xc0\x55"
+			  "\x84\x79\x66\x85\xc1\x2f\x76\x41",
+		.rlen	= 16,
+		.loops	= 1,
+	}, {
+		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
+			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
+		.klen	= 16,
+		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
+			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfa",
+		.dtlen	= 16,
+		.v	= "\xc0\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.vlen	= 16,
+		.result	= "\x7c\x22\x2c\xf4\xca\x8f\xa2\x4c"
+			  "\x1c\x9c\xb6\x41\xa9\xf3\x22\x0d",
+		.rlen	= 16,
+		.loops	= 1,
+	}, {
+		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
+			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
+		.klen	= 16,
+		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
+			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfb",
+		.dtlen	= 16,
+		.v	= "\xe0\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.vlen	= 16,
+		.result	= "\x8a\xaa\x00\x39\x66\x67\x5b\xe5"
+			  "\x29\x14\x28\x81\xa9\x4d\x4e\xc7",
+		.rlen	= 16,
+		.loops	= 1,
+	}, {
+		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
+			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
+		.klen	= 16,
+		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
+			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfc",
+		.dtlen	= 16,
+		.v	= "\xf0\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.vlen	= 16,
+		.result	= "\x88\xdd\xa4\x56\x30\x24\x23\xe5"
+			  "\xf6\x9d\xa5\x7e\x7b\x95\xc7\x3a",
+		.rlen	= 16,
+		.loops	= 1,
+	}, {
+		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
+			  "\xed\x06\x1c\xab\xb8\xd4\x62\x02",
+		.klen	= 16,
+		.dt	= "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62"
+			  "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfd",
+		.dtlen	= 16,
+		.v	= "\xf8\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.vlen	= 16,
+		.result	= "\x05\x25\x92\x46\x61\x79\xd2\xcb"
+			  "\x78\xc4\x0b\x14\x0a\x5a\x9a\xc8",
+		.rlen	= 16,
+		.loops	= 1,
+	}, {	/* Monte Carlo Test */
+		.key	= "\x9f\x5b\x51\x20\x0b\xf3\x34\xb5"
+			  "\xd8\x2b\xe8\xc3\x72\x55\xc8\x48",
+		.klen	= 16,
+		.dt	= "\x63\x76\xbb\xe5\x29\x02\xba\x3b"
+			  "\x67\xc9\x25\xfa\x70\x1f\x11\xac",
+		.dtlen	= 16,
+		.v	= "\x57\x2c\x8e\x76\x87\x26\x47\x97"
+			  "\x7e\x74\xfb\xdd\xc4\x95\x01\xd1",
+		.vlen	= 16,
+		.result	= "\x48\xe9\xbd\x0d\x06\xee\x18\xfb"
+			  "\xe4\x57\x90\xd5\xc3\xfc\x9b\x73",
+		.rlen	= 16,
+		.loops	= 10000,
+	},
+};
+
 /* Cast5 test vectors from RFC 2144 */
 #define CAST5_ENC_TEST_VECTORS	3
 #define CAST5_DEC_TEST_VECTORS	3

From 941fb3287c0c0d84000b669db5450ac4886da640 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 4 May 2009 19:49:23 +0800
Subject: [PATCH 085/162] crypto: testmgr - Catch base cipher self-test
 failures in fips mode

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 5183ec5a4517..e3f9973abbdc 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2220,7 +2220,8 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
 		if (i < 0)
 			goto notest;
 
-		return alg_test_cipher(alg_test_descs + i, driver, type, mask);
+		rc = alg_test_cipher(alg_test_descs + i, driver, type, mask);
+		goto test_done;
 	}
 
 	i = alg_find_test(alg);
@@ -2229,6 +2230,7 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
 
 	rc = alg_test_descs[i].test(alg_test_descs + i, driver,
 				      type, mask);
+test_done:
 	if (fips_enabled && rc)
 		panic("%s: %s alg self test failed in fips mode!\n", driver, alg);
 

From 29ecd4ab3d3aa8bb231361937165dfbbbc534e9a Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 4 May 2009 19:51:17 +0800
Subject: [PATCH 086/162] crypto: testmgr - Print self-test pass notices in
 fips mode

According to our FIPS CAVS testing lab guru, when we're in fips mode,
we must print out notices of successful self-test completion for
every alg to be compliant.

New and improved v2, without strncmp crap. Doesn't need to touch a flag
though, due to not moving the notest label around anymore.

Applies atop '[PATCH v2] crypto: catch base cipher self-test failures
in fips mode'.

Personally, I wouldn't mind seeing this info printed out regardless of
whether or not we're in fips mode, I think its useful info, but will
stick with only in fips mode for now.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index e3f9973abbdc..e76af78d2fa0 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2234,6 +2234,10 @@ test_done:
 	if (fips_enabled && rc)
 		panic("%s: %s alg self test failed in fips mode!\n", driver, alg);
 
+	if (fips_enabled && !rc)
+		printk(KERN_INFO "alg: self-tests for %s (%s) passed\n",
+		       driver, alg);
+
 	return rc;
 
 notest:

From f8b0d4d09dc9d0a73fcdcf6c2724650529ec417d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 6 May 2009 14:15:47 +0800
Subject: [PATCH 087/162] crypto: testmgr - Dynamically allocate xbuf and axbuf

We currently allocate temporary memory that is used for testing
statically.  This renders the testing engine non-reentrant. As
algorithms may nest, i.e., one may construct another in order to
carry out a part of its operation, this is unacceptable.  For
example, it has been reported that an AEAD implementation allocates
a cipher in its setkey function, which causes it to fail during
testing as the temporary memory is overwritten.

This patch replaces the static memory with dynamically allocated
buffers.  We need a maximum of 16 pages so this slightly increases
the chances of an algorithm failing due to memory shortage.
However, as testing usually occurs at registration, this shouldn't
be a big problem.

Reported-by: Shasi Pulijala <spulijala@amcc.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algboss.c  |  18 +-------
 crypto/internal.h |   3 --
 crypto/testmgr.c  | 108 +++++++++++++++++++++++++---------------------
 3 files changed, 61 insertions(+), 68 deletions(-)

diff --git a/crypto/algboss.c b/crypto/algboss.c
index 6906f92aeac0..9908dd830c26 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -280,29 +280,13 @@ static struct notifier_block cryptomgr_notifier = {
 
 static int __init cryptomgr_init(void)
 {
-	int err;
-
-	err = testmgr_init();
-	if (err)
-		return err;
-
-	err = crypto_register_notifier(&cryptomgr_notifier);
-	if (err)
-		goto free_testmgr;
-
-	return 0;
-
-free_testmgr:
-	testmgr_exit();
-	return err;
+	return crypto_register_notifier(&cryptomgr_notifier);
 }
 
 static void __exit cryptomgr_exit(void)
 {
 	int err = crypto_unregister_notifier(&cryptomgr_notifier);
 	BUG_ON(err);
-
-	testmgr_exit();
 }
 
 subsys_initcall(cryptomgr_init);
diff --git a/crypto/internal.h b/crypto/internal.h
index fc76e1f37fc3..113579a82dff 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -121,9 +121,6 @@ int crypto_register_notifier(struct notifier_block *nb);
 int crypto_unregister_notifier(struct notifier_block *nb);
 int crypto_probing_notify(unsigned long val, void *v);
 
-int __init testmgr_init(void);
-void testmgr_exit(void);
-
 static inline void crypto_alg_put(struct crypto_alg *alg)
 {
 	if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy)
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index e76af78d2fa0..c5550943411d 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -107,9 +107,6 @@ struct alg_test_desc {
 
 static unsigned int IDX[8] = { IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 };
 
-static char *xbuf[XBUFSIZE];
-static char *axbuf[XBUFSIZE];
-
 static void hexdump(unsigned char *buf, unsigned int len)
 {
 	print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET,
@@ -128,6 +125,33 @@ static void tcrypt_complete(struct crypto_async_request *req, int err)
 	complete(&res->completion);
 }
 
+static int testmgr_alloc_buf(char *buf[XBUFSIZE])
+{
+	int i;
+
+	for (i = 0; i < XBUFSIZE; i++) {
+		buf[i] = (void *)__get_free_page(GFP_KERNEL);
+		if (!buf[i])
+			goto err_free_buf;
+	}
+
+	return 0;
+
+err_free_buf:
+	while (i-- > 0)
+		free_page((unsigned long)buf[i]);
+
+	return -ENOMEM;
+}
+
+static void testmgr_free_buf(char *buf[XBUFSIZE])
+{
+	int i;
+
+	for (i = 0; i < XBUFSIZE; i++)
+		free_page((unsigned long)buf[i]);
+}
+
 static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 		     unsigned int tcount)
 {
@@ -137,8 +161,12 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 	char result[64];
 	struct ahash_request *req;
 	struct tcrypt_result tresult;
-	int ret;
 	void *hash_buff;
+	char *xbuf[XBUFSIZE];
+	int ret = -ENOMEM;
+
+	if (testmgr_alloc_buf(xbuf))
+		goto out_nobuf;
 
 	init_completion(&tresult.completion);
 
@@ -146,7 +174,6 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 	if (!req) {
 		printk(KERN_ERR "alg: hash: Failed to allocate request for "
 		       "%s\n", algo);
-		ret = -ENOMEM;
 		goto out_noreq;
 	}
 	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
@@ -272,6 +299,8 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 out:
 	ahash_request_free(req);
 out_noreq:
+	testmgr_free_buf(xbuf);
+out_nobuf:
 	return ret;
 }
 
@@ -280,7 +309,7 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm));
 	unsigned int i, j, k, n, temp;
-	int ret = 0;
+	int ret = -ENOMEM;
 	char *q;
 	char *key;
 	struct aead_request *req;
@@ -292,6 +321,13 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 	void *input;
 	void *assoc;
 	char iv[MAX_IVLEN];
+	char *xbuf[XBUFSIZE];
+	char *axbuf[XBUFSIZE];
+
+	if (testmgr_alloc_buf(xbuf))
+		goto out_noxbuf;
+	if (testmgr_alloc_buf(axbuf))
+		goto out_noaxbuf;
 
 	if (enc == ENCRYPT)
 		e = "encryption";
@@ -304,7 +340,6 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 	if (!req) {
 		printk(KERN_ERR "alg: aead: Failed to allocate request for "
 		       "%s\n", algo);
-		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -581,6 +616,10 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 
 out:
 	aead_request_free(req);
+	testmgr_free_buf(axbuf);
+out_noaxbuf:
+	testmgr_free_buf(xbuf);
+out_noxbuf:
 	return ret;
 }
 
@@ -589,10 +628,14 @@ static int test_cipher(struct crypto_cipher *tfm, int enc,
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_cipher_tfm(tfm));
 	unsigned int i, j, k;
-	int ret;
 	char *q;
 	const char *e;
 	void *data;
+	char *xbuf[XBUFSIZE];
+	int ret = -ENOMEM;
+
+	if (testmgr_alloc_buf(xbuf))
+		goto out_nobuf;
 
 	if (enc == ENCRYPT)
 	        e = "encryption";
@@ -646,6 +689,8 @@ static int test_cipher(struct crypto_cipher *tfm, int enc,
 	ret = 0;
 
 out:
+	testmgr_free_buf(xbuf);
+out_nobuf:
 	return ret;
 }
 
@@ -655,7 +700,6 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 	const char *algo =
 		crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm));
 	unsigned int i, j, k, n, temp;
-	int ret;
 	char *q;
 	struct ablkcipher_request *req;
 	struct scatterlist sg[8];
@@ -663,6 +707,11 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 	struct tcrypt_result result;
 	void *data;
 	char iv[MAX_IVLEN];
+	char *xbuf[XBUFSIZE];
+	int ret = -ENOMEM;
+
+	if (testmgr_alloc_buf(xbuf))
+		goto out_nobuf;
 
 	if (enc == ENCRYPT)
 	        e = "encryption";
@@ -675,7 +724,6 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 	if (!req) {
 		printk(KERN_ERR "alg: skcipher: Failed to allocate request "
 		       "for %s\n", algo);
-		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -860,6 +908,8 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 
 out:
 	ablkcipher_request_free(req);
+	testmgr_free_buf(xbuf);
+out_nobuf:
 	return ret;
 }
 
@@ -2245,41 +2295,3 @@ notest:
 	return 0;
 }
 EXPORT_SYMBOL_GPL(alg_test);
-
-int __init testmgr_init(void)
-{
-	int i;
-
-	for (i = 0; i < XBUFSIZE; i++) {
-		xbuf[i] = (void *)__get_free_page(GFP_KERNEL);
-		if (!xbuf[i])
-			goto err_free_xbuf;
-	}
-
-	for (i = 0; i < XBUFSIZE; i++) {
-		axbuf[i] = (void *)__get_free_page(GFP_KERNEL);
-		if (!axbuf[i])
-			goto err_free_axbuf;
-	}
-
-	return 0;
-
-err_free_axbuf:
-	for (i = 0; i < XBUFSIZE && axbuf[i]; i++)
-		free_page((unsigned long)axbuf[i]);
-err_free_xbuf:
-	for (i = 0; i < XBUFSIZE && xbuf[i]; i++)
-		free_page((unsigned long)xbuf[i]);
-
-	return -ENOMEM;
-}
-
-void testmgr_exit(void)
-{
-	int i;
-
-	for (i = 0; i < XBUFSIZE; i++)
-		free_page((unsigned long)axbuf[i]);
-	for (i = 0; i < XBUFSIZE; i++)
-		free_page((unsigned long)xbuf[i]);
-}

From f7cb80f2b9fa06730be20d17c80b12e511a36c1c Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Wed, 6 May 2009 17:29:17 +0800
Subject: [PATCH 088/162] crypto: testmgr - Add ctr(aes) test vectors

Now with multi-block test vectors, all from SP800-38A, Appendix F.5.
Also added ctr(aes) to case 10 in tcrypt.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c  |   1 +
 crypto/testmgr.c |  23 +++++--
 crypto/testmgr.h | 166 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 183 insertions(+), 7 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index ea3b8a8db721..9e4974eb7825 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -526,6 +526,7 @@ static void do_test(int m)
 		tcrypt_test("cbc(aes)");
 		tcrypt_test("lrw(aes)");
 		tcrypt_test("xts(aes)");
+		tcrypt_test("ctr(aes)");
 		tcrypt_test("rfc3686(ctr(aes))");
 		break;
 
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index c5550943411d..f4cc1780aee2 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1567,6 +1567,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				.count = CRC32C_TEST_VECTORS
 			}
 		}
+	}, {
+		.alg = "ctr(aes)",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = aes_ctr_enc_tv_template,
+					.count = AES_CTR_ENC_TEST_VECTORS
+				},
+				.dec = {
+					.vecs = aes_ctr_dec_tv_template,
+					.count = AES_CTR_DEC_TEST_VECTORS
+				}
+			}
+		}
 	}, {
 		.alg = "cts(cbc(aes))",
 		.test = alg_test_skcipher,
@@ -2017,12 +2032,12 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.cipher = {
 				.enc = {
-					.vecs = aes_ctr_enc_tv_template,
-					.count = AES_CTR_ENC_TEST_VECTORS
+					.vecs = aes_ctr_rfc3686_enc_tv_template,
+					.count = AES_CTR_3686_ENC_TEST_VECTORS
 				},
 				.dec = {
-					.vecs = aes_ctr_dec_tv_template,
-					.count = AES_CTR_DEC_TEST_VECTORS
+					.vecs = aes_ctr_rfc3686_dec_tv_template,
+					.count = AES_CTR_3686_DEC_TEST_VECTORS
 				}
 			}
 		}
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index c1c709b57ddb..69316228fc19 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -2854,8 +2854,10 @@ static struct cipher_testvec cast6_dec_tv_template[] = {
 #define AES_LRW_DEC_TEST_VECTORS 8
 #define AES_XTS_ENC_TEST_VECTORS 4
 #define AES_XTS_DEC_TEST_VECTORS 4
-#define AES_CTR_ENC_TEST_VECTORS 7
-#define AES_CTR_DEC_TEST_VECTORS 6
+#define AES_CTR_ENC_TEST_VECTORS 3
+#define AES_CTR_DEC_TEST_VECTORS 3
+#define AES_CTR_3686_ENC_TEST_VECTORS 7
+#define AES_CTR_3686_DEC_TEST_VECTORS 6
 #define AES_GCM_ENC_TEST_VECTORS 9
 #define AES_GCM_DEC_TEST_VECTORS 8
 #define AES_CCM_ENC_TEST_VECTORS 7
@@ -3998,6 +4000,164 @@ static struct cipher_testvec aes_xts_dec_tv_template[] = {
 
 
 static struct cipher_testvec aes_ctr_enc_tv_template[] = {
+	{ /* From NIST Special Publication 800-38A, Appendix F.5 */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+			  "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+		.input	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ilen	= 64,
+		.result	= "\x87\x4d\x61\x91\xb6\x20\xe3\x26"
+			  "\x1b\xef\x68\x64\x99\x0d\xb6\xce"
+			  "\x98\x06\xf6\x6b\x79\x70\xfd\xff"
+			  "\x86\x17\x18\x7b\xb9\xff\xfd\xff"
+			  "\x5a\xe4\xdf\x3e\xdb\xd5\xd3\x5e"
+			  "\x5b\x4f\x09\x02\x0d\xb0\x3e\xab"
+			  "\x1e\x03\x1d\xda\x2f\xbe\x03\xd1"
+			  "\x79\x21\x70\xa0\xf3\x00\x9c\xee",
+		.rlen	= 64,
+	}, {
+		.key	= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+			  "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+			  "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+		.klen	= 24,
+		.iv	= "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+			  "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+		.input	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ilen	= 64,
+		.result	= "\x1a\xbc\x93\x24\x17\x52\x1c\xa2"
+			  "\x4f\x2b\x04\x59\xfe\x7e\x6e\x0b"
+			  "\x09\x03\x39\xec\x0a\xa6\xfa\xef"
+			  "\xd5\xcc\xc2\xc6\xf4\xce\x8e\x94"
+			  "\x1e\x36\xb2\x6b\xd1\xeb\xc6\x70"
+			  "\xd1\xbd\x1d\x66\x56\x20\xab\xf7"
+			  "\x4f\x78\xa7\xf6\xd2\x98\x09\x58"
+			  "\x5a\x97\xda\xec\x58\xc6\xb0\x50",
+		.rlen	= 64,
+	}, {
+		.key	= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+			  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+			  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+			  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.klen	= 32,
+		.iv	= "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+			  "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+		.input	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ilen	= 64,
+		.result	= "\x60\x1e\xc3\x13\x77\x57\x89\xa5"
+			  "\xb7\xa7\xf5\x04\xbb\xf3\xd2\x28"
+			  "\xf4\x43\xe3\xca\x4d\x62\xb5\x9a"
+			  "\xca\x84\xe9\x90\xca\xca\xf5\xc5"
+			  "\x2b\x09\x30\xda\xa2\x3d\xe9\x4c"
+			  "\xe8\x70\x17\xba\x2d\x84\x98\x8d"
+			  "\xdf\xc9\xc5\x8d\xb6\x7a\xad\xa6"
+			  "\x13\xc2\xdd\x08\x45\x79\x41\xa6",
+		.rlen	= 64,
+	}
+};
+
+static struct cipher_testvec aes_ctr_dec_tv_template[] = {
+	{ /* From NIST Special Publication 800-38A, Appendix F.5 */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+			  "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+		.input	= "\x87\x4d\x61\x91\xb6\x20\xe3\x26"
+			  "\x1b\xef\x68\x64\x99\x0d\xb6\xce"
+			  "\x98\x06\xf6\x6b\x79\x70\xfd\xff"
+			  "\x86\x17\x18\x7b\xb9\xff\xfd\xff"
+			  "\x5a\xe4\xdf\x3e\xdb\xd5\xd3\x5e"
+			  "\x5b\x4f\x09\x02\x0d\xb0\x3e\xab"
+			  "\x1e\x03\x1d\xda\x2f\xbe\x03\xd1"
+			  "\x79\x21\x70\xa0\xf3\x00\x9c\xee",
+		.ilen	= 64,
+		.result	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.rlen	= 64,
+	}, {
+		.key	= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+			  "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+			  "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+		.klen	= 24,
+		.iv	= "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+			  "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+		.input	= "\x1a\xbc\x93\x24\x17\x52\x1c\xa2"
+			  "\x4f\x2b\x04\x59\xfe\x7e\x6e\x0b"
+			  "\x09\x03\x39\xec\x0a\xa6\xfa\xef"
+			  "\xd5\xcc\xc2\xc6\xf4\xce\x8e\x94"
+			  "\x1e\x36\xb2\x6b\xd1\xeb\xc6\x70"
+			  "\xd1\xbd\x1d\x66\x56\x20\xab\xf7"
+			  "\x4f\x78\xa7\xf6\xd2\x98\x09\x58"
+			  "\x5a\x97\xda\xec\x58\xc6\xb0\x50",
+		.ilen	= 64,
+		.result	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.rlen	= 64,
+	}, {
+		.key	= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+			  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+			  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+			  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.klen	= 32,
+		.iv	= "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+			  "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+		.input	= "\x60\x1e\xc3\x13\x77\x57\x89\xa5"
+			  "\xb7\xa7\xf5\x04\xbb\xf3\xd2\x28"
+			  "\xf4\x43\xe3\xca\x4d\x62\xb5\x9a"
+			  "\xca\x84\xe9\x90\xca\xca\xf5\xc5"
+			  "\x2b\x09\x30\xda\xa2\x3d\xe9\x4c"
+			  "\xe8\x70\x17\xba\x2d\x84\x98\x8d"
+			  "\xdf\xc9\xc5\x8d\xb6\x7a\xad\xa6"
+			  "\x13\xc2\xdd\x08\x45\x79\x41\xa6",
+		.ilen	= 64,
+		.result	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.rlen	= 64,
+	}
+};
+
+static struct cipher_testvec aes_ctr_rfc3686_enc_tv_template[] = {
 	{ /* From RFC 3686 */
 		.key	= "\xae\x68\x52\xf8\x12\x10\x67\xcc"
 			  "\x4b\xf7\xa5\x76\x55\x77\xf3\x9e"
@@ -5129,7 +5289,7 @@ static struct cipher_testvec aes_ctr_enc_tv_template[] = {
 	},
 };
 
-static struct cipher_testvec aes_ctr_dec_tv_template[] = {
+static struct cipher_testvec aes_ctr_rfc3686_dec_tv_template[] = {
 	{ /* From RFC 3686 */
 		.key	= "\xae\x68\x52\xf8\x12\x10\x67\xcc"
 			  "\x4b\xf7\xa5\x76\x55\x77\xf3\x9e"

From a1915d51e8e7ee192d2101d621d425379088cbb0 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 15 May 2009 15:16:03 +1000
Subject: [PATCH 089/162] crypto: testmgr - Mark algs allowed in fips mode

Set the fips_allowed flag in testmgr.c's alg_test_descs[] for algs
that are allowed to be used when in fips mode.

One caveat: des isn't actually allowed anymore, but des (and thus also
ecb(des)) has to be permitted, because disallowing them results in
des3_ede being unable to properly register (see des module init func).

Also, crc32 isn't technically on the fips approved list, but I think
it gets used in various places that necessitate it being allowed.

This list is based on
http://csrc.nist.gov/groups/STM/cavp/index.html

Important note: allowed/approved here does NOT mean "validated", just
that its an alg that *could* be validated.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index f4cc1780aee2..51bae62c332a 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -94,6 +94,7 @@ struct alg_test_desc {
 	const char *alg;
 	int (*test)(const struct alg_test_desc *desc, const char *driver,
 		    u32 type, u32 mask);
+	int fips_allowed;	/* set if alg is allowed in fips mode */
 
 	union {
 		struct aead_test_suite aead;
@@ -1432,6 +1433,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	{
 		.alg = "ansi_cprng",
 		.test = alg_test_cprng,
+		.fips_allowed = 1,
 		.suite = {
 			.cprng = {
 				.vecs = ansi_cprng_aes_tv_template,
@@ -1441,6 +1443,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cbc(aes)",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -1516,6 +1519,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cbc(des3_ede)",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -1546,6 +1550,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "ccm(aes)",
 		.test = alg_test_aead,
+		.fips_allowed = 1,
 		.suite = {
 			.aead = {
 				.enc = {
@@ -1561,6 +1566,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "crc32c",
 		.test = alg_test_crc32c,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = crc32c_tv_template,
@@ -1570,6 +1576,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "ctr(aes)",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -1615,6 +1622,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "ecb(aes)",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -1720,6 +1728,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "ecb(des)",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -1735,6 +1744,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "ecb(des3_ede)",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -1870,6 +1880,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "gcm(aes)",
 		.test = alg_test_aead,
+		.fips_allowed = 1,
 		.suite = {
 			.aead = {
 				.enc = {
@@ -1912,6 +1923,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "hmac(sha1)",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = hmac_sha1_tv_template,
@@ -1921,6 +1933,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "hmac(sha224)",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = hmac_sha224_tv_template,
@@ -1930,6 +1943,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "hmac(sha256)",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = hmac_sha256_tv_template,
@@ -1939,6 +1953,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "hmac(sha384)",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = hmac_sha384_tv_template,
@@ -1948,6 +1963,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "hmac(sha512)",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = hmac_sha512_tv_template,
@@ -2029,6 +2045,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "rfc3686(ctr(aes))",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
 				.enc = {
@@ -2044,6 +2061,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "rfc4309(ccm(aes))",
 		.test = alg_test_aead,
+		.fips_allowed = 1,
 		.suite = {
 			.aead = {
 				.enc = {
@@ -2106,6 +2124,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "sha1",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = sha1_tv_template,
@@ -2115,6 +2134,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "sha224",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = sha224_tv_template,
@@ -2124,6 +2144,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "sha256",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = sha256_tv_template,
@@ -2133,6 +2154,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "sha384",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = sha384_tv_template,
@@ -2142,6 +2164,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "sha512",
 		.test = alg_test_hash,
+		.fips_allowed = 1,
 		.suite = {
 			.hash = {
 				.vecs = sha512_tv_template,

From a3bef3a31a19bd943047ba8bf5b2cc7b5d164362 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 15 May 2009 15:17:05 +1000
Subject: [PATCH 090/162] crypto: testmgr - Skip algs not flagged fips_allowed
 in fips mode

Because all fips-allowed algorithms must be self-tested before they
can be used, they will all have entries in testmgr.c's alg_test_descs[].
Skip self-tests for any algs not flagged as fips_approved and return
-EINVAL when in fips mode.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 51bae62c332a..f93b26d0fcfb 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2308,6 +2308,9 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
 		if (i < 0)
 			goto notest;
 
+		if (fips_enabled && !alg_test_descs[i].fips_allowed)
+			goto non_fips_alg;
+
 		rc = alg_test_cipher(alg_test_descs + i, driver, type, mask);
 		goto test_done;
 	}
@@ -2316,6 +2319,9 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
 	if (i < 0)
 		goto notest;
 
+	if (fips_enabled && !alg_test_descs[i].fips_allowed)
+		goto non_fips_alg;
+
 	rc = alg_test_descs[i].test(alg_test_descs + i, driver,
 				      type, mask);
 test_done:
@@ -2331,5 +2337,7 @@ test_done:
 notest:
 	printk(KERN_INFO "alg: No test for %s (%s)\n", alg, driver);
 	return 0;
+non_fips_alg:
+	return -EINVAL;
 }
 EXPORT_SYMBOL_GPL(alg_test);

From 608d1cd5d375580a49d01b5ed1f9944f5141ae19 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Fri, 15 May 2009 15:57:35 +1000
Subject: [PATCH 091/162] hwrng: via_rng - The VIA Hardware RNG driver is for
 the CPU, not Chipset

This is a cosmetic change, fixing the MODULE_DESCRIPTION() of via-rng.c

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/via-rng.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c
index 4e9573c1d39e..02ee63906713 100644
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -205,5 +205,5 @@ static void __exit mod_exit(void)
 module_init(mod_init);
 module_exit(mod_exit);
 
-MODULE_DESCRIPTION("H/W RNG driver for VIA chipsets");
+MODULE_DESCRIPTION("H/W RNG driver for VIA CPU with PadLock");
 MODULE_LICENSE("GPL");

From 858576bdc5d65edf1fffd2e65b2165ec1dc68486 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Fri, 15 May 2009 16:00:32 +1000
Subject: [PATCH 092/162] hwrng: via_rng - Support VIA Nano hardware RNG

The VIA Nano CPU supports the same XSTORE instruction based RNG,
but it lacks the MSR present in earlier CPUs.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/via-rng.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c
index 02ee63906713..794aacb715c1 100644
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -132,6 +132,19 @@ static int via_rng_init(struct hwrng *rng)
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	u32 lo, hi, old_lo;
 
+	/* VIA Nano CPUs don't have the MSR_VIA_RNG anymore.  The RNG
+	 * is always enabled if CPUID rng_en is set.  There is no
+	 * RNG configuration like it used to be the case in this
+	 * register */
+	if ((c->x86 == 6) && (c->x86_model >= 0x0f)) {
+		if (!cpu_has_xstore_enabled) {
+			printk(KERN_ERR PFX "can't enable hardware RNG "
+				"if XSTORE is not enabled\n");
+			return -ENODEV;
+		}
+		return 0;
+	}
+
 	/* Control the RNG via MSR.  Tread lightly and pay very close
 	 * close attention to values written, as the reserved fields
 	 * are documented to be "undefined and unpredictable"; but it

From e9736c16da9077728802f42393d18258e6685428 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Fri, 15 May 2009 16:01:52 +1000
Subject: [PATCH 093/162] hwrng: via_rng - Support VIA Nano hardware RNG on
 X86_64 builds

Fix Kconfig to build via-rng.ko on X86_64 builds, as the VIA Nano
CPU supports x86_64, too.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 5fab6470f4b2..9c00440dcf86 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -88,7 +88,7 @@ config HW_RANDOM_N2RNG
 
 config HW_RANDOM_VIA
 	tristate "VIA HW Random Number Generator support"
-	depends on HW_RANDOM && X86_32
+	depends on HW_RANDOM && X86
 	default HW_RANDOM
 	---help---
 	  This driver provides kernel-side support for the Random Number

From 3ce858cb04de8bc83449eac707c8012a1944daca Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Wed, 27 May 2009 15:05:02 +1000
Subject: [PATCH 094/162] crypto: compress - Return produced bytes in
 crypto_{,de}compress_{update,final}

If crypto_{,de}compress_{update,final}() succeed, return the actual number of
bytes produced instead of zero, so their users don't have to calculate that
theirselves.

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 117 ++++++++++++++++++++++++++++-------------------
 crypto/zlib.c    |  24 +++++-----
 2 files changed, 82 insertions(+), 59 deletions(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index f93b26d0fcfb..376ea88158b9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1002,24 +1002,25 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 	const char *algo = crypto_tfm_alg_driver_name(crypto_pcomp_tfm(tfm));
 	unsigned int i;
 	char result[COMP_BUF_SIZE];
-	int error;
+	int res;
 
 	for (i = 0; i < ctcount; i++) {
 		struct comp_request req;
+		unsigned int produced = 0;
 
-		error = crypto_compress_setup(tfm, ctemplate[i].params,
-					      ctemplate[i].paramsize);
-		if (error) {
+		res = crypto_compress_setup(tfm, ctemplate[i].params,
+					    ctemplate[i].paramsize);
+		if (res) {
 			pr_err("alg: pcomp: compression setup failed on test "
-			       "%d for %s: error=%d\n", i + 1, algo, error);
-			return error;
+			       "%d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
 
-		error = crypto_compress_init(tfm);
-		if (error) {
+		res = crypto_compress_init(tfm);
+		if (res) {
 			pr_err("alg: pcomp: compression init failed on test "
-			       "%d for %s: error=%d\n", i + 1, algo, error);
-			return error;
+			       "%d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
 
 		memset(result, 0, sizeof(result));
@@ -1029,32 +1030,37 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 		req.next_out = result;
 		req.avail_out = ctemplate[i].outlen / 2;
 
-		error = crypto_compress_update(tfm, &req);
-		if (error && (error != -EAGAIN || req.avail_in)) {
+		res = crypto_compress_update(tfm, &req);
+		if (res < 0 && (res != -EAGAIN || req.avail_in)) {
 			pr_err("alg: pcomp: compression update failed on test "
-			       "%d for %s: error=%d\n", i + 1, algo, error);
-			return error;
+			       "%d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
+		if (res > 0)
+			produced += res;
 
 		/* Add remaining input data */
 		req.avail_in += (ctemplate[i].inlen + 1) / 2;
 
-		error = crypto_compress_update(tfm, &req);
-		if (error && (error != -EAGAIN || req.avail_in)) {
+		res = crypto_compress_update(tfm, &req);
+		if (res < 0 && (res != -EAGAIN || req.avail_in)) {
 			pr_err("alg: pcomp: compression update failed on test "
-			       "%d for %s: error=%d\n", i + 1, algo, error);
-			return error;
+			       "%d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
+		if (res > 0)
+			produced += res;
 
 		/* Provide remaining output space */
 		req.avail_out += COMP_BUF_SIZE - ctemplate[i].outlen / 2;
 
-		error = crypto_compress_final(tfm, &req);
-		if (error) {
+		res = crypto_compress_final(tfm, &req);
+		if (res < 0) {
 			pr_err("alg: pcomp: compression final failed on test "
-			       "%d for %s: error=%d\n", i + 1, algo, error);
-			return error;
+			       "%d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
+		produced += res;
 
 		if (COMP_BUF_SIZE - req.avail_out != ctemplate[i].outlen) {
 			pr_err("alg: comp: Compression test %d failed for %s: "
@@ -1064,6 +1070,13 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 			return -EINVAL;
 		}
 
+		if (produced != ctemplate[i].outlen) {
+			pr_err("alg: comp: Compression test %d failed for %s: "
+			       "returned len = %u (expected %d)\n", i + 1,
+			       algo, produced, ctemplate[i].outlen);
+			return -EINVAL;
+		}
+
 		if (memcmp(result, ctemplate[i].output, ctemplate[i].outlen)) {
 			pr_err("alg: pcomp: Compression test %d failed for "
 			       "%s\n", i + 1, algo);
@@ -1074,21 +1087,21 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 
 	for (i = 0; i < dtcount; i++) {
 		struct comp_request req;
+		unsigned int produced = 0;
 
-		error = crypto_decompress_setup(tfm, dtemplate[i].params,
-						dtemplate[i].paramsize);
-		if (error) {
+		res = crypto_decompress_setup(tfm, dtemplate[i].params,
+					      dtemplate[i].paramsize);
+		if (res) {
 			pr_err("alg: pcomp: decompression setup failed on "
-			       "test %d for %s: error=%d\n", i + 1, algo,
-			       error);
-			return error;
+			       "test %d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
 
-		error = crypto_decompress_init(tfm);
-		if (error) {
+		res = crypto_decompress_init(tfm);
+		if (res) {
 			pr_err("alg: pcomp: decompression init failed on test "
-			       "%d for %s: error=%d\n", i + 1, algo, error);
-			return error;
+			       "%d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
 
 		memset(result, 0, sizeof(result));
@@ -1098,35 +1111,38 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 		req.next_out = result;
 		req.avail_out = dtemplate[i].outlen / 2;
 
-		error = crypto_decompress_update(tfm, &req);
-		if (error  && (error != -EAGAIN || req.avail_in)) {
+		res = crypto_decompress_update(tfm, &req);
+		if (res < 0 && (res != -EAGAIN || req.avail_in)) {
 			pr_err("alg: pcomp: decompression update failed on "
-			       "test %d for %s: error=%d\n", i + 1, algo,
-			       error);
-			return error;
+			       "test %d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
+		if (res > 0)
+			produced += res;
 
 		/* Add remaining input data */
 		req.avail_in += (dtemplate[i].inlen + 1) / 2;
 
-		error = crypto_decompress_update(tfm, &req);
-		if (error  && (error != -EAGAIN || req.avail_in)) {
+		res = crypto_decompress_update(tfm, &req);
+		if (res < 0 && (res != -EAGAIN || req.avail_in)) {
 			pr_err("alg: pcomp: decompression update failed on "
-			       "test %d for %s: error=%d\n", i + 1, algo,
-			       error);
-			return error;
+			       "test %d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
+		if (res > 0)
+			produced += res;
 
 		/* Provide remaining output space */
 		req.avail_out += COMP_BUF_SIZE - dtemplate[i].outlen / 2;
 
-		error = crypto_decompress_final(tfm, &req);
-		if (error  && (error != -EAGAIN || req.avail_in)) {
+		res = crypto_decompress_final(tfm, &req);
+		if (res < 0 && (res != -EAGAIN || req.avail_in)) {
 			pr_err("alg: pcomp: decompression final failed on "
-			       "test %d for %s: error=%d\n", i + 1, algo,
-			       error);
-			return error;
+			       "test %d for %s: error=%d\n", i + 1, algo, res);
+			return res;
 		}
+		if (res > 0)
+			produced += res;
 
 		if (COMP_BUF_SIZE - req.avail_out != dtemplate[i].outlen) {
 			pr_err("alg: comp: Decompression test %d failed for "
@@ -1136,6 +1152,13 @@ static int test_pcomp(struct crypto_pcomp *tfm,
 			return -EINVAL;
 		}
 
+		if (produced != dtemplate[i].outlen) {
+			pr_err("alg: comp: Decompression test %d failed for "
+			       "%s: returned len = %u (expected %d)\n", i + 1,
+			       algo, produced, dtemplate[i].outlen);
+			return -EINVAL;
+		}
+
 		if (memcmp(result, dtemplate[i].output, dtemplate[i].outlen)) {
 			pr_err("alg: pcomp: Decompression test %d failed for "
 			       "%s\n", i + 1, algo);
diff --git a/crypto/zlib.c b/crypto/zlib.c
index 33609bab614e..c3015733c990 100644
--- a/crypto/zlib.c
+++ b/crypto/zlib.c
@@ -165,15 +165,15 @@ static int zlib_compress_update(struct crypto_pcomp *tfm,
 		return -EINVAL;
 	}
 
+	ret = req->avail_out - stream->avail_out;
 	pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n",
 		 stream->avail_in, stream->avail_out,
-		 req->avail_in - stream->avail_in,
-		 req->avail_out - stream->avail_out);
+		 req->avail_in - stream->avail_in, ret);
 	req->next_in = stream->next_in;
 	req->avail_in = stream->avail_in;
 	req->next_out = stream->next_out;
 	req->avail_out = stream->avail_out;
-	return 0;
+	return ret;
 }
 
 static int zlib_compress_final(struct crypto_pcomp *tfm,
@@ -195,15 +195,15 @@ static int zlib_compress_final(struct crypto_pcomp *tfm,
 		return -EINVAL;
 	}
 
+	ret = req->avail_out - stream->avail_out;
 	pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n",
 		 stream->avail_in, stream->avail_out,
-		 req->avail_in - stream->avail_in,
-		 req->avail_out - stream->avail_out);
+		 req->avail_in - stream->avail_in, ret);
 	req->next_in = stream->next_in;
 	req->avail_in = stream->avail_in;
 	req->next_out = stream->next_out;
 	req->avail_out = stream->avail_out;
-	return 0;
+	return ret;
 }
 
 
@@ -280,15 +280,15 @@ static int zlib_decompress_update(struct crypto_pcomp *tfm,
 		return -EINVAL;
 	}
 
+	ret = req->avail_out - stream->avail_out;
 	pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n",
 		 stream->avail_in, stream->avail_out,
-		 req->avail_in - stream->avail_in,
-		 req->avail_out - stream->avail_out);
+		 req->avail_in - stream->avail_in, ret);
 	req->next_in = stream->next_in;
 	req->avail_in = stream->avail_in;
 	req->next_out = stream->next_out;
 	req->avail_out = stream->avail_out;
-	return 0;
+	return ret;
 }
 
 static int zlib_decompress_final(struct crypto_pcomp *tfm,
@@ -328,15 +328,15 @@ static int zlib_decompress_final(struct crypto_pcomp *tfm,
 		return -EINVAL;
 	}
 
+	ret = req->avail_out - stream->avail_out;
 	pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n",
 		 stream->avail_in, stream->avail_out,
-		 req->avail_in - stream->avail_in,
-		 req->avail_out - stream->avail_out);
+		 req->avail_in - stream->avail_in, ret);
 	req->next_in = stream->next_in;
 	req->avail_in = stream->avail_in;
 	req->next_out = stream->next_out;
 	req->avail_out = stream->avail_out;
-	return 0;
+	return ret;
 }
 
 

From 4e033a6bc70f094d36128c328f6ca725c6ca4b4c Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Wed, 27 May 2009 15:10:21 +1000
Subject: [PATCH 095/162] crypto: tcrypt - Do not exit on success in fips mode

At present, the tcrypt module always exits with an -EAGAIN upon
successfully completing all the tests its been asked to run. In fips
mode, integrity checking is done by running all self-tests from the
initrd, and its much simpler to check the ret from modprobe for
success than to scrape dmesg and/or /proc/crypto. Simply stay
loaded, giving modprobe a retval of 0, if self-tests all pass and
we're in fips mode.

A side-effect of tracking success/failure for fips mode is that in
non-fips mode, self-test failures will return the actual failure
return codes, rather than always returning -EAGAIN, which seems more
correct anyway.

The tcrypt_test() portion of the patch is dependent on my earlier
pair of patches that skip non-fips algs in fips mode, at least to
achieve the fully intended behavior.

Nb: testing this patch against the cryptodev tree revealed a test
failure for sha384, which I have yet to look into...

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c | 164 ++++++++++++++++++++++++++----------------------
 1 file changed, 90 insertions(+), 74 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 9e4974eb7825..d59ba5079d14 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -27,6 +27,7 @@
 #include <linux/timex.h>
 #include <linux/interrupt.h>
 #include "tcrypt.h"
+#include "internal.h"
 
 /*
  * Need slab memory for testing (size in number of pages).
@@ -468,248 +469,255 @@ static void test_available(void)
 
 static inline int tcrypt_test(const char *alg)
 {
-	return alg_test(alg, alg, 0, 0);
+	int ret;
+
+	ret = alg_test(alg, alg, 0, 0);
+	/* non-fips algs return -EINVAL in fips mode */
+	if (fips_enabled && ret == -EINVAL)
+		ret = 0;
+	return ret;
 }
 
-static void do_test(int m)
+static int do_test(int m)
 {
 	int i;
+	int ret = 0;
 
 	switch (m) {
 	case 0:
 		for (i = 1; i < 200; i++)
-			do_test(i);
+			ret += do_test(i);
 		break;
 
 	case 1:
-		tcrypt_test("md5");
+		ret += tcrypt_test("md5");
 		break;
 
 	case 2:
-		tcrypt_test("sha1");
+		ret += tcrypt_test("sha1");
 		break;
 
 	case 3:
-		tcrypt_test("ecb(des)");
-		tcrypt_test("cbc(des)");
+		ret += tcrypt_test("ecb(des)");
+		ret += tcrypt_test("cbc(des)");
 		break;
 
 	case 4:
-		tcrypt_test("ecb(des3_ede)");
-		tcrypt_test("cbc(des3_ede)");
+		ret += tcrypt_test("ecb(des3_ede)");
+		ret += tcrypt_test("cbc(des3_ede)");
 		break;
 
 	case 5:
-		tcrypt_test("md4");
+		ret += tcrypt_test("md4");
 		break;
 
 	case 6:
-		tcrypt_test("sha256");
+		ret += tcrypt_test("sha256");
 		break;
 
 	case 7:
-		tcrypt_test("ecb(blowfish)");
-		tcrypt_test("cbc(blowfish)");
+		ret += tcrypt_test("ecb(blowfish)");
+		ret += tcrypt_test("cbc(blowfish)");
 		break;
 
 	case 8:
-		tcrypt_test("ecb(twofish)");
-		tcrypt_test("cbc(twofish)");
+		ret += tcrypt_test("ecb(twofish)");
+		ret += tcrypt_test("cbc(twofish)");
 		break;
 
 	case 9:
-		tcrypt_test("ecb(serpent)");
+		ret += tcrypt_test("ecb(serpent)");
 		break;
 
 	case 10:
-		tcrypt_test("ecb(aes)");
-		tcrypt_test("cbc(aes)");
-		tcrypt_test("lrw(aes)");
-		tcrypt_test("xts(aes)");
-		tcrypt_test("ctr(aes)");
-		tcrypt_test("rfc3686(ctr(aes))");
+		ret += tcrypt_test("ecb(aes)");
+		ret += tcrypt_test("cbc(aes)");
+		ret += tcrypt_test("lrw(aes)");
+		ret += tcrypt_test("xts(aes)");
+		ret += tcrypt_test("ctr(aes)");
+		ret += tcrypt_test("rfc3686(ctr(aes))");
 		break;
 
 	case 11:
-		tcrypt_test("sha384");
+		ret += tcrypt_test("sha384");
 		break;
 
 	case 12:
-		tcrypt_test("sha512");
+		ret += tcrypt_test("sha512");
 		break;
 
 	case 13:
-		tcrypt_test("deflate");
+		ret += tcrypt_test("deflate");
 		break;
 
 	case 14:
-		tcrypt_test("ecb(cast5)");
+		ret += tcrypt_test("ecb(cast5)");
 		break;
 
 	case 15:
-		tcrypt_test("ecb(cast6)");
+		ret += tcrypt_test("ecb(cast6)");
 		break;
 
 	case 16:
-		tcrypt_test("ecb(arc4)");
+		ret += tcrypt_test("ecb(arc4)");
 		break;
 
 	case 17:
-		tcrypt_test("michael_mic");
+		ret += tcrypt_test("michael_mic");
 		break;
 
 	case 18:
-		tcrypt_test("crc32c");
+		ret += tcrypt_test("crc32c");
 		break;
 
 	case 19:
-		tcrypt_test("ecb(tea)");
+		ret += tcrypt_test("ecb(tea)");
 		break;
 
 	case 20:
-		tcrypt_test("ecb(xtea)");
+		ret += tcrypt_test("ecb(xtea)");
 		break;
 
 	case 21:
-		tcrypt_test("ecb(khazad)");
+		ret += tcrypt_test("ecb(khazad)");
 		break;
 
 	case 22:
-		tcrypt_test("wp512");
+		ret += tcrypt_test("wp512");
 		break;
 
 	case 23:
-		tcrypt_test("wp384");
+		ret += tcrypt_test("wp384");
 		break;
 
 	case 24:
-		tcrypt_test("wp256");
+		ret += tcrypt_test("wp256");
 		break;
 
 	case 25:
-		tcrypt_test("ecb(tnepres)");
+		ret += tcrypt_test("ecb(tnepres)");
 		break;
 
 	case 26:
-		tcrypt_test("ecb(anubis)");
-		tcrypt_test("cbc(anubis)");
+		ret += tcrypt_test("ecb(anubis)");
+		ret += tcrypt_test("cbc(anubis)");
 		break;
 
 	case 27:
-		tcrypt_test("tgr192");
+		ret += tcrypt_test("tgr192");
 		break;
 
 	case 28:
 
-		tcrypt_test("tgr160");
+		ret += tcrypt_test("tgr160");
 		break;
 
 	case 29:
-		tcrypt_test("tgr128");
+		ret += tcrypt_test("tgr128");
 		break;
 
 	case 30:
-		tcrypt_test("ecb(xeta)");
+		ret += tcrypt_test("ecb(xeta)");
 		break;
 
 	case 31:
-		tcrypt_test("pcbc(fcrypt)");
+		ret += tcrypt_test("pcbc(fcrypt)");
 		break;
 
 	case 32:
-		tcrypt_test("ecb(camellia)");
-		tcrypt_test("cbc(camellia)");
+		ret += tcrypt_test("ecb(camellia)");
+		ret += tcrypt_test("cbc(camellia)");
 		break;
 	case 33:
-		tcrypt_test("sha224");
+		ret += tcrypt_test("sha224");
 		break;
 
 	case 34:
-		tcrypt_test("salsa20");
+		ret += tcrypt_test("salsa20");
 		break;
 
 	case 35:
-		tcrypt_test("gcm(aes)");
+		ret += tcrypt_test("gcm(aes)");
 		break;
 
 	case 36:
-		tcrypt_test("lzo");
+		ret += tcrypt_test("lzo");
 		break;
 
 	case 37:
-		tcrypt_test("ccm(aes)");
+		ret += tcrypt_test("ccm(aes)");
 		break;
 
 	case 38:
-		tcrypt_test("cts(cbc(aes))");
+		ret += tcrypt_test("cts(cbc(aes))");
 		break;
 
         case 39:
-		tcrypt_test("rmd128");
+		ret += tcrypt_test("rmd128");
 		break;
 
         case 40:
-		tcrypt_test("rmd160");
+		ret += tcrypt_test("rmd160");
 		break;
 
 	case 41:
-		tcrypt_test("rmd256");
+		ret += tcrypt_test("rmd256");
 		break;
 
 	case 42:
-		tcrypt_test("rmd320");
+		ret += tcrypt_test("rmd320");
 		break;
 
 	case 43:
-		tcrypt_test("ecb(seed)");
+		ret += tcrypt_test("ecb(seed)");
 		break;
 
 	case 44:
-		tcrypt_test("zlib");
+		ret += tcrypt_test("zlib");
 		break;
 
 	case 45:
-		tcrypt_test("rfc4309(ccm(aes))");
+		ret += tcrypt_test("rfc4309(ccm(aes))");
 		break;
 
 	case 100:
-		tcrypt_test("hmac(md5)");
+		ret += tcrypt_test("hmac(md5)");
 		break;
 
 	case 101:
-		tcrypt_test("hmac(sha1)");
+		ret += tcrypt_test("hmac(sha1)");
 		break;
 
 	case 102:
-		tcrypt_test("hmac(sha256)");
+		ret += tcrypt_test("hmac(sha256)");
 		break;
 
 	case 103:
-		tcrypt_test("hmac(sha384)");
+		ret += tcrypt_test("hmac(sha384)");
 		break;
 
 	case 104:
-		tcrypt_test("hmac(sha512)");
+		ret += tcrypt_test("hmac(sha512)");
 		break;
 
 	case 105:
-		tcrypt_test("hmac(sha224)");
+		ret += tcrypt_test("hmac(sha224)");
 		break;
 
 	case 106:
-		tcrypt_test("xcbc(aes)");
+		ret += tcrypt_test("xcbc(aes)");
 		break;
 
 	case 107:
-		tcrypt_test("hmac(rmd128)");
+		ret += tcrypt_test("hmac(rmd128)");
 		break;
 
 	case 108:
-		tcrypt_test("hmac(rmd160)");
+		ret += tcrypt_test("hmac(rmd160)");
 		break;
 
 	case 150:
-		tcrypt_test("ansi_cprng");
+		ret += tcrypt_test("ansi_cprng");
 		break;
 
 	case 200:
@@ -873,6 +881,8 @@ static void do_test(int m)
 		test_available();
 		break;
 	}
+
+	return ret;
 }
 
 static int __init tcrypt_mod_init(void)
@@ -886,15 +896,21 @@ static int __init tcrypt_mod_init(void)
 			goto err_free_tv;
 	}
 
-	do_test(mode);
+	err = do_test(mode);
+	if (err) {
+		printk(KERN_ERR "tcrypt: one or more tests failed!\n");
+		goto err_free_tv;
+	}
 
-	/* We intentionaly return -EAGAIN to prevent keeping
-	 * the module. It does all its work from init()
-	 * and doesn't offer any runtime functionality 
+	/* We intentionaly return -EAGAIN to prevent keeping the module,
+	 * unless we're running in fips mode. It does all its work from
+	 * init() and doesn't offer any runtime functionality, but in
+	 * the fips case, checking for a successful load is helpful.
 	 * => we don't need it in the memory, do we?
 	 *                                        -- mludvig
 	 */
-	err = -EAGAIN;
+	if (!fips_enabled)
+		err = -EAGAIN;
 
 err_free_tv:
 	for (i = 0; i < TVMEMSIZE && tvmem[i]; i++)

From f3d8fe40498eea9f45be260bdf6ccada845411f3 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 27 May 2009 15:16:21 +1000
Subject: [PATCH 096/162] crypto: hifn_795x - fix __dev{init,exit} markings

The remove member of the pci_driver hifn_pci_driver uses __devexit_p(),
so the remove function itself should be marked with __devexit.  And where
there be __devexit on the remove, so is there __devinit on the probe.

Similarly, the module_init/module_exit functions should be declared with
plain __init/__exit markings, not the hotplug __dev{init,exit} ones.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
CC: Patrick McHardy <kaber@trash.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hifn_795x.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c
index 2bef086fb342..5f753fc08730 100644
--- a/drivers/crypto/hifn_795x.c
+++ b/drivers/crypto/hifn_795x.c
@@ -2564,7 +2564,7 @@ static void hifn_tasklet_callback(unsigned long data)
 		hifn_process_queue(dev);
 }
 
-static int hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static int __devinit hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int err, i;
 	struct hifn_device *dev;
@@ -2696,7 +2696,7 @@ err_out_disable_pci_device:
 	return err;
 }
 
-static void hifn_remove(struct pci_dev *pdev)
+static void __devexit hifn_remove(struct pci_dev *pdev)
 {
 	int i;
 	struct hifn_device *dev;
@@ -2744,7 +2744,7 @@ static struct pci_driver hifn_pci_driver = {
 	.remove   = __devexit_p(hifn_remove),
 };
 
-static int __devinit hifn_init(void)
+static int __init hifn_init(void)
 {
 	unsigned int freq;
 	int err;
@@ -2789,7 +2789,7 @@ static int __devinit hifn_init(void)
 	return 0;
 }
 
-static void __devexit hifn_fini(void)
+static void __exit hifn_fini(void)
 {
 	pci_unregister_driver(&hifn_pci_driver);
 

From fd57f22a09ae276ca3e9cd11ed99b617d611ba82 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 29 May 2009 16:05:42 +1000
Subject: [PATCH 097/162] crypto: testmgr - Check all test vector lengths

As we cannot guarantee the availability of contiguous pages at
run-time, all test vectors must either fit within a page, or use
scatter lists.  In some cases vectors were not checked as to
whether they fit inside a page.  This patch adds all the missing
checks.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 376ea88158b9..8fcea70ed267 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -185,6 +185,10 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 
 		hash_buff = xbuf[0];
 
+		ret = -EINVAL;
+		if (WARN_ON(template[i].psize > PAGE_SIZE))
+			goto out;
+
 		memcpy(hash_buff, template[i].plaintext, template[i].psize);
 		sg_init_one(&sg[0], hash_buff, template[i].psize);
 
@@ -238,7 +242,11 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 
 			temp = 0;
 			sg_init_table(sg, template[i].np);
+			ret = -EINVAL;
 			for (k = 0; k < template[i].np; k++) {
+				if (WARN_ON(offset_in_page(IDX[k]) +
+					    template[i].tap[k] > PAGE_SIZE))
+					goto out;
 				sg_set_buf(&sg[k],
 					   memcpy(xbuf[IDX[k] >> PAGE_SHIFT] +
 						  offset_in_page(IDX[k]),
@@ -357,6 +365,11 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 			input = xbuf[0];
 			assoc = axbuf[0];
 
+			ret = -EINVAL;
+			if (WARN_ON(template[i].ilen > PAGE_SIZE ||
+				    template[i].alen > PAGE_SIZE))
+				goto out;
+
 			memcpy(input, template[i].input, template[i].ilen);
 			memcpy(assoc, template[i].assoc, template[i].alen);
 			if (template[i].iv)
@@ -516,7 +529,11 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 			}
 
 			sg_init_table(asg, template[i].anp);
+			ret = -EINVAL;
 			for (k = 0, temp = 0; k < template[i].anp; k++) {
+				if (WARN_ON(offset_in_page(IDX[k]) +
+					    template[i].atap[k] > PAGE_SIZE))
+					goto out;
 				sg_set_buf(&asg[k],
 					   memcpy(axbuf[IDX[k] >> PAGE_SHIFT] +
 						  offset_in_page(IDX[k]),
@@ -650,6 +667,10 @@ static int test_cipher(struct crypto_cipher *tfm, int enc,
 
 		j++;
 
+		ret = -EINVAL;
+		if (WARN_ON(template[i].ilen > PAGE_SIZE))
+			goto out;
+
 		data = xbuf[0];
 		memcpy(data, template[i].input, template[i].ilen);
 
@@ -741,6 +762,10 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 		if (!(template[i].np)) {
 			j++;
 
+			ret = -EINVAL;
+			if (WARN_ON(template[i].ilen > PAGE_SIZE))
+				goto out;
+
 			data = xbuf[0];
 			memcpy(data, template[i].input, template[i].ilen);
 

From a0cfae59f8381c5c670fce2cc3de70b35421f920 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 29 May 2009 16:23:12 +1000
Subject: [PATCH 098/162] crypto: testmgr - Allow hash test vectors longer than
 a page

As it stands we will each test hash vector both linearly and as
a scatter list if applicable.  This means that we cannot have
vectors longer than a page, even with scatter lists.

This patch fixes this by skipping test vectors with np != 0 when
testing linearly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 8fcea70ed267..e9e9d84293b9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -180,7 +180,12 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   tcrypt_complete, &tresult);
 
+	j = 0;
 	for (i = 0; i < tcount; i++) {
+		if (template[i].np)
+			continue;
+
+		j++;
 		memset(result, 0, 64);
 
 		hash_buff = xbuf[0];
@@ -198,7 +203,7 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 						  template[i].ksize);
 			if (ret) {
 				printk(KERN_ERR "alg: hash: setkey failed on "
-				       "test %d for %s: ret=%d\n", i + 1, algo,
+				       "test %d for %s: ret=%d\n", j, algo,
 				       -ret);
 				goto out;
 			}
@@ -220,14 +225,14 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			/* fall through */
 		default:
 			printk(KERN_ERR "alg: hash: digest failed on test %d "
-			       "for %s: ret=%d\n", i + 1, algo, -ret);
+			       "for %s: ret=%d\n", j, algo, -ret);
 			goto out;
 		}
 
 		if (memcmp(result, template[i].digest,
 			   crypto_ahash_digestsize(tfm))) {
 			printk(KERN_ERR "alg: hash: Test %d failed for %s\n",
-			       i + 1, algo);
+			       j, algo);
 			hexdump(result, crypto_ahash_digestsize(tfm));
 			ret = -EINVAL;
 			goto out;

From aa07a6990f4b6a8ef9fc538dea55bac6f92255f2 Mon Sep 17 00:00:00 2001
From: Alex Riesen <raa.lkml@gmail.com>
Date: Tue, 2 Jun 2009 14:13:14 +1000
Subject: [PATCH 099/162] crypto: api - Use formatting of module name

Besdies, for the old code, gcc-4.3.3 produced this warning:
  "format not a string literal and no format arguments"

Signed-off-by: Alex Riesen <raa.lkml@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/crypto/api.c b/crypto/api.c
index f500fb840be9..d5944f92b416 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -217,14 +217,11 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask)
 
 	alg = crypto_alg_lookup(name, type, mask);
 	if (!alg) {
-		char tmp[CRYPTO_MAX_ALG_NAME];
-
-		request_module(name);
+		request_module("%s", name);
 
 		if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask &
-		      CRYPTO_ALG_NEED_FALLBACK) &&
-		    snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp))
-			request_module(tmp);
+		      CRYPTO_ALG_NEED_FALLBACK))
+			request_module("%s-all", name);
 
 		alg = crypto_alg_lookup(name, type, mask);
 	}

From 08ced854fc4a979d9e59ba01000bf96e7057cfbc Mon Sep 17 00:00:00 2001
From: Alexander Clouter <alex@digriz.org.uk>
Date: Wed, 3 Jun 2009 19:28:03 +1000
Subject: [PATCH 100/162] hwrng: timeriomem - Fix potential oops
 (request_mem_region/__devinit)

Fixed oops when calling device_unregister followed by device_register
(changing __init to __devinit) and removed request_mem_region() as
platform_device_register already does this which can result in EBUSY

Signed-off-by: Alexander Clouter <alex@digriz.org.uk>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/timeriomem-rng.c | 26 ++++++-------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c
index dcd352ad0e7f..a94e930575f2 100644
--- a/drivers/char/hw_random/timeriomem-rng.c
+++ b/drivers/char/hw_random/timeriomem-rng.c
@@ -88,9 +88,9 @@ static struct hwrng timeriomem_rng_ops = {
 	.priv		= 0,
 };
 
-static int __init timeriomem_rng_probe(struct platform_device *pdev)
+static int __devinit timeriomem_rng_probe(struct platform_device *pdev)
 {
-	struct resource *res, *mem;
+	struct resource *res;
 	int ret;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -98,21 +98,12 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev)
 	if (!res)
 		return -ENOENT;
 
-	mem = request_mem_region(res->start, res->end - res->start + 1,
-				 pdev->name);
-	if (mem == NULL)
-		return -EBUSY;
-
-	dev_set_drvdata(&pdev->dev, mem);
-
 	timeriomem_rng_data = pdev->dev.platform_data;
 
 	timeriomem_rng_data->address = ioremap(res->start,
 						res->end - res->start + 1);
-	if (!timeriomem_rng_data->address) {
-		ret = -ENOMEM;
-		goto err_ioremap;
-	}
+	if (!timeriomem_rng_data->address)
+		return -EIO;
 
 	if (timeriomem_rng_data->period != 0
 		&& usecs_to_jiffies(timeriomem_rng_data->period) > 0) {
@@ -125,7 +116,7 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev)
 
 	ret = hwrng_register(&timeriomem_rng_ops);
 	if (ret)
-		goto err_register;
+		goto failed;
 
 	dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
 			timeriomem_rng_data->address,
@@ -133,24 +124,19 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev)
 
 	return 0;
 
-err_register:
+failed:
 	dev_err(&pdev->dev, "problem registering\n");
 	iounmap(timeriomem_rng_data->address);
-err_ioremap:
-	release_resource(mem);
 
 	return ret;
 }
 
 static int __devexit timeriomem_rng_remove(struct platform_device *pdev)
 {
-	struct resource *mem = dev_get_drvdata(&pdev->dev);
-
 	del_timer_sync(&timeriomem_rng_timer);
 	hwrng_unregister(&timeriomem_rng_ops);
 
 	iounmap(timeriomem_rng_data->address);
-	release_resource(mem);
 
 	return 0;
 }

From c076b9937f4912ffc09b30e155fe5246f002f21a Mon Sep 17 00:00:00 2001
From: Ben Nizette <bn@niasdigital.com>
Date: Wed, 3 Jun 2009 11:38:20 +0200
Subject: [PATCH 101/162] avr32: Fix clash in ATMEL_USART_ flags

At the moment ATMEL_USART_{RTS,CTS,CLK} have the values
1, 2 and 3 respectively.  Given these are used in bitmasks,
trying to turn on the CLK line will in fact turn on the
RTS and CTS lines as well.

Change the value of ATMEL_USART_CLK to 4.

Signed-off-by: Ben Nizette <bn@niasdigital.com>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/mach-at32ap/include/mach/board.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/avr32/mach-at32ap/include/mach/board.h b/arch/avr32/mach-at32ap/include/mach/board.h
index 0b8164281899..ddedb471f33e 100644
--- a/arch/avr32/mach-at32ap/include/mach/board.h
+++ b/arch/avr32/mach-at32ap/include/mach/board.h
@@ -29,7 +29,7 @@ extern struct platform_device *atmel_default_console_device;
 /* Flags for selecting USART extra pins */
 #define	ATMEL_USART_RTS		0x01
 #define	ATMEL_USART_CTS		0x02
-#define	ATMEL_USART_CLK		0x03
+#define	ATMEL_USART_CLK		0x04
 
 struct atmel_uart_data {
 	short		use_dma_tx;	/* use transmit DMA? */

From c878b7d60418a45c36d99c2dc876ebb76035d404 Mon Sep 17 00:00:00 2001
From: Mark Jackson <mpfj@mimc.co.uk>
Date: Wed, 3 Jun 2009 11:16:38 +0100
Subject: [PATCH 102/162] Fix MIMC200 board LCD init

This patch updates the LCD init code for the MIMC200 board.

V2 fixes a .yres typo and corrects an incorrect setup value
in the call to at32_add_device_lcdc()

Without this patch, the lcd setup is wrong and won't display images
correctly (if at all !!)

Signed-off-by: Mark Jackson <mpfj@mimc.co.uk>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/boards/mimc200/setup.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c
index c1b2175b4fea..523d8e183bef 100644
--- a/arch/avr32/boards/mimc200/setup.c
+++ b/arch/avr32/boards/mimc200/setup.c
@@ -43,16 +43,16 @@ unsigned long at32_board_osc_rates[3] = {
 /* Initialized by bootloader-specific startup code. */
 struct tag *bootloader_tags __initdata;
 
-static struct fb_videomode __initdata tx14d14_modes[] = {
+static struct fb_videomode __initdata pt0434827_modes[] = {
 	{
-		.name		= "640x480 @ 60",
-		.refresh	= 60,
-		.xres		= 640,		.yres		= 480,
-		.pixclock	= KHZ2PICOS(11666),
+		.name		= "480x272 @ 72",
+		.refresh	= 72,
+		.xres		= 480,		.yres		= 272,
+		.pixclock	= KHZ2PICOS(10000),
 
-		.left_margin	= 80,		.right_margin	= 1,
-		.upper_margin	= 13,		.lower_margin	= 2,
-		.hsync_len	= 64,		.vsync_len	= 1,
+		.left_margin	= 1,		.right_margin	= 1,
+		.upper_margin	= 12,		.lower_margin	= 1,
+		.hsync_len	= 42,		.vsync_len	= 1,
 
 		.sync		= 0,
 		.vmode		= FB_VMODE_NONINTERLACED,
@@ -60,14 +60,14 @@ static struct fb_videomode __initdata tx14d14_modes[] = {
 };
 
 static struct fb_monspecs __initdata mimc200_default_monspecs = {
-	.manufacturer		= "HIT",
-	.monitor		= "TX14D14VM1BAB",
-	.modedb			= tx14d14_modes,
-	.modedb_len		= ARRAY_SIZE(tx14d14_modes),
+	.manufacturer		= "PT",
+	.monitor		= "PT0434827-A401",
+	.modedb			= pt0434827_modes,
+	.modedb_len		= ARRAY_SIZE(pt0434827_modes),
 	.hfmin			= 14820,
 	.hfmax			= 22230,
 	.vfmin			= 60,
-	.vfmax			= 73.3,
+	.vfmax			= 85,
 	.dclkmax		= 25200000,
 };
 
@@ -228,7 +228,8 @@ static int __init mimc200_init(void)
 	i2c_register_board_info(0, i2c_info, ARRAY_SIZE(i2c_info));
 
 	at32_add_device_lcdc(0, &mimc200_lcdc_data,
-			     fbmem_start, fbmem_size, 1);
+			     fbmem_start, fbmem_size,
+			     ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_CONTROL | ATMEL_LCDC_ALT_24B_DATA);
 
 	return 0;
 }

From 01ca79f1411eae2a45352709c838b946b1af9fbd Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:52 +0200
Subject: [PATCH 103/162] x86, mce: add machine check exception count in
 /proc/interrupts

Useful for debugging, but it's also good general policy
to have a counter for all special interrupts there. This makes it easier
to diagnose where a CPU is spending its time.

[ Impact: feature, debugging tool ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       |  3 +++
 arch/x86/kernel/cpu/mcheck/mce.c |  4 ++++
 arch/x86/kernel/irq.c            | 10 ++++++++++
 3 files changed, 17 insertions(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ac6e0303bf21..1156dae295ac 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -89,6 +89,7 @@ struct mce_log {
 extern int mce_disabled;
 
 #include <asm/atomic.h>
+#include <linux/percpu.h>
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
@@ -123,6 +124,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 
 int mce_available(struct cpuinfo_x86 *c);
 
+DECLARE_PER_CPU(unsigned, mce_exception_count);
+
 void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1d0aa9c4e15b..287268d21836 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -57,6 +57,8 @@ int				mce_disabled;
 
 atomic_t mce_entry;
 
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
 /*
  * Tolerant levels:
  *   0: always panic on uncorrected errors, log corrected errors
@@ -359,6 +361,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	atomic_inc(&mce_entry);
 
+	__get_cpu_var(mce_exception_count)++;
+
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
 		goto out;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index a05660bf0299..05fc635c28c0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
+#include <asm/mce.h>
 
 atomic_t irq_err_count;
 
@@ -93,6 +94,12 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
+#endif
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+	seq_printf(p, "%*s: ", prec, "MCE");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+	seq_printf(p, "  Machine check exceptions\n");
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
@@ -161,6 +168,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 {
 	u64 sum = irq_stats(cpu)->__nmi_count;
 
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+	sum += per_cpu(mce_exception_count, cpu);
+#endif
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;

From ca84f69697da0f004135e45b63ca560b6bd3554e Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:57 +0200
Subject: [PATCH 104/162] x86, mce: add MCE poll count to /proc/interrupts

Keep a count of the machine check polls (or CMCI events) in
/proc/interrupts.

Andi needs this for debugging, but it's also useful in general
to see what's going in by the kernel.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       | 1 +
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++
 arch/x86/kernel/irq.c            | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1156dae295ac..63abf3b19432 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -125,6 +125,7 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 int mce_available(struct cpuinfo_x86 *c);
 
 DECLARE_PER_CPU(unsigned, mce_exception_count);
+DECLARE_PER_CPU(unsigned, mce_poll_count);
 
 void mce_log_therm_throt_event(__u64 status);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 287268d21836..784f6ae9d6f4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -264,6 +264,8 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 	}
 }
 
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
 /*
  * Poll for corrected events or events that happened before reset.
  * Those are just logged through /dev/mcelog.
@@ -275,6 +277,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
+	__get_cpu_var(mce_poll_count)++;
+
 	mce_setup(&m);
 
 	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 05fc635c28c0..eff46b5de62f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -100,6 +100,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
 	seq_printf(p, "  Machine check exceptions\n");
+	seq_printf(p, "%*s: ", prec, "MCP");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+	seq_printf(p, "  Machine check polls\n");
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)

From f6fb0ac0869500323c78fa21992fe1933af61e91 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:55 +0200
Subject: [PATCH 105/162] x86, mce: store record length into memory struct mce
 anchor

This makes it easier for tools who want to extract the mcelog out of
crash images or memory dumps to adapt to changing struct mce size.
The length field replaces padding, so it's fully compatible.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       | 2 +-
 arch/x86/kernel/cpu/mcheck/mce.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 63abf3b19432..0a61946d4396 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -59,7 +59,7 @@ struct mce_log {
 	unsigned len;	    /* = MCE_LOG_LEN */
 	unsigned next;
 	unsigned flags;
-	unsigned pad0;
+	unsigned recordlen;	/* length of struct mce */
 	struct mce entry[MCE_LOG_LEN];
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 784f6ae9d6f4..3db047e7a0fb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -108,8 +108,9 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm);
  */
 
 static struct mce_log mcelog = {
-	MCE_LOG_SIGNATURE,
-	MCE_LOG_LEN,
+	.signature	= MCE_LOG_SIGNATURE,
+	.len		= MCE_LOG_LEN,
+	.recordlen	= sizeof(struct mce),
 };
 
 void mce_log(struct mce *mce)

From d620c67fb92aa11736112f9a03e31d8e3079c57a Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:56 +0200
Subject: [PATCH 106/162] x86, mce: support more than 256 CPUs in struct mce

The old struct mce had a limitation to 256 CPUs. But x86 Linux supports
more than that now with x2apic. Add a new field extcpu to report the
extended number.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h              |  4 ++--
 arch/x86/kernel/cpu/mcheck/mce-inject.c | 10 +++++-----
 arch/x86/kernel/cpu/mcheck/mce.c        |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 0a61946d4396..b4a04b60b740 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,9 +40,9 @@ struct mce {
 	__u64 res2;	/* dito. */
 	__u8  cs;		/* code segment */
 	__u8  bank;	/* machine check bank */
-	__u8  cpu;	/* cpu that raised the error */
+	__u8  cpu;	/* cpu number; obsolete; use extcpu now */
 	__u8  finished;   /* entry is valid */
-	__u32 pad;
+	__u32 extcpu;	/* linux cpu number that detected the error */
 };
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 7b3a5428396a..7d858fb4ce67 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -23,14 +23,14 @@
 /* Update fake mce registers on current CPU. */
 static void inject_mce(struct mce *m)
 {
-	struct mce *i = &per_cpu(injectm, m->cpu);
+	struct mce *i = &per_cpu(injectm, m->extcpu);
 
 	/* Make sure noone reads partially written injectm */
 	i->finished = 0;
 	mb();
 	m->finished = 0;
 	/* First set the fields after finished */
-	i->cpu = m->cpu;
+	i->extcpu = m->extcpu;
 	mb();
 	/* Now write record in order, finished last (except above) */
 	memcpy(i, m, sizeof(struct mce));
@@ -49,7 +49,7 @@ static void raise_mce(unsigned long data)
 {
 	struct delayed_mce *dm = (struct delayed_mce *)data;
 	struct mce *m = &dm->m;
-	int cpu = m->cpu;
+	int cpu = m->extcpu;
 
 	inject_mce(m);
 	if (m->status & MCI_STATUS_UC) {
@@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
 	if (copy_from_user(&m, ubuf, usize))
 		return -EFAULT;
 
-	if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu))
+	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
 		return -EINVAL;
 
 	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
@@ -108,7 +108,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
 	memcpy(&dm->m, &m, sizeof(struct mce));
 	setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
 	dm->timer.expires = jiffies + 2;
-	add_timer_on(&dm->timer, m.cpu);
+	add_timer_on(&dm->timer, m.extcpu);
 	return usize;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3db047e7a0fb..2c4dd6c422c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -94,7 +94,7 @@ static inline int skip_bank_init(int i)
 void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
-	m->cpu = smp_processor_id();
+	m->cpu = m->extcpu = smp_processor_id();
 	rdtscll(m->tsc);
 }
 
@@ -158,7 +158,7 @@ static void print_mce(struct mce *m)
 	       KERN_EMERG "HARDWARE ERROR\n"
 	       KERN_EMERG
 	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
-	       m->cpu, m->mcgstatus, m->bank, m->status);
+	       m->extcpu, m->mcgstatus, m->bank, m->status);
 	if (m->ip) {
 		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",

From 8ee08347c1e8b5680b3b3ce081e42e97bcaa1abe Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:56 +0200
Subject: [PATCH 107/162] x86, mce: extend struct mce user interface with more
 information.

Experience has shown that struct mce which is used to pass an machine
check to the user space daemon currently a few limitations.  Also some
data which is useful to print at panic level is also missing.

This patch addresses most of them. The same information is also
printed out together with mce panic.

struct mce can be painlessly extended in a compatible way, the mcelog
user space code just ignores additional fields with a warning.

- It doesn't provide a wall time timestamp. There have been a few
  complaints about that. Fix that by adding a 64bit time_t

- It doesn't provide the exact CPU identification. This makes
  it awkward for mcelog to decode the event correctly, especially
  when there are variations in the supported MCE codes on different
  CPU models or when mcelog is running on a different host after a panic.
  Previously the administrator had to specify the correct CPU
  when mcelog ran on a different host, but with the more variation
  in machine checks now it's better to auto detect that.
  It's also useful for more detailed analysis of CPU events.
  Pass CPUID 1.EAX and the cpu vendor (as encoded in processor.h) instead.

- Socket ID and initial APIC ID are useful to report because they
  allow to identify the failing CPU in some (not all) cases.
  This is also especially useful for the panic situation.
  This addresses one of the complaints from Thomas Gleixner earlier.

- The MCG capabilities MSR needs to be reported for some advanced
  error processing in mcelog

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       | 10 ++++++++--
 arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b4a04b60b740..ba1f8890cf51 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -36,13 +36,19 @@ struct mce {
 	__u64 mcgstatus;
 	__u64 ip;
 	__u64 tsc;	/* cpu time stamp counter */
-	__u64 res1;	/* for future extension */
-	__u64 res2;	/* dito. */
+	__u64 time;	/* wall time_t when error was detected */
+	__u8  cpuvendor;	/* cpu vendor as encoded in system.h */
+	__u8  pad1;
+	__u16 pad2;
+	__u32 cpuid;	/* CPUID 1 EAX */
 	__u8  cs;		/* code segment */
 	__u8  bank;	/* machine check bank */
 	__u8  cpu;	/* cpu number; obsolete; use extcpu now */
 	__u8  finished;   /* entry is valid */
 	__u32 extcpu;	/* linux cpu number that detected the error */
+	__u32 socketid;	/* CPU socket ID */
+	__u32 apicid;	/* CPU initial apic ID */
+	__u64 mcgcap;	/* MCGCAP MSR: machine check capabilities of CPU */
 };
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2c4dd6c422c3..ba68449c22a1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -96,6 +96,15 @@ void mce_setup(struct mce *m)
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
 	rdtscll(m->tsc);
+	/* We hope get_seconds stays lockless */
+	m->time = get_seconds();
+	m->cpuvendor = boot_cpu_data.x86_vendor;
+	m->cpuid = cpuid_eax(1);
+#ifdef CONFIG_SMP
+	m->socketid = cpu_data(m->extcpu).phys_proc_id;
+#endif
+	m->apicid = cpu_data(m->extcpu).initial_apicid;
+	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 }
 
 DEFINE_PER_CPU(struct mce, injectm);
@@ -173,6 +182,9 @@ static void print_mce(struct mce *m)
 	if (m->misc)
 		printk("MISC %llx ", m->misc);
 	printk("\n");
+	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+			m->cpuvendor, m->cpuid, m->time, m->socketid,
+			m->apicid);
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
 	       "and contact your hardware vendor\n");

From de8a84d85ad8bb46d01d72ebc57030b95075603c Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:53 +0200
Subject: [PATCH 108/162] x86, mce: log corrected errors when panicing

Normally the machine check handler ignores corrected errors and leaves
them to machine_check_poll(). But when panicing mcp won't run, so
log all errors.

Note: this can still miss some cases until the "early no way out"
patch later is applied too.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ba68449c22a1..86806e52fc40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -412,9 +412,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 		/*
 		 * Non uncorrected errors are handled by machine_check_poll
-		 * Leave them alone.
+		 * Leave them alone, unless this panics.
 		 */
-		if ((m.status & MCI_STATUS_UC) == 0)
+		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
 			continue;
 
 		/*

From a0189c70e5f17f4253dd7bc575c97469900e23d6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:54 +0200
Subject: [PATCH 109/162] x86, mce: remove TSC print heuristic

Previously mce_panic used a simple heuristic to avoid printing
old so far unreported machine check events on a mce panic. This worked
by comparing the TSC value at the start of the machine check handler
with the event time stamp and only printing newer ones.

This has a couple of issues, in particular on systems where the TSC
is not fully synchronized between CPUs it could lose events or print
old ones.

It is also problematic with full system synchronization as it is
added by the next patch.

Remove the TSC heuristic and instead replace it with a simple heuristic
to print corrected errors first and after that uncorrected errors
and finally the worst machine check as determined by the machine
check handler.

This simplifies the code because there is no need to pass the
original TSC value around.

Contains fixes from Ying Huang

[ Impact: bug fix, cleanup ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Ying Huang <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 34 ++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 86806e52fc40..6773610061d8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -158,6 +158,7 @@ void mce_log(struct mce *mce)
 	mcelog.entry[entry].finished = 1;
 	wmb();
 
+	mce->finished = 1;
 	set_bit(0, &notify_user);
 }
 
@@ -190,23 +191,29 @@ static void print_mce(struct mce *m)
 	       "and contact your hardware vendor\n");
 }
 
-static void mce_panic(char *msg, struct mce *backup, u64 start)
+static void mce_panic(char *msg, struct mce *final)
 {
 	int i;
 
 	bust_spinlocks(1);
 	console_verbose();
+	/* First print corrected ones that are still unlogged */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
-		u64 tsc = mcelog.entry[i].tsc;
-
-		if ((s64)(tsc - start) < 0)
-			continue;
-		print_mce(&mcelog.entry[i]);
-		if (backup && mcelog.entry[i].tsc == backup->tsc)
-			backup = NULL;
+		struct mce *m = &mcelog.entry[i];
+		if ((m->status & MCI_STATUS_VAL) &&
+			!(m->status & MCI_STATUS_UC))
+			print_mce(m);
 	}
-	if (backup)
-		print_mce(backup);
+	/* Now print uncorrected but with the final one last */
+	for (i = 0; i < MCE_LOG_LEN; i++) {
+		struct mce *m = &mcelog.entry[i];
+		if (!(m->status & MCI_STATUS_VAL))
+			continue;
+		if (!final || memcmp(m, final, sizeof(struct mce)))
+			print_mce(m);
+	}
+	if (final)
+		print_mce(final);
 	panic(msg);
 }
 
@@ -362,7 +369,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mce m, panicm;
 	int panicm_found = 0;
-	u64 mcestart = 0;
 	int i;
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
@@ -394,7 +400,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		no_way_out = 1;
 
-	rdtscll(mcestart);
 	barrier();
 
 	for (i = 0; i < banks; i++) {
@@ -478,7 +483,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * has not set tolerant to an insane level, give up and die.
 	 */
 	if (no_way_out && tolerant < 3)
-		mce_panic("Machine check", &panicm, mcestart);
+		mce_panic("Machine check", &panicm);
 
 	/*
 	 * If the error seems to be unrecoverable, something should be
@@ -506,8 +511,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (user_space) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
-			mce_panic("Uncorrected machine check",
-				&panicm, mcestart);
+			mce_panic("Uncorrected machine check", &panicm);
 		}
 	}
 

From 817f32d02a52dd7f5941534e0699883691e918df Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:54 +0200
Subject: [PATCH 110/162] x86, mce: add table driven machine check grading

The machine check grading (as in deciding what should be done for a given
register value) has to be done multiple times soon and it's also getting
more complicated.
So it makes sense to consolidate it into a single function. To get smaller
and more straight forward and possibly more extensible code I opted towards
a new table driven method. The various rules are put into a table
when is then executed by a very simple interpreter.

The grading engine is in a new file mce-severity.c. I also added a private
include file mce-internal.h, because mce.h is already a bit too cluttered.

This is dead code right now, but will be used in followon patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/Makefile       |  1 +
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 10 ++++
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 61 +++++++++++++++++++++++
 3 files changed, 72 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce-internal.h
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce-severity.c

diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 60ee182c6c52..45004faf67ea 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,5 +1,6 @@
 obj-y				=  mce.o therm_throt.o
 
+obj-$(CONFIG_X86_NEW_MCE)	+= mce-severity.o
 obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o
 obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o
 obj-$(CONFIG_X86_MCE_P4THERMAL)	+= mce_intel.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..f126b4ae7a25
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,10 @@
+#include <asm/mce.h>
+
+enum severity_level {
+	MCE_NO_SEVERITY,
+	MCE_SOME_SEVERITY,
+	MCE_UC_SEVERITY,
+	MCE_PANIC_SEVERITY,
+};
+
+int mce_severity(struct mce *a, int tolerant, char **msg);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..c189e89a89ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,61 @@
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ */
+
+static struct severity {
+	u64 mask;
+	u64 result;
+	unsigned char sev;
+	unsigned char mcgmask;
+	unsigned char mcgres;
+	char *msg;
+} severities[] = {
+#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
+#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
+#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
+#define MCGMASK(x, res, s, m, r...) \
+	{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
+	BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
+	BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
+	BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"),
+	BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
+	BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
+	BITSET(0, SOME, "No match")	/* always matches. keep at end */
+};
+
+int mce_severity(struct mce *a, int tolerant, char **msg)
+{
+	struct severity *s;
+	for (s = severities;; s++) {
+		if ((a->status & s->mask) != s->result)
+			continue;
+		if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+			continue;
+		if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) &&
+			tolerant < 1)
+			return MCE_PANIC_SEVERITY;
+		if (msg)
+			*msg = s->msg;
+		return s->sev;
+	}
+}

From bd19a5e6b73df276e1ccedf9059e9ee70c372d7d Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:55 +0200
Subject: [PATCH 111/162] x86, mce: check early in exception handler if panic
 is needed

The exception handler should behave differently if the exception is
fatal versus one that can be returned from.  In the first case it should
never clear any registers because these need to be preserved
for logging after the next boot. Otherwise it should clear them
on each CPU step by step so that other CPUs sharing the same bank don't
see duplicate events. Otherwise we risk reporting events multiple
times on any CPUs which have shared machine check banks, which
is a common problem on Intel Nehalem which has both SMT (two
CPU threads sharing banks) and shared machine check banks in the uncore.

Determine early in a special pass if any event requires a panic.
This uses the mce_severity() function added earlier.

This is needed for the next patch.

Also fixes a problem together with an earlier patch
that corrected events weren't logged on a fatal MCE.

[ Impact: Feature ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 38 +++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6773610061d8..5031814ac943 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 
+#include "mce-internal.h"
 #include "mce.h"
 
 /* Handle unconfigured int18 (should never happen) */
@@ -191,7 +192,7 @@ static void print_mce(struct mce *m)
 	       "and contact your hardware vendor\n");
 }
 
-static void mce_panic(char *msg, struct mce *final)
+static void mce_panic(char *msg, struct mce *final, char *exp)
 {
 	int i;
 
@@ -214,6 +215,8 @@ static void mce_panic(char *msg, struct mce *final)
 	}
 	if (final)
 		print_mce(final);
+	if (exp)
+		printk(KERN_EMERG "Machine check: %s\n", exp);
 	panic(msg);
 }
 
@@ -357,6 +360,22 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 }
 EXPORT_SYMBOL_GPL(machine_check_poll);
 
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg)
+{
+	int i;
+
+	for (i = 0; i < banks; i++) {
+		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
+			return 1;
+	}
+	return 0;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -381,6 +400,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 */
 	int kill_it = 0;
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+	char *msg = "Unknown";
 
 	atomic_inc(&mce_entry);
 
@@ -395,10 +415,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	mce_setup(&m);
 
 	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
-
-	/* if the restart IP is not valid, we're done for */
-	if (!(m.mcgstatus & MCG_STATUS_RIPV))
-		no_way_out = 1;
+	no_way_out = mce_no_way_out(&m, &msg);
 
 	barrier();
 
@@ -430,18 +447,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		__set_bit(i, toclear);
 
 		if (m.status & MCI_STATUS_EN) {
-			/* if PCC was set, there's no way out */
-			no_way_out |= !!(m.status & MCI_STATUS_PCC);
 			/*
 			 * If this error was uncorrectable and there was
 			 * an overflow, we're in trouble.  If no overflow,
 			 * we might get away with just killing a task.
 			 */
-			if (m.status & MCI_STATUS_UC) {
-				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
-					no_way_out = 1;
+			if (m.status & MCI_STATUS_UC)
 				kill_it = 1;
-			}
 		} else {
 			/*
 			 * Machine check event was not enabled. Clear, but
@@ -483,7 +495,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * has not set tolerant to an insane level, give up and die.
 	 */
 	if (no_way_out && tolerant < 3)
-		mce_panic("Machine check", &panicm);
+		mce_panic("Machine check", &panicm, msg);
 
 	/*
 	 * If the error seems to be unrecoverable, something should be
@@ -511,7 +523,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (user_space) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
-			mce_panic("Uncorrected machine check", &panicm);
+			mce_panic("Uncorrected machine check", &panicm, msg);
 		}
 	}
 

From ccc3c3192ae78dd56dcdf5353fd1a9ef5f9a3e2b Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:54 +0200
Subject: [PATCH 112/162] x86, mce: implement bootstrapping for machine check
 wakeups

Machine checks support waking up the mcelog daemon quickly.

The original wake up code for this was pretty ugly, relying on
a idle notifier and a special process flag. The reason it did
it this way is that the machine check handler is not subject
to normal interrupt locking rules so it's not safe
to call wake_up().  Instead it set a process flag
and then either did the wakeup in the syscall return
or in the idle notifier.

This patch adds a new "bootstraping" method as replacement.

The idea is that the handler checks if it's in a state where
it is unsafe to call wake_up(). If it's safe it calls it directly.
When it's not safe -- that is it interrupted in a critical
section with interrupts disables -- it uses a new "self IPI" to trigger
an IPI to its own CPU. This can be done safely because IPI
triggers are atomic with some care. The IPI is raised
once the interrupts are reenabled and can then safely call
wake_up().

When APICs are disabled the event is just queued and will be picked up
eventually by the next polling timer. I think that's a reasonable
compromise, since it should only happen quite rarely.

Contains fixes from Ying Huang.

[ solve conflict on irqinit, make it work on 32bit (entry_arch.h) - HS ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/entry_arch.h  |  4 +++
 arch/x86/include/asm/hw_irq.h      |  1 +
 arch/x86/include/asm/irq_vectors.h |  5 +++
 arch/x86/kernel/cpu/mcheck/mce.c   | 54 ++++++++++++++++++++++++++++++
 arch/x86/kernel/entry_64.S         |  5 +++
 arch/x86/kernel/irqinit.c          |  3 ++
 6 files changed, 72 insertions(+)

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index b2eb9c066843..4cdcf5a3c96b 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -60,4 +60,8 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_X86_NEW_MCE
+BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
+#endif
+
 #endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a7d14bbae110..4e59197e29ba 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -32,6 +32,7 @@ extern void error_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
+extern void mce_self_interrupt(void);
 
 extern void invalidate_interrupt(void);
 extern void invalidate_interrupt0(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8c46b851296a..68f7cf84a333 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,6 +117,11 @@
  */
 #define GENERIC_INTERRUPT_VECTOR	0xed
 
+/*
+ * Self IPI vector for machine checks
+ */
+#define MCE_SELF_VECTOR			0xeb
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5031814ac943..121781627858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,6 +10,7 @@
 #include <linux/thread_info.h>
 #include <linux/capability.h>
 #include <linux/miscdevice.h>
+#include <linux/interrupt.h>
 #include <linux/ratelimit.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
@@ -32,7 +33,10 @@
 #include <linux/fs.h>
 
 #include <asm/processor.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
 #include <asm/idle.h>
+#include <asm/ipi.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
@@ -287,6 +291,54 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 	}
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC 
+/*
+ * Called after interrupts have been reenabled again
+ * when a MCE happened during an interrupts off region
+ * in the kernel.
+ */
+asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	exit_idle();
+	irq_enter();
+	mce_notify_user();
+	irq_exit();
+}
+#endif
+
+static void mce_report_event(struct pt_regs *regs)
+{
+	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
+		mce_notify_user();
+		return;
+	}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * Without APIC do not notify. The event will be picked
+	 * up eventually.
+	 */
+	if (!cpu_has_apic)
+		return;
+
+	/*
+	 * When interrupts are disabled we cannot use
+	 * kernel services safely. Trigger an self interrupt
+	 * through the APIC to instead do the notification
+	 * after interrupts are reenabled again.
+	 */
+	apic->send_IPI_self(MCE_SELF_VECTOR);
+
+	/*
+	 * Wait for idle afterwards again so that we don't leave the
+	 * APIC in a non idle state because the normal APIC writes
+	 * cannot exclude us.
+	 */
+	apic_wait_icr_idle();
+#endif
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -530,6 +582,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
+	mce_report_event(regs);
+
 	/* the last thing we do is clear state */
 	for (i = 0; i < banks; i++) {
 		if (test_bit(i, toclear))
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a31a7f29cffe..711c130a8411 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1011,6 +1011,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
+#ifdef CONFIG_X86_MCE
+apicinterrupt MCE_SELF_VECTOR \
+	mce_self_interrupt smp_mce_self_interrupt
+#endif
+
 #ifdef CONFIG_SMP
 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
 	call_function_single_interrupt smp_call_function_single_interrupt
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index aab3d277766c..441f6ec6e9d4 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,6 +187,9 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */

From f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:55 +0200
Subject: [PATCH 113/162] x86, mce: implement panic synchronization

In some circumstances multiple CPUs can enter mce_panic() in parallel.
This gives quite confused output because they will all dump the same
machine check buffer.

The other problem is that they would all panic in parallel, but not
process each other's shutdown IPIs because interrupts are disabled.

Detect this situation early on in mce_panic(). On the first CPU
entering will do the panic, the others will just wait to be killed.

For paranoia reasons in case the other CPU dies during the MCE I added
a 5 seconds timeout. If it expires each CPU will panic on its own again.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 121781627858..421020f1d7db 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -196,10 +196,32 @@ static void print_mce(struct mce *m)
 	       "and contact your hardware vendor\n");
 }
 
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_paniced;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+	preempt_disable();
+	local_irq_enable();
+	while (timeout-- > 0)
+		udelay(1);
+	panic("Panicing machine check CPU died");
+}
+
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
 	int i;
 
+	/*
+	 * Make sure only one CPU runs in machine check panic
+	 */
+	if (atomic_add_return(1, &mce_paniced) > 1)
+		wait_for_panic();
+	barrier();
+
 	bust_spinlocks(1);
 	console_verbose();
 	/* First print corrected ones that are still unlogged */

From 3c0797925f4ef9d55a32059d2af61a9c262e639d Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:55 +0200
Subject: [PATCH 114/162] x86, mce: switch x86 machine check handler to Monarch
 election.

On Intel platforms machine check exceptions are always broadcast to
all CPUs.  This patch makes the machine check handler synchronize all
these machine checks, elect a Monarch to handle the event and collect
the worst event from all CPUs and then process it first.

This has some advantages:

- When there is a truly data corrupting error the system panics as
  quickly as possible. This improves containment of corrupted
  data and makes sure the corrupted data never hits stable storage.

- The panics are synchronized and do not reenter the panic code
  on multiple CPUs (which currently does not handle this well).

- All the errors are reported. Currently it often happens that
  another CPU happens to do the panic first, but reports useless
  information (empty machine check) because the real error
  happened on another CPU which came in later.
  This is a big advantage on Nehalem where the 8 threads per CPU
  lead to often the wrong CPU winning the race and dumping
  useless information on a machine check.  The problem also occurs
  in a less severe form on older CPUs.

- The system can detect when no CPUs detected a machine check
  and shut down the system.  This can happen when one CPU is so
  badly hung that that it cannot process a machine check anymore
  or when some external agent wants to stop the system by
  asserting the machine check pin.  This follows Intel hardware
  recommendations.

- This matches the recommended error model by the CPU designers.

- The events can be output in true severity order

- When a panic happens on another CPU it makes sure to be actually
  be able to process the stop IPI by enabling interrupts.

The code is extremly careful to handle timeouts while waiting
for other CPUs. It can't rely on the normal timing mechanisms
(jiffies, ktime_get) because of its asynchronous/lockless nature,
so it uses own timeouts using ndelay() and a "SPINUNIT"

The timeout is configurable. By default it waits for upto one
second for the other CPUs.  This can be also disabled.

From some informal testing AMD systems do not see to broadcast
machine checks, so right now it's always disabled by default on
non Intel CPUs or also on very old Intel systems.

Includes fixes from Ying Huang
Fixed a "ecception" in a comment (H.Seto)
Moved global_nwo reset later based on suggestion from H.Seto
v2: Avoid duplicate messages

[ Impact: feature, fixes long standing problems. ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/x86/x86_64/boot-options.txt |   6 +-
 Documentation/x86/x86_64/machinecheck     |   4 +
 arch/x86/kernel/cpu/mcheck/mce.c          | 360 ++++++++++++++++++++--
 3 files changed, 340 insertions(+), 30 deletions(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 63fca718256e..0ee5e3b212f3 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -15,13 +15,17 @@ Machine check
                in a reboot. On Intel systems it is enabled by default.
    mce=nobootlog
 		Disable boot machine check logging.
-   mce=tolerancelevel (number)
+   mce=tolerancelevel[,monarchtimeout] (number,number)
+		tolerance levels:
 		0: always panic on uncorrected errors, log corrected errors
 		1: panic or SIGBUS on uncorrected errors, log corrected errors
 		2: SIGBUS or log uncorrected errors, log corrected errors
 		3: never panic or SIGBUS, log all errors (for testing only)
 		Default is 1
 		Can be also set using sysfs which is preferable.
+		monarchtimeout:
+		Sets the time in us to wait for other CPUs on machine checks. 0
+		to disable.
 
    nomce (for compatibility with i386): same as mce=off
 
diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck
index a4fdb25446e0..b1fb30273286 100644
--- a/Documentation/x86/x86_64/machinecheck
+++ b/Documentation/x86/x86_64/machinecheck
@@ -69,6 +69,10 @@ trigger
 	Program to run when a machine check event is detected.
 	This is an alternative to running mcelog regularly from cron
 	and allows to detect events faster.
+monarch_timeout
+	How long to wait for the other CPUs to machine check too on a
+	exception. 0 to disable waiting for other CPUs.
+	Unit: us
 
 TBD document entries for AMD threshold interrupt configuration
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 421020f1d7db..ba431893e31d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/delay.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -28,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
+#include <linux/nmi.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
@@ -60,6 +62,8 @@ int				mce_disabled;
 
 #define MISC_MCELOG_MINOR	227
 
+#define SPINUNIT 100	/* 100ns */
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64			*bank;
 static unsigned long		notify_user;
 static int			rip_msr;
 static int			mce_bootlog = -1;
+static int			monarch_timeout = -1;
 
 static char			trigger[128];
 static char			*trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char			*trigger_argv[2] = { trigger, NULL };
 static unsigned long		dont_init_banks;
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int			cpu_missing;
+
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 	}
 	if (final)
 		print_mce(final);
+	if (cpu_missing)
+		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	if (exp)
 		printk(KERN_EMERG "Machine check: %s\n", exp);
 	panic(msg);
@@ -450,6 +460,264 @@ static int mce_no_way_out(struct mce *m, char **msg)
 	return 0;
 }
 
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+	/*
+	 * The others already did panic for some reason.
+	 * Bail out like in a timeout.
+	 * rmb() to tell the compiler that system_state
+	 * might have been modified by someone else.
+	 */
+	rmb();
+	if (atomic_read(&mce_paniced))
+		wait_for_panic();
+	if (!monarch_timeout)
+		goto out;
+	if ((s64)*t < SPINUNIT) {
+		/* CHECKME: Make panic default for 1 too? */
+		if (tolerant < 1)
+			mce_panic("Timeout synchronizing machine check over CPUs",
+				  NULL, NULL);
+		cpu_missing = 1;
+		return 1;
+	}
+	*t -= SPINUNIT;
+out:
+	touch_nmi_watchdog();
+	return 0;
+}
+
+/*
+ * The Monarch's reign.  The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of an machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+	int cpu;
+	struct mce *m = NULL;
+	int global_worst = 0;
+	char *msg = NULL;
+	char *nmsg = NULL;
+
+	/*
+	 * This CPU is the Monarch and the other CPUs have run
+	 * through their handlers.
+	 * Grade the severity of the errors of all the CPUs.
+	 */
+	for_each_possible_cpu(cpu) {
+		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+					    &nmsg);
+		if (severity > global_worst) {
+			msg = nmsg;
+			global_worst = severity;
+			m = &per_cpu(mces_seen, cpu);
+		}
+	}
+
+	/*
+	 * Cannot recover? Panic here then.
+	 * This dumps all the mces in the log buffer and stops the
+	 * other CPUs.
+	 */
+	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+		mce_panic("Fatal machine check", m, msg);
+
+	/*
+	 * For UC somewhere we let the CPU who detects it handle it.
+	 * Also must let continue the others, otherwise the handling
+	 * CPU could deadlock on a lock.
+	 */
+
+	/*
+	 * No machine check event found. Must be some external
+	 * source or one CPU is hung. Panic.
+	 */
+	if (!m && tolerant < 3)
+		mce_panic("Machine check from unknown source", NULL, NULL);
+
+	/*
+	 * Now clear all the mces_seen so that they don't reappear on
+	 * the next mce.
+	 */
+	for_each_possible_cpu(cpu)
+		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+	int nwo;
+	int cpus = num_online_cpus();
+	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+	if (!timeout) {
+		*order = -1;
+		return no_way_out;
+	}
+
+	atomic_add(no_way_out, &global_nwo);
+
+	/*
+	 * Wait for everyone.
+	 */
+	while (atomic_read(&mce_callin) != cpus) {
+		if (mce_timed_out(&timeout)) {
+			atomic_set(&global_nwo, 0);
+			*order = -1;
+			return no_way_out;
+		}
+		ndelay(SPINUNIT);
+	}
+
+	/*
+	 * Cache the global no_way_out state.
+	 */
+	nwo = atomic_read(&global_nwo);
+
+	/*
+	 * Monarch starts executing now, the others wait.
+	 */
+	if (*order == 1) {
+		atomic_set(&mce_executing, 1);
+		return nwo;
+	}
+
+	/*
+	 * Now start the scanning loop one by one
+	 * in the original callin order.
+	 * This way when there are any shared banks it will
+	 * be only seen by one CPU before cleared, avoiding duplicates.
+	 */
+	while (atomic_read(&mce_executing) < *order) {
+		if (mce_timed_out(&timeout)) {
+			atomic_set(&global_nwo, 0);
+			*order = -1;
+			return no_way_out;
+		}
+		ndelay(SPINUNIT);
+	}
+	return nwo;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+	int ret = -1;
+	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+	if (!timeout)
+		goto reset;
+	if (order < 0)
+		goto reset;
+
+	/*
+	 * Allow others to run.
+	 */
+	atomic_inc(&mce_executing);
+
+	if (order == 1) {
+		/* CHECKME: Can this race with a parallel hotplug? */
+		int cpus = num_online_cpus();
+
+		/*
+		 * Monarch: Wait for everyone to go through their scanning
+		 * loops.
+		 */
+		while (atomic_read(&mce_executing) <= cpus) {
+			if (mce_timed_out(&timeout))
+				goto reset;
+			ndelay(SPINUNIT);
+		}
+
+		mce_reign();
+		barrier();
+		ret = 0;
+	} else {
+		/*
+		 * Subject: Wait for Monarch to finish.
+		 */
+		while (atomic_read(&mce_executing) != 0) {
+			if (mce_timed_out(&timeout))
+				goto reset;
+			ndelay(SPINUNIT);
+		}
+
+		/*
+		 * Don't reset anything. That's done by the Monarch.
+		 */
+		return 0;
+	}
+
+	/*
+	 * Reset all global state.
+	 */
+reset:
+	atomic_set(&global_nwo, 0);
+	atomic_set(&mce_callin, 0);
+	barrier();
+
+	/*
+	 * Let others run again.
+	 */
+	atomic_set(&mce_executing, 0);
+	return ret;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+	int i;
+
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -457,12 +725,23 @@ static int mce_no_way_out(struct mce *m, char **msg)
  * This is executed in NMI context not subject to normal locking rules. This
  * implies that most kernel services cannot be safely used. Don't even
  * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
  */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
-	struct mce m, panicm;
-	int panicm_found = 0;
+	struct mce m, *final;
 	int i;
+	int worst = 0;
+	int severity;
+	/*
+	 * Establish sequential order between the CPUs entering the machine
+	 * check handler.
+	 */
+	int order;
+
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
 	 * MCE.  If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (!banks)
 		goto out;
 
+	order = atomic_add_return(1, &mce_callin);
 	mce_setup(&m);
 
 	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	no_way_out = mce_no_way_out(&m, &msg);
 
+	final = &__get_cpu_var(mces_seen);
+	*final = m;
+
 	barrier();
 
+	/*
+	 * Go through all the banks in exclusion of the other CPUs.
+	 * This way we don't report duplicated events on shared banks
+	 * because the first one to see it will clear it.
+	 */
+	no_way_out = mce_start(no_way_out, &order);
 	for (i = 0; i < banks; i++) {
 		__clear_bit(i, toclear);
 		if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
-		/*
-		 * Did this bank cause the exception?
-		 *
-		 * Assume that the bank with uncorrectable errors did it,
-		 * and that there is only a single one:
-		 */
-		if ((m.status & MCI_STATUS_UC) &&
-					(m.status & MCI_STATUS_EN)) {
-			panicm = m;
-			panicm_found = 1;
+		severity = mce_severity(&m, tolerant, NULL);
+		if (severity > worst) {
+			*final = m;
+			worst = severity;
 		}
 	}
 
+	if (!no_way_out)
+		mce_clear_state(toclear);
+
 	/*
-	 * If we didn't find an uncorrectable error, pick
-	 * the last one (shouldn't happen, just being safe).
+	 * Do most of the synchronization with other CPUs.
+	 * When there's any problem use only local no_way_out state.
 	 */
-	if (!panicm_found)
-		panicm = m;
+	if (mce_end(order) < 0)
+		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
 	 * If we have decided that we just CAN'T continue, and the user
 	 * has not set tolerant to an insane level, give up and die.
+	 *
+	 * This is mainly used in the case when the system doesn't
+	 * support MCE broadcasting or it has been disabled.
 	 */
 	if (no_way_out && tolerant < 3)
-		mce_panic("Machine check", &panicm, msg);
+		mce_panic("Machine check", final, msg);
 
 	/*
 	 * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 * instruction which caused the MCE.
 		 */
 		if (m.mcgstatus & MCG_STATUS_EIPV)
-			user_space = panicm.ip && (panicm.cs & 3);
+			user_space = final->ip && (final->cs & 3);
 
 		/*
 		 * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (user_space) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
-			mce_panic("Uncorrected machine check", &panicm, msg);
+			mce_panic("Uncorrected machine check", final, msg);
 		}
 	}
 
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
-	mce_report_event(regs);
-
-	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++) {
-		if (test_bit(i, toclear))
-			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-	}
+	if (worst > 0)
+		mce_report_event(regs);
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
 	atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 
 		if (c->x86 == 6 && c->x86_model < 0x1A)
 			__set_bit(0, &dont_init_banks);
+
+		/*
+		 * All newer Intel systems support MCE broadcasting. Enable
+		 * synchronization with a one second timeout.
+		 */
+		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+			monarch_timeout < 0)
+			monarch_timeout = USEC_PER_SEC;
 	}
+	if (monarch_timeout < 0)
+		monarch_timeout = 0;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
 
 /*
  * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *	monarchtimeout is how long to wait for other CPUs on machine
+ *	check, or 0 to not wait
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
  */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
 		mce_disabled = 1;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 		mce_bootlog = (str[0] == 'b');
-	else if (isdigit(str[0]))
+	else if (isdigit(str[0])) {
 		get_option(&str, &tolerant);
-	else {
+		if (*str == ',') {
+			++str;
+			get_option(&str, &monarch_timeout);
+		}
+	} else {
 		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
 		       str);
 		return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
 
 static struct sysdev_ext_attribute attr_check_interval = {
 	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
 
 static struct sysdev_attribute *mce_attrs[] = {
 	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+	&attr_monarch_timeout.attr,
 	NULL
 };
 

From ac9603754dc7e286e62ae4f1067958d5b0075f99 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:58 +0200
Subject: [PATCH 115/162] x86, mce: make non Monarch panic message "Fatal
 machine check" too

... instead of "Machine check". This is for consistency with the Monarch
panic message.

Based on a report from Ying Huang.

v2: But add a descriptive postfix so that the test suite can distingush.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ba431893e31d..d5cb0b4c17ff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -554,7 +554,7 @@ static void mce_reign(void)
 	 * other CPUs.
 	 */
 	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
-		mce_panic("Fatal machine check", m, msg);
+		mce_panic("Fatal Machine check", m, msg);
 
 	/*
 	 * For UC somewhere we let the CPU who detects it handle it.
@@ -858,7 +858,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * support MCE broadcasting or it has been disabled.
 	 */
 	if (no_way_out && tolerant < 3)
-		mce_panic("Machine check", final, msg);
+		mce_panic("Fatal machine check on current CPU", final, msg);
 
 	/*
 	 * If the error seems to be unrecoverable, something should be

From 1b2797dcc9f0ad89bc382ace26c6baafbc7e33c2 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 27 May 2009 21:56:51 +0200
Subject: [PATCH 116/162] x86, mce: improve mce_get_rip

Assume IP on the stack is valid when either EIPV or RIPV are set.
This influences whether the machine check exception handler decides
to return or panic.

This fixes a test case in the mce-test suite and is more compliant
to the specification.

This currently only makes a difference in a artificial testing
scenario with the mce-test test suite.

Also in addition do not force the EIPV to be valid with the exact
register MSRs, and keep in trust the CS value on stack even if MSR
is available.

[AK: combination of patches from Huang Ying and Hidetoshi Seto, with
new description by me]
[add some description, no code changed - HS]

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d5cb0b4c17ff..a7dc369a9974 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -306,21 +306,22 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+/*
+ * Get the address of the instruction at the time of the machine check
+ * error.
+ */
 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 {
-	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+
+	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
 		m->ip = regs->ip;
 		m->cs = regs->cs;
 	} else {
 		m->ip = 0;
 		m->cs = 0;
 	}
-	if (rip_msr) {
-		/* Assume the RIP in the MSR is exact. Is this true? */
-		m->mcgstatus |= MCG_STATUS_EIPV;
+	if (rip_msr)
 		m->ip = mce_rdmsrl(rip_msr);
-		m->cs = 0;
-	}
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC 

From 29b0f591d678838435fbb3e15ef20266f1a9e01d Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:56 +0200
Subject: [PATCH 117/162] x86, mce: default to panic timeout for machine checks

Fatal machine checks can be logged to disk after boot, but only if
the system did a warm reboot. That's unfortunately difficult with the
default panic behaviour, which waits forever and the admin has to
press the power button because modern systems usually miss a reset button.
This clears the machine checks in the registers and make
it impossible to log them.

This patch changes the default for machine check panic to always
reboot after 30s. Then the mce can be successfully logged after
reboot.

I believe this will improve machine check experience for any
system running the X server.

This is dependent on successfull boot logging of MCEs. This currently
only works on Intel systems, on AMD there are quite a lot of systems
around which leave junk in the machine check registers after boot,
so it's disabled here. These systems will continue to default
to endless waiting panic.

v2: Only force panic timeout when it's shorter (H.Seto)
v3: Only force timeout when there is no timeout
(based on comment H.Seto)

[ Fix changelog - HS ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7dc369a9974..79d243145b8f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -82,6 +82,7 @@ static unsigned long		notify_user;
 static int			rip_msr;
 static int			mce_bootlog = -1;
 static int			monarch_timeout = -1;
+static int			mce_panic_timeout;
 
 static char			trigger[128];
 static char			*trigger_argv[2] = { trigger, NULL };
@@ -216,6 +217,8 @@ static void wait_for_panic(void)
 	local_irq_enable();
 	while (timeout-- > 0)
 		udelay(1);
+	if (panic_timeout == 0)
+		panic_timeout = mce_panic_timeout;
 	panic("Panicing machine check CPU died");
 }
 
@@ -253,6 +256,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	if (exp)
 		printk(KERN_EMERG "Machine check: %s\n", exp);
+	if (panic_timeout == 0)
+		panic_timeout = mce_panic_timeout;
 	panic(msg);
 }
 
@@ -1117,6 +1122,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 	}
 	if (monarch_timeout < 0)
 		monarch_timeout = 0;
+	if (mce_bootlog != 0)
+		mce_panic_timeout = 30;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)

From 86503560e48153aba539ff117450d31ab2ef76d7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:58 +0200
Subject: [PATCH 118/162] x86, mce: print header/footer only once for multiple
 MCEs

When multiple MCEs are printed print the "HARDWARE ERROR" header
and "This is not a software error" footer only once. This
makes the output much more compact with many CPUs.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 79d243145b8f..ff9c732989de 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -176,11 +176,13 @@ void mce_log(struct mce *mce)
 	set_bit(0, &notify_user);
 }
 
-static void print_mce(struct mce *m)
+static void print_mce(struct mce *m, int *first)
 {
-	printk(KERN_EMERG "\n"
-	       KERN_EMERG "HARDWARE ERROR\n"
-	       KERN_EMERG
+	if (*first) {
+		printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+		*first = 0;
+	}
+	printk(KERN_EMERG
 	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 	       m->extcpu, m->mcgstatus, m->bank, m->status);
 	if (m->ip) {
@@ -200,9 +202,12 @@ static void print_mce(struct mce *m)
 	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
 			m->cpuvendor, m->cpuid, m->time, m->socketid,
 			m->apicid);
-	printk(KERN_EMERG "This is not a software problem!\n");
-	printk(KERN_EMERG "Run through mcelog --ascii to decode "
-	       "and contact your hardware vendor\n");
+}
+
+static void print_mce_tail(void)
+{
+	printk(KERN_EMERG "This is not a software problem!\n"
+	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 }
 
 #define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -225,6 +230,7 @@ static void wait_for_panic(void)
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
 	int i;
+	int first = 1;
 
 	/*
 	 * Make sure only one CPU runs in machine check panic
@@ -240,7 +246,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		struct mce *m = &mcelog.entry[i];
 		if ((m->status & MCI_STATUS_VAL) &&
 			!(m->status & MCI_STATUS_UC))
-			print_mce(m);
+			print_mce(m, &first);
 	}
 	/* Now print uncorrected but with the final one last */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -248,12 +254,13 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		if (!(m->status & MCI_STATUS_VAL))
 			continue;
 		if (!final || memcmp(m, final, sizeof(struct mce)))
-			print_mce(m);
+			print_mce(m, &first);
 	}
 	if (final)
-		print_mce(final);
+		print_mce(final, &first);
 	if (cpu_missing)
 		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
+	print_mce_tail();
 	if (exp)
 		printk(KERN_EMERG "Machine check: %s\n", exp);
 	if (panic_timeout == 0)

From ed7290d0ee8f81aa78bfe816f01b012f208cafc5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:57 +0200
Subject: [PATCH 119/162] x86, mce: implement new status bits

The x86 architecture recently added some new machine check status bits:
S(ignalled) and AR (Action-Required). Signalled allows to check
if a specific event caused an exception or was just logged through CMCI.
AR allows the kernel to decide if an event needs immediate action
or can be delayed or ignored.

Implement support for these new status bits. mce_severity() uses
the new bits to grade the machine check correctly and decide what
to do. The exception handler uses AR to decide to kill or not.
The S bit is used to separate events between the poll/CMCI handler
and the exception handler.

Classical UC always leads to panic. That was true before anyways
because the existing CPUs always passed a PCC with it.

Also corrects the rules whether to kill in user or kernel context
and how to handle missing RIPV.

The machine check handler largely uses the mce-severity grading
engine now instead of making its own decisions. This means the logic
is centralized in one place.  This is useful because it has to be
evaluated multiple times.

v2: Some rule fixes; Add AO events
Fix RIPV, RIPV|EIPV order (Ying Huang)
Fix UCNA with AR=1 message (Ying Huang)
Add comment about panicing in m_c_p.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h                | 10 +++
 arch/x86/kernel/cpu/mcheck/mce-internal.h |  5 ++
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 82 ++++++++++++++++++++--
 arch/x86/kernel/cpu/mcheck/mce.c          | 84 ++++++++++++-----------
 4 files changed, 137 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ba1f8890cf51..afd3cdf6f8ad 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -15,6 +15,7 @@
 #define MCG_EXT_CNT_MASK	0xff0000     /* Number of Extended registers */
 #define MCG_EXT_CNT_SHIFT	16
 #define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
+#define MCG_SER_P	 	(1ULL<<24)   /* MCA recovery/new status bits */
 
 #define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
@@ -27,6 +28,15 @@
 #define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
 #define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
 #define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF  0	/* segment offset */
+#define MCM_ADDR_LINEAR  1	/* linear address */
+#define MCM_ADDR_PHYS	 2	/* physical address */
+#define MCM_ADDR_MEM	 3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 /* Fields are zero when not available */
 struct mce {
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index f126b4ae7a25..54dcb8ff12e5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -2,9 +2,14 @@
 
 enum severity_level {
 	MCE_NO_SEVERITY,
+	MCE_KEEP_SEVERITY,
 	MCE_SOME_SEVERITY,
+	MCE_AO_SEVERITY,
 	MCE_UC_SEVERITY,
+	MCE_AR_SEVERITY,
 	MCE_PANIC_SEVERITY,
 };
 
 int mce_severity(struct mce *a, int tolerant, char **msg);
+
+extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index c189e89a89ae..4f4d2caf4043 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -19,43 +19,117 @@
  * first. Since there are quite a lot of combinations test the bits in a
  * table-driven way. The rules are simply processed in order, first
  * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
  */
 
+enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+
 static struct severity {
 	u64 mask;
 	u64 result;
 	unsigned char sev;
 	unsigned char mcgmask;
 	unsigned char mcgres;
+	unsigned char ser;
+	unsigned char context;
 	char *msg;
 } severities[] = {
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
 #define SEV(s) .sev = MCE_ ## s ## _SEVERITY
 #define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
 #define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
 #define MCGMASK(x, res, s, m, r...) \
 	{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
+#define MASK(x, y, s, m, r...) \
+	{ .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCACOD 0xffff
+
 	BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
 	BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
 	BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
-	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"),
+	/* When MCIP is not set something is very confused */
+	MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+	/* Neither return not error IP -- no chance to recover -> PANIC */
+	MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
+		"Neither restart nor error IP"),
+	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
+		KERNEL),
+	BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
+	MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
+	     "Spurious not enabled", SER),
+
+	/* ignore OVER for UCNA */
+	MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
+	     "Uncorrected no action required", SER),
+	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
+	     "Illegal combination (UCNA with AR=1)", SER),
+	MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+
+	/* AR add known MCACODs here */
+	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
+	     "Action required with lost events", SER),
+	MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
+	     "Action required; unknown MCACOD", SER),
+
+	/* known AO MCACODs: */
+	MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
+	     "Action optional: memory scrubbing error", SER),
+	MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
+	     "Action optional: last level cache writeback error", SER),
+
+	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
+	     "Action optional unknown MCACOD", SER),
+	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
+	     "Action optional with lost events", SER),
 	BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
 	BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
 	BITSET(0, SOME, "No match")	/* always matches. keep at end */
 };
 
+/*
+ * If the EIPV bit is set, it means the saved IP is the
+ * instruction which caused the MCE.
+ */
+static int error_context(struct mce *m)
+{
+	if (m->mcgstatus & MCG_STATUS_EIPV)
+		return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+	/* Unknown, assume kernel */
+	return IN_KERNEL;
+}
+
 int mce_severity(struct mce *a, int tolerant, char **msg)
 {
+	enum context ctx = error_context(a);
 	struct severity *s;
+
 	for (s = severities;; s++) {
 		if ((a->status & s->mask) != s->result)
 			continue;
 		if ((a->mcgstatus & s->mcgmask) != s->mcgres)
 			continue;
-		if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) &&
-			tolerant < 1)
-			return MCE_PANIC_SEVERITY;
+		if (s->ser == SER_REQUIRED && !mce_ser)
+			continue;
+		if (s->ser == NO_SER && mce_ser)
+			continue;
+		if (s->context && ctx != s->context)
+			continue;
 		if (msg)
 			*msg = s->msg;
+		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
+			if (panic_on_oops || tolerant < 1)
+				return MCE_PANIC_SEVERITY;
+		}
 		return s->sev;
 	}
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff9c732989de..f051a7807ab4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -83,6 +83,7 @@ static int			rip_msr;
 static int			mce_bootlog = -1;
 static int			monarch_timeout = -1;
 static int			mce_panic_timeout;
+int				mce_ser;
 
 static char			trigger[128];
 static char			*trigger_argv[2] = { trigger, NULL };
@@ -391,6 +392,15 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
  * Those are just logged through /dev/mcelog.
  *
  * This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll hander -- * so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally * confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
  */
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
@@ -417,13 +427,13 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 			continue;
 
 		/*
-		 * Uncorrected events are handled by the exception handler
-		 * when it is enabled. But when the exception is disabled log
-		 * everything.
+		 * Uncorrected or signalled events are handled by the exception
+		 * handler when it is enabled, so don't process those here.
 		 *
 		 * TBD do the same check for MCI_STATUS_EN here?
 		 */
-		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+		if (!(flags & MCP_UC) &&
+		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -789,6 +799,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	barrier();
 
+	/*
+	 * When no restart IP must always kill or panic.
+	 */
+	if (!(m.mcgstatus & MCG_STATUS_RIPV))
+		kill_it = 1;
+
 	/*
 	 * Go through all the banks in exclusion of the other CPUs.
 	 * This way we don't report duplicated events on shared banks
@@ -809,10 +825,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 
 		/*
-		 * Non uncorrected errors are handled by machine_check_poll
-		 * Leave them alone, unless this panics.
+		 * Non uncorrected or non signaled errors are handled by
+		 * machine_check_poll. Leave them alone, unless this panics.
 		 */
-		if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out)
+		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+			!no_way_out)
 			continue;
 
 		/*
@@ -820,17 +837,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 */
 		add_taint(TAINT_MACHINE_CHECK);
 
-		__set_bit(i, toclear);
+		severity = mce_severity(&m, tolerant, NULL);
 
-		if (m.status & MCI_STATUS_EN) {
-			/*
-			 * If this error was uncorrectable and there was
-			 * an overflow, we're in trouble.  If no overflow,
-			 * we might get away with just killing a task.
-			 */
-			if (m.status & MCI_STATUS_UC)
-				kill_it = 1;
-		} else {
+		/*
+		 * When machine check was for corrected handler don't touch,
+		 * unless we're panicing.
+		 */
+		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
+			continue;
+		__set_bit(i, toclear);
+		if (severity == MCE_NO_SEVERITY) {
 			/*
 			 * Machine check event was not enabled. Clear, but
 			 * ignore.
@@ -838,6 +854,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
+		/*
+		 * Kill on action required.
+		 */
+		if (severity == MCE_AR_SEVERITY)
+			kill_it = 1;
+
 		if (m.status & MCI_STATUS_MISCV)
 			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
 		if (m.status & MCI_STATUS_ADDRV)
@@ -846,7 +868,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
-		severity = mce_severity(&m, tolerant, NULL);
 		if (severity > worst) {
 			*final = m;
 			worst = severity;
@@ -879,29 +900,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * one task, do that.  If the user has set the tolerance very
 	 * high, don't try to do anything at all.
 	 */
-	if (kill_it && tolerant < 3) {
-		int user_space = 0;
 
-		/*
-		 * If the EIPV bit is set, it means the saved IP is the
-		 * instruction which caused the MCE.
-		 */
-		if (m.mcgstatus & MCG_STATUS_EIPV)
-			user_space = final->ip && (final->cs & 3);
-
-		/*
-		 * If we know that the error was in user space, send a
-		 * SIGBUS.  Otherwise, panic if tolerance is low.
-		 *
-		 * force_sig() takes an awful lot of locks and has a slight
-		 * risk of deadlocking.
-		 */
-		if (user_space) {
-			force_sig(SIGBUS, current);
-		} else if (panic_on_oops || tolerant < 2) {
-			mce_panic("Uncorrected machine check", final, msg);
-		}
-	}
+	if (kill_it && tolerant < 3)
+		force_sig(SIGBUS, current);
 
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
@@ -1049,6 +1050,9 @@ static int mce_cap_init(void)
 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
+	if (cap & MCG_SER_P)
+		mce_ser = 1;
+
 	return 0;
 }
 

From 4611a6fa4b37cf6b8b6066ed0d605c994c62a1a0 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 27 May 2009 21:56:57 +0200
Subject: [PATCH 120/162] x86, mce: export MCE severities coverage via debugfs

The MCE severity judgement code is data-driven, so code coverage tools
such as gcov can not be used for measuring coverage. Instead a dedicated
coverage mechanism is implemented.  The kernel keeps track of rules
executed and reports them in debugfs.

This is useful for increasing coverage of the mce-test testsuite.

Right now it's unconditionally enabled because it's very little code.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 83 +++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4f4d2caf4043..ff0807f97056 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -10,6 +10,9 @@
  * Author: Andi Kleen
  */
 #include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
@@ -37,6 +40,7 @@ static struct severity {
 	unsigned char mcgres;
 	unsigned char ser;
 	unsigned char context;
+	unsigned char covered;
 	char *msg;
 } severities[] = {
 #define KERNEL .context = IN_KERNEL
@@ -126,6 +130,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
 			continue;
 		if (msg)
 			*msg = s->msg;
+		s->covered = 1;
 		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
 			if (panic_on_oops || tolerant < 1)
 				return MCE_PANIC_SEVERITY;
@@ -133,3 +138,81 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
 		return s->sev;
 	}
 }
+
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+	if (*pos >= ARRAY_SIZE(severities))
+		return NULL;
+	return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+	if (++(*pos) >= ARRAY_SIZE(severities))
+		return NULL;
+	return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+	struct severity *ser = data;
+	seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+	return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+	.start	= s_start,
+	.next	= s_next,
+	.stop	= s_stop,
+	.show	= s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+					 const char __user *ubuf,
+					 size_t count, loff_t *ppos)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(severities); i++)
+		severities[i].covered = 0;
+	return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+	.open		= severities_coverage_open,
+	.release	= seq_release,
+	.read		= seq_read,
+	.write		= severities_coverage_write,
+};
+
+static int __init severities_debugfs_init(void)
+{
+	struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+
+	dmce = debugfs_create_dir("mce", NULL);
+	if (dmce == NULL)
+		goto err_out;
+	fseverities_coverage = debugfs_create_file("severities-coverage",
+						   0444, dmce, NULL,
+						   &severities_coverage_fops);
+	if (fseverities_coverage == NULL)
+		goto err_out;
+
+	return 0;
+
+err_out:
+	if (fseverities_coverage)
+		debugfs_remove(fseverities_coverage);
+	if (dmce)
+		debugfs_remove(dmce);
+	return -ENOMEM;
+}
+late_initcall(severities_debugfs_init);

From 4ef702c10b5df18ab04921fc252c26421d4d6c75 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:52 +0200
Subject: [PATCH 121/162] x86: fix panic with interrupts off (needed for MCE)

For some time each panic() called with interrupts disabled
triggered the !irqs_disabled() WARN_ON in smp_call_function(),
producing ugly backtraces and confusing users.

This is a common situation with machine checks for example which
tend to call panic with interrupts disabled, but will also hit
in other situations e.g. panic during early boot.  In fact it
means that panic cannot be called in many circumstances, which
would be bad.

This all started with the new fancy queued smp_call_function,
which is then used by the shutdown path to shut down the other
CPUs.

On closer examination it turned out that the fancy RCU
smp_call_function() does lots of things not suitable in a panic
situation anyways, like allocating memory and relying on complex
system state.

I originally tried to patch this over by checking for panic
there, but it was quite complicated and the original patch
was also not very popular.  This also didn't fix some of the
underlying complexity problems.

The new code in post 2.6.29 tries to patch around this by
checking for oops_in_progress, but that is not enough to make
this fully safe and I don't think that's a real solution
because panic has to be reliable.

So instead use an own vector to reboot.  This makes the reboot
code extremly straight forward, which is definitely a big plus
in a panic situation where it is important to avoid relying on
too much kernel state.  The new simple code is also safe to be
called from interupts off region because it is very very simple.

There can be situations where it is important that panic
is reliable.  For example on a fatal machine check the panic
is needed to get the system up again and running as quickly
as possible.  So it's important that panic is reliable and
all function it calls simple.

This is why I came up with this simple vector scheme.
It's very hard to beat in simplicity.  Vectors are not
particularly precious anymore since all big systems are
using per CPU vectors.

Another possibility would have been to use an NMI similar
to kdump, but there is still the problem that NMIs don't
work reliably on some systems due to BIOS issues.  NMIs
would have been able to stop CPUs running with interrupts
off too.  In the sake of universal reliability I opted for
using a non NMI vector for now.

I put the reboot vector into the highest priority bucket of
the APIC vectors and moved the 64bit UV_BAU message down
instead into the next lower priority.

[ Impact: bug fix, fixes an old regression ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/entry_arch.h  |  1 +
 arch/x86/include/asm/hw_irq.h      |  1 +
 arch/x86/include/asm/irq_vectors.h |  9 +++------
 arch/x86/kernel/entry_64.S         |  2 ++
 arch/x86/kernel/irqinit.c          |  3 +++
 arch/x86/kernel/smp.c              | 28 +++++++++++++++++++++++++++-
 6 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 4cdcf5a3c96b..69f886805ecb 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
 BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
 		 smp_invalidate_interrupt)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 4e59197e29ba..1c8f28a63058 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,7 @@ extern void invalidate_interrupt6(void);
 extern void invalidate_interrupt7(void);
 
 extern void irq_move_cleanup_interrupt(void);
+extern void reboot_interrupt(void);
 extern void threshold_interrupt(void);
 
 extern void call_function_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 68f7cf84a333..28477e4f2d49 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -88,12 +88,7 @@
 #define CALL_FUNCTION_SINGLE_VECTOR	0xfb
 #define THERMAL_APIC_VECTOR		0xfa
 #define THRESHOLD_APIC_VECTOR		0xf9
-
-#ifdef CONFIG_X86_32
-/* 0xf8 : free */
-#else
-# define UV_BAU_MESSAGE			0xf8
-#endif
+#define REBOOT_VECTOR			0xf8
 
 /* f0-f7 used for spreading out TLB flushes: */
 #define INVALIDATE_TLB_VECTOR_END	0xf7
@@ -117,6 +112,8 @@
  */
 #define GENERIC_INTERRUPT_VECTOR	0xed
 
+#define UV_BAU_MESSAGE			0xec
+
 /*
  * Self IPI vector for machine checks
  */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 711c130a8411..4234b1235652 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -976,6 +976,8 @@ END(\sym)
 #ifdef CONFIG_SMP
 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
 	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt REBOOT_VECTOR \
+	reboot_interrupt smp_reboot_interrupt
 #endif
 
 #ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 441f6ec6e9d4..4a69ec55be3d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -173,6 +173,9 @@ static void __init smp_intr_init(void)
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+
+	/* IPI used for rebooting/stopping */
+	alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
 #endif
 #endif /* CONFIG_SMP */
 }
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index f6db48c405b8..bf1831aa14fa 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
  * this function calls the 'stop' function on all other CPUs in the system.
  */
 
+asmlinkage void smp_reboot_interrupt(void)
+{
+	ack_APIC_irq();
+	irq_enter();
+	stop_this_cpu(NULL);
+	irq_exit();
+}
+
 static void native_smp_send_stop(void)
 {
 	unsigned long flags;
+	unsigned long wait;
 
 	if (reboot_force)
 		return;
 
-	smp_call_function(stop_this_cpu, NULL, 0);
+	/*
+	 * Use an own vector here because smp_call_function
+	 * does lots of things not suitable in a panic situation.
+	 * On most systems we could also use an NMI here,
+	 * but there are a few systems around where NMI
+	 * is problematic so stay with an non NMI for now
+	 * (this implies we cannot stop CPUs spinning with irq off
+	 * currently)
+	 */
+	if (num_online_cpus() > 1) {
+		apic->send_IPI_allbutself(REBOOT_VECTOR);
+
+		/* Don't wait longer than a second */
+		wait = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && wait--)
+			udelay(1);
+	}
+
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);

From 9ff36ee9668ff41ec3274597c730524645929b0f Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:58 +0200
Subject: [PATCH 122/162] x86, mce: rename mce_notify_user to mce_notify_irq

Rename the mce_notify_user function to mce_notify_irq. The next
patch will split the wakeup handling of interrupt context
and of process context and it's better to give it a clearer
name for this.

Contains a fix from Ying Huang

[ Impact: cleanup ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h                |  2 +-
 arch/x86/kernel/cpu/mcheck/mce-inject.c   |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c          | 10 +++++-----
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c |  2 +-
 arch/x86/kernel/signal.c                  |  2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index afd3cdf6f8ad..713926b62cbb 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -159,7 +159,7 @@ enum mcp_flags {
 };
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
-int mce_notify_user(void);
+int mce_notify_irq(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 extern struct file_operations mce_chrdev_ops;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 7d858fb4ce67..a3a235a53f09 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -65,7 +65,7 @@ static void raise_mce(unsigned long data)
 		memset(&b, 0xff, sizeof(mce_banks_t));
 		printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
 		machine_check_poll(0, &b);
-		mce_notify_user();
+		mce_notify_irq();
 		printk(KERN_INFO "Finished machine check poll on CPU %d\n",
 		       cpu);
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index f051a7807ab4..13e1b7ffe73a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -348,7 +348,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 	ack_APIC_irq();
 	exit_idle();
 	irq_enter();
-	mce_notify_user();
+	mce_notify_irq();
 	irq_exit();
 }
 #endif
@@ -356,7 +356,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
-		mce_notify_user();
+		mce_notify_irq();
 		return;
 	}
 
@@ -968,7 +968,7 @@ static void mcheck_timer(unsigned long data)
 	 * polling interval, otherwise increase the polling interval.
 	 */
 	n = &__get_cpu_var(next_interval);
-	if (mce_notify_user())
+	if (mce_notify_irq())
 		*n = max(*n/2, HZ/100);
 	else
 		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
@@ -989,7 +989,7 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  * Can be called from interrupt context, but not from machine check/NMI
  * context.
  */
-int mce_notify_user(void)
+int mce_notify_irq(void)
 {
 	/* Not more than two messages every minute */
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
@@ -1014,7 +1014,7 @@ int mce_notify_user(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(mce_notify_user);
+EXPORT_SYMBOL_GPL(mce_notify_irq);
 
 /*
  * Initialize Machine Checks for a CPU.
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index eff3740501a3..b7c5a2470b40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -80,7 +80,7 @@ static int cmci_supported(int *banks)
 static void intel_threshold_interrupt(void)
 {
 	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
-	mce_notify_user();
+	mce_notify_irq();
 }
 
 static void print_update(char *type, int *hdr, int num)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d0851e3f77eb..d5dc15bce005 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 #ifdef CONFIG_X86_NEW_MCE
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
-		mce_notify_user();
+		mce_notify_irq();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */

From 8fa8dd9e3aafb7b440b7d54219891615abc6390e Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:58 +0200
Subject: [PATCH 123/162] x86, mce: define MCE_VECTOR

Add MCE_VECTOR for the #MC exception.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/irq_vectors.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 28477e4f2d49..1b35c4357ea8 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
  */
 
 #define NMI_VECTOR			0x02
+#define MCE_VECTOR			0x12
 
 /*
  * IDT vectors usable for external interrupt sources start

From 9b1beaf2b551a8a1604f104025b24e9c535c8963 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 27 May 2009 21:56:59 +0200
Subject: [PATCH 124/162] x86, mce: support action-optional machine checks

Newer Intel CPUs support a new class of machine checks called recoverable
action optional.

Action Optional means that the CPU detected some form of corruption in
the background and tells the OS about using a machine check
exception. The OS can then take appropiate action, like killing the
process with the corrupted data or logging the event properly to disk.

This is done by the new generic high level memory failure handler added
in a earlier patch. The high level handler takes the address with the
failed memory and does the appropiate action, like killing the process.

In this version of the patch the high level handler is stubbed out
with a weak function to not create a direct dependency on the hwpoison
branch.

The high level handler cannot be directly called from the machine check
exception though, because it has to run in a defined process context to
be able to sleep when taking VM locks (it is not expected to sleep for a
long time, just do so in some exceptional cases like lock contention)

Thus the MCE handler has to queue a work item for process context,
trigger process context and then call the high level handler from there.

This patch adds two path to process context: through a per thread kernel
exit notify_user() callback or through a high priority work item.
The first runs when the process exits back to user space, the other when
it goes to sleep and there is no higher priority process.

The machine check handler will schedule both, and whoever runs first
will grab the event. This is done because quick reaction to this
event is critical to avoid a potential more fatal machine check
when the corruption is consumed.

There is a simple lock less ring buffer to queue the corrupted
addresses between the exception handler and the process context handler.
Then in process context it just calls the high level VM code with
the corrupted PFNs.

The code adds the required code to extract the failed address from
the CPU's machine check registers. It doesn't try to handle all
possible cases -- the specification has 6 different ways to specify
memory address -- but only the linear address.

Most of the required checking has been already done earlier in the
mce_severity rule checking engine.  Following the Intel
recommendations Action Optional errors are only enabled for known
situations (encoded in MCACODs). The errors are ignored otherwise,
because they are action optional.

v2: Improve comment, disable preemption while processing ring buffer
    (reported by Ying Huang)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mce.h       |   1 +
 arch/x86/kernel/cpu/mcheck/mce.c | 133 +++++++++++++++++++++++++++++++
 arch/x86/kernel/signal.c         |   2 +-
 3 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 713926b62cbb..82978ad12072 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -160,6 +160,7 @@ enum mcp_flags {
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
+void mce_notify_process(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 extern struct file_operations mce_chrdev_ops;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 13e1b7ffe73a..d4e7b5947a0e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
 
 #include <asm/processor.h>
 #include <asm/hw_irq.h>
@@ -105,6 +106,8 @@ static inline int skip_bank_init(int i)
 	return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
 }
 
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v)
 	wrmsrl(msr, v);
 }
 
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler with the
+ * process context work function. This is vastly simplified because there's
+ * only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16	/* we use one entry less */
+
+struct mce_ring {
+	unsigned short start;
+	unsigned short end;
+	unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+	return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+	struct mce_ring *r;
+	int ret = 0;
+
+	*pfn = 0;
+	get_cpu();
+	r = &__get_cpu_var(mce_ring);
+	if (r->start == r->end)
+		goto out;
+	*pfn = r->ring[r->start];
+	r->start = (r->start + 1) % MCE_RING_SIZE;
+	ret = 1;
+out:
+	put_cpu();
+	return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+	struct mce_ring *r = &__get_cpu_var(mce_ring);
+	unsigned next;
+
+	next = (r->end + 1) % MCE_RING_SIZE;
+	if (next == r->start)
+		return -1;
+	r->ring[r->end] = pfn;
+	wmb();
+	r->end = next;
+	return 0;
+}
+
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_disabled)
@@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c)
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
+static void mce_schedule_work(void)
+{
+	if (!mce_ring_empty()) {
+		struct work_struct *work = &__get_cpu_var(mce_work);
+		if (!work_pending(work))
+			schedule_work(work);
+	}
+}
+
 /*
  * Get the address of the instruction at the time of the machine check
  * error.
@@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
 	mce_notify_irq();
+	mce_schedule_work();
 	irq_exit();
 }
 #endif
@@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs)
 {
 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 		mce_notify_irq();
+		/*
+		 * Triggering the work queue here is just an insurance
+		 * policy in case the syscall exit notify handler
+		 * doesn't run soon enough or ends up running on the
+		 * wrong CPU (can happen when audit sleeps)
+		 */
+		mce_schedule_work();
 		return;
 	}
 
@@ -731,6 +806,23 @@ reset:
 	return ret;
 }
 
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses upto page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+		return 0;
+	if ((m->misc & 0x3f) > PAGE_SHIFT)
+		return 0;
+	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+		return 0;
+	return 1;
+}
+
 static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
@@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (m.status & MCI_STATUS_ADDRV)
 			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 
+		/*
+		 * Action optional error. Queue address for later processing.
+		 * When the ring overflows we just ignore the AO error.
+		 * RED-PEN add some logging mechanism when
+		 * usable_address or mce_add_ring fails.
+		 * RED-PEN don't ignore overflow for tolerant == 0
+		 */
+		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+			mce_ring_add(m.addr >> PAGE_SHIFT);
+
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
@@ -916,6 +1018,36 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
+/* dummy to break dependency. actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+	unsigned long pfn;
+	mce_notify_irq();
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+	mce_notify_process();
+}
+
 #ifdef CONFIG_X86_MCE_INTEL
 /***
  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
@@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 	mce_init();
 	mce_cpu_features(c);
 	mce_init_timer();
+	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 }
 
 /*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d5dc15bce005..4976888094f0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 #ifdef CONFIG_X86_NEW_MCE
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
-		mce_notify_irq();
+		mce_notify_process();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */

From 8051dbd2dfd1427cc102888d7d96bf39de0be150 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Tue, 2 Jun 2009 16:53:23 +0900
Subject: [PATCH 125/162] x86, mce: fix for mce counters

Make the MCE counters work on 32bit and add poll count in
arch_irq_stat_cpu.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/irq.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index eff46b5de62f..9773395aa758 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -95,7 +95,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
 #endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+#ifdef CONFIG_X86_NEW_MCE
 	seq_printf(p, "%*s: ", prec, "MCE");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -172,9 +172,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 {
 	u64 sum = irq_stats(cpu)->__nmi_count;
 
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
-	sum += per_cpu(mce_exception_count, cpu);
-#endif
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
@@ -191,6 +188,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 # ifdef CONFIG_X86_MCE_THRESHOLD
 	sum += irq_stats(cpu)->irq_threshold_count;
 # endif
+#endif
+#ifdef CONFIG_X86_NEW_MCE
+	sum += per_cpu(mce_exception_count, cpu);
+	sum += per_cpu(mce_poll_count, cpu);
 #endif
 	return sum;
 }

From bbb0a4247aaf1eabbd6d87750eafe99c577920f7 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 16 Jan 2009 09:49:50 -0700
Subject: [PATCH 126/162] Document Reported-by in SubmittingPatches

Randy pointed out that the Reported-By tag should be documented with the
others in SubmittingPatches.

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/SubmittingPatches | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index f309d3c6221c..6f29e29555c0 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -405,7 +405,14 @@ person it names.  This tag documents that potentially interested parties
 have been included in the discussion
 
 
-14) Using Tested-by: and Reviewed-by:
+14) Using Reported-by:, Tested-by: and Reviewed-by:
+
+If this patch fixes a problem reported by somebody else, consider adding a
+Reported-by: tag to credit the reporter for their contribution.  Please
+note that this tag should not be added without the reporter's permission,
+especially if the problem was not reported in a public forum.  That said,
+if we diligently credit our bug reporters, they will, hopefully, be
+inspired to help us again in the future.
 
 A Tested-by: tag indicates that the patch has been successfully tested (in
 some environment) by the person named.  This tag informs maintainers that

From 5d98932ab0acb699dc56d9e252f056b9b2cdab25 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 21 Apr 2009 13:33:06 -0600
Subject: [PATCH 127/162] docs: Encourage better changelogs in the development
 process document

Add a couple of paragraphs to the "patch formatting" section on how patches
should be described.  This text is shamelessly cribbed from suggestions
posted by Rusty Russell.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/development-process/5.Posting | 31 +++++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/Documentation/development-process/5.Posting b/Documentation/development-process/5.Posting
index dd48132a74dd..f622c1e9f0f9 100644
--- a/Documentation/development-process/5.Posting
+++ b/Documentation/development-process/5.Posting
@@ -119,7 +119,7 @@ which takes quite a bit of time and thought after the "real work" has been
 done.  When done properly, though, it is time well spent.
 
 
-5.4: PATCH FORMATTING
+5.4: PATCH FORMATTING AND CHANGELOGS
 
 So now you have a perfect series of patches for posting, but the work is
 not done quite yet.  Each patch needs to be formatted into a message which
@@ -146,8 +146,33 @@ that end, each patch will be composed of the following:
  - One or more tag lines, with, at a minimum, one Signed-off-by: line from
    the author of the patch.  Tags will be described in more detail below.
 
-The above three items should, normally, be the text used when committing
-the change to a revision control system.  They are followed by:
+The items above, together, form the changelog for the patch.  Writing good
+changelogs is a crucial but often-neglected art; it's worth spending
+another moment discussing this issue.  When writing a changelog, you should
+bear in mind that a number of different people will be reading your words.
+These include subsystem maintainers and reviewers who need to decide
+whether the patch should be included, distributors and other maintainers
+trying to decide whether a patch should be backported to other kernels, bug
+hunters wondering whether the patch is responsible for a problem they are
+chasing, users who want to know how the kernel has changed, and more.  A
+good changelog conveys the needed information to all of these people in the
+most direct and concise way possible.
+
+To that end, the summary line should describe the effects of and motivation
+for the change as well as possible given the one-line constraint.  The
+detailed description can then amplify on those topics and provide any
+needed additional information.  If the patch fixes a bug, cite the commit
+which introduced the bug if possible.  If a problem is associated with
+specific log or compiler output, include that output to help others
+searching for a solution to the same problem.  If the change is meant to
+support other changes coming in later patch, say so.  If internal APIs are
+changed, detail those changes and how other developers should respond.  In
+general, the more you can put yourself into the shoes of everybody who will
+be reading your changelog, the better that changelog (and the kernel as a
+whole) will be.
+
+Needless to say, the changelog should be the text used when committing the
+change to a revision control system.  It will be followed by:
 
  - The patch itself, in the unified ("-u") patch format.  Using the "-p"
    option to diff will associate function names with changes, making the

From 5801da1b2f1207da21271ffd6768cd40a6c7f1c4 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 4 Jun 2009 16:26:50 +0200
Subject: [PATCH 128/162] SubmittingPatches: fix typo

Fix typo.

Signed-off-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/SubmittingPatches | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 6f29e29555c0..71b6da2b7175 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -451,7 +451,7 @@ offer a Reviewed-by tag for a patch.  This tag serves to give credit to
 reviewers and to inform maintainers of the degree of review which has been
 done on the patch.  Reviewed-by: tags, when supplied by reviewers known to
 understand the subject area and to perform thorough reviews, will normally
-increase the liklihood of your patch getting into the kernel.
+increase the likelihood of your patch getting into the kernel.
 
 
 15) The canonical patch format

From 2ae19acaa50a09c1099956efb895c0aca74ab050 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 16 Apr 2009 07:44:45 -0400
Subject: [PATCH 129/162] Documentation: Add "how to write a good patch
 summary" to SubmittingPatches

Unfortunately many patch submissions are arriving with painfully poor
patch descriptions.   As a result of the discussion on LKML:

      http://lkml.org/lkml/2009/4/15/296

explain how to submit a better patch description, in the (perhaps
vain) hope that maintainers won't end up having to rewrite the git
commit logs as often as they do today.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/SubmittingPatches | 65 ++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 71b6da2b7175..6c456835c1fd 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -91,6 +91,10 @@ Be as specific as possible.  The WORST descriptions possible include
 things like "update driver X", "bug fix for driver X", or "this patch
 includes updates for subsystem X.  Please apply."
 
+The maintainer will thank you if you write your patch description in a
+form which can be easily pulled into Linux's source code management
+system, git, as a "commit log".  See #15, below.
+
 If your description starts to get long, that's a sign that you probably
 need to split up your patch.  See #3, next.
 
@@ -492,12 +496,33 @@ phrase" should not be a filename.  Do not use the same "summary
 phrase" for every patch in a whole patch series (where a "patch
 series" is an ordered sequence of multiple, related patches).
 
-Bear in mind that the "summary phrase" of your email becomes
-a globally-unique identifier for that patch.  It propagates
-all the way into the git changelog.  The "summary phrase" may
-later be used in developer discussions which refer to the patch.
-People will want to google for the "summary phrase" to read
-discussion regarding that patch.
+Bear in mind that the "summary phrase" of your email becomes a
+globally-unique identifier for that patch.  It propagates all the way
+into the git changelog.  The "summary phrase" may later be used in
+developer discussions which refer to the patch.  People will want to
+google for the "summary phrase" to read discussion regarding that
+patch.  It will also be the only thing that people may quickly see
+when, two or three months later, they are going through perhaps
+thousands of patches using tools such as "gitk" or "git log
+--oneline".
+
+For these reasons, the "summary" must be no more than 70-75
+characters, and it must describe both what the patch changes, as well
+as why the patch might be necessary.  It is challenging to be both
+succinct and descriptive, but that is what a well-written summary
+should do.
+
+The "summary phrase" may be prefixed by tags enclosed in square
+brackets: "Subject: [PATCH tag] <summary phrase>".  The tags are not
+considered part of the summary phrase, but describe how the patch
+should be treated.  Common tags might include a version descriptor if
+the multiple versions of the patch have been sent out in response to
+comments (i.e., "v1, v2, v3"), or "RFC" to indicate a request for
+comments.  If there are four patches in a patch series the individual
+patches may be numbered like this: 1/4, 2/4, 3/4, 4/4.  This assures
+that developers understand the order in which the patches should be
+applied and that they have reviewed or applied all of the patches in
+the patch series.
 
 A couple of example Subjects:
 
@@ -517,19 +542,31 @@ the patch author in the changelog.
 The explanation body will be committed to the permanent source
 changelog, so should make sense to a competent reader who has long
 since forgotten the immediate details of the discussion that might
-have led to this patch.
+have led to this patch.  Including symptoms of the failure which the
+patch addresses (kernel log messages, oops messages, etc.) is
+especially useful for people who might be searching the commit logs
+looking for the applicable patch.  If a patch fixes a compile failure,
+it may not be necessary to include _all_ of the compile failures; just
+enough that it is likely that someone searching for the patch can find
+it.  As in the "summary phrase", it is important to be both succinct as
+well as descriptive.
 
 The "---" marker line serves the essential purpose of marking for patch
 handling tools where the changelog message ends.
 
 One good use for the additional comments after the "---" marker is for
-a diffstat, to show what files have changed, and the number of inserted
-and deleted lines per file.  A diffstat is especially useful on bigger
-patches.  Other comments relevant only to the moment or the maintainer,
-not suitable for the permanent changelog, should also go here.
-Use diffstat options "-p 1 -w 70" so that filenames are listed from the
-top of the kernel source tree and don't use too much horizontal space
-(easily fit in 80 columns, maybe with some indentation).
+a diffstat, to show what files have changed, and the number of
+inserted and deleted lines per file.  A diffstat is especially useful
+on bigger patches.  Other comments relevant only to the moment or the
+maintainer, not suitable for the permanent changelog, should also go
+here.  A good example of such comments might be "patch changelogs"
+which describe what has changed between the v1 and v2 version of the
+patch.
+
+If you are going to include a diffstat after the "---" marker, please
+use diffstat options "-p 1 -w 70" so that filenames are listed from
+the top of the kernel source tree and don't use too much horizontal
+space (easily fit in 80 columns, maybe with some indentation).
 
 See more details on the proper patch format in the following
 references.

From f89d7eaf6c34828070f407d0e04b73127f176ec5 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Thu, 4 Jun 2009 16:35:25 -0600
Subject: [PATCH 130/162] Document the debugfs API

This is an updated document covering the internal API for the debugfs
filesystem.  Thanks to Shen Feng for suggesting that I put this text here
and noting that the old LWN version was rather out of date.

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Reported-by: Shen Feng <shen@cn.fujitsu.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/filesystems/debugfs.txt | 158 ++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 Documentation/filesystems/debugfs.txt

diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
new file mode 100644
index 000000000000..ed52af60c2d8
--- /dev/null
+++ b/Documentation/filesystems/debugfs.txt
@@ -0,0 +1,158 @@
+Copyright 2009 Jonathan Corbet <corbet@lwn.net>
+
+Debugfs exists as a simple way for kernel developers to make information
+available to user space.  Unlike /proc, which is only meant for information
+about a process, or sysfs, which has strict one-value-per-file rules,
+debugfs has no rules at all.  Developers can put any information they want
+there.  The debugfs filesystem is also intended to not serve as a stable
+ABI to user space; in theory, there are no stability constraints placed on
+files exported there.  The real world is not always so simple, though [1];
+even debugfs interfaces are best designed with the idea that they will need
+to be maintained forever.
+
+Debugfs is typically mounted with a command like:
+
+    mount -t debugfs none /sys/kernel/debug
+
+(Or an equivalent /etc/fstab line). 
+
+Note that the debugfs API is exported GPL-only to modules.
+
+Code using debugfs should include <linux/debugfs.h>.  Then, the first order
+of business will be to create at least one directory to hold a set of
+debugfs files:
+
+    struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
+
+This call, if successful, will make a directory called name underneath the
+indicated parent directory.  If parent is NULL, the directory will be
+created in the debugfs root.  On success, the return value is a struct
+dentry pointer which can be used to create files in the directory (and to
+clean it up at the end).  A NULL return value indicates that something went
+wrong.  If ERR_PTR(-ENODEV) is returned, that is an indication that the
+kernel has been built without debugfs support and none of the functions
+described below will work.
+
+The most general way to create a file within a debugfs directory is with:
+
+    struct dentry *debugfs_create_file(const char *name, mode_t mode,
+				       struct dentry *parent, void *data,
+				       const struct file_operations *fops);
+
+Here, name is the name of the file to create, mode describes the access
+permissions the file should have, parent indicates the directory which
+should hold the file, data will be stored in the i_private field of the
+resulting inode structure, and fops is a set of file operations which
+implement the file's behavior.  At a minimum, the read() and/or write()
+operations should be provided; others can be included as needed.  Again,
+the return value will be a dentry pointer to the created file, NULL for
+error, or ERR_PTR(-ENODEV) if debugfs support is missing.
+
+In a number of cases, the creation of a set of file operations is not
+actually necessary; the debugfs code provides a number of helper functions
+for simple situations.  Files containing a single integer value can be
+created with any of:
+
+    struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+				     struct dentry *parent, u8 *value);
+    struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+				      struct dentry *parent, u16 *value);
+    struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+				      struct dentry *parent, u32 *value);
+    struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+				      struct dentry *parent, u64 *value);
+
+These files support both reading and writing the given value; if a specific
+file should not be written to, simply set the mode bits accordingly.  The
+values in these files are in decimal; if hexadecimal is more appropriate,
+the following functions can be used instead:
+
+    struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+				     struct dentry *parent, u8 *value);
+    struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+				      struct dentry *parent, u16 *value);
+    struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+				      struct dentry *parent, u32 *value);
+
+Note that there is no debugfs_create_x64().
+
+These functions are useful as long as the developer knows the size of the
+value to be exported.  Some types can have different widths on different
+architectures, though, complicating the situation somewhat.  There is a
+function meant to help out in one special case:
+
+    struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+				         struct dentry *parent, 
+					 size_t *value);
+
+As might be expected, this function will create a debugfs file to represent
+a variable of type size_t.
+
+Boolean values can be placed in debugfs with:
+
+    struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+				       struct dentry *parent, u32 *value);
+
+A read on the resulting file will yield either Y (for non-zero values) or
+N, followed by a newline.  If written to, it will accept either upper- or
+lower-case values, or 1 or 0.  Any other input will be silently ignored.
+
+Finally, a block of arbitrary binary data can be exported with:
+
+    struct debugfs_blob_wrapper {
+	void *data;
+	unsigned long size;
+    };
+
+    struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+				       struct dentry *parent,
+				       struct debugfs_blob_wrapper *blob);
+
+A read of this file will return the data pointed to by the
+debugfs_blob_wrapper structure.  Some drivers use "blobs" as a simple way
+to return several lines of (static) formatted text output.  This function
+can be used to export binary information, but there does not appear to be
+any code which does so in the mainline.  Note that all files created with
+debugfs_create_blob() are read-only.
+
+There are a couple of other directory-oriented helper functions:
+
+    struct dentry *debugfs_rename(struct dentry *old_dir, 
+    				  struct dentry *old_dentry,
+		                  struct dentry *new_dir, 
+				  const char *new_name);
+
+    struct dentry *debugfs_create_symlink(const char *name, 
+                                          struct dentry *parent,
+				      	  const char *target);
+
+A call to debugfs_rename() will give a new name to an existing debugfs
+file, possibly in a different directory.  The new_name must not exist prior
+to the call; the return value is old_dentry with updated information.
+Symbolic links can be created with debugfs_create_symlink().
+
+There is one important thing that all debugfs users must take into account:
+there is no automatic cleanup of any directories created in debugfs.  If a
+module is unloaded without explicitly removing debugfs entries, the result
+will be a lot of stale pointers and no end of highly antisocial behavior.
+So all debugfs users - at least those which can be built as modules - must
+be prepared to remove all files and directories they create there.  A file
+can be removed with:
+
+    void debugfs_remove(struct dentry *dentry);
+
+The dentry value can be NULL, in which case nothing will be removed.
+
+Once upon a time, debugfs users were required to remember the dentry
+pointer for every debugfs file they created so that all files could be
+cleaned up.  We live in more civilized times now, though, and debugfs users
+can call:
+
+    void debugfs_remove_recursive(struct dentry *dentry);
+
+If this function is passed a pointer for the dentry corresponding to the
+top-level directory, the entire hierarchy below that directory will be
+removed.
+
+Notes:
+	[1] http://lwn.net/Articles/309298/

From a89ab11454b476d2d9e8a10f903360a62e21bac8 Mon Sep 17 00:00:00 2001
From: Peter Ma <pma@mediamatech.com>
Date: Mon, 8 Jun 2009 12:54:02 +0200
Subject: [PATCH 131/162] avr32: Change Atmel ATNGW100 config to add choice of
 add-on board

Signed-off-by: Peter Ma <pma@mediamatech.com>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/boards/atngw100/Kconfig | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/avr32/boards/atngw100/Kconfig b/arch/avr32/boards/atngw100/Kconfig
index b3f99477bbeb..f41c5335e6a9 100644
--- a/arch/avr32/boards/atngw100/Kconfig
+++ b/arch/avr32/boards/atngw100/Kconfig
@@ -2,8 +2,15 @@
 
 if BOARD_ATNGW100
 
+choice
+	prompt "Select an NGW100 add-on board to support"
+	default BOARD_ATNGW100_ADDON_NONE
+
+config BOARD_ATNGW100_ADDON_NONE
+	bool "None"
+
 config BOARD_ATNGW100_EVKLCD10X
-	bool "Add support for EVKLCD10X addon board"
+	bool "EVKLCD10X addon board"
 	help
 	  This enables support for the EVKLCD100 (QVGA) or EVKLCD101 (VGA)
 	  addon board for the NGW100. By enabling this the LCD controller and
@@ -14,7 +21,7 @@ config BOARD_ATNGW100_EVKLCD10X
 	  The MCI pins can be reenabled by editing the "add device function" but
 	  this may break the setup for other displays that use these pins.
 
-	  Choose 'Y' here if you have a EVKLCD100/101 connected to the NGW100.
+endchoice
 
 choice
 	prompt "LCD panel resolution on EVKLCD10X"

From 4024533e60787a5507818b0c0fdb44ddb522cdf5 Mon Sep 17 00:00:00 2001
From: Peter Ma <pma@mediamatech.com>
Date: Mon, 8 Jun 2009 12:55:35 +0200
Subject: [PATCH 132/162] avr32: Add support for Mediama RMTx add-on board for
 ATNGW100

Signed-off-by: Peter Ma <pma@mediamatech.com>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/boards/atngw100/Kconfig         |   16 +
 arch/avr32/boards/atngw100/Kconfig_mrmt    |   80 ++
 arch/avr32/boards/atngw100/Makefile        |    1 +
 arch/avr32/boards/atngw100/mrmt.c          |  373 ++++++
 arch/avr32/boards/atngw100/setup.c         |    5 +
 arch/avr32/configs/atngw100_mrmt_defconfig | 1363 ++++++++++++++++++++
 6 files changed, 1838 insertions(+)
 create mode 100644 arch/avr32/boards/atngw100/Kconfig_mrmt
 create mode 100644 arch/avr32/boards/atngw100/mrmt.c
 create mode 100644 arch/avr32/configs/atngw100_mrmt_defconfig

diff --git a/arch/avr32/boards/atngw100/Kconfig b/arch/avr32/boards/atngw100/Kconfig
index f41c5335e6a9..be27a0218ab4 100644
--- a/arch/avr32/boards/atngw100/Kconfig
+++ b/arch/avr32/boards/atngw100/Kconfig
@@ -21,6 +21,18 @@ config BOARD_ATNGW100_EVKLCD10X
 	  The MCI pins can be reenabled by editing the "add device function" but
 	  this may break the setup for other displays that use these pins.
 
+config BOARD_ATNGW100_MRMT
+	bool "Mediama RMT1/2 add-on board"
+	help
+	  This enables support for the Mediama RMT1 or RMT2 board.
+	  RMT provides LCD support, AC97 codec and other
+	  optional peripherals to the Atmel NGW100.
+
+	  This choice disables the detect pin and the write-protect pin for the
+	  MCI platform device, since it conflicts with the LCD platform device.
+	  The MCI pins can be reenabled by editing the "add device function" but
+	  this may break the setup for other displays that use these pins.
+
 endchoice
 
 choice
@@ -39,4 +51,8 @@ config BOARD_ATNGW100_EVKLCD10X_POW_QVGA
 
 endchoice
 
+if BOARD_ATNGW100_MRMT
+source	"arch/avr32/boards/atngw100/Kconfig_mrmt"
+endif
+
 endif	# BOARD_ATNGW100
diff --git a/arch/avr32/boards/atngw100/Kconfig_mrmt b/arch/avr32/boards/atngw100/Kconfig_mrmt
new file mode 100644
index 000000000000..9a199a207f3c
--- /dev/null
+++ b/arch/avr32/boards/atngw100/Kconfig_mrmt
@@ -0,0 +1,80 @@
+# RMT for NGW100 customization
+
+choice
+	prompt "RMT Version"
+	help
+	  Select the RMTx board version.
+
+config BOARD_MRMT_REV1
+	bool "RMT1"
+config BOARD_MRMT_REV2
+	bool "RMT2"
+
+endchoice
+
+config BOARD_MRMT_AC97
+	bool "Enable AC97 CODEC"
+	help
+	  Enable the UCB1400 AC97 CODEC driver.
+
+choice
+	prompt "Touchscreen Driver"
+	default BOARD_MRMT_ADS7846_TS
+
+config BOARD_MRMT_UCB1400_TS
+	bool "Use UCB1400 Touchscreen"
+
+config BOARD_MRMT_ADS7846_TS
+	bool "Use ADS7846 Touchscreen"
+
+endchoice
+
+choice
+	prompt "RMTx LCD Selection"
+	default BOARD_MRMT_LCD_DISABLE
+
+config BOARD_MRMT_LCD_DISABLE
+	bool "LCD Disabled"
+
+config BOARD_MRMT_LCD_LQ043T3DX0X
+	bool "Sharp LQ043T3DX0x or compatible"
+	help
+	  If using RMT2, be sure to load the resistor pack selectors accordingly
+
+if BOARD_MRMT_REV2
+config BOARD_MRMT_LCD_KWH043GM08
+	bool "Formike KWH043GM08 or compatible"
+	help
+	  Be sure to load the RMT2 resistor pack selectors accordingly
+endif
+
+endchoice
+
+if !BOARD_MRMT_LCD_DISABLE
+config BOARD_MRMT_BL_PWM
+	bool "Use PWM control for LCD Backlight"
+	help
+		Use PWM driver for controlling LCD Backlight.
+		Otherwise, LCD Backlight is always on.
+endif
+
+config BOARD_MRMT_RTC_I2C
+	bool "Use External RTC on I2C Bus"
+	help
+		RMT1 has an optional RTC device on the I2C bus.
+		It is a SII S35390A.  Be sure to select the
+		matching RTC driver.
+
+choice
+	prompt "Wireless Module on ttyS2"
+	default BOARD_MRMT_WIRELESS_ZB
+
+config BOARD_MRMT_WIRELESS_ZB
+	bool "Use ZigBee/802.15.4 Module"
+
+config BOARD_MRMT_WIRELESS_BT
+	bool "Use Bluetooth (HCI) Module"
+
+config BOARD_MRMT_WIRELESS_NONE
+	bool "Not Installed"
+endchoice
diff --git a/arch/avr32/boards/atngw100/Makefile b/arch/avr32/boards/atngw100/Makefile
index 6376f5322e4d..f4ebe42a8254 100644
--- a/arch/avr32/boards/atngw100/Makefile
+++ b/arch/avr32/boards/atngw100/Makefile
@@ -1,2 +1,3 @@
 obj-y					+= setup.o flash.o
 obj-$(CONFIG_BOARD_ATNGW100_EVKLCD10X)	+= evklcd10x.o
+obj-$(CONFIG_BOARD_ATNGW100_MRMT)	+= mrmt.o
diff --git a/arch/avr32/boards/atngw100/mrmt.c b/arch/avr32/boards/atngw100/mrmt.c
new file mode 100644
index 000000000000..bf78e516a85f
--- /dev/null
+++ b/arch/avr32/boards/atngw100/mrmt.c
@@ -0,0 +1,373 @@
+/*
+ * Board-specific setup code for Remote Media Terminal 1 (RMT1)
+ * add-on board for the ATNGW100 Network Gateway
+ *
+ * Copyright (C) 2008 Mediama Technologies
+ * Based on ATNGW100 Network Gateway (Copyright (C) Atmel)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/linkage.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+#include <linux/fb.h>
+#include <linux/leds.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+#include <linux/atmel_serial.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/ads7846.h>
+
+#include <video/atmel_lcdc.h>
+#include <sound/atmel-ac97c.h>
+
+#include <asm/delay.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+
+#include <mach/at32ap700x.h>
+#include <mach/board.h>
+#include <mach/init.h>
+#include <mach/portmux.h>
+
+/* Define board-specifoic GPIO assignments */
+#define PIN_LCD_BL	GPIO_PIN_PA(28)
+#define PWM_CH_BL	0	/* Must match with GPIO pin definition */
+#define PIN_LCD_DISP	GPIO_PIN_PA(31)
+#define	PIN_AC97_RST_N	GPIO_PIN_PA(30)
+#define PB_EXTINT_BASE	25
+#define TS_IRQ		0
+#define PIN_TS_EXTINT	GPIO_PIN_PB(PB_EXTINT_BASE+TS_IRQ)
+#define PIN_PB_LEFT	GPIO_PIN_PB(11)
+#define PIN_PB_RIGHT	GPIO_PIN_PB(12)
+#define PIN_PWR_SW_N	GPIO_PIN_PB(14)
+#define PIN_PWR_ON	GPIO_PIN_PB(13)
+#define PIN_ZB_RST_N	GPIO_PIN_PA(21)
+#define PIN_BT_RST	GPIO_PIN_PA(22)
+#define PIN_LED_SYS	GPIO_PIN_PA(16)
+#define PIN_LED_A	GPIO_PIN_PA(19)
+#define PIN_LED_B	GPIO_PIN_PE(19)
+
+#ifdef CONFIG_BOARD_MRMT_LCD_LQ043T3DX0X
+/* Sharp LQ043T3DX0x (or compatible) panel */
+static struct fb_videomode __initdata lcd_fb_modes[] = {
+	{
+		.name		= "480x272 @ 59.94Hz",
+		.refresh	= 59.94,
+		.xres		= 480,		.yres		= 272,
+		.pixclock	= KHZ2PICOS(9000),
+
+		.left_margin	= 2,		.right_margin	= 2,
+		.upper_margin	= 3,		.lower_margin	= 9,
+		.hsync_len	= 41,		.vsync_len	= 1,
+
+		.sync		= 0,
+		.vmode		= FB_VMODE_NONINTERLACED,
+	},
+};
+
+static struct fb_monspecs __initdata lcd_fb_default_monspecs = {
+	.manufacturer		= "SHA",
+	.monitor		= "LQ043T3DX02",
+	.modedb			= lcd_fb_modes,
+	.modedb_len		= ARRAY_SIZE(lcd_fb_modes),
+	.hfmin			= 14915,
+	.hfmax			= 17638,
+	.vfmin			= 53,
+	.vfmax			= 61,
+	.dclkmax		= 9260000,
+};
+
+static struct atmel_lcdfb_info __initdata rmt_lcdc_data = {
+	.default_bpp		= 24,
+	.default_dmacon		= ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
+	.default_lcdcon2	= (ATMEL_LCDC_DISTYPE_TFT
+				   | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
+				   | ATMEL_LCDC_INVCLK_NORMAL
+				   | ATMEL_LCDC_MEMOR_BIG),
+	.lcd_wiring_mode	= ATMEL_LCDC_WIRING_RGB,
+	.default_monspecs	= &lcd_fb_default_monspecs,
+	.guard_time		= 2,
+};
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_LCD_KWH043GM08
+/* Sharp KWH043GM08-Fxx (or compatible) panel */
+static struct fb_videomode __initdata lcd_fb_modes[] = {
+	{
+		.name		= "480x272 @ 59.94Hz",
+		.refresh	= 59.94,
+		.xres		= 480,		.yres		= 272,
+		.pixclock	= KHZ2PICOS(9000),
+
+		.left_margin	= 2,		.right_margin	= 2,
+		.upper_margin	= 3,		.lower_margin	= 9,
+		.hsync_len	= 41,		.vsync_len	= 1,
+
+		.sync		= 0,
+		.vmode		= FB_VMODE_NONINTERLACED,
+	},
+};
+
+static struct fb_monspecs __initdata lcd_fb_default_monspecs = {
+	.manufacturer		= "FOR",
+	.monitor		= "KWH043GM08",
+	.modedb			= lcd_fb_modes,
+	.modedb_len		= ARRAY_SIZE(lcd_fb_modes),
+	.hfmin			= 14915,
+	.hfmax			= 17638,
+	.vfmin			= 53,
+	.vfmax			= 61,
+	.dclkmax		= 9260000,
+};
+
+static struct atmel_lcdfb_info __initdata rmt_lcdc_data = {
+	.default_bpp		= 24,
+	.default_dmacon		= ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
+	.default_lcdcon2	= (ATMEL_LCDC_DISTYPE_TFT
+				   | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
+				   | ATMEL_LCDC_INVCLK_INVERTED
+				   | ATMEL_LCDC_MEMOR_BIG),
+	.lcd_wiring_mode	= ATMEL_LCDC_WIRING_RGB,
+	.default_monspecs	= &lcd_fb_default_monspecs,
+	.guard_time		= 2,
+};
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_AC97
+static struct ac97c_platform_data __initdata ac97c0_data = {
+	.reset_pin		= PIN_AC97_RST_N,
+};
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_UCB1400_TS
+/* NOTE: IRQ assignment relies on kernel module parameter */
+static struct platform_device rmt_ts_device = {
+	.name	= "ucb1400_ts",
+	.id	= -1,
+	}
+};
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_BL_PWM
+/* PWM LEDs: LCD Backlight, etc */
+static struct gpio_led rmt_pwm_led[] = {
+	/* here the "gpio" is actually a PWM channel */
+	{ .name = "backlight",	.gpio = PWM_CH_BL, },
+};
+
+static struct gpio_led_platform_data rmt_pwm_led_data = {
+	.num_leds	= ARRAY_SIZE(rmt_pwm_led),
+	.leds		= rmt_pwm_led,
+};
+
+static struct platform_device rmt_pwm_led_dev = {
+	.name		= "leds-atmel-pwm",
+	.id		= -1,
+	.dev		= {
+		.platform_data	= &rmt_pwm_led_data,
+	},
+};
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_ADS7846_TS
+static int ads7846_pendown_state(void)
+{
+	return !gpio_get_value( PIN_TS_EXTINT );	/* PENIRQ.*/
+}
+
+static struct ads7846_platform_data ads_info = {
+	.model				= 7846,
+	.keep_vref_on			= 0,	/* Use external VREF pin */
+	.vref_delay_usecs		= 0,
+	.vref_mv			= 3300,	/* VREF = 3.3V */
+	.settle_delay_usecs		= 800,
+	.penirq_recheck_delay_usecs	= 800,
+	.x_plate_ohms			= 750,
+	.y_plate_ohms			= 300,
+	.pressure_max			= 4096,
+	.debounce_max			= 1,
+	.debounce_rep			= 0,
+	.debounce_tol			= (~0),
+	.get_pendown_state		= ads7846_pendown_state,
+	.filter				= NULL,
+	.filter_init			= NULL,
+};
+
+static struct spi_board_info spi01_board_info[] __initdata = {
+	{
+		.modalias	= "ads7846",
+		.max_speed_hz	= 31250*26,
+		.bus_num	= 0,
+		.chip_select	= 1,
+		.platform_data	= &ads_info,
+		.irq		= AT32_EXTINT(TS_IRQ),
+	},
+};
+#endif
+
+/* GPIO Keys: left, right, power, etc */
+static const struct gpio_keys_button rmt_gpio_keys_buttons[] = {
+	[0] = {
+		.type		= EV_KEY,
+		.code		= KEY_POWER,
+		.gpio		= PIN_PWR_SW_N,
+		.active_low	= 1,
+		.desc		= "power button",
+	},
+	[1] = {
+		.type		= EV_KEY,
+		.code		= KEY_LEFT,
+		.gpio		= PIN_PB_LEFT,
+		.active_low	= 1,
+		.desc		= "left button",
+	},
+	[2] = {
+		.type		= EV_KEY,
+		.code		= KEY_RIGHT,
+		.gpio		= PIN_PB_RIGHT,
+		.active_low	= 1,
+		.desc		= "right button",
+	},
+};
+
+static const struct gpio_keys_platform_data rmt_gpio_keys_data = {
+	.nbuttons =	ARRAY_SIZE(rmt_gpio_keys_buttons),
+	.buttons =	(void *) rmt_gpio_keys_buttons,
+};
+
+static struct platform_device rmt_gpio_keys = {
+	.name =		"gpio-keys",
+	.id =		-1,
+	.dev = {
+		.platform_data = (void *) &rmt_gpio_keys_data,
+	}
+};
+
+#ifdef CONFIG_BOARD_MRMT_RTC_I2C
+static struct i2c_board_info __initdata mrmt1_i2c_rtc = {
+	I2C_BOARD_INFO("s35390a", 0x30),
+	.irq		= 0,
+};
+#endif
+
+static void mrmt_power_off(void)
+{
+	/* PWR_ON=0 will force power off */
+	gpio_set_value( PIN_PWR_ON, 0 );
+}
+
+static int __init mrmt1_init(void)
+{
+	gpio_set_value( PIN_PWR_ON, 1 );	/* Ensure PWR_ON is enabled */
+
+	pm_power_off = mrmt_power_off;
+
+	/* Setup USARTS (other than console) */
+	at32_map_usart(2, 1, 0);	/* USART 2: /dev/ttyS1, RMT1:DB9M */
+	at32_map_usart(3, 2, ATMEL_USART_RTS | ATMEL_USART_CTS);
+			/* USART 3: /dev/ttyS2, RMT1:Wireless, w/ RTS/CTS */
+	at32_add_device_usart(1);
+	at32_add_device_usart(2);
+
+	/* Select GPIO Key pins */
+	at32_select_gpio( PIN_PWR_SW_N, AT32_GPIOF_DEGLITCH);
+	at32_select_gpio( PIN_PB_LEFT, AT32_GPIOF_DEGLITCH);
+	at32_select_gpio( PIN_PB_RIGHT, AT32_GPIOF_DEGLITCH);
+	platform_device_register(&rmt_gpio_keys);
+
+#ifdef CONFIG_BOARD_MRMT_RTC_I2C
+	i2c_register_board_info(0, &mrmt1_i2c_rtc, 1);
+#endif
+
+#ifndef CONFIG_BOARD_MRMT_LCD_DISABLE
+	/* User "alternate" LCDC inferface on Port E & D */
+	/* NB: exclude LCDC_CC pin, as NGW100 reserves it for other use */
+	at32_add_device_lcdc(0, &rmt_lcdc_data,
+		fbmem_start, fbmem_size,
+		(ATMEL_LCDC_ALT_24BIT | ATMEL_LCDC_PE_DVAL ) );
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_AC97
+	at32_add_device_ac97c(0, &ac97c0_data, AC97C_BOTH);
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_ADS7846_TS
+	/* Select the Touchscreen interrupt pin mode */
+	at32_select_periph( GPIO_PIOB_BASE, 1 << (PB_EXTINT_BASE+TS_IRQ),
+			GPIO_PERIPH_A, AT32_GPIOF_DEGLITCH);
+	set_irq_type( AT32_EXTINT(TS_IRQ), IRQ_TYPE_EDGE_FALLING );
+	spi_register_board_info(spi01_board_info,ARRAY_SIZE(spi01_board_info));
+#endif
+
+#ifdef CONFIG_BOARD_MRMT_UCB1400_TS
+	/* Select the Touchscreen interrupt pin mode */
+	at32_select_periph( GPIO_PIOB_BASE, 1 << (PB_EXTINT_BASE+TS_IRQ),
+			GPIO_PERIPH_A, AT32_GPIOF_DEGLITCH);
+	platform_device_register(&rmt_ts_device);
+#endif
+
+	at32_select_gpio( PIN_LCD_DISP, AT32_GPIOF_OUTPUT );
+	gpio_request( PIN_LCD_DISP, "LCD_DISP" );
+	gpio_direction_output( PIN_LCD_DISP, 0 );	/* LCD DISP */
+#ifdef CONFIG_BOARD_MRMT_LCD_DISABLE
+	/* Keep Backlight and DISP off */
+	at32_select_gpio( PIN_LCD_BL, AT32_GPIOF_OUTPUT );
+	gpio_request( PIN_LCD_BL, "LCD_BL" );
+	gpio_direction_output( PIN_LCD_BL, 0 );		/* Backlight */
+#else
+	gpio_set_value( PIN_LCD_DISP, 1 );	/* DISP asserted first */
+#ifdef CONFIG_BOARD_MRMT_BL_PWM
+	/* Use PWM for Backlight controls */
+	at32_add_device_pwm(1 << PWM_CH_BL);
+	platform_device_register(&rmt_pwm_led_dev);
+#else
+	/* Backlight always on */
+	udelay( 1 );
+	at32_select_gpio( PIN_LCD_BL, AT32_GPIOF_OUTPUT );
+	gpio_request( PIN_LCD_BL, "LCD_BL" );
+	gpio_direction_output( PIN_LCD_BL, 1 );
+#endif
+#endif
+
+	/* Make sure BT and Zigbee modules in reset */
+	at32_select_gpio( PIN_BT_RST, AT32_GPIOF_OUTPUT );
+	gpio_request( PIN_BT_RST, "BT_RST" );
+	gpio_direction_output( PIN_BT_RST, 1 );
+	/* BT Module in Reset */
+
+	at32_select_gpio( PIN_ZB_RST_N, AT32_GPIOF_OUTPUT );
+	gpio_request( PIN_ZB_RST_N, "ZB_RST_N" );
+	gpio_direction_output( PIN_ZB_RST_N, 0 );
+	/* XBee Module in Reset */
+
+#ifdef CONFIG_BOARD_MRMT_WIRELESS_ZB
+	udelay( 1000 );
+	/* Unreset the XBee Module */
+	gpio_set_value( PIN_ZB_RST_N, 1 );
+#endif
+#ifdef CONFIG_BOARD_MRMT_WIRELESS_BT
+	udelay( 1000 );
+	/* Unreset the BT Module */
+	gpio_set_value( PIN_BT_RST, 0 );
+#endif
+
+	return 0;
+}
+arch_initcall(mrmt1_init);
+
+static int __init mrmt1_early_init(void)
+{
+	/* To maintain power-on signal in case boot loader did not already */
+	at32_select_gpio( PIN_PWR_ON, AT32_GPIOF_OUTPUT );
+	gpio_request( PIN_PWR_ON, "PIN_PWR_ON" );
+	gpio_direction_output( PIN_PWR_ON, 1 );
+
+	return 0;
+}
+core_initcall(mrmt1_early_init);
diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c
index 5b022aad4bd9..bc299fbbeb4e 100644
--- a/arch/avr32/boards/atngw100/setup.c
+++ b/arch/avr32/boards/atngw100/setup.c
@@ -56,8 +56,13 @@ static struct spi_board_info spi0_board_info[] __initdata = {
 static struct mci_platform_data __initdata mci0_data = {
 	.slot[0] = {
 		.bus_width	= 4,
+#if defined(CONFIG_BOARD_ATNGW100_EVKLCD10X) || defined(CONFIG_BOARD_ATNGW100_MRMT1)
+		.detect_pin     = GPIO_PIN_NONE,
+		.wp_pin         = GPIO_PIN_NONE,
+#else
 		.detect_pin	= GPIO_PIN_PC(25),
 		.wp_pin		= GPIO_PIN_PE(0),
+#endif
 	},
 };
 
diff --git a/arch/avr32/configs/atngw100_mrmt_defconfig b/arch/avr32/configs/atngw100_mrmt_defconfig
new file mode 100644
index 000000000000..17b030777d36
--- /dev/null
+++ b/arch/avr32/configs/atngw100_mrmt_defconfig
@@ -0,0 +1,1363 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc1
+# Wed Jun  3 00:24:53 2009
+#
+CONFIG_AVR32=y
+CONFIG_GENERIC_GPIO=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+CONFIG_HARDIRQS_SW_RESEND=y
+CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_GENERIC_BUG=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+# CONFIG_TASKSTATS is not set
+# CONFIG_AUDIT is not set
+
+#
+# RCU Subsystem
+#
+CONFIG_CLASSIC_RCU=y
+# CONFIG_TREE_RCU is not set
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+# CONFIG_RT_GROUP_SCHED is not set
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+# CONFIG_CGROUPS is not set
+CONFIG_SYSFS_DEPRECATED=y
+CONFIG_SYSFS_DEPRECATED_V2=y
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+# CONFIG_SYSCTL_SYSCALL is not set
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_ALL is not set
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+# CONFIG_BASE_FULL is not set
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+# CONFIG_SLUB_DEBUG is not set
+CONFIG_COMPAT_BRK=y
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
+# CONFIG_SLOB is not set
+# CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
+CONFIG_HAVE_OPROFILE=y
+# CONFIG_KPROBES is not set
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_CLK=y
+# CONFIG_SLOW_WORK is not set
+# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=1
+CONFIG_MODULES=y
+# CONFIG_MODULE_FORCE_LOAD is not set
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+# CONFIG_IOSCHED_AS is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+# CONFIG_DEFAULT_DEADLINE is not set
+CONFIG_DEFAULT_CFQ=y
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="cfq"
+# CONFIG_FREEZER is not set
+
+#
+# System Type and features
+#
+# CONFIG_NO_HZ is not set
+# CONFIG_HIGH_RES_TIMERS is not set
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+CONFIG_SUBARCH_AVR32B=y
+CONFIG_MMU=y
+CONFIG_PERFORMANCE_COUNTERS=y
+CONFIG_PLATFORM_AT32AP=y
+CONFIG_CPU_AT32AP700X=y
+CONFIG_CPU_AT32AP7000=y
+# CONFIG_BOARD_ATSTK1000 is not set
+CONFIG_BOARD_ATNGW100=y
+# CONFIG_BOARD_HAMMERHEAD is not set
+# CONFIG_BOARD_FAVR_32 is not set
+# CONFIG_BOARD_MERISC is not set
+# CONFIG_BOARD_MIMC200 is not set
+# CONFIG_BOARD_ATNGW100_ADDON_NONE is not set
+# CONFIG_BOARD_ATNGW100_EVKLCD10X is not set
+CONFIG_BOARD_ATNGW100_MRMT=y
+CONFIG_BOARD_MRMT_REV1=y
+# CONFIG_BOARD_MRMT_REV2 is not set
+CONFIG_BOARD_MRMT_AC97=y
+# CONFIG_BOARD_MRMT_UCB1400_TS is not set
+CONFIG_BOARD_MRMT_ADS7846_TS=y
+# CONFIG_BOARD_MRMT_LCD_DISABLE is not set
+CONFIG_BOARD_MRMT_LCD_LQ043T3DX0X=y
+# CONFIG_BOARD_MRMT_LCD_KWH043GM08 is not set
+CONFIG_BOARD_MRMT_BL_PWM=y
+CONFIG_BOARD_MRMT_RTC_I2C=y
+CONFIG_BOARD_MRMT_WIRELESS_ZB=y
+# CONFIG_BOARD_MRMT_WIRELESS_BT is not set
+# CONFIG_BOARD_MRMT_WIRELESS_NONE is not set
+CONFIG_LOADER_U_BOOT=y
+
+#
+# Atmel AVR32 AP options
+#
+# CONFIG_AP700X_32_BIT_SMC is not set
+CONFIG_AP700X_16_BIT_SMC=y
+# CONFIG_AP700X_8_BIT_SMC is not set
+CONFIG_LOAD_ADDRESS=0x10000000
+CONFIG_ENTRY_ADDRESS=0x90000000
+CONFIG_PHYS_OFFSET=0x10000000
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_QUICKLIST=y
+# CONFIG_HAVE_ARCH_BOOTMEM is not set
+# CONFIG_ARCH_HAVE_MEMORY_PRESENT is not set
+# CONFIG_NEED_NODE_MEMMAP_SIZE is not set
+CONFIG_ARCH_FLATMEM_ENABLE=y
+# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set
+# CONFIG_ARCH_SPARSEMEM_ENABLE is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_VIRT_TO_BUS=y
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+# CONFIG_OWNERSHIP_TRACE is not set
+# CONFIG_NMI_DEBUGGING is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+# CONFIG_SCHED_HRTICK is not set
+CONFIG_CMDLINE=""
+
+#
+# Power management options
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+# CONFIG_SUSPEND is not set
+CONFIG_ARCH_SUSPEND_POSSIBLE=y
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_AT32AP=y
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_IP_PNP_BOOTP is not set
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_XFRM_TUNNEL is not set
+# CONFIG_INET_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+# CONFIG_TCP_CONG_ADVANCED is not set
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_DEFAULT_TCP_CONG="cubic"
+# CONFIG_TCP_MD5SIG is not set
+# CONFIG_IPV6 is not set
+# CONFIG_NETWORK_SECMARK is not set
+# CONFIG_NETFILTER is not set
+# CONFIG_IP_DCCP is not set
+# CONFIG_IP_SCTP is not set
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_NET_DSA is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
+# CONFIG_NET_SCHED is not set
+# CONFIG_DCB is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_CAN is not set
+# CONFIG_IRDA is not set
+CONFIG_BT=m
+CONFIG_BT_L2CAP=m
+# CONFIG_BT_SCO is not set
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+# CONFIG_BT_BNEP is not set
+CONFIG_BT_HIDP=m
+
+#
+# Bluetooth device drivers
+#
+# CONFIG_BT_HCIBTSDIO is not set
+CONFIG_BT_HCIUART=m
+CONFIG_BT_HCIUART_H4=y
+CONFIG_BT_HCIUART_BCSP=y
+# CONFIG_BT_HCIUART_LL is not set
+# CONFIG_BT_HCIVHCI is not set
+# CONFIG_AF_RXRPC is not set
+# CONFIG_WIRELESS is not set
+# CONFIG_WIMAX is not set
+# CONFIG_RFKILL is not set
+# CONFIG_NET_9P is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+# CONFIG_PREVENT_FIRMWARE_BUILD is not set
+# CONFIG_FW_LOADER is not set
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_CONNECTOR is not set
+CONFIG_MTD=y
+# CONFIG_MTD_DEBUG is not set
+# CONFIG_MTD_CONCAT is not set
+CONFIG_MTD_PARTITIONS=y
+# CONFIG_MTD_TESTS is not set
+# CONFIG_MTD_REDBOOT_PARTS is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+# CONFIG_MTD_AR7_PARTS is not set
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLKDEVS=y
+CONFIG_MTD_BLOCK=y
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+# CONFIG_INFTL is not set
+# CONFIG_RFD_FTL is not set
+# CONFIG_SSFDC is not set
+# CONFIG_MTD_OOPS is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=y
+# CONFIG_MTD_JEDECPROBE is not set
+CONFIG_MTD_GEN_PROBE=y
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+# CONFIG_MTD_CFI_INTELEXT is not set
+CONFIG_MTD_CFI_AMDSTD=y
+# CONFIG_MTD_CFI_STAA is not set
+CONFIG_MTD_CFI_UTIL=y
+# CONFIG_MTD_RAM is not set
+# CONFIG_MTD_ROM is not set
+# CONFIG_MTD_ABSENT is not set
+
+#
+# Mapping drivers for chip access
+#
+# CONFIG_MTD_COMPLEX_MAPPINGS is not set
+CONFIG_MTD_PHYSMAP=y
+# CONFIG_MTD_PHYSMAP_COMPAT is not set
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+CONFIG_MTD_DATAFLASH=y
+# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set
+# CONFIG_MTD_DATAFLASH_OTP is not set
+# CONFIG_MTD_M25P80 is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+# CONFIG_MTD_MTDRAM is not set
+# CONFIG_MTD_BLOCK2MTD is not set
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+# CONFIG_MTD_NAND is not set
+# CONFIG_MTD_ONENAND is not set
+
+#
+# LPDDR flash memory drivers
+#
+# CONFIG_MTD_LPDDR is not set
+
+#
+# UBI - Unsorted block images
+#
+# CONFIG_MTD_UBI is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_CRYPTOLOOP is not set
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_ATA_OVER_ETH is not set
+CONFIG_MISC_DEVICES=y
+CONFIG_ATMEL_PWM=y
+# CONFIG_ATMEL_TCLIB is not set
+# CONFIG_ICS932S401 is not set
+# CONFIG_ATMEL_SSC is not set
+# CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_ISL29003 is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_AT25 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_93CX6 is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+# CONFIG_SCSI_DMA is not set
+# CONFIG_SCSI_NETLINK is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
+# CONFIG_DUMMY is not set
+# CONFIG_BONDING is not set
+# CONFIG_MACVLAN is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+# CONFIG_VETH is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+# CONFIG_SMSC_PHY is not set
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
+CONFIG_NET_ETHERNET=y
+# CONFIG_MII is not set
+CONFIG_MACB=y
+# CONFIG_ENC28J60 is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
+# CONFIG_IBM_NEW_EMAC_ZMII is not set
+# CONFIG_IBM_NEW_EMAC_RGMII is not set
+# CONFIG_IBM_NEW_EMAC_TAH is not set
+# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
+# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
+# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
+# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
+# CONFIG_B44 is not set
+# CONFIG_NETDEV_1000 is not set
+# CONFIG_NETDEV_10000 is not set
+
+#
+# Wireless LAN
+#
+# CONFIG_WLAN_PRE80211 is not set
+# CONFIG_WLAN_80211 is not set
+
+#
+# Enable WiMAX (Networking options) to see the WiMAX drivers
+#
+# CONFIG_WAN is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+# CONFIG_ISDN is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+# CONFIG_INPUT_FF_MEMLESS is not set
+# CONFIG_INPUT_POLLDEV is not set
+
+#
+# Userland interfaces
+#
+# CONFIG_INPUT_MOUSEDEV is not set
+# CONFIG_INPUT_JOYDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+# CONFIG_KEYBOARD_ATKBD is not set
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_STOWAWAY is not set
+CONFIG_KEYBOARD_GPIO=y
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TABLET is not set
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_ADS7846=m
+# CONFIG_TOUCHSCREEN_FUJITSU is not set
+# CONFIG_TOUCHSCREEN_GUNZE is not set
+# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
+# CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_INEXIO is not set
+# CONFIG_TOUCHSCREEN_MK712 is not set
+# CONFIG_TOUCHSCREEN_PENMOUNT is not set
+# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
+# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
+# CONFIG_TOUCHSCREEN_WM97XX is not set
+# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
+# CONFIG_TOUCHSCREEN_TSC2007 is not set
+# CONFIG_INPUT_MISC is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+CONFIG_DEVKMEM=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_ATMEL=y
+CONFIG_SERIAL_ATMEL_CONSOLE=y
+CONFIG_SERIAL_ATMEL_PDC=y
+# CONFIG_SERIAL_ATMEL_TTYAT is not set
+# CONFIG_SERIAL_MAX3100 is not set
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_IPMI_HANDLER is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+CONFIG_I2C_ALGOBIT=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+CONFIG_I2C_GPIO=y
+# CONFIG_I2C_OCORES is not set
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+# CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+CONFIG_SPI=y
+# CONFIG_SPI_DEBUG is not set
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_ATMEL=y
+# CONFIG_SPI_BITBANG is not set
+# CONFIG_SPI_GPIO is not set
+
+#
+# SPI Protocol Masters
+#
+CONFIG_SPI_SPIDEV=y
+# CONFIG_SPI_TLE62X0 is not set
+CONFIG_ARCH_REQUIRE_GPIOLIB=y
+CONFIG_GPIOLIB=y
+# CONFIG_DEBUG_GPIO is not set
+# CONFIG_GPIO_SYSFS is not set
+
+#
+# Memory mapped GPIO expanders:
+#
+
+#
+# I2C GPIO expanders:
+#
+# CONFIG_GPIO_MAX732X is not set
+# CONFIG_GPIO_PCA953X is not set
+# CONFIG_GPIO_PCF857X is not set
+
+#
+# PCI GPIO expanders:
+#
+
+#
+# SPI GPIO expanders:
+#
+# CONFIG_GPIO_MAX7301 is not set
+# CONFIG_GPIO_MCP23S08 is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+CONFIG_HWMON=y
+# CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_AD7414 is not set
+# CONFIG_SENSORS_AD7418 is not set
+# CONFIG_SENSORS_ADCXX is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ADM1025 is not set
+# CONFIG_SENSORS_ADM1026 is not set
+# CONFIG_SENSORS_ADM1029 is not set
+# CONFIG_SENSORS_ADM1031 is not set
+# CONFIG_SENSORS_ADM9240 is not set
+# CONFIG_SENSORS_ADT7462 is not set
+# CONFIG_SENSORS_ADT7470 is not set
+# CONFIG_SENSORS_ADT7473 is not set
+# CONFIG_SENSORS_ADT7475 is not set
+# CONFIG_SENSORS_ATXP1 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_F71882FG is not set
+# CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_GL520SM is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM63 is not set
+# CONFIG_SENSORS_LM70 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM77 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM87 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_LM92 is not set
+# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
+# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
+# CONFIG_SENSORS_MAX1111 is not set
+# CONFIG_SENSORS_MAX1619 is not set
+# CONFIG_SENSORS_MAX6650 is not set
+# CONFIG_SENSORS_PC87360 is not set
+# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_SENSORS_DME1737 is not set
+# CONFIG_SENSORS_SMSC47M1 is not set
+# CONFIG_SENSORS_SMSC47M192 is not set
+# CONFIG_SENSORS_SMSC47B397 is not set
+# CONFIG_SENSORS_ADS7828 is not set
+# CONFIG_SENSORS_THMC50 is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83791D is not set
+# CONFIG_SENSORS_W83792D is not set
+# CONFIG_SENSORS_W83793 is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83L786NG is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_SENSORS_LIS3_SPI is not set
+# CONFIG_HWMON_DEBUG_CHIP is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+# CONFIG_SOFT_WATCHDOG is not set
+CONFIG_AT32AP700X_WDT=y
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_UCB1400_CORE is not set
+# CONFIG_TPS65010 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+# CONFIG_VIDEO_DEV is not set
+# CONFIG_DVB_CORE is not set
+# CONFIG_VIDEO_MEDIA is not set
+
+#
+# Multimedia drivers
+#
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+CONFIG_FB=y
+# CONFIG_FIRMWARE_EDID is not set
+# CONFIG_FB_DDC is not set
+# CONFIG_FB_BOOT_VESA_SUPPORT is not set
+CONFIG_FB_CFB_FILLRECT=y
+CONFIG_FB_CFB_COPYAREA=y
+CONFIG_FB_CFB_IMAGEBLIT=y
+# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
+# CONFIG_FB_SYS_FILLRECT is not set
+# CONFIG_FB_SYS_COPYAREA is not set
+# CONFIG_FB_SYS_IMAGEBLIT is not set
+# CONFIG_FB_FOREIGN_ENDIAN is not set
+# CONFIG_FB_SYS_FOPS is not set
+# CONFIG_FB_SVGALIB is not set
+# CONFIG_FB_MACMODES is not set
+# CONFIG_FB_BACKLIGHT is not set
+# CONFIG_FB_MODE_HELPERS is not set
+# CONFIG_FB_TILEBLITTING is not set
+
+#
+# Frame buffer hardware drivers
+#
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_ATMEL=y
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LCD_CLASS_DEVICE=y
+# CONFIG_LCD_LTV350QV is not set
+# CONFIG_LCD_ILI9320 is not set
+# CONFIG_LCD_TDO24M is not set
+# CONFIG_LCD_VGG2432A4 is not set
+# CONFIG_LCD_PLATFORM is not set
+CONFIG_BACKLIGHT_CLASS_DEVICE=y
+# CONFIG_BACKLIGHT_ATMEL_LCDC is not set
+# CONFIG_BACKLIGHT_ATMEL_PWM is not set
+CONFIG_BACKLIGHT_GENERIC=y
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+
+#
+# Console display driver support
+#
+CONFIG_DUMMY_CONSOLE=y
+# CONFIG_FRAMEBUFFER_CONSOLE is not set
+# CONFIG_LOGO is not set
+CONFIG_SOUND=m
+CONFIG_SOUND_OSS_CORE=y
+CONFIG_SND=m
+CONFIG_SND_TIMER=m
+CONFIG_SND_PCM=m
+# CONFIG_SND_SEQUENCER is not set
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
+CONFIG_SND_PCM_OSS_PLUGINS=y
+# CONFIG_SND_DYNAMIC_MINORS is not set
+# CONFIG_SND_SUPPORT_OLD_API is not set
+# CONFIG_SND_VERBOSE_PROCFS is not set
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+CONFIG_SND_VMASTER=y
+CONFIG_SND_AC97_CODEC=m
+CONFIG_SND_DRIVERS=y
+# CONFIG_SND_DUMMY is not set
+# CONFIG_SND_MTPAV is not set
+# CONFIG_SND_SERIAL_U16550 is not set
+# CONFIG_SND_MPU401 is not set
+# CONFIG_SND_AC97_POWER_SAVE is not set
+
+#
+# Atmel devices (AVR32 and AT91)
+#
+# CONFIG_SND_ATMEL_ABDAC is not set
+CONFIG_SND_ATMEL_AC97C=m
+# CONFIG_SND_SPI is not set
+# CONFIG_SND_SOC is not set
+# CONFIG_SOUND_PRIME is not set
+CONFIG_AC97_BUS=m
+CONFIG_HID_SUPPORT=y
+CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
+# CONFIG_HIDRAW is not set
+# CONFIG_HID_PID is not set
+
+#
+# Special HID drivers
+#
+# CONFIG_HID_APPLE is not set
+CONFIG_USB_SUPPORT=y
+# CONFIG_USB_ARCH_HAS_HCD is not set
+# CONFIG_USB_ARCH_HAS_OHCI is not set
+# CONFIG_USB_ARCH_HAS_EHCI is not set
+# CONFIG_USB_OTG_WHITELIST is not set
+# CONFIG_USB_OTG_BLACKLIST_HUB is not set
+# CONFIG_USB_MUSB_HDRC is not set
+# CONFIG_USB_GADGET_MUSB_HDRC is not set
+
+#
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
+#
+CONFIG_USB_GADGET=m
+# CONFIG_USB_GADGET_DEBUG is not set
+CONFIG_USB_GADGET_DEBUG_FILES=y
+# CONFIG_USB_GADGET_DEBUG_FS is not set
+CONFIG_USB_GADGET_VBUS_DRAW=2
+CONFIG_USB_GADGET_SELECTED=y
+# CONFIG_USB_GADGET_AT91 is not set
+CONFIG_USB_GADGET_ATMEL_USBA=y
+CONFIG_USB_ATMEL_USBA=m
+# CONFIG_USB_GADGET_FSL_USB2 is not set
+# CONFIG_USB_GADGET_LH7A40X is not set
+# CONFIG_USB_GADGET_OMAP is not set
+# CONFIG_USB_GADGET_PXA25X is not set
+# CONFIG_USB_GADGET_PXA27X is not set
+# CONFIG_USB_GADGET_S3C2410 is not set
+# CONFIG_USB_GADGET_IMX is not set
+# CONFIG_USB_GADGET_M66592 is not set
+# CONFIG_USB_GADGET_AMD5536UDC is not set
+# CONFIG_USB_GADGET_FSL_QE is not set
+# CONFIG_USB_GADGET_CI13XXX is not set
+# CONFIG_USB_GADGET_NET2280 is not set
+# CONFIG_USB_GADGET_GOKU is not set
+# CONFIG_USB_GADGET_DUMMY_HCD is not set
+CONFIG_USB_GADGET_DUALSPEED=y
+# CONFIG_USB_ZERO is not set
+# CONFIG_USB_ETH is not set
+# CONFIG_USB_GADGETFS is not set
+CONFIG_USB_FILE_STORAGE=m
+# CONFIG_USB_FILE_STORAGE_TEST is not set
+CONFIG_USB_G_SERIAL=m
+# CONFIG_USB_MIDI_GADGET is not set
+# CONFIG_USB_G_PRINTER is not set
+# CONFIG_USB_CDC_COMPOSITE is not set
+
+#
+# OTG and related infrastructure
+#
+# CONFIG_USB_GPIO_VBUS is not set
+# CONFIG_NOP_USB_XCEIV is not set
+CONFIG_MMC=y
+# CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD/SDIO Card Drivers
+#
+CONFIG_MMC_BLOCK=y
+CONFIG_MMC_BLOCK_BOUNCE=y
+# CONFIG_SDIO_UART is not set
+# CONFIG_MMC_TEST is not set
+
+#
+# MMC/SD/SDIO Host Controller Drivers
+#
+# CONFIG_MMC_SDHCI is not set
+CONFIG_MMC_ATMELMCI=y
+# CONFIG_MMC_ATMELMCI_DMA is not set
+# CONFIG_MMC_SPI is not set
+# CONFIG_MEMSTICK is not set
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+
+#
+# LED drivers
+#
+CONFIG_LEDS_ATMEL_PWM=y
+# CONFIG_LEDS_PCA9532 is not set
+CONFIG_LEDS_GPIO=y
+CONFIG_LEDS_GPIO_PLATFORM=y
+# CONFIG_LEDS_LP5521 is not set
+# CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_DAC124S085 is not set
+# CONFIG_LEDS_BD2802 is not set
+
+#
+# LED Triggers
+#
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=y
+CONFIG_LEDS_TRIGGER_HEARTBEAT=y
+# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
+# CONFIG_LEDS_TRIGGER_GPIO is not set
+# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=m
+CONFIG_RTC_CLASS=m
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_SYSFS=y
+CONFIG_RTC_INTF_PROC=y
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+# CONFIG_RTC_DRV_PCF8563 is not set
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+CONFIG_RTC_DRV_S35390A=m
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+# CONFIG_RTC_DRV_M41T94 is not set
+# CONFIG_RTC_DRV_DS1305 is not set
+# CONFIG_RTC_DRV_DS1390 is not set
+# CONFIG_RTC_DRV_MAX6902 is not set
+# CONFIG_RTC_DRV_R9701 is not set
+# CONFIG_RTC_DRV_RS5C348 is not set
+# CONFIG_RTC_DRV_DS3234 is not set
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+CONFIG_RTC_DRV_AT32AP700X=m
+CONFIG_DMADEVICES=y
+
+#
+# DMA Devices
+#
+CONFIG_DW_DMAC=y
+CONFIG_DMA_ENGINE=y
+
+#
+# DMA Clients
+#
+# CONFIG_NET_DMA is not set
+# CONFIG_ASYNC_TX_DMA is not set
+# CONFIG_DMATEST is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+# CONFIG_UIO_PDRV_GENIRQ is not set
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+# CONFIG_EXT2_FS_POSIX_ACL is not set
+# CONFIG_EXT2_FS_SECURITY is not set
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
+CONFIG_EXT3_FS_XATTR=y
+# CONFIG_EXT3_FS_POSIX_ACL is not set
+# CONFIG_EXT3_FS_SECURITY is not set
+# CONFIG_EXT4_FS is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_FS_POSIX_ACL is not set
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_OCFS2_FS is not set
+# CONFIG_BTRFS_FS is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=850
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+CONFIG_NTFS_FS=m
+# CONFIG_NTFS_DEBUG is not set
+CONFIG_NTFS_RW=y
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+# CONFIG_PROC_KCORE is not set
+CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+# CONFIG_TMPFS_POSIX_ACL is not set
+# CONFIG_HUGETLB_PAGE is not set
+CONFIG_CONFIGFS_FS=y
+CONFIG_MISC_FILESYSTEMS=y
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_JFFS2_FS=y
+CONFIG_JFFS2_FS_DEBUG=0
+CONFIG_JFFS2_FS_WRITEBUFFER=y
+# CONFIG_JFFS2_FS_WBUF_VERIFY is not set
+# CONFIG_JFFS2_SUMMARY is not set
+# CONFIG_JFFS2_FS_XATTR is not set
+# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
+CONFIG_JFFS2_ZLIB=y
+# CONFIG_JFFS2_LZO is not set
+CONFIG_JFFS2_RTIME=y
+# CONFIG_JFFS2_RUBIN is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_SQUASHFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_OMFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+CONFIG_ROOT_NFS=y
+# CONFIG_NFSD is not set
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+CONFIG_SMB_FS=m
+CONFIG_SMB_NLS_DEFAULT=y
+CONFIG_SMB_NLS_REMOTE="cp437"
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+# CONFIG_CIFS_STATS2 is not set
+CONFIG_CIFS_WEAK_PW_HASH=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+# CONFIG_CIFS_DEBUG2 is not set
+# CONFIG_CIFS_EXPERIMENTAL is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+CONFIG_NLS_CODEPAGE_850=y
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+# CONFIG_NLS_CODEPAGE_932 is not set
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+# CONFIG_NLS_ASCII is not set
+CONFIG_NLS_ISO8859_1=y
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+CONFIG_NLS_UTF8=y
+# CONFIG_DLM is not set
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_ENABLE_WARN_DEPRECATED=y
+CONFIG_ENABLE_MUST_CHECK=y
+CONFIG_FRAME_WARN=1024
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_HEADERS_CHECK is not set
+CONFIG_DEBUG_KERNEL=y
+# CONFIG_DEBUG_SHIRQ is not set
+CONFIG_DETECT_SOFTLOCKUP=y
+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
+CONFIG_SCHED_DEBUG=y
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_TIMER_STATS is not set
+# CONFIG_DEBUG_OBJECTS is not set
+# CONFIG_DEBUG_RT_MUTEXES is not set
+# CONFIG_RT_MUTEX_TESTER is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+# CONFIG_DEBUG_KOBJECT is not set
+CONFIG_DEBUG_BUGVERBOSE=y
+# CONFIG_DEBUG_INFO is not set
+# CONFIG_DEBUG_VM is not set
+# CONFIG_DEBUG_WRITECOUNT is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_DEBUG_LIST is not set
+# CONFIG_DEBUG_SG is not set
+# CONFIG_DEBUG_NOTIFIERS is not set
+CONFIG_FRAME_POINTER=y
+# CONFIG_BOOT_PRINTK_DELAY is not set
+# CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_BACKTRACE_SELF_TEST is not set
+# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
+# CONFIG_FAULT_INJECTION is not set
+# CONFIG_PAGE_POISONING is not set
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_SAMPLES is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+# CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
+
+#
+# Library routines
+#
+CONFIG_BITREVERSE=y
+CONFIG_GENERIC_FIND_LAST_BIT=y
+CONFIG_CRC_CCITT=y
+# CONFIG_CRC16 is not set
+# CONFIG_CRC_T10DIF is not set
+# CONFIG_CRC_ITU_T is not set
+CONFIG_CRC32=y
+# CONFIG_CRC7 is not set
+# CONFIG_LIBCRC32C is not set
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_GENERIC_ALLOCATOR=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y

From e1696834e8a15d7ef9ae8ffdffe00bac1399a2e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:32:59 +0200
Subject: [PATCH 133/162] xfs: update max log size

Commit a6634fba3dec4a92f0a2c4e30c80b634c0576ad5 in xfsprogs increased the
maximum log size supported by mkfs.  Merged back the changes to xfs_fs.h
so the growfs enforced the same limit and the headers are in sync.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/xfs_fs.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f7c06fac8229..c4ea51b55dce 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -239,10 +239,13 @@ typedef struct xfs_fsop_resblks {
  * Minimum and maximum sizes need for growth checks
  */
 #define XFS_MIN_AG_BLOCKS	64
-#define XFS_MIN_LOG_BLOCKS	512
-#define XFS_MAX_LOG_BLOCKS	(64 * 1024)
-#define XFS_MIN_LOG_BYTES	(256 * 1024)
-#define XFS_MAX_LOG_BYTES	(128 * 1024 * 1024)
+#define XFS_MIN_LOG_BLOCKS	512ULL
+#define XFS_MAX_LOG_BLOCKS	(1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES	(10 * 1024 * 1024ULL)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+	((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 
 /*
  * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT

From 0c5e1ce89f1eacc366ec421c0f5f681159479c28 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:33:21 +0200
Subject: [PATCH 134/162] xfs: validate quota log items during log recovery

Arkadiusz has seen really strange crashes in xfs_qm_dqcheck that
I can only explain by a log item being too smal to actually fit the
xfs_dqblk_t we're dereferencing all over xfs_qm_dqcheck.  So add
graceful checks for NULL or too small quota items to the log recovery
code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/xfs_log_recover.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7ba450116d4f..47da2fb45377 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1975,16 +1975,30 @@ xlog_recover_do_reg_buffer(
 		error = 0;
 		if (buf_f->blf_flags &
 		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+			if (item->ri_buf[i].i_addr == NULL) {
+				cmn_err(CE_ALERT,
+					"XFS: NULL dquot in %s.", __func__);
+				goto next;
+			}
+			if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) {
+				cmn_err(CE_ALERT,
+					"XFS: dquot too small (%d) in %s.",
+					item->ri_buf[i].i_len, __func__);
+				goto next;
+			}
 			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
 					       item->ri_buf[i].i_addr,
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
+			if (error)
+				goto next;
 		}
-		if (!error)
-			memcpy(xfs_buf_offset(bp,
-				(uint)bit << XFS_BLI_SHIFT),	/* dest */
-				item->ri_buf[i].i_addr,		/* source */
-				nbits<<XFS_BLI_SHIFT);		/* length */
+
+		memcpy(xfs_buf_offset(bp,
+			(uint)bit << XFS_BLI_SHIFT),	/* dest */
+			item->ri_buf[i].i_addr,		/* source */
+			nbits<<XFS_BLI_SHIFT);		/* length */
+ next:
 		i++;
 		bit += nbits;
 	}
@@ -2615,7 +2629,19 @@ xlog_recover_do_dquot_trans(
 		return (0);
 
 	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-	ASSERT(recddq);
+
+	if (item->ri_buf[1].i_addr == NULL) {
+		cmn_err(CE_ALERT,
+			"XFS: NULL dquot in %s.", __func__);
+		return XFS_ERROR(EIO);
+	}
+	if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) {
+		cmn_err(CE_ALERT,
+			"XFS: dquot too small (%d) in %s.",
+			item->ri_buf[1].i_len, __func__);
+		return XFS_ERROR(EIO);
+	}
+
 	/*
 	 * This type of quotas was turned off, so ignore this record.
 	 */

From 7d095257e321214e4cf359abd131ba1f09c60cba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:33:32 +0200
Subject: [PATCH 135/162] xfs: kill xfs_qmops

Kill the quota ops function vector and replace it with direct calls or
stubs in the CONFIG_XFS_QUOTA=n case.

Make sure we check XFS_IS_QUOTA_RUNNING in the right spots.  We can remove
the number of those checks because the XFS_TRANS_DQ_DIRTY flag can't be set
otherwise.

This brings us back closer to the way this code worked in IRIX and earlier
Linux versions, but we keep a lot of the more useful factoring of common
code.

Eventually we should also kill xfs_qm_bhv.c, but that's left for a later
patch.

Reduces the size of the source code by about 250 lines and the size of
XFS module by about 1.5 kilobytes with quotas enabled:

   text	   data	    bss	    dec	    hex	filename
 615957	   2960	   3848	 622765	  980ad	fs/xfs/xfs.o
 617231	   3152	   3848	 624231	  98667	fs/xfs/xfs.o.old

Fallout:

 - xfs_qm_dqattach is split into xfs_qm_dqattach_locked which expects
   the inode locked and xfs_qm_dqattach which does the locking around it,
   thus removing XFS_QMOPT_ILOCKED.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/Makefile                |   3 +-
 fs/xfs/linux-2.6/xfs_ioctl.c   |  24 +++---
 fs/xfs/linux-2.6/xfs_super.c   |  22 +++--
 fs/xfs/linux-2.6/xfs_sync.c    |   7 +-
 fs/xfs/quota/xfs_dquot.c       |   4 +-
 fs/xfs/quota/xfs_dquot.h       |   1 -
 fs/xfs/quota/xfs_qm.c          | 139 ++++++++++++++++--------------
 fs/xfs/quota/xfs_qm.h          |  21 -----
 fs/xfs/quota/xfs_qm_bhv.c      |  76 +----------------
 fs/xfs/quota/xfs_trans_dquot.c |  65 ++++++--------
 fs/xfs/xfs_attr.c              |  12 +--
 fs/xfs/xfs_bmap.c              |  33 +++----
 fs/xfs/xfs_bmap_btree.c        |   4 +-
 fs/xfs/xfs_iget.c              |   5 +-
 fs/xfs/xfs_iomap.c             |  12 +--
 fs/xfs/xfs_mount.c             | 103 ++++++++++++++++++----
 fs/xfs/xfs_mount.h             |  84 +-----------------
 fs/xfs/xfs_qmops.c             | 152 ---------------------------------
 fs/xfs/xfs_quota.h             | 116 ++++++++++++-------------
 fs/xfs/xfs_rename.c            |   3 +-
 fs/xfs/xfs_trans.c             |  15 ++--
 fs/xfs/xfs_utils.c             |   2 +-
 fs/xfs/xfs_vnodeops.c          |  99 ++++++++++-----------
 23 files changed, 377 insertions(+), 625 deletions(-)
 delete mode 100644 fs/xfs/xfs_qmops.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 60f107e47fe9..222c59e541ce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,8 +88,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_utils.o \
 				   xfs_vnodeops.o \
 				   xfs_rw.o \
-				   xfs_dmops.o \
-				   xfs_qmops.o
+				   xfs_dmops.o
 
 xfs-$(CONFIG_XFS_TRACE)		+= xfs_btree_trace.o \
 				   xfs_dir2_trace.o
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b499418a7d..c7d684f02f89 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -908,7 +908,8 @@ xfs_ioctl_setattr(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	unsigned int		lock_flags = 0;
-	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
 	struct xfs_dquot	*olddquot = NULL;
 	int			code;
 
@@ -928,7 +929,7 @@ xfs_ioctl_setattr(
 	 * because the i_*dquot fields will get updated anyway.
 	 */
 	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
-		code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid,
+		code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
 					 ip->i_d.di_gid, fa->fsx_projid,
 					 XFS_QMOPT_PQUOTA, &udqp, &gdqp);
 		if (code)
@@ -963,10 +964,11 @@ xfs_ioctl_setattr(
 	 * Do a quota reservation only if projid is actually going to change.
 	 */
 	if (mask & FSX_PROJID) {
-		if (XFS_IS_PQUOTA_ON(mp) &&
+		if (XFS_IS_QUOTA_RUNNING(mp) &&
+		    XFS_IS_PQUOTA_ON(mp) &&
 		    ip->i_d.di_projid != fa->fsx_projid) {
 			ASSERT(tp);
-			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
 						XFS_QMOPT_FORCE_RES : 0);
 			if (code)	/* out of quota */
@@ -1068,8 +1070,8 @@ xfs_ioctl_setattr(
 		 * in the transaction.
 		 */
 		if (ip->i_d.di_projid != fa->fsx_projid) {
-			if (XFS_IS_PQUOTA_ON(mp)) {
-				olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+				olddquot = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
 			ip->i_d.di_projid = fa->fsx_projid;
@@ -1115,9 +1117,9 @@ xfs_ioctl_setattr(
 	/*
 	 * Release any dquot(s) the inode had kept before chown.
 	 */
-	XFS_QM_DQRELE(mp, olddquot);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(olddquot);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 
 	if (code)
 		return code;
@@ -1131,8 +1133,8 @@ xfs_ioctl_setattr(
 	return 0;
 
  error_return:
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 	xfs_trans_cancel(tp, 0);
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bb685269f832..0d9b64b219e0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -405,6 +405,14 @@ xfs_parseargs(
 		return EINVAL;
 	}
 
+#ifndef CONFIG_XFS_QUOTA
+	if (XFS_IS_QUOTA_RUNNING(mp)) {
+		cmn_err(CE_WARN,
+			"XFS: quota support not available in this kernel.");
+		return EINVAL;
+	}
+#endif
+
 	if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
 	    (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
 		cmn_err(CE_WARN,
@@ -1098,7 +1106,6 @@ xfs_fs_put_super(
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
-	xfs_qmops_put(mp);
 	xfs_dmops_put(mp);
 	xfs_free_fsname(mp);
 	kfree(mp);
@@ -1168,6 +1175,7 @@ xfs_fs_statfs(
 {
 	struct xfs_mount	*mp = XFS_M(dentry->d_sb);
 	xfs_sb_t		*sbp = &mp->m_sb;
+	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
 	__uint64_t		fakeinos, id;
 	xfs_extlen_t		lsize;
 
@@ -1196,7 +1204,10 @@ xfs_fs_statfs(
 	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	spin_unlock(&mp->m_sb_lock);
 
-	XFS_QM_DQSTATVFS(XFS_I(dentry->d_inode), statp);
+	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
+	    ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
+			      (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+		xfs_qm_statvfs(ip, statp);
 	return 0;
 }
 
@@ -1404,16 +1415,13 @@ xfs_fs_fill_super(
 	error = xfs_dmops_get(mp);
 	if (error)
 		goto out_free_fsname;
-	error = xfs_qmops_get(mp);
-	if (error)
-		goto out_put_dmops;
 
 	if (silent)
 		flags |= XFS_MFSI_QUIET;
 
 	error = xfs_open_devices(mp);
 	if (error)
-		goto out_put_qmops;
+		goto out_put_dmops;
 
 	if (xfs_icsb_init_counters(mp))
 		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1482,8 +1490,6 @@ xfs_fs_fill_super(
  out_destroy_counters:
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
- out_put_qmops:
-	xfs_qmops_put(mp);
  out_put_dmops:
 	xfs_dmops_put(mp);
  out_free_fsname:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f7ba76633c29..b06b95c154cc 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -43,6 +43,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_rw.h"
+#include "xfs_quota.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -317,12 +318,12 @@ xfs_quiesce_data(
 
 	/* push non-blocking */
 	xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
-	XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+	xfs_qm_sync(mp, SYNC_BDFLUSH);
 	xfs_filestream_flush(mp);
 
 	/* push and block */
 	xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
-	XFS_QM_DQSYNC(mp, SYNC_WAIT);
+	xfs_qm_sync(mp, SYNC_WAIT);
 
 	/* write superblock and hoover up shutdown errors */
 	error = xfs_sync_fsdata(mp, 0);
@@ -467,7 +468,7 @@ xfs_sync_worker(
 		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
 		xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 		/* dgc: errors ignored here */
-		error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+		error = xfs_qm_sync(mp, SYNC_BDFLUSH);
 		error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
 		if (xfs_log_need_covered(mp))
 			error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e4babcc63423..4d6d051ee56e 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1194,7 +1194,9 @@ void
 xfs_qm_dqrele(
 	xfs_dquot_t	*dqp)
 {
-	ASSERT(dqp);
+	if (!dqp)
+		return;
+
 	xfs_dqtrace_entry(dqp, "DQRELE");
 
 	xfs_dqlock(dqp);
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index de0f402ddb4c..6533ead9b889 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -181,7 +181,6 @@ extern void		xfs_qm_adjust_dqlimits(xfs_mount_t *,
 extern int		xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 					xfs_dqid_t, uint, uint, xfs_dquot_t **);
 extern void		xfs_qm_dqput(xfs_dquot_t *);
-extern void		xfs_qm_dqrele(xfs_dquot_t *);
 extern void		xfs_dqlock(xfs_dquot_t *);
 extern void		xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
 extern void		xfs_dqunlock(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 5b6695049e00..aa5d8212661c 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -287,11 +287,13 @@ xfs_qm_rele_quotafs_ref(
  * Just destroy the quotainfo structure.
  */
 void
-xfs_qm_unmount_quotadestroy(
-	xfs_mount_t	*mp)
+xfs_qm_unmount(
+	struct xfs_mount	*mp)
 {
-	if (mp->m_quotainfo)
+	if (mp->m_quotainfo) {
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
 		xfs_qm_destroy_quotainfo(mp);
+	}
 }
 
 
@@ -385,8 +387,13 @@ xfs_qm_mount_quotas(
 	if (error) {
 		xfs_fs_cmn_err(CE_WARN, mp,
 			"Failed to initialize disk quotas.");
+		return;
 	}
-	return;
+
+#ifdef QUOTADEBUG
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_qm_internalqcheck(mp);
+#endif
 }
 
 /*
@@ -774,12 +781,11 @@ xfs_qm_dqattach_grouphint(
  * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
  * into account.
  * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
  * Inode may get unlocked and relocked in here, and the caller must deal with
  * the consequences.
  */
 int
-xfs_qm_dqattach(
+xfs_qm_dqattach_locked(
 	xfs_inode_t	*ip,
 	uint		flags)
 {
@@ -787,17 +793,14 @@ xfs_qm_dqattach(
 	uint		nquotas = 0;
 	int		error = 0;
 
-	if ((! XFS_IS_QUOTA_ON(mp)) ||
-	    (! XFS_NOT_DQATTACHED(mp, ip)) ||
-	    (ip->i_ino == mp->m_sb.sb_uquotino) ||
-	    (ip->i_ino == mp->m_sb.sb_gquotino))
+	if (!XFS_IS_QUOTA_RUNNING(mp) ||
+	    !XFS_IS_QUOTA_ON(mp) ||
+	    !XFS_NOT_DQATTACHED(mp, ip) ||
+	    ip->i_ino == mp->m_sb.sb_uquotino ||
+	    ip->i_ino == mp->m_sb.sb_gquotino)
 		return 0;
 
-	ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
-	       xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	if (! (flags & XFS_QMOPT_ILOCKED))
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	if (XFS_IS_UQUOTA_ON(mp)) {
 		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
@@ -849,8 +852,7 @@ xfs_qm_dqattach(
 		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
 	}
 
-      done:
-
+ done:
 #ifdef QUOTADEBUG
 	if (! error) {
 		if (XFS_IS_UQUOTA_ON(mp))
@@ -858,15 +860,22 @@ xfs_qm_dqattach(
 		if (XFS_IS_OQUOTA_ON(mp))
 			ASSERT(ip->i_gdquot);
 	}
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 #endif
+	return error;
+}
 
-	if (! (flags & XFS_QMOPT_ILOCKED))
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+int
+xfs_qm_dqattach(
+	struct xfs_inode	*ip,
+	uint			flags)
+{
+	int			error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_qm_dqattach_locked(ip, flags);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-#ifdef QUOTADEBUG
-	else
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-#endif
 	return error;
 }
 
@@ -912,7 +921,7 @@ xfs_qm_sync(
 	boolean_t	nowait;
 	int		error;
 
-	if (! XFS_IS_QUOTA_ON(mp))
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
 	restarts = 0;
@@ -2319,20 +2328,20 @@ xfs_qm_write_sb_changes(
  */
 int
 xfs_qm_vop_dqalloc(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	uid_t		uid,
-	gid_t		gid,
-	prid_t		prid,
-	uint		flags,
-	xfs_dquot_t	**O_udqpp,
-	xfs_dquot_t	**O_gdqpp)
+	struct xfs_inode	*ip,
+	uid_t			uid,
+	gid_t			gid,
+	prid_t			prid,
+	uint			flags,
+	struct xfs_dquot	**O_udqpp,
+	struct xfs_dquot	**O_gdqpp)
 {
-	int		error;
-	xfs_dquot_t	*uq, *gq;
-	uint		lockflags;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_dquot	*uq, *gq;
+	int			error;
+	uint			lockflags;
 
-	if (!XFS_IS_QUOTA_ON(mp))
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
 	lockflags = XFS_ILOCK_EXCL;
@@ -2346,8 +2355,8 @@ xfs_qm_vop_dqalloc(
 	 * if necessary. The dquot(s) will not be locked.
 	 */
 	if (XFS_NOT_DQATTACHED(mp, ip)) {
-		if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
-					    XFS_QMOPT_ILOCKED))) {
+		error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+		if (error) {
 			xfs_iunlock(ip, lockflags);
 			return error;
 		}
@@ -2469,6 +2478,7 @@ xfs_qm_vop_chown(
 	uint		bfield = XFS_IS_REALTIME_INODE(ip) ?
 				 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
 
+
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
 
@@ -2508,13 +2518,13 @@ xfs_qm_vop_chown_reserve(
 	xfs_dquot_t	*gdqp,
 	uint		flags)
 {
-	int		error;
-	xfs_mount_t	*mp;
+	xfs_mount_t	*mp = ip->i_mount;
 	uint		delblks, blkflags, prjflags = 0;
 	xfs_dquot_t	*unresudq, *unresgdq, *delblksudq, *delblksgdq;
+	int		error;
+
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	mp = ip->i_mount;
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	delblks = ip->i_delayed_blks;
@@ -2582,28 +2592,23 @@ xfs_qm_vop_chown_reserve(
 
 int
 xfs_qm_vop_rename_dqattach(
-	xfs_inode_t	**i_tab)
+	struct xfs_inode	**i_tab)
 {
-	xfs_inode_t	*ip;
-	int		i;
-	int		error;
+	struct xfs_mount	*mp = i_tab[0]->i_mount;
+	int			i;
 
-	ip = i_tab[0];
-
-	if (! XFS_IS_QUOTA_ON(ip->i_mount))
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
-	if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
-		error = xfs_qm_dqattach(ip, 0);
-		if (error)
-			return error;
-	}
-	for (i = 1; (i < 4 && i_tab[i]); i++) {
+	for (i = 0; (i < 4 && i_tab[i]); i++) {
+		struct xfs_inode	*ip = i_tab[i];
+		int			error;
+
 		/*
 		 * Watch out for duplicate entries in the table.
 		 */
-		if ((ip = i_tab[i]) != i_tab[i-1]) {
-			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+		if (i == 0 || ip != i_tab[i-1]) {
+			if (XFS_NOT_DQATTACHED(mp, ip)) {
 				error = xfs_qm_dqattach(ip, 0);
 				if (error)
 					return error;
@@ -2614,17 +2619,19 @@ xfs_qm_vop_rename_dqattach(
 }
 
 void
-xfs_qm_vop_dqattach_and_dqmod_newinode(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp)
+xfs_qm_vop_create_dqattach(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp)
 {
-	if (!XFS_IS_QUOTA_ON(tp->t_mountp))
+	struct xfs_mount	*mp = tp->t_mountp;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	if (udqp) {
 		xfs_dqlock(udqp);
@@ -2632,7 +2639,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
 		xfs_dqunlock(udqp);
 		ASSERT(ip->i_udquot == NULL);
 		ip->i_udquot = udqp;
-		ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp));
+		ASSERT(XFS_IS_UQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
 		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
@@ -2642,8 +2649,8 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
 		xfs_dqunlock(gdqp);
 		ASSERT(ip->i_gdquot == NULL);
 		ip->i_gdquot = gdqp;
-		ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp));
-		ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ?
+		ASSERT(XFS_IS_OQUOTA_ON(mp));
+		ASSERT((XFS_IS_GQUOTA_ON(mp) ?
 			ip->i_d.di_gid : ip->i_d.di_projid) ==
 				be32_to_cpu(gdqp->q_core.d_id));
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index a371954cae1b..495564b8af38 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -127,8 +127,6 @@ typedef struct xfs_quotainfo {
 } xfs_quotainfo_t;
 
 
-extern xfs_dqtrxops_t	xfs_trans_dquot_ops;
-
 extern void	xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
 extern int	xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
 			xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
@@ -159,17 +157,11 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT	5
 
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern void		xfs_qm_mount_quotas(xfs_mount_t *);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
-extern void		xfs_qm_unmount_quotadestroy(xfs_mount_t *);
-extern void		xfs_qm_unmount_quotas(xfs_mount_t *);
 extern int		xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
-extern int		xfs_qm_sync(xfs_mount_t *, int);
 
 /* dquot stuff */
 extern boolean_t	xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int		xfs_qm_dqattach(xfs_inode_t *, uint);
-extern void		xfs_qm_dqdetach(xfs_inode_t *);
 extern int		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
@@ -183,19 +175,6 @@ extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
 extern int		xfs_qm_scall_quotaon(xfs_mount_t *, uint);
 extern int		xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
 
-/* vop stuff */
-extern int		xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
-					uid_t, gid_t, prid_t, uint,
-					xfs_dquot_t **, xfs_dquot_t **);
-extern void		xfs_qm_vop_dqattach_and_dqmod_newinode(
-					xfs_trans_t *, xfs_inode_t *,
-					xfs_dquot_t *, xfs_dquot_t *);
-extern int		xfs_qm_vop_rename_dqattach(xfs_inode_t **);
-extern xfs_dquot_t *	xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
-					xfs_dquot_t **, xfs_dquot_t *);
-extern int		xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
-					xfs_dquot_t *, xfs_dquot_t *, uint);
-
 /* list stuff */
 extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
 extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 63037c689a4b..56a5965f3c8b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -84,7 +84,7 @@ xfs_fill_statvfs_from_dquot(
  * return a statvfs of the project, not the entire filesystem.
  * This makes such trees appear as if they are filesystems in themselves.
  */
-STATIC void
+void
 xfs_qm_statvfs(
 	xfs_inode_t		*ip,
 	struct kstatfs		*statp)
@@ -92,20 +92,13 @@ xfs_qm_statvfs(
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_dquot_t		*dqp;
 
-	if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
-	    !((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
-	    		      (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		return;
-
 	if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
-		xfs_disk_dquot_t	*dp = &dqp->q_core;
-
-		xfs_fill_statvfs_from_dquot(statp, dp);
+		xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
 		xfs_qm_dqput(dqp);
 	}
 }
 
-STATIC int
+int
 xfs_qm_newmount(
 	xfs_mount_t	*mp,
 	uint		*needquotamount,
@@ -114,9 +107,6 @@ xfs_qm_newmount(
 	uint		quotaondisk;
 	uint		uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
 
-	*quotaflags = 0;
-	*needquotamount = B_FALSE;
-
 	quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
 				(mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
 
@@ -179,66 +169,6 @@ xfs_qm_newmount(
 	return 0;
 }
 
-STATIC int
-xfs_qm_endmount(
-	xfs_mount_t	*mp,
-	uint		needquotamount,
-	uint		quotaflags)
-{
-	if (needquotamount) {
-		ASSERT(mp->m_qflags == 0);
-		mp->m_qflags = quotaflags;
-		xfs_qm_mount_quotas(mp);
-	}
-
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-	if (! (XFS_IS_QUOTA_ON(mp)))
-		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
-	else
-		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
-#endif
-
-#ifdef QUOTADEBUG
-	if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
-		cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
-#endif
-
-	return 0;
-}
-
-STATIC void
-xfs_qm_dqrele_null(
-	xfs_dquot_t	*dq)
-{
-	/*
-	 * Called from XFS, where we always check first for a NULL dquot.
-	 */
-	if (!dq)
-		return;
-	xfs_qm_dqrele(dq);
-}
-
-
-struct xfs_qmops xfs_qmcore_xfs = {
-	.xfs_qminit		= xfs_qm_newmount,
-	.xfs_qmdone		= xfs_qm_unmount_quotadestroy,
-	.xfs_qmmount		= xfs_qm_endmount,
-	.xfs_qmunmount		= xfs_qm_unmount_quotas,
-	.xfs_dqrele		= xfs_qm_dqrele_null,
-	.xfs_dqattach		= xfs_qm_dqattach,
-	.xfs_dqdetach		= xfs_qm_dqdetach,
-	.xfs_dqpurgeall		= xfs_qm_dqpurge_all,
-	.xfs_dqvopalloc		= xfs_qm_vop_dqalloc,
-	.xfs_dqvopcreate	= xfs_qm_vop_dqattach_and_dqmod_newinode,
-	.xfs_dqvoprename	= xfs_qm_vop_rename_dqattach,
-	.xfs_dqvopchown		= xfs_qm_vop_chown,
-	.xfs_dqvopchownresv	= xfs_qm_vop_chown_reserve,
-	.xfs_dqstatvfs		= xfs_qm_statvfs,
-	.xfs_dqsync		= xfs_qm_sync,
-	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
-};
-EXPORT_SYMBOL(xfs_qmcore_xfs);
-
 void __init
 xfs_qm_init(void)
 {
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 447173bcf96d..eafa7ab34085 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -111,7 +111,7 @@ xfs_trans_log_dquot(
  * Carry forward whatever is left of the quota blk reservation to
  * the spanky new transaction
  */
-STATIC void
+void
 xfs_trans_dup_dqinfo(
 	xfs_trans_t	*otp,
 	xfs_trans_t	*ntp)
@@ -167,19 +167,17 @@ xfs_trans_dup_dqinfo(
 /*
  * Wrap around mod_dquot to account for both user and group quotas.
  */
-STATIC void
+void
 xfs_trans_mod_dquot_byino(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*ip,
 	uint		field,
 	long		delta)
 {
-	xfs_mount_t	*mp;
+	xfs_mount_t	*mp = tp->t_mountp;
 
-	ASSERT(tp);
-	mp = tp->t_mountp;
-
-	if (!XFS_IS_QUOTA_ON(mp) ||
+	if (!XFS_IS_QUOTA_RUNNING(mp) ||
+	    !XFS_IS_QUOTA_ON(mp) ||
 	    ip->i_ino == mp->m_sb.sb_uquotino ||
 	    ip->i_ino == mp->m_sb.sb_gquotino)
 		return;
@@ -229,6 +227,7 @@ xfs_trans_mod_dquot(
 	xfs_dqtrx_t	*qtrx;
 
 	ASSERT(tp);
+	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
 	qtrx = NULL;
 
 	if (tp->t_dqinfo == NULL)
@@ -346,7 +345,7 @@ xfs_trans_dqlockedjoin(
  * Unreserve just the reservations done by this transaction.
  * dquot is still left locked at exit.
  */
-STATIC void
+void
 xfs_trans_apply_dquot_deltas(
 	xfs_trans_t		*tp)
 {
@@ -357,7 +356,7 @@ xfs_trans_apply_dquot_deltas(
 	long			totalbdelta;
 	long			totalrtbdelta;
 
-	if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY))
+	if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
 		return;
 
 	ASSERT(tp->t_dqinfo);
@@ -531,7 +530,7 @@ xfs_trans_apply_dquot_deltas(
  * we simply throw those away, since that's the expected behavior
  * when a transaction is curtailed without a commit.
  */
-STATIC void
+void
 xfs_trans_unreserve_and_mod_dquots(
 	xfs_trans_t		*tp)
 {
@@ -768,7 +767,7 @@ xfs_trans_reserve_quota_bydquots(
 {
 	int		resvd = 0, error;
 
-	if (!XFS_IS_QUOTA_ON(mp))
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
 	if (tp && tp->t_dqinfo == NULL)
@@ -811,18 +810,17 @@ xfs_trans_reserve_quota_bydquots(
  * This doesn't change the actual usage, just the reservation.
  * The inode sent in is locked.
  */
-STATIC int
+int
 xfs_trans_reserve_quota_nblks(
-	xfs_trans_t	*tp,
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	long		nblks,
-	long		ninos,
-	uint		flags)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	long			nblks,
+	long			ninos,
+	uint			flags)
 {
-	int		error;
+	struct xfs_mount	*mp = ip->i_mount;
 
-	if (!XFS_IS_QUOTA_ON(mp))
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 	if (XFS_IS_PQUOTA_ON(mp))
 		flags |= XFS_QMOPT_ENOSPC;
@@ -831,7 +829,6 @@ xfs_trans_reserve_quota_nblks(
 	ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
 	ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
 				XFS_TRANS_DQ_RES_RTBLKS ||
 	       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -840,11 +837,9 @@ xfs_trans_reserve_quota_nblks(
 	/*
 	 * Reserve nblks against these dquots, with trans as the mediator.
 	 */
-	error = xfs_trans_reserve_quota_bydquots(tp, mp,
-						 ip->i_udquot, ip->i_gdquot,
-						 nblks, ninos,
-						 flags);
-	return error;
+	return xfs_trans_reserve_quota_bydquots(tp, mp,
+						ip->i_udquot, ip->i_gdquot,
+						nblks, ninos, flags);
 }
 
 /*
@@ -895,25 +890,15 @@ STATIC void
 xfs_trans_alloc_dqinfo(
 	xfs_trans_t	*tp)
 {
-	(tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+	tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
 }
 
-STATIC void
+void
 xfs_trans_free_dqinfo(
 	xfs_trans_t	*tp)
 {
 	if (!tp->t_dqinfo)
 		return;
-	kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo);
-	(tp)->t_dqinfo = NULL;
+	kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
+	tp->t_dqinfo = NULL;
 }
-
-xfs_dqtrxops_t	xfs_trans_dquot_ops = {
-	.qo_dup_dqinfo			= xfs_trans_dup_dqinfo,
-	.qo_free_dqinfo			= xfs_trans_free_dqinfo,
-	.qo_mod_dquot_byino		= xfs_trans_mod_dquot_byino,
-	.qo_apply_dquot_deltas		= xfs_trans_apply_dquot_deltas,
-	.qo_reserve_quota_nblks		= xfs_trans_reserve_quota_nblks,
-	.qo_reserve_quota_bydquots	= xfs_trans_reserve_quota_bydquots,
-	.qo_unreserve_and_mod_dquots	= xfs_trans_unreserve_and_mod_dquots,
-};
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 5fde1654b430..cd1008b1c4cd 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -249,8 +249,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	/*
 	 * Attach the dquots to the inode.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
-		return (error);
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
 
 	/*
 	 * If the inode doesn't have an attribute fork, add one.
@@ -311,7 +312,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
+	error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
 				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 				       XFS_QMOPT_RES_REGBLKS);
 	if (error) {
@@ -501,8 +502,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	/*
 	 * Attach the dquots to the inode.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
-		return (error);
+	error = xfs_qm_dqattach(dp, 0);
+	if (error)
+		return error;
 
 	/*
 	 * Start our first transaction of the day.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ca7c6005a487..4b0f6efb046c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2691,7 +2691,7 @@ xfs_bmap_rtalloc(
 		 * Adjust the disk quota also. This was reserved
 		 * earlier.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 	} else {
@@ -2995,7 +2995,7 @@ xfs_bmap_btalloc(
 		 * Adjust the disk quota also. This was reserved
 		 * earlier.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 			ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
 					XFS_TRANS_DQ_BCOUNT,
 			(long) args.len);
@@ -3066,7 +3066,7 @@ xfs_bmap_btree_to_extents(
 		return error;
 	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
 	ip->i_d.di_nblocks--;
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, cbp);
 	if (cur->bc_bufs[0] == cbp)
 		cur->bc_bufs[0] = NULL;
@@ -3386,7 +3386,7 @@ xfs_bmap_del_extent(
 	 * Adjust quota data.
 	 */
 	if (qfield)
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
 
 	/*
 	 * Account for change in delayed indirect blocks.
@@ -3523,7 +3523,7 @@ xfs_bmap_extents_to_btree(
 	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
 	cur->bc_private.b.allocated++;
 	ip->i_d.di_nblocks++;
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
 	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
 	/*
 	 * Fill in the child block.
@@ -3690,7 +3690,7 @@ xfs_bmap_local_to_extents(
 		XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork);
 		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
 		ip->i_d.di_nblocks = 1;
-		XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
+		xfs_trans_mod_dquot_byino(tp, ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
 		flags |= xfs_ilog_fext(whichfork);
 	} else {
@@ -4048,7 +4048,7 @@ xfs_bmap_add_attrfork(
 			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
 		goto error0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
 			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			XFS_QMOPT_RES_REGBLKS);
 	if (error) {
@@ -4983,10 +4983,11 @@ xfs_bmapi(
 				 * adjusted later.  We return if we haven't
 				 * allocated blocks already inside this loop.
 				 */
-				if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS(
-						mp, NULL, ip, (long)alen, 0,
+				error = xfs_trans_reserve_quota_nblks(
+						NULL, ip, (long)alen, 0,
 						rt ? XFS_QMOPT_RES_RTBLKS :
-						     XFS_QMOPT_RES_REGBLKS))) {
+						     XFS_QMOPT_RES_REGBLKS);
+				if (error) {
 					if (n == 0) {
 						*nmap = 0;
 						ASSERT(cur == NULL);
@@ -5035,8 +5036,8 @@ xfs_bmapi(
 					if (XFS_IS_QUOTA_ON(mp))
 						/* unreserve the blocks now */
 						(void)
-						XFS_TRANS_UNRESERVE_QUOTA_NBLKS(
-							mp, NULL, ip,
+						xfs_trans_unreserve_quota_nblks(
+							NULL, ip,
 							(long)alen, 0, rt ?
 							XFS_QMOPT_RES_RTBLKS :
 							XFS_QMOPT_RES_REGBLKS);
@@ -5691,14 +5692,14 @@ xfs_bunmapi(
 				do_div(rtexts, mp->m_sb.sb_rextsize);
 				xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
 						(int64_t)rtexts, rsvd);
-				(void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
-					NULL, ip, -((long)del.br_blockcount), 0,
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_RTBLKS);
 			} else {
 				xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
 						(int64_t)del.br_blockcount, rsvd);
-				(void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
-					NULL, ip, -((long)del.br_blockcount), 0,
+				(void)xfs_trans_reserve_quota_nblks(NULL,
+					ip, -((long)del.br_blockcount), 0,
 					XFS_QMOPT_RES_REGBLKS);
 			}
 			ip->i_delayed_blks -= del.br_blockcount;
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0760d352586f..5c1ade06578e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -590,7 +590,7 @@ xfs_bmbt_alloc_block(
 	cur->bc_private.b.allocated++;
 	cur->bc_private.b.ip->i_d.di_nblocks++;
 	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-	XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
 
 	new->l = cpu_to_be64(args.fsbno);
@@ -618,7 +618,7 @@ xfs_bmbt_free_block(
 	ip->i_d.di_nblocks--;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, bp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 89b81eedce6a..73e1c0d767ac 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -500,10 +500,7 @@ xfs_ireclaim(
 	 * ilock one but will still hold the iolock.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	/*
-	 * Release dquots (and their references) if any.
-	 */
-	XFS_QM_DQDETACH(ip->i_mount, ip);
+	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
 	switch (ip->i_d.di_mode & S_IFMT) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5aaa2d7ec155..feb30a92549b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -385,7 +385,7 @@ xfs_iomap_write_direct(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-	error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -444,8 +444,7 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error_out;
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
-					      qblocks, 0, quota_flag);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 	if (error)
 		goto error1;
 
@@ -495,7 +494,7 @@ xfs_iomap_write_direct(
 
 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 	xfs_bmap_cancel(&free_list);
-	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+	xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -582,7 +581,7 @@ xfs_iomap_write_delay(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-	error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -684,7 +683,8 @@ xfs_iomap_write_allocate(
 	/*
 	 * Make sure that the dquots are there.
 	 */
-	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 65a99725d0cc..b659db5e7165 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -959,6 +959,53 @@ xfs_check_sizes(xfs_mount_t *mp)
 	return 0;
 }
 
+/*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+	struct xfs_mount	*mp)
+{
+	int			error;
+	struct xfs_trans	*tp;
+
+	mp->m_qflags = 0;
+
+	/*
+	 * It is OK to look at sb_qflags here in mount path,
+	 * without m_sb_lock.
+	 */
+	if (mp->m_sb.sb_qflags == 0)
+		return 0;
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags = 0;
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * If the fs is readonly, let the incore superblock run
+	 * with quotas off but don't flush the update out to disk
+	 */
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return 0;
+
+#ifdef QUOTADEBUG
+	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				      XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_mount_reset_sbqflags: Superblock update failed!");
+		return error;
+	}
+
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	return xfs_trans_commit(tp, 0);
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -976,7 +1023,8 @@ xfs_mountfs(
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
-	uint		quotamount, quotaflags;
+	uint		quotamount = 0;
+	uint		quotaflags = 0;
 	int		error = 0;
 
 	xfs_mount_common(mp, sbp);
@@ -1210,9 +1258,28 @@ xfs_mountfs(
 	/*
 	 * Initialise the XFS quota management subsystem for this mount
 	 */
-	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
-	if (error)
-		goto out_rtunmount;
+	if (XFS_IS_QUOTA_RUNNING(mp)) {
+		error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
+		if (error)
+			goto out_rtunmount;
+	} else {
+		ASSERT(!XFS_IS_QUOTA_ON(mp));
+
+		/*
+		 * If a file system had quotas running earlier, but decided to
+		 * mount without -o uquota/pquota/gquota options, revoke the
+		 * quotachecked license.
+		 */
+		if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+			cmn_err(CE_NOTE,
+				"XFS: resetting qflags for filesystem %s",
+				mp->m_fsname);
+
+			error = xfs_mount_reset_sbqflags(mp);
+			if (error)
+				return error;
+		}
+	}
 
 	/*
 	 * Finish recovering the file system.  This part needed to be
@@ -1228,9 +1295,19 @@ xfs_mountfs(
 	/*
 	 * Complete the quota initialisation, post-log-replay component.
 	 */
-	error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
-	if (error)
-		goto out_rtunmount;
+	if (quotamount) {
+		ASSERT(mp->m_qflags == 0);
+		mp->m_qflags = quotaflags;
+
+		xfs_qm_mount_quotas(mp);
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	if (XFS_IS_QUOTA_ON(mp))
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
+	else
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
+#endif
 
 	/*
 	 * Now we are mounted, reserve a small amount of unused space for
@@ -1279,12 +1356,7 @@ xfs_unmountfs(
 	__uint64_t		resblks;
 	int			error;
 
-	/*
-	 * Release dquot that rootinode, rbmino and rsumino might be holding,
-	 * and release the quota inodes.
-	 */
-	XFS_QM_UNMOUNT(mp);
-
+	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
 
@@ -1301,10 +1373,7 @@ xfs_unmountfs(
 	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
 	xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
 
-	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
-
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
+	xfs_qm_unmount(mp);
 
 	/*
 	 * Flush out the log synchronously so that we know for sure
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d6a64392f983..a5122382afde 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -64,6 +64,8 @@ struct xfs_swapext;
 struct xfs_mru_cache;
 struct xfs_nameops;
 struct xfs_ail;
+struct xfs_quotainfo;
+
 
 /*
  * Prototypes and functions for the Data Migration subsystem.
@@ -107,86 +109,6 @@ typedef struct xfs_dmops {
 	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
 
 
-/*
- * Prototypes and functions for the Quota Management subsystem.
- */
-
-struct xfs_dquot;
-struct xfs_dqtrxops;
-struct xfs_quotainfo;
-
-typedef int	(*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int	(*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
-typedef void	(*xfs_qmunmount_t)(struct xfs_mount *);
-typedef void	(*xfs_qmdone_t)(struct xfs_mount *);
-typedef void	(*xfs_dqrele_t)(struct xfs_dquot *);
-typedef int	(*xfs_dqattach_t)(struct xfs_inode *, uint);
-typedef void	(*xfs_dqdetach_t)(struct xfs_inode *);
-typedef int	(*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
-typedef int	(*xfs_dqvopalloc_t)(struct xfs_mount *,
-			struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-			struct xfs_dquot **, struct xfs_dquot **);
-typedef void	(*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot *, struct xfs_dquot *);
-typedef int	(*xfs_dqvoprename_t)(struct xfs_inode **);
-typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
-			struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot **, struct xfs_dquot *);
-typedef int	(*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
-			struct xfs_dquot *, struct xfs_dquot *, uint);
-typedef void	(*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
-typedef int	(*xfs_dqsync_t)(struct xfs_mount *, int flags);
-
-typedef struct xfs_qmops {
-	xfs_qminit_t		xfs_qminit;
-	xfs_qmdone_t		xfs_qmdone;
-	xfs_qmmount_t		xfs_qmmount;
-	xfs_qmunmount_t		xfs_qmunmount;
-	xfs_dqrele_t		xfs_dqrele;
-	xfs_dqattach_t		xfs_dqattach;
-	xfs_dqdetach_t		xfs_dqdetach;
-	xfs_dqpurgeall_t	xfs_dqpurgeall;
-	xfs_dqvopalloc_t	xfs_dqvopalloc;
-	xfs_dqvopcreate_t	xfs_dqvopcreate;
-	xfs_dqvoprename_t	xfs_dqvoprename;
-	xfs_dqvopchown_t	xfs_dqvopchown;
-	xfs_dqvopchownresv_t	xfs_dqvopchownresv;
-	xfs_dqstatvfs_t		xfs_dqstatvfs;
-	xfs_dqsync_t		xfs_dqsync;
-	struct xfs_dqtrxops	*xfs_dqtrxops;
-} xfs_qmops_t;
-
-#define XFS_QM_INIT(mp, mnt, fl) \
-	(*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl) \
-	(*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
-#define XFS_QM_UNMOUNT(mp) \
-	(*(mp)->m_qm_ops->xfs_qmunmount)(mp)
-#define XFS_QM_DONE(mp) \
-	(*(mp)->m_qm_ops->xfs_qmdone)(mp)
-#define XFS_QM_DQRELE(mp, dq) \
-	(*(mp)->m_qm_ops->xfs_dqrele)(dq)
-#define XFS_QM_DQATTACH(mp, ip, fl) \
-	(*(mp)->m_qm_ops->xfs_dqattach)(ip, fl)
-#define XFS_QM_DQDETACH(mp, ip) \
-	(*(mp)->m_qm_ops->xfs_dqdetach)(ip)
-#define XFS_QM_DQPURGEALL(mp, fl) \
-	(*(mp)->m_qm_ops->xfs_dqpurgeall)(mp, fl)
-#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \
-	(*(mp)->m_qm_ops->xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2)
-#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
-	(*(mp)->m_qm_ops->xfs_dqvopcreate)(tp, ip, dq1, dq2)
-#define XFS_QM_DQVOPRENAME(mp, ip) \
-	(*(mp)->m_qm_ops->xfs_dqvoprename)(ip)
-#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
-	(*(mp)->m_qm_ops->xfs_dqvopchown)(tp, ip, dqp, dq)
-#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
-	(*(mp)->m_qm_ops->xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
-#define XFS_QM_DQSTATVFS(ip, statp) \
-	(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
-#define XFS_QM_DQSYNC(mp, flags) \
-	(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-
 #ifdef HAVE_PERCPU_SB
 
 /*
@@ -510,8 +432,6 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
-extern int	xfs_qmops_get(struct xfs_mount *);
-extern void	xfs_qmops_put(struct xfs_mount *);
 
 extern struct xfs_dmops xfs_dmcore_xfs;
 
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
deleted file mode 100644
index e101790ea8e7..000000000000
--- a/fs/xfs/xfs_qmops.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_error.h"
-
-
-STATIC struct xfs_dquot *
-xfs_dqvopchown_default(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	struct xfs_dquot	**dqp,
-	struct xfs_dquot	*dq)
-{
-	return NULL;
-}
-
-/*
- * Clear the quotaflags in memory and in the superblock.
- */
-int
-xfs_mount_reset_sbqflags(xfs_mount_t *mp)
-{
-	int			error;
-	xfs_trans_t		*tp;
-
-	mp->m_qflags = 0;
-	/*
-	 * It is OK to look at sb_qflags here in mount path,
-	 * without m_sb_lock.
-	 */
-	if (mp->m_sb.sb_qflags == 0)
-		return 0;
-	spin_lock(&mp->m_sb_lock);
-	mp->m_sb.sb_qflags = 0;
-	spin_unlock(&mp->m_sb_lock);
-
-	/*
-	 * if the fs is readonly, let the incore superblock run
-	 * with quotas off but don't flush the update out to disk
-	 */
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return 0;
-#ifdef QUOTADEBUG
-	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
-#endif
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				      XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_mount_reset_sbqflags: Superblock update failed!");
-		return error;
-	}
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
-	error = xfs_trans_commit(tp, 0);
-	return error;
-}
-
-STATIC int
-xfs_noquota_init(
-	xfs_mount_t	*mp,
-	uint		*needquotamount,
-	uint		*quotaflags)
-{
-	int		error = 0;
-
-	*quotaflags = 0;
-	*needquotamount = B_FALSE;
-
-	ASSERT(!XFS_IS_QUOTA_ON(mp));
-
-	/*
-	 * If a file system had quotas running earlier, but decided to
-	 * mount without -o uquota/pquota/gquota options, revoke the
-	 * quotachecked license.
-	 */
-	if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
-		cmn_err(CE_NOTE,
-                        "XFS resetting qflags for filesystem %s",
-                        mp->m_fsname);
-
-		error = xfs_mount_reset_sbqflags(mp);
-	}
-	return error;
-}
-
-static struct xfs_qmops xfs_qmcore_stub = {
-	.xfs_qminit		= (xfs_qminit_t) xfs_noquota_init,
-	.xfs_qmdone		= (xfs_qmdone_t) fs_noerr,
-	.xfs_qmmount		= (xfs_qmmount_t) fs_noerr,
-	.xfs_qmunmount		= (xfs_qmunmount_t) fs_noerr,
-	.xfs_dqrele		= (xfs_dqrele_t) fs_noerr,
-	.xfs_dqattach		= (xfs_dqattach_t) fs_noerr,
-	.xfs_dqdetach		= (xfs_dqdetach_t) fs_noerr,
-	.xfs_dqpurgeall		= (xfs_dqpurgeall_t) fs_noerr,
-	.xfs_dqvopalloc		= (xfs_dqvopalloc_t) fs_noerr,
-	.xfs_dqvopcreate	= (xfs_dqvopcreate_t) fs_noerr,
-	.xfs_dqvoprename	= (xfs_dqvoprename_t) fs_noerr,
-	.xfs_dqvopchown		= xfs_dqvopchown_default,
-	.xfs_dqvopchownresv	= (xfs_dqvopchownresv_t) fs_noerr,
-	.xfs_dqstatvfs		= (xfs_dqstatvfs_t) fs_noval,
-	.xfs_dqsync		= (xfs_dqsync_t) fs_noerr,
-};
-
-int
-xfs_qmops_get(struct xfs_mount *mp)
-{
-	if (XFS_IS_QUOTA_RUNNING(mp)) {
-#ifdef CONFIG_XFS_QUOTA
-		mp->m_qm_ops = &xfs_qmcore_xfs;
-#else
-		cmn_err(CE_WARN,
-			"XFS: qouta support not available in this kernel.");
-		return EINVAL;
-#endif
-	} else {
-		mp->m_qm_ops = &xfs_qmcore_stub;
-	}
-
-	return 0;
-}
-
-void
-xfs_qmops_put(struct xfs_mount *mp)
-{
-}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index f5d1202dde25..079d2e61c626 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -197,7 +197,6 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_UMOUNTING	0x0000100 /* filesys is being unmounted */
 #define XFS_QMOPT_DOLOG		0x0000200 /* log buf changes (in quotacheck) */
 #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_ILOCKED	0x0000800 /* inode is already locked (excl) */
 #define XFS_QMOPT_DQREPAIR	0x0001000 /* repair dquot if damaged */
 #define XFS_QMOPT_GQUOTA	0x0002000 /* group dquot requested */
 #define XFS_QMOPT_ENOSPC	0x0004000 /* enospc instead of edquot (prj) */
@@ -302,69 +301,72 @@ typedef struct xfs_dqtrx {
 	long		qt_delrtb_delta;  /* delayed RT blk count changes */
 } xfs_dqtrx_t;
 
-/*
- * Dquot transaction functions, used if quota is enabled.
- */
-typedef void	(*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *);
-typedef void	(*qo_mod_dquot_byino_t)(struct xfs_trans *,
-				struct xfs_inode *, uint, long);
-typedef void	(*qo_free_dqinfo_t)(struct xfs_trans *);
-typedef void	(*qo_apply_dquot_deltas_t)(struct xfs_trans *);
-typedef void	(*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *);
-typedef int	(*qo_reserve_quota_nblks_t)(
-				struct xfs_trans *, struct xfs_mount *,
-				struct xfs_inode *, long, long, uint);
-typedef int	(*qo_reserve_quota_bydquots_t)(
-				struct xfs_trans *, struct xfs_mount *,
-				struct xfs_dquot *, struct xfs_dquot *,
-				long, long, uint);
-typedef struct xfs_dqtrxops {
-	qo_dup_dqinfo_t			qo_dup_dqinfo;
-	qo_free_dqinfo_t		qo_free_dqinfo;
-	qo_mod_dquot_byino_t		qo_mod_dquot_byino;
-	qo_apply_dquot_deltas_t		qo_apply_dquot_deltas;
-	qo_reserve_quota_nblks_t	qo_reserve_quota_nblks;
-	qo_reserve_quota_bydquots_t	qo_reserve_quota_bydquots;
-	qo_unreserve_and_mod_dquots_t	qo_unreserve_and_mod_dquots;
-} xfs_dqtrxops_t;
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
+		uint, long);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+		struct xfs_inode *, long, long, uint);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+		struct xfs_mount *, struct xfs_dquot *,
+		struct xfs_dquot *, long, long, uint);
 
-#define XFS_DQTRXOP(mp, tp, op, args...) \
-		((mp)->m_qm_ops->xfs_dqtrxops ? \
-		((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : 0)
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
+		struct xfs_dquot **, struct xfs_dquot **);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
+		struct xfs_dquot *, struct xfs_dquot *);
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
+		struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
+extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
+		struct xfs_dquot *, struct xfs_dquot *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
+extern int xfs_qm_sync(struct xfs_mount *, int);
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
+extern void xfs_qm_unmount(struct xfs_mount *);
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
-#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \
-		((mp)->m_qm_ops->xfs_dqtrxops ? \
-		((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : (void)0)
+#else
+#define xfs_trans_dup_dqinfo(tp, tp2)
+#define xfs_trans_free_dqinfo(tp)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_apply_dquot_deltas(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)	(0)
+#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)	(0)
+#define xfs_qm_vop_dqalloc(ip, uid, gid, prid, fl, ou, og)		(0)
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_rename_dqattach(it)					(0)
+#define xfs_qm_vop_chown(tp, ip, old, new)				(NULL)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl)			(0)
+#define xfs_qm_dqattach(ip, fl)						(0)
+#define xfs_qm_dqattach_locked(ip, fl)					(0)
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d)
+#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_sync(mp, fl)						(0)
+#define xfs_qm_newmount(mp, a, b)					(0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)					(0)
+#endif /* CONFIG_XFS_QUOTA */
 
-#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \
-	XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp)
-#define XFS_TRANS_FREE_DQINFO(mp, tp) \
-	XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo)
-#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \
-	XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta)
-#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \
-	XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas)
-#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \
-	XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl)
-#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \
-	XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl)
-#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \
-	XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots)
-
-#define XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, flags) \
-	XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), -(ninos), flags)
-#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
-	XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \
-				f | XFS_QMOPT_RES_REGBLKS)
-#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
-	XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \
+#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+	xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
+	xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
 				f | XFS_QMOPT_RES_REGBLKS)
 
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
-extern struct xfs_qmops xfs_qmcore_xfs;
-
 #endif	/* __KERNEL__ */
-
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 58f85e9cd11d..b81deea0ce19 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -166,7 +166,8 @@ xfs_rename(
 	/*
 	 * Attach the dquots to the inodes
 	 */
-	if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
+	error = xfs_qm_vop_rename_dqattach(inodes);
+	if (error) {
 		xfs_trans_cancel(tp, cancel_flags);
 		goto std_return;
 	}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8570b826fedd..fffabc0aefcb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -297,7 +297,7 @@ xfs_trans_dup(
 	tp->t_rtx_res = tp->t_rtx_res_used;
 	ntp->t_pflags = tp->t_pflags;
 
-	XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
+	xfs_trans_dup_dqinfo(tp, ntp);
 
 	atomic_inc(&tp->t_mountp->m_active_trans);
 	return ntp;
@@ -831,7 +831,7 @@ shut_us_down:
 		 * means is that we have some (non-persistent) quota
 		 * reservations that need to be unreserved.
 		 */
-		XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+		xfs_trans_unreserve_and_mod_dquots(tp);
 		if (tp->t_ticket) {
 			commit_lsn = xfs_log_done(mp, tp->t_ticket,
 							NULL, log_flags);
@@ -850,10 +850,9 @@ shut_us_down:
 	/*
 	 * If we need to update the superblock, then do it now.
 	 */
-	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
 		xfs_trans_apply_sb_deltas(tp);
-	}
-	XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);
+	xfs_trans_apply_dquot_deltas(tp);
 
 	/*
 	 * Ask each log item how many log_vector entries it will
@@ -1058,7 +1057,7 @@ xfs_trans_uncommit(
 	}
 
 	xfs_trans_unreserve_and_mod_sb(tp);
-	XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
+	xfs_trans_unreserve_and_mod_dquots(tp);
 
 	xfs_trans_free_items(tp, flags);
 	xfs_trans_free_busy(tp);
@@ -1183,7 +1182,7 @@ xfs_trans_cancel(
 	}
 #endif
 	xfs_trans_unreserve_and_mod_sb(tp);
-	XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+	xfs_trans_unreserve_and_mod_dquots(tp);
 
 	if (tp->t_ticket) {
 		if (flags & XFS_TRANS_RELEASE_LOG_RES) {
@@ -1213,7 +1212,7 @@ xfs_trans_free(
 	xfs_trans_t	*tp)
 {
 	atomic_dec(&tp->t_mountp->m_active_trans);
-	XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+	xfs_trans_free_dqinfo(tp);
 	kmem_zone_free(xfs_trans_zone, tp);
 }
 
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 79b9e5ea5359..4d88616bde91 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -166,7 +166,7 @@ xfs_dir_ialloc(
 			xfs_buf_relse(ialloc_context);
 			if (dqinfo) {
 				tp->t_dqinfo = dqinfo;
-				XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+				xfs_trans_free_dqinfo(tp);
 			}
 			*tpp = ntp;
 			*ipp = NULL;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 19cf90a9c762..b56321b2b9f0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -118,7 +118,7 @@ xfs_setattr(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
+		code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
 					 qflags, &udqp, &gdqp);
 		if (code)
 			return code;
@@ -180,10 +180,11 @@ xfs_setattr(
 		 * Do a quota reservation only if uid/gid is actually
 		 * going to change.
 		 */
-		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
+		if (XFS_IS_QUOTA_RUNNING(mp) &&
+		    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+		     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
 			ASSERT(tp);
-			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
 						XFS_QMOPT_FORCE_RES : 0);
 			if (code)	/* out of quota */
@@ -217,7 +218,7 @@ xfs_setattr(
 		/*
 		 * Make sure that the dquots are attached to the inode.
 		 */
-		code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+		code = xfs_qm_dqattach_locked(ip, 0);
 		if (code)
 			goto error_return;
 
@@ -351,21 +352,21 @@ xfs_setattr(
 		 * in the transaction.
 		 */
 		if (iuid != uid) {
-			if (XFS_IS_UQUOTA_ON(mp)) {
+			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 				ASSERT(mask & ATTR_UID);
 				ASSERT(udqp);
-				olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+				olddquot1 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_udquot, udqp);
 			}
 			ip->i_d.di_uid = uid;
 			inode->i_uid = uid;
 		}
 		if (igid != gid) {
-			if (XFS_IS_GQUOTA_ON(mp)) {
+			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 				ASSERT(!XFS_IS_PQUOTA_ON(mp));
 				ASSERT(mask & ATTR_GID);
 				ASSERT(gdqp);
-				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+				olddquot2 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
 			ip->i_d.di_gid = gid;
@@ -461,10 +462,10 @@ xfs_setattr(
 	/*
 	 * Release any dquot(s) the inode had kept before chown.
 	 */
-	XFS_QM_DQRELE(mp, olddquot1);
-	XFS_QM_DQRELE(mp, olddquot2);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(olddquot1);
+	xfs_qm_dqrele(olddquot2);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 
 	if (code) {
 		return code;
@@ -482,8 +483,8 @@ xfs_setattr(
 	commit_flags |= XFS_TRANS_ABORT;
 	/* FALLTHROUGH */
  error_return:
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 	if (tp) {
 		xfs_trans_cancel(tp, commit_flags);
 	}
@@ -739,7 +740,8 @@ xfs_free_eofblocks(
 		/*
 		 * Attach the dquots to the inode up front.
 		 */
-		if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+		error = xfs_qm_dqattach(ip, 0);
+		if (error)
 			return error;
 
 		/*
@@ -1181,7 +1183,8 @@ xfs_inactive(
 
 	ASSERT(ip->i_d.di_nlink == 0);
 
-	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
 		return VN_INACTIVE_CACHE;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
@@ -1307,7 +1310,7 @@ xfs_inactive(
 		/*
 		 * Credit the quota account(s). The inode is gone.
 		 */
-		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
 		/*
 		 * Just ignore errors at this point.  There is nothing we can
@@ -1323,11 +1326,11 @@ xfs_inactive(
 			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
 				"xfs_trans_commit() returned error %d", error);
 	}
+
 	/*
 	 * Release the dquots held by inode, if any.
 	 */
-	XFS_QM_DQDETACH(mp, ip);
-
+	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
  out:
@@ -1427,8 +1430,7 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
@@ -1489,7 +1491,7 @@ xfs_create(
 	/*
 	 * Reserve disk quota and the inode.
 	 */
-	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
 	if (error)
 		goto out_trans_cancel;
 
@@ -1561,7 +1563,7 @@ xfs_create(
 	 * These ids of the inode couldn't have changed since the new
 	 * inode has been locked ever since it was created.
 	 */
-	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
 	/*
 	 * xfs_trans_commit normally decrements the vnode ref count
@@ -1580,8 +1582,8 @@ xfs_create(
 		goto out_dqrele;
 	}
 
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 
 	*ipp = ip;
 
@@ -1602,8 +1604,8 @@ xfs_create(
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
  out_dqrele:
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1837,11 +1839,11 @@ xfs_remove(
 			return error;
 	}
 
-	error = XFS_QM_DQATTACH(mp, dp, 0);
+	error = xfs_qm_dqattach(dp, 0);
 	if (error)
 		goto std_return;
 
-	error = XFS_QM_DQATTACH(mp, ip, 0);
+	error = xfs_qm_dqattach(ip, 0);
 	if (error)
 		goto std_return;
 
@@ -2028,11 +2030,11 @@ xfs_link(
 
 	/* Return through std_return after this point. */
 
-	error = XFS_QM_DQATTACH(mp, sip, 0);
+	error = xfs_qm_dqattach(sip, 0);
 	if (error)
 		goto std_return;
 
-	error = XFS_QM_DQATTACH(mp, tdp, 0);
+	error = xfs_qm_dqattach(tdp, 0);
 	if (error)
 		goto std_return;
 
@@ -2205,8 +2207,7 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
@@ -2248,7 +2249,7 @@ xfs_symlink(
 	/*
 	 * Reserve disk quota : blocks and inode.
 	 */
-	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
 	if (error)
 		goto error_return;
 
@@ -2288,7 +2289,7 @@ xfs_symlink(
 	/*
 	 * Also attach the dquot(s) to it, if applicable.
 	 */
-	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
 	if (resblks)
 		resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -2376,8 +2377,8 @@ xfs_symlink(
 		goto error2;
 	}
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 
 	/* Fall through to std_return with error = 0 or errno from
 	 * xfs_trans_commit	*/
@@ -2401,8 +2402,8 @@ std_return:
 	cancel_flags |= XFS_TRANS_ABORT;
  error_return:
 	xfs_trans_cancel(tp, cancel_flags);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -2541,7 +2542,8 @@ xfs_alloc_file_space(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
 		return error;
 
 	if (len <= 0)
@@ -2628,8 +2630,8 @@ retry:
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
-						      qblocks, 0, quota_flag);
+		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+						      0, quota_flag);
 		if (error)
 			goto error1;
 
@@ -2688,7 +2690,7 @@ dmapi_enospc_check:
 
 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 	xfs_bmap_cancel(&free_list);
-	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+	xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -2827,7 +2829,8 @@ xfs_free_file_space(
 
 	xfs_itrace_entry(ip);
 
-	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
 		return error;
 
 	error = 0;
@@ -2953,9 +2956,9 @@ xfs_free_file_space(
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
-				ip->i_udquot, ip->i_gdquot, resblks, 0,
-				XFS_QMOPT_RES_REGBLKS);
+		error = xfs_trans_reserve_quota(tp, mp,
+				ip->i_udquot, ip->i_gdquot,
+				resblks, 0, XFS_QMOPT_RES_REGBLKS);
 		if (error)
 			goto error1;
 

From 5a34d5cd096310133f9208db294021208a96660d Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 8 Jun 2009 15:35:03 +0200
Subject: [PATCH 136/162] xfs: split inode data writeback from
 xfs_sync_inodes_ag

In many cases we only want to sync inode data. Start spliting the inode sync
into data sync and inode sync by factoring out the inode data flush.

[hch: minor cleanups]

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c | 52 +++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b06b95c154cc..7adc62dd14bb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -48,6 +48,35 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+
+STATIC int
+xfs_sync_inode_data(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct address_space *mapping = inode->i_mapping;
+	int			error = 0;
+
+	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		goto out_wait;
+
+	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+		if (flags & SYNC_TRYLOCK)
+			goto out_wait;
+		xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	}
+
+	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
+				0 : XFS_B_ASYNC, FI_NONE);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ out_wait:
+	if (flags & SYNC_IOWAIT)
+		xfs_ioend_wait(ip);
+	return error;
+}
+
 /*
  * Sync all the inodes in the given AG according to the
  * direction given by the flags.
@@ -123,27 +152,10 @@ xfs_sync_inodes_ag(
 		 * If we have to flush data or wait for I/O completion
 		 * we need to hold the iolock.
 		 */
-		if (flags & SYNC_DELWRI) {
-			if (VN_DIRTY(inode)) {
-				if (flags & SYNC_TRYLOCK) {
-					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-						lock_flags |= XFS_IOLOCK_SHARED;
-				} else {
-					xfs_ilock(ip, XFS_IOLOCK_SHARED);
-					lock_flags |= XFS_IOLOCK_SHARED;
-				}
-				if (lock_flags & XFS_IOLOCK_SHARED) {
-					error = xfs_flush_pages(ip, 0, -1,
-							(flags & SYNC_WAIT) ? 0
-								: XFS_B_ASYNC,
-							FI_NONE);
-				}
-			}
-			if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
-				xfs_ioend_wait(ip);
-		}
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		if (flags & SYNC_DELWRI)
+			error = xfs_sync_inode_data(ip, flags);
 
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
 			if (flags & SYNC_WAIT) {
 				xfs_iflock(ip);

From 845b6d0cbbc2304e8a54ed4038272c55f85b2269 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:35:05 +0200
Subject: [PATCH 137/162] xfs: split inode flushing from xfs_sync_inodes_ag

In many cases we only want to sync inode metadata. Split out the inode
flushing into a separate helper to prepare factoring the inode sync code.

Based on a patch from Dave Chinner, but redone to keep the current behaviour
exactly and leave changes to the flushing logic to another patch.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c | 50 ++++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 7adc62dd14bb..1712caa12017 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -77,6 +77,35 @@ xfs_sync_inode_data(
 	return error;
 }
 
+STATIC int
+xfs_sync_inode_attr(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	int			error = 0;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_inode_clean(ip))
+		goto out_unlock;
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(flags & SYNC_WAIT))
+			goto out_unlock;
+		xfs_iflock(ip);
+	}
+
+	if (xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		goto out_unlock;
+	}
+
+	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
+			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+
+ out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
 /*
  * Sync all the inodes in the given AG according to the
  * direction given by the flags.
@@ -96,7 +125,6 @@ xfs_sync_inodes_ag(
 	do {
 		struct inode	*inode;
 		xfs_inode_t	*ip = NULL;
-		int		lock_flags = XFS_ILOCK_SHARED;
 
 		/*
 		 * use a gang lookup to find the next inode in the tree
@@ -155,22 +183,10 @@ xfs_sync_inodes_ag(
 		if (flags & SYNC_DELWRI)
 			error = xfs_sync_inode_data(ip, flags);
 
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
-			if (flags & SYNC_WAIT) {
-				xfs_iflock(ip);
-				if (!xfs_inode_clean(ip))
-					error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-				else
-					xfs_ifunlock(ip);
-			} else if (xfs_iflock_nowait(ip)) {
-				if (!xfs_inode_clean(ip))
-					error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
-				else
-					xfs_ifunlock(ip);
-			}
-		}
-		xfs_iput(ip, lock_flags);
+		if (flags & SYNC_ATTR)
+			error = xfs_sync_inode_attr(ip, flags);
+
+		IRELE(ip);
 
 		if (error)
 			last_error = error;

From 1da8eecab5f866b4f5be43adbaadf18e259a8cc5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 8 Jun 2009 15:35:07 +0200
Subject: [PATCH 138/162] xfs: factor out inode validation for sync

Separate the validation of inodes found by the radix
tree walk from the radix tree lookup.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c | 59 +++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1712caa12017..c91d5b2a568e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -49,6 +49,39 @@
 #include <linux/freezer.h>
 
 
+/* must be called with pag_ici_lock held and releases it */
+STATIC int
+xfs_sync_inode_valid(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		read_unlock(&pag->pag_ici_lock);
+		return EFSCORRUPTED;
+	}
+
+	/*
+	 * If we can't get a reference on the inode, it must be in reclaim.
+	 * Leave it for the reclaim code to flush. Also avoid inodes that
+	 * haven't been fully initialised.
+	 */
+	if (!igrab(inode)) {
+		read_unlock(&pag->pag_ici_lock);
+		return ENOENT;
+	}
+	read_unlock(&pag->pag_ici_lock);
+
+	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+		IRELE(ip);
+		return ENOENT;
+	}
+
+	return 0;
+}
+
 STATIC int
 xfs_sync_inode_data(
 	struct xfs_inode	*ip,
@@ -123,7 +156,6 @@ xfs_sync_inodes_ag(
 	int		last_error = 0;
 
 	do {
-		struct inode	*inode;
 		xfs_inode_t	*ip = NULL;
 
 		/*
@@ -152,27 +184,10 @@ xfs_sync_inodes_ag(
 			break;
 		}
 
-		/* nothing to sync during shutdown */
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			read_unlock(&pag->pag_ici_lock);
-			return 0;
-		}
-
-		/*
-		 * If we can't get a reference on the inode, it must be
-		 * in reclaim. Leave it for the reclaim code to flush.
-		 */
-		inode = VFS_I(ip);
-		if (!igrab(inode)) {
-			read_unlock(&pag->pag_ici_lock);
-			continue;
-		}
-		read_unlock(&pag->pag_ici_lock);
-
-		/* avoid new or bad inodes */
-		if (is_bad_inode(inode) ||
-		    xfs_iflags_test(ip, XFS_INEW)) {
-			IRELE(ip);
+		error = xfs_sync_inode_valid(ip, pag);
+		if (error) {
+			if (error == EFSCORRUPTED)
+				return 0;
 			continue;
 		}
 

From abc1064742604e60a47a65fa3214dc1a84db093d Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 8 Jun 2009 15:35:12 +0200
Subject: [PATCH 139/162] xfs: remove unused parameter from xfs_reclaim_inodes

The noblock parameter of xfs_reclaim_inodes is only ever set to zero. Remove
it and all the conditional code that is never executed.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c | 22 ++++------------------
 fs/xfs/linux-2.6/xfs_sync.h |  2 +-
 fs/xfs/xfs_mount.c          |  2 +-
 3 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index c91d5b2a568e..7eb9d9cca52a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -385,7 +385,7 @@ xfs_quiesce_fs(
 	int	count = 0, pincount;
 
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice.  The first instance of the loop
@@ -509,7 +509,7 @@ xfs_sync_worker(
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-		xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_BDFLUSH);
 		error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
@@ -703,7 +703,6 @@ STATIC void
 xfs_reclaim_inodes_ag(
 	xfs_mount_t	*mp,
 	int		ag,
-	int		noblock,
 	int		mode)
 {
 	xfs_inode_t	*ip = NULL;
@@ -749,25 +748,13 @@ restart:
 			continue;
 		}
 
-		if (noblock) {
-			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				read_unlock(&pag->pag_ici_lock);
-				continue;
-			}
-			if (xfs_ipincount(ip) ||
-			    !xfs_iflock_nowait(ip)) {
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				read_unlock(&pag->pag_ici_lock);
-				continue;
-			}
-		}
 		read_unlock(&pag->pag_ici_lock);
 
 		/*
 		 * hmmm - this is an inode already in reclaim. Do
 		 * we even bother catching it here?
 		 */
-		if (xfs_reclaim_inode(ip, noblock, mode))
+		if (xfs_reclaim_inode(ip, 0, mode))
 			skipped++;
 	} while (nr_found);
 
@@ -782,7 +769,6 @@ restart:
 int
 xfs_reclaim_inodes(
 	xfs_mount_t	*mp,
-	int		 noblock,
 	int		mode)
 {
 	int		i;
@@ -790,7 +776,7 @@ xfs_reclaim_inodes(
 	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
 		if (!mp->m_perag[i].pag_ici_init)
 			continue;
-		xfs_reclaim_inodes_ag(mp, i, noblock, mode);
+		xfs_reclaim_inodes_ag(mp, i, mode);
 	}
 	return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 308d5bf6dfbd..d7b2b5f1c387 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,7 +48,7 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
 void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
-int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b659db5e7165..5c6f092659c1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1371,7 +1371,7 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
+	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
 
 	xfs_qm_unmount(mp);
 

From 75f3cb1393133682958db6f157e1b6473e5a366b Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 8 Jun 2009 15:35:14 +0200
Subject: [PATCH 140/162] xfs: introduce a per-ag inode iterator

Given that we walk across the per-ag inode lists so often, it makes sense to
introduce an iterator for this.

Convert the sync and reclaim code to use this new iterator, quota code will
follow in the next patch.

Also change xfs_reclaim_inode to return -EGAIN instead of 1 for an inode
already under reclaim.  This simplifies the AG iterator and doesn't
matter for the only other caller.

[hch: merged the lookup and execute callbacks back into one to get the
 pag_ici_lock locking correct and simplify the code flow]

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c | 316 +++++++++++++++++-------------------
 fs/xfs/xfs_ag.h             |   2 +
 2 files changed, 152 insertions(+), 166 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 7eb9d9cca52a..9798643feb3b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -49,6 +49,123 @@
 #include <linux/freezer.h>
 
 
+STATIC xfs_inode_t *
+xfs_inode_ag_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	uint32_t		*first_index,
+	int			tag)
+{
+	int			nr_found;
+	struct xfs_inode	*ip;
+
+	/*
+	 * use a gang lookup to find the next inode in the tree
+	 * as the tree is sparse and a gang lookup walks to find
+	 * the number of objects requested.
+	 */
+	read_lock(&pag->pag_ici_lock);
+	if (tag == XFS_ICI_NO_TAG) {
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+				(void **)&ip, *first_index, 1);
+	} else {
+		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+				(void **)&ip, *first_index, 1, tag);
+	}
+	if (!nr_found)
+		goto unlock;
+
+	/*
+	 * Update the index for the next lookup. Catch overflows
+	 * into the next AG range which can occur if we have inodes
+	 * in the last block of the AG and we are currently
+	 * pointing to the last inode.
+	 */
+	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+		goto unlock;
+
+	return ip;
+
+unlock:
+	read_unlock(&pag->pag_ici_lock);
+	return NULL;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		ag,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags,
+	int			tag)
+{
+	struct xfs_perag	*pag = &mp->m_perag[ag];
+	uint32_t		first_index;
+	int			last_error = 0;
+	int			skipped;
+
+restart:
+	skipped = 0;
+	first_index = 0;
+	do {
+		int		error = 0;
+		xfs_inode_t	*ip;
+
+		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+		if (!ip)
+			break;
+
+		error = execute(ip, pag, flags);
+		if (error == EAGAIN) {
+			skipped++;
+			continue;
+		}
+		if (error)
+			last_error = error;
+		/*
+		 * bail out if the filesystem is corrupted.
+		 */
+		if (error == EFSCORRUPTED)
+			break;
+
+	} while (1);
+
+	if (skipped) {
+		delay(1);
+		goto restart;
+	}
+
+	xfs_put_perag(mp, pag);
+	return last_error;
+}
+
+STATIC int
+xfs_inode_ag_iterator(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags,
+	int			tag)
+{
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+		if (!mp->m_perag[ag].pag_ici_init)
+			continue;
+		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+		if (error) {
+			last_error = error;
+			if (error == EFSCORRUPTED)
+				break;
+		}
+	}
+	return XFS_ERROR(last_error);
+}
+
 /* must be called with pag_ici_lock held and releases it */
 STATIC int
 xfs_sync_inode_valid(
@@ -85,12 +202,17 @@ xfs_sync_inode_valid(
 STATIC int
 xfs_sync_inode_data(
 	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
 	int			flags)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct address_space *mapping = inode->i_mapping;
 	int			error = 0;
 
+	error = xfs_sync_inode_valid(ip, pag);
+	if (error)
+		return error;
+
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		goto out_wait;
 
@@ -107,16 +229,22 @@ xfs_sync_inode_data(
  out_wait:
 	if (flags & SYNC_IOWAIT)
 		xfs_ioend_wait(ip);
+	IRELE(ip);
 	return error;
 }
 
 STATIC int
 xfs_sync_inode_attr(
 	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
 	int			flags)
 {
 	int			error = 0;
 
+	error = xfs_sync_inode_valid(ip, pag);
+	if (error)
+		return error;
+
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_inode_clean(ip))
 		goto out_unlock;
@@ -136,117 +264,33 @@ xfs_sync_inode_attr(
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	IRELE(ip);
 	return error;
 }
 
-/*
- * Sync all the inodes in the given AG according to the
- * direction given by the flags.
- */
-STATIC int
-xfs_sync_inodes_ag(
-	xfs_mount_t	*mp,
-	int		ag,
-	int		flags)
-{
-	xfs_perag_t	*pag = &mp->m_perag[ag];
-	int		nr_found;
-	uint32_t	first_index = 0;
-	int		error = 0;
-	int		last_error = 0;
-
-	do {
-		xfs_inode_t	*ip = NULL;
-
-		/*
-		 * use a gang lookup to find the next inode in the tree
-		 * as the tree is sparse and a gang lookup walks to find
-		 * the number of objects requested.
-		 */
-		read_lock(&pag->pag_ici_lock);
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-				(void**)&ip, first_index, 1);
-
-		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
-			break;
-		}
-
-		/*
-		 * Update the index for the next lookup. Catch overflows
-		 * into the next AG range which can occur if we have inodes
-		 * in the last block of the AG and we are currently
-		 * pointing to the last inode.
-		 */
-		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
-			read_unlock(&pag->pag_ici_lock);
-			break;
-		}
-
-		error = xfs_sync_inode_valid(ip, pag);
-		if (error) {
-			if (error == EFSCORRUPTED)
-				return 0;
-			continue;
-		}
-
-		/*
-		 * If we have to flush data or wait for I/O completion
-		 * we need to hold the iolock.
-		 */
-		if (flags & SYNC_DELWRI)
-			error = xfs_sync_inode_data(ip, flags);
-
-		if (flags & SYNC_ATTR)
-			error = xfs_sync_inode_attr(ip, flags);
-
-		IRELE(ip);
-
-		if (error)
-			last_error = error;
-		/*
-		 * bail out if the filesystem is corrupted.
-		 */
-		if (error == EFSCORRUPTED)
-			return XFS_ERROR(error);
-
-	} while (nr_found);
-
-	return last_error;
-}
-
 int
 xfs_sync_inodes(
 	xfs_mount_t	*mp,
 	int		flags)
 {
-	int		error;
-	int		last_error;
-	int		i;
+	int		error = 0;
 	int		lflags = XFS_LOG_FORCE;
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return 0;
-	error = 0;
-	last_error = 0;
 
 	if (flags & SYNC_WAIT)
 		lflags |= XFS_LOG_SYNC;
 
-	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
-		if (!mp->m_perag[i].pag_ici_init)
-			continue;
-		error = xfs_sync_inodes_ag(mp, i, flags);
-		if (error)
-			last_error = error;
-		if (error == EFSCORRUPTED)
-			break;
-	}
 	if (flags & SYNC_DELWRI)
-		xfs_log_force(mp, 0, lflags);
+		error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, XFS_ICI_NO_TAG);
 
-	return XFS_ERROR(last_error);
+	if (flags & SYNC_ATTR)
+		error = xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, XFS_ICI_NO_TAG);
+
+	if (!error && (flags & SYNC_DELWRI))
+		xfs_log_force(mp, 0, lflags);
+	return XFS_ERROR(error);
 }
 
 STATIC int
@@ -613,7 +657,7 @@ xfs_reclaim_inode(
 			xfs_ifunlock(ip);
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
-		return 1;
+		return -EAGAIN;
 	}
 	__xfs_iflags_set(ip, XFS_IRECLAIM);
 	spin_unlock(&ip->i_flags_lock);
@@ -698,72 +742,20 @@ xfs_inode_clear_reclaim_tag(
 	xfs_put_perag(mp, pag);
 }
 
-
-STATIC void
-xfs_reclaim_inodes_ag(
-	xfs_mount_t	*mp,
-	int		ag,
-	int		mode)
+STATIC int
+xfs_reclaim_inode_now(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			flags)
 {
-	xfs_inode_t	*ip = NULL;
-	xfs_perag_t	*pag = &mp->m_perag[ag];
-	int		nr_found;
-	uint32_t	first_index;
-	int		skipped;
-
-restart:
-	first_index = 0;
-	skipped = 0;
-	do {
-		/*
-		 * use a gang lookup to find the next inode in the tree
-		 * as the tree is sparse and a gang lookup walks to find
-		 * the number of objects requested.
-		 */
-		read_lock(&pag->pag_ici_lock);
-		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
-					(void**)&ip, first_index, 1,
-					XFS_ICI_RECLAIM_TAG);
-
-		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
-			break;
-		}
-
-		/*
-		 * Update the index for the next lookup. Catch overflows
-		 * into the next AG range which can occur if we have inodes
-		 * in the last block of the AG and we are currently
-		 * pointing to the last inode.
-		 */
-		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
-			read_unlock(&pag->pag_ici_lock);
-			break;
-		}
-
-		/* ignore if already under reclaim */
-		if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-			read_unlock(&pag->pag_ici_lock);
-			continue;
-		}
-
+	/* ignore if already under reclaim */
+	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
 		read_unlock(&pag->pag_ici_lock);
-
-		/*
-		 * hmmm - this is an inode already in reclaim. Do
-		 * we even bother catching it here?
-		 */
-		if (xfs_reclaim_inode(ip, 0, mode))
-			skipped++;
-	} while (nr_found);
-
-	if (skipped) {
-		delay(1);
-		goto restart;
+		return 0;
 	}
-	return;
+	read_unlock(&pag->pag_ici_lock);
 
+	return xfs_reclaim_inode(ip, 0, flags);
 }
 
 int
@@ -771,14 +763,6 @@ xfs_reclaim_inodes(
 	xfs_mount_t	*mp,
 	int		mode)
 {
-	int		i;
-
-	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
-		if (!mp->m_perag[i].pag_ici_init)
-			continue;
-		xfs_reclaim_inodes_ag(mp, i, mode);
-	}
-	return 0;
+	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
+					XFS_ICI_RECLAIM_TAG);
 }
-
-
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index c8641f713caa..f24b50b68d03 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -212,6 +212,8 @@ typedef struct xfs_perag
 /*
  * tags for inode radix tree
  */
+#define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
+					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
 
 #define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)

From fe588ed32867b42e0d906db558ca92fd9f8b128e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:35:27 +0200
Subject: [PATCH 141/162] xfs: use generic inode iterator in
 xfs_qm_dqrele_all_inodes

Use xfs_inode_ag_iterator instead of opencoding the inode walk in the
quota code.  Mark xfs_inode_ag_iterator and xfs_sync_inode_valid non-static
to allow using them from the quota code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c    |   4 +-
 fs/xfs/linux-2.6/xfs_sync.h    |   6 ++
 fs/xfs/quota/xfs_qm_syscalls.c | 112 +++++++++------------------------
 3 files changed, 39 insertions(+), 83 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 9798643feb3b..a3d2e7713068 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -141,7 +141,7 @@ restart:
 	return last_error;
 }
 
-STATIC int
+int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
 	int			(*execute)(struct xfs_inode *ip,
@@ -167,7 +167,7 @@ xfs_inode_ag_iterator(
 }
 
 /* must be called with pag_ici_lock held and releases it */
-STATIC int
+int
 xfs_sync_inode_valid(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag)
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d7b2b5f1c387..9bb1253a4023 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,4 +54,10 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);
+
+int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+	int flags, int tag);
+
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c7b66f6506ce..7126f855e14b 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -847,105 +847,55 @@ xfs_qm_export_flags(
 }
 
 
-/*
- * Release all the dquots on the inodes in an AG.
- */
-STATIC void
-xfs_qm_dqrele_inodes_ag(
-	xfs_mount_t	*mp,
-	int		ag,
-	uint		flags)
+STATIC int
+xfs_dqrele_inode(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			flags)
 {
-	xfs_inode_t	*ip = NULL;
-	xfs_perag_t	*pag = &mp->m_perag[ag];
-	int		first_index = 0;
-	int		nr_found;
+	int			error;
 
-	do {
-		/*
-		 * use a gang lookup to find the next inode in the tree
-		 * as the tree is sparse and a gang lookup walks to find
-		 * the number of objects requested.
-		 */
-		read_lock(&pag->pag_ici_lock);
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-				(void**)&ip, first_index, 1);
-
-		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
-			break;
-		}
-
-		/*
-		 * Update the index for the next lookup. Catch overflows
-		 * into the next AG range which can occur if we have inodes
-		 * in the last block of the AG and we are currently
-		 * pointing to the last inode.
-		 */
-		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
-			read_unlock(&pag->pag_ici_lock);
-			break;
-		}
-
-		/* skip quota inodes */
-		if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
-			ASSERT(ip->i_udquot == NULL);
-			ASSERT(ip->i_gdquot == NULL);
-			read_unlock(&pag->pag_ici_lock);
-			continue;
-		}
-
-		/*
-		 * If we can't get a reference on the inode, it must be
-		 * in reclaim. Leave it for the reclaim code to flush.
-		 */
-		if (!igrab(VFS_I(ip))) {
-			read_unlock(&pag->pag_ici_lock);
-			continue;
-		}
+	/* skip quota inodes */
+	if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) {
+		ASSERT(ip->i_udquot == NULL);
+		ASSERT(ip->i_gdquot == NULL);
 		read_unlock(&pag->pag_ici_lock);
+		return 0;
+	}
 
-		/* avoid new inodes though we shouldn't find any here */
-		if (xfs_iflags_test(ip, XFS_INEW)) {
-			IRELE(ip);
-			continue;
-		}
+	error = xfs_sync_inode_valid(ip, pag);
+	if (error)
+		return error;
 
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
-			xfs_qm_dqrele(ip->i_udquot);
-			ip->i_udquot = NULL;
-		}
-		if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
-		    ip->i_gdquot) {
-			xfs_qm_dqrele(ip->i_gdquot);
-			ip->i_gdquot = NULL;
-		}
-		xfs_iput(ip, XFS_ILOCK_EXCL);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+		xfs_qm_dqrele(ip->i_udquot);
+		ip->i_udquot = NULL;
+	}
+	if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+		xfs_qm_dqrele(ip->i_gdquot);
+		ip->i_gdquot = NULL;
+	}
+	xfs_iput(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
 
-	} while (nr_found);
+	return 0;
 }
 
+
 /*
  * Go thru all the inodes in the file system, releasing their dquots.
+ *
  * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
- * xfs_rootumount.
+ * AFTER this, in the case of quotaoff.
  */
 void
 xfs_qm_dqrele_all_inodes(
 	struct xfs_mount *mp,
 	uint		 flags)
 {
-	int		i;
-
 	ASSERT(mp->m_quotainfo);
-	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
-		if (!mp->m_perag[i].pag_ici_init)
-			continue;
-		xfs_qm_dqrele_inodes_ag(mp, i, flags);
-	}
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
 }
 
 /*------------------------------------------------------------------------*/

From 075fe1028699f6a280545dfc2cfc5ac82d555c8c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:35:48 +0200
Subject: [PATCH 142/162] xfs: split xfs_sync_inodes

xfs_sync_inodes is used to write back either file data or inode metadata.
In general we always do these separately, except for one fishy case in
xfs_fs_put_super that does both.  So separate xfs_sync_inodes into
separate xfs_sync_data and xfs_sync_attr functions.  In xfs_fs_put_super
we first call the data sync and then the attr sync as that was the previous
order.  The moved log force in that path doesn't make a difference because
we will force the log again as part of the real unmount process.

The filesystem readonly checks are not performed by the new function but
instead moved into the callers, given that most callers alredy have it
further up in the stack.  Also add debug checks that we do not pass in
incorrect flags in the new xfs_sync_data and xfs_sync_attr function and
fix the one place that did pass in a wrong flag.

Also remove a comment mentioning xfs_sync_inodes that has been incorrect
for a while because we always take either the iolock or ilock in the
sync path these days.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_quotaops.c |  4 ++-
 fs/xfs/linux-2.6/xfs_super.c    | 13 +++++++-
 fs/xfs/linux-2.6/xfs_sync.c     | 55 ++++++++++++++++++++-------------
 fs/xfs/linux-2.6/xfs_sync.h     |  5 ++-
 fs/xfs/xfs_filestream.c         |  6 ++--
 5 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 94d9a633d3d9..cb6e2cca214f 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -50,9 +50,11 @@ xfs_fs_quota_sync(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
 	if (!XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
-	return -xfs_sync_inodes(mp, SYNC_DELWRI);
+	return -xfs_sync_data(mp, 0);
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 0d9b64b219e0..d4e7ef8f8df9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1071,7 +1071,18 @@ xfs_fs_put_super(
 	int			unmount_event_flags = 0;
 
 	xfs_syncd_stop(mp);
-	xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/*
+		 * XXX(hch): this should be SYNC_WAIT.
+		 *
+		 * Or more likely not needed at all because the VFS is already
+		 * calling ->sync_fs after shutting down all filestem
+		 * operations and just before calling ->put_super.
+		 */
+		xfs_sync_data(mp, 0);
+		xfs_sync_attr(mp, 0);
+	}
 
 #ifdef HAVE_DMAPI
 	if (mp->m_flags & XFS_MOUNT_DMAPI) {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a3d2e7713068..c1a9a1135073 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -268,29 +268,42 @@ xfs_sync_inode_attr(
 	return error;
 }
 
+/*
+ * Write out pagecache data for the whole filesystem.
+ */
 int
-xfs_sync_inodes(
-	xfs_mount_t	*mp,
-	int		flags)
+xfs_sync_data(
+	struct xfs_mount	*mp,
+	int			flags)
 {
-	int		error = 0;
-	int		lflags = XFS_LOG_FORCE;
+	int			error;
 
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return 0;
+	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT|SYNC_IOWAIT)) == 0);
 
-	if (flags & SYNC_WAIT)
-		lflags |= XFS_LOG_SYNC;
+	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
+				      XFS_ICI_NO_TAG);
+	if (error)
+		return XFS_ERROR(error);
 
-	if (flags & SYNC_DELWRI)
-		error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, XFS_ICI_NO_TAG);
+	xfs_log_force(mp, 0,
+		      (flags & SYNC_WAIT) ?
+		       XFS_LOG_FORCE | XFS_LOG_SYNC :
+		       XFS_LOG_FORCE);
+	return 0;
+}
 
-	if (flags & SYNC_ATTR)
-		error = xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, XFS_ICI_NO_TAG);
+/*
+ * Write out inode metadata (attributes) for the whole filesystem.
+ */
+int
+xfs_sync_attr(
+	struct xfs_mount	*mp,
+	int			flags)
+{
+	ASSERT((flags & ~SYNC_WAIT) == 0);
 
-	if (!error && (flags & SYNC_DELWRI))
-		xfs_log_force(mp, 0, lflags);
-	return XFS_ERROR(error);
+	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
+				     XFS_ICI_NO_TAG);
 }
 
 STATIC int
@@ -404,12 +417,12 @@ xfs_quiesce_data(
 	int error;
 
 	/* push non-blocking */
-	xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+	xfs_sync_data(mp, 0);
 	xfs_qm_sync(mp, SYNC_BDFLUSH);
 	xfs_filestream_flush(mp);
 
 	/* push and block */
-	xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+	xfs_sync_data(mp, SYNC_WAIT|SYNC_IOWAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
 	/* write superblock and hoover up shutdown errors */
@@ -438,7 +451,7 @@ xfs_quiesce_fs(
 	 * logged before we can write the unmount record.
 	 */
 	do {
-		xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
+		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
 			delay(50);
@@ -521,8 +534,8 @@ xfs_flush_inodes_work(
 	void		*arg)
 {
 	struct inode	*inode = arg;
-	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
-	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
+	xfs_sync_data(mp, SYNC_TRYLOCK);
+	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_IOWAIT);
 	iput(inode);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 9bb1253a4023..26bfb5c42e76 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -29,8 +29,6 @@ typedef struct xfs_sync_work {
 	struct completion	*w_completion;
 } xfs_sync_work_t;
 
-#define SYNC_ATTR		0x0001	/* sync attributes */
-#define SYNC_DELWRI		0x0002	/* look at delayed writes */
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT		0x0010  /* wait for all I/O to complete */
@@ -39,7 +37,8 @@ typedef struct xfs_sync_work {
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
 
-int xfs_sync_inodes(struct xfs_mount *mp, int flags);
+int xfs_sync_attr(struct xfs_mount *mp, int flags);
+int xfs_sync_data(struct xfs_mount *mp, int flags);
 int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6c87c8f304ef..edf8bdf4141f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -542,10 +542,8 @@ xfs_filestream_associate(
 	 * waiting for the lock because someone else is waiting on the lock we
 	 * hold and we cannot drop that as we are in a transaction here.
 	 *
-	 * Lucky for us, this inversion is rarely a problem because it's a
-	 * directory inode that we are trying to lock here and that means the
-	 * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is
-	 * used. i.e. freeze, remount-ro, quotasync or unmount.
+	 * Lucky for us, this inversion is not a problem because it's a
+	 * directory inode that we are trying to lock here.
 	 *
 	 * So, if we can't get the iolock without sleeping then just give up
 	 */

From b0710ccc6d9fa8fb908b5f6d1b0782a09d80e24f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:37:11 +0200
Subject: [PATCH 143/162] xfs: remove SYNC_IOWAIT

We want to wait for all I/O to finish when we do data integrity syncs.  So
there is no reason to keep SYNC_WAIT separate from SYNC_IOWAIT.  This
causes a little change in behaviour for the ENOSPC flushing code which now
does a second submission and wait of buffered I/O, but that should finish
ASAP as we already did an asynchronous writeout earlier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c | 8 ++++----
 fs/xfs/linux-2.6/xfs_sync.h | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index c1a9a1135073..32abd96b1095 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -227,7 +227,7 @@ xfs_sync_inode_data(
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
  out_wait:
-	if (flags & SYNC_IOWAIT)
+	if (flags & SYNC_WAIT)
 		xfs_ioend_wait(ip);
 	IRELE(ip);
 	return error;
@@ -278,7 +278,7 @@ xfs_sync_data(
 {
 	int			error;
 
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT|SYNC_IOWAIT)) == 0);
+	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
 				      XFS_ICI_NO_TAG);
@@ -422,7 +422,7 @@ xfs_quiesce_data(
 	xfs_filestream_flush(mp);
 
 	/* push and block */
-	xfs_sync_data(mp, SYNC_WAIT|SYNC_IOWAIT);
+	xfs_sync_data(mp, SYNC_WAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
 	/* write superblock and hoover up shutdown errors */
@@ -535,7 +535,7 @@ xfs_flush_inodes_work(
 {
 	struct inode	*inode = arg;
 	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_IOWAIT);
+	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 	iput(inode);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 26bfb5c42e76..bda33a03e12b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -31,7 +31,6 @@ typedef struct xfs_sync_work {
 
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
-#define SYNC_IOWAIT		0x0010  /* wait for all I/O to complete */
 #define SYNC_TRYLOCK		0x0020  /* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);

From 8b5403a6d772d340541cfb30a668fde119c40ac1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 15:37:16 +0200
Subject: [PATCH 144/162] xfs: remove SYNC_BDFLUSH

SYNC_BDFLUSH is a leftover from IRIX and rather misnamed for todays
code.  Make xfs_sync_fsdata and xfs_dq_sync use the SYNC_TRYLOCK flag
for not blocking on logs just as the inode sync code already does.

For xfs_sync_fsdata it's a trivial 1:1 replacement, but for xfs_qm_sync
I use the opportunity to decouple the non-blocking lock case from the
different flushing modes, similar to the inode sync code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/linux-2.6/xfs_sync.c |  8 ++++----
 fs/xfs/linux-2.6/xfs_sync.h |  5 ++---
 fs/xfs/quota/xfs_qm.c       | 28 +++++-----------------------
 3 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 32abd96b1095..b619d6b8ca43 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -353,7 +353,7 @@ xfs_sync_fsdata(
 	 * If this is xfssyncd() then only sync the superblock if we can
 	 * lock it without sleeping and it is not pinned.
 	 */
-	if (flags & SYNC_BDFLUSH) {
+	if (flags & SYNC_TRYLOCK) {
 		ASSERT(!(flags & SYNC_WAIT));
 
 		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
@@ -418,7 +418,7 @@ xfs_quiesce_data(
 
 	/* push non-blocking */
 	xfs_sync_data(mp, 0);
-	xfs_qm_sync(mp, SYNC_BDFLUSH);
+	xfs_qm_sync(mp, SYNC_TRYLOCK);
 	xfs_filestream_flush(mp);
 
 	/* push and block */
@@ -568,8 +568,8 @@ xfs_sync_worker(
 		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
 		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 		/* dgc: errors ignored here */
-		error = xfs_qm_sync(mp, SYNC_BDFLUSH);
-		error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
+		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
 		if (xfs_log_need_covered(mp))
 			error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index bda33a03e12b..2a10301c99c7 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -29,9 +29,8 @@ typedef struct xfs_sync_work {
 	struct completion	*w_completion;
 } xfs_sync_work_t;
 
-#define SYNC_WAIT		0x0004	/* wait for i/o to complete */
-#define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
-#define SYNC_TRYLOCK		0x0020  /* only try to lock inodes */
+#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index aa5d8212661c..22b7c8d364e8 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -905,11 +905,6 @@ xfs_qm_dqdetach(
 	}
 }
 
-/*
- * This is called to sync quotas. We can be told to use non-blocking
- * semantics by either the SYNC_BDFLUSH flag or the absence of the
- * SYNC_WAIT flag.
- */
 int
 xfs_qm_sync(
 	xfs_mount_t	*mp,
@@ -918,17 +913,13 @@ xfs_qm_sync(
 	int		recl, restarts;
 	xfs_dquot_t	*dqp;
 	uint		flush_flags;
-	boolean_t	nowait;
 	int		error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
+	flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
 	restarts = 0;
-	/*
-	 * We won't block unless we are asked to.
-	 */
-	nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
 
   again:
 	xfs_qm_mplist_lock(mp);
@@ -948,18 +939,10 @@ xfs_qm_sync(
 		 * don't 'seem' to be dirty. ie. don't acquire dqlock.
 		 * This is very similar to what xfs_sync does with inodes.
 		 */
-		if (flags & SYNC_BDFLUSH) {
-			if (! XFS_DQ_IS_DIRTY(dqp))
+		if (flags & SYNC_TRYLOCK) {
+			if (!XFS_DQ_IS_DIRTY(dqp))
 				continue;
-		}
-
-		if (nowait) {
-			/*
-			 * Try to acquire the dquot lock. We are NOT out of
-			 * lock order, but we just don't want to wait for this
-			 * lock, unless somebody wanted us to.
-			 */
-			if (! xfs_qm_dqlock_nowait(dqp))
+			if (!xfs_qm_dqlock_nowait(dqp))
 				continue;
 		} else {
 			xfs_dqlock(dqp);
@@ -976,7 +959,7 @@ xfs_qm_sync(
 		/* XXX a sentinel would be better */
 		recl = XFS_QI_MPLRECLAIMS(mp);
 		if (!xfs_dqflock_nowait(dqp)) {
-			if (nowait) {
+			if (flags & SYNC_TRYLOCK) {
 				xfs_dqunlock(dqp);
 				continue;
 			}
@@ -994,7 +977,6 @@ xfs_qm_sync(
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write
 		 */
-		flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
 		xfs_qm_mplist_unlock(mp);
 		xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
 		error = xfs_qm_dqflush(dqp, flush_flags);

From ef14f0c1578dce4b688726eb2603e50b62d6665a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Jun 2009 17:07:47 +0200
Subject: [PATCH 145/162] xfs: use generic Posix ACL code

This patch rips out the XFS ACL handling code and uses the generic
fs/posix_acl.c code instead.  The ondisk format is of course left
unchanged.

This also introduces the same ACL caching all other Linux filesystems do
by adding pointers to the acl and default acl in struct xfs_inode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/Kconfig                 |   1 +
 fs/xfs/Makefile                |   2 +-
 fs/xfs/linux-2.6/xfs_acl.c     | 523 ++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_ioctl.c   |   1 -
 fs/xfs/linux-2.6/xfs_iops.c    |  53 +-
 fs/xfs/linux-2.6/xfs_lrw.c     |   1 -
 fs/xfs/linux-2.6/xfs_super.c   |  14 -
 fs/xfs/linux-2.6/xfs_xattr.c   |  67 +--
 fs/xfs/quota/xfs_dquot.c       |   1 -
 fs/xfs/quota/xfs_dquot_item.c  |   1 -
 fs/xfs/quota/xfs_qm.c          |   1 -
 fs/xfs/quota/xfs_qm_bhv.c      |   1 -
 fs/xfs/quota/xfs_qm_stats.c    |   1 -
 fs/xfs/quota/xfs_qm_syscalls.c |   1 -
 fs/xfs/quota/xfs_trans_dquot.c |   1 -
 fs/xfs/xfs_acl.c               | 874 ---------------------------------
 fs/xfs/xfs_acl.h               |  97 ++--
 fs/xfs/xfs_arch.h              |  32 --
 fs/xfs/xfs_attr.c              |   1 -
 fs/xfs/xfs_iget.c              |   3 +
 fs/xfs/xfs_inode.c             |   1 -
 fs/xfs/xfs_inode.h             |   6 +
 fs/xfs/xfs_iomap.c             |   1 -
 fs/xfs/xfs_rw.c                |   1 -
 fs/xfs/xfs_vnodeops.c          |  15 +-
 fs/xfs/xfs_vnodeops.h          |   1 +
 26 files changed, 598 insertions(+), 1103 deletions(-)
 create mode 100644 fs/xfs/linux-2.6/xfs_acl.c
 delete mode 100644 fs/xfs/xfs_acl.c

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 29228f5899cd..480f28127f09 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -39,6 +39,7 @@ config XFS_QUOTA
 config XFS_POSIX_ACL
 	bool "XFS POSIX ACL support"
 	depends on XFS_FS
+	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
 	  groups beyond the owner/group/world scheme.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 222c59e541ce..7a59daed1782 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,7 @@ xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
 endif
 
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
+xfs-$(CONFIG_XFS_POSIX_ACL)	+= $(XFS_LINUX)/xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= $(XFS_LINUX)/xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= $(XFS_LINUX)/xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= $(XFS_LINUX)/xfs_ioctl32.o
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
new file mode 100644
index 000000000000..1e9d1246eebc
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_vnodeops.h"
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+
+#define XFS_ACL_NOT_CACHED	((void *)-1)
+
+/*
+ * Locking scheme:
+ *  - all ACL updates are protected by inode->i_mutex, which is taken before
+ *    calling into this file.
+ *  - access and updates to the ip->i_acl and ip->i_default_acl pointers are
+ *    protected by inode->i_lock.
+ */
+
+STATIC struct posix_acl *
+xfs_acl_from_disk(struct xfs_acl *aclp)
+{
+	struct posix_acl_entry *acl_e;
+	struct posix_acl *acl;
+	struct xfs_acl_entry *ace;
+	int count, i;
+
+	count = be32_to_cpu(aclp->acl_cnt);
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < count; i++) {
+		acl_e = &acl->a_entries[i];
+		ace = &aclp->acl_entry[i];
+
+		/*
+		 * The tag is 32 bits on disk and 16 bits in core.
+		 *
+		 * Because every access to it goes through the core
+		 * format first this is not a problem.
+		 */
+		acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+		acl_e->e_perm = be16_to_cpu(ace->ae_perm);
+
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			acl_e->e_id = be32_to_cpu(ace->ae_id);
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			acl_e->e_id = ACL_UNDEFINED_ID;
+			break;
+		default:
+			goto fail;
+		}
+	}
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
+{
+	const struct posix_acl_entry *acl_e;
+	struct xfs_acl_entry *ace;
+	int i;
+
+	aclp->acl_cnt = cpu_to_be32(acl->a_count);
+	for (i = 0; i < acl->a_count; i++) {
+		ace = &aclp->acl_entry[i];
+		acl_e = &acl->a_entries[i];
+
+		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+		ace->ae_id = cpu_to_be32(acl_e->e_id);
+		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
+	}
+}
+
+/*
+ * Update the cached ACL pointer in the inode.
+ *
+ * Because we don't hold any locks while reading/writing the attribute
+ * from/to disk another thread could have raced and updated the cached
+ * ACL value before us. In that case we release the previous cached value
+ * and update it with our new value.
+ */
+STATIC void
+xfs_update_cached_acl(struct inode *inode, struct posix_acl **p_acl,
+		struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*p_acl && *p_acl != XFS_ACL_NOT_CACHED)
+		posix_acl_release(*p_acl);
+	*p_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	struct posix_acl *acl = NULL, **p_acl;
+	struct xfs_acl *xfs_acl;
+	int len = sizeof(struct xfs_acl);
+	char *ea_name;
+	int error;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = SGI_ACL_FILE;
+		p_acl = &ip->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		ea_name = SGI_ACL_DEFAULT;
+		p_acl = &ip->i_default_acl;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	spin_lock(&inode->i_lock);
+	if (*p_acl != XFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*p_acl);
+	spin_unlock(&inode->i_lock);
+
+	/*
+	 * If we have a cached ACLs value just return it, not need to
+	 * go out to the disk.
+	 */
+	if (acl)
+		return acl;
+
+	xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+	if (!xfs_acl)
+		return ERR_PTR(-ENOMEM);
+
+	error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
+	if (error) {
+		/*
+		 * If the attribute doesn't exist make sure we have a negative
+		 * cache entry, for any other error assume it is transient and
+		 * leave the cache entry as XFS_ACL_NOT_CACHED.
+		 */
+		if (error == -ENOATTR) {
+			acl = NULL;
+			goto out_update_cache;
+		}
+		goto out;
+	}
+
+	acl = xfs_acl_from_disk(xfs_acl);
+	if (IS_ERR(acl))
+		goto out;
+
+ out_update_cache:
+	xfs_update_cached_acl(inode, p_acl, acl);
+ out:
+	kfree(xfs_acl);
+	return acl;
+}
+
+STATIC int
+xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	struct posix_acl **p_acl;
+	char *ea_name;
+	int error;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = SGI_ACL_FILE;
+		p_acl = &ip->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		ea_name = SGI_ACL_DEFAULT;
+		p_acl = &ip->i_default_acl;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		struct xfs_acl *xfs_acl;
+		int len;
+
+		xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+		if (!xfs_acl)
+			return -ENOMEM;
+
+		xfs_acl_to_disk(xfs_acl, acl);
+		len = sizeof(struct xfs_acl) -
+			(sizeof(struct xfs_acl_entry) *
+			 (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+		error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl,
+				len, ATTR_ROOT);
+
+		kfree(xfs_acl);
+	} else {
+		/*
+		 * A NULL ACL argument means we want to remove the ACL.
+		 */
+		error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+
+		/*
+		 * If the attribute didn't exist to start with that's fine.
+		 */
+		if (error == -ENOATTR)
+			error = 0;
+	}
+
+	if (!error)
+		xfs_update_cached_acl(inode, p_acl, acl);
+	return error;
+}
+
+int
+xfs_check_acl(struct inode *inode, int mask)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	struct posix_acl *acl;
+	int error = -EAGAIN;
+
+	xfs_itrace_entry(ip);
+
+	/*
+	 * If there is no attribute fork no ACL exists on this inode and
+	 * we can skip the whole exercise.
+	 */
+	if (!XFS_IFORK_Q(ip))
+		return -EAGAIN;
+
+	acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+	}
+
+	return error;
+}
+
+static int
+xfs_set_mode(struct inode *inode, mode_t mode)
+{
+	int error = 0;
+
+	if (mode != inode->i_mode) {
+		struct iattr iattr;
+
+		iattr.ia_valid = ATTR_MODE;
+		iattr.ia_mode = mode;
+
+		error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+	}
+
+	return error;
+}
+
+static int
+xfs_acl_exists(struct inode *inode, char *name)
+{
+	int len = sizeof(struct xfs_acl);
+
+	return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
+			    ATTR_ROOT|ATTR_KERNOVAL) == 0);
+}
+
+int
+posix_acl_access_exists(struct inode *inode)
+{
+	return xfs_acl_exists(inode, SGI_ACL_FILE);
+}
+
+int
+posix_acl_default_exists(struct inode *inode)
+{
+	if (!S_ISDIR(inode->i_mode))
+		return 0;
+	return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
+}
+
+/*
+ * No need for i_mutex because the inode is not yet exposed to the VFS.
+ */
+int
+xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl)
+{
+	struct posix_acl *clone;
+	mode_t mode;
+	int error = 0, inherit = 0;
+
+	if (S_ISDIR(inode->i_mode)) {
+		error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+		if (error)
+			return error;
+	}
+
+	clone = posix_acl_clone(default_acl, GFP_KERNEL);
+	if (!clone)
+		return -ENOMEM;
+
+	mode = inode->i_mode;
+	error = posix_acl_create_masq(clone, &mode);
+	if (error < 0)
+		goto out_release_clone;
+
+	/*
+	 * If posix_acl_create_masq returns a positive value we need to
+	 * inherit a permission that can't be represented using the Unix
+	 * mode bits and we actually need to set an ACL.
+	 */
+	if (error > 0)
+		inherit = 1;
+
+	error = xfs_set_mode(inode, mode);
+	if (error)
+		goto out_release_clone;
+
+	if (inherit)
+		error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+
+ out_release_clone:
+	posix_acl_release(clone);
+	return error;
+}
+
+int
+xfs_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int error;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+
+	error = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!error)
+		error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+
+	posix_acl_release(clone);
+	return error;
+}
+
+void
+xfs_inode_init_acls(struct xfs_inode *ip)
+{
+	/*
+	 * No need for locking, inode is not live yet.
+	 */
+	ip->i_acl = XFS_ACL_NOT_CACHED;
+	ip->i_default_acl = XFS_ACL_NOT_CACHED;
+}
+
+void
+xfs_inode_clear_acls(struct xfs_inode *ip)
+{
+	/*
+	 * No need for locking here, the inode is not live anymore
+	 * and just about to be freed.
+	 */
+	if (ip->i_acl != XFS_ACL_NOT_CACHED)
+		posix_acl_release(ip->i_acl);
+	if (ip->i_default_acl != XFS_ACL_NOT_CACHED)
+		posix_acl_release(ip->i_default_acl);
+}
+
+
+/*
+ * System xattr handlers.
+ *
+ * Currently Posix ACLs are the only system namespace extended attribute
+ * handlers supported by XFS, so we just implement the handlers here.
+ * If we ever support other system extended attributes this will need
+ * some refactoring.
+ */
+
+static int
+xfs_decode_acl(const char *name)
+{
+	if (strcmp(name, "posix_acl_access") == 0)
+		return ACL_TYPE_ACCESS;
+	else if (strcmp(name, "posix_acl_default") == 0)
+		return ACL_TYPE_DEFAULT;
+	return -EINVAL;
+}
+
+static int
+xfs_xattr_system_get(struct inode *inode, const char *name,
+		void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int type, error;
+
+	type = xfs_decode_acl(name);
+	if (type < 0)
+		return type;
+
+	acl = xfs_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+
+	error = posix_acl_to_xattr(acl, value, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int
+xfs_xattr_system_set(struct inode *inode, const char *name,
+		const void *value, size_t size, int flags)
+{
+	struct posix_acl *acl = NULL;
+	int error = 0, type;
+
+	type = xfs_decode_acl(name);
+	if (type < 0)
+		return type;
+	if (flags & XATTR_CREATE)
+		return -EINVAL;
+	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+		return value ? -EACCES : 0;
+	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (!value)
+		goto set_acl;
+
+	acl = posix_acl_from_xattr(value, size);
+	if (!acl) {
+		/*
+		 * acl_set_file(3) may request that we set default ACLs with
+		 * zero length -- defend (gracefully) against that here.
+		 */
+		goto out;
+	}
+	if (IS_ERR(acl)) {
+		error = PTR_ERR(acl);
+		goto out;
+	}
+
+	error = posix_acl_valid(acl);
+	if (error)
+		goto out_release;
+
+	error = -EINVAL;
+	if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+		goto out_release;
+
+	if (type == ACL_TYPE_ACCESS) {
+		mode_t mode = inode->i_mode;
+		error = posix_acl_equiv_mode(acl, &mode);
+
+		if (error <= 0) {
+			posix_acl_release(acl);
+			acl = NULL;
+
+			if (error < 0)
+				return error;
+		}
+
+		error = xfs_set_mode(inode, mode);
+		if (error)
+			goto out_release;
+	}
+
+ set_acl:
+	error = xfs_set_acl(inode, type, acl);
+ out_release:
+	posix_acl_release(acl);
+ out:
+	return error;
+}
+
+struct xattr_handler xfs_xattr_system_handler = {
+	.prefix	= XATTR_SYSTEM_PREFIX,
+	.get	= xfs_xattr_system_get,
+	.set	= xfs_xattr_system_set,
+};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index c7d684f02f89..182d4be9ffab 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -41,7 +41,6 @@
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 631d0137551e..84e391ab70b0 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_acl.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -51,6 +52,7 @@
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/namei.h>
+#include <linux/posix_acl.h>
 #include <linux/security.h>
 #include <linux/falloc.h>
 #include <linux/fiemap.h>
@@ -202,9 +204,8 @@ xfs_vn_mknod(
 {
 	struct inode	*inode;
 	struct xfs_inode *ip = NULL;
-	xfs_acl_t	*default_acl = NULL;
+	struct posix_acl *default_acl = NULL;
 	struct xfs_name	name;
-	int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS;
 	int		error;
 
 	/*
@@ -219,18 +220,14 @@ xfs_vn_mknod(
 		rdev = 0;
 	}
 
-	if (test_default_acl && test_default_acl(dir)) {
-		if (!_ACL_ALLOC(default_acl)) {
-			return -ENOMEM;
-		}
-		if (!_ACL_GET_DEFAULT(dir, default_acl)) {
-			_ACL_FREE(default_acl);
-			default_acl = NULL;
-		}
-	}
+	if (IS_POSIXACL(dir)) {
+		default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(default_acl))
+			return -PTR_ERR(default_acl);
 
-	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
+		if (!default_acl)
+			mode &= ~current->fs->umask;
+	}
 
 	xfs_dentry_to_name(&name, dentry);
 	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
@@ -244,10 +241,10 @@ xfs_vn_mknod(
 		goto out_cleanup_inode;
 
 	if (default_acl) {
-		error = _ACL_INHERIT(inode, mode, default_acl);
+		error = -xfs_inherit_acl(inode, default_acl);
 		if (unlikely(error))
 			goto out_cleanup_inode;
-		_ACL_FREE(default_acl);
+		posix_acl_release(default_acl);
 	}
 
 
@@ -257,8 +254,7 @@ xfs_vn_mknod(
  out_cleanup_inode:
 	xfs_cleanup_inode(dir, inode, dentry);
  out_free_acl:
-	if (default_acl)
-		_ACL_FREE(default_acl);
+	posix_acl_release(default_acl);
 	return -error;
 }
 
@@ -488,26 +484,6 @@ xfs_vn_put_link(
 		kfree(s);
 }
 
-#ifdef CONFIG_XFS_POSIX_ACL
-STATIC int
-xfs_check_acl(
-	struct inode		*inode,
-	int			mask)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			error;
-
-	xfs_itrace_entry(ip);
-
-	if (XFS_IFORK_Q(ip)) {
-		error = xfs_acl_iaccess(ip, mask, NULL);
-		if (error != -1)
-			return -error;
-	}
-
-	return -EAGAIN;
-}
-
 STATIC int
 xfs_vn_permission(
 	struct inode		*inode,
@@ -515,9 +491,6 @@ xfs_vn_permission(
 {
 	return generic_permission(inode, mask, xfs_check_acl);
 }
-#else
-#define xfs_vn_permission NULL
-#endif
 
 STATIC int
 xfs_vn_getattr(
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 9142192ccbe6..7078974a6eee 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_inode_item.h"
 #include "xfs_buf_item.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index d4e7ef8f8df9..36fb8a657c55 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -43,7 +43,6 @@
 #include "xfs_itable.h"
 #include "xfs_fsops.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -1735,18 +1734,8 @@ xfs_init_zones(void)
 	if (!xfs_ili_zone)
 		goto out_destroy_inode_zone;
 
-#ifdef CONFIG_XFS_POSIX_ACL
-	xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl");
-	if (!xfs_acl_zone)
-		goto out_destroy_ili_zone;
-#endif
-
 	return 0;
 
-#ifdef CONFIG_XFS_POSIX_ACL
- out_destroy_ili_zone:
-#endif
-	kmem_zone_destroy(xfs_ili_zone);
  out_destroy_inode_zone:
 	kmem_zone_destroy(xfs_inode_zone);
  out_destroy_efi_zone:
@@ -1780,9 +1769,6 @@ xfs_init_zones(void)
 STATIC void
 xfs_destroy_zones(void)
 {
-#ifdef CONFIG_XFS_POSIX_ACL
-	kmem_zone_destroy(xfs_acl_zone);
-#endif
 	kmem_zone_destroy(xfs_ili_zone);
 	kmem_zone_destroy(xfs_inode_zone);
 	kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 964621fde6ed..497c7fb75cc1 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -29,67 +29,6 @@
 #include <linux/xattr.h>
 
 
-/*
- * ACL handling.  Should eventually be moved into xfs_acl.c
- */
-
-static int
-xfs_decode_acl(const char *name)
-{
-	if (strcmp(name, "posix_acl_access") == 0)
-		return _ACL_TYPE_ACCESS;
-	else if (strcmp(name, "posix_acl_default") == 0)
-		return _ACL_TYPE_DEFAULT;
-	return -EINVAL;
-}
-
-/*
- * Get system extended attributes which at the moment only
- * includes Posix ACLs.
- */
-static int
-xfs_xattr_system_get(struct inode *inode, const char *name,
-		void *buffer, size_t size)
-{
-	int acl;
-
-	acl = xfs_decode_acl(name);
-	if (acl < 0)
-		return acl;
-
-	return xfs_acl_vget(inode, buffer, size, acl);
-}
-
-static int
-xfs_xattr_system_set(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags)
-{
-	int acl;
-
-	acl = xfs_decode_acl(name);
-	if (acl < 0)
-		return acl;
-	if (flags & XATTR_CREATE)
-		return -EINVAL;
-
-	if (!value)
-		return xfs_acl_vremove(inode, acl);
-
-	return xfs_acl_vset(inode, (void *)value, size, acl);
-}
-
-static struct xattr_handler xfs_xattr_system_handler = {
-	.prefix	= XATTR_SYSTEM_PREFIX,
-	.get	= xfs_xattr_system_get,
-	.set	= xfs_xattr_system_set,
-};
-
-
-/*
- * Real xattr handling.  The only difference between the namespaces is
- * a flag passed to the low-level attr code.
- */
-
 static int
 __xfs_xattr_get(struct inode *inode, const char *name,
 		void *value, size_t size, int xflags)
@@ -199,7 +138,9 @@ struct xattr_handler *xfs_xattr_handlers[] = {
 	&xfs_xattr_user_handler,
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
 	&xfs_xattr_system_handler,
+#endif
 	NULL
 };
 
@@ -310,7 +251,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
 	/*
 	 * Then add the two synthetic ACL attributes.
 	 */
-	if (xfs_acl_vhasacl_access(inode)) {
+	if (posix_acl_access_exists(inode)) {
 		error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
 				strlen(POSIX_ACL_XATTR_ACCESS) + 1,
 				data, size, &context.count);
@@ -318,7 +259,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
 			return error;
 	}
 
-	if (xfs_acl_vhasacl_default(inode)) {
+	if (posix_acl_default_exists(inode)) {
 		error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
 				strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
 				data, size, &context.count);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 4d6d051ee56e..2f3f2229eaaf 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1728f6a7c4f5..d0d4a9a0bbd7 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 22b7c8d364e8..45b1bfef7388 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 56a5965f3c8b..a5346630dfae 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -42,7 +42,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 709f5f545cf5..21b08c0396a1 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -42,7 +42,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 7126f855e14b..4e4276b956e8 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -45,7 +45,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index eafa7ab34085..97ac9640be98 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -42,7 +42,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
deleted file mode 100644
index a8cdd73999a4..000000000000
--- a/fs/xfs/xfs_acl.c
+++ /dev/null
@@ -1,874 +0,0 @@
-/*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_vnodeops.h"
-
-#include <linux/capability.h>
-#include <linux/posix_acl_xattr.h>
-
-STATIC int	xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
-STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_endian(xfs_acl_t *);
-STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
-STATIC int	xfs_acl_invalid(xfs_acl_t *);
-STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
-STATIC void	xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
-STATIC int	xfs_acl_allow_set(struct inode *, int);
-
-kmem_zone_t *xfs_acl_zone;
-
-
-/*
- * Test for existence of access ACL attribute as efficiently as possible.
- */
-int
-xfs_acl_vhasacl_access(
-	struct inode	*vp)
-{
-	int		error;
-
-	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
-	return (error == 0);
-}
-
-/*
- * Test for existence of default ACL attribute as efficiently as possible.
- */
-int
-xfs_acl_vhasacl_default(
-	struct inode	*vp)
-{
-	int		error;
-
-	if (!S_ISDIR(vp->i_mode))
-		return 0;
-	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
-	return (error == 0);
-}
-
-/*
- * Convert from extended attribute representation to in-memory for XFS.
- */
-STATIC int
-posix_acl_xattr_to_xfs(
-	posix_acl_xattr_header	*src,
-	size_t			size,
-	xfs_acl_t		*dest)
-{
-	posix_acl_xattr_entry	*src_entry;
-	xfs_acl_entry_t		*dest_entry;
-	int			n;
-
-	if (!src || !dest)
-		return EINVAL;
-
-	if (size < sizeof(posix_acl_xattr_header))
-		return EINVAL;
-
-	if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-		return EOPNOTSUPP;
-
-	memset(dest, 0, sizeof(xfs_acl_t));
-	dest->acl_cnt = posix_acl_xattr_count(size);
-	if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
-		return EINVAL;
-
-	/*
-	 * acl_set_file(3) may request that we set default ACLs with
-	 * zero length -- defend (gracefully) against that here.
-	 */
-	if (!dest->acl_cnt)
-		return 0;
-
-	src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
-	dest_entry = &dest->acl_entry[0];
-
-	for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
-		dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
-		if (_ACL_PERM_INVALID(dest_entry->ae_perm))
-			return EINVAL;
-		dest_entry->ae_tag  = le16_to_cpu(src_entry->e_tag);
-		switch(dest_entry->ae_tag) {
-		case ACL_USER:
-		case ACL_GROUP:
-			dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
-			break;
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			dest_entry->ae_id = ACL_UNDEFINED_ID;
-			break;
-		default:
-			return EINVAL;
-		}
-	}
-	if (xfs_acl_invalid(dest))
-		return EINVAL;
-
-	return 0;
-}
-
-/*
- * Comparison function called from xfs_sort().
- * Primary key is ae_tag, secondary key is ae_id.
- */
-STATIC int
-xfs_acl_entry_compare(
-	const void	*va,
-	const void	*vb)
-{
-	xfs_acl_entry_t	*a = (xfs_acl_entry_t *)va,
-			*b = (xfs_acl_entry_t *)vb;
-
-	if (a->ae_tag == b->ae_tag)
-		return (a->ae_id - b->ae_id);
-	return (a->ae_tag - b->ae_tag);
-}
-
-/*
- * Convert from in-memory XFS to extended attribute representation.
- */
-STATIC int
-posix_acl_xfs_to_xattr(
-	xfs_acl_t		*src,
-	posix_acl_xattr_header	*dest,
-	size_t			size)
-{
-	int			n;
-	size_t			new_size = posix_acl_xattr_size(src->acl_cnt);
-	posix_acl_xattr_entry	*dest_entry;
-	xfs_acl_entry_t		*src_entry;
-
-	if (size < new_size)
-		return -ERANGE;
-
-	/* Need to sort src XFS ACL by <ae_tag,ae_id> */
-	xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
-		 xfs_acl_entry_compare);
-
-	dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
-	dest_entry = &dest->a_entries[0];
-	src_entry = &src->acl_entry[0];
-	for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
-		dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
-		if (_ACL_PERM_INVALID(src_entry->ae_perm))
-			return -EINVAL;
-		dest_entry->e_tag  = cpu_to_le16(src_entry->ae_tag);
-		switch (src_entry->ae_tag) {
-		case ACL_USER:
-		case ACL_GROUP:
-			dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
-				break;
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
-	return new_size;
-}
-
-int
-xfs_acl_vget(
-	struct inode	*vp,
-	void		*acl,
-	size_t		size,
-	int		kind)
-{
-	int			error;
-	xfs_acl_t		*xfs_acl = NULL;
-	posix_acl_xattr_header	*ext_acl = acl;
-	int			flags = 0;
-
-	if(size) {
-		if (!(_ACL_ALLOC(xfs_acl))) {
-			error = ENOMEM;
-			goto out;
-		}
-		memset(xfs_acl, 0, sizeof(xfs_acl_t));
-	} else
-		flags = ATTR_KERNOVAL;
-
-	xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
-	if (error)
-		goto out;
-
-	if (!size) {
-		error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
-	} else {
-		if (xfs_acl_invalid(xfs_acl)) {
-			error = EINVAL;
-			goto out;
-		}
-		if (kind == _ACL_TYPE_ACCESS)
-			xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
-		error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
-	}
-out:
-	if(xfs_acl)
-		_ACL_FREE(xfs_acl);
-	return -error;
-}
-
-int
-xfs_acl_vremove(
-	struct inode	*vp,
-	int		kind)
-{
-	int		error;
-
-	error = xfs_acl_allow_set(vp, kind);
-	if (!error) {
-		error = xfs_attr_remove(XFS_I(vp),
-						kind == _ACL_TYPE_DEFAULT?
-						SGI_ACL_DEFAULT: SGI_ACL_FILE,
-						ATTR_ROOT);
-		if (error == ENOATTR)
-			error = 0;	/* 'scool */
-	}
-	return -error;
-}
-
-int
-xfs_acl_vset(
-	struct inode		*vp,
-	void			*acl,
-	size_t			size,
-	int			kind)
-{
-	posix_acl_xattr_header	*ext_acl = acl;
-	xfs_acl_t		*xfs_acl;
-	int			error;
-	int			basicperms = 0; /* more than std unix perms? */
-
-	if (!acl)
-		return -EINVAL;
-
-	if (!(_ACL_ALLOC(xfs_acl)))
-		return -ENOMEM;
-
-	error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
-	if (error) {
-		_ACL_FREE(xfs_acl);
-		return -error;
-	}
-	if (!xfs_acl->acl_cnt) {
-		_ACL_FREE(xfs_acl);
-		return 0;
-	}
-
-	error = xfs_acl_allow_set(vp, kind);
-
-	/* Incoming ACL exists, set file mode based on its value */
-	if (!error && kind == _ACL_TYPE_ACCESS)
-		error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
-
-	if (error)
-		goto out;
-
-	/*
-	 * If we have more than std unix permissions, set up the actual attr.
-	 * Otherwise, delete any existing attr.  This prevents us from
-	 * having actual attrs for permissions that can be stored in the
-	 * standard permission bits.
-	 */
-	if (!basicperms) {
-		xfs_acl_set_attr(vp, xfs_acl, kind, &error);
-	} else {
-		error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
-	}
-
-out:
-	_ACL_FREE(xfs_acl);
-	return -error;
-}
-
-int
-xfs_acl_iaccess(
-	xfs_inode_t	*ip,
-	mode_t		mode,
-	cred_t		*cr)
-{
-	xfs_acl_t	*acl;
-	int		rval;
-	struct xfs_name	acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE};
-
-	if (!(_ACL_ALLOC(acl)))
-		return -1;
-
-	/* If the file has no ACL return -1. */
-	rval = sizeof(xfs_acl_t);
-	if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) {
-		_ACL_FREE(acl);
-		return -1;
-	}
-	xfs_acl_get_endian(acl);
-
-	/* If the file has an empty ACL return -1. */
-	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
-		_ACL_FREE(acl);
-		return -1;
-	}
-
-	/* Synchronize ACL with mode bits */
-	xfs_acl_sync_mode(ip->i_d.di_mode, acl);
-
-	rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
-	_ACL_FREE(acl);
-	return rval;
-}
-
-STATIC int
-xfs_acl_allow_set(
-	struct inode	*vp,
-	int		kind)
-{
-	if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
-		return EPERM;
-	if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode))
-		return ENOTDIR;
-	if (vp->i_sb->s_flags & MS_RDONLY)
-		return EROFS;
-	if (XFS_I(vp)->i_d.di_uid != current_fsuid() && !capable(CAP_FOWNER))
-		return EPERM;
-	return 0;
-}
-
-/*
- * Note: cr is only used here for the capability check if the ACL test fails.
- *       It is not used to find out the credentials uid or groups etc, as was
- *       done in IRIX. It is assumed that the uid and groups for the current
- *       thread are taken from "current" instead of the cr parameter.
- */
-STATIC int
-xfs_acl_access(
-	uid_t		fuid,
-	gid_t		fgid,
-	xfs_acl_t	*fap,
-	mode_t		md,
-	cred_t		*cr)
-{
-	xfs_acl_entry_t	matched;
-	int		i, allows;
-	int		maskallows = -1;	/* true, but not 1, either */
-	int		seen_userobj = 0;
-
-	matched.ae_tag = 0;	/* Invalid type */
-	matched.ae_perm = 0;
-
-	for (i = 0; i < fap->acl_cnt; i++) {
-		/*
-		 * Break out if we've got a user_obj entry or
-		 * a user entry and the mask (and have processed USER_OBJ)
-		 */
-		if (matched.ae_tag == ACL_USER_OBJ)
-			break;
-		if (matched.ae_tag == ACL_USER) {
-			if (maskallows != -1 && seen_userobj)
-				break;
-			if (fap->acl_entry[i].ae_tag != ACL_MASK &&
-			    fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
-				continue;
-		}
-		/* True if this entry allows the requested access */
-		allows = ((fap->acl_entry[i].ae_perm & md) == md);
-
-		switch (fap->acl_entry[i].ae_tag) {
-		case ACL_USER_OBJ:
-			seen_userobj = 1;
-			if (fuid != current_fsuid())
-				continue;
-			matched.ae_tag = ACL_USER_OBJ;
-			matched.ae_perm = allows;
-			break;
-		case ACL_USER:
-			if (fap->acl_entry[i].ae_id != current_fsuid())
-				continue;
-			matched.ae_tag = ACL_USER;
-			matched.ae_perm = allows;
-			break;
-		case ACL_GROUP_OBJ:
-			if ((matched.ae_tag == ACL_GROUP_OBJ ||
-			    matched.ae_tag == ACL_GROUP) && !allows)
-				continue;
-			if (!in_group_p(fgid))
-				continue;
-			matched.ae_tag = ACL_GROUP_OBJ;
-			matched.ae_perm = allows;
-			break;
-		case ACL_GROUP:
-			if ((matched.ae_tag == ACL_GROUP_OBJ ||
-			    matched.ae_tag == ACL_GROUP) && !allows)
-				continue;
-			if (!in_group_p(fap->acl_entry[i].ae_id))
-				continue;
-			matched.ae_tag = ACL_GROUP;
-			matched.ae_perm = allows;
-			break;
-		case ACL_MASK:
-			maskallows = allows;
-			break;
-		case ACL_OTHER:
-			if (matched.ae_tag != 0)
-				continue;
-			matched.ae_tag = ACL_OTHER;
-			matched.ae_perm = allows;
-			break;
-		}
-	}
-	/*
-	 * First possibility is that no matched entry allows access.
-	 * The capability to override DAC may exist, so check for it.
-	 */
-	switch (matched.ae_tag) {
-	case ACL_OTHER:
-	case ACL_USER_OBJ:
-		if (matched.ae_perm)
-			return 0;
-		break;
-	case ACL_USER:
-	case ACL_GROUP_OBJ:
-	case ACL_GROUP:
-		if (maskallows && matched.ae_perm)
-			return 0;
-		break;
-	case 0:
-		break;
-	}
-
-	/* EACCES tells generic_permission to check for capability overrides */
-	return EACCES;
-}
-
-/*
- * ACL validity checker.
- *   This acl validation routine checks each ACL entry read in makes sense.
- */
-STATIC int
-xfs_acl_invalid(
-	xfs_acl_t	*aclp)
-{
-	xfs_acl_entry_t	*entry, *e;
-	int		user = 0, group = 0, other = 0, mask = 0;
-	int		mask_required = 0;
-	int		i, j;
-
-	if (!aclp)
-		goto acl_invalid;
-
-	if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
-		goto acl_invalid;
-
-	for (i = 0; i < aclp->acl_cnt; i++) {
-		entry = &aclp->acl_entry[i];
-		switch (entry->ae_tag) {
-		case ACL_USER_OBJ:
-			if (user++)
-				goto acl_invalid;
-			break;
-		case ACL_GROUP_OBJ:
-			if (group++)
-				goto acl_invalid;
-			break;
-		case ACL_OTHER:
-			if (other++)
-				goto acl_invalid;
-			break;
-		case ACL_USER:
-		case ACL_GROUP:
-			for (j = i + 1; j < aclp->acl_cnt; j++) {
-				e = &aclp->acl_entry[j];
-				if (e->ae_id == entry->ae_id &&
-				    e->ae_tag == entry->ae_tag)
-					goto acl_invalid;
-			}
-			mask_required++;
-			break;
-		case ACL_MASK:
-			if (mask++)
-				goto acl_invalid;
-			break;
-		default:
-			goto acl_invalid;
-		}
-	}
-	if (!user || !group || !other || (mask_required && !mask))
-		goto acl_invalid;
-	else
-		return 0;
-acl_invalid:
-	return EINVAL;
-}
-
-/*
- * Do ACL endian conversion.
- */
-STATIC void
-xfs_acl_get_endian(
-	xfs_acl_t	*aclp)
-{
-	xfs_acl_entry_t	*ace, *end;
-
-	INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	end = &aclp->acl_entry[0]+aclp->acl_cnt;
-	for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
-		INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
-		INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
-		INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
-	}
-}
-
-/*
- * Get the ACL from the EA and do endian conversion.
- */
-STATIC void
-xfs_acl_get_attr(
-	struct inode	*vp,
-	xfs_acl_t	*aclp,
-	int		kind,
-	int		flags,
-	int		*error)
-{
-	int		len = sizeof(xfs_acl_t);
-
-	ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
-	flags |= ATTR_ROOT;
-	*error = xfs_attr_get(XFS_I(vp),
-					kind == _ACL_TYPE_ACCESS ?
-					SGI_ACL_FILE : SGI_ACL_DEFAULT,
-					(char *)aclp, &len, flags);
-	if (*error || (flags & ATTR_KERNOVAL))
-		return;
-	xfs_acl_get_endian(aclp);
-}
-
-/*
- * Set the EA with the ACL and do endian conversion.
- */
-STATIC void
-xfs_acl_set_attr(
-	struct inode	*vp,
-	xfs_acl_t	*aclp,
-	int		kind,
-	int		*error)
-{
-	xfs_acl_entry_t	*ace, *newace, *end;
-	xfs_acl_t	*newacl;
-	int		len;
-
-	if (!(_ACL_ALLOC(newacl))) {
-		*error = ENOMEM;
-		return;
-	}
-
-	len = sizeof(xfs_acl_t) -
-	      (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
-	end = &aclp->acl_entry[0]+aclp->acl_cnt;
-	for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
-	     ace < end;
-	     ace++, newace++) {
-		INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
-		INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
-		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
-	}
-	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	*error = xfs_attr_set(XFS_I(vp),
-				kind == _ACL_TYPE_ACCESS ?
-				SGI_ACL_FILE: SGI_ACL_DEFAULT,
-				(char *)newacl, len, ATTR_ROOT);
-	_ACL_FREE(newacl);
-}
-
-int
-xfs_acl_vtoacl(
-	struct inode	*vp,
-	xfs_acl_t	*access_acl,
-	xfs_acl_t	*default_acl)
-{
-	int		error = 0;
-
-	if (access_acl) {
-		/*
-		 * Get the Access ACL and the mode.  If either cannot
-		 * be obtained for some reason, invalidate the access ACL.
-		 */
-		xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
-		if (error)
-			access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
-		else /* We have a good ACL and the file mode, synchronize. */
-			xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
-	}
-
-	if (default_acl) {
-		xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
-		if (error)
-			default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
-	}
-	return error;
-}
-
-/*
- * This function retrieves the parent directory's acl, processes it
- * and lets the child inherit the acl(s) that it should.
- */
-int
-xfs_acl_inherit(
-	struct inode	*vp,
-	mode_t		mode,
-	xfs_acl_t	*pdaclp)
-{
-	xfs_acl_t	*cacl;
-	int		error = 0;
-	int		basicperms = 0;
-
-	/*
-	 * If the parent does not have a default ACL, or it's an
-	 * invalid ACL, we're done.
-	 */
-	if (!vp)
-		return 0;
-	if (!pdaclp || xfs_acl_invalid(pdaclp))
-		return 0;
-
-	/*
-	 * Copy the default ACL of the containing directory to
-	 * the access ACL of the new file and use the mode that
-	 * was passed in to set up the correct initial values for
-	 * the u::,g::[m::], and o:: entries.  This is what makes
-	 * umask() "work" with ACL's.
-	 */
-
-	if (!(_ACL_ALLOC(cacl)))
-		return ENOMEM;
-
-	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
-	xfs_acl_filter_mode(mode, cacl);
-	error = xfs_acl_setmode(vp, cacl, &basicperms);
-	if (error)
-		goto out_error;
-
-	/*
-	 * Set the Default and Access ACL on the file.  The mode is already
-	 * set on the file, so we don't need to worry about that.
-	 *
-	 * If the new file is a directory, its default ACL is a copy of
-	 * the containing directory's default ACL.
-	 */
-	if (S_ISDIR(vp->i_mode))
-		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
-	if (!error && !basicperms)
-		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
-out_error:
-	_ACL_FREE(cacl);
-	return error;
-}
-
-/*
- * Set up the correct mode on the file based on the supplied ACL.  This
- * makes sure that the mode on the file reflects the state of the
- * u::,g::[m::], and o:: entries in the ACL.  Since the mode is where
- * the ACL is going to get the permissions for these entries, we must
- * synchronize the mode whenever we set the ACL on a file.
- */
-STATIC int
-xfs_acl_setmode(
-	struct inode	*vp,
-	xfs_acl_t	*acl,
-	int		*basicperms)
-{
-	struct iattr	iattr;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-	int		i, nomask = 1;
-
-	*basicperms = 1;
-
-	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
-		return 0;
-
-	/*
-	 * Copy the u::, g::, o::, and m:: bits from the ACL into the
-	 * mode.  The m:: bits take precedence over the g:: bits.
-	 */
-	iattr.ia_valid = ATTR_MODE;
-	iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
-	iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
-	ap = acl->acl_entry;
-	for (i = 0; i < acl->acl_cnt; ++i) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			iattr.ia_mode |= ap->ae_perm << 6;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:	/* more than just standard modes */
-			nomask = 0;
-			iattr.ia_mode |= ap->ae_perm << 3;
-			*basicperms = 0;
-			break;
-		case ACL_OTHER:
-			iattr.ia_mode |= ap->ae_perm;
-			break;
-		default:	/* more than just standard modes */
-			*basicperms = 0;
-			break;
-		}
-		ap++;
-	}
-
-	/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		iattr.ia_mode |= gap->ae_perm << 3;
-
-	return xfs_setattr(XFS_I(vp), &iattr, 0);
-}
-
-/*
- * The permissions for the special ACL entries (u::, g::[m::], o::) are
- * actually stored in the file mode (if there is both a group and a mask,
- * the group is stored in the ACL entry and the mask is stored on the file).
- * This allows the mode to remain automatically in sync with the ACL without
- * the need for a call-back to the ACL system at every point where the mode
- * could change.  This function takes the permissions from the specified mode
- * and places it in the supplied ACL.
- *
- * This implementation draws its validity from the fact that, when the ACL
- * was assigned, the mode was copied from the ACL.
- * If the mode did not change, therefore, the mode remains exactly what was
- * taken from the special ACL entries at assignment.
- * If a subsequent chmod() was done, the POSIX spec says that the change in
- * mode must cause an update to the ACL seen at user level and used for
- * access checks.  Before and after a mode change, therefore, the file mode
- * most accurately reflects what the special ACL entries should permit/deny.
- *
- * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
- *         the existing mode bits will override whatever is in the
- *         ACL. Similarly, if there is a pre-existing ACL that was
- *         never in sync with its mode (owing to a bug in 6.5 and
- *         before), it will now magically (or mystically) be
- *         synchronized.  This could cause slight astonishment, but
- *         it is better than inconsistent permissions.
- *
- * The supplied ACL is a template that may contain any combination
- * of special entries.  These are treated as place holders when we fill
- * out the ACL.  This routine does not add or remove special entries, it
- * simply unites each special entry with its associated set of permissions.
- */
-STATIC void
-xfs_acl_sync_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
-	 * be set instead of the GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm = (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm = (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm = mode & 0x7;
-			break;
-		default:
-			break;
-		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm = (mode >> 3) & 0x7;
-}
-
-/*
- * When inheriting an Access ACL from a directory Default ACL,
- * the ACL bits are set to the intersection of the ACL default
- * permission bits and the file permission bits in mode. If there
- * are no permission bits on the file then we must not give them
- * the ACL. This is what what makes umask() work with ACLs.
- */
-STATIC void
-xfs_acl_filter_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
-	 * be merged with GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm &= (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm &= (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm &= mode & 0x7;
-			break;
-		default:
-			break;
-		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm &= (mode >> 3) & 0x7;
-}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 642f1db4def4..63dc1f2efad5 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -18,81 +18,48 @@
 #ifndef __XFS_ACL_H__
 #define __XFS_ACL_H__
 
-/*
- * Access Control Lists
- */
-typedef __uint16_t	xfs_acl_perm_t;
-typedef __int32_t	xfs_acl_tag_t;
-typedef __int32_t	xfs_acl_id_t;
+struct inode;
+struct posix_acl;
+struct xfs_inode;
 
 #define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
 
-typedef struct xfs_acl_entry {
-	xfs_acl_tag_t	ae_tag;
-	xfs_acl_id_t	ae_id;
-	xfs_acl_perm_t	ae_perm;
-} xfs_acl_entry_t;
-
-typedef struct xfs_acl {
-	__int32_t	acl_cnt;
-	xfs_acl_entry_t	acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_t;
+/* On-disk XFS access control list structure */
+struct xfs_acl {
+	__be32		acl_cnt;
+	struct xfs_acl_entry {
+		__be32	ae_tag;
+		__be32	ae_id;
+		__be16	ae_perm;
+	} acl_entry[XFS_ACL_MAX_ENTRIES];
+};
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE	"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT	"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE		"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
-#define _ACL_TYPE_ACCESS	1
-#define _ACL_TYPE_DEFAULT	2
-
 #ifdef CONFIG_XFS_POSIX_ACL
+extern int xfs_check_acl(struct inode *inode, int mask);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
+extern int xfs_acl_chmod(struct inode *inode);
+extern void xfs_inode_init_acls(struct xfs_inode *ip);
+extern void xfs_inode_clear_acls(struct xfs_inode *ip);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
 
-struct vattr;
-struct xfs_inode;
-
-extern struct kmem_zone *xfs_acl_zone;
-#define xfs_acl_zone_init(zone, name)	\
-		(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
-#define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
-
-extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
-extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct inode *);
-extern int xfs_acl_vhasacl_default(struct inode *);
-extern int xfs_acl_vset(struct inode *, void *, size_t, int);
-extern int xfs_acl_vget(struct inode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct inode *, int);
-
-#define _ACL_PERM_INVALID(perm)	((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
-
-#define _ACL_INHERIT(c,m,d)	(xfs_acl_inherit(c,m,d))
-#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
-#define _ACL_GET_DEFAULT(pv,pd)	(xfs_acl_vtoacl(pv,NULL,pd) == 0)
-#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
-#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
-
-#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
-#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
-
+extern struct xattr_handler xfs_xattr_system_handler;
 #else
-#define xfs_acl_zone_init(zone,name)
-#define xfs_acl_zone_destroy(zone)
-#define xfs_acl_vset(v,p,sz,t)	(-EOPNOTSUPP)
-#define xfs_acl_vget(v,p,sz,t)	(-EOPNOTSUPP)
-#define xfs_acl_vremove(v,t)	(-EOPNOTSUPP)
-#define xfs_acl_vhasacl_access(v)	(0)
-#define xfs_acl_vhasacl_default(v)	(0)
-#define _ACL_ALLOC(a)		(1)	/* successfully allocate nothing */
-#define _ACL_FREE(a)		((void)0)
-#define _ACL_INHERIT(c,m,d)	(0)
-#define _ACL_GET_ACCESS(pv,pa)	(0)
-#define _ACL_GET_DEFAULT(pv,pd)	(0)
-#define _ACL_ACCESS_EXISTS	(NULL)
-#define _ACL_DEFAULT_EXISTS	(NULL)
-#endif
-
+# define xfs_check_acl					NULL
+# define xfs_get_acl(inode, type)			NULL
+# define xfs_inherit_acl(inode, default_acl)		0
+# define xfs_acl_chmod(inode)				0
+# define xfs_inode_init_acls(ip)
+# define xfs_inode_clear_acls(ip)
+# define posix_acl_access_exists(inode)			0
+# define posix_acl_default_exists(inode)		0
+#endif /* CONFIG_XFS_POSIX_ACL */
 #endif	/* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 53d5e70d1360..0902249354a0 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -73,28 +73,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 
 #endif	/* __KERNEL__ */
 
-/* do we need conversion? */
-#define ARCH_NOCONVERT 1
-#ifdef XFS_NATIVE_HOST
-# define ARCH_CONVERT	ARCH_NOCONVERT
-#else
-# define ARCH_CONVERT	0
-#endif
-
-/* generic swapping macros */
-
-#ifndef HAVE_SWABMACROS
-#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
-#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
-#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
-#endif
-
-#define INT_SWAP(type, var) \
-    ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
-    ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
-    ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
-    (var))))
-
 /*
  * get and set integers from potentially unaligned locations
  */
@@ -107,16 +85,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 	((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
     }
 
-/* does not return a value */
-#define INT_SET(reference,arch,valueref) \
-    (__builtin_constant_p(valueref) ? \
-	(void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
-	(void)( \
-	    ((reference) = (valueref)), \
-	    ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
-	) \
-    )
-
 /*
  * In directories inode numbers are stored as unaligned arrays of unsigned
  * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index cd1008b1c4cd..db15feb906ff 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -45,7 +45,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
-#include "xfs_acl.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 73e1c0d767ac..76c540f719e4 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
+#include "xfs_acl.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -82,6 +83,7 @@ xfs_inode_alloc(
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 	ip->i_size = 0;
 	ip->i_new_size = 0;
+	xfs_inode_init_acls(ip);
 
 	/*
 	 * Initialize inode's trace buffers.
@@ -558,6 +560,7 @@ xfs_ireclaim(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	xfs_inode_clear_acls(ip);
 	kmem_zone_free(xfs_inode_zone, ip);
 }
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 123b20c8cbf2..1f22d65fed0a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,7 +49,6 @@
 #include "xfs_utils.h"
 #include "xfs_dir2_trace.h"
 #include "xfs_quota.h"
-#include "xfs_acl.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f879c1bc4b96..77016702938b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,6 +18,7 @@
 #ifndef	__XFS_INODE_H__
 #define	__XFS_INODE_H__
 
+struct posix_acl;
 struct xfs_dinode;
 struct xfs_inode;
 
@@ -272,6 +273,11 @@ typedef struct xfs_inode {
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
 
+#ifdef CONFIG_XFS_POSIX_ACL
+	struct posix_acl	*i_acl;
+	struct posix_acl	*i_default_acl;
+#endif
+
 	/* Trace buffers per inode. */
 #ifdef XFS_INODE_TRACE
 	struct ktrace		*i_trace;	/* general inode trace */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index feb30a92549b..67ae5555a30a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 36f3a21c54d2..fea68615ed23 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -41,7 +41,6 @@
 #include "xfs_ialloc.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
-#include "xfs_acl.h"
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b56321b2b9f0..c4eca5ed5dab 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -42,6 +42,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_rw.h"
 #include "xfs_error.h"
@@ -467,8 +468,20 @@ xfs_setattr(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	if (code) {
+	if (code)
 		return code;
+
+	/*
+	 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+	 * 	     update.  We could avoid this with linked transactions
+	 * 	     and passing down the transaction pointer all the way
+	 *	     to attr_set.  No previous user of the generic
+	 * 	     Posix ACL code seems to care about this issue either.
+	 */
+	if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+		code = -xfs_acl_chmod(inode);
+		if (code)
+			return XFS_ERROR(code);
 	}
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 04373c6c61ff..a9e102de71a1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define	XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
 #define	XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
 
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_fsync(struct xfs_inode *ip);

From 77e26cca20013e9352a8df86a54640543304a23a Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 11 Jun 2009 16:04:35 +0900
Subject: [PATCH 146/162] x86, mce: Fix mce printing

This patch:

 - Adds print_mce_head() instead of first flag
 - Makes the header to be printed always
 - Stops double printing of corrected errors

[ This portion originates from Huang Ying's patch ]

Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
LKML-Reference: <4A30AC83.5010708@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d4e7b5947a0e..6a3127ecb5cc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -180,12 +180,8 @@ void mce_log(struct mce *mce)
 	set_bit(0, &notify_user);
 }
 
-static void print_mce(struct mce *m, int *first)
+static void print_mce(struct mce *m)
 {
-	if (*first) {
-		printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
-		*first = 0;
-	}
 	printk(KERN_EMERG
 	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 	       m->extcpu, m->mcgstatus, m->bank, m->status);
@@ -208,6 +204,11 @@ static void print_mce(struct mce *m, int *first)
 			m->apicid);
 }
 
+static void print_mce_head(void)
+{
+	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+}
+
 static void print_mce_tail(void)
 {
 	printk(KERN_EMERG "This is not a software problem!\n"
@@ -234,7 +235,6 @@ static void wait_for_panic(void)
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
 	int i;
-	int first = 1;
 
 	/*
 	 * Make sure only one CPU runs in machine check panic
@@ -245,23 +245,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 
 	bust_spinlocks(1);
 	console_verbose();
+	print_mce_head();
 	/* First print corrected ones that are still unlogged */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
 		struct mce *m = &mcelog.entry[i];
-		if ((m->status & MCI_STATUS_VAL) &&
-			!(m->status & MCI_STATUS_UC))
-			print_mce(m, &first);
+		if (!(m->status & MCI_STATUS_VAL))
+			continue;
+		if (!(m->status & MCI_STATUS_UC))
+			print_mce(m);
 	}
 	/* Now print uncorrected but with the final one last */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
 		struct mce *m = &mcelog.entry[i];
 		if (!(m->status & MCI_STATUS_VAL))
 			continue;
+		if (!(m->status & MCI_STATUS_UC))
+			continue;
 		if (!final || memcmp(m, final, sizeof(struct mce)))
-			print_mce(m, &first);
+			print_mce(m);
 	}
 	if (final)
-		print_mce(final, &first);
+		print_mce(final);
 	if (cpu_missing)
 		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	print_mce_tail();

From 62fdac5913f71f8f200bd2c9bd59a02e9a1498e9 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 11 Jun 2009 16:06:07 +0900
Subject: [PATCH 147/162] x86, mce: Add boot options for corrected errors

This patch introduces three boot options (no_cmci, dont_log_ce
and ignore_ce) to control handling for corrected errors.

The "mce=no_cmci" boot option disables the CMCI feature.

Since CMCI is a new feature so having boot controls to disable
it will be a help if the hardware is misbehaving.

The "mce=dont_log_ce" boot option disables logging for corrected
errors. All reported corrected errors will be cleared silently.
This option will be useful if you never care about corrected
errors.

The "mce=ignore_ce" boot option disables features for corrected
errors, i.e. polling timer and cmci.  All corrected events are
not cleared and kept in bank MSRs.

Usually this disablement is not recommended, however it will be
a help if there are some conflict with the BIOS or hardware
monitoring applications etc., that clears corrected events in
banks instead of OS.

[ And trivial cleanup (space -> tab) for doc is included. ]

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
LKML-Reference: <4A30ACDF.5030408@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/x86/x86_64/boot-options.txt | 36 +++++++++++++++++++----
 arch/x86/include/asm/mce.h                |  2 ++
 arch/x86/kernel/cpu/mcheck/mce.c          | 19 ++++++++++--
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c |  3 ++
 4 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 0ee5e3b212f3..fa2bed07d21e 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -7,12 +7,36 @@ Machine check
 
    Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables.
 
-   mce=off disable machine check
-   mce=bootlog Enable logging of machine checks left over from booting.
-               Disabled by default on AMD because some BIOS leave bogus ones.
-               If your BIOS doesn't do that it's a good idea to enable though
-               to make sure you log even machine check events that result
-               in a reboot. On Intel systems it is enabled by default.
+   mce=off
+		Disable machine check
+   mce=no_cmci
+		Disable CMCI(Corrected Machine Check Interrupt) that
+		Intel processor supports.  Usually this disablement is
+		not recommended, but it might be handy if your hardware
+		is misbehaving.
+		Note that you'll get more problems without CMCI than with
+		due to the shared banks, i.e. you might get duplicated
+		error logs.
+   mce=dont_log_ce
+		Don't make logs for corrected errors.  All events reported
+		as corrected are silently cleared by OS.
+		This option will be useful if you have no interest in any
+		of corrected errors.
+   mce=ignore_ce
+		Disable features for corrected errors, e.g. polling timer
+		and CMCI.  All events reported as corrected are not cleared
+		by OS and remained in its error banks.
+		Usually this disablement is not recommended, however if
+		there is an agent checking/clearing corrected errors
+		(e.g. BIOS or hardware monitoring applications), conflicting
+		with OS's error handling, and you cannot deactivate the agent,
+		then this option will be a help.
+   mce=bootlog
+		Enable logging of machine checks left over from booting.
+		Disabled by default on AMD because some BIOS leave bogus ones.
+		If your BIOS doesn't do that it's a good idea to enable though
+		to make sure you log even machine check events that result
+		in a reboot. On Intel systems it is enabled by default.
    mce=nobootlog
 		Disable boot machine check logging.
    mce=tolerancelevel[,monarchtimeout] (number,number)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 82978ad12072..540a466e50f5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -119,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 
 #ifdef CONFIG_X86_MCE_INTEL
+extern int mce_cmci_disabled;
+extern int mce_ignore_ce;
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
 void cmci_clear(void);
 void cmci_reenable(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6a3127ecb5cc..fabba15e4558 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -84,6 +84,9 @@ static int			rip_msr;
 static int			mce_bootlog = -1;
 static int			monarch_timeout = -1;
 static int			mce_panic_timeout;
+static int			mce_dont_log_ce;
+int				mce_cmci_disabled;
+int				mce_ignore_ce;
 int				mce_ser;
 
 static char			trigger[128];
@@ -526,7 +529,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-		if (!(flags & MCP_DONTLOG)) {
+		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
 			mce_log(&m);
 			add_taint(TAINT_MACHINE_CHECK);
 		}
@@ -1307,6 +1310,9 @@ static void mce_init_timer(void)
 	struct timer_list *t = &__get_cpu_var(mce_timer);
 	int *n = &__get_cpu_var(next_interval);
 
+	if (mce_ignore_ce)
+		return;
+
 	*n = check_interval * HZ;
 	if (!*n)
 		return;
@@ -1517,7 +1523,10 @@ static struct miscdevice mce_log_device = {
 };
 
 /*
- * mce=off disables machine check
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
  *	monarchtimeout is how long to wait for other CPUs on machine
  *	check, or 0 to not wait
@@ -1532,6 +1541,12 @@ static int __init mcheck_enable(char *str)
 		str++;
 	if (!strcmp(str, "off"))
 		mce_disabled = 1;
+	else if (!strcmp(str, "no_cmci"))
+		mce_cmci_disabled = 1;
+	else if (!strcmp(str, "dont_log_ce"))
+		mce_dont_log_ce = 1;
+	else if (!strcmp(str, "ignore_ce"))
+		mce_ignore_ce = 1;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 		mce_bootlog = (str[0] == 'b');
 	else if (isdigit(str[0])) {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index b7c5a2470b40..046087e9808f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -57,6 +57,9 @@ static int cmci_supported(int *banks)
 {
 	u64 cap;
 
+	if (mce_cmci_disabled || mce_ignore_ce)
+		return 0;
+
 	/*
 	 * Vendor check is not strictly needed, but the initial
 	 * initialization is vendor keyed and this

From 7747a0b0af5976ba3828796b4f7a7adc3bb76dbd Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Thu, 11 Jun 2009 17:07:28 -0500
Subject: [PATCH 148/162] xfs: fix freeing memory in xfs_getbmap()

Regression from commit 28e211700a81b0a934b6c7a4b8e7dda843634d2f.
Need to free temporary buffer allocated in xfs_getbmap().

Signed-off-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Hedi Berriche <hedi@sgi.com>
Reported-by: Justin Piszcz <jpiszcz@lucidpixels.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_bmap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4b0f6efb046c..7928b9983c1d 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6086,6 +6086,7 @@ xfs_getbmap(
 			break;
 	}
 
+	kmem_free(out);
 	return error;
 }
 

From 493b87e5ed352cf548e6456ddfc36576e28278ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 12 Jun 2009 11:34:55 -0400
Subject: [PATCH 149/162] xfs: fix warnings with CONFIG_XFS_QUOTA disabled

Fix warnings about unitialized dquot variables by making sure
xfs_qm_vop_dqalloc touches it even when quotas are disabled.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_quota.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 079d2e61c626..3ec91ac74c2a 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -335,6 +335,14 @@ extern void xfs_qm_unmount(struct xfs_mount *);
 extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
 #else
+static inline int
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
+		uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+{
+	*udqp = NULL;
+	*gdqp = NULL;
+	return 0;
+}
 #define xfs_trans_dup_dqinfo(tp, tp2)
 #define xfs_trans_free_dqinfo(tp)
 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
@@ -342,7 +350,6 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 #define xfs_trans_unreserve_and_mod_dquots(tp)
 #define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)	(0)
 #define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)	(0)
-#define xfs_qm_vop_dqalloc(ip, uid, gid, prid, fl, ou, og)		(0)
 #define xfs_qm_vop_create_dqattach(tp, ip, u, g)
 #define xfs_qm_vop_rename_dqattach(it)					(0)
 #define xfs_qm_vop_chown(tp, ip, old, new)				(NULL)

From e83f1eb6bfc4004c19a99ee5f5aa65bd3fbecec3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 12 Jun 2009 11:19:11 -0400
Subject: [PATCH 150/162] xfs: fix small mismerge in xfs_vn_mknod

Identation got messed up when merging the current_umask changes with
the generic ACL support.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 800dd4f98d4e..58973bb46038 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -225,8 +225,8 @@ xfs_vn_mknod(
 		if (IS_ERR(default_acl))
 			return -PTR_ERR(default_acl);
 
-	if (!default_acl)
-		mode &= ~current_umask();
+		if (!default_acl)
+			mode &= ~current_umask();
 	}
 
 	xfs_dentry_to_name(&name, dentry);

From 9528d1c7a541b481a0e80301dc8d545848104023 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Mon, 18 May 2009 08:14:41 -0400
Subject: [PATCH 151/162] i2c: Blackfin TWI: make sure we don't end up with a
 CLKDIV=0

Make sure we don't end up with an invalid CLKDIV=0 in case someone
specifies 20kHz SCL or less (5 * 1024 / 20 = 0x100).

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
[ben-linux@fluff.org: shortened subject line]
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/Kconfig        |  2 +-
 drivers/i2c/busses/i2c-bfin-twi.c | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index f1c6ca7e2852..c8460fa9cfac 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -298,7 +298,7 @@ config I2C_BLACKFIN_TWI
 config I2C_BLACKFIN_TWI_CLK_KHZ
 	int "Blackfin TWI I2C clock (kHz)"
 	depends on I2C_BLACKFIN_TWI
-	range 10 400
+	range 21 400
 	default 50
 	help
 	  The unit of the TWI clock is kHz.
diff --git a/drivers/i2c/busses/i2c-bfin-twi.c b/drivers/i2c/busses/i2c-bfin-twi.c
index fc548b3d002e..77cafb6ba923 100644
--- a/drivers/i2c/busses/i2c-bfin-twi.c
+++ b/drivers/i2c/busses/i2c-bfin-twi.c
@@ -614,6 +614,7 @@ static int i2c_bfin_twi_probe(struct platform_device *pdev)
 	struct i2c_adapter *p_adap;
 	struct resource *res;
 	int rc;
+	unsigned int clkhilow;
 
 	iface = kzalloc(sizeof(struct bfin_twi_iface), GFP_KERNEL);
 	if (!iface) {
@@ -675,10 +676,14 @@ static int i2c_bfin_twi_probe(struct platform_device *pdev)
 	/* Set TWI internal clock as 10MHz */
 	write_CONTROL(iface, ((get_sclk() / 1024 / 1024 + 5) / 10) & 0x7F);
 
+	/*
+	 * We will not end up with a CLKDIV=0 because no one will specify
+	 * 20kHz SCL or less in Kconfig now. (5 * 1024 / 20 = 0x100)
+	 */
+	clkhilow = 5 * 1024 / CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ;
+
 	/* Set Twi interface clock as specified */
-	write_CLKDIV(iface, ((5*1024 / CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ)
-			<< 8) | ((5*1024 / CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ)
-			& 0xFF));
+	write_CLKDIV(iface, (clkhilow << 8) | clkhilow);
 
 	/* Enable TWI */
 	write_CONTROL(iface, read_CONTROL(iface) | TWI_ENA);

From 57a8f32eafa6f36ea3a128e8b13f353c5a3ca9b2 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Tue, 19 May 2009 07:21:58 -0400
Subject: [PATCH 152/162] i2c: Blackfin TWI: fix REPEAT START mode doesn't
 repeat

Avoid rewrite TWI MASTER_CTL reg when issue next message
In i2c repeat transfer mode, byte count of next message should be filled
into part of the TWI MASTER_CTL reg when interrupt MCOMP of last
message transfer is triggered. But, other bits in this reg should
not be touched.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
[ben-linux@fluff.org: shorted subject]
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/i2c-bfin-twi.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/i2c/busses/i2c-bfin-twi.c b/drivers/i2c/busses/i2c-bfin-twi.c
index 77cafb6ba923..4d73ad7b5703 100644
--- a/drivers/i2c/busses/i2c-bfin-twi.c
+++ b/drivers/i2c/busses/i2c-bfin-twi.c
@@ -196,8 +196,6 @@ static void bfin_twi_handle_interrupt(struct bfin_twi_iface *iface)
 			/* remove restart bit and enable master receive */
 			write_MASTER_CTL(iface,
 				read_MASTER_CTL(iface) & ~RSTART);
-			write_MASTER_CTL(iface,
-				read_MASTER_CTL(iface) | MEN | MDIR);
 			SSYNC();
 		} else if (iface->cur_mode == TWI_I2C_MODE_REPEAT &&
 				iface->cur_msg+1 < iface->msg_num) {
@@ -222,18 +220,19 @@ static void bfin_twi_handle_interrupt(struct bfin_twi_iface *iface)
 			}
 
 			if (iface->pmsg[iface->cur_msg].len <= 255)
-				write_MASTER_CTL(iface,
-				iface->pmsg[iface->cur_msg].len << 6);
+					write_MASTER_CTL(iface,
+					(read_MASTER_CTL(iface) &
+					(~(0xff << 6))) |
+				(iface->pmsg[iface->cur_msg].len << 6));
 			else {
-				write_MASTER_CTL(iface, 0xff << 6);
+				write_MASTER_CTL(iface,
+					(read_MASTER_CTL(iface) |
+					(0xff << 6)));
 				iface->manual_stop = 1;
 			}
 			/* remove restart bit and enable master receive */
 			write_MASTER_CTL(iface,
 				read_MASTER_CTL(iface) & ~RSTART);
-			write_MASTER_CTL(iface, read_MASTER_CTL(iface) |
-				MEN | ((iface->read_write == I2C_SMBUS_READ) ?
-				MDIR : 0));
 			SSYNC();
 		} else {
 			iface->result = 1;

From 94327d009e3aa20214e9dfa486a1fd14445fe736 Mon Sep 17 00:00:00 2001
From: Frank Shew <fshew@geometrics.com>
Date: Tue, 19 May 2009 07:23:49 -0400
Subject: [PATCH 153/162] i2c: Blackfin TWI: fix transfer errors with repeat
 start

We have a custom BF537 board with an I2C RTC (MAX DS3231) running
uclinux 2007R1 for some time. Recently during migration to 2008R1.5-RC3
we losted access to the RTC. The RTC driver calls 'i2c_transfer()' which
in turns calls 'bfin_twi_master_xfer()' in i2c-bfin-twi.c.

Compared with 2007R1, it looks like the 2008R1.5 version of i2c-bin-twi.c
has a new mode 'TWI_I2C-MODE_REPEAT' which corresponds to the Repeat Start
Condition described in the HRM. However, according to the HRM, at XMIT or
RECV interrupt and when the data count is 0, not only is the RESTART bit
supposed to be set, but MDIR must also be set if the next operation is a
receive sequence, and cleared if not. Currently there is no code that looks
at the I2C_M_RD bit in the flag from the next cur_msg and set/clear the MDIR
flag accordingly at the same time that the RSTART bit is set. Instead, MDIR
is set or cleared (by OR'ing with 0?) after the RESTART bit has been cleared
during handling of MCOMP interrupt.

It appears that this is causing our failure with reading the RTC, as a
quick patch to set/clear MDIR when RESTART is set seem to solve our problem.

Signed-off-by: Frank Shew <fshew@geometrics.com>
Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
[ben-linux@fluff.org: shorted subject]
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/i2c-bfin-twi.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/i2c/busses/i2c-bfin-twi.c b/drivers/i2c/busses/i2c-bfin-twi.c
index 4d73ad7b5703..40136ea7a46f 100644
--- a/drivers/i2c/busses/i2c-bfin-twi.c
+++ b/drivers/i2c/busses/i2c-bfin-twi.c
@@ -104,9 +104,14 @@ static void bfin_twi_handle_interrupt(struct bfin_twi_iface *iface)
 			write_MASTER_CTL(iface,
 				read_MASTER_CTL(iface) | STOP);
 		else if (iface->cur_mode == TWI_I2C_MODE_REPEAT &&
-				iface->cur_msg+1 < iface->msg_num)
-			write_MASTER_CTL(iface,
-				read_MASTER_CTL(iface) | RSTART);
+		         iface->cur_msg + 1 < iface->msg_num) {
+			if (iface->pmsg[iface->cur_msg + 1].flags & I2C_M_RD)
+				write_MASTER_CTL(iface,
+					read_MASTER_CTL(iface) | RSTART | MDIR);
+			else
+				write_MASTER_CTL(iface,
+					(read_MASTER_CTL(iface) | RSTART) & ~MDIR);
+		}
 		SSYNC();
 		/* Clear status */
 		write_INT_STAT(iface, XMTSERV);
@@ -134,9 +139,13 @@ static void bfin_twi_handle_interrupt(struct bfin_twi_iface *iface)
 				read_MASTER_CTL(iface) | STOP);
 			SSYNC();
 		} else if (iface->cur_mode == TWI_I2C_MODE_REPEAT &&
-				iface->cur_msg+1 < iface->msg_num) {
-			write_MASTER_CTL(iface,
-				read_MASTER_CTL(iface) | RSTART);
+		           iface->cur_msg + 1 < iface->msg_num) {
+			if (iface->pmsg[iface->cur_msg + 1].flags & I2C_M_RD)
+				write_MASTER_CTL(iface,
+					read_MASTER_CTL(iface) | RSTART | MDIR);
+			else
+				write_MASTER_CTL(iface,
+					(read_MASTER_CTL(iface) | RSTART) & ~MDIR);
 			SSYNC();
 		}
 		/* Clear interrupt source */

From e0cd2dd5dd2b7c6512e46ce0b4f119cd7b0c74a4 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Wed, 27 May 2009 09:24:10 +0000
Subject: [PATCH 154/162] i2c: Blackfin TWI: implement I2C_FUNC_SMBUS_I2C_BLOCK
 functionality

Some drivers need i2c_smbus_read_i2c_block_data() functionality, so add
support for it to the Blackfin I2C bus driver.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
[ben-linux@fluff.org: shortened subject]
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/i2c-bfin-twi.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-bfin-twi.c b/drivers/i2c/busses/i2c-bfin-twi.c
index 40136ea7a46f..26d8987e69bf 100644
--- a/drivers/i2c/busses/i2c-bfin-twi.c
+++ b/drivers/i2c/busses/i2c-bfin-twi.c
@@ -449,6 +449,16 @@ int bfin_twi_smbus_xfer(struct i2c_adapter *adap, u16 addr,
 		}
 		iface->transPtr = data->block;
 		break;
+	case I2C_SMBUS_I2C_BLOCK_DATA:
+		if (read_write == I2C_SMBUS_READ) {
+			iface->readNum = data->block[0];
+			iface->cur_mode = TWI_I2C_MODE_COMBINED;
+		} else {
+			iface->writeNum = data->block[0];
+			iface->cur_mode = TWI_I2C_MODE_STANDARDSUB;
+		}
+		iface->transPtr = (u8 *)&data->block[1];
+		break;
 	default:
 		return -1;
 	}
@@ -572,7 +582,7 @@ static u32 bfin_twi_functionality(struct i2c_adapter *adap)
 	return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
 	       I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
 	       I2C_FUNC_SMBUS_BLOCK_DATA | I2C_FUNC_SMBUS_PROC_CALL |
-	       I2C_FUNC_I2C;
+	       I2C_FUNC_I2C | I2C_FUNC_SMBUS_I2C_BLOCK;
 }
 
 static struct i2c_algorithm bfin_twi_algorithm = {

From baf46b4e378d7950dff7ba30cfd50ff585987cb4 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Wed, 27 May 2009 17:54:45 +0300
Subject: [PATCH 155/162] i2c: OMAP2/3: Fix scll/sclh calculations

Fix scll/sclh calculations for HS and fast modes. Currently the driver
uses equal (roughly) low/high times which will result in too short
low time.

OMAP3430 TRM gives the following equations:

	F/S: tLow  = (scll + 7) * internal_clk
	     tHigh = (sclh + 5) * internal_clk
	HS:  tLow  = (scll + 7) * fclk
	     tHigh = (sclh + 5) * fclk

Furthermore, the I2C specification sets the following minimum values
for HS tLow/tHigh for capacitive bus loads 100 pF (maximum speed 3400)
and 400 pF (maximum speed 1700):

	speed	tLow		tHigh
	3400	160 ns		60 ns
	1700	320 ns		120 ns

and for F/S:

	speed	tLow		tHigh
	400	1300 ns		600 ns
	100	4700 ns		4000 ns

By using duty cycles 33/66 (HS, F) and 50/50 (S) we stay above these
minimum values.

Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/i2c-omap.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
index ece0125a1ee5..a879e4bf5122 100644
--- a/drivers/i2c/busses/i2c-omap.c
+++ b/drivers/i2c/busses/i2c-omap.c
@@ -343,17 +343,28 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
 
 		/* If configured for High Speed */
 		if (dev->speed > 400) {
+			unsigned long scl;
+
 			/* For first phase of HS mode */
-			fsscll = internal_clk / (400 * 2) - 6;
-			fssclh = internal_clk / (400 * 2) - 6;
+			scl = internal_clk / 400;
+			fsscll = scl - (scl / 3) - 7;
+			fssclh = (scl / 3) - 5;
 
 			/* For second phase of HS mode */
-			hsscll = fclk_rate / (dev->speed * 2) - 6;
-			hssclh = fclk_rate / (dev->speed * 2) - 6;
+			scl = fclk_rate / dev->speed;
+			hsscll = scl - (scl / 3) - 7;
+			hssclh = (scl / 3) - 5;
+		} else if (dev->speed > 100) {
+			unsigned long scl;
+
+			/* Fast mode */
+			scl = internal_clk / dev->speed;
+			fsscll = scl - (scl / 3) - 7;
+			fssclh = (scl / 3) - 5;
 		} else {
-			/* To handle F/S modes */
-			fsscll = internal_clk / (dev->speed * 2) - 6;
-			fssclh = internal_clk / (dev->speed * 2) - 6;
+			/* Standard mode */
+			fsscll = internal_clk / (dev->speed * 2) - 7;
+			fssclh = internal_clk / (dev->speed * 2) - 5;
 		}
 		scll = (hsscll << OMAP_I2C_SCLL_HSSCLL) | fsscll;
 		sclh = (hssclh << OMAP_I2C_SCLH_HSSCLH) | fssclh;

From 84bf2c868f3ca996e5bbd3beb2ef502f457140f3 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Wed, 27 May 2009 17:54:46 +0300
Subject: [PATCH 156/162] i2c: OMAP3: Better noise suppression for
 fast/standard modes

Use longer noise filter period for fast and standard mode. Based on an
earlier patch by Eero Nurkkala.

Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/i2c-omap.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
index a879e4bf5122..c73475dd0fba 100644
--- a/drivers/i2c/busses/i2c-omap.c
+++ b/drivers/i2c/busses/i2c-omap.c
@@ -333,8 +333,18 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
 
 	if (cpu_is_omap2430() || cpu_is_omap34xx()) {
 
-		/* HSI2C controller internal clk rate should be 19.2 Mhz */
-		internal_clk = 19200;
+		/*
+		 * HSI2C controller internal clk rate should be 19.2 Mhz for
+		 * HS and for all modes on 2430. On 34xx we can use lower rate
+		 * to get longer filter period for better noise suppression.
+		 * The filter is iclk (fclk for HS) period.
+		 */
+		if (dev->speed > 400 || cpu_is_omap_2430())
+			internal_clk = 19200;
+		else if (dev->speed > 100)
+			internal_clk = 9600;
+		else
+			internal_clk = 4000;
 		fclk_rate = clk_get_rate(dev->fclk) / 1000;
 
 		/* Compute prescaler divisor */

From 7d85ccd816535f56880f7dfdb4de056794376b2c Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Fri, 12 Jun 2009 10:45:29 +0100
Subject: [PATCH 157/162] i2c-s3c2410: move to using platform idtable to match
 devices

Change to using platform id table to match either of the two supported
platform device names in the driver. This simplifies the driver init and
exit code

Note, log messages will now be prefixed with 's3c-i2c' instead of the
driver name, so output will be of the form of:

s3c-i2c s3c2440-i2c.0: slave address 0x10

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 drivers/i2c/busses/i2c-s3c2410.c | 48 +++++++++++++++-----------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c
index 1691ef0f1ee1..079a312d36fd 100644
--- a/drivers/i2c/busses/i2c-s3c2410.c
+++ b/drivers/i2c/busses/i2c-s3c2410.c
@@ -51,6 +51,11 @@ enum s3c24xx_i2c_state {
 	STATE_STOP
 };
 
+enum s3c24xx_i2c_type {
+	TYPE_S3C2410,
+	TYPE_S3C2440,
+};
+
 struct s3c24xx_i2c {
 	spinlock_t		lock;
 	wait_queue_head_t	wait;
@@ -88,8 +93,10 @@ struct s3c24xx_i2c {
 static inline int s3c24xx_i2c_is2440(struct s3c24xx_i2c *i2c)
 {
 	struct platform_device *pdev = to_platform_device(i2c->dev);
+	enum s3c24xx_i2c_type type;
 
-	return !strcmp(pdev->name, "s3c2440-i2c");
+	type = platform_get_device_id(pdev)->driver_data;
+	return type == TYPE_S3C2440;
 }
 
 /* s3c24xx_i2c_master_complete
@@ -969,52 +976,41 @@ static int s3c24xx_i2c_resume(struct platform_device *dev)
 
 /* device driver for platform bus bits */
 
-static struct platform_driver s3c2410_i2c_driver = {
-	.probe		= s3c24xx_i2c_probe,
-	.remove		= s3c24xx_i2c_remove,
-	.suspend_late	= s3c24xx_i2c_suspend_late,
-	.resume		= s3c24xx_i2c_resume,
-	.driver		= {
-		.owner	= THIS_MODULE,
-		.name	= "s3c2410-i2c",
-	},
+static struct platform_device_id s3c24xx_driver_ids[] = {
+	{
+		.name		= "s3c2410-i2c",
+		.driver_data	= TYPE_S3C2410,
+	}, {
+		.name		= "s3c2440-i2c",
+		.driver_data	= TYPE_S3C2440,
+	}, { },
 };
+MODULE_DEVICE_TABLE(platform, s3c24xx_driver_ids);
 
-static struct platform_driver s3c2440_i2c_driver = {
+static struct platform_driver s3c24xx_i2c_driver = {
 	.probe		= s3c24xx_i2c_probe,
 	.remove		= s3c24xx_i2c_remove,
 	.suspend_late	= s3c24xx_i2c_suspend_late,
 	.resume		= s3c24xx_i2c_resume,
+	.id_table	= s3c24xx_driver_ids,
 	.driver		= {
 		.owner	= THIS_MODULE,
-		.name	= "s3c2440-i2c",
+		.name	= "s3c-i2c",
 	},
 };
 
 static int __init i2c_adap_s3c_init(void)
 {
-	int ret;
-
-	ret = platform_driver_register(&s3c2410_i2c_driver);
-	if (ret == 0) {
-		ret = platform_driver_register(&s3c2440_i2c_driver);
-		if (ret)
-			platform_driver_unregister(&s3c2410_i2c_driver);
-	}
-
-	return ret;
+	return platform_driver_register(&s3c24xx_i2c_driver);
 }
 subsys_initcall(i2c_adap_s3c_init);
 
 static void __exit i2c_adap_s3c_exit(void)
 {
-	platform_driver_unregister(&s3c2410_i2c_driver);
-	platform_driver_unregister(&s3c2440_i2c_driver);
+	platform_driver_unregister(&s3c24xx_i2c_driver);
 }
 module_exit(i2c_adap_s3c_exit);
 
 MODULE_DESCRIPTION("S3C24XX I2C Bus driver");
 MODULE_AUTHOR("Ben Dooks, <ben@simtec.co.uk>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:s3c2410-i2c");
-MODULE_ALIAS("platform:s3c2440-i2c");

From dd14be4c274fc484eccace03ae9726e516630331 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Richard=20R=C3=B6jfors?=
 <richard.rojfors.ext@mocean-labs.com>
Date: Fri, 5 Jun 2009 15:40:32 +0200
Subject: [PATCH 158/162] i2c-ocores: Can add I2C devices to the bus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is sometimes a need for the ocores driver to add devices to the
bus when installed.

i2c_register_board_info can not always be used, because the I2C devices
 are not known at an early state, they could for instance be connected
 on a I2C bus on a PCI device which has the Open Cores IP.

i2c_new_device can not be used in all cases either since the resulting
bus nummer might be unknown.

The solution is the pass a list of I2C devices in the platform data to
the Open Cores driver. This is useful for MFD drivers.

Signed-off-by: Richard Röjfors <richard.rojfors.ext@mocean-labs.com>
Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 Documentation/i2c/busses/i2c-ocores | 17 +++++++++++++++++
 drivers/i2c/busses/i2c-ocores.c     |  5 +++++
 include/linux/i2c-ocores.h          |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores
index cfcebb10d14e..c269aaa2f26a 100644
--- a/Documentation/i2c/busses/i2c-ocores
+++ b/Documentation/i2c/busses/i2c-ocores
@@ -20,6 +20,8 @@ platform_device with the base address and interrupt number. The
 dev.platform_data of the device should also point to a struct
 ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the
 distance between registers and the input clock speed.
+There is also a possibility to attach a list of i2c_board_info which
+the i2c-ocores driver will add to the bus upon creation.
 
 E.G. something like:
 
@@ -36,9 +38,24 @@ static struct resource ocores_resources[] = {
 	},
 };
 
+/* optional board info */
+struct i2c_board_info ocores_i2c_board_info[] = {
+	{
+		I2C_BOARD_INFO("tsc2003", 0x48),
+		.platform_data = &tsc2003_platform_data,
+		.irq = TSC_IRQ
+	},
+	{
+		I2C_BOARD_INFO("adv7180", 0x42 >> 1),
+		.irq = ADV_IRQ
+	}
+};
+
 static struct ocores_i2c_platform_data myi2c_data = {
 	.regstep	= 2,		/* two bytes between registers */
 	.clock_khz	= 50000,	/* input clock of 50MHz */
+	.devices	= ocores_i2c_board_info, /* optional table of devices */
+	.num_devices	= ARRAY_SIZE(ocores_i2c_board_info), /* table size */
 };
 
 static struct platform_device myi2c = {
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index e5193bf75483..3542c6ba98f1 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -216,6 +216,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev)
 	struct ocores_i2c_platform_data *pdata;
 	struct resource *res, *res2;
 	int ret;
+	int i;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res)
@@ -271,6 +272,10 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev)
 		goto add_adapter_failed;
 	}
 
+	/* add in known devices to the bus */
+	for (i = 0; i < pdata->num_devices; i++)
+		i2c_new_device(&i2c->adap, pdata->devices + i);
+
 	return 0;
 
 add_adapter_failed:
diff --git a/include/linux/i2c-ocores.h b/include/linux/i2c-ocores.h
index 8ed591b0887e..4d5e57ff6614 100644
--- a/include/linux/i2c-ocores.h
+++ b/include/linux/i2c-ocores.h
@@ -14,6 +14,8 @@
 struct ocores_i2c_platform_data {
 	u32 regstep;   /* distance between registers */
 	u32 clock_khz; /* input clock in kHz */
+	u8 num_devices; /* number of devices in the devices list */
+	struct i2c_board_info const *devices; /* devices connected to the bus */
 };
 
 #endif /* _LINUX_I2C_OCORES_H */

From bb6e647051a59dca5a72b3deef1e061d7c1c34da Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Date: Wed, 3 Jun 2009 14:29:16 +0200
Subject: [PATCH 159/162] avr32: Fix oops on unaligned user access

The unaligned address exception handler (and others) does not scan the
fixup tables before oopsing. This is bad because it means passing a
badly aligned pointer from user space might crash the kernel.

Fix this by scanning the fixup tables in _exception(). This should
resolve the issue for unaligned addresses as well as other less common
exceptions that might be happening during a userspace access. The page
fault handler already does fixup processing.

Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
---
 arch/avr32/kernel/traps.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index d547c8df157d..6e3d491184ea 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -75,8 +75,17 @@ void _exception(long signr, struct pt_regs *regs, int code,
 {
 	siginfo_t info;
 
-	if (!user_mode(regs))
+	if (!user_mode(regs)) {
+		const struct exception_table_entry *fixup;
+
+		/* Are we prepared to handle this kernel fault? */
+		fixup = search_exception_tables(regs->pc);
+		if (fixup) {
+			regs->pc = fixup->fixup;
+			return;
+		}
 		die("Unhandled exception in kernel mode", regs, signr);
+	}
 
 	memset(&info, 0, sizeof(info));
 	info.si_signo = signr;

From faea56c9bb44f539da1ae0194184873fc2720b20 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 12 Jun 2009 11:43:48 -0700
Subject: [PATCH 160/162] [SCSI] cnic: fix undefined reference to
 `ip6_route_output'

Fix cnic build for case of CONFIG_INET=n.
Fix cnic build for case of CONFIG_IPV6=m and CONFIG_CNIC=y.

Fixes these build errors:

cnic.c:(.text+0x236a1d): undefined reference to `ip_route_output_key'
cnic.c:(.text+0x15a8e8): undefined reference to `ip6_route_output'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/net/cnic.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index 8d740376bbd2..a9e2fd35bb41 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -1454,6 +1454,7 @@ static inline u16 cnic_get_vlan(struct net_device *dev,
 static int cnic_get_v4_route(struct sockaddr_in *dst_addr,
 			     struct dst_entry **dst)
 {
+#if defined(CONFIG_INET)
 	struct flowi fl;
 	int err;
 	struct rtable *rt;
@@ -1465,12 +1466,15 @@ static int cnic_get_v4_route(struct sockaddr_in *dst_addr,
 	if (!err)
 		*dst = &rt->u.dst;
 	return err;
+#else
+	return -ENETUNREACH;
+#endif
 }
 
 static int cnic_get_v6_route(struct sockaddr_in6 *dst_addr,
 			     struct dst_entry **dst)
 {
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if defined(CONFIG_IPV6) || (defined(CONFIG_IPV6_MODULE) && defined(MODULE))
 	struct flowi fl;
 
 	memset(&fl, 0, sizeof(fl));
@@ -1550,7 +1554,7 @@ static int cnic_get_route(struct cnic_sock *csk, struct cnic_sockaddr *saddr)
 	clear_bit(SK_F_IPV6, &csk->flags);
 
 	if (is_v6) {
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if defined(CONFIG_IPV6) || (defined(CONFIG_IPV6_MODULE) && defined(MODULE))
 		set_bit(SK_F_IPV6, &csk->flags);
 		err = cnic_get_v6_route(&saddr->remote.v6, &dst);
 		if (err)

From bc3bf8fd330ce981ce632a1a4a283eee46838f32 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Jun 2009 08:29:33 +0200
Subject: [PATCH 161/162] =?UTF-8?q?[SCSI]=20cnic:=20fix=20error:=20implici?=
 =?UTF-8?q?t=20declaration=20of=20function=20=E2=80=98=5F=5Fsymbol=5Fget?=
 =?UTF-8?q?=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 drivers/net/cnic.c: In function ‘init_bnx2_cnic’:
 drivers/net/cnic.c:2520: error: implicit declaration of function ‘__symbol_get’
 drivers/net/cnic.c:2520: warning: assignment makes pointer from integer without a cast
 make[1]: *** [drivers/net/cnic.o] Error 1
 make: *** [drivers/net/cnic.o] Error 2

Caused by not including linux/module.h

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 drivers/net/cnic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index a9e2fd35bb41..44f77eb1180f 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -25,6 +25,8 @@
 #include <linux/delay.h>
 #include <linux/ethtool.h>
 #include <linux/if_vlan.h>
+#include <linux/module.h>
+
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
 #define BCM_VLAN 1
 #endif

From 5c55b40b27bc3249358dcfc86c0845be409ab7a6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 13 Jun 2009 13:23:54 +0100
Subject: [PATCH 162/162] FRV: Fix interaction with new generic header stuff

Fix interaction with new generic header stuff as added by:

	commit 6103ec56c65c33774c7c38652c8204120c3c7519
	Author: Arnd Bergmann <arnd@arndb.de>
	Date:   Wed May 13 22:56:27 2009 +0000

	    asm-generic: add generic ABI headers

The problem is that asm/signal.h has been made to include asm-generic/signal.h,
but the redundant stuff from asm/signal.h has not been discarded, leading to
multiple redefinitions.

Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/include/asm/signal.h | 123 +---------------------------------
 1 file changed, 3 insertions(+), 120 deletions(-)

diff --git a/arch/frv/include/asm/signal.h b/arch/frv/include/asm/signal.h
index 2079197d483d..f071e813dcb3 100644
--- a/arch/frv/include/asm/signal.h
+++ b/arch/frv/include/asm/signal.h
@@ -3,107 +3,15 @@
 
 #include <linux/types.h>
 
-/* Avoid too many header ordering problems.  */
-struct siginfo;
-
-#ifdef __KERNEL__
-/* Most things should be clean enough to redefine this at will, if care
-   is taken to make libc match.  */
-
-#define _NSIG		64
-#define _NSIG_BPW	32
-#define _NSIG_WORDS	(_NSIG / _NSIG_BPW)
-
-typedef unsigned long old_sigset_t;		/* at least 32 bits */
-
-typedef struct {
-	unsigned long sig[_NSIG_WORDS];
-} sigset_t;
-
-#else
+#ifndef __KERNEL__
 /* Here we must cater to libcs that poke about in kernel headers.  */
 
 #define NSIG		32
 typedef unsigned long sigset_t;
 
-#endif /* __KERNEL__ */
+#endif /* !__KERNEL__ */
 
-#define SIGHUP		 1
-#define SIGINT		 2
-#define SIGQUIT		 3
-#define SIGILL		 4
-#define SIGTRAP		 5
-#define SIGABRT		 6
-#define SIGIOT		 6
-#define SIGBUS		 7
-#define SIGFPE		 8
-#define SIGKILL		 9
-#define SIGUSR1		10
-#define SIGSEGV		11
-#define SIGUSR2		12
-#define SIGPIPE		13
-#define SIGALRM		14
-#define SIGTERM		15
-#define SIGSTKFLT	16
-#define SIGCHLD		17
-#define SIGCONT		18
-#define SIGSTOP		19
-#define SIGTSTP		20
-#define SIGTTIN		21
-#define SIGTTOU		22
-#define SIGURG		23
-#define SIGXCPU		24
-#define SIGXFSZ		25
-#define SIGVTALRM	26
-#define SIGPROF		27
-#define SIGWINCH	28
-#define SIGIO		29
-#define SIGPOLL		SIGIO
-/*
-#define SIGLOST		29
-*/
-#define SIGPWR		30
-#define SIGSYS		31
-#define	SIGUNUSED	31
-
-/* These should not be considered constants from userland.  */
-#define SIGRTMIN	32
-#define SIGRTMAX	(_NSIG-1)
-
-/*
- * SA_FLAGS values:
- *
- * SA_ONSTACK indicates that a registered stack_t will be used.
- * SA_RESTART flag to get restarting signals (which were the default long ago)
- * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
- * SA_RESETHAND clears the handler when the signal is delivered.
- * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
- * SA_NODEFER prevents the current signal from being masked in the handler.
- *
- * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
- * Unix names RESETHAND and NODEFER respectively.
- */
-#define SA_NOCLDSTOP	0x00000001
-#define SA_NOCLDWAIT	0x00000002 /* not supported yet */
-#define SA_SIGINFO	0x00000004
-#define SA_ONSTACK	0x08000000
-#define SA_RESTART	0x10000000
-#define SA_NODEFER	0x40000000
-#define SA_RESETHAND	0x80000000
-
-#define SA_NOMASK	SA_NODEFER
-#define SA_ONESHOT	SA_RESETHAND
-
-#define SA_RESTORER	0x04000000
-
-/*
- * sigaltstack controls
- */
-#define SS_ONSTACK	1
-#define SS_DISABLE	2
-
-#define MINSIGSTKSZ	2048
-#define SIGSTKSZ	8192
+#define SA_RESTORER	0x04000000 /* to get struct sigaction correct */
 
 #include <asm-generic/signal.h>
 
@@ -115,16 +23,6 @@ struct old_sigaction {
 	__sigrestore_t sa_restorer;
 };
 
-struct sigaction {
-	__sighandler_t sa_handler;
-	unsigned long sa_flags;
-	__sigrestore_t sa_restorer;
-	sigset_t sa_mask;		/* mask last for extensibility */
-};
-
-struct k_sigaction {
-	struct sigaction sa;
-};
 #else
 /* Here we must cater to libcs that poke about in kernel headers.  */
 
@@ -143,19 +41,4 @@ struct sigaction {
 
 #endif /* __KERNEL__ */
 
-typedef struct sigaltstack {
-	void __user *ss_sp;
-	int ss_flags;
-	size_t ss_size;
-} stack_t;
-
-#define ptrace_signal_deliver(regs, cookie) do { } while (0)
-
-#ifdef __KERNEL__
-
-#include <asm/sigcontext.h>
-#undef __HAVE_ARCH_SIG_BITOPS
-
-#endif /* __KERNEL__ */
-
 #endif /* _ASM_SIGNAL_H */