From 1c1452be2e9ae282a7316c3b23987811bd7acda6 Mon Sep 17 00:00:00 2001 From: Jonas Larsson Date: Tue, 31 Mar 2009 11:16:48 +0200 Subject: [PATCH 001/162] atmel-mci: Add support for inverted detect pin Same patch as before, modified to use bool. Also adds description of the new field in struct atmel_mci that I missed in the first patch. This patch adds Atmel MCI support for inverted detect pins. Signed-off-by: Jonas Larsson Acked-by: Pierre Ossman Signed-off-by: Haavard Skinnemoen --- drivers/mmc/host/atmel-mci.c | 12 +++++++++--- include/linux/atmel-mci.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index cf6a100bb38f..7b603e4b41db 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -177,6 +177,7 @@ struct atmel_mci { * available. * @wp_pin: GPIO pin used for card write protect sending, or negative * if not available. + * @detect_is_active_high: The state of the detect pin when it is active. * @detect_timer: Timer used for debouncing @detect_pin interrupts. */ struct atmel_mci_slot { @@ -196,6 +197,7 @@ struct atmel_mci_slot { int detect_pin; int wp_pin; + bool detect_is_active_high; struct timer_list detect_timer; }; @@ -924,7 +926,8 @@ static int atmci_get_cd(struct mmc_host *mmc) struct atmel_mci_slot *slot = mmc_priv(mmc); if (gpio_is_valid(slot->detect_pin)) { - present = !gpio_get_value(slot->detect_pin); + present = !(gpio_get_value(slot->detect_pin) ^ + slot->detect_is_active_high); dev_dbg(&mmc->class_dev, "card is %spresent\n", present ? "" : "not "); } @@ -1028,7 +1031,8 @@ static void atmci_detect_change(unsigned long data) return; enable_irq(gpio_to_irq(slot->detect_pin)); - present = !gpio_get_value(slot->detect_pin); + present = !(gpio_get_value(slot->detect_pin) ^ + slot->detect_is_active_high); present_old = test_bit(ATMCI_CARD_PRESENT, &slot->flags); dev_vdbg(&slot->mmc->class_dev, "detect change: %d (was %d)\n", @@ -1456,6 +1460,7 @@ static int __init atmci_init_slot(struct atmel_mci *host, slot->host = host; slot->detect_pin = slot_data->detect_pin; slot->wp_pin = slot_data->wp_pin; + slot->detect_is_active_high = slot_data->detect_is_active_high; slot->sdc_reg = sdc_reg; mmc->ops = &atmci_ops; @@ -1477,7 +1482,8 @@ static int __init atmci_init_slot(struct atmel_mci *host, if (gpio_request(slot->detect_pin, "mmc_detect")) { dev_dbg(&mmc->class_dev, "no detect pin available\n"); slot->detect_pin = -EBUSY; - } else if (gpio_get_value(slot->detect_pin)) { + } else if (gpio_get_value(slot->detect_pin) ^ + slot->detect_is_active_high) { clear_bit(ATMCI_CARD_PRESENT, &slot->flags); } } diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index 2f1f95737acb..57b1846a3c87 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -10,6 +10,7 @@ * @bus_width: Number of data lines wired up the slot * @detect_pin: GPIO pin wired to the card detect switch * @wp_pin: GPIO pin wired to the write protect sensor + * @detect_is_active_high: The state of the detect pin when it is active * * If a given slot is not present on the board, @bus_width should be * set to 0. The other fields are ignored in this case. @@ -24,6 +25,7 @@ struct mci_slot_pdata { unsigned int bus_width; int detect_pin; int wp_pin; + bool detect_is_active_high; }; /** From 7ebcfcf19723254ebe90cdf8718436b9955a9a01 Mon Sep 17 00:00:00 2001 From: Jonas Larsson Date: Tue, 31 Mar 2009 11:16:52 +0200 Subject: [PATCH 002/162] avr32: Solves problem with inverted MCI detect pin on Merisc board Same patch as before, modified to use bool. This patch solves the problem with the inverted mci detect pin on Merisc boards. Signed-off-by: Jonas Larsson Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/merisc/setup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c index 20b300cf105a..623b077594fc 100644 --- a/arch/avr32/boards/merisc/setup.c +++ b/arch/avr32/boards/merisc/setup.c @@ -94,9 +94,10 @@ static struct spi_board_info __initdata spi0_board_info[] = { static struct mci_platform_data __initdata mci0_data = { .slot[0] = { - .bus_width = 4, - .detect_pin = GPIO_PIN_PE(19), - .wp_pin = GPIO_PIN_PE(20), + .bus_width = 4, + .detect_pin = GPIO_PIN_PE(19), + .wp_pin = GPIO_PIN_PE(20), + .detect_is_active_high = true, }, }; From f25181f598cf4a8ccc40a51d8b74f8b555ecddee Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 23 Apr 2009 22:18:00 -0400 Subject: [PATCH 003/162] xfs_file_last_byte() needs to acquire ilock We had some systems crash with this stack: [] ia64_leave_kernel+0x0/0x280 [] xfs_bmbt_get_startoff+0x0/0x20 [xfs] [] xfs_bmap_last_offset+0x210/0x280 [xfs] [] xfs_file_last_byte+0x70/0x1a0 [xfs] [] xfs_itruncate_start+0xc0/0x1a0 [xfs] [] xfs_inactive_free_eofblocks+0x290/0x460 [xfs] [] xfs_release+0x1b0/0x240 [xfs] [] xfs_file_release+0x70/0xa0 [xfs] [] __fput+0x1a0/0x420 [] fput+0x40/0x60 The problem here is that xfs_file_last_byte() does not acquire the inode lock and can therefore race with another thread that is modifying the extext list. While xfs_bmap_last_offset() is trying to lookup what was the last extent some extents were merged and the extent list shrunk so the index we lookup is now beyond the end of the extent list and potentially in a freed buffer. Signed-off-by: Lachlan McIlroy Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e7ae08d1df48..123b20c8cbf2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1258,8 +1258,10 @@ xfs_file_last_byte( * necessary. */ if (ip->i_df.if_flags & XFS_IFEXTENTS) { + xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) { last_block = 0; } From 2ac00af7a6d2e65013e6f28bd1f37c0cd98ba134 Mon Sep 17 00:00:00 2001 From: Olaf Weber Date: Fri, 17 Apr 2009 16:12:45 -0500 Subject: [PATCH 004/162] xfs: add more checks to superblock validation There had been reports where xfs filesystem was randomly corrupted with fsfuzzer, and xfs failed to handle it gracefully. This patch fixes couple of reported problem by providing additional checks in the superblock validation routine. Signed-off-by: Olaf Weber Reviewed-by: Josef 'Jeff' Sipek Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/xfs_mount.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b101990df027..65a99725d0cc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -291,14 +291,17 @@ xfs_mount_validate_sb( sbp->sb_sectsize > XFS_MAX_SECTORSIZE || sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || From 4be4a00fb55879ef44b5914c189aecffa828deb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Apr 2009 10:50:48 -0400 Subject: [PATCH 005/162] xfs: a couple getbmap cleanups - reshuffle various conditionals for data vs attr fork to make the code more readable - do fine-grainded goto-based error handling - exit early from conditionals instead of keeping a long else branch around - allow kmem_alloc to fail Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_bmap.c | 164 ++++++++++++++++++++++------------------------ 1 file changed, 80 insertions(+), 84 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3a6ed426327a..abe42448b1c0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5880,7 +5880,7 @@ xfs_getbmap( void *arg) /* formatter arg */ { __int64_t bmvend; /* last block requested */ - int error; /* return value */ + int error = 0; /* return value */ __int64_t fixlen; /* length for -1 case */ int i; /* extent number */ int lock; /* lock state */ @@ -5899,30 +5899,8 @@ xfs_getbmap( mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not - * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ - * bit is set for the file, generate a read event in order - * that the DMAPI application may do its thing before we return - * the extents. Usually this means restoring user file data to - * regions of the file that look like holes. - * - * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify - * BMV_IF_NO_DMAPI_READ so that read events are generated. - * If this were not true, callers of ioctl( XFS_IOC_GETBMAP ) - * could misinterpret holes in a DMAPI file as true holes, - * when in fact they may represent offline user data. - */ - if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_READ) && - whichfork == XFS_DATA_FORK) { - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); - if (error) - return XFS_ERROR(error); - } - if (whichfork == XFS_ATTR_FORK) { if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && @@ -5936,11 +5914,37 @@ xfs_getbmap( ip->i_mount); return XFS_ERROR(EFSCORRUPTED); } - } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - if (whichfork == XFS_DATA_FORK) { + + prealloced = 0; + fixlen = 1LL << 32; + } else { + /* + * If the BMV_IF_NO_DMAPI_READ interface bit specified, do + * not generate a DMAPI read event. Otherwise, if the + * DM_EVENT_READ bit is set for the file, generate a read + * event in order that the DMAPI application may do its thing + * before we return the extents. Usually this means restoring + * user file data to regions of the file that look like holes. + * + * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify + * BMV_IF_NO_DMAPI_READ so that read events are generated. + * If this were not true, callers of ioctl(XFS_IOC_GETBMAP) + * could misinterpret holes in a DMAPI file as true holes, + * when in fact they may represent offline user data. + */ + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && + !(iflags & BMV_IF_NO_DMAPI_READ)) { + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, + 0, 0, 0, NULL); + if (error) + return XFS_ERROR(error); + } + + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; @@ -5949,42 +5953,34 @@ xfs_getbmap( prealloced = 0; fixlen = ip->i_size; } - } else { - prealloced = 0; - fixlen = 1LL << 32; } if (bmv->bmv_length == -1) { fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset), - (__int64_t)0); - } else if (bmv->bmv_length < 0) - return XFS_ERROR(EINVAL); - if (bmv->bmv_length == 0) { + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { bmv->bmv_entries = 0; return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); } + nex = bmv->bmv_count - 1; if (nex <= 0) return XFS_ERROR(EINVAL); bmvend = bmv->bmv_offset + bmv->bmv_length; xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (((iflags & BMV_IF_DELALLOC) == 0) && - (whichfork == XFS_DATA_FORK) && - (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { - /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ - error = xfs_flush_pages(ip, (xfs_off_t)0, - -1, 0, FI_REMAPF); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; + if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { + if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + if (error) + goto out_unlock_iolock; } - } - ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) || - ip->i_delayed_blks == 0); + ASSERT(ip->i_delayed_blks == 0); + } lock = xfs_ilock_map_shared(ip); @@ -5995,23 +5991,25 @@ xfs_getbmap( if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - bmapi_flags = xfs_bmapi_aflag(whichfork) | - ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; /* * Allocate enough space to handle "subnex" maps at a time. */ + error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + if (!map) + goto out_unlock_ilock; bmv->bmv_entries = 0; - if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) { - if (((iflags & BMV_IF_DELALLOC) == 0) || - whichfork == XFS_ATTR_FORK) { - error = 0; - goto unlock_and_return; - } + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; } nexleft = nex; @@ -6023,10 +6021,12 @@ xfs_getbmap( bmapi_flags, NULL, 0, map, &nmap, NULL, NULL); if (error) - goto unlock_and_return; + goto out_free_map; ASSERT(nmap <= subnex); for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + int full = 0; /* user array is full */ + out.bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out.bmv_oflags |= BMV_OF_PREALLOC; @@ -6041,36 +6041,32 @@ xfs_getbmap( whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ out.bmv_oflags |= BMV_OF_LAST; - goto unlock_and_return; - } else { - int full = 0; /* user array is full */ - - if (!xfs_getbmapx_fix_eof_hole(ip, &out, - prealloced, bmvend, - map[i].br_startblock)) { - goto unlock_and_return; - } - - /* format results & advance arg */ - error = formatter(&arg, &out, &full); - if (error || full) - goto unlock_and_return; - nexleft--; - bmv->bmv_offset = - out.bmv_offset + out.bmv_length; - bmv->bmv_length = MAX((__int64_t)0, - (__int64_t)(bmvend - bmv->bmv_offset)); - bmv->bmv_entries++; + goto out_free_map; } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced, + bmvend, map[i].br_startblock)) + goto out_free_map; + + /* format results & advance arg */ + error = formatter(&arg, &out, &full); + if (error || full) + goto out_free_map; + nexleft--; + bmv->bmv_offset = + out.bmv_offset + out.bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + bmv->bmv_entries++; } } while (nmap && nexleft && bmv->bmv_length); -unlock_and_return: - xfs_iunlock_map_shared(ip, lock); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - + out_free_map: kmem_free(map); - + out_unlock_ilock: + xfs_iunlock_map_shared(ip, lock); + out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return error; } From 6321e3ed2acf3ee9643cdd403e1c88605d7944ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Feb 2009 08:39:02 -0500 Subject: [PATCH 006/162] xfs: fix getbmap vs mmap deadlock xfs_getbmap (or rather the formatters called by it) copy out the getbmap structures under the ilock, which can deadlock against mmap. This has been reported via bugzilla a while ago (#717) and has recently also shown up via lockdep. So allocate a temporary buffer to format the kernel getbmap structures into and then copy them out after dropping the locks. A little problem with this is that we limit the number of extents we can copy out by the maximum allocation size, but I see no real way around that. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_bmap.c | 52 +++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index abe42448b1c0..ca7c6005a487 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5890,12 +5890,13 @@ xfs_getbmap( int nexleft; /* # of user extents left */ int subnex; /* # of bmapi's can do */ int nmap; /* number of map entries */ - struct getbmapx out; /* output structure */ + struct getbmapx *out; /* output structure */ int whichfork; /* data or attr fork */ int prealloced; /* this is a file with * preallocated data space */ int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; mp = ip->i_mount; iflags = bmv->bmv_iflags; @@ -5971,6 +5972,13 @@ xfs_getbmap( return XFS_ERROR(EINVAL); bmvend = bmv->bmv_offset + bmv->bmv_length; + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); + if (!out) + return XFS_ERROR(ENOMEM); + xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { @@ -6025,39 +6033,39 @@ xfs_getbmap( ASSERT(nmap <= subnex); for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - int full = 0; /* user array is full */ - - out.bmv_oflags = 0; + out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) - out.bmv_oflags |= BMV_OF_PREALLOC; + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; else if (map[i].br_startblock == DELAYSTARTBLOCK) - out.bmv_oflags |= BMV_OF_DELALLOC; - out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); - out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out.bmv_unused1 = out.bmv_unused2 = 0; + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || (map[i].br_startblock != DELAYSTARTBLOCK)); if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ - out.bmv_oflags |= BMV_OF_LAST; + out[cur_ext].bmv_oflags |= BMV_OF_LAST; goto out_free_map; } - if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced, - bmvend, map[i].br_startblock)) + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) goto out_free_map; - /* format results & advance arg */ - error = formatter(&arg, &out, &full); - if (error || full) - goto out_free_map; nexleft--; bmv->bmv_offset = - out.bmv_offset + out.bmv_length; + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset); bmv->bmv_entries++; + cur_ext++; } } while (nmap && nexleft && bmv->bmv_length); @@ -6067,6 +6075,16 @@ xfs_getbmap( xfs_iunlock_map_shared(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + return error; } From ec91d1335f478c5cd089d82ffbf936075c5f24c8 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Thu, 7 May 2009 19:49:45 -0500 Subject: [PATCH 007/162] xfs: fix double unlock in xfs_swap_extents() Regreesion from commit ef8f7fc, which rearranged the code in xfs_swap_extents() leading to double unlock of xfs inode ilock. That resulted in xfs_fsr deadlocking itself on platforms, which don't handle double unlock of rw_semaphore nicely. It caused the count go negative, which represents the write holder, without really having one. ia64 is one of the platforms where deadlock was easily reproduced and the fix was tested. Signed-off-by: Eric Sandeen Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e6d839bddbf0..7465f9ee125f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -347,13 +347,15 @@ xfs_swap_extents( error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); out: kmem_free(tempifp); return error; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + out_trans_cancel: xfs_trans_cancel(tp, 0); goto out_unlock; From 334034c132017fa662716b70591b2297ed7f238c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2009 13:37:11 -0700 Subject: [PATCH 008/162] avr32: remove obsolete hw_interrupt_type The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have been kept around for migration reasons. After more than two years it's time to remove them finally. This patch cleans up one of the remaining users. When all such patches hit mainline we can remove the defines and typedefs finally. Impact: cleanup Convert the last remaining users to struct irq_chip and remove the define. Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Haavard Skinnemoen --- arch/avr32/include/asm/hw_irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/avr32/include/asm/hw_irq.h b/arch/avr32/include/asm/hw_irq.h index 218b0a6bfd1b..a36f9fcb8fcd 100644 --- a/arch/avr32/include/asm/hw_irq.h +++ b/arch/avr32/include/asm/hw_irq.h @@ -1,7 +1,7 @@ #ifndef __ASM_AVR32_HW_IRQ_H #define __ASM_AVR32_HW_IRQ_H -static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) +static inline void hw_resend_irq(struct irq_chip *h, unsigned int i) { /* Nothing to do */ } From 096324873f9c7172a17aff9db1356f4f01b77afe Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 23 May 2009 14:30:12 -0500 Subject: [PATCH 009/162] xfs: fix overflow in xfs_growfs_data_private In the case where growing a filesystem would leave the last AG too small, the fixup code has an overflow in the calculation of the new size with one fewer ag, because "nagcount" is a 32 bit number. If the new filesystem has > 2^32 blocks in it this causes a problem resulting in an EINVAL return from growfs: # xfs_io -f -c "truncate 19998630180864" fsfile # mkfs.xfs -f -bsize=4096 -dagsize=76288719b,size=3905982455b fsfile # mount -o loop fsfile /mnt # xfs_growfs /mnt meta-data=/dev/loop0 isize=256 agcount=52, agsize=76288719 blks = sectsz=512 attr=2 data = bsize=4096 blocks=3905982455, imaxpct=5 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 log =internal bsize=4096 blocks=32768, version=2 = sectsz=512 sunit=0 blks, lazy-count=0 realtime =none extsz=4096 blocks=0, rtextents=0 xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Invalid argument Reported-by: richard.ems@cape-horn-eng.com Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8379e3bca26c..cbd451bb4848 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -160,7 +160,7 @@ xfs_growfs_data_private( nagcount = new + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; - nb = nagcount * mp->m_sb.sb_agblocks; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return XFS_ERROR(EINVAL); } From 13503fa9137d9708d52214e9506c671dbf2fbdce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Mar 2009 17:39:20 +0900 Subject: [PATCH 010/162] x86, mce: Cleanup param parser - Fix the comment formatting. - The error path does not return 0, and printk lacks level and "\n". - Move __setup("nomce") next to mcheck_disable(). - Improve readability etc. [ Impact: cleanup ] Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen LKML-Reference: <49CB3F38.7090703@jp.fujitsu.com> Signed-off-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 6fb0b359d2a5..77effb55afe7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -839,25 +839,29 @@ static int __init mcheck_disable(char *str) mce_dont_init = 1; return 1; } +__setup("nomce", mcheck_disable); -/* mce=off disables machine check. - mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - mce=nobootlog Don't log MCEs from before booting. */ +/* + * mce=off disables machine check + * mce=TOLERANCELEVEL (number, see above) + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + */ static int __init mcheck_enable(char *str) { if (!strcmp(str, "off")) mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) - mce_bootlog = str[0] == 'b'; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) get_option(&str, &tolerant); - else - printk("mce= argument %s ignored. Please use /sys", str); + else { + printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + str); + return 0; + } return 1; } - -__setup("nomce", mcheck_disable); __setup("mce=", mcheck_enable); /* From e9eee03e99d519599eb615c3e251d5f6cc4be57d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:17 +0200 Subject: [PATCH 011/162] x86, mce: clean up mce_64.c This file has been modified many times along the years, by multiple authors, so the general style and structure has diverged in a number of areas making this file hard to read. So fix the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 261 +++++++++++++++++----------- 1 file changed, 156 insertions(+), 105 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 77effb55afe7..1491246c4d69 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1,46 +1,47 @@ /* * Machine check handler. + * * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. * Copyright 2008 Intel Corporation * Author: Andi Kleen */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#define MISC_MCELOG_MINOR 227 +#include +#include +#include +#include +#include +#include + +#define MISC_MCELOG_MINOR 227 atomic_t mce_entry; -static int mce_dont_init; +static int mce_dont_init; /* * Tolerant levels: @@ -49,16 +50,16 @@ static int mce_dont_init; * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors * 3: never panic or SIGBUS, log all errors (for testing only) */ -static int tolerant = 1; -static int banks; -static u64 *bank; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = -1; -static atomic_t mce_events; +static int tolerant = 1; +static int banks; +static u64 *bank; +static unsigned long notify_user; +static int rip_msr; +static int mce_bootlog = -1; +static atomic_t mce_events; -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); @@ -89,19 +90,23 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference(mcelog.next); for (;;) { - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ if (entry >= MCE_LOG_LEN) { set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); return; } - /* Old left over entry. Skip. */ + /* Old left over entry. Skip: */ if (mcelog.entry[entry].finished) { entry++; continue; @@ -264,12 +269,12 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! */ -void do_machine_check(struct pt_regs * regs, long error_code) +void do_machine_check(struct pt_regs *regs, long error_code) { struct mce m, panicm; + int panicm_found = 0; u64 mcestart = 0; int i; - int panicm_found = 0; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -293,6 +298,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_setup(&m); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) no_way_out = 1; @@ -356,23 +362,29 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + /* + * Did this bank cause the exception? + * + * Assume that the bank with uncorrectable errors did it, + * and that there is only a single one: + */ + if ((m.status & MCI_STATUS_UC) && + (m.status & MCI_STATUS_EN)) { panicm = m; panicm_found = 1; } } - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). */ + /* + * If we didn't find an uncorrectable error, pick + * the last one (shouldn't happen, just being safe). + */ if (!panicm_found) panicm = m; /* * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. + * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); @@ -451,10 +463,9 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ - static int check_interval = 5 * 60; /* 5 minutes */ + static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ -static void mcheck_timer(unsigned long); static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -464,9 +475,10 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) + if (mce_available(¤t_cpu_data)) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + } /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -501,6 +513,7 @@ int mce_notify_user(void) static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, ¬ify_user)) { wake_up_interruptible(&mce_wait); @@ -520,9 +533,10 @@ int mce_notify_user(void) return 0; } -/* see if the idle task needs to notify userspace */ +/* see if the idle task needs to notify userspace: */ static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +mce_idle_callback(struct notifier_block *nfb, unsigned long action, + void *unused) { /* IDLE_END should be safe - interrupts are back on */ if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) @@ -532,7 +546,7 @@ mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) } static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, + .notifier_call = mce_idle_callback, }; static __init int periodic_mcheck_init(void) @@ -547,8 +561,8 @@ __initcall(periodic_mcheck_init); */ static int mce_cap_init(void) { - u64 cap; unsigned b; + u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & 0xff; @@ -578,9 +592,9 @@ static int mce_cap_init(void) static void mce_init(void *dummy) { + mce_banks_t all_banks; u64 cap; int i; - mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. @@ -605,14 +619,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ + if (c->x86 == 15 && banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ clear_bit(10, (unsigned long *)&bank[4]); - if(c->x86 <= 17 && mce_bootlog < 0) - /* Lots of broken BIOS around that don't clear them - by default and leave crap in there. Don't log. */ + } + if (c->x86 <= 17 && mce_bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ mce_bootlog = 0; + } } } @@ -646,7 +667,7 @@ static void mce_init_timer(void) /* * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. + * Must be called with preempt off: */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { @@ -669,8 +690,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) */ static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { @@ -680,6 +701,7 @@ static int mce_open(struct inode *inode, struct file *file) if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); unlock_kernel(); + return -EBUSY; } @@ -712,13 +734,14 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } +static DEFINE_MUTEX(mce_read_mutex); + static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { - unsigned long *cpu_tsc; - static DEFINE_MUTEX(mce_read_mutex); - unsigned prev, next; char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; int i, err; cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); @@ -732,6 +755,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); + return -EINVAL; } @@ -770,6 +794,7 @@ timeout: * synchronize. */ on_each_cpu(collect_tscs, cpu_tsc, 1); + for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -782,6 +807,7 @@ timeout: } mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); + return err ? -EFAULT : buf - ubuf; } @@ -799,6 +825,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + switch (cmd) { case MCE_GET_RECORD_LEN: return put_user(sizeof(struct mce), p); @@ -810,6 +837,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) do { flags = mcelog.flags; } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); } default: @@ -818,11 +846,11 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .unlocked_ioctl = mce_ioctl, }; static struct miscdevice mce_log_device = { @@ -891,13 +919,16 @@ static int mce_shutdown(struct sys_device *dev) return mce_disable(); } -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. - Only one CPU is active at this time, the others get readded later using - CPU hotplug. */ +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ static int mce_resume(struct sys_device *dev) { mce_init(NULL); mce_cpu_features(¤t_cpu_data); + return 0; } @@ -916,14 +947,16 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, - .name = "machinecheck", + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", }; DEFINE_PER_CPU(struct sys_device, device_mce); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; + +__cpuinitdata +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -937,9 +970,12 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ + \ + if (end == buf) \ + return -EINVAL; \ var = new; \ start; \ + \ return end-buf; \ } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); @@ -950,6 +986,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%llx\n", b); } @@ -958,15 +995,18 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, { char *end; u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) return -EINVAL; + bank[attr - bank_attrs] = new; mce_restart(); + return end-buf; } -static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, - char *buf) +static ssize_t +show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); @@ -974,21 +1014,27 @@ static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf,size_t siz) + const char *buf, size_t siz) { char *p; int len; + strncpy(trigger, buf, sizeof(trigger)); trigger[sizeof(trigger)-1] = 0; len = strlen(trigger); p = strchr(trigger, '\n'); - if (*p) *p = 0; + + if (*p) + *p = 0; + return len; } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval,check_interval,mce_restart()) + +ACCESSOR(check_interval, check_interval, mce_restart()) + static struct sysdev_attribute *mce_attributes[] = { &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL @@ -996,7 +1042,7 @@ static struct sysdev_attribute *mce_attributes[] = { static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) { int err; @@ -1006,15 +1052,15 @@ static __cpuinit int mce_create_device(unsigned int cpu) return -EIO; memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce,cpu).id = cpu; - per_cpu(device_mce,cpu).cls = &mce_sysclass; + per_cpu(device_mce, cpu).id = cpu; + per_cpu(device_mce, cpu).cls = &mce_sysclass; - err = sysdev_register(&per_cpu(device_mce,cpu)); + err = sysdev_register(&per_cpu(device_mce, cpu)); if (err) return err; for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce,cpu), + err = sysdev_create_file(&per_cpu(device_mce, cpu), mce_attributes[i]); if (err) goto error; @@ -1035,10 +1081,10 @@ error2: } error: while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce,cpu), + sysdev_remove_file(&per_cpu(device_mce, cpu), mce_attributes[i]); } - sysdev_unregister(&per_cpu(device_mce,cpu)); + sysdev_unregister(&per_cpu(device_mce, cpu)); return err; } @@ -1051,12 +1097,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) return; for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce,cpu), + sysdev_remove_file(&per_cpu(device_mce, cpu), mce_attributes[i]); for (i = 0; i < banks; i++) sysdev_remove_file(&per_cpu(device_mce, cpu), &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce,cpu)); + sysdev_unregister(&per_cpu(device_mce, cpu)); cpumask_clear_cpu(cpu, mce_device_initialized); } @@ -1076,11 +1122,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { - int i; unsigned long action = *(unsigned long *)h; + int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) @@ -1088,8 +1135,8 @@ static void mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int __cpuinit +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); @@ -1142,12 +1189,14 @@ static __init int mce_init_banks(void) for (i = 0; i < banks; i++) { struct sysdev_attribute *a = &bank_attrs[i]; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); if (!a->attr.name) goto nomem; - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; } return 0; @@ -1156,6 +1205,7 @@ nomem: kfree(bank_attrs[i].attr.name); kfree(bank_attrs); bank_attrs = NULL; + return -ENOMEM; } @@ -1185,6 +1235,7 @@ static __init int mce_init_device(void) register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); + return err; } From 3b58dfd04bdfa52e717ead8f3c7622610eb7f950 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:21 +0200 Subject: [PATCH 012/162] x86, mce: clean up mce_32.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_32.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 3552119b091d..05979e7eff12 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -2,13 +2,12 @@ * mce.c - x86 Machine Check Exception Reporting * (c) 2002 Alan Cox , Dave Jones */ - -#include -#include +#include #include #include +#include +#include #include -#include #include #include @@ -17,18 +16,20 @@ #include "mce.h" int mce_disabled; -int nr_mce_banks; +int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) From c5aaf0e0702513637278ca4e27a156caa9392817 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: [PATCH 013/162] x86, mce: clean up p4.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p4.c | 73 +++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f53bdcbaf382..cb344aba479c 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -2,18 +2,17 @@ * P4 specific Machine Check Exception Reporting */ -#include -#include -#include #include +#include +#include +#include #include +#include #include #include -#include #include - -#include +#include #include "mce.h" @@ -36,6 +35,7 @@ static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL + static void unexpected_thermal_interrupt(struct pt_regs *regs) { printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", @@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) add_taint(TAINT_MACHINE_CHECK); } -/* P4/Xeon Thermal transition interrupt handler */ +/* P4/Xeon Thermal transition interrupt handler: */ static void intel_thermal_interrupt(struct pt_regs *regs) { __u64 msr_val; @@ -54,8 +54,9 @@ static void intel_thermal_interrupt(struct pt_regs *regs) therm_throt_process(msr_val & 0x1); } -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; +/* Thermal interrupt handler for this CPU setup: */ +static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = + unexpected_thermal_interrupt; void smp_thermal_interrupt(struct pt_regs *regs) { @@ -65,67 +66,76 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } -/* P4/Xeon Thermal regulation detect and init */ +/* P4/Xeon Thermal regulation detect and init: */ static void intel_init_thermal(struct cpuinfo_x86 *c) { - u32 l, h; unsigned int cpu = smp_processor_id(); + u32 l, h; - /* Thermal monitoring */ + /* Thermal monitoring: */ if (!cpu_has(c, X86_FEATURE_ACPI)) return; /* -ENODEV */ - /* Clock modulation */ + /* Clock modulation: */ if (!cpu_has(c, X86_FEATURE_ACC)) return; /* -ENODEV */ - /* first check if its enabled already, in which case there might + /* + * First check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already -zwanem. + * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", - cpu); + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; /* -EBUSY */ } - /* check whether a vector already exists, temporarily masked? */ + /* Check whether a vector already exists, temporarily masked? */ if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", - cpu, (h & APIC_VECTOR_MASK)); + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ } - /* The temperature transition interrupt handler setup */ - h = THERMAL_APIC_VECTOR; /* our delivery vector */ - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + /* + * The temperature transition interrupt handler setup: + */ + + /* Our delivery vector: */ + h = THERMAL_APIC_VECTOR; + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h |= APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - /* ok we're good to go... */ + /* Ok, we're good to go... */ vendor_thermal_interrupt = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); - return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ - /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; @@ -143,9 +153,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -157,7 +167,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", @@ -171,6 +183,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; @@ -196,6 +209,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) panic("Unable to continue"); printk(KERN_EMERG "Attempting to continue.\n"); + /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs @@ -217,7 +231,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } - void intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; From ed8bc7ed9a2ad875617b24d2ba09e49ee886638c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:21 +0200 Subject: [PATCH 014/162] x86, mce: clean up p5.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p5.c | 43 +++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index c9f77ea69edc..8812f5441830 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -2,11 +2,10 @@ * P5 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -15,39 +14,53 @@ #include "mce.h" -/* Machine check handler for Pentium class Intel */ +/* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); - if (lotype&(1<<5)) - printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + + printk(KERN_EMERG + "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + printk(KERN_EMERG + "CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + add_taint(TAINT_MACHINE_CHECK); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; - /*Check for MCE support */ + /* Check for MCE support: */ if (!cpu_has(c, X86_FEATURE_MCE)) return; - /* Default P5 to off as its often misconnected */ + /* Default P5 to off as its often misconnected: */ if (mce_disabled != -1) return; + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); - /* Read registers before enabling */ + /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO "Intel old style machine check architecture supported.\n"); + printk(KERN_INFO + "Intel old style machine check architecture supported.\n"); - /* Enable MCE */ + /* Enable MCE: */ set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); + printk(KERN_INFO + "Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } From ea2566ff80e096eeecc0918fb5d5a4612d8f62ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:19 +0200 Subject: [PATCH 015/162] x86, mce: clean up p6.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/p6.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 2ac52d7b434b..43c24e667457 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -2,11 +2,10 @@ * P6 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -18,9 +17,9 @@ /* Machine Check Handler For PII/PIII */ static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); } @@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) /* * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. + * for errors if the OS could not log the error: */ for (i = 0; i < nr_mce_banks; i++) { unsigned int msr; + msr = MSR_IA32_MC0_STATUS+i*4; rdmsr(msr, low, high); if (high & (1<<31)) { - /* Clear it */ + /* Clear it: */ wrmsr(msr, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } @@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code) wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* Set up machine check reporting for processors with Intel style MCE */ +/* Set up machine check reporting for processors with Intel style MCE: */ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c) /* Ok machine check is available */ machine_check_vector = intel_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); From efee4ca80980f97b60e91e3322c3342f19623eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:20 +0200 Subject: [PATCH 016/162] x86, mce: clean up k7.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/k7.c | 42 +++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index dd3af6e7b39a..89e510424152 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -2,11 +2,10 @@ * Athlon specific Machine Check Exception Reporting * (C) Copyright 2002 Dave Jones */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -15,12 +14,12 @@ #include "mce.h" -/* Machine Check Handler For AMD Athlon/Duron */ +/* Machine Check Handler For AMD Athlon/Duron: */ static void k7_machine_check(struct pt_regs *regs, long error_code) { - int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; + int recover = 1; int i; rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); @@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) for (i = 1; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high&(1<<31)) { + if (high & (1<<31)) { char misc[20]; char addr[24]; - misc[0] = addr[0] = '\0'; + + misc[0] = '\0'; + addr[0] = '\0'; + if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; high &= ~(1<<31); + if (high & (1<<27)) { rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); snprintf(misc, 20, "[%08x%08x]", ahigh, alow); @@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); snprintf(addr, 24, " at %08x%08x", ahigh, alow); } + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", smp_processor_id(), i, high, low, misc, addr); - /* Clear it */ + + /* Clear it: */ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize */ + /* Serialize: */ wmb(); add_taint(TAINT_MACHINE_CHECK); } } - if (recover&2) + if (recover & 2) panic("CPU context corrupt"); - if (recover&1) + if (recover & 1) panic("Unable to continue"); + printk(KERN_EMERG "Attempting to continue.\n"); + mcgstl &= ~(1<<2); wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } -/* AMD K7 machine check is Intel like */ +/* AMD K7 machine check is Intel like: */ void amd_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; @@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) return; machine_check_vector = k7_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; - /* Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled. */ + /* + * Clear status for MC index 0 separately, we don't touch CTL, + * as some K7 Athlons cause spurious MCEs when its enabled: + */ if (boot_cpu_data.x86 == 6) { wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); i = 1; } else i = 0; + for (; i < nr_mce_banks; i++) { wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); From 91425084f74c0ad087b3fb6bdad79a825f952720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:22 +0200 Subject: [PATCH 017/162] x86, mce: clean up winchip.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/winchip.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2a043d89811d..81b02487090b 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -2,11 +2,10 @@ * IDT Winchip specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox */ - -#include -#include -#include #include +#include +#include +#include #include #include @@ -14,7 +13,7 @@ #include "mce.h" -/* Machine check handler for WinChip C6 */ +/* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); @@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ wmb(); + rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); + set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); + + printk(KERN_INFO + "Winchip machine check reporting enabled on CPU#0.\n"); } From bdbfbdd5e8f0efb9bfef2e597f8ac673c36317ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:20 +0200 Subject: [PATCH 018/162] x86, mce: clean up non-fatal.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/non-fatal.c | 53 +++++++++++++------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index a74af128efc9..70b710420f74 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -6,15 +6,14 @@ * This file contains routines to check for non-fatal MCEs every 15s * */ - -#include -#include -#include -#include -#include #include -#include +#include +#include +#include #include +#include +#include +#include #include #include @@ -22,9 +21,9 @@ #include "mce.h" -static int firstbank; +static int firstbank; -#define MCE_RATE 15*HZ /* timer rate is 15s */ +#define MCE_RATE (15*HZ) /* timer rate is 15s */ static void mce_checkregs(void *info) { @@ -34,23 +33,24 @@ static void mce_checkregs(void *info) for (i = firstbank; i < nr_mce_banks; i++) { rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - printk(KERN_INFO "MCE: The hardware reports a non " - "fatal, correctable incident occurred on " - "CPU %d.\n", + if (!(high & (1<<31))) + continue; + + printk(KERN_INFO "MCE: The hardware reports a non fatal, " + "correctable incident occurred on CPU %d.\n", smp_processor_id()); - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time. - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } + /* + * Scrub the error so we don't pick it up in MCE_RATE + * seconds time: + */ + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + + /* Serialize: */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); } } @@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void) /* Some Athlons misbehave when we frob bank 0 */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; + boot_cpu_data.x86 == 6) + firstbank = 1; else - firstbank = 0; + firstbank = 0; /* * Check for non-fatal errors every MCE_RATE s */ schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); + return 0; } module_init(init_nonfatal_mce_checker); From cb6f3c155b0afabc48667efb9e7b1ce92ccfcab4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:19 +0200 Subject: [PATCH 019/162] x86, mce: clean up therm_throt.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 72 +++++++++++++----------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d5ae2243f0b9..a2b5d7ddb19f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,7 +1,7 @@ /* - * * Thermal throttle event support code (such as syslog messaging and rate * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * * This allows consistent reporting of CPU thermal throttle events. * * Maintains a counter in /sys that keeps track of the number of thermal @@ -13,43 +13,44 @@ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. * Inspired by Ross Biro's and Al Borchers' counter code. */ - +#include +#include #include #include #include -#include -#include -#include + #include +#include /* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) +#define CHECK_INTERVAL (300 * HZ) static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -atomic_t therm_throt_en = ATOMIC_INIT(0); + +atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) -#define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ - else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ } define_therm_throt_sysdev_show_func(count); @@ -61,8 +62,8 @@ static struct attribute *thermal_throttle_attrs[] = { }; static struct attribute_group thermal_throttle_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ @@ -110,10 +111,11 @@ int therm_throt_process(int curr) } #ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device */ +/* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return sysfs_create_group(&sys_dev->kobj, + &thermal_throttle_attr_group); } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -121,19 +123,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } -/* Mutex protecting device creation against CPU hotplug */ +/* Mutex protecting device creation against CPU hotplug: */ static DEFINE_MUTEX(therm_cpu_lock); /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static __cpuinit int +thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct sys_device *sys_dev; int err = 0; sys_dev = get_cpu_sysdev(cpu); + switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: From 1cb2a8e1767ab60370ecce90654c0f281c602d95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: [PATCH 020/162] x86, mce: clean up mce_amd_64.c Make the coding style match that of the rest of the x86 arch code. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 188 +++++++++++++----------- 1 file changed, 103 insertions(+), 85 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 56dde9c4bc96..4d90ec3eb51d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -13,22 +13,22 @@ * * All MC4_MISCi registers are shared between multi-cores */ - -#include -#include -#include #include -#include #include -#include -#include +#include #include +#include +#include #include +#include +#include +#include + +#include #include +#include #include #include -#include -#include #define PFX "mce_threshold: " #define VERSION "version 1.1.1" @@ -48,26 +48,26 @@ #define MCG_XBLK_ADDR 0xC0000400 struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; }; /* defaults used early on boot */ static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, }; struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; + struct kobject *kobj; + struct threshold_block *blocks; + cpumask_var_t cpus; }; static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); @@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void); */ struct thresh_restart { - struct threshold_block *b; - int reset; - u16 old_limit; + struct threshold_block *b; + int reset; + u16 old_limit; }; /* must be called with correct cpu affinity */ @@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr) } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } @@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr) /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - unsigned int bank, block; unsigned int cpu = smp_processor_id(); - u8 lvt_off; u32 low = 0, high = 0, address = 0; + unsigned int bank, block; struct thresh_restart tr; + u8 lvt_off; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else ++address; if (rdmsr_safe(address, &low, &high)) @@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) */ static void amd_threshold_interrupt(void) { + u32 low = 0, high = 0, address = 0; unsigned int bank, block; struct mce m; - u32 low = 0, high = 0, address = 0; mce_setup(&m); @@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) + if (block == 0) { address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { + } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; address += MCG_XBLK_ADDR; - } - else + } else { ++address; + } if (rdmsr_safe(address, &low, &high)) break; @@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* Log the machine check that caused the threshold - event. */ + /* + * Log the machine check that caused the threshold + * event. + */ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); @@ -254,48 +256,56 @@ static void amd_threshold_interrupt(void) struct threshold_attr { struct attribute attr; - ssize_t(*show) (struct threshold_block *, char *); - ssize_t(*store) (struct threshold_block *, const char *, size_t count); + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); }; -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); + unsigned long new; + char *end; + + new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + b->interrupt_enable = !!new; - tr.b = b; - tr.reset = 0; - tr.old_limit = 0; + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } -static ssize_t store_threshold_limit(struct threshold_block *b, - const char *buf, size_t count) +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { - char *end; struct thresh_restart tr; - unsigned long new = simple_strtoul(buf, &end, 0); + unsigned long new; + char *end; + + new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (new > THRESHOLD_MAX) new = THRESHOLD_MAX; if (new < 1) new = 1; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; @@ -307,8 +317,8 @@ static ssize_t store_threshold_limit(struct threshold_block *b, } struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; + struct threshold_block *tb; + long retval; }; static void local_error_count_handler(void *_tbcc) @@ -338,15 +348,16 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) +#define RW_ATTR(name) \ +static struct threshold_attr name = \ + THRESHOLD_ATTR(name, 0644, show_## name, store_## name) RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); @@ -359,15 +370,17 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; } @@ -377,18 +390,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; } static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, + .show = show, + .store = store, }; static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, }; static __cpuinit int allocate_threshold_blocks(unsigned int cpu, @@ -396,9 +411,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, unsigned int block, u32 address) { - int err; - u32 low, high; struct threshold_block *b = NULL; + u32 low, high; + int err; if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) return 0; @@ -421,20 +436,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, if (!b) return -ENOMEM; - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; INIT_LIST_HEAD(&b->miscj); - if (per_cpu(threshold_banks, cpu)[bank]->blocks) + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { list_add(&b->miscj, &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - else + } else { per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, @@ -447,8 +463,9 @@ recurse: if (!address) return 0; address += MCG_XBLK_ADDR; - } else + } else { ++address; + } err = allocate_threshold_blocks(cpu, bank, ++block, address); if (err) @@ -507,6 +524,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) cpumask_copy(b->cpus, cpu_core_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; + goto out; } #endif @@ -605,15 +623,13 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { - int i = 0; struct threshold_bank *b; char name[32]; + int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) return; - if (!b->blocks) goto free_out; @@ -624,6 +640,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; + return; } #endif @@ -659,8 +676,8 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, - unsigned int cpu) +static void __cpuinit +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { if (cpu >= NR_CPUS) return; @@ -686,11 +703,12 @@ static __init int threshold_init_device(void) /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { int err = threshold_create_device(lcpu); + if (err) return err; } threshold_cpu_callback = amd_64_threshold_cpu_callback; + return 0; } - device_initcall(threshold_init_device); From 6cc6f3ebd19fea722c19630af5ad68af7f51d493 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:23 +0200 Subject: [PATCH 021/162] x86, mce: unify Intel thermal init, prepare Prepare for unification, make two intel_init_thermal equal. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 35 ++++++++++-------- arch/x86/kernel/cpu/mcheck/p4.c | 44 +++++++++++------------ 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index cef3ee30744b..b85d0c107c84 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -34,21 +34,22 @@ asmlinkage void smp_thermal_interrupt(void) irq_exit(); } +static inline void intel_set_thermal_handler(void) { } + static void intel_init_thermal(struct cpuinfo_x86 *c) { - u32 l, h; - int tm2 = 0; unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; - if (!cpu_has(c, X86_FEATURE_ACPI)) + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return; - if (!cpu_has(c, X86_FEATURE_ACC)) - return; - - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); @@ -61,31 +62,35 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; + /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); return; } - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + intel_set_thermal_handler(); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + cpu, tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); - return; } /* diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index cb344aba479c..f70753a443bb 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -66,19 +66,21 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } +static void intel_set_thermal_handler(void) +{ + vendor_thermal_interrupt = intel_thermal_interrupt; +} + /* P4/Xeon Thermal regulation detect and init: */ static void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); + int tm2 = 0; u32 l, h; - /* Thermal monitoring: */ - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; /* -ENODEV */ - - /* Clock modulation: */ - if (!cpu_has(c, X86_FEATURE_ACC)) - return; /* -ENODEV */ + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; /* * First check if its enabled already, in which case there might @@ -90,35 +92,28 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); - - return; /* -EBUSY */ + return; } - /* Check whether a vector already exists, temporarily masked? */ + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n", cpu, (h & APIC_VECTOR_MASK)); - - return; /* -EBUSY */ + return; } - /* - * The temperature transition interrupt handler setup: - */ - - /* Our delivery vector: */ - h = THERMAL_APIC_VECTOR; - /* We'll mask the thermal vector in the lapic till we're ready: */ - h |= APIC_DM_FIXED | APIC_LVT_MASKED; + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - /* Ok, we're good to go... */ - vendor_thermal_interrupt = intel_thermal_interrupt; + intel_set_thermal_handler(); rdmsr(MSR_IA32_MISC_ENABLE, l, h); wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); @@ -127,7 +122,8 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); From a65d086235208a3b3546e209d2210048549099b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:23 +0200 Subject: [PATCH 022/162] x86, mce: unify Intel thermal init Mechanic unification. No change in code. [ Impact: cleanup, 32-bit / 64-bit unification ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 3 +- arch/x86/kernel/cpu/mcheck/mce.h | 11 ++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 73 +++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 61 +------------------ arch/x86/kernel/cpu/mcheck/p4.c | 59 +----------------- 5 files changed, 89 insertions(+), 118 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce_intel.c diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index b2f89829bbe8..6def76942bf2 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,7 +1,8 @@ obj-y = mce_$(BITS).o therm_throt.o obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o +obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index ae9f628838f1..2d1a54bdadfc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,6 +1,8 @@ #include #include +#ifdef CONFIG_X86_32 + void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); @@ -12,3 +14,12 @@ extern void (*machine_check_vector)(struct pt_regs *, long error_code); extern int nr_mce_banks; +void intel_set_thermal_handler(void); + +#else + +static inline void intel_set_thermal_handler(void) { } + +#endif + +void intel_init_thermal(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 000000000000..bad3cbb0e566 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -0,0 +1,73 @@ +/* + * Common code for Intel machine checks + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mce.h" + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + /* Thermal monitoring depends on ACPI and clock modulation*/ + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + intel_set_thermal_handler(); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index b85d0c107c84..38f9632306fa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -17,6 +17,8 @@ #include #include +#include "mce.h" + asmlinkage void smp_thermal_interrupt(void) { __u64 msr_val; @@ -34,65 +36,6 @@ asmlinkage void smp_thermal_interrupt(void) irq_exit(); } -static inline void intel_set_thermal_handler(void) { } - -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} - /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f70753a443bb..f979ffea330b 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -66,68 +66,11 @@ void smp_thermal_interrupt(struct pt_regs *regs) irq_exit(); } -static void intel_set_thermal_handler(void) +void intel_set_thermal_handler(void) { vendor_thermal_interrupt = intel_thermal_interrupt; } -/* P4/Xeon Thermal regulation detect and init: */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - intel_set_thermal_handler(); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - /* Unmask the thermal vector: */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} #endif /* CONFIG_X86_MCE_P4THERMAL */ /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ From 06b851d98266b812b2fa23d007cdf53f41194bbb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:25 +0200 Subject: [PATCH 023/162] x86, mce: unify, prepare 64bit in mce.h Prepare mce.h for unification, so that it will build on 32-bit x86 kernels too. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4f8c199584e7..8488210b866f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_MCE_H #define _ASM_X86_MCE_H -#ifdef __x86_64__ - #include #include @@ -10,21 +8,21 @@ * Machine Check support for x86 */ -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ -#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ -#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ +#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ +#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ +#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -#define MCI_STATUS_VAL (1UL<<63) /* valid error */ -#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ -#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ -#define MCI_STATUS_EN (1UL<<60) /* error enabled */ -#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ -#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ +#define MCI_STATUS_VAL (1ULL<<63) /* valid error */ +#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ +#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ +#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ /* Fields are zero when not available */ struct mce { @@ -82,13 +80,11 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) -#endif /* __x86_64__ */ - #ifdef __KERNEL__ #ifdef CONFIG_X86_32 extern int mce_disabled; -#else /* CONFIG_X86_32 */ +#endif #include @@ -143,8 +139,6 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); extern int mce_notify_user(void); -#endif /* !CONFIG_X86_32 */ - #ifdef CONFIG_X86_MCE extern void mcheck_init(struct cpuinfo_x86 *c); #else From a988d334ae8213c0e0e62327222f6e5e6e52bcf1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:25 +0200 Subject: [PATCH 024/162] x86, mce: unify, prepare codes Move current 32-bit mce_32.c code into mce_64.c. [ Remove unused artifact stop/restart_mce pointed by Andi Kleen ] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 65 +++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1491246c4d69..ce48ae75e1dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1240,3 +1240,68 @@ static __init int mce_init_device(void) } device_initcall(mce_init_device); + +#ifdef CONFIG_X86_32 + +int mce_disabled; + +int nr_mce_banks; +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled == 1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86 == 5) + intel_p5_mcheck_init(c); + if (c->x86 == 6) + intel_p6_mcheck_init(c); + if (c->x86 == 15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86 == 5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +#endif /* CONFIG_X86_32 */ From 711c2e481c9d1113650d09de10f61ee88ab56fda Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:26 +0200 Subject: [PATCH 025/162] x86, mce: unify, prepare for 32-bit v2 Prepare the 64-bit mce_64.c code side to be built on 32-bit. [ includes ifdef relocation by Andi Kleen ] Signed-off-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.h | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 2d1a54bdadfc..cd6cffcc2de0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,14 +1,14 @@ #include #include -#ifdef CONFIG_X86_32 - void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); +#ifdef CONFIG_X86_32 + /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index ce48ae75e1dc..2e2c3d2e9586 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,6 +37,10 @@ #include #include +#include "mce.h" + +#ifdef CONFIG_X86_64 + #define MISC_MCELOG_MINOR 227 atomic_t mce_entry; @@ -1241,7 +1245,7 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#ifdef CONFIG_X86_32 +#else /* CONFIG_X86_32: */ int mce_disabled; From dba3725d44f5dfb5711fd509fca10b5b828c43b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:26 +0200 Subject: [PATCH 026/162] x86, mce: unify move mce_64.c => mce.c and glue it up in the Makefile. Remove mce_32.c Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- .../x86/kernel/cpu/mcheck/{mce_64.c => mce.c} | 0 arch/x86/kernel/cpu/mcheck/mce_32.c | 77 ------------------- 3 files changed, 1 insertion(+), 78 deletions(-) rename arch/x86/kernel/cpu/mcheck/{mce_64.c => mce.c} (100%) delete mode 100644 arch/x86/kernel/cpu/mcheck/mce_32.c diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 6def76942bf2..55f01b39a105 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce_$(BITS).o therm_throt.o +obj-y = mce.o therm_throt.o obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce.c similarity index 100% rename from arch/x86/kernel/cpu/mcheck/mce_64.c rename to arch/x86/kernel/cpu/mcheck/mce.c diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c deleted file mode 100644 index 05979e7eff12..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox , Dave Jones - */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -int mce_disabled; - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled == 1) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } -} - -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - -static int __init mcheck_enable(char *str) -{ - mce_disabled = -1; - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); From cb491fca55e5282f0a95ef39c55352e00d6ca75e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:17 +0200 Subject: [PATCH 027/162] x86, mce: Rename sysfs variables Shorten variable names. This also compacts the code a bit. device_mce => mce_dev mce_device_initialized => mce_dev_initialized mce_attribute => mce_attrs [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 58 ++++++++++++------------- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 10 ++--- 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8488210b866f..b9972a6bc2a1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -90,7 +90,7 @@ extern int mce_disabled; void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct sys_device, device_mce); +DECLARE_PER_CPU(struct sys_device, mce_dev); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2e2c3d2e9586..ba8dd41a10dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -957,7 +957,7 @@ static struct sysdev_class mce_sysclass = { .name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -1039,12 +1039,12 @@ static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval, check_interval, mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { +static struct sysdev_attribute *mce_attrs[] = { &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; -static cpumask_var_t mce_device_initialized; +static cpumask_var_t mce_dev_initialized; /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -1055,40 +1055,36 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(device_mce, cpu).id = cpu; - per_cpu(device_mce, cpu).cls = &mce_sysclass; + memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); + per_cpu(mce_dev, cpu).id = cpu; + per_cpu(mce_dev, cpu).cls = &mce_sysclass; - err = sysdev_register(&per_cpu(device_mce, cpu)); + err = sysdev_register(&per_cpu(mce_dev, cpu)); if (err) return err; - for (i = 0; mce_attributes[i]; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); + for (i = 0; mce_attrs[i]; i++) { + err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); if (err) goto error; } for (i = 0; i < banks; i++) { - err = sysdev_create_file(&per_cpu(device_mce, cpu), + err = sysdev_create_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_device_initialized); + cpumask_set_cpu(cpu, mce_dev_initialized); return 0; error2: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - } + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); error: - while (--i >= 0) { - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); - } - sysdev_unregister(&per_cpu(device_mce, cpu)); + while (--i >= 0) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); return err; } @@ -1097,24 +1093,24 @@ static __cpuinit void mce_remove_device(unsigned int cpu) { int i; - if (!cpumask_test_cpu(cpu, mce_device_initialized)) + if (!cpumask_test_cpu(cpu, mce_dev_initialized)) return; - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - mce_attributes[i]); + for (i = 0; mce_attrs[i]; i++) + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(device_mce, cpu), - &bank_attrs[i]); - sysdev_unregister(&per_cpu(device_mce, cpu)); - cpumask_clear_cpu(cpu, mce_device_initialized); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + + sysdev_unregister(&per_cpu(mce_dev, cpu)); + cpumask_clear_cpu(cpu, mce_dev_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ static void mce_disable_cpu(void *h) { - int i; unsigned long action = *(unsigned long *)h; + int i; if (!mce_available(¤t_cpu_data)) return; @@ -1221,7 +1217,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 4d90ec3eb51d..083f270251fa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -517,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, b->kobj, name); if (err) goto out; @@ -540,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); if (!b->kobj) goto out_free; @@ -560,7 +560,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, b->kobj, name); if (err) goto out; @@ -638,7 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -650,7 +650,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } From b659294b779565c60f5e12ef505328e2b974eb62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 12:31:27 +0200 Subject: [PATCH 028/162] x86, mce: print number of MCE banks The number of MCE banks supported by a CPU is a useful number to know, so print it out during CPU initialization. [ Impact: add printout ] Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba8dd41a10dc..49c74222359d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -570,6 +570,8 @@ static int mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & 0xff; + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (b > MAX_NR_BANKS) { printk(KERN_WARNING "MCE: Using only %u machine check banks out of %u\n", @@ -1287,6 +1289,7 @@ void mcheck_init(struct cpuinfo_x86 *c) default: break; } + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); } static int __init mcheck_disable(char *str) From ba2d0f2b0c56d7174a0208f7c463271f39040728 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:24 +0200 Subject: [PATCH 029/162] x86, mce: Cleanup symbols in intel thermal codes Decode magic constants and turn them into symbols. [ Cleanup to use symbols already exists - HS ] [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 9 +++++---- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/cpu/mcheck/p4.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec41fc16c167..c86404695083 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -208,7 +208,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b + +#define THERM_INT_LOW_ENABLE (1 << 0) +#define THERM_INT_HIGH_ENABLE (1 << 1) + #define MSR_IA32_THERM_STATUS 0x0000019c + +#define THERM_STATUS_PROCHOT (1 << 0) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index bad3cbb0e566..2b011d2d8579 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -32,13 +32,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; /* Check whether a vector already exists */ @@ -54,12 +54,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); intel_set_thermal_handler(); rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); /* Unmask the thermal vector: */ l = apic_read(APIC_LVTTHMR); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 38f9632306fa..13abafcb72e4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -29,7 +29,7 @@ asmlinkage void smp_thermal_interrupt(void) irq_enter(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & 1)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index f979ffea330b..82cee108a2d3 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -51,7 +51,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - therm_throt_process(msr_val & 0x1); + therm_throt_process(msr_val & THERM_STATUS_PROCHOT); } /* Thermal interrupt handler for this CPU setup: */ From 01c6680a547a3ee8dd170c269ea8e037b3191b71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2009 12:31:24 +0200 Subject: [PATCH 030/162] x86, mce: Cleanup MCG definitions Decode more magic constants and turn them into symbols. [ Sort definitions bitwise, introduce MCG_EXT_CNT - HS ] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 +++++++--- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b9972a6bc2a1..94aedaf6327f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -8,9 +8,13 @@ * Machine Check support for x86 */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ -#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ -#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_BANKCNT_MASK 0xff /* Number of Banks */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ +#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ +#define MCG_EXT_CNT_SHIFT 16 +#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 49c74222359d..147333627416 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -569,7 +569,8 @@ static int mce_cap_init(void) u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); - b = cap & 0xff; + + b = cap & MCG_BANKCNT_MASK; printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { @@ -590,7 +591,7 @@ static int mce_cap_init(void) } /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) rip_msr = MSR_IA32_MCG_EIP; return 0; From 3cde5c8c839bf46a7be799ed0e1d0b4780aaf794 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:01:31 +0200 Subject: [PATCH 031/162] x86, mce: initial steps to make 64bit mce code 32bit clean Replace unsigned long with u64s if they need to contain 64bit values. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 147333627416..cd1313b47506 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -156,15 +156,15 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *backup, unsigned long start) +static void mce_panic(char *msg, struct mce *backup, u64 start) { int i; oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; + u64 tsc = mcelog.entry[i].tsc; - if (time_before(tsc, start)) + if ((s64)(tsc - start) < 0) continue; print_mce(&mcelog.entry[i]); if (backup && mcelog.entry[i].tsc == backup->tsc) @@ -970,13 +970,13 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); static ssize_t show_ ## name(struct sys_device *s, \ struct sysdev_attribute *attr, \ char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ + return sprintf(buf, "%Lx\n", (u64)var); \ } \ static ssize_t set_ ## name(struct sys_device *s, \ struct sysdev_attribute *attr, \ const char *buf, size_t siz) { \ char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ + u64 new = simple_strtoull(buf, &end, 0); \ \ if (end == buf) \ return -EINVAL; \ From 06b7a7a5ec917761969444fee967c43868a76468 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:37:43 +0200 Subject: [PATCH 032/162] x86, mce: implement the PPro bank 0 quirk in the 64bit machine check code Quoting the comment: * SDM documents that on family 6 bank 0 should not be written * because it aliases to another special BIOS controlled * register. * But it's not aliased anymore on model 0x1a+ * Don't ignore bank 0 completely because there could be a valid * event later, merely don't write CTL0. This is mostly a port on the 32bit code, except that 32bit always didn't write it and didn't have the 0x1a heuristic. I checked with the CPU designers that the quirk is not required starting with this model. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 40 +++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cd1313b47506..1dcd3be03328 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -65,6 +65,8 @@ static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; +static unsigned long dont_init_banks; + static DECLARE_WAIT_QUEUE_HEAD(mce_wait); /* MCA banks polled by the period polling timer for corrected events */ @@ -72,6 +74,11 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +static inline int skip_bank_init(int i) +{ + return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); +} + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -616,6 +623,8 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { + if (skip_bank_init(i)) + continue; wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } @@ -643,6 +652,19 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } } + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A) + __set_bit(0, &dont_init_banks); + } } static void mce_cpu_features(struct cpuinfo_x86 *c) @@ -911,8 +933,10 @@ static int mce_disable(void) { int i; - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } return 0; } @@ -1119,8 +1143,10 @@ static void mce_disable_cpu(void *h) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + } } static void mce_reenable_cpu(void *h) @@ -1133,8 +1159,10 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + for (i = 0; i < banks; i++) { + if (!skip_bank_init(i)) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + } } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ From 2e6f694fde0a7158590e121962ca2e3c06633528 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 18:42:48 +0200 Subject: [PATCH 033/162] x86, mce: port K7 bank 0 quirk to 64bit mce code Various K7 have broken bank 0s. Don't enable it by default Port from the 32bit code. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1dcd3be03328..1336280edcc2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -650,6 +650,12 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) */ mce_bootlog = 0; } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6) + bank[0] = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { From 5d7279268b654d1f8ac43b0eb6cd9598d9cf55fd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 19:25:48 +0200 Subject: [PATCH 034/162] x86, mce: use a call vector to call the 64bit mce handler Allows to call different machine check handlers from the low level machine check entry vector. This is needed for later when it will be used for 32bit too. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 23 ++++++++++++----------- arch/x86/kernel/cpu/mcheck/mce.h | 3 ++- arch/x86/kernel/entry_64.S | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1336280edcc2..d99318b470d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -39,6 +39,16 @@ #include "mce.h" +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; #ifdef CONFIG_X86_64 #define MISC_MCELOG_MINOR 227 @@ -715,6 +725,8 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) } mce_cpu_quirks(c); + machine_check_vector = do_machine_check; + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -1285,17 +1297,6 @@ int mce_disabled; int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index cd6cffcc2de0..966ae3c5cb11 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -7,11 +7,12 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_32 /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); +#ifdef CONFIG_X86_32 + extern int nr_mce_banks; void intel_set_thermal_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e8433..63276c45bffa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1382,7 +1382,7 @@ paranoiderrorentry stack_segment do_stack_segment errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check +paranoidzeroentry machine_check *machine_check_vector(%rip) #endif /* From 04b2b1a4df6cd0fdaa598f3c623a19c2d93cb48a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 22:50:19 +0200 Subject: [PATCH 035/162] x86, mce: rename 64bit mce_dont_init to mce_disabled Give it the same name as on 32bit. This makes further merging easier. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 15 +++++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 94aedaf6327f..c3c7ee701753 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -86,9 +86,7 @@ struct mce_log { #ifdef __KERNEL__ -#ifdef CONFIG_X86_32 extern int mce_disabled; -#endif #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d99318b470d8..6ab477060f56 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -49,14 +49,15 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) /* Call the installed machine check handler for this CPU setup. */ void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; + +int mce_disabled; + #ifdef CONFIG_X86_64 #define MISC_MCELOG_MINOR 227 atomic_t mce_entry; -static int mce_dont_init; - /* * Tolerant levels: * 0: always panic on uncorrected errors, log corrected errors @@ -194,7 +195,7 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) int mce_available(struct cpuinfo_x86 *c) { - if (mce_dont_init) + if (mce_disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -720,7 +721,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) return; if (mce_cap_init() < 0) { - mce_dont_init = 1; + mce_disabled = 1; return; } mce_cpu_quirks(c); @@ -911,7 +912,7 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_disable(char *str) { - mce_dont_init = 1; + mce_disabled = 1; return 1; } __setup("nomce", mcheck_disable); @@ -925,7 +926,7 @@ __setup("nomce", mcheck_disable); static int __init mcheck_enable(char *str) { if (!strcmp(str, "off")) - mce_dont_init = 1; + mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); else if (isdigit(str[0])) @@ -1292,8 +1293,6 @@ device_initcall(mce_init_device); #else /* CONFIG_X86_32: */ -int mce_disabled; - int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ From d7c3c9a609563868d8a70e220399d06a25aba095 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:07:25 +0200 Subject: [PATCH 036/162] x86, mce: move mce_disabled option into common 32bit/64bit code It's the same function, so let's share it. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6ab477060f56..5395200dc9d9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -907,16 +907,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} -__setup("nomce", mcheck_disable); - /* * mce=off disables machine check * mce=TOLERANCELEVEL (number, see above) @@ -1327,19 +1317,22 @@ void mcheck_init(struct cpuinfo_x86 *c) printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); } -static int __init mcheck_disable(char *str) -{ - mce_disabled = 1; - return 1; -} - static int __init mcheck_enable(char *str) { mce_disabled = -1; return 1; } -__setup("nomce", mcheck_disable); __setup("mce", mcheck_enable); -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_OLD_MCE */ + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); From 8e97aef5f43ec715f394bc15015ff263b80c3ad6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 14:23:18 +0200 Subject: [PATCH 037/162] x86, mce: remove machine check handler idle notify on 64bit i386 has no idle notifiers, but the 64bit machine check code uses them to wake up mcelog from a fatal machine check exception. For corrected machine checks found by the poller or threshold interrupts going through an idle notifier is not needed because the wake_up can is just done directly and doesn't need the idle notifier. It is only needed for logging exceptions. To be honest I never liked the idle notifier even though I signed off on it. On closer investigation the code actually turned out to be nearly. Right now machine check exceptions on x86 are always unrecoverable (lead to panic due to PCC), which means we never execute the idle notifier path. The only exception is the somewhat weird tolerant==3 case, which ignores PCC. I'll fix this in a future patch in a much cleaner way. So remove the "mcelog wakeup through idle notifier" code from 64bit. This allows to compile the 64bit machine check handler on 32bit which doesn't have idle notifiers. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5395200dc9d9..7562c1f674f3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -555,29 +555,6 @@ int mce_notify_user(void) return 0; } -/* see if the idle task needs to notify userspace: */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, - void *unused) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ From d896a940ef4f12a0a6bc432853b249dcfbacabf0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 14:25:18 +0200 Subject: [PATCH 038/162] x86, mce: remove oops_begin() use in 64bit machine check First 32bit doesn't have oops_begin, so it's a barrier of using this code on 32bit. On closer examination it turns out oops_begin is not a good idea in a machine check panic anyways. All oops_begin does it so check for recursive/parallel oopses and implement the "wait on oops" heuristic. But there's actually no good reason to lock machine checks against oopses or prevent them from recursion. Also "wait on oops" does not really make sense for a machine check too. Replace it with a manual bust_spinlocks/console_verbose. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7562c1f674f3..f4d6841d2bdf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -178,7 +178,8 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) { int i; - oops_begin(); + bust_spinlocks(1); + console_verbose(); for (i = 0; i < MCE_LOG_LEN; i++) { u64 tsc = mcelog.entry[i].tsc; From 4efc0670baf4b14bc95502e54a83ccf639146125 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 19:07:31 +0200 Subject: [PATCH 039/162] x86, mce: use 64bit machine check code on 32bit The 64bit machine check code is in many ways much better than the 32bit machine check code: it is more specification compliant, is cleaner, only has a single code base versus one per CPU, has better infrastructure for recovery, has a cleaner way to communicate with user space etc. etc. Use the 64bit code for 32bit too. This is the second attempt to do this. There was one a couple of years ago to unify this code for 32bit and 64bit. Back then this ran into some trouble with K7s and was reverted. I believe this time the K7 problems (and some others) are addressed. I went over the old handlers and was very careful to retain all quirks. But of course this needs a lot of testing on old systems. On newer 64bit capable systems I don't expect much problems because they have been already tested with the 64bit kernel. I made this a CONFIG for now that still allows to select the old machine check code. This is mostly to make testing easier, if someone runs into a problem we can ask them to try with the CONFIG switched. The new code is default y for more coverage. Once there is confidence the 64bit code works well on older hardware too the CONFIG_X86_OLD_MCE and the associated code can be easily removed. This causes a behaviour change for 32bit installations. They now have to install the mcelog package to be able to log corrected machine checks. The 64bit machine check code only handles CPUs which support the standard Intel machine check architecture described in the IA32 SDM. The 32bit code has special support for some older CPUs which have non standard machine check architectures, in particular WinChip C3 and Intel P5. I made those a separate CONFIG option and kept them for now. The WinChip variant could be probably removed without too much pain, it doesn't really do anything interesting. P5 is also disabled by default (like it was before) because many motherboards have it miswired, but according to Alan Cox a few embedded setups use that one. Forward ported/heavily changed version of old patch, original patch included review/fixes from Thomas Gleixner, Bert Wesarg. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 33 +++++++++++++++++++++++++++-- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/apic.c | 4 ++-- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 ++- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce.h | 18 +++++++++++++--- arch/x86/kernel/cpu/mcheck/p5.c | 5 +++++ arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/signal.c | 4 ++-- arch/x86/kernel/traps.c | 4 ++-- 12 files changed, 92 insertions(+), 21 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6efe0a2e9ae..c1c5ccd1937f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -789,6 +789,22 @@ config X86_MCE to disable it. MCE support simply ignores non-MCE processors like the 386 and 486, so nearly everyone can say Y here. +config X86_OLD_MCE + depends on X86_32 && X86_MCE + bool "Use legacy machine check code (will go away)" + default n + select X86_ANCIENT_MCE + ---help--- + Use the old i386 machine check code. This is merely intended for + testing in a transition period. Try this if you run into any machine + check related software problems, but report the problem to + linux-kernel. When in doubt say no. + +config X86_NEW_MCE + depends on X86_MCE + bool + default y if (!X86_OLD_MCE && X86_32) || X86_64 + config X86_MCE_INTEL def_bool y prompt "Intel MCE features" @@ -805,6 +821,15 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_ANCIENT_MCE + def_bool n + depends on X86_32 + prompt "Support for old Pentium 5 / WinChip machine checks" + ---help--- + Include support for machine check handling on old Pentium 5 or WinChip + systems. These typically need to be enabled explicitely on the command + line. + config X86_MCE_THRESHOLD depends on X86_MCE_AMD || X86_MCE_INTEL bool @@ -812,7 +837,7 @@ config X86_MCE_THRESHOLD config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_32 && X86_MCE + depends on X86_OLD_MCE ---help--- Enabling this feature starts a timer that triggers every 5 seconds which will look at the machine check registers to see if anything happened. @@ -825,11 +850,15 @@ config X86_MCE_NONFATAL config X86_MCE_P4THERMAL bool "check for P4 thermal throttling interrupt." - depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) + depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) ---help--- Enabling this feature will cause a message to be printed when the P4 enters thermal throttling. +config X86_THERMAL_VECTOR + def_bool y + depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + config VM86 bool "Enable VM86 support" if EMBEDDED default y diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf258..486c9e946f5c 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -52,7 +52,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) #endif -#ifdef CONFIG_X86_MCE_P4THERMAL +#ifdef CONFIG_X86_THERMAL_VECTOR BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f246..ad532289ef2e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -843,7 +843,7 @@ void clear_local_APIC(void) } /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -1962,7 +1962,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a1..c4762276c17e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#if defined(CONFIG_X86_NEW_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 55f01b39a105..5f8b09425d39 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,7 @@ obj-y = mce.o therm_throt.o -obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o +obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f4d6841d2bdf..e193de44ef19 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,7 +52,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled; -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_NEW_MCE #define MISC_MCELOG_MINOR 227 @@ -662,6 +662,21 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } } +static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return; + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (mce_p5_enabled()) + intel_p5_mcheck_init(c); + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + break; + } +} + static void mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -695,6 +710,11 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { + if (mce_disabled) + return; + + mce_ancient_init(c); + if (!mce_available(c)) return; @@ -893,6 +913,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { + if (*str == 0) + enable_p5_mce(); + if (*str == '=') + str++; if (!strcmp(str, "off")) mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) @@ -900,13 +924,13 @@ static int __init mcheck_enable(char *str) else if (isdigit(str[0])) get_option(&str, &tolerant); else { - printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", + printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; } -__setup("mce=", mcheck_enable); +__setup("mce", mcheck_enable); /* * Sysfs support @@ -1259,7 +1283,7 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_32: */ +#else /* CONFIG_X86_OLD_MCE: */ int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 966ae3c5cb11..84a552b458c8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -1,17 +1,29 @@ #include #include +#ifdef CONFIG_X86_OLD_MCE void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); +#endif +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +extern int mce_p5_enable; +static inline int mce_p5_enabled(void) { return mce_p5_enable; } +static inline void enable_p5_mce(void) { mce_p5_enable = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline int mce_p5_enabled(void) { return 0; } +static inline void enable_p5_mce(void) { } +#endif /* Call the installed machine check handler for this CPU setup. */ extern void (*machine_check_vector)(struct pt_regs *, long error_code); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE extern int nr_mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 8812f5441830..015f481ab1b0 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -14,6 +14,9 @@ #include "mce.h" +/* By default disabled */ +int mce_p5_enable; + /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { @@ -44,9 +47,11 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; +#ifdef CONFIG_X86_OLD_MCE /* Default P5 to off as its often misconnected: */ if (mce_disabled != -1) return; +#endif machine_check_vector = pentium_machine_check; /* Make sure the vector pointer is visible before we enable MCEs: */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c8..35eddc9ec99e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -89,7 +89,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); @@ -176,7 +176,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #endif #ifdef CONFIG_X86_MCE sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8836f9..98846e03211e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -181,7 +181,7 @@ void __init native_init_IRQ(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +#ifdef CONFIG_X86_THERMAL_VECTOR /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..d0851e3f77eb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,11 +25,11 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include #include -#include #endif /* CONFIG_X86_64 */ #include @@ -857,7 +857,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..ad771f15bddd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -798,7 +798,8 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } -#else +#endif + asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } @@ -806,7 +807,6 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) { } -#endif /* * 'math_state_restore()' saves the current math information in the From 45f458e9a8a216b02b76fe61d9e8bc40d659fbe8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:18:26 +0200 Subject: [PATCH 040/162] x86, mce: deprecate old 32bit machine check code Schedule for removal in 2.6.32 Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- Documentation/feature-removal-schedule.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index de491a3e2313..ec9ef5d0d7b3 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -437,3 +437,13 @@ Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate driver but this caused driver conflicts. Who: Jean Delvare Krzysztof Helt + +---------------------------- + +What: CONFIG_X86_OLD_MCE +When: 2.6.32 +Why: Remove the old legacy 32bit machine check code. This has been + superseded by the newer machine check code from the 64bit port, + but the old version has been kept around for easier testing. Note this + doesn't impact the old P5 and WinChip machine check handlers. +Who: Andi Kleen From 7856f6cce4a8cda8c1f94b99605c07d16b8d8dec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:32:56 +0200 Subject: [PATCH 041/162] x86, mce: enable MCE_INTEL for 32bit new MCE Enable the 64bit MCE_INTEL code (CMCI, thermal interrupts) for 32bit NEW_MCE. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/entry_arch.h | 4 ++++ arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/irq_vectors.h | 5 +++-- arch/x86/kernel/cpu/mcheck/threshold.c | 2 +- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/irqinit_32.c | 4 ++++ arch/x86/kernel/traps.c | 2 +- 8 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1c5ccd1937f..e1c9f77f69ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -808,7 +808,7 @@ config X86_NEW_MCE config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_64 && X86_MCE && X86_LOCAL_APIC + depends on X86_NEW_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 486c9e946f5c..b2eb9c066843 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -56,4 +56,8 @@ BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_THRESHOLD +BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 37555e52f980..922ee7c29693 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -20,7 +20,7 @@ typedef struct { #endif #ifdef CONFIG_X86_MCE unsigned int irq_thermal_count; -# ifdef CONFIG_X86_64 +# ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; # endif #endif diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47c..451e24d18050 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -87,10 +87,11 @@ #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 + #ifdef CONFIG_X86_32 -/* 0xf8 - 0xf9 : free */ +/* 0xf9 : free */ #else -# define THRESHOLD_APIC_VECTOR 0xf9 # define UV_BAU_MESSAGE 0xf8 #endif diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 23ee9e730f78..d746df2909c9 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,7 +17,7 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void mce_threshold_interrupt(void) +asmlinkage void smp_threshold_interrupt(void) { exit_idle(); irq_enter(); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63276c45bffa..a31a7f29cffe 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1007,7 +1007,7 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ #endif apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt mce_threshold_interrupt + threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 98846e03211e..2512ad93dabf 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -186,6 +186,10 @@ void __init native_init_IRQ(void) alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#ifdef CONFIG_X86_MCE_THRESHOLD + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ad771f15bddd..0d358c884b35 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -804,7 +804,7 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } From de5619dfef76ddb403eb7c6de39c0130166c5dc3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:34:40 +0200 Subject: [PATCH 042/162] x86, mce: enable MCE_AMD for 32bit NEW_MCE That's very easy using the infrastructure enabled earlier for MCE_INTEL Untested. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e1c9f77f69ec..a148e7ac0d83 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -816,7 +816,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_64 && X86_MCE && X86_LOCAL_APIC + depends on X86_NEW_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. From 172d899db4bf0beb7766d583379e5ed552130e4a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:37:02 +0200 Subject: [PATCH 043/162] x86, mce: document new 32bit mcelog requirement in Documentation/Changes Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- Documentation/Changes | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/Changes b/Documentation/Changes index b95082be4d5e..d21b3b5aa543 100644 --- a/Documentation/Changes +++ b/Documentation/Changes @@ -48,6 +48,7 @@ o procps 3.2.0 # ps --version o oprofile 0.9 # oprofiled --version o udev 081 # udevinfo -V o grub 0.93 # grub --version +o mcelog 0.6 Kernel compilation ================== @@ -276,6 +277,16 @@ before running exportfs or mountd. It is recommended that all NFS services be protected from the internet-at-large by a firewall where that is possible. +mcelog +------ + +In Linux 2.6.31+ the i386 kernel needs to run the mcelog utility +as a regular cronjob similar to the x86-64 kernel to process and log +machine check events when CONFIG_X86_NEW_MCE is enabled. Machine check +events are errors reported by the CPU. Processing them is strongly encouraged. +All x86-64 kernels since 2.6.4 require the mcelog utility to +process machine checks. + Getting updated software ======================== @@ -365,6 +376,10 @@ FUSE ---- o +mcelog +------ +o + Networking ********** From a9862e0560866eadbc59b84867492004da436516 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 19 May 2009 22:49:07 +0200 Subject: [PATCH 044/162] Export add_timer_on for modules Needed in followon patch. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c31..e2c47b82ac36 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -756,6 +756,7 @@ void add_timer_on(struct timer_list *timer, int cpu) wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(add_timer_on); /** * del_timer - deactive a timer. From 5f8c1a54cab6f449fe04d42d0661bc796fa4e73e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 29 Apr 2009 19:29:12 +0200 Subject: [PATCH 045/162] x86, mce: add MSR read wrappers for easier error injection This will be used by future patches to allow machine check error injection. Right now it's a nop, except for adding some wrappers around the MSR reads. This is early in the sequence to avoid too many conflicts. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 37 +++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e193de44ef19..e755c95674dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,6 +194,19 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) panic(msg); } +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ + u64 v; + rdmsrl(msr, v); + return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ + wrmsrl(msr, v); +} + int mce_available(struct cpuinfo_x86 *c) { if (mce_disabled) @@ -213,7 +226,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->ip); + m->ip = mce_rdmsrl(rip_msr); m->cs = 0; } } @@ -231,7 +244,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { if (!bank[i] || !test_bit(i, *b)) continue; @@ -242,7 +255,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -257,9 +270,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -275,7 +288,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } /* @@ -320,7 +333,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -338,7 +351,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -378,9 +391,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); mce_get_rip(&m, regs); mce_log(&m); @@ -449,9 +462,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* the last thing we do is clear state */ for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } - wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); } From ea149b36c7f511d17dd89fee734cb09778a91fa0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 29 Apr 2009 19:31:00 +0200 Subject: [PATCH 046/162] x86, mce: add basic error injection infrastructure Allow user programs to write mce records into /dev/mcelog. When they do that a fake machine check is triggered to test the machine check code. This uses the MCE MSR wrappers added earlier. The implementation is straight forward. There is a struct mce record per CPU and the MCE MSR accesses get data from there if there is valid data injected there. This allows to test the machine check code relatively realistically because only the lowest layer of hardware access is intercepted. The test suite and injector are available at git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 8 ++ arch/x86/include/asm/mce.h | 3 + arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 126 ++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce.c | 39 +++++++- 5 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-inject.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a148e7ac0d83..e25b6358fbe3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -835,6 +835,14 @@ config X86_MCE_THRESHOLD bool default y +config X86_MCE_INJECT + depends on X86_NEW_MCE + tristate "Machine check injector support" + ---help--- + Provide support for injecting machine checks for testing purposes. + If you don't know what a machine check is and you don't do kernel + QA it is safe to say n. + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_OLD_MCE diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c3c7ee701753..e7d2372301ef 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -141,6 +141,9 @@ extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); extern int mce_notify_user(void); +DECLARE_PER_CPU(struct mce, injectm); +extern struct file_operations mce_chrdev_ops; + #ifdef CONFIG_X86_MCE extern void mcheck_init(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 5f8b09425d39..60ee182c6c52 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 000000000000..58afac4b5df5 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -0,0 +1,126 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->cpu); + + /* Make sure noone reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->cpu = m->cpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +struct delayed_mce { + struct timer_list timer; + struct mce m; +}; + +/* Inject mce on current CPU */ +static void raise_mce(unsigned long data) +{ + struct delayed_mce *dm = (struct delayed_mce *)data; + struct mce *m = &dm->m; + int cpu = m->cpu; + + inject_mce(m); + if (m->status & MCI_STATUS_UC) { + struct pt_regs regs; + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + do_machine_check(®s, 0); + printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + } else { + mce_banks_t b; + memset(&b, 0xff, sizeof(mce_banks_t)); + printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + machine_check_poll(0, &b); + mce_notify_user(); + printk(KERN_INFO "Finished machine check poll on CPU %d\n", + cpu); + } + kfree(dm); +} + +/* Error injection interface */ +static ssize_t mce_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct delayed_mce *dm; + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) + return -EINVAL; + + dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); + if (!dm) + return -ENOMEM; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + * Should we use a hrtimer here for better synchronization? + */ + memcpy(&dm->m, &m, sizeof(struct mce)); + setup_timer(&dm->timer, raise_mce, (unsigned long)dm); + dm->timer.expires = jiffies + 2; + add_timer_on(&dm->timer, m.cpu); + return usize; +} + +static int inject_init(void) +{ + printk(KERN_INFO "Machine check injector initialized\n"); + mce_chrdev_ops.write = mce_write; + return 0; +} + +module_init(inject_init); +/* Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. + */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e755c95674dc..fe216bd10f43 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -98,6 +98,9 @@ void mce_setup(struct mce *m) rdtscll(m->tsc); } +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -194,16 +197,46 @@ static void mce_panic(char *msg, struct mce *backup, u64 start) panic(msg); } +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) + return offsetof(struct mce, ip); + if (msr == MSR_IA32_MC0_STATUS + bank*4) + return offsetof(struct mce, status); + if (msr == MSR_IA32_MC0_ADDR + bank*4) + return offsetof(struct mce, addr); + if (msr == MSR_IA32_MC0_MISC + bank*4) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + /* MSR access wrappers used for error injection */ static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset < 0) + return 0; + return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); + } rdmsrl(msr, v); return v; } static void mce_wrmsrl(u32 msr, u64 v) { + if (__get_cpu_var(injectm).finished) { + int offset = msr_to_offset(msr); + if (offset >= 0) + *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; + return; + } wrmsrl(msr, v); } @@ -296,6 +329,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * exceptions. */ } +EXPORT_SYMBOL_GPL(machine_check_poll); /* * The actual machine check handler. This only handles real @@ -468,6 +502,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** @@ -568,6 +603,7 @@ int mce_notify_user(void) } return 0; } +EXPORT_SYMBOL_GPL(mce_notify_user); /* * Initialize Machine Checks for a CPU. @@ -904,13 +940,14 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } -static const struct file_operations mce_chrdev_ops = { +struct file_operations mce_chrdev_ops = { .open = mce_open, .release = mce_release, .read = mce_read, .poll = mce_poll, .unlocked_ioctl = mce_ioctl, }; +EXPORT_SYMBOL_GPL(mce_chrdev_ops); static struct miscdevice mce_log_device = { MISC_MCELOG_MINOR, From a1ff41bfc1bb7a6d19cf958f89a9b539678781e5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 May 2009 22:16:14 -0700 Subject: [PATCH 047/162] x86, mce: add comment about mce_chrdev_ops being writable Add a comment explaining that mce_chrdev_ops is intentionally writable. [ Impact: comment only ] Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fe216bd10f43..156cdf6d9181 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -940,6 +940,7 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } +/* Modified in mce-inject.c, so not static or const */ struct file_operations mce_chrdev_ops = { .open = mce_open, .release = mce_release, From 5706001aacba5d3db5f224ca135e5e91a30be39c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 May 2009 22:18:17 -0700 Subject: [PATCH 048/162] x86, mce: fix comment style in mce-inject.c Fix style of winged comment in mce-inject.c. [ Impact: comment only ] Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 58afac4b5df5..673c72855022 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -120,7 +120,8 @@ static int inject_init(void) } module_init(inject_init); -/* Cannot tolerate unloading currently because we cannot +/* + * Cannot tolerate unloading currently because we cannot * guarantee all openers of mce_chrdev will get a reference to us. */ MODULE_LICENSE("GPL"); From 88921be30296e126896ee4d30758f989d1c4ddfb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:51 +0200 Subject: [PATCH 049/162] x86, mce: synchronize core after machine check handling The example code in the IA32 SDM recommends to synchronize the CPU after machine check handling. So do that here. [ Impact: Spec compliance ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 156cdf6d9181..495c96808668 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -328,6 +328,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't clear MCG_STATUS here because it's only defined for * exceptions. */ + + sync_core(); } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -501,6 +503,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); + sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); From b56f642d2bf8c1f7c6499c1e55b23311a33cc796 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: [PATCH 050/162] x86, mce: use extended sysattrs for the check_interval attribute. Instead of using own callbacks use the generic ones provided by the sysdev later. This finally allows to get rid of the ugly ACCESSOR macros. Should also save some text size. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 39 ++++++++++++-------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 495c96808668..3cac9da7ce24 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1054,28 +1054,6 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - char *buf) { \ - return sprintf(buf, "%Lx\n", (u64)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s, \ - struct sysdev_attribute *attr, \ - const char *buf, size_t siz) { \ - char *end; \ - u64 new = simple_strtoull(buf, &end, 0); \ - \ - if (end == buf) \ - return -EINVAL; \ - var = new; \ - start; \ - \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - static struct sysdev_attribute *bank_attrs; static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1126,13 +1104,26 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return len; } +static ssize_t store_int_with_restart(struct sys_device *s, + struct sysdev_attribute *attr, + const char *buf, size_t size) +{ + ssize_t ret = sysdev_store_int(s, attr, buf, size); + mce_restart(); + return ret; +} + static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -ACCESSOR(check_interval, check_interval, mce_restart()) +static struct sysdev_ext_attribute attr_check_interval = { + _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, + store_int_with_restart), + &check_interval +}; static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, NULL }; From fc016a49c2d92f2efbe22c1fb66eb7a5d2a06ed1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: [PATCH 051/162] x86, mce: remove unused mce_events variable Remove unused mce_events static variable. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3cac9da7ce24..69aad7e96a53 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -71,7 +71,6 @@ static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; -static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -116,7 +115,6 @@ void mce_log(struct mce *mce) { unsigned next, entry; - atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { From 8be9110569aec1f65d86b08aef7ec49659137bf9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 May 2009 21:56:53 +0200 Subject: [PATCH 052/162] x86, mce: remove mce_init unused argument Remove unused mce_init argument. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 69aad7e96a53..20c7e7c669d2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -643,7 +643,7 @@ static int mce_cap_init(void) return 0; } -static void mce_init(void *dummy) +static void mce_init(void) { mce_banks_t all_banks; u64 cap; @@ -776,7 +776,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = do_machine_check; - mce_init(NULL); + mce_init(); mce_cpu_features(c); mce_init_timer(); } @@ -1020,7 +1020,7 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(NULL); + mce_init(); mce_cpu_features(¤t_cpu_data); return 0; @@ -1030,7 +1030,7 @@ static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); if (mce_available(¤t_cpu_data)) - mce_init(NULL); + mce_init(); mce_init_timer(); } From 32561696c23028596f24b353d98f2e23b58f91f7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: [PATCH 053/162] x86, mce: rename and align out2 label There's only a single out path in do_machine_check now, so rename the label from out2 to out. Also align it at the first column. [ Impact: minor cleanup, no functional changes ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 20c7e7c669d2..18d505d80221 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -361,9 +361,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - goto out2; + goto out; if (!banks) - goto out2; + goto out; mce_setup(&m); @@ -499,7 +499,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: +out: atomic_dec(&mce_entry); sync_core(); } From b170204ddb7844ffff62d2d537b20c0eeb97725e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: [PATCH 054/162] x86, mce: drop BKL in mce_open BKL is not needed for anything in mce_open because it has an own spinlock. Remove it. [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18d505d80221..8ab28368bb92 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -791,12 +790,10 @@ static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { - lock_kernel(); spin_lock(&mce_state_lock); if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); - unlock_kernel(); return -EBUSY; } @@ -806,7 +803,6 @@ static int mce_open(struct inode *inode, struct file *file) open_count++; spin_unlock(&mce_state_lock); - unlock_kernel(); return nonseekable_open(inode, file); } From 8780e8e0f6b34862cdf2c62d4d2674d6bc3207db Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: [PATCH 055/162] x86, mce: improve documentation Document that check_interval set to 0 means no polling. Noticed by Hidetoshi Seto Also add a reference from boot options to the sysfs tunables Acked-by: Hidetoshi Seto Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 2 ++ Documentation/x86/x86_64/machinecheck | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a718..63fca718256e 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -5,6 +5,8 @@ only the AMD64 specific ones are listed here. Machine check + Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables. + mce=off disable machine check mce=bootlog Enable logging of machine checks left over from booting. Disabled by default on AMD because some BIOS leave bogus ones. diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck index a05e58e7b159..a4fdb25446e0 100644 --- a/Documentation/x86/x86_64/machinecheck +++ b/Documentation/x86/x86_64/machinecheck @@ -41,7 +41,9 @@ check_interval the polling interval. When the poller stops finding MCEs, it triggers an exponential backoff (poll less often) on the polling interval. The check_interval variable is both the initial and - maximum polling interval. + maximum polling interval. 0 means no polling for corrected machine + check errors (but some corrected errors might be still reported + in other ways) tolerant Tolerance level. When a machine check exception occurs for a non From 9319cec8c185e84fc5281afb6ac5d4c47a234841 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 14 Apr 2009 17:26:30 +0900 Subject: [PATCH 056/162] x86, mce: use strict_strtoull Use strict_strtoull instead of simple_strtoull. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 9 ++++----- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 16 ++++++---------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8ab28368bb92..4375ffb5459f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1059,18 +1059,17 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, - const char *buf, size_t siz) + const char *buf, size_t size) { - char *end; - u64 new = simple_strtoull(buf, &end, 0); + u64 new; - if (end == buf) + if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; bank[attr - bank_attrs] = new; mce_restart(); - return end-buf; + return size; } static ssize_t diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 083f270251fa..0c563432e25c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -269,14 +269,12 @@ SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) static ssize_t -store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) { struct thresh_restart tr; unsigned long new; - char *end; - new = simple_strtoul(buf, &end, 0); - if (end == buf) + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; @@ -287,18 +285,16 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } static ssize_t -store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) { struct thresh_restart tr; unsigned long new; - char *end; - new = simple_strtoul(buf, &end, 0); - if (end == buf) + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; if (new > THRESHOLD_MAX) @@ -313,7 +309,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return end - buf; + return size; } struct threshold_block_cross_cpu { From cc3aec52ab8e013984270a79d1aa51f691d239b0 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 30 Apr 2009 15:58:22 +0900 Subject: [PATCH 057/162] x86, mce: trivial clean up for therm_throt.c This patch removes following checkpatch warning: WARNING: Use #include instead of +#include Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index a2b5d7ddb19f..7b1ae2e20ba5 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -20,7 +20,6 @@ #include #include -#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) From 14a02530e2239f753a0f3f089847e723adbdaa47 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 30 Apr 2009 16:04:51 +0900 Subject: [PATCH 058/162] x86, mce: trivial clean up for mce.c This fixs following checkpatch warnings: WARNING: Use #include instead of +#include WARNING: Use #include instead of +#include WARNING: line over 80 characters + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); WARNING: braces {} are not necessary for any arm of this statement + if (mce_notify_user()) { [...] + } else { [...] Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4375ffb5459f..1d0aa9c4e15b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -27,14 +28,13 @@ #include #include #include +#include #include #include -#include #include #include #include -#include #include "mce.h" @@ -125,7 +125,8 @@ void mce_log(struct mce *mce) * interesting ones: */ if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); return; } /* Old left over entry. Skip: */ @@ -556,11 +557,10 @@ static void mcheck_timer(unsigned long data) * polling interval, otherwise increase the polling interval. */ n = &__get_cpu_var(next_interval); - if (mce_notify_user()) { + if (mce_notify_user()) *n = max(*n/2, HZ/100); - } else { + else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); - } t->expires = jiffies + *n; add_timer(t); From 34fa1967aa0827776e37feb5666df0327575a0f2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Apr 2009 12:31:18 +0200 Subject: [PATCH 059/162] x86, mce: trivial clean up for mce_amd_64.c Fix for followings: WARNING: Use #include instead of +#include ERROR: Macros with multiple statements should be enclosed in a do - while loop +#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc + if (cpu >= NR_CPUS) Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 0c563432e25c..ddae21620bda 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -344,17 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b, return 1; } -#define THRESHOLD_ATTR(_name, _mode, _show, _store) \ -{ \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ }; -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) - RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); RW_ATTR(error_count); @@ -675,9 +671,6 @@ static void threshold_remove_device(unsigned int cpu) static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { - if (cpu >= NR_CPUS) - return; - switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: From 61a021a0700c22ee527d73d92f9acb109ff478f8 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 14 Apr 2009 17:09:04 +0900 Subject: [PATCH 060/162] x86, mce: trivial clean up for mce_intel_64.c Fix for: WARNING: space prohibited between function name and open parenthesis '(' + for_each_online_cpu (cpu) { Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 13abafcb72e4..eff3740501a3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -196,7 +196,7 @@ void cmci_rediscover(int dying) return; cpumask_copy(old, ¤t->cpus_allowed); - for_each_online_cpu (cpu) { + for_each_online_cpu(cpu) { if (cpu == dying) continue; if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) From 98a9c8c3ba13dfc3df8e6d2a126d2fa4e4621e9c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 28 May 2009 11:41:01 +0900 Subject: [PATCH 061/162] x86, mce: trivial clean up for mce-inject.c Fix for: WARNING: Use #include instead of +#include WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc + if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) ERROR: trailing whitespace +/* $ Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 673c72855022..7b3a5428396a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -11,13 +11,13 @@ * Andi Kleen * Ying Huang */ +#include #include #include #include #include #include #include -#include #include /* Update fake mce registers on current CPU. */ @@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&m, ubuf, usize)) return -EFAULT; - if (m.cpu >= NR_CPUS || !cpu_online(m.cpu)) + if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu)) return -EINVAL; dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); From eb2a6ab729ac40a553797703a5a5dba3a74de004 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 28 Apr 2009 23:32:56 +0200 Subject: [PATCH 062/162] x86: trivial clean up for irq_vectors.h Fix a wrong comment. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 451e24d18050..233006c4e365 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,11 +86,10 @@ #define CALL_FUNCTION_VECTOR 0xfc #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa - #define THRESHOLD_APIC_VECTOR 0xf9 #ifdef CONFIG_X86_32 -/* 0xf9 : free */ +/* 0xf8 : free */ #else # define UV_BAU_MESSAGE 0xf8 #endif From cd13adcc823aa421efa4efd995fa7004a58cf38d Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 27 May 2009 16:57:31 +0900 Subject: [PATCH 063/162] x86: trivial clean up for arch/x86/Kconfig Use tab. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e25b6358fbe3..8c0fff0860bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -822,13 +822,13 @@ config X86_MCE_AMD the DRAM Error Threshold. config X86_ANCIENT_MCE - def_bool n - depends on X86_32 - prompt "Support for old Pentium 5 / WinChip machine checks" - ---help--- - Include support for machine check handling on old Pentium 5 or WinChip - systems. These typically need to be enabled explicitely on the command - line. + def_bool n + depends on X86_32 + prompt "Support for old Pentium 5 / WinChip machine checks" + ---help--- + Include support for machine check handling on old Pentium 5 or WinChip + systems. These typically need to be enabled explicitely on the command + line. config X86_MCE_THRESHOLD depends on X86_MCE_AMD || X86_MCE_INTEL From 38736072d45488fd45f076388b6570419bbbc682 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 28 May 2009 10:05:33 -0700 Subject: [PATCH 064/162] x86, mce: drop "extern" from function prototypes in asm/mce.h Function prototypes don't need to be prefixed by "extern". [ Impact: cleanup ] Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index e7d2372301ef..ac6e0303bf21 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -121,13 +121,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -extern int mce_available(struct cpuinfo_x86 *c); +int mce_available(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; -extern void do_machine_check(struct pt_regs *, long); +void do_machine_check(struct pt_regs *, long); typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); @@ -137,15 +137,15 @@ enum mcp_flags { MCP_UC = (1 << 1), /* log uncorrected errors */ MCP_DONTLOG = (1 << 2), /* only clear, don't log */ }; -extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -extern int mce_notify_user(void); +int mce_notify_user(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; #ifdef CONFIG_X86_MCE -extern void mcheck_init(struct cpuinfo_x86 *c); +void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif From 4156e735d3abde8e9243b5d22f7999dd3fffab2e Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Mon, 1 Jun 2009 13:13:24 -0500 Subject: [PATCH 065/162] xfs: prevent deadlock in xfs_qm_shake() It's possible to recurse into filesystem from the memory allocation, which deadlocks in xfs_qm_shake(). Add check for __GFP_FS, and bail out if it is not set. Signed-off-by: Felix Blyakher Signed-off-by: Hedi Berriche Reviewed-by: Christoph Hellwig Reviewed-by: Andi Kleen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/kmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index af6843c7ee4b..179cbd630f69 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); static inline int kmem_shake_allow(gfp_t gfp_mask) { - return (gfp_mask & __GFP_WAIT) != 0; + return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); } #endif /* __XFS_SUPPORT_KMEM_H__ */ From 376bacb0a26bca722981d1610ffb76f951572bb1 Mon Sep 17 00:00:00 2001 From: Frank Seidel Date: Sun, 29 Mar 2009 15:18:39 +0800 Subject: [PATCH 066/162] crypto: tcrypt - Reduce stack size Applying kernel janitors todos (printk calls need KERN_* constants on linebeginnings, reduce stack footprint where possible) to tcrypts test_hash_speed (where stacks memory footprint was very high (on i386 1184 bytes to 160 now). Signed-off-by: Frank Seidel Acked-by: Neil Horman Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index c3c9124209a1..50d1e35bbff4 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -396,16 +396,16 @@ static void test_hash_speed(const char *algo, unsigned int sec, struct scatterlist sg[TVMEMSIZE]; struct crypto_hash *tfm; struct hash_desc desc; - char output[1024]; + static char output[1024]; int i; int ret; - printk("\ntesting speed of %s\n", algo); + printk(KERN_INFO "\ntesting speed of %s\n", algo); tfm = crypto_alloc_hash(algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { - printk("failed to load transform for %s: %ld\n", algo, + printk(KERN_ERR "failed to load transform for %s: %ld\n", algo, PTR_ERR(tfm)); return; } @@ -414,7 +414,7 @@ static void test_hash_speed(const char *algo, unsigned int sec, desc.flags = 0; if (crypto_hash_digestsize(tfm) > sizeof(output)) { - printk("digestsize(%u) > outputbuffer(%zu)\n", + printk(KERN_ERR "digestsize(%u) > outputbuffer(%zu)\n", crypto_hash_digestsize(tfm), sizeof(output)); goto out; } @@ -427,12 +427,14 @@ static void test_hash_speed(const char *algo, unsigned int sec, for (i = 0; speed[i].blen != 0; i++) { if (speed[i].blen > TVMEMSIZE * PAGE_SIZE) { - printk("template (%u) too big for tvmem (%lu)\n", + printk(KERN_ERR + "template (%u) too big for tvmem (%lu)\n", speed[i].blen, TVMEMSIZE * PAGE_SIZE); goto out; } - printk("test%3u (%5u byte blocks,%5u bytes per update,%4u updates): ", + printk(KERN_INFO "test%3u " + "(%5u byte blocks,%5u bytes per update,%4u updates): ", i, speed[i].blen, speed[i].plen, speed[i].blen / speed[i].plen); if (sec) @@ -443,7 +445,7 @@ static void test_hash_speed(const char *algo, unsigned int sec, speed[i].plen, output); if (ret) { - printk("hashing failed ret=%d\n", ret); + printk(KERN_ERR "hashing failed ret=%d\n", ret); break; } } From 811d8f062668077e268a7292202bb923fe2ae896 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sun, 29 Mar 2009 15:20:48 +0800 Subject: [PATCH 067/162] crypto: api - Use kzfree Use kzfree() instead of memset() + kfree(). Signed-off-by: Johannes Weiner Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Herbert Xu --- crypto/api.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crypto/api.c b/crypto/api.c index fd2545decb28..f500fb840be9 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -580,20 +580,17 @@ EXPORT_SYMBOL_GPL(crypto_alloc_tfm); void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) { struct crypto_alg *alg; - int size; if (unlikely(!mem)) return; alg = tfm->__crt_alg; - size = ksize(mem); if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); crypto_mod_put(alg); - memset(mem, 0, size); - kfree(mem); + kzfree(mem); } EXPORT_SYMBOL_GPL(crypto_destroy_tfm); From 505fd21d6138545aa5e96aa738975e6a9deb98a9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 29 Mar 2009 15:33:53 +0800 Subject: [PATCH 068/162] crypto: cryptd - Use nivcipher in cryptd_alloc_ablkcipher Use crypto_alloc_base() instead of crypto_alloc_ablkcipher() to allocate underlying tfm in cryptd_alloc_ablkcipher. Because crypto_alloc_ablkcipher() prefer GENIV encapsulated crypto instead of raw one, while cryptd_alloc_ablkcipher needed the raw one. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- crypto/cryptd.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index d14b22658d7a..ae5fa99d5d36 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -586,20 +586,24 @@ struct cryptd_ablkcipher *cryptd_alloc_ablkcipher(const char *alg_name, u32 type, u32 mask) { char cryptd_alg_name[CRYPTO_MAX_ALG_NAME]; - struct crypto_ablkcipher *tfm; + struct crypto_tfm *tfm; if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) return ERR_PTR(-EINVAL); - tfm = crypto_alloc_ablkcipher(cryptd_alg_name, type, mask); + type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV); + type |= CRYPTO_ALG_TYPE_BLKCIPHER; + mask &= ~CRYPTO_ALG_TYPE_MASK; + mask |= (CRYPTO_ALG_GENIV | CRYPTO_ALG_TYPE_BLKCIPHER_MASK); + tfm = crypto_alloc_base(cryptd_alg_name, type, mask); if (IS_ERR(tfm)) return ERR_CAST(tfm); - if (crypto_ablkcipher_tfm(tfm)->__crt_alg->cra_module != THIS_MODULE) { - crypto_free_ablkcipher(tfm); + if (tfm->__crt_alg->cra_module != THIS_MODULE) { + crypto_free_tfm(tfm); return ERR_PTR(-EINVAL); } - return __cryptd_ablkcipher_cast(tfm); + return __cryptd_ablkcipher_cast(__crypto_ablkcipher_cast(tfm)); } EXPORT_SYMBOL_GPL(cryptd_alloc_ablkcipher); From 150c7e85526e80474b87004f4b420e8834fdeb43 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 29 Mar 2009 15:39:02 +0800 Subject: [PATCH 069/162] crypto: fpu - Add template for blkcipher touching FPU Blkcipher touching FPU need to be enclosed by kernel_fpu_begin() and kernel_fpu_end(). If they are invoked in cipher algorithm implementation, they will be invoked for each block, so that performance will be hurt, because they are "slow" operations. This patch implements "fpu" template, which makes these operations to be invoked for each request. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/fpu.c | 166 +++++++++++++++++++++++++++++++++++++++ crypto/Kconfig | 5 ++ 3 files changed, 173 insertions(+) create mode 100644 arch/x86/crypto/fpu.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ebe7deedd5b4..cfb0010fa940 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +obj-$(CONFIG_CRYPTO_FPU) += fpu.o + obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c new file mode 100644 index 000000000000..5f9781a3815f --- /dev/null +++ b/arch/x86/crypto/fpu.c @@ -0,0 +1,166 @@ +/* + * FPU: Wrapper for blkcipher touching fpu + * + * Copyright (c) Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +struct crypto_fpu_ctx { + struct crypto_blkcipher *child; +}; + +static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key, + unsigned int keylen) +{ + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent); + struct crypto_blkcipher *child = ctx->child; + int err; + + crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_blkcipher_setkey(child, key, keylen); + crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); + return err; +} + +static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + int err; + struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); + struct crypto_blkcipher *child = ctx->child; + struct blkcipher_desc desc = { + .tfm = child, + .info = desc_in->info, + .flags = desc_in->flags, + }; + + kernel_fpu_begin(); + err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes); + kernel_fpu_end(); + return err; +} + +static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + int err; + struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm); + struct crypto_blkcipher *child = ctx->child; + struct blkcipher_desc desc = { + .tfm = child, + .info = desc_in->info, + .flags = desc_in->flags, + }; + + kernel_fpu_begin(); + err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes); + kernel_fpu_end(); + return err; +} + +static int crypto_fpu_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); + struct crypto_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_blkcipher *cipher; + + cipher = crypto_spawn_blkcipher(spawn); + if (IS_ERR(cipher)) + return PTR_ERR(cipher); + + ctx->child = cipher; + return 0; +} + +static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm); + crypto_free_blkcipher(ctx->child); +} + +static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb) +{ + struct crypto_instance *inst; + struct crypto_alg *alg; + int err; + + err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); + if (err) + return ERR_PTR(err); + + alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER, + CRYPTO_ALG_TYPE_MASK); + if (IS_ERR(alg)) + return ERR_CAST(alg); + + inst = crypto_alloc_instance("fpu", alg); + if (IS_ERR(inst)) + goto out_put_alg; + + inst->alg.cra_flags = alg->cra_flags; + inst->alg.cra_priority = alg->cra_priority; + inst->alg.cra_blocksize = alg->cra_blocksize; + inst->alg.cra_alignmask = alg->cra_alignmask; + inst->alg.cra_type = alg->cra_type; + inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize; + inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize; + inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize; + inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx); + inst->alg.cra_init = crypto_fpu_init_tfm; + inst->alg.cra_exit = crypto_fpu_exit_tfm; + inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey; + inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt; + inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt; + +out_put_alg: + crypto_mod_put(alg); + return inst; +} + +static void crypto_fpu_free(struct crypto_instance *inst) +{ + crypto_drop_spawn(crypto_instance_ctx(inst)); + kfree(inst); +} + +static struct crypto_template crypto_fpu_tmpl = { + .name = "fpu", + .alloc = crypto_fpu_alloc, + .free = crypto_fpu_free, + .module = THIS_MODULE, +}; + +static int __init crypto_fpu_module_init(void) +{ + return crypto_register_template(&crypto_fpu_tmpl); +} + +static void __exit crypto_fpu_module_exit(void) +{ + crypto_unregister_template(&crypto_fpu_tmpl); +} + +module_init(crypto_fpu_module_init); +module_exit(crypto_fpu_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("FPU block cipher wrapper"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 74d0e622a515..66ff22a36ed9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -241,6 +241,11 @@ config CRYPTO_XTS key size 256, 384 or 512 bits. This implementation currently can't handle a sectorsize which is not a multiple of 16 bytes. +config CRYPTO_FPU + tristate + select CRYPTO_BLKCIPHER + select CRYPTO_MANAGER + comment "Hash modes" config CRYPTO_HMAC From 2cf4ac8beb9dc50a315a6155b7b70e754d511958 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 29 Mar 2009 15:41:20 +0800 Subject: [PATCH 070/162] crypto: aes-ni - Add support for more modes Because kernel_fpu_begin() and kernel_fpu_end() operations are too slow, the performance gain of general mode implementation + aes-aesni is almost all compensated. The AES-NI support for more modes are implemented as follow: - Add a new AES algorithm implementation named __aes-aesni without kernel_fpu_begin/end() - Use fpu((AES)) to provide kenrel_fpu_begin/end() invoking - Add (AES) ablkcipher, which uses cryptd(fpu((AES))) to defer cryption to cryptd context in soft_irq context. Now the ctr, lrw, pcbc and xts support are added. Performance testing based on dm-crypt shows that cryption time can be reduced to 50% of general mode implementation + aes-aesni implementation. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 267 ++++++++++++++++++++++++++++- crypto/Kconfig | 5 + 2 files changed, 271 insertions(+), 1 deletion(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 02af0af65497..4e663398f77f 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -21,6 +21,22 @@ #include #include +#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) +#define HAS_CTR +#endif + +#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) +#define HAS_LRW +#endif + +#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) +#define HAS_PCBC +#endif + +#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) +#define HAS_XTS +#endif + struct async_aes_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; @@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = { } }; +static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); + + aesni_enc(ctx, dst, src); +} + +static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); + + aesni_dec(ctx, dst, src); +} + +static struct crypto_alg __aesni_alg = { + .cra_name = "__aes-aesni", + .cra_driver_name = "__driver-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = __aes_encrypt, + .cia_decrypt = __aes_decrypt + } + } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; - return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len); + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; } static int ablk_encrypt(struct ablkcipher_request *req) @@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = { }, }; +#ifdef HAS_CTR +static int ablk_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ctr_alg = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + .geniv = "chainiv", + }, + }, +}; +#endif + +#ifdef HAS_LRW +static int ablk_lrw_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_lrw_alg = { + .cra_name = "lrw(aes)", + .cra_driver_name = "lrw-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), + .cra_init = ablk_lrw_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + +#ifdef HAS_PCBC +static int ablk_pcbc_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_pcbc_alg = { + .cra_name = "pcbc(aes)", + .cra_driver_name = "pcbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list), + .cra_init = ablk_pcbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + +#ifdef HAS_XTS +static int ablk_xts_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))", + 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_xts_alg = { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), + .cra_init = ablk_xts_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; +#endif + static int __init aesni_init(void) { int err; @@ -421,6 +637,8 @@ static int __init aesni_init(void) } if ((err = crypto_register_alg(&aesni_alg))) goto aes_err; + if ((err = crypto_register_alg(&__aesni_alg))) + goto __aes_err; if ((err = crypto_register_alg(&blk_ecb_alg))) goto blk_ecb_err; if ((err = crypto_register_alg(&blk_cbc_alg))) @@ -429,9 +647,41 @@ static int __init aesni_init(void) goto ablk_ecb_err; if ((err = crypto_register_alg(&ablk_cbc_alg))) goto ablk_cbc_err; +#ifdef HAS_CTR + if ((err = crypto_register_alg(&ablk_ctr_alg))) + goto ablk_ctr_err; +#endif +#ifdef HAS_LRW + if ((err = crypto_register_alg(&ablk_lrw_alg))) + goto ablk_lrw_err; +#endif +#ifdef HAS_PCBC + if ((err = crypto_register_alg(&ablk_pcbc_alg))) + goto ablk_pcbc_err; +#endif +#ifdef HAS_XTS + if ((err = crypto_register_alg(&ablk_xts_alg))) + goto ablk_xts_err; +#endif return err; +#ifdef HAS_XTS +ablk_xts_err: +#endif +#ifdef HAS_PCBC + crypto_unregister_alg(&ablk_pcbc_alg); +ablk_pcbc_err: +#endif +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); +ablk_lrw_err: +#endif +#ifdef HAS_CTR + crypto_unregister_alg(&ablk_ctr_alg); +ablk_ctr_err: +#endif + crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: crypto_unregister_alg(&ablk_ecb_alg); ablk_ecb_err: @@ -439,6 +689,8 @@ ablk_ecb_err: blk_cbc_err: crypto_unregister_alg(&blk_ecb_alg); blk_ecb_err: + crypto_unregister_alg(&__aesni_alg); +__aes_err: crypto_unregister_alg(&aesni_alg); aes_err: return err; @@ -446,10 +698,23 @@ aes_err: static void __exit aesni_exit(void) { +#ifdef HAS_XTS + crypto_unregister_alg(&ablk_xts_alg); +#endif +#ifdef HAS_PCBC + crypto_unregister_alg(&ablk_pcbc_alg); +#endif +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); +#endif +#ifdef HAS_CTR + crypto_unregister_alg(&ablk_ctr_alg); +#endif crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_alg(&__aesni_alg); crypto_unregister_alg(&aesni_alg); } diff --git a/crypto/Kconfig b/crypto/Kconfig index 66ff22a36ed9..4dfdd03e708f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -491,6 +491,7 @@ config CRYPTO_AES_NI_INTEL select CRYPTO_AES_X86_64 select CRYPTO_CRYPTD select CRYPTO_ALGAPI + select CRYPTO_FPU help Use Intel AES-NI instructions for AES algorithm. @@ -510,6 +511,10 @@ config CRYPTO_AES_NI_INTEL See for more information. + In addition to AES cipher algorithm support, the + acceleration for some popular block cipher mode is supported + too, including ECB, CBC, CTR, LRW, PCBC, XTS. + config CRYPTO_ANUBIS tristate "Anubis cipher algorithm" select CRYPTO_ALGAPI From c79cf91006f03adb603879013b6710b6062c8445 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 29 Mar 2009 15:44:19 +0800 Subject: [PATCH 071/162] crypto: testmgr - Kill test_comp() sparse warnings make C=1: | crypto/testmgr.c:846:45: warning: incorrect type in argument 5 (different signedness) | crypto/testmgr.c:846:45: expected unsigned int *dlen | crypto/testmgr.c:846:45: got int * | crypto/testmgr.c:878:47: warning: incorrect type in argument 5 (different signedness) | crypto/testmgr.c:878:47: expected unsigned int *dlen | crypto/testmgr.c:878:47: got int * Signed-off-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- crypto/testmgr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b50c3c6b17a2..bfee6e9f642d 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -837,7 +837,8 @@ static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate, int ret; for (i = 0; i < ctcount; i++) { - int ilen, dlen = COMP_BUF_SIZE; + int ilen; + unsigned int dlen = COMP_BUF_SIZE; memset(result, 0, sizeof (result)); @@ -869,7 +870,8 @@ static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate, } for (i = 0; i < dtcount; i++) { - int ilen, dlen = COMP_BUF_SIZE; + int ilen; + unsigned int dlen = COMP_BUF_SIZE; memset(result, 0, sizeof (result)); From 2f6ceb7933f52f238519a9c2f65c628eecdac962 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 29 Mar 2009 15:45:30 +0800 Subject: [PATCH 072/162] crypto: pcomp - pcompress.c should include crypto/internal/compress.h make C=1: | crypto/pcompress.c:77:5: warning: symbol 'crypto_register_pcomp' was not declared. Should it be static? | crypto/pcompress.c:89:5: warning: symbol 'crypto_unregister_pcomp' was not declared. Should it be static? Signed-off-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- crypto/pcompress.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/pcompress.c b/crypto/pcompress.c index ca9a4af91efe..bcadc03726b7 100644 --- a/crypto/pcompress.c +++ b/crypto/pcompress.c @@ -26,6 +26,7 @@ #include #include +#include #include "internal.h" From 9f171adc192fc3c8ffbb691cfdcc70259d75c6ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 29 Mar 2009 15:47:06 +0800 Subject: [PATCH 073/162] hwrng: omap - Move probe function to .devinit.text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A pointer to omap_rng_probe is passed to the core via platform_driver_register and so the function must not disappear when the .init sections are discarded. Otherwise (if also having HOTPLUG=y) unbinding and binding a device to the driver via sysfs will result in an oops as does a device being registered late. An alternative to this patch is using platform_driver_probe instead of platform_driver_register plus removing the pointer to the probe function from the struct platform_driver. Signed-off-by: Uwe Kleine-König Cc: Russell King Cc: David Brownell Cc: Patrick McHardy Cc: Jan Engelhardt Cc: Michael Buesch Cc: Andrew Morton Signed-off-by: Herbert Xu --- drivers/char/hw_random/omap-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c index 538313f9e7ac..00dd3de1be51 100644 --- a/drivers/char/hw_random/omap-rng.c +++ b/drivers/char/hw_random/omap-rng.c @@ -89,7 +89,7 @@ static struct hwrng omap_rng_ops = { .data_read = omap_rng_data_read, }; -static int __init omap_rng_probe(struct platform_device *pdev) +static int __devinit omap_rng_probe(struct platform_device *pdev) { struct resource *res, *mem; int ret; From 56af8cd44b05bd9649103b76a6e1e575682990e4 Mon Sep 17 00:00:00 2001 From: Lee Nipper Date: Sun, 29 Mar 2009 15:50:50 +0800 Subject: [PATCH 074/162] crypto: talitos - scaffolding for new algorithm types This patch is preparation for adding new algorithm types. Some elements which are AEAD specific were renamed. The algorithm template structure was changed to use crypto_alg, and talitos_alg_alloc was made more general with respect to algorithm types. ipsec_esp_edesc is renamed to talitos_edesc to use it in the upcoming ablkcipher routines. Signed-off-by: Lee Nipper Signed-off-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 245 +++++++++++++++++++++------------------ 1 file changed, 129 insertions(+), 116 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index a3918c16b3db..9833961a247e 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -684,8 +684,8 @@ struct talitos_ctx { unsigned int authsize; }; -static int aead_authenc_setauthsize(struct crypto_aead *authenc, - unsigned int authsize) +static int aead_setauthsize(struct crypto_aead *authenc, + unsigned int authsize) { struct talitos_ctx *ctx = crypto_aead_ctx(authenc); @@ -694,8 +694,8 @@ static int aead_authenc_setauthsize(struct crypto_aead *authenc, return 0; } -static int aead_authenc_setkey(struct crypto_aead *authenc, - const u8 *key, unsigned int keylen) +static int aead_setkey(struct crypto_aead *authenc, + const u8 *key, unsigned int keylen) { struct talitos_ctx *ctx = crypto_aead_ctx(authenc); struct rtattr *rta = (void *)key; @@ -740,7 +740,7 @@ badkey: } /* - * ipsec_esp_edesc - s/w-extended ipsec_esp descriptor + * talitos_edesc - s/w-extended descriptor * @src_nents: number of segments in input scatterlist * @dst_nents: number of segments in output scatterlist * @dma_len: length of dma mapped link_tbl space @@ -752,7 +752,7 @@ badkey: * is greater than 1, an integrity check value is concatenated to the end * of link_tbl data */ -struct ipsec_esp_edesc { +struct talitos_edesc { int src_nents; int dst_nents; int dma_len; @@ -762,7 +762,7 @@ struct ipsec_esp_edesc { }; static void ipsec_esp_unmap(struct device *dev, - struct ipsec_esp_edesc *edesc, + struct talitos_edesc *edesc, struct aead_request *areq) { unmap_single_talitos_ptr(dev, &edesc->desc.ptr[6], DMA_FROM_DEVICE); @@ -795,8 +795,8 @@ static void ipsec_esp_encrypt_done(struct device *dev, int err) { struct aead_request *areq = context; - struct ipsec_esp_edesc *edesc = - container_of(desc, struct ipsec_esp_edesc, desc); + struct talitos_edesc *edesc = + container_of(desc, struct talitos_edesc, desc); struct crypto_aead *authenc = crypto_aead_reqtfm(areq); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); struct scatterlist *sg; @@ -823,8 +823,8 @@ static void ipsec_esp_decrypt_swauth_done(struct device *dev, int err) { struct aead_request *req = context; - struct ipsec_esp_edesc *edesc = - container_of(desc, struct ipsec_esp_edesc, desc); + struct talitos_edesc *edesc = + container_of(desc, struct talitos_edesc, desc); struct crypto_aead *authenc = crypto_aead_reqtfm(req); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); struct scatterlist *sg; @@ -855,8 +855,8 @@ static void ipsec_esp_decrypt_hwauth_done(struct device *dev, int err) { struct aead_request *req = context; - struct ipsec_esp_edesc *edesc = - container_of(desc, struct ipsec_esp_edesc, desc); + struct talitos_edesc *edesc = + container_of(desc, struct talitos_edesc, desc); ipsec_esp_unmap(dev, edesc, req); @@ -910,7 +910,7 @@ static int sg_to_link_tbl(struct scatterlist *sg, int sg_count, /* * fill in and submit ipsec_esp descriptor */ -static int ipsec_esp(struct ipsec_esp_edesc *edesc, struct aead_request *areq, +static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, u8 *giv, u64 seq, void (*callback) (struct device *dev, struct talitos_desc *desc, @@ -1052,14 +1052,14 @@ static int sg_count(struct scatterlist *sg_list, int nbytes) } /* - * allocate and map the ipsec_esp extended descriptor + * allocate and map the extended descriptor */ -static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq, +static struct talitos_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq, int icv_stashing) { struct crypto_aead *authenc = crypto_aead_reqtfm(areq); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); - struct ipsec_esp_edesc *edesc; + struct talitos_edesc *edesc; int src_nents, dst_nents, alloc_len, dma_len; gfp_t flags = areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; @@ -1084,7 +1084,7 @@ static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq, * allowing for two separate entries for ICV and generated ICV (+ 2), * and the ICV data itself */ - alloc_len = sizeof(struct ipsec_esp_edesc); + alloc_len = sizeof(struct talitos_edesc); if (src_nents || dst_nents) { dma_len = (src_nents + dst_nents + 2) * sizeof(struct talitos_ptr) + ctx->authsize; @@ -1109,11 +1109,11 @@ static struct ipsec_esp_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq, return edesc; } -static int aead_authenc_encrypt(struct aead_request *req) +static int aead_encrypt(struct aead_request *req) { struct crypto_aead *authenc = crypto_aead_reqtfm(req); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); - struct ipsec_esp_edesc *edesc; + struct talitos_edesc *edesc; /* allocate extended descriptor */ edesc = ipsec_esp_edesc_alloc(req, 0); @@ -1128,13 +1128,13 @@ static int aead_authenc_encrypt(struct aead_request *req) -static int aead_authenc_decrypt(struct aead_request *req) +static int aead_decrypt(struct aead_request *req) { struct crypto_aead *authenc = crypto_aead_reqtfm(req); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); unsigned int authsize = ctx->authsize; struct talitos_private *priv = dev_get_drvdata(ctx->dev); - struct ipsec_esp_edesc *edesc; + struct talitos_edesc *edesc; struct scatterlist *sg; void *icvdata; @@ -1180,13 +1180,12 @@ static int aead_authenc_decrypt(struct aead_request *req) } } -static int aead_authenc_givencrypt( - struct aead_givcrypt_request *req) +static int aead_givencrypt(struct aead_givcrypt_request *req) { struct aead_request *areq = &req->areq; struct crypto_aead *authenc = crypto_aead_reqtfm(areq); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); - struct ipsec_esp_edesc *edesc; + struct talitos_edesc *edesc; /* allocate extended descriptor */ edesc = ipsec_esp_edesc_alloc(areq, 0); @@ -1205,30 +1204,30 @@ static int aead_authenc_givencrypt( } struct talitos_alg_template { - char name[CRYPTO_MAX_ALG_NAME]; - char driver_name[CRYPTO_MAX_ALG_NAME]; - unsigned int blocksize; - struct aead_alg aead; - struct device *dev; + struct crypto_alg alg; __be32 desc_hdr_template; }; static struct talitos_alg_template driver_algs[] = { - /* single-pass ipsec_esp descriptor */ + /* AEAD algorithms. These use a single-pass ipsec_esp descriptor */ { - .name = "authenc(hmac(sha1),cbc(aes))", - .driver_name = "authenc-hmac-sha1-cbc-aes-talitos", - .blocksize = AES_BLOCK_SIZE, - .aead = { - .setkey = aead_authenc_setkey, - .setauthsize = aead_authenc_setauthsize, - .encrypt = aead_authenc_encrypt, - .decrypt = aead_authenc_decrypt, - .givencrypt = aead_authenc_givencrypt, - .geniv = "", - .ivsize = AES_BLOCK_SIZE, - .maxauthsize = SHA1_DIGEST_SIZE, - }, + .alg = { + .cra_name = "authenc(hmac(sha1),cbc(aes))", + .cra_driver_name = "authenc-hmac-sha1-cbc-aes-talitos", + .cra_blocksize = AES_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_type = &crypto_aead_type, + .cra_aead = { + .setkey = aead_setkey, + .setauthsize = aead_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .givencrypt = aead_givencrypt, + .geniv = "", + .ivsize = AES_BLOCK_SIZE, + .maxauthsize = SHA1_DIGEST_SIZE, + } + }, .desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP | DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_CBC | @@ -1238,19 +1237,23 @@ static struct talitos_alg_template driver_algs[] = { DESC_HDR_MODE1_MDEU_SHA1_HMAC, }, { - .name = "authenc(hmac(sha1),cbc(des3_ede))", - .driver_name = "authenc-hmac-sha1-cbc-3des-talitos", - .blocksize = DES3_EDE_BLOCK_SIZE, - .aead = { - .setkey = aead_authenc_setkey, - .setauthsize = aead_authenc_setauthsize, - .encrypt = aead_authenc_encrypt, - .decrypt = aead_authenc_decrypt, - .givencrypt = aead_authenc_givencrypt, - .geniv = "", - .ivsize = DES3_EDE_BLOCK_SIZE, - .maxauthsize = SHA1_DIGEST_SIZE, - }, + .alg = { + .cra_name = "authenc(hmac(sha1),cbc(des3_ede))", + .cra_driver_name = "authenc-hmac-sha1-cbc-3des-talitos", + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_type = &crypto_aead_type, + .cra_aead = { + .setkey = aead_setkey, + .setauthsize = aead_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .givencrypt = aead_givencrypt, + .geniv = "", + .ivsize = DES3_EDE_BLOCK_SIZE, + .maxauthsize = SHA1_DIGEST_SIZE, + } + }, .desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP | DESC_HDR_SEL0_DEU | DESC_HDR_MODE0_DEU_CBC | @@ -1261,19 +1264,23 @@ static struct talitos_alg_template driver_algs[] = { DESC_HDR_MODE1_MDEU_SHA1_HMAC, }, { - .name = "authenc(hmac(sha256),cbc(aes))", - .driver_name = "authenc-hmac-sha256-cbc-aes-talitos", - .blocksize = AES_BLOCK_SIZE, - .aead = { - .setkey = aead_authenc_setkey, - .setauthsize = aead_authenc_setauthsize, - .encrypt = aead_authenc_encrypt, - .decrypt = aead_authenc_decrypt, - .givencrypt = aead_authenc_givencrypt, - .geniv = "", - .ivsize = AES_BLOCK_SIZE, - .maxauthsize = SHA256_DIGEST_SIZE, - }, + .alg = { + .cra_name = "authenc(hmac(sha256),cbc(aes))", + .cra_driver_name = "authenc-hmac-sha256-cbc-aes-talitos", + .cra_blocksize = AES_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_type = &crypto_aead_type, + .cra_aead = { + .setkey = aead_setkey, + .setauthsize = aead_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .givencrypt = aead_givencrypt, + .geniv = "", + .ivsize = AES_BLOCK_SIZE, + .maxauthsize = SHA256_DIGEST_SIZE, + } + }, .desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP | DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_CBC | @@ -1283,19 +1290,23 @@ static struct talitos_alg_template driver_algs[] = { DESC_HDR_MODE1_MDEU_SHA256_HMAC, }, { - .name = "authenc(hmac(sha256),cbc(des3_ede))", - .driver_name = "authenc-hmac-sha256-cbc-3des-talitos", - .blocksize = DES3_EDE_BLOCK_SIZE, - .aead = { - .setkey = aead_authenc_setkey, - .setauthsize = aead_authenc_setauthsize, - .encrypt = aead_authenc_encrypt, - .decrypt = aead_authenc_decrypt, - .givencrypt = aead_authenc_givencrypt, - .geniv = "", - .ivsize = DES3_EDE_BLOCK_SIZE, - .maxauthsize = SHA256_DIGEST_SIZE, - }, + .alg = { + .cra_name = "authenc(hmac(sha256),cbc(des3_ede))", + .cra_driver_name = "authenc-hmac-sha256-cbc-3des-talitos", + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_type = &crypto_aead_type, + .cra_aead = { + .setkey = aead_setkey, + .setauthsize = aead_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .givencrypt = aead_givencrypt, + .geniv = "", + .ivsize = DES3_EDE_BLOCK_SIZE, + .maxauthsize = SHA256_DIGEST_SIZE, + } + }, .desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP | DESC_HDR_SEL0_DEU | DESC_HDR_MODE0_DEU_CBC | @@ -1306,19 +1317,23 @@ static struct talitos_alg_template driver_algs[] = { DESC_HDR_MODE1_MDEU_SHA256_HMAC, }, { - .name = "authenc(hmac(md5),cbc(aes))", - .driver_name = "authenc-hmac-md5-cbc-aes-talitos", - .blocksize = AES_BLOCK_SIZE, - .aead = { - .setkey = aead_authenc_setkey, - .setauthsize = aead_authenc_setauthsize, - .encrypt = aead_authenc_encrypt, - .decrypt = aead_authenc_decrypt, - .givencrypt = aead_authenc_givencrypt, - .geniv = "", - .ivsize = AES_BLOCK_SIZE, - .maxauthsize = MD5_DIGEST_SIZE, - }, + .alg = { + .cra_name = "authenc(hmac(md5),cbc(aes))", + .cra_driver_name = "authenc-hmac-md5-cbc-aes-talitos", + .cra_blocksize = AES_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_type = &crypto_aead_type, + .cra_aead = { + .setkey = aead_setkey, + .setauthsize = aead_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .givencrypt = aead_givencrypt, + .geniv = "", + .ivsize = AES_BLOCK_SIZE, + .maxauthsize = MD5_DIGEST_SIZE, + } + }, .desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP | DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_CBC | @@ -1328,19 +1343,23 @@ static struct talitos_alg_template driver_algs[] = { DESC_HDR_MODE1_MDEU_MD5_HMAC, }, { - .name = "authenc(hmac(md5),cbc(des3_ede))", - .driver_name = "authenc-hmac-md5-cbc-3des-talitos", - .blocksize = DES3_EDE_BLOCK_SIZE, - .aead = { - .setkey = aead_authenc_setkey, - .setauthsize = aead_authenc_setauthsize, - .encrypt = aead_authenc_encrypt, - .decrypt = aead_authenc_decrypt, - .givencrypt = aead_authenc_givencrypt, - .geniv = "", - .ivsize = DES3_EDE_BLOCK_SIZE, - .maxauthsize = MD5_DIGEST_SIZE, - }, + .alg = { + .cra_name = "authenc(hmac(md5),cbc(des3_ede))", + .cra_driver_name = "authenc-hmac-md5-cbc-3des-talitos", + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_type = &crypto_aead_type, + .cra_aead = { + .setkey = aead_setkey, + .setauthsize = aead_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .givencrypt = aead_givencrypt, + .geniv = "", + .ivsize = DES3_EDE_BLOCK_SIZE, + .maxauthsize = MD5_DIGEST_SIZE, + } + }, .desc_hdr_template = DESC_HDR_TYPE_IPSEC_ESP | DESC_HDR_SEL0_DEU | DESC_HDR_MODE0_DEU_CBC | @@ -1453,19 +1472,13 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev, return ERR_PTR(-ENOMEM); alg = &t_alg->crypto_alg; + *alg = template->alg; - snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s", template->name); - snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", - template->driver_name); alg->cra_module = THIS_MODULE; alg->cra_init = talitos_cra_init; alg->cra_priority = TALITOS_CRA_PRIORITY; - alg->cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC; - alg->cra_blocksize = template->blocksize; alg->cra_alignmask = 0; - alg->cra_type = &crypto_aead_type; alg->cra_ctxsize = sizeof(struct talitos_ctx); - alg->cra_u.aead = template->aead; t_alg->desc_hdr_template = template->desc_hdr_template; t_alg->dev = dev; From 4de9d0b547b97e40c93a885ac6246c2c5fef05cb Mon Sep 17 00:00:00 2001 From: Lee Nipper Date: Sun, 29 Mar 2009 15:52:32 +0800 Subject: [PATCH 075/162] crypto: talitos - Add ablkcipher algorithms Add these ablkcipher algorithms: cbc(aes), cbc(des3_ede). Added handling of chained scatterlists with zero length entry because eseqiv uses it. Added new map and unmap routines. Signed-off-by: Lee Nipper Signed-off-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 380 +++++++++++++++++++++++++++++++++++---- 1 file changed, 342 insertions(+), 38 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 9833961a247e..a0b0a6319088 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include "talitos.h" @@ -755,12 +757,62 @@ badkey: struct talitos_edesc { int src_nents; int dst_nents; + int src_is_chained; + int dst_is_chained; int dma_len; dma_addr_t dma_link_tbl; struct talitos_desc desc; struct talitos_ptr link_tbl[0]; }; +static int talitos_map_sg(struct device *dev, struct scatterlist *sg, + unsigned int nents, enum dma_data_direction dir, + int chained) +{ + if (unlikely(chained)) + while (sg) { + dma_map_sg(dev, sg, 1, dir); + sg = scatterwalk_sg_next(sg); + } + else + dma_map_sg(dev, sg, nents, dir); + return nents; +} + +static void talitos_unmap_sg_chain(struct device *dev, struct scatterlist *sg, + enum dma_data_direction dir) +{ + while (sg) { + dma_unmap_sg(dev, sg, 1, dir); + sg = scatterwalk_sg_next(sg); + } +} + +static void talitos_sg_unmap(struct device *dev, + struct talitos_edesc *edesc, + struct scatterlist *src, + struct scatterlist *dst) +{ + unsigned int src_nents = edesc->src_nents ? : 1; + unsigned int dst_nents = edesc->dst_nents ? : 1; + + if (src != dst) { + if (edesc->src_is_chained) + talitos_unmap_sg_chain(dev, src, DMA_TO_DEVICE); + else + dma_unmap_sg(dev, src, src_nents, DMA_TO_DEVICE); + + if (edesc->dst_is_chained) + talitos_unmap_sg_chain(dev, dst, DMA_FROM_DEVICE); + else + dma_unmap_sg(dev, dst, dst_nents, DMA_FROM_DEVICE); + } else + if (edesc->src_is_chained) + talitos_unmap_sg_chain(dev, src, DMA_BIDIRECTIONAL); + else + dma_unmap_sg(dev, src, src_nents, DMA_BIDIRECTIONAL); +} + static void ipsec_esp_unmap(struct device *dev, struct talitos_edesc *edesc, struct aead_request *areq) @@ -772,15 +824,7 @@ static void ipsec_esp_unmap(struct device *dev, dma_unmap_sg(dev, areq->assoc, 1, DMA_TO_DEVICE); - if (areq->src != areq->dst) { - dma_unmap_sg(dev, areq->src, edesc->src_nents ? : 1, - DMA_TO_DEVICE); - dma_unmap_sg(dev, areq->dst, edesc->dst_nents ? : 1, - DMA_FROM_DEVICE); - } else { - dma_unmap_sg(dev, areq->src, edesc->src_nents ? : 1, - DMA_BIDIRECTIONAL); - } + talitos_sg_unmap(dev, edesc, areq->src, areq->dst); if (edesc->dma_len) dma_unmap_single(dev, edesc->dma_link_tbl, edesc->dma_len, @@ -886,7 +930,7 @@ static int sg_to_link_tbl(struct scatterlist *sg, int sg_count, link_tbl_ptr->j_extent = 0; link_tbl_ptr++; cryptlen -= sg_dma_len(sg); - sg = sg_next(sg); + sg = scatterwalk_sg_next(sg); } /* adjust (decrease) last one (or two) entry's len to cryptlen */ @@ -952,12 +996,11 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, desc->ptr[4].len = cpu_to_be16(cryptlen); desc->ptr[4].j_extent = authsize; - if (areq->src == areq->dst) - sg_count = dma_map_sg(dev, areq->src, edesc->src_nents ? : 1, - DMA_BIDIRECTIONAL); - else - sg_count = dma_map_sg(dev, areq->src, edesc->src_nents ? : 1, - DMA_TO_DEVICE); + sg_count = talitos_map_sg(dev, areq->src, + edesc->src_nents ? : 1, + (areq->src == areq->dst) ? DMA_BIDIRECTIONAL : + DMA_TO_DEVICE, + edesc->src_is_chained); if (sg_count == 1) { desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->src)); @@ -986,8 +1029,10 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, desc->ptr[5].j_extent = authsize; if (areq->src != areq->dst) { - sg_count = dma_map_sg(dev, areq->dst, edesc->dst_nents ? : 1, - DMA_FROM_DEVICE); + sg_count = talitos_map_sg(dev, areq->dst, + edesc->dst_nents ? : 1, + DMA_FROM_DEVICE, + edesc->dst_is_chained); } if (sg_count == 1) { @@ -1037,15 +1082,18 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, /* * derive number of elements in scatterlist */ -static int sg_count(struct scatterlist *sg_list, int nbytes) +static int sg_count(struct scatterlist *sg_list, int nbytes, int *chained) { struct scatterlist *sg = sg_list; int sg_nents = 0; - while (nbytes) { + *chained = 0; + while (nbytes > 0) { sg_nents++; nbytes -= sg->length; - sg = sg_next(sg); + if (!sg_is_last(sg) && (sg + 1)->length == 0) + *chained = 1; + sg = scatterwalk_sg_next(sg); } return sg_nents; @@ -1054,28 +1102,32 @@ static int sg_count(struct scatterlist *sg_list, int nbytes) /* * allocate and map the extended descriptor */ -static struct talitos_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq, - int icv_stashing) +static struct talitos_edesc *talitos_edesc_alloc(struct device *dev, + struct scatterlist *src, + struct scatterlist *dst, + unsigned int cryptlen, + unsigned int authsize, + int icv_stashing, + u32 cryptoflags) { - struct crypto_aead *authenc = crypto_aead_reqtfm(areq); - struct talitos_ctx *ctx = crypto_aead_ctx(authenc); struct talitos_edesc *edesc; int src_nents, dst_nents, alloc_len, dma_len; - gfp_t flags = areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : + int src_chained, dst_chained = 0; + gfp_t flags = cryptoflags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; - if (areq->cryptlen + ctx->authsize > TALITOS_MAX_DATA_LEN) { - dev_err(ctx->dev, "cryptlen exceeds h/w max limit\n"); + if (cryptlen + authsize > TALITOS_MAX_DATA_LEN) { + dev_err(dev, "length exceeds h/w max limit\n"); return ERR_PTR(-EINVAL); } - src_nents = sg_count(areq->src, areq->cryptlen + ctx->authsize); + src_nents = sg_count(src, cryptlen + authsize, &src_chained); src_nents = (src_nents == 1) ? 0 : src_nents; - if (areq->dst == areq->src) { + if (dst == src) { dst_nents = src_nents; } else { - dst_nents = sg_count(areq->dst, areq->cryptlen + ctx->authsize); + dst_nents = sg_count(dst, cryptlen + authsize, &dst_chained); dst_nents = (dst_nents == 1) ? 0 : dst_nents; } @@ -1087,28 +1139,41 @@ static struct talitos_edesc *ipsec_esp_edesc_alloc(struct aead_request *areq, alloc_len = sizeof(struct talitos_edesc); if (src_nents || dst_nents) { dma_len = (src_nents + dst_nents + 2) * - sizeof(struct talitos_ptr) + ctx->authsize; + sizeof(struct talitos_ptr) + authsize; alloc_len += dma_len; } else { dma_len = 0; - alloc_len += icv_stashing ? ctx->authsize : 0; + alloc_len += icv_stashing ? authsize : 0; } edesc = kmalloc(alloc_len, GFP_DMA | flags); if (!edesc) { - dev_err(ctx->dev, "could not allocate edescriptor\n"); + dev_err(dev, "could not allocate edescriptor\n"); return ERR_PTR(-ENOMEM); } edesc->src_nents = src_nents; edesc->dst_nents = dst_nents; + edesc->src_is_chained = src_chained; + edesc->dst_is_chained = dst_chained; edesc->dma_len = dma_len; - edesc->dma_link_tbl = dma_map_single(ctx->dev, &edesc->link_tbl[0], + edesc->dma_link_tbl = dma_map_single(dev, &edesc->link_tbl[0], edesc->dma_len, DMA_BIDIRECTIONAL); return edesc; } +static struct talitos_edesc *aead_edesc_alloc(struct aead_request *areq, + int icv_stashing) +{ + struct crypto_aead *authenc = crypto_aead_reqtfm(areq); + struct talitos_ctx *ctx = crypto_aead_ctx(authenc); + + return talitos_edesc_alloc(ctx->dev, areq->src, areq->dst, + areq->cryptlen, ctx->authsize, icv_stashing, + areq->base.flags); +} + static int aead_encrypt(struct aead_request *req) { struct crypto_aead *authenc = crypto_aead_reqtfm(req); @@ -1116,7 +1181,7 @@ static int aead_encrypt(struct aead_request *req) struct talitos_edesc *edesc; /* allocate extended descriptor */ - edesc = ipsec_esp_edesc_alloc(req, 0); + edesc = aead_edesc_alloc(req, 0); if (IS_ERR(edesc)) return PTR_ERR(edesc); @@ -1141,7 +1206,7 @@ static int aead_decrypt(struct aead_request *req) req->cryptlen -= authsize; /* allocate extended descriptor */ - edesc = ipsec_esp_edesc_alloc(req, 1); + edesc = aead_edesc_alloc(req, 1); if (IS_ERR(edesc)) return PTR_ERR(edesc); @@ -1188,7 +1253,7 @@ static int aead_givencrypt(struct aead_givcrypt_request *req) struct talitos_edesc *edesc; /* allocate extended descriptor */ - edesc = ipsec_esp_edesc_alloc(areq, 0); + edesc = aead_edesc_alloc(areq, 0); if (IS_ERR(edesc)) return PTR_ERR(edesc); @@ -1203,6 +1268,199 @@ static int aead_givencrypt(struct aead_givcrypt_request *req) ipsec_esp_encrypt_done); } +static int ablkcipher_setkey(struct crypto_ablkcipher *cipher, + const u8 *key, unsigned int keylen) +{ + struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher); + struct ablkcipher_alg *alg = crypto_ablkcipher_alg(cipher); + + if (keylen > TALITOS_MAX_KEY_SIZE) + goto badkey; + + if (keylen < alg->min_keysize || keylen > alg->max_keysize) + goto badkey; + + memcpy(&ctx->key, key, keylen); + ctx->keylen = keylen; + + return 0; + +badkey: + crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; +} + +static void common_nonsnoop_unmap(struct device *dev, + struct talitos_edesc *edesc, + struct ablkcipher_request *areq) +{ + unmap_single_talitos_ptr(dev, &edesc->desc.ptr[5], DMA_FROM_DEVICE); + unmap_single_talitos_ptr(dev, &edesc->desc.ptr[2], DMA_TO_DEVICE); + unmap_single_talitos_ptr(dev, &edesc->desc.ptr[1], DMA_TO_DEVICE); + + talitos_sg_unmap(dev, edesc, areq->src, areq->dst); + + if (edesc->dma_len) + dma_unmap_single(dev, edesc->dma_link_tbl, edesc->dma_len, + DMA_BIDIRECTIONAL); +} + +static void ablkcipher_done(struct device *dev, + struct talitos_desc *desc, void *context, + int err) +{ + struct ablkcipher_request *areq = context; + struct talitos_edesc *edesc = + container_of(desc, struct talitos_edesc, desc); + + common_nonsnoop_unmap(dev, edesc, areq); + + kfree(edesc); + + areq->base.complete(&areq->base, err); +} + +static int common_nonsnoop(struct talitos_edesc *edesc, + struct ablkcipher_request *areq, + u8 *giv, + void (*callback) (struct device *dev, + struct talitos_desc *desc, + void *context, int error)) +{ + struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq); + struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher); + struct device *dev = ctx->dev; + struct talitos_desc *desc = &edesc->desc; + unsigned int cryptlen = areq->nbytes; + unsigned int ivsize; + int sg_count, ret; + + /* first DWORD empty */ + desc->ptr[0].len = 0; + desc->ptr[0].ptr = 0; + desc->ptr[0].j_extent = 0; + + /* cipher iv */ + ivsize = crypto_ablkcipher_ivsize(cipher); + map_single_talitos_ptr(dev, &desc->ptr[1], ivsize, giv ?: areq->info, 0, + DMA_TO_DEVICE); + + /* cipher key */ + map_single_talitos_ptr(dev, &desc->ptr[2], ctx->keylen, + (char *)&ctx->key, 0, DMA_TO_DEVICE); + + /* + * cipher in + */ + desc->ptr[3].len = cpu_to_be16(cryptlen); + desc->ptr[3].j_extent = 0; + + sg_count = talitos_map_sg(dev, areq->src, edesc->src_nents ? : 1, + (areq->src == areq->dst) ? DMA_BIDIRECTIONAL + : DMA_TO_DEVICE, + edesc->src_is_chained); + + if (sg_count == 1) { + desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->src)); + } else { + sg_count = sg_to_link_tbl(areq->src, sg_count, cryptlen, + &edesc->link_tbl[0]); + if (sg_count > 1) { + desc->ptr[3].j_extent |= DESC_PTR_LNKTBL_JUMP; + desc->ptr[3].ptr = cpu_to_be32(edesc->dma_link_tbl); + dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl, + edesc->dma_len, DMA_BIDIRECTIONAL); + } else { + /* Only one segment now, so no link tbl needed */ + desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->src)); + } + } + + /* cipher out */ + desc->ptr[4].len = cpu_to_be16(cryptlen); + desc->ptr[4].j_extent = 0; + + if (areq->src != areq->dst) + sg_count = talitos_map_sg(dev, areq->dst, + edesc->dst_nents ? : 1, + DMA_FROM_DEVICE, + edesc->dst_is_chained); + + if (sg_count == 1) { + desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->dst)); + } else { + struct talitos_ptr *link_tbl_ptr = + &edesc->link_tbl[edesc->src_nents + 1]; + + desc->ptr[4].j_extent |= DESC_PTR_LNKTBL_JUMP; + desc->ptr[4].ptr = cpu_to_be32((struct talitos_ptr *) + edesc->dma_link_tbl + + edesc->src_nents + 1); + sg_count = sg_to_link_tbl(areq->dst, sg_count, cryptlen, + link_tbl_ptr); + dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl, + edesc->dma_len, DMA_BIDIRECTIONAL); + } + + /* iv out */ + map_single_talitos_ptr(dev, &desc->ptr[5], ivsize, ctx->iv, 0, + DMA_FROM_DEVICE); + + /* last DWORD empty */ + desc->ptr[6].len = 0; + desc->ptr[6].ptr = 0; + desc->ptr[6].j_extent = 0; + + ret = talitos_submit(dev, desc, callback, areq); + if (ret != -EINPROGRESS) { + common_nonsnoop_unmap(dev, edesc, areq); + kfree(edesc); + } + return ret; +} + +static struct talitos_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request *areq) +{ + struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq); + struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher); + + return talitos_edesc_alloc(ctx->dev, areq->src, areq->dst, areq->nbytes, + 0, 0, areq->base.flags); +} + +static int ablkcipher_encrypt(struct ablkcipher_request *areq) +{ + struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq); + struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher); + struct talitos_edesc *edesc; + + /* allocate extended descriptor */ + edesc = ablkcipher_edesc_alloc(areq); + if (IS_ERR(edesc)) + return PTR_ERR(edesc); + + /* set encrypt */ + edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_MODE0_ENCRYPT; + + return common_nonsnoop(edesc, areq, NULL, ablkcipher_done); +} + +static int ablkcipher_decrypt(struct ablkcipher_request *areq) +{ + struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq); + struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher); + struct talitos_edesc *edesc; + + /* allocate extended descriptor */ + edesc = ablkcipher_edesc_alloc(areq); + if (IS_ERR(edesc)) + return PTR_ERR(edesc); + + edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND; + + return common_nonsnoop(edesc, areq, NULL, ablkcipher_done); +} + struct talitos_alg_template { struct crypto_alg alg; __be32 desc_hdr_template; @@ -1368,6 +1626,52 @@ static struct talitos_alg_template driver_algs[] = { DESC_HDR_MODE1_MDEU_INIT | DESC_HDR_MODE1_MDEU_PAD | DESC_HDR_MODE1_MDEU_MD5_HMAC, + }, + /* ABLKCIPHER algorithms. */ + { + .alg = { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-talitos", + .cra_blocksize = AES_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + CRYPTO_ALG_ASYNC, + .cra_type = &crypto_ablkcipher_type, + .cra_ablkcipher = { + .setkey = ablkcipher_setkey, + .encrypt = ablkcipher_encrypt, + .decrypt = ablkcipher_decrypt, + .geniv = "eseqiv", + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + } + }, + .desc_hdr_template = DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU | + DESC_HDR_SEL0_AESU | + DESC_HDR_MODE0_AESU_CBC, + }, + { + .alg = { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "cbc-3des-talitos", + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + CRYPTO_ALG_ASYNC, + .cra_type = &crypto_ablkcipher_type, + .cra_ablkcipher = { + .setkey = ablkcipher_setkey, + .encrypt = ablkcipher_encrypt, + .decrypt = ablkcipher_decrypt, + .geniv = "eseqiv", + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + } + }, + .desc_hdr_template = DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU | + DESC_HDR_SEL0_DEU | + DESC_HDR_MODE0_DEU_CBC | + DESC_HDR_MODE0_DEU_3DES, } }; From e938e4656b3ee32e046ee8293411a07be9d72eb8 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Sun, 29 Mar 2009 15:53:23 +0800 Subject: [PATCH 076/162] crypto: talitos - Whitespace/codingstyle/overrun lines cleanup no functional changes. Signed-off-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 103 ++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index a0b0a6319088..a073e6b6a3c8 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -341,7 +341,8 @@ static void flush_channel(struct device *dev, int ch, int error, int reset_ch) status = error; dma_unmap_single(dev, request->dma_desc, - sizeof(struct talitos_desc), DMA_BIDIRECTIONAL); + sizeof(struct talitos_desc), + DMA_BIDIRECTIONAL); /* copy entries so we can call callback outside lock */ saved_req.desc = request->desc; @@ -415,7 +416,8 @@ static struct talitos_desc *current_desc(struct device *dev, int ch) /* * user diagnostics; report root cause of error based on execution unit status */ -static void report_eu_error(struct device *dev, int ch, struct talitos_desc *desc) +static void report_eu_error(struct device *dev, int ch, + struct talitos_desc *desc) { struct talitos_private *priv = dev_get_drvdata(dev); int i; @@ -863,8 +865,8 @@ static void ipsec_esp_encrypt_done(struct device *dev, } static void ipsec_esp_decrypt_swauth_done(struct device *dev, - struct talitos_desc *desc, void *context, - int err) + struct talitos_desc *desc, + void *context, int err) { struct aead_request *req = context; struct talitos_edesc *edesc = @@ -895,8 +897,8 @@ static void ipsec_esp_decrypt_swauth_done(struct device *dev, } static void ipsec_esp_decrypt_hwauth_done(struct device *dev, - struct talitos_desc *desc, void *context, - int err) + struct talitos_desc *desc, + void *context, int err) { struct aead_request *req = context; struct talitos_edesc *edesc = @@ -905,10 +907,9 @@ static void ipsec_esp_decrypt_hwauth_done(struct device *dev, ipsec_esp_unmap(dev, edesc, req); /* check ICV auth status */ - if (!err) - if ((desc->hdr_lo & DESC_HDR_LO_ICCR1_MASK) != - DESC_HDR_LO_ICCR1_PASS) - err = -EBADMSG; + if (!err && ((desc->hdr_lo & DESC_HDR_LO_ICCR1_MASK) != + DESC_HDR_LO_ICCR1_PASS)) + err = -EBADMSG; kfree(edesc); @@ -996,10 +997,9 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, desc->ptr[4].len = cpu_to_be16(cryptlen); desc->ptr[4].j_extent = authsize; - sg_count = talitos_map_sg(dev, areq->src, - edesc->src_nents ? : 1, - (areq->src == areq->dst) ? DMA_BIDIRECTIONAL : - DMA_TO_DEVICE, + sg_count = talitos_map_sg(dev, areq->src, edesc->src_nents ? : 1, + (areq->src == areq->dst) ? DMA_BIDIRECTIONAL + : DMA_TO_DEVICE, edesc->src_is_chained); if (sg_count == 1) { @@ -1008,19 +1008,21 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, sg_link_tbl_len = cryptlen; if ((edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV) && - (edesc->desc.hdr & DESC_HDR_MODE0_ENCRYPT) == 0) { + (edesc->desc.hdr & DESC_HDR_MODE0_ENCRYPT) == 0) sg_link_tbl_len = cryptlen + authsize; - } + sg_count = sg_to_link_tbl(areq->src, sg_count, sg_link_tbl_len, &edesc->link_tbl[0]); if (sg_count > 1) { desc->ptr[4].j_extent |= DESC_PTR_LNKTBL_JUMP; desc->ptr[4].ptr = cpu_to_be32(edesc->dma_link_tbl); - dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl, - edesc->dma_len, DMA_BIDIRECTIONAL); + dma_sync_single_for_device(dev, edesc->dma_link_tbl, + edesc->dma_len, + DMA_BIDIRECTIONAL); } else { /* Only one segment now, so no link tbl needed */ - desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq->src)); + desc->ptr[4].ptr = cpu_to_be32(sg_dma_address(areq-> + src)); } } @@ -1028,12 +1030,11 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, desc->ptr[5].len = cpu_to_be16(cryptlen); desc->ptr[5].j_extent = authsize; - if (areq->src != areq->dst) { + if (areq->src != areq->dst) sg_count = talitos_map_sg(dev, areq->dst, edesc->dst_nents ? : 1, DMA_FROM_DEVICE, edesc->dst_is_chained); - } if (sg_count == 1) { desc->ptr[5].ptr = cpu_to_be32(sg_dma_address(areq->dst)); @@ -1078,7 +1079,6 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, return ret; } - /* * derive number of elements in scatterlist */ @@ -1191,8 +1191,6 @@ static int aead_encrypt(struct aead_request *req) return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_encrypt_done); } - - static int aead_decrypt(struct aead_request *req) { struct crypto_aead *authenc = crypto_aead_reqtfm(req); @@ -1211,38 +1209,38 @@ static int aead_decrypt(struct aead_request *req) return PTR_ERR(edesc); if ((priv->features & TALITOS_FTR_HW_AUTH_CHECK) && - (((!edesc->src_nents && !edesc->dst_nents) || - priv->features & TALITOS_FTR_SRC_LINK_TBL_LEN_INCLUDES_EXTENT))) { + ((!edesc->src_nents && !edesc->dst_nents) || + priv->features & TALITOS_FTR_SRC_LINK_TBL_LEN_INCLUDES_EXTENT)) { /* decrypt and check the ICV */ - edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND | + edesc->desc.hdr = ctx->desc_hdr_template | + DESC_HDR_DIR_INBOUND | DESC_HDR_MODE1_MDEU_CICV; /* reset integrity check result bits */ edesc->desc.hdr_lo = 0; - return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_decrypt_hwauth_done); + return ipsec_esp(edesc, req, NULL, 0, + ipsec_esp_decrypt_hwauth_done); - } else { - - /* Have to check the ICV with software */ - - edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND; - - /* stash incoming ICV for later cmp with ICV generated by the h/w */ - if (edesc->dma_len) - icvdata = &edesc->link_tbl[edesc->src_nents + - edesc->dst_nents + 2]; - else - icvdata = &edesc->link_tbl[0]; - - sg = sg_last(req->src, edesc->src_nents ? : 1); - - memcpy(icvdata, (char *)sg_virt(sg) + sg->length - ctx->authsize, - ctx->authsize); - - return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_decrypt_swauth_done); } + + /* Have to check the ICV with software */ + edesc->desc.hdr = ctx->desc_hdr_template | DESC_HDR_DIR_INBOUND; + + /* stash incoming ICV for later cmp with ICV generated by the h/w */ + if (edesc->dma_len) + icvdata = &edesc->link_tbl[edesc->src_nents + + edesc->dst_nents + 2]; + else + icvdata = &edesc->link_tbl[0]; + + sg = sg_last(req->src, edesc->src_nents ? : 1); + + memcpy(icvdata, (char *)sg_virt(sg) + sg->length - ctx->authsize, + ctx->authsize); + + return ipsec_esp(edesc, req, NULL, 0, ipsec_esp_decrypt_swauth_done); } static int aead_givencrypt(struct aead_givcrypt_request *req) @@ -1368,11 +1366,13 @@ static int common_nonsnoop(struct talitos_edesc *edesc, if (sg_count > 1) { desc->ptr[3].j_extent |= DESC_PTR_LNKTBL_JUMP; desc->ptr[3].ptr = cpu_to_be32(edesc->dma_link_tbl); - dma_sync_single_for_device(ctx->dev, edesc->dma_link_tbl, - edesc->dma_len, DMA_BIDIRECTIONAL); + dma_sync_single_for_device(dev, edesc->dma_link_tbl, + edesc->dma_len, + DMA_BIDIRECTIONAL); } else { /* Only one segment now, so no link tbl needed */ - desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq->src)); + desc->ptr[3].ptr = cpu_to_be32(sg_dma_address(areq-> + src)); } } @@ -1419,7 +1419,8 @@ static int common_nonsnoop(struct talitos_edesc *edesc, return ret; } -static struct talitos_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request *areq) +static struct talitos_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request * + areq) { struct crypto_ablkcipher *cipher = crypto_ablkcipher_reqtfm(areq); struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher); From 19bbbc635523703ece28409e59694d5b512b819e Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Sun, 29 Mar 2009 15:53:59 +0800 Subject: [PATCH 077/162] crypto: talitos - containerof related codingstyle no functional changes. Signed-off-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index a073e6b6a3c8..1cc1c411e551 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -841,13 +841,14 @@ static void ipsec_esp_encrypt_done(struct device *dev, int err) { struct aead_request *areq = context; - struct talitos_edesc *edesc = - container_of(desc, struct talitos_edesc, desc); struct crypto_aead *authenc = crypto_aead_reqtfm(areq); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); + struct talitos_edesc *edesc; struct scatterlist *sg; void *icvdata; + edesc = container_of(desc, struct talitos_edesc, desc); + ipsec_esp_unmap(dev, edesc, areq); /* copy the generated ICV to dst */ @@ -869,13 +870,14 @@ static void ipsec_esp_decrypt_swauth_done(struct device *dev, void *context, int err) { struct aead_request *req = context; - struct talitos_edesc *edesc = - container_of(desc, struct talitos_edesc, desc); struct crypto_aead *authenc = crypto_aead_reqtfm(req); struct talitos_ctx *ctx = crypto_aead_ctx(authenc); + struct talitos_edesc *edesc; struct scatterlist *sg; void *icvdata; + edesc = container_of(desc, struct talitos_edesc, desc); + ipsec_esp_unmap(dev, edesc, req); if (!err) { @@ -901,8 +903,9 @@ static void ipsec_esp_decrypt_hwauth_done(struct device *dev, void *context, int err) { struct aead_request *req = context; - struct talitos_edesc *edesc = - container_of(desc, struct talitos_edesc, desc); + struct talitos_edesc *edesc; + + edesc = container_of(desc, struct talitos_edesc, desc); ipsec_esp_unmap(dev, edesc, req); @@ -1308,8 +1311,9 @@ static void ablkcipher_done(struct device *dev, int err) { struct ablkcipher_request *areq = context; - struct talitos_edesc *edesc = - container_of(desc, struct talitos_edesc, desc); + struct talitos_edesc *edesc; + + edesc = container_of(desc, struct talitos_edesc, desc); common_nonsnoop_unmap(dev, edesc, areq); @@ -1686,12 +1690,14 @@ struct talitos_crypto_alg { static int talitos_cra_init(struct crypto_tfm *tfm) { struct crypto_alg *alg = tfm->__crt_alg; - struct talitos_crypto_alg *talitos_alg = - container_of(alg, struct talitos_crypto_alg, crypto_alg); + struct talitos_crypto_alg *talitos_alg; struct talitos_ctx *ctx = crypto_tfm_ctx(tfm); + talitos_alg = container_of(alg, struct talitos_crypto_alg, crypto_alg); + /* update context with ptr to dev */ ctx->dev = talitos_alg->dev; + /* copy descriptor header template value */ ctx->desc_hdr_template = talitos_alg->desc_hdr_template; From 962a9c99496f98041d14d64a9fdcf58050fefb4d Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Sun, 29 Mar 2009 15:54:30 +0800 Subject: [PATCH 078/162] crypto: talitos - Avoid unnecessary decrypt check the ICV check bit only gets set in decrypt entry points Signed-off-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 1cc1c411e551..c70775fd3ce2 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1010,8 +1010,7 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct aead_request *areq, } else { sg_link_tbl_len = cryptlen; - if ((edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV) && - (edesc->desc.hdr & DESC_HDR_MODE0_ENCRYPT) == 0) + if (edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV) sg_link_tbl_len = cryptlen + authsize; sg_count = sg_to_link_tbl(areq->src, sg_count, sg_link_tbl_len, From d1c8b0a7692e81b46550bcc493465ed10510cd33 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Apr 2009 14:14:37 +0800 Subject: [PATCH 079/162] crypto: padlock - Enable on x86_64 Almost everything stays the same, we need just to use the extended registers on the bit variant. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 2 +- drivers/crypto/padlock-aes.c | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 01afd758072f..39eedd431413 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -12,7 +12,7 @@ if CRYPTO_HW config CRYPTO_DEV_PADLOCK tristate "Support for VIA PadLock ACE" - depends on X86_32 && !UML + depends on !UML select CRYPTO_ALGAPI help Some VIA processors come with an integrated crypto engine diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index 856b3cc25583..87f92c39b5f0 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c @@ -154,7 +154,11 @@ static inline void padlock_reset_key(struct cword *cword) int cpu = raw_smp_processor_id(); if (cword != per_cpu(last_cword, cpu)) +#ifndef CONFIG_X86_64 asm volatile ("pushfl; popfl"); +#else + asm volatile ("pushfq; popfq"); +#endif } static inline void padlock_store_cword(struct cword *cword) @@ -208,10 +212,19 @@ static inline void padlock_xcrypt_ecb(const u8 *input, u8 *output, void *key, asm volatile ("test $1, %%cl;" "je 1f;" +#ifndef CONFIG_X86_64 "lea -1(%%ecx), %%eax;" "mov $1, %%ecx;" +#else + "lea -1(%%rcx), %%rax;" + "mov $1, %%rcx;" +#endif ".byte 0xf3,0x0f,0xa7,0xc8;" /* rep xcryptecb */ +#ifndef CONFIG_X86_64 "mov %%eax, %%ecx;" +#else + "mov %%rax, %%rcx;" +#endif "1:" ".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */ : "+S"(input), "+D"(output) From 2f8174187f409213e63c3589af163c627e8a182a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 22 Apr 2009 13:00:15 +0800 Subject: [PATCH 080/162] crypto: padlock - Restore dependency on x86 When we added 64-bit support to padlock the dependency on x86 was lost. This causes build failures on non-x86 architectures. Reported-by: Stephen Rothwell Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 39eedd431413..e748e55bd86b 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -12,7 +12,7 @@ if CRYPTO_HW config CRYPTO_DEV_PADLOCK tristate "Support for VIA PadLock ACE" - depends on !UML + depends on X86 && !UML select CRYPTO_ALGAPI help Some VIA processors come with an integrated crypto engine From e44a1b44c3a9794236fe038b89a0fbef5adcd523 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 4 May 2009 19:22:11 +0800 Subject: [PATCH 081/162] crypto: testmgr - Handle AEAD test vectors expected to fail verification Add infrastructure to tcrypt/testmgr to support handling ccm decryption test vectors that are expected to fail verification. Signed-off-by: Jarod Wilson Signed-off-by: Herbert Xu --- crypto/testmgr.c | 28 ++++++++++++++++++++++++++++ crypto/testmgr.h | 1 + 2 files changed, 29 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index bfee6e9f642d..84f96401b29a 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -363,6 +363,16 @@ static int test_aead(struct crypto_aead *tfm, int enc, switch (ret) { case 0: + if (template[i].novrfy) { + /* verification was supposed to fail */ + printk(KERN_ERR "alg: aead: %s failed " + "on test %d for %s: ret was 0, " + "expected -EBADMSG\n", + e, j, algo); + /* so really, we got a bad message */ + ret = -EBADMSG; + goto out; + } break; case -EINPROGRESS: case -EBUSY: @@ -372,6 +382,10 @@ static int test_aead(struct crypto_aead *tfm, int enc, INIT_COMPLETION(result.completion); break; } + case -EBADMSG: + if (template[i].novrfy) + /* verification failure was expected */ + continue; /* fall through */ default: printk(KERN_ERR "alg: aead: %s failed on test " @@ -481,6 +495,16 @@ static int test_aead(struct crypto_aead *tfm, int enc, switch (ret) { case 0: + if (template[i].novrfy) { + /* verification was supposed to fail */ + printk(KERN_ERR "alg: aead: %s failed " + "on chunk test %d for %s: ret " + "was 0, expected -EBADMSG\n", + e, j, algo); + /* so really, we got a bad message */ + ret = -EBADMSG; + goto out; + } break; case -EINPROGRESS: case -EBUSY: @@ -490,6 +514,10 @@ static int test_aead(struct crypto_aead *tfm, int enc, INIT_COMPLETION(result.completion); break; } + case -EBADMSG: + if (template[i].novrfy) + /* verification failure was expected */ + continue; /* fall through */ default: printk(KERN_ERR "alg: aead: %s failed on " diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 526f00a9c72f..b77b61dad262 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -62,6 +62,7 @@ struct aead_testvec { int np; int anp; unsigned char fail; + unsigned char novrfy; /* ccm dec verification failure expected */ unsigned char wk; /* weak key flag */ unsigned char klen; unsigned short ilen; From 5d667322a25ab4ecb91176db118fd663fee4da35 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 4 May 2009 19:23:40 +0800 Subject: [PATCH 082/162] crypto: testmgr - Add self-tests for rfc4309(ccm(aes)) Add an array of encryption and decryption + verification self-tests for rfc4309(ccm(aes)). Test vectors all come from sample FIPS CAVS files provided to Red Hat by a testing lab. Unfortunately, all the published sample vectors in RFC 3610 and NIST Special Publication 800-38C contain nonce lengths that the kernel's rfc4309 implementation doesn't support, so while using some public domain vectors would have been preferred, its not possible at this time. Signed-off-by: Jarod Wilson Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 4 + crypto/testmgr.c | 15 ++ crypto/testmgr.h | 370 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 389 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 50d1e35bbff4..0452036b1d45 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -667,6 +667,10 @@ static void do_test(int m) tcrypt_test("zlib"); break; + case 45: + tcrypt_test("rfc4309(ccm(aes))"); + break; + case 100: tcrypt_test("hmac(md5)"); break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 84f96401b29a..40c10789f7f3 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1878,6 +1878,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "rfc4309(ccm(aes))", + .test = alg_test_aead, + .suite = { + .aead = { + .enc = { + .vecs = aes_ccm_rfc4309_enc_tv_template, + .count = AES_CCM_4309_ENC_TEST_VECTORS + }, + .dec = { + .vecs = aes_ccm_rfc4309_dec_tv_template, + .count = AES_CCM_4309_DEC_TEST_VECTORS + } + } + } }, { .alg = "rmd128", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index b77b61dad262..5add65196a98 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -2848,6 +2848,8 @@ static struct cipher_testvec cast6_dec_tv_template[] = { #define AES_GCM_DEC_TEST_VECTORS 8 #define AES_CCM_ENC_TEST_VECTORS 7 #define AES_CCM_DEC_TEST_VECTORS 7 +#define AES_CCM_4309_ENC_TEST_VECTORS 7 +#define AES_CCM_4309_DEC_TEST_VECTORS 10 static struct cipher_testvec aes_enc_tv_template[] = { { /* From FIPS-197 */ @@ -5826,6 +5828,374 @@ static struct aead_testvec aes_ccm_dec_tv_template[] = { }, }; +/* + * rfc4309 refers to section 8 of rfc3610 for test vectors, but they all + * use a 13-byte nonce, we only support an 11-byte nonce. Similarly, all of + * Special Publication 800-38C's test vectors also use nonce lengths our + * implementation doesn't support. The following are taken from fips cavs + * fax files on hand at Red Hat. + * + * nb: actual key lengths are (klen - 3), the last 3 bytes are actually + * part of the nonce which combine w/the iv, but need to be input this way. + */ +static struct aead_testvec aes_ccm_rfc4309_enc_tv_template[] = { + { + .key = "\x83\xac\x54\x66\xc2\xeb\xe5\x05" + "\x2e\x01\xd1\xfc\x5d\x82\x66\x2e" + "\x96\xac\x59", + .klen = 19, + .iv = "\x30\x07\xa1\xe2\xa2\xc7\x55\x24", + .alen = 0, + .input = "\x19\xc8\x81\xf6\xe9\x86\xff\x93" + "\x0b\x78\x67\xe5\xbb\xb7\xfc\x6e" + "\x83\x77\xb3\xa6\x0c\x8c\x9f\x9c" + "\x35\x2e\xad\xe0\x62\xf9\x91\xa1", + .ilen = 32, + .result = "\xab\x6f\xe1\x69\x1d\x19\x99\xa8" + "\x92\xa0\xc4\x6f\x7e\xe2\x8b\xb1" + "\x70\xbb\x8c\xa6\x4c\x6e\x97\x8a" + "\x57\x2b\xbe\x5d\x98\xa6\xb1\x32" + "\xda\x24\xea\xd9\xa1\x39\x98\xfd" + "\xa4\xbe\xd9\xf2\x1a\x6d\x22\xa8", + .rlen = 48, + }, { + .key = "\x1e\x2c\x7e\x01\x41\x9a\xef\xc0" + "\x0d\x58\x96\x6e\x5c\xa2\x4b\xd3" + "\x4f\xa3\x19", + .klen = 19, + .iv = "\xd3\x01\x5a\xd8\x30\x60\x15\x56", + .assoc = "\xda\xe6\x28\x9c\x45\x2d\xfd\x63" + "\x5e\xda\x4c\xb6\xe6\xfc\xf9\xb7" + "\x0c\x56\xcb\xe4\xe0\x05\x7a\xe1" + "\x0a\x63\x09\x78\xbc\x2c\x55\xde", + .alen = 32, + .input = "\x87\xa3\x36\xfd\x96\xb3\x93\x78" + "\xa9\x28\x63\xba\x12\xa3\x14\x85" + "\x57\x1e\x06\xc9\x7b\x21\xef\x76" + "\x7f\x38\x7e\x8e\x29\xa4\x3e\x7e", + .ilen = 32, + .result = "\x8a\x1e\x11\xf0\x02\x6b\xe2\x19" + "\xfc\x70\xc4\x6d\x8e\xb7\x99\xab" + "\xc5\x4b\xa2\xac\xd3\xf3\x48\xff" + "\x3b\xb5\xce\x53\xef\xde\xbb\x02" + "\xa9\x86\x15\x6c\x13\xfe\xda\x0a" + "\x22\xb8\x29\x3d\xd8\x39\x9a\x23", + .rlen = 48, + }, { + .key = "\xf4\x6b\xc2\x75\x62\xfe\xb4\xe1" + "\xa3\xf0\xff\xdd\x4e\x4b\x12\x75" + "\x53\x14\x73\x66\x8d\x88\xf6\x80" + "\xa0\x20\x35", + .klen = 27, + .iv = "\x26\xf2\x21\x8d\x50\x20\xda\xe2", + .assoc = "\x5b\x9e\x13\x67\x02\x5e\xef\xc1" + "\x6c\xf9\xd7\x1e\x52\x8f\x7a\x47" + "\xe9\xd4\xcf\x20\x14\x6e\xf0\x2d" + "\xd8\x9e\x2b\x56\x10\x23\x56\xe7", + .alen = 32, + .ilen = 0, + .result = "\x36\xea\x7a\x70\x08\xdc\x6a\xbc" + "\xad\x0c\x7a\x63\xf6\x61\xfd\x9b", + .rlen = 16, + }, { + .key = "\x56\xdf\x5c\x8f\x26\x3f\x0e\x42" + "\xef\x7a\xd3\xce\xfc\x84\x60\x62" + "\xca\xb4\x40\xaf\x5f\xc9\xc9\x01" + "\xd6\x3c\x8c", + .klen = 27, + .iv = "\x86\x84\xb6\xcd\xef\x09\x2e\x94", + .assoc = "\x02\x65\x78\x3c\xe9\x21\x30\x91" + "\xb1\xb9\xda\x76\x9a\x78\x6d\x95" + "\xf2\x88\x32\xa3\xf2\x50\xcb\x4c" + "\xe3\x00\x73\x69\x84\x69\x87\x79", + .alen = 32, + .input = "\x9f\xd2\x02\x4b\x52\x49\x31\x3c" + "\x43\x69\x3a\x2d\x8e\x70\xad\x7e" + "\xe0\xe5\x46\x09\x80\x89\x13\xb2" + "\x8c\x8b\xd9\x3f\x86\xfb\xb5\x6b", + .ilen = 32, + .result = "\x39\xdf\x7c\x3c\x5a\x29\xb9\x62" + "\x5d\x51\xc2\x16\xd8\xbd\x06\x9f" + "\x9b\x6a\x09\x70\xc1\x51\x83\xc2" + "\x66\x88\x1d\x4f\x9a\xda\xe0\x1e" + "\xc7\x79\x11\x58\xe5\x6b\x20\x40" + "\x7a\xea\x46\x42\x8b\xe4\x6f\xe1", + .rlen = 48, + }, { + .key = "\xe0\x8d\x99\x71\x60\xd7\x97\x1a" + "\xbd\x01\x99\xd5\x8a\xdf\x71\x3a" + "\xd3\xdf\x24\x4b\x5e\x3d\x4b\x4e" + "\x30\x7a\xb9\xd8\x53\x0a\x5e\x2b" + "\x1e\x29\x91", + .klen = 35, + .iv = "\xad\x8e\xc1\x53\x0a\xcf\x2d\xbe", + .assoc = "\x19\xb6\x1f\x57\xc4\xf3\xf0\x8b" + "\x78\x2b\x94\x02\x29\x0f\x42\x27" + "\x6b\x75\xcb\x98\x34\x08\x7e\x79" + "\xe4\x3e\x49\x0d\x84\x8b\x22\x87", + .alen = 32, + .input = "\xe1\xd9\xd8\x13\xeb\x3a\x75\x3f" + "\x9d\xbd\x5f\x66\xbe\xdc\xbb\x66" + "\xbf\x17\x99\x62\x4a\x39\x27\x1f" + "\x1d\xdc\x24\xae\x19\x2f\x98\x4c", + .ilen = 32, + .result = "\x19\xb8\x61\x33\x45\x2b\x43\x96" + "\x6f\x51\xd0\x20\x30\x7d\x9b\xc6" + "\x26\x3d\xf8\xc9\x65\x16\xa8\x9f" + "\xf0\x62\x17\x34\xf2\x1e\x8d\x75" + "\x4e\x13\xcc\xc0\xc3\x2a\x54\x2d", + .rlen = 40, + }, { + .key = "\x7c\xc8\x18\x3b\x8d\x99\xe0\x7c" + "\x45\x41\xb8\xbd\x5c\xa7\xc2\x32" + "\x8a\xb8\x02\x59\xa4\xfe\xa9\x2c" + "\x09\x75\x9a\x9b\x3c\x9b\x27\x39" + "\xf9\xd9\x4e", + .klen = 35, + .iv = "\x63\xb5\x3d\x9d\x43\xf6\x1e\x50", + .assoc = "\x57\xf5\x6b\x8b\x57\x5c\x3d\x3b" + "\x13\x02\x01\x0c\x83\x4c\x96\x35" + "\x8e\xd6\x39\xcf\x7d\x14\x9b\x94" + "\xb0\x39\x36\xe6\x8f\x57\xe0\x13", + .alen = 32, + .input = "\x3b\x6c\x29\x36\xb6\xef\x07\xa6" + "\x83\x72\x07\x4f\xcf\xfa\x66\x89" + "\x5f\xca\xb1\xba\xd5\x8f\x2c\x27" + "\x30\xdb\x75\x09\x93\xd4\x65\xe4", + .ilen = 32, + .result = "\xb0\x88\x5a\x33\xaa\xe5\xc7\x1d" + "\x85\x23\xc7\xc6\x2f\xf4\x1e\x3d" + "\xcc\x63\x44\x25\x07\x78\x4f\x9e" + "\x96\xb8\x88\xeb\xbc\x48\x1f\x06" + "\x39\xaf\x39\xac\xd8\x4a\x80\x39" + "\x7b\x72\x8a\xf7", + .rlen = 44, + }, { + .key = "\xab\xd0\xe9\x33\x07\x26\xe5\x83" + "\x8c\x76\x95\xd4\xb6\xdc\xf3\x46" + "\xf9\x8f\xad\xe3\x02\x13\x83\x77" + "\x3f\xb0\xf1\xa1\xa1\x22\x0f\x2b" + "\x24\xa7\x8b", + .klen = 35, + .iv = "\x07\xcb\xcc\x0e\xe6\x33\xbf\xf5", + .assoc = "\xd4\xdb\x30\x1d\x03\xfe\xfd\x5f" + "\x87\xd4\x8c\xb6\xb6\xf1\x7a\x5d" + "\xab\x90\x65\x8d\x8e\xca\x4d\x4f" + "\x16\x0c\x40\x90\x4b\xc7\x36\x73", + .alen = 32, + .input = "\xf5\xc6\x7d\x48\xc1\xb7\xe6\x92" + "\x97\x5a\xca\xc4\xa9\x6d\xf9\x3d" + "\x6c\xde\xbc\xf1\x90\xea\x6a\xb2" + "\x35\x86\x36\xaf\x5c\xfe\x4b\x3a", + .ilen = 32, + .result = "\x83\x6f\x40\x87\x72\xcf\xc1\x13" + "\xef\xbb\x80\x21\x04\x6c\x58\x09" + "\x07\x1b\xfc\xdf\xc0\x3f\x5b\xc7" + "\xe0\x79\xa8\x6e\x71\x7c\x3f\xcf" + "\x5c\xda\xb2\x33\xe5\x13\xe2\x0d" + "\x74\xd1\xef\xb5\x0f\x3a\xb5\xf8", + .rlen = 48, + }, +}; + +static struct aead_testvec aes_ccm_rfc4309_dec_tv_template[] = { + { + .key = "\xab\x2f\x8a\x74\xb7\x1c\xd2\xb1" + "\xff\x80\x2e\x48\x7d\x82\xf8\xb9" + "\xc6\xfb\x7d", + .klen = 19, + .iv = "\x80\x0d\x13\xab\xd8\xa6\xb2\xd8", + .alen = 0, + .input = "\xd5\xe8\x93\x9f\xc7\x89\x2e\x2b", + .ilen = 8, + .result = "\x00", + .rlen = 0, + .novrfy = 1, + }, { + .key = "\xab\x2f\x8a\x74\xb7\x1c\xd2\xb1" + "\xff\x80\x2e\x48\x7d\x82\xf8\xb9" + "\xaf\x94\x87", + .klen = 19, + .iv = "\x78\x35\x82\x81\x7f\x88\x94\x68", + .alen = 0, + .input = "\x41\x3c\xb8\x87\x73\xcb\xf3\xf3", + .ilen = 8, + .result = "\x00", + .rlen = 0, + }, { + .key = "\x61\x0e\x8c\xae\xe3\x23\xb6\x38" + "\x76\x1c\xf6\x3a\x67\xa3\x9c\xd8" + "\xc6\xfb\x7d", + .klen = 19, + .iv = "\x80\x0d\x13\xab\xd8\xa6\xb2\xd8", + .assoc = "\xf3\x94\x87\x78\x35\x82\x81\x7f" + "\x88\x94\x68\xb1\x78\x6b\x2b\xd6" + "\x04\x1f\x4e\xed\x78\xd5\x33\x66" + "\xd8\x94\x99\x91\x81\x54\x62\x57", + .alen = 32, + .input = "\xf0\x7c\x29\x02\xae\x1c\x2f\x55" + "\xd0\xd1\x3d\x1a\xa3\x6d\xe4\x0a" + "\x86\xb0\x87\x6b\x62\x33\x8c\x34" + "\xce\xab\x57\xcc\x79\x0b\xe0\x6f" + "\x5c\x3e\x48\x1f\x6c\x46\xf7\x51" + "\x8b\x84\x83\x2a\xc1\x05\xb8\xc5", + .ilen = 48, + .result = "\x50\x82\x3e\x07\xe2\x1e\xb6\xfb" + "\x33\xe4\x73\xce\xd2\xfb\x95\x79" + "\xe8\xb4\xb5\x77\x11\x10\x62\x6f" + "\x6a\x82\xd1\x13\xec\xf5\xd0\x48", + .rlen = 32, + .novrfy = 1, + }, { + .key = "\x61\x0e\x8c\xae\xe3\x23\xb6\x38" + "\x76\x1c\xf6\x3a\x67\xa3\x9c\xd8" + "\x05\xe0\xc9", + .klen = 19, + .iv = "\x0f\xed\x34\xea\x97\xd4\x3b\xdf", + .assoc = "\x49\x5c\x50\x1f\x1d\x94\xcc\x81" + "\xba\xb7\xb6\x03\xaf\xa5\xc1\xa1" + "\xd8\x5c\x42\x68\xe0\x6c\xda\x89" + "\x05\xac\x56\xac\x1b\x2a\xd3\x86", + .alen = 32, + .input = "\x39\xbe\x7d\x15\x62\x77\xf3\x3c" + "\xad\x83\x52\x6d\x71\x03\x25\x1c" + "\xed\x81\x3a\x9a\x16\x7d\x19\x80" + "\x72\x04\x72\xd0\xf6\xff\x05\x0f" + "\xb7\x14\x30\x00\x32\x9e\xa0\xa6" + "\x9e\x5a\x18\xa1\xb8\xfe\xdb\xd3", + .ilen = 48, + .result = "\x75\x05\xbe\xc2\xd9\x1e\xde\x60" + "\x47\x3d\x8c\x7d\xbd\xb5\xd9\xb7" + "\xf2\xae\x61\x05\x8f\x82\x24\x3f" + "\x9c\x67\x91\xe1\x38\x4f\xe4\x0c", + .rlen = 32, + }, { + .key = "\x39\xbb\xa7\xbe\x59\x97\x9e\x73" + "\xa2\xbc\x6b\x98\xd7\x75\x7f\xe3" + "\xa4\x48\x93\x39\x26\x71\x4a\xc6" + "\xee\x49\x83", + .klen = 27, + .iv = "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e", + .assoc = "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1" + "\x58\x7c\xf2\x5c\x6d\x39\x0a\x64" + "\xa4\xf0\x13\x05\xd1\x77\x99\x67" + "\x11\xc4\xc6\xdb\x00\x56\x36\x61", + .alen = 32, + .input = "\x71\x99\xfa\xf4\x44\x12\x68\x9b", + .ilen = 8, + .result = "\x00", + .rlen = 0, + }, { + .key = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7" + "\x96\xe5\xc5\x68\xaa\x95\x35\xe0" + "\x29\xa0\xba\x9e\x48\x78\xd1\xba" + "\xee\x49\x83", + .klen = 27, + .iv = "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e", + .assoc = "\x44\xa6\x2c\x05\xe9\xe1\x43\xb1" + "\x58\x7c\xf2\x5c\x6d\x39\x0a\x64" + "\xa4\xf0\x13\x05\xd1\x77\x99\x67" + "\x11\xc4\xc6\xdb\x00\x56\x36\x61", + .alen = 32, + .input = "\xfb\xe5\x5d\x34\xbe\xe5\xe8\xe7" + "\x5a\xef\x2f\xbf\x1f\x7f\xd4\xb2" + "\x66\xca\x61\x1e\x96\x7a\x61\xb3" + "\x1c\x16\x45\x52\xba\x04\x9c\x9f" + "\xb1\xd2\x40\xbc\x52\x7c\x6f\xb1", + .ilen = 40, + .result = "\x85\x34\x66\x42\xc8\x92\x0f\x36" + "\x58\xe0\x6b\x91\x3c\x98\x5c\xbb" + "\x0a\x85\xcc\x02\xad\x7a\x96\xe9" + "\x65\x43\xa4\xc3\x0f\xdc\x55\x81", + .rlen = 32, + }, { + .key = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7" + "\x96\xe5\xc5\x68\xaa\x95\x35\xe0" + "\x29\xa0\xba\x9e\x48\x78\xd1\xba" + "\xd1\xfc\x57", + .klen = 27, + .iv = "\x9c\xfe\xb8\x9c\xad\x71\xaa\x1f", + .assoc = "\x86\x67\xa5\xa9\x14\x5f\x0d\xc6" + "\xff\x14\xc7\x44\xbf\x6c\x3a\xc3" + "\xff\xb6\x81\xbd\xe2\xd5\x06\xc7" + "\x3c\xa1\x52\x13\x03\x8a\x23\x3a", + .alen = 32, + .input = "\x3f\x66\xb0\x9d\xe5\x4b\x38\x00" + "\xc6\x0e\x6e\xe5\xd6\x98\xa6\x37" + "\x8c\x26\x33\xc6\xb2\xa2\x17\xfa" + "\x64\x19\xc0\x30\xd7\xfc\x14\x6b" + "\xe3\x33\xc2\x04\xb0\x37\xbe\x3f" + "\xa9\xb4\x2d\x68\x03\xa3\x44\xef", + .ilen = 48, + .result = "\x02\x87\x4d\x28\x80\x6e\xb2\xed" + "\x99\x2a\xa8\xca\x04\x25\x45\x90" + "\x1d\xdd\x5a\xd9\xe4\xdb\x9c\x9c" + "\x49\xe9\x01\xfe\xa7\x80\x6d\x6b", + .rlen = 32, + .novrfy = 1, + }, { + .key = "\xa4\x4b\x54\x29\x0a\xb8\x6d\x01" + "\x5b\x80\x2a\xcf\x25\xc4\xb7\x5c" + "\x20\x2c\xad\x30\xc2\x2b\x41\xfb" + "\x0e\x85\xbc\x33\xad\x0f\x2b\xff" + "\xee\x49\x83", + .klen = 35, + .iv = "\xe9\xa9\xff\xe9\x57\xba\xfd\x9e", + .alen = 0, + .input = "\x1f\xb8\x8f\xa3\xdd\x54\x00\xf2", + .ilen = 8, + .result = "\x00", + .rlen = 0, + }, { + .key = "\x39\xbb\xa7\xbe\x59\x97\x9e\x73" + "\xa2\xbc\x6b\x98\xd7\x75\x7f\xe3" + "\xa4\x48\x93\x39\x26\x71\x4a\xc6" + "\xae\x8f\x11\x4c\xc2\x9c\x4a\xbb" + "\x85\x34\x66", + .klen = 35, + .iv = "\x42\xc8\x92\x0f\x36\x58\xe0\x6b", + .alen = 0, + .input = "\x48\x01\x5e\x02\x24\x04\x66\x47" + "\xa1\xea\x6f\xaf\xe8\xfc\xfb\xdd" + "\xa5\xa9\x87\x8d\x84\xee\x2e\x77" + "\xbb\x86\xb9\xf5\x5c\x6c\xff\xf6" + "\x72\xc3\x8e\xf7\x70\xb1\xb2\x07" + "\xbc\xa8\xa3\xbd\x83\x7c\x1d\x2a", + .ilen = 48, + .result = "\xdc\x56\xf2\x71\xb0\xb1\xa0\x6c" + "\xf0\x97\x3a\xfb\x6d\xe7\x32\x99" + "\x3e\xaf\x70\x5e\xb2\x4d\xea\x39" + "\x89\xd4\x75\x7a\x63\xb1\xda\x93", + .rlen = 32, + .novrfy = 1, + }, { + .key = "\x58\x5d\xa0\x96\x65\x1a\x04\xd7" + "\x96\xe5\xc5\x68\xaa\x95\x35\xe0" + "\x29\xa0\xba\x9e\x48\x78\xd1\xba" + "\x0d\x1a\x53\x3b\xb5\xe3\xf8\x8b" + "\xcf\x76\x3f", + .klen = 35, + .iv = "\xd9\x95\x75\x8f\x44\x89\x40\x7b", + .assoc = "\x8f\x86\x6c\x4d\x1d\xc5\x39\x88" + "\xc8\xf3\x5c\x52\x10\x63\x6f\x2b" + "\x8a\x2a\xc5\x6f\x30\x23\x58\x7b" + "\xfb\x36\x03\x11\xb4\xd9\xf2\xfe", + .alen = 32, + .input = "\x48\x58\xd6\xf3\xad\x63\x58\xbf" + "\xae\xc7\x5e\xae\x83\x8f\x7b\xe4" + "\x78\x5c\x4c\x67\x71\x89\x94\xbf" + "\x47\xf1\x63\x7e\x1c\x59\xbd\xc5" + "\x7f\x44\x0a\x0c\x01\x18\x07\x92" + "\xe1\xd3\x51\xce\x32\x6d\x0c\x5b", + .ilen = 48, + .result = "\xc2\x54\xc8\xde\x78\x87\x77\x40" + "\x49\x71\xe4\xb7\xe7\xcb\x76\x61" + "\x0a\x41\xb9\xe9\xc0\x76\x54\xab" + "\x04\x49\x3b\x19\x93\x57\x25\x5d", + .rlen = 32, + }, +}; + /* Cast5 test vectors from RFC 2144 */ #define CAST5_ENC_TEST_VECTORS 3 #define CAST5_DEC_TEST_VECTORS 3 From 7647d6ce2077d9e1c3d72359f6b4492be129cfe8 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 4 May 2009 19:44:50 +0800 Subject: [PATCH 083/162] crypto: testmgr - Add infrastructure for ansi_cprng self-tests Add some necessary infrastructure to make it possible to run self-tests for ansi_cprng. The bits are likely very specific to the ANSI X9.31 CPRNG in AES mode, and thus perhaps should be named more specifically if/when we grow additional CPRNG support... Successfully tested against the cryptodev-2.6 tree and a Red Hat Enterprise Linux 5.x kernel with the follow-on patch that adds the actual test vectors. Signed-off-by: Jarod Wilson Acked-by: Neil Horman Signed-off-by: Herbert Xu --- crypto/testmgr.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ crypto/testmgr.h | 12 +++++++ 2 files changed, 101 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 40c10789f7f3..adc54cfd39df 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "testmgr.h" @@ -84,6 +85,11 @@ struct hash_test_suite { unsigned int count; }; +struct cprng_test_suite { + struct cprng_testvec *vecs; + unsigned int count; +}; + struct alg_test_desc { const char *alg; int (*test)(const struct alg_test_desc *desc, const char *driver, @@ -95,6 +101,7 @@ struct alg_test_desc { struct comp_test_suite comp; struct pcomp_test_suite pcomp; struct hash_test_suite hash; + struct cprng_test_suite cprng; } suite; }; @@ -1089,6 +1096,68 @@ static int test_pcomp(struct crypto_pcomp *tfm, return 0; } + +static int test_cprng(struct crypto_rng *tfm, struct cprng_testvec *template, + unsigned int tcount) +{ + const char *algo = crypto_tfm_alg_driver_name(crypto_rng_tfm(tfm)); + int err, i, j, seedsize; + u8 *seed; + char result[32]; + + seedsize = crypto_rng_seedsize(tfm); + + seed = kmalloc(seedsize, GFP_KERNEL); + if (!seed) { + printk(KERN_ERR "alg: cprng: Failed to allocate seed space " + "for %s\n", algo); + return -ENOMEM; + } + + for (i = 0; i < tcount; i++) { + memset(result, 0, 32); + + memcpy(seed, template[i].v, template[i].vlen); + memcpy(seed + template[i].vlen, template[i].key, + template[i].klen); + memcpy(seed + template[i].vlen + template[i].klen, + template[i].dt, template[i].dtlen); + + err = crypto_rng_reset(tfm, seed, seedsize); + if (err) { + printk(KERN_ERR "alg: cprng: Failed to reset rng " + "for %s\n", algo); + goto out; + } + + for (j = 0; j < template[i].loops; j++) { + err = crypto_rng_get_bytes(tfm, result, + template[i].rlen); + if (err != template[i].rlen) { + printk(KERN_ERR "alg: cprng: Failed to obtain " + "the correct amount of random data for " + "%s (requested %d, got %d)\n", algo, + template[i].rlen, err); + goto out; + } + } + + err = memcmp(result, template[i].result, + template[i].rlen); + if (err) { + printk(KERN_ERR "alg: cprng: Test %d failed for %s\n", + i, algo); + hexdump(result, template[i].rlen); + err = -EINVAL; + goto out; + } + } + +out: + kfree(seed); + return err; +} + static int alg_test_aead(const struct alg_test_desc *desc, const char *driver, u32 type, u32 mask) { @@ -1288,6 +1357,26 @@ out: return err; } +static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver, + u32 type, u32 mask) +{ + struct crypto_rng *rng; + int err; + + rng = crypto_alloc_rng(driver, type, mask); + if (IS_ERR(rng)) { + printk(KERN_ERR "alg: cprng: Failed to load transform for %s: " + "%ld\n", driver, PTR_ERR(rng)); + return PTR_ERR(rng); + } + + err = test_cprng(rng, desc->suite.cprng.vecs, desc->suite.cprng.count); + + crypto_free_rng(rng); + + return err; +} + /* Please keep this list sorted by algorithm name. */ static const struct alg_test_desc alg_test_descs[] = { { diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 5add65196a98..13d5a61d0e77 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -70,6 +70,18 @@ struct aead_testvec { unsigned short rlen; }; +struct cprng_testvec { + char *key; + char *dt; + char *v; + char *result; + unsigned char klen; + unsigned short dtlen; + unsigned short vlen; + unsigned short rlen; + unsigned short loops; +}; + static char zeroed_string[48]; /* From e08ca2da39db22da569dc23578103cdc942fe3ac Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 4 May 2009 19:46:29 +0800 Subject: [PATCH 084/162] crypto: testmgr - Add ansi_cprng test vectors Add ANSI X9.31 Continuous Pseudo-Random Number Generator (AES mode), aka 'ansi_cprng' test vectors, taken from Appendix B.2.9 and B.2.10 of the NIST RNGVS document, found here: http://csrc.nist.gov/groups/STM/cavp/documents/rng/RNGVS.pdf Successfully tested against both the cryptodev-2.6 tree and a Red Hat Enterprise Linux 5.4 kernel, via 'modprobe tcrypt mode=150'. The selection of 150 was semi-arbitrary, didn't seem like it should go any place in particular, so I started a new range for rng tests. Signed-off-by: Jarod Wilson Acked-by: Neil Horman Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 4 ++ crypto/testmgr.c | 9 +++++ crypto/testmgr.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 0452036b1d45..ea3b8a8db721 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -707,6 +707,10 @@ static void do_test(int m) tcrypt_test("hmac(rmd160)"); break; + case 150: + tcrypt_test("ansi_cprng"); + break; + case 200: test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, speed_template_16_24_32); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index adc54cfd39df..5183ec5a4517 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1380,6 +1380,15 @@ static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver, /* Please keep this list sorted by algorithm name. */ static const struct alg_test_desc alg_test_descs[] = { { + .alg = "ansi_cprng", + .test = alg_test_cprng, + .suite = { + .cprng = { + .vecs = ansi_cprng_aes_tv_template, + .count = ANSI_CPRNG_AES_TEST_VECTORS + } + } + }, { .alg = "cbc(aes)", .test = alg_test_skcipher, .suite = { diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 13d5a61d0e77..c1c709b57ddb 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -6208,6 +6208,102 @@ static struct aead_testvec aes_ccm_rfc4309_dec_tv_template[] = { }, }; +/* + * ANSI X9.31 Continuous Pseudo-Random Number Generator (AES mode) + * test vectors, taken from Appendix B.2.9 and B.2.10: + * http://csrc.nist.gov/groups/STM/cavp/documents/rng/RNGVS.pdf + * Only AES-128 is supported at this time. + */ +#define ANSI_CPRNG_AES_TEST_VECTORS 6 + +static struct cprng_testvec ansi_cprng_aes_tv_template[] = { + { + .key = "\xf3\xb1\x66\x6d\x13\x60\x72\x42" + "\xed\x06\x1c\xab\xb8\xd4\x62\x02", + .klen = 16, + .dt = "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62" + "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xf9", + .dtlen = 16, + .v = "\x80\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .vlen = 16, + .result = "\x59\x53\x1e\xd1\x3b\xb0\xc0\x55" + "\x84\x79\x66\x85\xc1\x2f\x76\x41", + .rlen = 16, + .loops = 1, + }, { + .key = "\xf3\xb1\x66\x6d\x13\x60\x72\x42" + "\xed\x06\x1c\xab\xb8\xd4\x62\x02", + .klen = 16, + .dt = "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62" + "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfa", + .dtlen = 16, + .v = "\xc0\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .vlen = 16, + .result = "\x7c\x22\x2c\xf4\xca\x8f\xa2\x4c" + "\x1c\x9c\xb6\x41\xa9\xf3\x22\x0d", + .rlen = 16, + .loops = 1, + }, { + .key = "\xf3\xb1\x66\x6d\x13\x60\x72\x42" + "\xed\x06\x1c\xab\xb8\xd4\x62\x02", + .klen = 16, + .dt = "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62" + "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfb", + .dtlen = 16, + .v = "\xe0\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .vlen = 16, + .result = "\x8a\xaa\x00\x39\x66\x67\x5b\xe5" + "\x29\x14\x28\x81\xa9\x4d\x4e\xc7", + .rlen = 16, + .loops = 1, + }, { + .key = "\xf3\xb1\x66\x6d\x13\x60\x72\x42" + "\xed\x06\x1c\xab\xb8\xd4\x62\x02", + .klen = 16, + .dt = "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62" + "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfc", + .dtlen = 16, + .v = "\xf0\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .vlen = 16, + .result = "\x88\xdd\xa4\x56\x30\x24\x23\xe5" + "\xf6\x9d\xa5\x7e\x7b\x95\xc7\x3a", + .rlen = 16, + .loops = 1, + }, { + .key = "\xf3\xb1\x66\x6d\x13\x60\x72\x42" + "\xed\x06\x1c\xab\xb8\xd4\x62\x02", + .klen = 16, + .dt = "\xe6\xb3\xbe\x78\x2a\x23\xfa\x62" + "\xd7\x1d\x4a\xfb\xb0\xe9\x22\xfd", + .dtlen = 16, + .v = "\xf8\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .vlen = 16, + .result = "\x05\x25\x92\x46\x61\x79\xd2\xcb" + "\x78\xc4\x0b\x14\x0a\x5a\x9a\xc8", + .rlen = 16, + .loops = 1, + }, { /* Monte Carlo Test */ + .key = "\x9f\x5b\x51\x20\x0b\xf3\x34\xb5" + "\xd8\x2b\xe8\xc3\x72\x55\xc8\x48", + .klen = 16, + .dt = "\x63\x76\xbb\xe5\x29\x02\xba\x3b" + "\x67\xc9\x25\xfa\x70\x1f\x11\xac", + .dtlen = 16, + .v = "\x57\x2c\x8e\x76\x87\x26\x47\x97" + "\x7e\x74\xfb\xdd\xc4\x95\x01\xd1", + .vlen = 16, + .result = "\x48\xe9\xbd\x0d\x06\xee\x18\xfb" + "\xe4\x57\x90\xd5\xc3\xfc\x9b\x73", + .rlen = 16, + .loops = 10000, + }, +}; + /* Cast5 test vectors from RFC 2144 */ #define CAST5_ENC_TEST_VECTORS 3 #define CAST5_DEC_TEST_VECTORS 3 From 941fb3287c0c0d84000b669db5450ac4886da640 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 4 May 2009 19:49:23 +0800 Subject: [PATCH 085/162] crypto: testmgr - Catch base cipher self-test failures in fips mode Signed-off-by: Jarod Wilson Signed-off-by: Herbert Xu --- crypto/testmgr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 5183ec5a4517..e3f9973abbdc 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -2220,7 +2220,8 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) if (i < 0) goto notest; - return alg_test_cipher(alg_test_descs + i, driver, type, mask); + rc = alg_test_cipher(alg_test_descs + i, driver, type, mask); + goto test_done; } i = alg_find_test(alg); @@ -2229,6 +2230,7 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) rc = alg_test_descs[i].test(alg_test_descs + i, driver, type, mask); +test_done: if (fips_enabled && rc) panic("%s: %s alg self test failed in fips mode!\n", driver, alg); From 29ecd4ab3d3aa8bb231361937165dfbbbc534e9a Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 4 May 2009 19:51:17 +0800 Subject: [PATCH 086/162] crypto: testmgr - Print self-test pass notices in fips mode According to our FIPS CAVS testing lab guru, when we're in fips mode, we must print out notices of successful self-test completion for every alg to be compliant. New and improved v2, without strncmp crap. Doesn't need to touch a flag though, due to not moving the notest label around anymore. Applies atop '[PATCH v2] crypto: catch base cipher self-test failures in fips mode'. Personally, I wouldn't mind seeing this info printed out regardless of whether or not we're in fips mode, I think its useful info, but will stick with only in fips mode for now. Signed-off-by: Jarod Wilson Signed-off-by: Herbert Xu --- crypto/testmgr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index e3f9973abbdc..e76af78d2fa0 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -2234,6 +2234,10 @@ test_done: if (fips_enabled && rc) panic("%s: %s alg self test failed in fips mode!\n", driver, alg); + if (fips_enabled && !rc) + printk(KERN_INFO "alg: self-tests for %s (%s) passed\n", + driver, alg); + return rc; notest: From f8b0d4d09dc9d0a73fcdcf6c2724650529ec417d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 6 May 2009 14:15:47 +0800 Subject: [PATCH 087/162] crypto: testmgr - Dynamically allocate xbuf and axbuf We currently allocate temporary memory that is used for testing statically. This renders the testing engine non-reentrant. As algorithms may nest, i.e., one may construct another in order to carry out a part of its operation, this is unacceptable. For example, it has been reported that an AEAD implementation allocates a cipher in its setkey function, which causes it to fail during testing as the temporary memory is overwritten. This patch replaces the static memory with dynamically allocated buffers. We need a maximum of 16 pages so this slightly increases the chances of an algorithm failing due to memory shortage. However, as testing usually occurs at registration, this shouldn't be a big problem. Reported-by: Shasi Pulijala Signed-off-by: Herbert Xu --- crypto/algboss.c | 18 +------- crypto/internal.h | 3 -- crypto/testmgr.c | 108 +++++++++++++++++++++++++--------------------- 3 files changed, 61 insertions(+), 68 deletions(-) diff --git a/crypto/algboss.c b/crypto/algboss.c index 6906f92aeac0..9908dd830c26 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -280,29 +280,13 @@ static struct notifier_block cryptomgr_notifier = { static int __init cryptomgr_init(void) { - int err; - - err = testmgr_init(); - if (err) - return err; - - err = crypto_register_notifier(&cryptomgr_notifier); - if (err) - goto free_testmgr; - - return 0; - -free_testmgr: - testmgr_exit(); - return err; + return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); - - testmgr_exit(); } subsys_initcall(cryptomgr_init); diff --git a/crypto/internal.h b/crypto/internal.h index fc76e1f37fc3..113579a82dff 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -121,9 +121,6 @@ int crypto_register_notifier(struct notifier_block *nb); int crypto_unregister_notifier(struct notifier_block *nb); int crypto_probing_notify(unsigned long val, void *v); -int __init testmgr_init(void); -void testmgr_exit(void); - static inline void crypto_alg_put(struct crypto_alg *alg) { if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index e76af78d2fa0..c5550943411d 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -107,9 +107,6 @@ struct alg_test_desc { static unsigned int IDX[8] = { IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 }; -static char *xbuf[XBUFSIZE]; -static char *axbuf[XBUFSIZE]; - static void hexdump(unsigned char *buf, unsigned int len) { print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET, @@ -128,6 +125,33 @@ static void tcrypt_complete(struct crypto_async_request *req, int err) complete(&res->completion); } +static int testmgr_alloc_buf(char *buf[XBUFSIZE]) +{ + int i; + + for (i = 0; i < XBUFSIZE; i++) { + buf[i] = (void *)__get_free_page(GFP_KERNEL); + if (!buf[i]) + goto err_free_buf; + } + + return 0; + +err_free_buf: + while (i-- > 0) + free_page((unsigned long)buf[i]); + + return -ENOMEM; +} + +static void testmgr_free_buf(char *buf[XBUFSIZE]) +{ + int i; + + for (i = 0; i < XBUFSIZE; i++) + free_page((unsigned long)buf[i]); +} + static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, unsigned int tcount) { @@ -137,8 +161,12 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, char result[64]; struct ahash_request *req; struct tcrypt_result tresult; - int ret; void *hash_buff; + char *xbuf[XBUFSIZE]; + int ret = -ENOMEM; + + if (testmgr_alloc_buf(xbuf)) + goto out_nobuf; init_completion(&tresult.completion); @@ -146,7 +174,6 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, if (!req) { printk(KERN_ERR "alg: hash: Failed to allocate request for " "%s\n", algo); - ret = -ENOMEM; goto out_noreq; } ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -272,6 +299,8 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, out: ahash_request_free(req); out_noreq: + testmgr_free_buf(xbuf); +out_nobuf: return ret; } @@ -280,7 +309,7 @@ static int test_aead(struct crypto_aead *tfm, int enc, { const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); unsigned int i, j, k, n, temp; - int ret = 0; + int ret = -ENOMEM; char *q; char *key; struct aead_request *req; @@ -292,6 +321,13 @@ static int test_aead(struct crypto_aead *tfm, int enc, void *input; void *assoc; char iv[MAX_IVLEN]; + char *xbuf[XBUFSIZE]; + char *axbuf[XBUFSIZE]; + + if (testmgr_alloc_buf(xbuf)) + goto out_noxbuf; + if (testmgr_alloc_buf(axbuf)) + goto out_noaxbuf; if (enc == ENCRYPT) e = "encryption"; @@ -304,7 +340,6 @@ static int test_aead(struct crypto_aead *tfm, int enc, if (!req) { printk(KERN_ERR "alg: aead: Failed to allocate request for " "%s\n", algo); - ret = -ENOMEM; goto out; } @@ -581,6 +616,10 @@ static int test_aead(struct crypto_aead *tfm, int enc, out: aead_request_free(req); + testmgr_free_buf(axbuf); +out_noaxbuf: + testmgr_free_buf(xbuf); +out_noxbuf: return ret; } @@ -589,10 +628,14 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, { const char *algo = crypto_tfm_alg_driver_name(crypto_cipher_tfm(tfm)); unsigned int i, j, k; - int ret; char *q; const char *e; void *data; + char *xbuf[XBUFSIZE]; + int ret = -ENOMEM; + + if (testmgr_alloc_buf(xbuf)) + goto out_nobuf; if (enc == ENCRYPT) e = "encryption"; @@ -646,6 +689,8 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, ret = 0; out: + testmgr_free_buf(xbuf); +out_nobuf: return ret; } @@ -655,7 +700,6 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, const char *algo = crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm)); unsigned int i, j, k, n, temp; - int ret; char *q; struct ablkcipher_request *req; struct scatterlist sg[8]; @@ -663,6 +707,11 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, struct tcrypt_result result; void *data; char iv[MAX_IVLEN]; + char *xbuf[XBUFSIZE]; + int ret = -ENOMEM; + + if (testmgr_alloc_buf(xbuf)) + goto out_nobuf; if (enc == ENCRYPT) e = "encryption"; @@ -675,7 +724,6 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, if (!req) { printk(KERN_ERR "alg: skcipher: Failed to allocate request " "for %s\n", algo); - ret = -ENOMEM; goto out; } @@ -860,6 +908,8 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, out: ablkcipher_request_free(req); + testmgr_free_buf(xbuf); +out_nobuf: return ret; } @@ -2245,41 +2295,3 @@ notest: return 0; } EXPORT_SYMBOL_GPL(alg_test); - -int __init testmgr_init(void) -{ - int i; - - for (i = 0; i < XBUFSIZE; i++) { - xbuf[i] = (void *)__get_free_page(GFP_KERNEL); - if (!xbuf[i]) - goto err_free_xbuf; - } - - for (i = 0; i < XBUFSIZE; i++) { - axbuf[i] = (void *)__get_free_page(GFP_KERNEL); - if (!axbuf[i]) - goto err_free_axbuf; - } - - return 0; - -err_free_axbuf: - for (i = 0; i < XBUFSIZE && axbuf[i]; i++) - free_page((unsigned long)axbuf[i]); -err_free_xbuf: - for (i = 0; i < XBUFSIZE && xbuf[i]; i++) - free_page((unsigned long)xbuf[i]); - - return -ENOMEM; -} - -void testmgr_exit(void) -{ - int i; - - for (i = 0; i < XBUFSIZE; i++) - free_page((unsigned long)axbuf[i]); - for (i = 0; i < XBUFSIZE; i++) - free_page((unsigned long)xbuf[i]); -} From f7cb80f2b9fa06730be20d17c80b12e511a36c1c Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 6 May 2009 17:29:17 +0800 Subject: [PATCH 088/162] crypto: testmgr - Add ctr(aes) test vectors Now with multi-block test vectors, all from SP800-38A, Appendix F.5. Also added ctr(aes) to case 10 in tcrypt. Signed-off-by: Jarod Wilson Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 1 + crypto/testmgr.c | 23 +++++-- crypto/testmgr.h | 166 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 183 insertions(+), 7 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index ea3b8a8db721..9e4974eb7825 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -526,6 +526,7 @@ static void do_test(int m) tcrypt_test("cbc(aes)"); tcrypt_test("lrw(aes)"); tcrypt_test("xts(aes)"); + tcrypt_test("ctr(aes)"); tcrypt_test("rfc3686(ctr(aes))"); break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index c5550943411d..f4cc1780aee2 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1567,6 +1567,21 @@ static const struct alg_test_desc alg_test_descs[] = { .count = CRC32C_TEST_VECTORS } } + }, { + .alg = "ctr(aes)", + .test = alg_test_skcipher, + .suite = { + .cipher = { + .enc = { + .vecs = aes_ctr_enc_tv_template, + .count = AES_CTR_ENC_TEST_VECTORS + }, + .dec = { + .vecs = aes_ctr_dec_tv_template, + .count = AES_CTR_DEC_TEST_VECTORS + } + } + } }, { .alg = "cts(cbc(aes))", .test = alg_test_skcipher, @@ -2017,12 +2032,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .cipher = { .enc = { - .vecs = aes_ctr_enc_tv_template, - .count = AES_CTR_ENC_TEST_VECTORS + .vecs = aes_ctr_rfc3686_enc_tv_template, + .count = AES_CTR_3686_ENC_TEST_VECTORS }, .dec = { - .vecs = aes_ctr_dec_tv_template, - .count = AES_CTR_DEC_TEST_VECTORS + .vecs = aes_ctr_rfc3686_dec_tv_template, + .count = AES_CTR_3686_DEC_TEST_VECTORS } } } diff --git a/crypto/testmgr.h b/crypto/testmgr.h index c1c709b57ddb..69316228fc19 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -2854,8 +2854,10 @@ static struct cipher_testvec cast6_dec_tv_template[] = { #define AES_LRW_DEC_TEST_VECTORS 8 #define AES_XTS_ENC_TEST_VECTORS 4 #define AES_XTS_DEC_TEST_VECTORS 4 -#define AES_CTR_ENC_TEST_VECTORS 7 -#define AES_CTR_DEC_TEST_VECTORS 6 +#define AES_CTR_ENC_TEST_VECTORS 3 +#define AES_CTR_DEC_TEST_VECTORS 3 +#define AES_CTR_3686_ENC_TEST_VECTORS 7 +#define AES_CTR_3686_DEC_TEST_VECTORS 6 #define AES_GCM_ENC_TEST_VECTORS 9 #define AES_GCM_DEC_TEST_VECTORS 8 #define AES_CCM_ENC_TEST_VECTORS 7 @@ -3998,6 +4000,164 @@ static struct cipher_testvec aes_xts_dec_tv_template[] = { static struct cipher_testvec aes_ctr_enc_tv_template[] = { + { /* From NIST Special Publication 800-38A, Appendix F.5 */ + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", + .klen = 16, + .iv = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .input = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .ilen = 64, + .result = "\x87\x4d\x61\x91\xb6\x20\xe3\x26" + "\x1b\xef\x68\x64\x99\x0d\xb6\xce" + "\x98\x06\xf6\x6b\x79\x70\xfd\xff" + "\x86\x17\x18\x7b\xb9\xff\xfd\xff" + "\x5a\xe4\xdf\x3e\xdb\xd5\xd3\x5e" + "\x5b\x4f\x09\x02\x0d\xb0\x3e\xab" + "\x1e\x03\x1d\xda\x2f\xbe\x03\xd1" + "\x79\x21\x70\xa0\xf3\x00\x9c\xee", + .rlen = 64, + }, { + .key = "\x8e\x73\xb0\xf7\xda\x0e\x64\x52" + "\xc8\x10\xf3\x2b\x80\x90\x79\xe5" + "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b", + .klen = 24, + .iv = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .input = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .ilen = 64, + .result = "\x1a\xbc\x93\x24\x17\x52\x1c\xa2" + "\x4f\x2b\x04\x59\xfe\x7e\x6e\x0b" + "\x09\x03\x39\xec\x0a\xa6\xfa\xef" + "\xd5\xcc\xc2\xc6\xf4\xce\x8e\x94" + "\x1e\x36\xb2\x6b\xd1\xeb\xc6\x70" + "\xd1\xbd\x1d\x66\x56\x20\xab\xf7" + "\x4f\x78\xa7\xf6\xd2\x98\x09\x58" + "\x5a\x97\xda\xec\x58\xc6\xb0\x50", + .rlen = 64, + }, { + .key = "\x60\x3d\xeb\x10\x15\xca\x71\xbe" + "\x2b\x73\xae\xf0\x85\x7d\x77\x81" + "\x1f\x35\x2c\x07\x3b\x61\x08\xd7" + "\x2d\x98\x10\xa3\x09\x14\xdf\xf4", + .klen = 32, + .iv = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .input = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .ilen = 64, + .result = "\x60\x1e\xc3\x13\x77\x57\x89\xa5" + "\xb7\xa7\xf5\x04\xbb\xf3\xd2\x28" + "\xf4\x43\xe3\xca\x4d\x62\xb5\x9a" + "\xca\x84\xe9\x90\xca\xca\xf5\xc5" + "\x2b\x09\x30\xda\xa2\x3d\xe9\x4c" + "\xe8\x70\x17\xba\x2d\x84\x98\x8d" + "\xdf\xc9\xc5\x8d\xb6\x7a\xad\xa6" + "\x13\xc2\xdd\x08\x45\x79\x41\xa6", + .rlen = 64, + } +}; + +static struct cipher_testvec aes_ctr_dec_tv_template[] = { + { /* From NIST Special Publication 800-38A, Appendix F.5 */ + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", + .klen = 16, + .iv = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .input = "\x87\x4d\x61\x91\xb6\x20\xe3\x26" + "\x1b\xef\x68\x64\x99\x0d\xb6\xce" + "\x98\x06\xf6\x6b\x79\x70\xfd\xff" + "\x86\x17\x18\x7b\xb9\xff\xfd\xff" + "\x5a\xe4\xdf\x3e\xdb\xd5\xd3\x5e" + "\x5b\x4f\x09\x02\x0d\xb0\x3e\xab" + "\x1e\x03\x1d\xda\x2f\xbe\x03\xd1" + "\x79\x21\x70\xa0\xf3\x00\x9c\xee", + .ilen = 64, + .result = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .rlen = 64, + }, { + .key = "\x8e\x73\xb0\xf7\xda\x0e\x64\x52" + "\xc8\x10\xf3\x2b\x80\x90\x79\xe5" + "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b", + .klen = 24, + .iv = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .input = "\x1a\xbc\x93\x24\x17\x52\x1c\xa2" + "\x4f\x2b\x04\x59\xfe\x7e\x6e\x0b" + "\x09\x03\x39\xec\x0a\xa6\xfa\xef" + "\xd5\xcc\xc2\xc6\xf4\xce\x8e\x94" + "\x1e\x36\xb2\x6b\xd1\xeb\xc6\x70" + "\xd1\xbd\x1d\x66\x56\x20\xab\xf7" + "\x4f\x78\xa7\xf6\xd2\x98\x09\x58" + "\x5a\x97\xda\xec\x58\xc6\xb0\x50", + .ilen = 64, + .result = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .rlen = 64, + }, { + .key = "\x60\x3d\xeb\x10\x15\xca\x71\xbe" + "\x2b\x73\xae\xf0\x85\x7d\x77\x81" + "\x1f\x35\x2c\x07\x3b\x61\x08\xd7" + "\x2d\x98\x10\xa3\x09\x14\xdf\xf4", + .klen = 32, + .iv = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .input = "\x60\x1e\xc3\x13\x77\x57\x89\xa5" + "\xb7\xa7\xf5\x04\xbb\xf3\xd2\x28" + "\xf4\x43\xe3\xca\x4d\x62\xb5\x9a" + "\xca\x84\xe9\x90\xca\xca\xf5\xc5" + "\x2b\x09\x30\xda\xa2\x3d\xe9\x4c" + "\xe8\x70\x17\xba\x2d\x84\x98\x8d" + "\xdf\xc9\xc5\x8d\xb6\x7a\xad\xa6" + "\x13\xc2\xdd\x08\x45\x79\x41\xa6", + .ilen = 64, + .result = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .rlen = 64, + } +}; + +static struct cipher_testvec aes_ctr_rfc3686_enc_tv_template[] = { { /* From RFC 3686 */ .key = "\xae\x68\x52\xf8\x12\x10\x67\xcc" "\x4b\xf7\xa5\x76\x55\x77\xf3\x9e" @@ -5129,7 +5289,7 @@ static struct cipher_testvec aes_ctr_enc_tv_template[] = { }, }; -static struct cipher_testvec aes_ctr_dec_tv_template[] = { +static struct cipher_testvec aes_ctr_rfc3686_dec_tv_template[] = { { /* From RFC 3686 */ .key = "\xae\x68\x52\xf8\x12\x10\x67\xcc" "\x4b\xf7\xa5\x76\x55\x77\xf3\x9e" From a1915d51e8e7ee192d2101d621d425379088cbb0 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 15 May 2009 15:16:03 +1000 Subject: [PATCH 089/162] crypto: testmgr - Mark algs allowed in fips mode Set the fips_allowed flag in testmgr.c's alg_test_descs[] for algs that are allowed to be used when in fips mode. One caveat: des isn't actually allowed anymore, but des (and thus also ecb(des)) has to be permitted, because disallowing them results in des3_ede being unable to properly register (see des module init func). Also, crc32 isn't technically on the fips approved list, but I think it gets used in various places that necessitate it being allowed. This list is based on http://csrc.nist.gov/groups/STM/cavp/index.html Important note: allowed/approved here does NOT mean "validated", just that its an alg that *could* be validated. Signed-off-by: Jarod Wilson Acked-by: Neil Horman Signed-off-by: Herbert Xu --- crypto/testmgr.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index f4cc1780aee2..51bae62c332a 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -94,6 +94,7 @@ struct alg_test_desc { const char *alg; int (*test)(const struct alg_test_desc *desc, const char *driver, u32 type, u32 mask); + int fips_allowed; /* set if alg is allowed in fips mode */ union { struct aead_test_suite aead; @@ -1432,6 +1433,7 @@ static const struct alg_test_desc alg_test_descs[] = { { .alg = "ansi_cprng", .test = alg_test_cprng, + .fips_allowed = 1, .suite = { .cprng = { .vecs = ansi_cprng_aes_tv_template, @@ -1441,6 +1443,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "cbc(aes)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -1516,6 +1519,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "cbc(des3_ede)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -1546,6 +1550,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ccm(aes)", .test = alg_test_aead, + .fips_allowed = 1, .suite = { .aead = { .enc = { @@ -1561,6 +1566,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "crc32c", .test = alg_test_crc32c, + .fips_allowed = 1, .suite = { .hash = { .vecs = crc32c_tv_template, @@ -1570,6 +1576,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ctr(aes)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -1615,6 +1622,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ecb(aes)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -1720,6 +1728,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ecb(des)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -1735,6 +1744,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "ecb(des3_ede)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -1870,6 +1880,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "gcm(aes)", .test = alg_test_aead, + .fips_allowed = 1, .suite = { .aead = { .enc = { @@ -1912,6 +1923,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha1)", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = hmac_sha1_tv_template, @@ -1921,6 +1933,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha224)", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = hmac_sha224_tv_template, @@ -1930,6 +1943,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha256)", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = hmac_sha256_tv_template, @@ -1939,6 +1953,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha384)", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = hmac_sha384_tv_template, @@ -1948,6 +1963,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "hmac(sha512)", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = hmac_sha512_tv_template, @@ -2029,6 +2045,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "rfc3686(ctr(aes))", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { @@ -2044,6 +2061,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "rfc4309(ccm(aes))", .test = alg_test_aead, + .fips_allowed = 1, .suite = { .aead = { .enc = { @@ -2106,6 +2124,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha1", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = sha1_tv_template, @@ -2115,6 +2134,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha224", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = sha224_tv_template, @@ -2124,6 +2144,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha256", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = sha256_tv_template, @@ -2133,6 +2154,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha384", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = sha384_tv_template, @@ -2142,6 +2164,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "sha512", .test = alg_test_hash, + .fips_allowed = 1, .suite = { .hash = { .vecs = sha512_tv_template, From a3bef3a31a19bd943047ba8bf5b2cc7b5d164362 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 15 May 2009 15:17:05 +1000 Subject: [PATCH 090/162] crypto: testmgr - Skip algs not flagged fips_allowed in fips mode Because all fips-allowed algorithms must be self-tested before they can be used, they will all have entries in testmgr.c's alg_test_descs[]. Skip self-tests for any algs not flagged as fips_approved and return -EINVAL when in fips mode. Signed-off-by: Jarod Wilson Acked-by: Neil Horman Signed-off-by: Herbert Xu --- crypto/testmgr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 51bae62c332a..f93b26d0fcfb 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -2308,6 +2308,9 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) if (i < 0) goto notest; + if (fips_enabled && !alg_test_descs[i].fips_allowed) + goto non_fips_alg; + rc = alg_test_cipher(alg_test_descs + i, driver, type, mask); goto test_done; } @@ -2316,6 +2319,9 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) if (i < 0) goto notest; + if (fips_enabled && !alg_test_descs[i].fips_allowed) + goto non_fips_alg; + rc = alg_test_descs[i].test(alg_test_descs + i, driver, type, mask); test_done: @@ -2331,5 +2337,7 @@ test_done: notest: printk(KERN_INFO "alg: No test for %s (%s)\n", alg, driver); return 0; +non_fips_alg: + return -EINVAL; } EXPORT_SYMBOL_GPL(alg_test); From 608d1cd5d375580a49d01b5ed1f9944f5141ae19 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Fri, 15 May 2009 15:57:35 +1000 Subject: [PATCH 091/162] hwrng: via_rng - The VIA Hardware RNG driver is for the CPU, not Chipset This is a cosmetic change, fixing the MODULE_DESCRIPTION() of via-rng.c Signed-off-by: Harald Welte Signed-off-by: Herbert Xu --- drivers/char/hw_random/via-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c index 4e9573c1d39e..02ee63906713 100644 --- a/drivers/char/hw_random/via-rng.c +++ b/drivers/char/hw_random/via-rng.c @@ -205,5 +205,5 @@ static void __exit mod_exit(void) module_init(mod_init); module_exit(mod_exit); -MODULE_DESCRIPTION("H/W RNG driver for VIA chipsets"); +MODULE_DESCRIPTION("H/W RNG driver for VIA CPU with PadLock"); MODULE_LICENSE("GPL"); From 858576bdc5d65edf1fffd2e65b2165ec1dc68486 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Fri, 15 May 2009 16:00:32 +1000 Subject: [PATCH 092/162] hwrng: via_rng - Support VIA Nano hardware RNG The VIA Nano CPU supports the same XSTORE instruction based RNG, but it lacks the MSR present in earlier CPUs. Signed-off-by: Harald Welte Signed-off-by: Herbert Xu --- drivers/char/hw_random/via-rng.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c index 02ee63906713..794aacb715c1 100644 --- a/drivers/char/hw_random/via-rng.c +++ b/drivers/char/hw_random/via-rng.c @@ -132,6 +132,19 @@ static int via_rng_init(struct hwrng *rng) struct cpuinfo_x86 *c = &cpu_data(0); u32 lo, hi, old_lo; + /* VIA Nano CPUs don't have the MSR_VIA_RNG anymore. The RNG + * is always enabled if CPUID rng_en is set. There is no + * RNG configuration like it used to be the case in this + * register */ + if ((c->x86 == 6) && (c->x86_model >= 0x0f)) { + if (!cpu_has_xstore_enabled) { + printk(KERN_ERR PFX "can't enable hardware RNG " + "if XSTORE is not enabled\n"); + return -ENODEV; + } + return 0; + } + /* Control the RNG via MSR. Tread lightly and pay very close * close attention to values written, as the reserved fields * are documented to be "undefined and unpredictable"; but it From e9736c16da9077728802f42393d18258e6685428 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Fri, 15 May 2009 16:01:52 +1000 Subject: [PATCH 093/162] hwrng: via_rng - Support VIA Nano hardware RNG on X86_64 builds Fix Kconfig to build via-rng.ko on X86_64 builds, as the VIA Nano CPU supports x86_64, too. Signed-off-by: Harald Welte Signed-off-by: Herbert Xu --- drivers/char/hw_random/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 5fab6470f4b2..9c00440dcf86 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -88,7 +88,7 @@ config HW_RANDOM_N2RNG config HW_RANDOM_VIA tristate "VIA HW Random Number Generator support" - depends on HW_RANDOM && X86_32 + depends on HW_RANDOM && X86 default HW_RANDOM ---help--- This driver provides kernel-side support for the Random Number From 3ce858cb04de8bc83449eac707c8012a1944daca Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 27 May 2009 15:05:02 +1000 Subject: [PATCH 094/162] crypto: compress - Return produced bytes in crypto_{,de}compress_{update,final} If crypto_{,de}compress_{update,final}() succeed, return the actual number of bytes produced instead of zero, so their users don't have to calculate that theirselves. Signed-off-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- crypto/testmgr.c | 117 ++++++++++++++++++++++++++++------------------- crypto/zlib.c | 24 +++++----- 2 files changed, 82 insertions(+), 59 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index f93b26d0fcfb..376ea88158b9 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1002,24 +1002,25 @@ static int test_pcomp(struct crypto_pcomp *tfm, const char *algo = crypto_tfm_alg_driver_name(crypto_pcomp_tfm(tfm)); unsigned int i; char result[COMP_BUF_SIZE]; - int error; + int res; for (i = 0; i < ctcount; i++) { struct comp_request req; + unsigned int produced = 0; - error = crypto_compress_setup(tfm, ctemplate[i].params, - ctemplate[i].paramsize); - if (error) { + res = crypto_compress_setup(tfm, ctemplate[i].params, + ctemplate[i].paramsize); + if (res) { pr_err("alg: pcomp: compression setup failed on test " - "%d for %s: error=%d\n", i + 1, algo, error); - return error; + "%d for %s: error=%d\n", i + 1, algo, res); + return res; } - error = crypto_compress_init(tfm); - if (error) { + res = crypto_compress_init(tfm); + if (res) { pr_err("alg: pcomp: compression init failed on test " - "%d for %s: error=%d\n", i + 1, algo, error); - return error; + "%d for %s: error=%d\n", i + 1, algo, res); + return res; } memset(result, 0, sizeof(result)); @@ -1029,32 +1030,37 @@ static int test_pcomp(struct crypto_pcomp *tfm, req.next_out = result; req.avail_out = ctemplate[i].outlen / 2; - error = crypto_compress_update(tfm, &req); - if (error && (error != -EAGAIN || req.avail_in)) { + res = crypto_compress_update(tfm, &req); + if (res < 0 && (res != -EAGAIN || req.avail_in)) { pr_err("alg: pcomp: compression update failed on test " - "%d for %s: error=%d\n", i + 1, algo, error); - return error; + "%d for %s: error=%d\n", i + 1, algo, res); + return res; } + if (res > 0) + produced += res; /* Add remaining input data */ req.avail_in += (ctemplate[i].inlen + 1) / 2; - error = crypto_compress_update(tfm, &req); - if (error && (error != -EAGAIN || req.avail_in)) { + res = crypto_compress_update(tfm, &req); + if (res < 0 && (res != -EAGAIN || req.avail_in)) { pr_err("alg: pcomp: compression update failed on test " - "%d for %s: error=%d\n", i + 1, algo, error); - return error; + "%d for %s: error=%d\n", i + 1, algo, res); + return res; } + if (res > 0) + produced += res; /* Provide remaining output space */ req.avail_out += COMP_BUF_SIZE - ctemplate[i].outlen / 2; - error = crypto_compress_final(tfm, &req); - if (error) { + res = crypto_compress_final(tfm, &req); + if (res < 0) { pr_err("alg: pcomp: compression final failed on test " - "%d for %s: error=%d\n", i + 1, algo, error); - return error; + "%d for %s: error=%d\n", i + 1, algo, res); + return res; } + produced += res; if (COMP_BUF_SIZE - req.avail_out != ctemplate[i].outlen) { pr_err("alg: comp: Compression test %d failed for %s: " @@ -1064,6 +1070,13 @@ static int test_pcomp(struct crypto_pcomp *tfm, return -EINVAL; } + if (produced != ctemplate[i].outlen) { + pr_err("alg: comp: Compression test %d failed for %s: " + "returned len = %u (expected %d)\n", i + 1, + algo, produced, ctemplate[i].outlen); + return -EINVAL; + } + if (memcmp(result, ctemplate[i].output, ctemplate[i].outlen)) { pr_err("alg: pcomp: Compression test %d failed for " "%s\n", i + 1, algo); @@ -1074,21 +1087,21 @@ static int test_pcomp(struct crypto_pcomp *tfm, for (i = 0; i < dtcount; i++) { struct comp_request req; + unsigned int produced = 0; - error = crypto_decompress_setup(tfm, dtemplate[i].params, - dtemplate[i].paramsize); - if (error) { + res = crypto_decompress_setup(tfm, dtemplate[i].params, + dtemplate[i].paramsize); + if (res) { pr_err("alg: pcomp: decompression setup failed on " - "test %d for %s: error=%d\n", i + 1, algo, - error); - return error; + "test %d for %s: error=%d\n", i + 1, algo, res); + return res; } - error = crypto_decompress_init(tfm); - if (error) { + res = crypto_decompress_init(tfm); + if (res) { pr_err("alg: pcomp: decompression init failed on test " - "%d for %s: error=%d\n", i + 1, algo, error); - return error; + "%d for %s: error=%d\n", i + 1, algo, res); + return res; } memset(result, 0, sizeof(result)); @@ -1098,35 +1111,38 @@ static int test_pcomp(struct crypto_pcomp *tfm, req.next_out = result; req.avail_out = dtemplate[i].outlen / 2; - error = crypto_decompress_update(tfm, &req); - if (error && (error != -EAGAIN || req.avail_in)) { + res = crypto_decompress_update(tfm, &req); + if (res < 0 && (res != -EAGAIN || req.avail_in)) { pr_err("alg: pcomp: decompression update failed on " - "test %d for %s: error=%d\n", i + 1, algo, - error); - return error; + "test %d for %s: error=%d\n", i + 1, algo, res); + return res; } + if (res > 0) + produced += res; /* Add remaining input data */ req.avail_in += (dtemplate[i].inlen + 1) / 2; - error = crypto_decompress_update(tfm, &req); - if (error && (error != -EAGAIN || req.avail_in)) { + res = crypto_decompress_update(tfm, &req); + if (res < 0 && (res != -EAGAIN || req.avail_in)) { pr_err("alg: pcomp: decompression update failed on " - "test %d for %s: error=%d\n", i + 1, algo, - error); - return error; + "test %d for %s: error=%d\n", i + 1, algo, res); + return res; } + if (res > 0) + produced += res; /* Provide remaining output space */ req.avail_out += COMP_BUF_SIZE - dtemplate[i].outlen / 2; - error = crypto_decompress_final(tfm, &req); - if (error && (error != -EAGAIN || req.avail_in)) { + res = crypto_decompress_final(tfm, &req); + if (res < 0 && (res != -EAGAIN || req.avail_in)) { pr_err("alg: pcomp: decompression final failed on " - "test %d for %s: error=%d\n", i + 1, algo, - error); - return error; + "test %d for %s: error=%d\n", i + 1, algo, res); + return res; } + if (res > 0) + produced += res; if (COMP_BUF_SIZE - req.avail_out != dtemplate[i].outlen) { pr_err("alg: comp: Decompression test %d failed for " @@ -1136,6 +1152,13 @@ static int test_pcomp(struct crypto_pcomp *tfm, return -EINVAL; } + if (produced != dtemplate[i].outlen) { + pr_err("alg: comp: Decompression test %d failed for " + "%s: returned len = %u (expected %d)\n", i + 1, + algo, produced, dtemplate[i].outlen); + return -EINVAL; + } + if (memcmp(result, dtemplate[i].output, dtemplate[i].outlen)) { pr_err("alg: pcomp: Decompression test %d failed for " "%s\n", i + 1, algo); diff --git a/crypto/zlib.c b/crypto/zlib.c index 33609bab614e..c3015733c990 100644 --- a/crypto/zlib.c +++ b/crypto/zlib.c @@ -165,15 +165,15 @@ static int zlib_compress_update(struct crypto_pcomp *tfm, return -EINVAL; } + ret = req->avail_out - stream->avail_out; pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", stream->avail_in, stream->avail_out, - req->avail_in - stream->avail_in, - req->avail_out - stream->avail_out); + req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; req->avail_in = stream->avail_in; req->next_out = stream->next_out; req->avail_out = stream->avail_out; - return 0; + return ret; } static int zlib_compress_final(struct crypto_pcomp *tfm, @@ -195,15 +195,15 @@ static int zlib_compress_final(struct crypto_pcomp *tfm, return -EINVAL; } + ret = req->avail_out - stream->avail_out; pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", stream->avail_in, stream->avail_out, - req->avail_in - stream->avail_in, - req->avail_out - stream->avail_out); + req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; req->avail_in = stream->avail_in; req->next_out = stream->next_out; req->avail_out = stream->avail_out; - return 0; + return ret; } @@ -280,15 +280,15 @@ static int zlib_decompress_update(struct crypto_pcomp *tfm, return -EINVAL; } + ret = req->avail_out - stream->avail_out; pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", stream->avail_in, stream->avail_out, - req->avail_in - stream->avail_in, - req->avail_out - stream->avail_out); + req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; req->avail_in = stream->avail_in; req->next_out = stream->next_out; req->avail_out = stream->avail_out; - return 0; + return ret; } static int zlib_decompress_final(struct crypto_pcomp *tfm, @@ -328,15 +328,15 @@ static int zlib_decompress_final(struct crypto_pcomp *tfm, return -EINVAL; } + ret = req->avail_out - stream->avail_out; pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", stream->avail_in, stream->avail_out, - req->avail_in - stream->avail_in, - req->avail_out - stream->avail_out); + req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; req->avail_in = stream->avail_in; req->next_out = stream->next_out; req->avail_out = stream->avail_out; - return 0; + return ret; } From 4e033a6bc70f094d36128c328f6ca725c6ca4b4c Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 27 May 2009 15:10:21 +1000 Subject: [PATCH 095/162] crypto: tcrypt - Do not exit on success in fips mode At present, the tcrypt module always exits with an -EAGAIN upon successfully completing all the tests its been asked to run. In fips mode, integrity checking is done by running all self-tests from the initrd, and its much simpler to check the ret from modprobe for success than to scrape dmesg and/or /proc/crypto. Simply stay loaded, giving modprobe a retval of 0, if self-tests all pass and we're in fips mode. A side-effect of tracking success/failure for fips mode is that in non-fips mode, self-test failures will return the actual failure return codes, rather than always returning -EAGAIN, which seems more correct anyway. The tcrypt_test() portion of the patch is dependent on my earlier pair of patches that skip non-fips algs in fips mode, at least to achieve the fully intended behavior. Nb: testing this patch against the cryptodev tree revealed a test failure for sha384, which I have yet to look into... Signed-off-by: Jarod Wilson Acked-by: Neil Horman Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 164 ++++++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 74 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 9e4974eb7825..d59ba5079d14 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -27,6 +27,7 @@ #include #include #include "tcrypt.h" +#include "internal.h" /* * Need slab memory for testing (size in number of pages). @@ -468,248 +469,255 @@ static void test_available(void) static inline int tcrypt_test(const char *alg) { - return alg_test(alg, alg, 0, 0); + int ret; + + ret = alg_test(alg, alg, 0, 0); + /* non-fips algs return -EINVAL in fips mode */ + if (fips_enabled && ret == -EINVAL) + ret = 0; + return ret; } -static void do_test(int m) +static int do_test(int m) { int i; + int ret = 0; switch (m) { case 0: for (i = 1; i < 200; i++) - do_test(i); + ret += do_test(i); break; case 1: - tcrypt_test("md5"); + ret += tcrypt_test("md5"); break; case 2: - tcrypt_test("sha1"); + ret += tcrypt_test("sha1"); break; case 3: - tcrypt_test("ecb(des)"); - tcrypt_test("cbc(des)"); + ret += tcrypt_test("ecb(des)"); + ret += tcrypt_test("cbc(des)"); break; case 4: - tcrypt_test("ecb(des3_ede)"); - tcrypt_test("cbc(des3_ede)"); + ret += tcrypt_test("ecb(des3_ede)"); + ret += tcrypt_test("cbc(des3_ede)"); break; case 5: - tcrypt_test("md4"); + ret += tcrypt_test("md4"); break; case 6: - tcrypt_test("sha256"); + ret += tcrypt_test("sha256"); break; case 7: - tcrypt_test("ecb(blowfish)"); - tcrypt_test("cbc(blowfish)"); + ret += tcrypt_test("ecb(blowfish)"); + ret += tcrypt_test("cbc(blowfish)"); break; case 8: - tcrypt_test("ecb(twofish)"); - tcrypt_test("cbc(twofish)"); + ret += tcrypt_test("ecb(twofish)"); + ret += tcrypt_test("cbc(twofish)"); break; case 9: - tcrypt_test("ecb(serpent)"); + ret += tcrypt_test("ecb(serpent)"); break; case 10: - tcrypt_test("ecb(aes)"); - tcrypt_test("cbc(aes)"); - tcrypt_test("lrw(aes)"); - tcrypt_test("xts(aes)"); - tcrypt_test("ctr(aes)"); - tcrypt_test("rfc3686(ctr(aes))"); + ret += tcrypt_test("ecb(aes)"); + ret += tcrypt_test("cbc(aes)"); + ret += tcrypt_test("lrw(aes)"); + ret += tcrypt_test("xts(aes)"); + ret += tcrypt_test("ctr(aes)"); + ret += tcrypt_test("rfc3686(ctr(aes))"); break; case 11: - tcrypt_test("sha384"); + ret += tcrypt_test("sha384"); break; case 12: - tcrypt_test("sha512"); + ret += tcrypt_test("sha512"); break; case 13: - tcrypt_test("deflate"); + ret += tcrypt_test("deflate"); break; case 14: - tcrypt_test("ecb(cast5)"); + ret += tcrypt_test("ecb(cast5)"); break; case 15: - tcrypt_test("ecb(cast6)"); + ret += tcrypt_test("ecb(cast6)"); break; case 16: - tcrypt_test("ecb(arc4)"); + ret += tcrypt_test("ecb(arc4)"); break; case 17: - tcrypt_test("michael_mic"); + ret += tcrypt_test("michael_mic"); break; case 18: - tcrypt_test("crc32c"); + ret += tcrypt_test("crc32c"); break; case 19: - tcrypt_test("ecb(tea)"); + ret += tcrypt_test("ecb(tea)"); break; case 20: - tcrypt_test("ecb(xtea)"); + ret += tcrypt_test("ecb(xtea)"); break; case 21: - tcrypt_test("ecb(khazad)"); + ret += tcrypt_test("ecb(khazad)"); break; case 22: - tcrypt_test("wp512"); + ret += tcrypt_test("wp512"); break; case 23: - tcrypt_test("wp384"); + ret += tcrypt_test("wp384"); break; case 24: - tcrypt_test("wp256"); + ret += tcrypt_test("wp256"); break; case 25: - tcrypt_test("ecb(tnepres)"); + ret += tcrypt_test("ecb(tnepres)"); break; case 26: - tcrypt_test("ecb(anubis)"); - tcrypt_test("cbc(anubis)"); + ret += tcrypt_test("ecb(anubis)"); + ret += tcrypt_test("cbc(anubis)"); break; case 27: - tcrypt_test("tgr192"); + ret += tcrypt_test("tgr192"); break; case 28: - tcrypt_test("tgr160"); + ret += tcrypt_test("tgr160"); break; case 29: - tcrypt_test("tgr128"); + ret += tcrypt_test("tgr128"); break; case 30: - tcrypt_test("ecb(xeta)"); + ret += tcrypt_test("ecb(xeta)"); break; case 31: - tcrypt_test("pcbc(fcrypt)"); + ret += tcrypt_test("pcbc(fcrypt)"); break; case 32: - tcrypt_test("ecb(camellia)"); - tcrypt_test("cbc(camellia)"); + ret += tcrypt_test("ecb(camellia)"); + ret += tcrypt_test("cbc(camellia)"); break; case 33: - tcrypt_test("sha224"); + ret += tcrypt_test("sha224"); break; case 34: - tcrypt_test("salsa20"); + ret += tcrypt_test("salsa20"); break; case 35: - tcrypt_test("gcm(aes)"); + ret += tcrypt_test("gcm(aes)"); break; case 36: - tcrypt_test("lzo"); + ret += tcrypt_test("lzo"); break; case 37: - tcrypt_test("ccm(aes)"); + ret += tcrypt_test("ccm(aes)"); break; case 38: - tcrypt_test("cts(cbc(aes))"); + ret += tcrypt_test("cts(cbc(aes))"); break; case 39: - tcrypt_test("rmd128"); + ret += tcrypt_test("rmd128"); break; case 40: - tcrypt_test("rmd160"); + ret += tcrypt_test("rmd160"); break; case 41: - tcrypt_test("rmd256"); + ret += tcrypt_test("rmd256"); break; case 42: - tcrypt_test("rmd320"); + ret += tcrypt_test("rmd320"); break; case 43: - tcrypt_test("ecb(seed)"); + ret += tcrypt_test("ecb(seed)"); break; case 44: - tcrypt_test("zlib"); + ret += tcrypt_test("zlib"); break; case 45: - tcrypt_test("rfc4309(ccm(aes))"); + ret += tcrypt_test("rfc4309(ccm(aes))"); break; case 100: - tcrypt_test("hmac(md5)"); + ret += tcrypt_test("hmac(md5)"); break; case 101: - tcrypt_test("hmac(sha1)"); + ret += tcrypt_test("hmac(sha1)"); break; case 102: - tcrypt_test("hmac(sha256)"); + ret += tcrypt_test("hmac(sha256)"); break; case 103: - tcrypt_test("hmac(sha384)"); + ret += tcrypt_test("hmac(sha384)"); break; case 104: - tcrypt_test("hmac(sha512)"); + ret += tcrypt_test("hmac(sha512)"); break; case 105: - tcrypt_test("hmac(sha224)"); + ret += tcrypt_test("hmac(sha224)"); break; case 106: - tcrypt_test("xcbc(aes)"); + ret += tcrypt_test("xcbc(aes)"); break; case 107: - tcrypt_test("hmac(rmd128)"); + ret += tcrypt_test("hmac(rmd128)"); break; case 108: - tcrypt_test("hmac(rmd160)"); + ret += tcrypt_test("hmac(rmd160)"); break; case 150: - tcrypt_test("ansi_cprng"); + ret += tcrypt_test("ansi_cprng"); break; case 200: @@ -873,6 +881,8 @@ static void do_test(int m) test_available(); break; } + + return ret; } static int __init tcrypt_mod_init(void) @@ -886,15 +896,21 @@ static int __init tcrypt_mod_init(void) goto err_free_tv; } - do_test(mode); + err = do_test(mode); + if (err) { + printk(KERN_ERR "tcrypt: one or more tests failed!\n"); + goto err_free_tv; + } - /* We intentionaly return -EAGAIN to prevent keeping - * the module. It does all its work from init() - * and doesn't offer any runtime functionality + /* We intentionaly return -EAGAIN to prevent keeping the module, + * unless we're running in fips mode. It does all its work from + * init() and doesn't offer any runtime functionality, but in + * the fips case, checking for a successful load is helpful. * => we don't need it in the memory, do we? * -- mludvig */ - err = -EAGAIN; + if (!fips_enabled) + err = -EAGAIN; err_free_tv: for (i = 0; i < TVMEMSIZE && tvmem[i]; i++) From f3d8fe40498eea9f45be260bdf6ccada845411f3 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 27 May 2009 15:16:21 +1000 Subject: [PATCH 096/162] crypto: hifn_795x - fix __dev{init,exit} markings The remove member of the pci_driver hifn_pci_driver uses __devexit_p(), so the remove function itself should be marked with __devexit. And where there be __devexit on the remove, so is there __devinit on the probe. Similarly, the module_init/module_exit functions should be declared with plain __init/__exit markings, not the hotplug __dev{init,exit} ones. Signed-off-by: Mike Frysinger Acked-by: Evgeniy Polyakov CC: Patrick McHardy Signed-off-by: Herbert Xu --- drivers/crypto/hifn_795x.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 2bef086fb342..5f753fc08730 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -2564,7 +2564,7 @@ static void hifn_tasklet_callback(unsigned long data) hifn_process_queue(dev); } -static int hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static int __devinit hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int err, i; struct hifn_device *dev; @@ -2696,7 +2696,7 @@ err_out_disable_pci_device: return err; } -static void hifn_remove(struct pci_dev *pdev) +static void __devexit hifn_remove(struct pci_dev *pdev) { int i; struct hifn_device *dev; @@ -2744,7 +2744,7 @@ static struct pci_driver hifn_pci_driver = { .remove = __devexit_p(hifn_remove), }; -static int __devinit hifn_init(void) +static int __init hifn_init(void) { unsigned int freq; int err; @@ -2789,7 +2789,7 @@ static int __devinit hifn_init(void) return 0; } -static void __devexit hifn_fini(void) +static void __exit hifn_fini(void) { pci_unregister_driver(&hifn_pci_driver); From fd57f22a09ae276ca3e9cd11ed99b617d611ba82 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 29 May 2009 16:05:42 +1000 Subject: [PATCH 097/162] crypto: testmgr - Check all test vector lengths As we cannot guarantee the availability of contiguous pages at run-time, all test vectors must either fit within a page, or use scatter lists. In some cases vectors were not checked as to whether they fit inside a page. This patch adds all the missing checks. Signed-off-by: Herbert Xu --- crypto/testmgr.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 376ea88158b9..8fcea70ed267 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -185,6 +185,10 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, hash_buff = xbuf[0]; + ret = -EINVAL; + if (WARN_ON(template[i].psize > PAGE_SIZE)) + goto out; + memcpy(hash_buff, template[i].plaintext, template[i].psize); sg_init_one(&sg[0], hash_buff, template[i].psize); @@ -238,7 +242,11 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, temp = 0; sg_init_table(sg, template[i].np); + ret = -EINVAL; for (k = 0; k < template[i].np; k++) { + if (WARN_ON(offset_in_page(IDX[k]) + + template[i].tap[k] > PAGE_SIZE)) + goto out; sg_set_buf(&sg[k], memcpy(xbuf[IDX[k] >> PAGE_SHIFT] + offset_in_page(IDX[k]), @@ -357,6 +365,11 @@ static int test_aead(struct crypto_aead *tfm, int enc, input = xbuf[0]; assoc = axbuf[0]; + ret = -EINVAL; + if (WARN_ON(template[i].ilen > PAGE_SIZE || + template[i].alen > PAGE_SIZE)) + goto out; + memcpy(input, template[i].input, template[i].ilen); memcpy(assoc, template[i].assoc, template[i].alen); if (template[i].iv) @@ -516,7 +529,11 @@ static int test_aead(struct crypto_aead *tfm, int enc, } sg_init_table(asg, template[i].anp); + ret = -EINVAL; for (k = 0, temp = 0; k < template[i].anp; k++) { + if (WARN_ON(offset_in_page(IDX[k]) + + template[i].atap[k] > PAGE_SIZE)) + goto out; sg_set_buf(&asg[k], memcpy(axbuf[IDX[k] >> PAGE_SHIFT] + offset_in_page(IDX[k]), @@ -650,6 +667,10 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, j++; + ret = -EINVAL; + if (WARN_ON(template[i].ilen > PAGE_SIZE)) + goto out; + data = xbuf[0]; memcpy(data, template[i].input, template[i].ilen); @@ -741,6 +762,10 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, if (!(template[i].np)) { j++; + ret = -EINVAL; + if (WARN_ON(template[i].ilen > PAGE_SIZE)) + goto out; + data = xbuf[0]; memcpy(data, template[i].input, template[i].ilen); From a0cfae59f8381c5c670fce2cc3de70b35421f920 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 29 May 2009 16:23:12 +1000 Subject: [PATCH 098/162] crypto: testmgr - Allow hash test vectors longer than a page As it stands we will each test hash vector both linearly and as a scatter list if applicable. This means that we cannot have vectors longer than a page, even with scatter lists. This patch fixes this by skipping test vectors with np != 0 when testing linearly. Signed-off-by: Herbert Xu --- crypto/testmgr.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 8fcea70ed267..e9e9d84293b9 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -180,7 +180,12 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tcrypt_complete, &tresult); + j = 0; for (i = 0; i < tcount; i++) { + if (template[i].np) + continue; + + j++; memset(result, 0, 64); hash_buff = xbuf[0]; @@ -198,7 +203,7 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, template[i].ksize); if (ret) { printk(KERN_ERR "alg: hash: setkey failed on " - "test %d for %s: ret=%d\n", i + 1, algo, + "test %d for %s: ret=%d\n", j, algo, -ret); goto out; } @@ -220,14 +225,14 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, /* fall through */ default: printk(KERN_ERR "alg: hash: digest failed on test %d " - "for %s: ret=%d\n", i + 1, algo, -ret); + "for %s: ret=%d\n", j, algo, -ret); goto out; } if (memcmp(result, template[i].digest, crypto_ahash_digestsize(tfm))) { printk(KERN_ERR "alg: hash: Test %d failed for %s\n", - i + 1, algo); + j, algo); hexdump(result, crypto_ahash_digestsize(tfm)); ret = -EINVAL; goto out; From aa07a6990f4b6a8ef9fc538dea55bac6f92255f2 Mon Sep 17 00:00:00 2001 From: Alex Riesen Date: Tue, 2 Jun 2009 14:13:14 +1000 Subject: [PATCH 099/162] crypto: api - Use formatting of module name Besdies, for the old code, gcc-4.3.3 produced this warning: "format not a string literal and no format arguments" Signed-off-by: Alex Riesen Signed-off-by: Herbert Xu --- crypto/api.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/crypto/api.c b/crypto/api.c index f500fb840be9..d5944f92b416 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -217,14 +217,11 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) alg = crypto_alg_lookup(name, type, mask); if (!alg) { - char tmp[CRYPTO_MAX_ALG_NAME]; - - request_module(name); + request_module("%s", name); if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask & - CRYPTO_ALG_NEED_FALLBACK) && - snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp)) - request_module(tmp); + CRYPTO_ALG_NEED_FALLBACK)) + request_module("%s-all", name); alg = crypto_alg_lookup(name, type, mask); } From 08ced854fc4a979d9e59ba01000bf96e7057cfbc Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Wed, 3 Jun 2009 19:28:03 +1000 Subject: [PATCH 100/162] hwrng: timeriomem - Fix potential oops (request_mem_region/__devinit) Fixed oops when calling device_unregister followed by device_register (changing __init to __devinit) and removed request_mem_region() as platform_device_register already does this which can result in EBUSY Signed-off-by: Alexander Clouter Signed-off-by: Herbert Xu --- drivers/char/hw_random/timeriomem-rng.c | 26 ++++++------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index dcd352ad0e7f..a94e930575f2 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -88,9 +88,9 @@ static struct hwrng timeriomem_rng_ops = { .priv = 0, }; -static int __init timeriomem_rng_probe(struct platform_device *pdev) +static int __devinit timeriomem_rng_probe(struct platform_device *pdev) { - struct resource *res, *mem; + struct resource *res; int ret; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -98,21 +98,12 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) if (!res) return -ENOENT; - mem = request_mem_region(res->start, res->end - res->start + 1, - pdev->name); - if (mem == NULL) - return -EBUSY; - - dev_set_drvdata(&pdev->dev, mem); - timeriomem_rng_data = pdev->dev.platform_data; timeriomem_rng_data->address = ioremap(res->start, res->end - res->start + 1); - if (!timeriomem_rng_data->address) { - ret = -ENOMEM; - goto err_ioremap; - } + if (!timeriomem_rng_data->address) + return -EIO; if (timeriomem_rng_data->period != 0 && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { @@ -125,7 +116,7 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) ret = hwrng_register(&timeriomem_rng_ops); if (ret) - goto err_register; + goto failed; dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", timeriomem_rng_data->address, @@ -133,24 +124,19 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) return 0; -err_register: +failed: dev_err(&pdev->dev, "problem registering\n"); iounmap(timeriomem_rng_data->address); -err_ioremap: - release_resource(mem); return ret; } static int __devexit timeriomem_rng_remove(struct platform_device *pdev) { - struct resource *mem = dev_get_drvdata(&pdev->dev); - del_timer_sync(&timeriomem_rng_timer); hwrng_unregister(&timeriomem_rng_ops); iounmap(timeriomem_rng_data->address); - release_resource(mem); return 0; } From c076b9937f4912ffc09b30e155fe5246f002f21a Mon Sep 17 00:00:00 2001 From: Ben Nizette Date: Wed, 3 Jun 2009 11:38:20 +0200 Subject: [PATCH 101/162] avr32: Fix clash in ATMEL_USART_ flags At the moment ATMEL_USART_{RTS,CTS,CLK} have the values 1, 2 and 3 respectively. Given these are used in bitmasks, trying to turn on the CLK line will in fact turn on the RTS and CTS lines as well. Change the value of ATMEL_USART_CLK to 4. Signed-off-by: Ben Nizette Signed-off-by: Haavard Skinnemoen --- arch/avr32/mach-at32ap/include/mach/board.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/avr32/mach-at32ap/include/mach/board.h b/arch/avr32/mach-at32ap/include/mach/board.h index 0b8164281899..ddedb471f33e 100644 --- a/arch/avr32/mach-at32ap/include/mach/board.h +++ b/arch/avr32/mach-at32ap/include/mach/board.h @@ -29,7 +29,7 @@ extern struct platform_device *atmel_default_console_device; /* Flags for selecting USART extra pins */ #define ATMEL_USART_RTS 0x01 #define ATMEL_USART_CTS 0x02 -#define ATMEL_USART_CLK 0x03 +#define ATMEL_USART_CLK 0x04 struct atmel_uart_data { short use_dma_tx; /* use transmit DMA? */ From c878b7d60418a45c36d99c2dc876ebb76035d404 Mon Sep 17 00:00:00 2001 From: Mark Jackson Date: Wed, 3 Jun 2009 11:16:38 +0100 Subject: [PATCH 102/162] Fix MIMC200 board LCD init This patch updates the LCD init code for the MIMC200 board. V2 fixes a .yres typo and corrects an incorrect setup value in the call to at32_add_device_lcdc() Without this patch, the lcd setup is wrong and won't display images correctly (if at all !!) Signed-off-by: Mark Jackson Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/mimc200/setup.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c index c1b2175b4fea..523d8e183bef 100644 --- a/arch/avr32/boards/mimc200/setup.c +++ b/arch/avr32/boards/mimc200/setup.c @@ -43,16 +43,16 @@ unsigned long at32_board_osc_rates[3] = { /* Initialized by bootloader-specific startup code. */ struct tag *bootloader_tags __initdata; -static struct fb_videomode __initdata tx14d14_modes[] = { +static struct fb_videomode __initdata pt0434827_modes[] = { { - .name = "640x480 @ 60", - .refresh = 60, - .xres = 640, .yres = 480, - .pixclock = KHZ2PICOS(11666), + .name = "480x272 @ 72", + .refresh = 72, + .xres = 480, .yres = 272, + .pixclock = KHZ2PICOS(10000), - .left_margin = 80, .right_margin = 1, - .upper_margin = 13, .lower_margin = 2, - .hsync_len = 64, .vsync_len = 1, + .left_margin = 1, .right_margin = 1, + .upper_margin = 12, .lower_margin = 1, + .hsync_len = 42, .vsync_len = 1, .sync = 0, .vmode = FB_VMODE_NONINTERLACED, @@ -60,14 +60,14 @@ static struct fb_videomode __initdata tx14d14_modes[] = { }; static struct fb_monspecs __initdata mimc200_default_monspecs = { - .manufacturer = "HIT", - .monitor = "TX14D14VM1BAB", - .modedb = tx14d14_modes, - .modedb_len = ARRAY_SIZE(tx14d14_modes), + .manufacturer = "PT", + .monitor = "PT0434827-A401", + .modedb = pt0434827_modes, + .modedb_len = ARRAY_SIZE(pt0434827_modes), .hfmin = 14820, .hfmax = 22230, .vfmin = 60, - .vfmax = 73.3, + .vfmax = 85, .dclkmax = 25200000, }; @@ -228,7 +228,8 @@ static int __init mimc200_init(void) i2c_register_board_info(0, i2c_info, ARRAY_SIZE(i2c_info)); at32_add_device_lcdc(0, &mimc200_lcdc_data, - fbmem_start, fbmem_size, 1); + fbmem_start, fbmem_size, + ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_CONTROL | ATMEL_LCDC_ALT_24B_DATA); return 0; } From 01ca79f1411eae2a45352709c838b946b1af9fbd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: [PATCH 103/162] x86, mce: add machine check exception count in /proc/interrupts Useful for debugging, but it's also good general policy to have a counter for all special interrupts there. This makes it easier to diagnose where a CPU is spending its time. [ Impact: feature, debugging tool ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 3 +++ arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/irq.c | 10 ++++++++++ 3 files changed, 17 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ac6e0303bf21..1156dae295ac 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -89,6 +89,7 @@ struct mce_log { extern int mce_disabled; #include +#include void mce_setup(struct mce *m); void mce_log(struct mce *m); @@ -123,6 +124,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } int mce_available(struct cpuinfo_x86 *c); +DECLARE_PER_CPU(unsigned, mce_exception_count); + void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1d0aa9c4e15b..287268d21836 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,6 +57,8 @@ int mce_disabled; atomic_t mce_entry; +DEFINE_PER_CPU(unsigned, mce_exception_count); + /* * Tolerant levels: * 0: always panic on uncorrected errors, log corrected errors @@ -359,6 +361,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); + __get_cpu_var(mce_exception_count)++; + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) goto out; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index a05660bf0299..05fc635c28c0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; @@ -93,6 +94,12 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif +#endif +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -161,6 +168,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + sum += per_cpu(mce_exception_count, cpu); +#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; From ca84f69697da0f004135e45b63ca560b6bd3554e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: [PATCH 104/162] x86, mce: add MCE poll count to /proc/interrupts Keep a count of the machine check polls (or CMCI events) in /proc/interrupts. Andi needs this for debugging, but it's also useful in general to see what's going in by the kernel. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ arch/x86/kernel/irq.c | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1156dae295ac..63abf3b19432 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -125,6 +125,7 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } int mce_available(struct cpuinfo_x86 *c); DECLARE_PER_CPU(unsigned, mce_exception_count); +DECLARE_PER_CPU(unsigned, mce_poll_count); void mce_log_therm_throt_event(__u64 status); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 287268d21836..784f6ae9d6f4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -264,6 +264,8 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +DEFINE_PER_CPU(unsigned, mce_poll_count); + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -275,6 +277,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; + __get_cpu_var(mce_poll_count)++; + mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 05fc635c28c0..eff46b5de62f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -100,6 +100,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) From f6fb0ac0869500323c78fa21992fe1933af61e91 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: [PATCH 105/162] x86, mce: store record length into memory struct mce anchor This makes it easier for tools who want to extract the mcelog out of crash images or memory dumps to adapt to changing struct mce size. The length field replaces padding, so it's fully compatible. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 63abf3b19432..0a61946d4396 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -59,7 +59,7 @@ struct mce_log { unsigned len; /* = MCE_LOG_LEN */ unsigned next; unsigned flags; - unsigned pad0; + unsigned recordlen; /* length of struct mce */ struct mce entry[MCE_LOG_LEN]; }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 784f6ae9d6f4..3db047e7a0fb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -108,8 +108,9 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); */ static struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), }; void mce_log(struct mce *mce) From d620c67fb92aa11736112f9a03e31d8e3079c57a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: [PATCH 106/162] x86, mce: support more than 256 CPUs in struct mce The old struct mce had a limitation to 256 CPUs. But x86 Linux supports more than that now with x2apic. Add a new field extcpu to report the extended number. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 4 ++-- arch/x86/kernel/cpu/mcheck/mce-inject.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 0a61946d4396..b4a04b60b740 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,9 +40,9 @@ struct mce { __u64 res2; /* dito. */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu that raised the error */ + __u8 cpu; /* cpu number; obsolete; use extcpu now */ __u8 finished; /* entry is valid */ - __u32 pad; + __u32 extcpu; /* linux cpu number that detected the error */ }; /* diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7b3a5428396a..7d858fb4ce67 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -23,14 +23,14 @@ /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { - struct mce *i = &per_cpu(injectm, m->cpu); + struct mce *i = &per_cpu(injectm, m->extcpu); /* Make sure noone reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; /* First set the fields after finished */ - i->cpu = m->cpu; + i->extcpu = m->extcpu; mb(); /* Now write record in order, finished last (except above) */ memcpy(i, m, sizeof(struct mce)); @@ -49,7 +49,7 @@ static void raise_mce(unsigned long data) { struct delayed_mce *dm = (struct delayed_mce *)data; struct mce *m = &dm->m; - int cpu = m->cpu; + int cpu = m->extcpu; inject_mce(m); if (m->status & MCI_STATUS_UC) { @@ -93,7 +93,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&m, ubuf, usize)) return -EFAULT; - if (m.cpu >= num_possible_cpus() || !cpu_online(m.cpu)) + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); @@ -108,7 +108,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, memcpy(&dm->m, &m, sizeof(struct mce)); setup_timer(&dm->timer, raise_mce, (unsigned long)dm); dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.cpu); + add_timer_on(&dm->timer, m.extcpu); return usize; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3db047e7a0fb..2c4dd6c422c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -94,7 +94,7 @@ static inline int skip_bank_init(int i) void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = smp_processor_id(); + m->cpu = m->extcpu = smp_processor_id(); rdtscll(m->tsc); } @@ -158,7 +158,7 @@ static void print_mce(struct mce *m) KERN_EMERG "HARDWARE ERROR\n" KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); + m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", From 8ee08347c1e8b5680b3b3ce081e42e97bcaa1abe Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: [PATCH 107/162] x86, mce: extend struct mce user interface with more information. Experience has shown that struct mce which is used to pass an machine check to the user space daemon currently a few limitations. Also some data which is useful to print at panic level is also missing. This patch addresses most of them. The same information is also printed out together with mce panic. struct mce can be painlessly extended in a compatible way, the mcelog user space code just ignores additional fields with a warning. - It doesn't provide a wall time timestamp. There have been a few complaints about that. Fix that by adding a 64bit time_t - It doesn't provide the exact CPU identification. This makes it awkward for mcelog to decode the event correctly, especially when there are variations in the supported MCE codes on different CPU models or when mcelog is running on a different host after a panic. Previously the administrator had to specify the correct CPU when mcelog ran on a different host, but with the more variation in machine checks now it's better to auto detect that. It's also useful for more detailed analysis of CPU events. Pass CPUID 1.EAX and the cpu vendor (as encoded in processor.h) instead. - Socket ID and initial APIC ID are useful to report because they allow to identify the failing CPU in some (not all) cases. This is also especially useful for the panic situation. This addresses one of the complaints from Thomas Gleixner earlier. - The MCG capabilities MSR needs to be reported for some advanced error processing in mcelog Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 ++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b4a04b60b740..ba1f8890cf51 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -36,13 +36,19 @@ struct mce { __u64 mcgstatus; __u64 ip; __u64 tsc; /* cpu time stamp counter */ - __u64 res1; /* for future extension */ - __u64 res2; /* dito. */ + __u64 time; /* wall time_t when error was detected */ + __u8 cpuvendor; /* cpu vendor as encoded in system.h */ + __u8 pad1; + __u16 pad2; + __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ __u8 cpu; /* cpu number; obsolete; use extcpu now */ __u8 finished; /* entry is valid */ __u32 extcpu; /* linux cpu number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ }; /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2c4dd6c422c3..ba68449c22a1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -96,6 +96,15 @@ void mce_setup(struct mce *m) memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); rdtscll(m->tsc); + /* We hope get_seconds stays lockless */ + m->time = get_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); +#ifdef CONFIG_SMP + m->socketid = cpu_data(m->extcpu).phys_proc_id; +#endif + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } DEFINE_PER_CPU(struct mce, injectm); @@ -173,6 +182,9 @@ static void print_mce(struct mce *m) if (m->misc) printk("MISC %llx ", m->misc); printk("\n"); + printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, + m->apicid); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " "and contact your hardware vendor\n"); From de8a84d85ad8bb46d01d72ebc57030b95075603c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:53 +0200 Subject: [PATCH 108/162] x86, mce: log corrected errors when panicing Normally the machine check handler ignores corrected errors and leaves them to machine_check_poll(). But when panicing mcp won't run, so log all errors. Note: this can still miss some cases until the "early no way out" patch later is applied too. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba68449c22a1..86806e52fc40 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -412,9 +412,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Non uncorrected errors are handled by machine_check_poll - * Leave them alone. + * Leave them alone, unless this panics. */ - if ((m.status & MCI_STATUS_UC) == 0) + if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) continue; /* From a0189c70e5f17f4253dd7bc575c97469900e23d6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: [PATCH 109/162] x86, mce: remove TSC print heuristic Previously mce_panic used a simple heuristic to avoid printing old so far unreported machine check events on a mce panic. This worked by comparing the TSC value at the start of the machine check handler with the event time stamp and only printing newer ones. This has a couple of issues, in particular on systems where the TSC is not fully synchronized between CPUs it could lose events or print old ones. It is also problematic with full system synchronization as it is added by the next patch. Remove the TSC heuristic and instead replace it with a simple heuristic to print corrected errors first and after that uncorrected errors and finally the worst machine check as determined by the machine check handler. This simplifies the code because there is no need to pass the original TSC value around. Contains fixes from Ying Huang [ Impact: bug fix, cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Cc: Ying Huang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 34 ++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 86806e52fc40..6773610061d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -158,6 +158,7 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); + mce->finished = 1; set_bit(0, ¬ify_user); } @@ -190,23 +191,29 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *backup, u64 start) +static void mce_panic(char *msg, struct mce *final) { int i; bust_spinlocks(1); console_verbose(); + /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { - u64 tsc = mcelog.entry[i].tsc; - - if ((s64)(tsc - start) < 0) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; + struct mce *m = &mcelog.entry[i]; + if ((m->status & MCI_STATUS_VAL) && + !(m->status & MCI_STATUS_UC)) + print_mce(m); } - if (backup) - print_mce(backup); + /* Now print uncorrected but with the final one last */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!final || memcmp(m, final, sizeof(struct mce))) + print_mce(m); + } + if (final) + print_mce(final); panic(msg); } @@ -362,7 +369,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mce m, panicm; int panicm_found = 0; - u64 mcestart = 0; int i; /* * If no_way_out gets set, there is no safe way to recover from this @@ -394,7 +400,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!(m.mcgstatus & MCG_STATUS_RIPV)) no_way_out = 1; - rdtscll(mcestart); barrier(); for (i = 0; i < banks; i++) { @@ -478,7 +483,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); + mce_panic("Machine check", &panicm); /* * If the error seems to be unrecoverable, something should be @@ -506,8 +511,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); + mce_panic("Uncorrected machine check", &panicm); } } From 817f32d02a52dd7f5941534e0699883691e918df Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: [PATCH 110/162] x86, mce: add table driven machine check grading The machine check grading (as in deciding what should be done for a given register value) has to be done multiple times soon and it's also getting more complicated. So it makes sense to consolidate it into a single function. To get smaller and more straight forward and possibly more extensible code I opted towards a new table driven method. The various rules are put into a table when is then executed by a very simple interpreter. The grading engine is in a new file mce-severity.c. I also added a private include file mce-internal.h, because mce.h is already a bit too cluttered. This is dead code right now, but will be used in followon patches. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-internal.h | 10 ++++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 61 +++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-internal.h create mode 100644 arch/x86/kernel/cpu/mcheck/mce-severity.c diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 60ee182c6c52..45004faf67ea 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,5 +1,6 @@ obj-y = mce.o therm_throt.o +obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 000000000000..f126b4ae7a25 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,10 @@ +#include + +enum severity_level { + MCE_NO_SEVERITY, + MCE_SOME_SEVERITY, + MCE_UC_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 000000000000..c189e89a89ae --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,61 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include +#include + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + */ + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + char *msg; +} severities[] = { +#define SEV(s) .sev = MCE_ ## s ## _SEVERITY +#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } +#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } +#define MCGMASK(x, res, s, m, r...) \ + { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), + BITCLR(MCI_STATUS_EN, NO, "Not enabled"), + BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"), + BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), + BITSET(MCI_STATUS_UC, UC, "Uncorrected"), + BITSET(0, SOME, "No match") /* always matches. keep at end */ +}; + +int mce_severity(struct mce *a, int tolerant, char **msg) +{ + struct severity *s; + for (s = severities;; s++) { + if ((a->status & s->mask) != s->result) + continue; + if ((a->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) && + tolerant < 1) + return MCE_PANIC_SEVERITY; + if (msg) + *msg = s->msg; + return s->sev; + } +} From bd19a5e6b73df276e1ccedf9059e9ee70c372d7d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: [PATCH 111/162] x86, mce: check early in exception handler if panic is needed The exception handler should behave differently if the exception is fatal versus one that can be returned from. In the first case it should never clear any registers because these need to be preserved for logging after the next boot. Otherwise it should clear them on each CPU step by step so that other CPUs sharing the same bank don't see duplicate events. Otherwise we risk reporting events multiple times on any CPUs which have shared machine check banks, which is a common problem on Intel Nehalem which has both SMT (two CPU threads sharing banks) and shared machine check banks in the uncore. Determine early in a special pass if any event requires a panic. This uses the mce_severity() function added earlier. This is needed for the next patch. Also fixes a problem together with an earlier patch that corrected events weren't logged on a fatal MCE. [ Impact: Feature ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 38 +++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6773610061d8..5031814ac943 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,6 +36,7 @@ #include #include +#include "mce-internal.h" #include "mce.h" /* Handle unconfigured int18 (should never happen) */ @@ -191,7 +192,7 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } -static void mce_panic(char *msg, struct mce *final) +static void mce_panic(char *msg, struct mce *final, char *exp) { int i; @@ -214,6 +215,8 @@ static void mce_panic(char *msg, struct mce *final) } if (final) print_mce(final); + if (exp) + printk(KERN_EMERG "Machine check: %s\n", exp); panic(msg); } @@ -357,6 +360,22 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) } EXPORT_SYMBOL_GPL(machine_check_poll); +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg) +{ + int i; + + for (i = 0; i < banks; i++) { + m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) + return 1; + } + return 0; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -381,6 +400,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + char *msg = "Unknown"; atomic_inc(&mce_entry); @@ -395,10 +415,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; + no_way_out = mce_no_way_out(&m, &msg); barrier(); @@ -430,18 +447,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) __set_bit(i, toclear); if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); /* * If this error was uncorrectable and there was * an overflow, we're in trouble. If no overflow, * we might get away with just killing a task. */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; + if (m.status & MCI_STATUS_UC) kill_it = 1; - } } else { /* * Machine check event was not enabled. Clear, but @@ -483,7 +495,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * has not set tolerant to an insane level, give up and die. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm); + mce_panic("Machine check", &panicm, msg); /* * If the error seems to be unrecoverable, something should be @@ -511,7 +523,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", &panicm); + mce_panic("Uncorrected machine check", &panicm, msg); } } From ccc3c3192ae78dd56dcdf5353fd1a9ef5f9a3e2b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:54 +0200 Subject: [PATCH 112/162] x86, mce: implement bootstrapping for machine check wakeups Machine checks support waking up the mcelog daemon quickly. The original wake up code for this was pretty ugly, relying on a idle notifier and a special process flag. The reason it did it this way is that the machine check handler is not subject to normal interrupt locking rules so it's not safe to call wake_up(). Instead it set a process flag and then either did the wakeup in the syscall return or in the idle notifier. This patch adds a new "bootstraping" method as replacement. The idea is that the handler checks if it's in a state where it is unsafe to call wake_up(). If it's safe it calls it directly. When it's not safe -- that is it interrupted in a critical section with interrupts disables -- it uses a new "self IPI" to trigger an IPI to its own CPU. This can be done safely because IPI triggers are atomic with some care. The IPI is raised once the interrupts are reenabled and can then safely call wake_up(). When APICs are disabled the event is just queued and will be picked up eventually by the next polling timer. I think that's a reasonable compromise, since it should only happen quite rarely. Contains fixes from Ying Huang. [ solve conflict on irqinit, make it work on 32bit (entry_arch.h) - HS ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/entry_arch.h | 4 +++ arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++ arch/x86/kernel/cpu/mcheck/mce.c | 54 ++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 +++ arch/x86/kernel/irqinit.c | 3 ++ 6 files changed, 72 insertions(+) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index b2eb9c066843..4cdcf5a3c96b 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -60,4 +60,8 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_NEW_MCE +BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a7d14bbae110..4e59197e29ba 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -32,6 +32,7 @@ extern void error_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); +extern void mce_self_interrupt(void); extern void invalidate_interrupt(void); extern void invalidate_interrupt0(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8c46b851296a..68f7cf84a333 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -117,6 +117,11 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +/* + * Self IPI vector for machine checks + */ +#define MCE_SELF_VECTOR 0xeb + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5031814ac943..121781627858 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,10 @@ #include #include +#include +#include #include +#include #include #include @@ -287,6 +291,54 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Called after interrupts have been reenabled again + * when a MCE happened during an interrupts off region + * in the kernel. + */ +asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + mce_notify_user(); + irq_exit(); +} +#endif + +static void mce_report_event(struct pt_regs *regs) +{ + if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { + mce_notify_user(); + return; + } + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Without APIC do not notify. The event will be picked + * up eventually. + */ + if (!cpu_has_apic) + return; + + /* + * When interrupts are disabled we cannot use + * kernel services safely. Trigger an self interrupt + * through the APIC to instead do the notification + * after interrupts are reenabled again. + */ + apic->send_IPI_self(MCE_SELF_VECTOR); + + /* + * Wait for idle afterwards again so that we don't leave the + * APIC in a non idle state because the normal APIC writes + * cannot exclude us. + */ + apic_wait_icr_idle(); +#endif +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -530,6 +582,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); + mce_report_event(regs); + /* the last thing we do is clear state */ for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a31a7f29cffe..711c130a8411 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1011,6 +1011,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ + mce_self_interrupt smp_mce_self_interrupt +#endif + #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ call_function_single_interrupt smp_call_function_single_interrupt diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index aab3d277766c..441f6ec6e9d4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -187,6 +187,9 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) + alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ From f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: [PATCH 113/162] x86, mce: implement panic synchronization In some circumstances multiple CPUs can enter mce_panic() in parallel. This gives quite confused output because they will all dump the same machine check buffer. The other problem is that they would all panic in parallel, but not process each other's shutdown IPIs because interrupts are disabled. Detect this situation early on in mce_panic(). On the first CPU entering will do the panic, the others will just wait to be killed. For paranoia reasons in case the other CPU dies during the MCE I added a 5 seconds timeout. If it expires each CPU will panic on its own again. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 121781627858..421020f1d7db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -196,10 +196,32 @@ static void print_mce(struct mce *m) "and contact your hardware vendor\n"); } +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + panic("Panicing machine check CPU died"); +} + static void mce_panic(char *msg, struct mce *final, char *exp) { int i; + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_add_return(1, &mce_paniced) > 1) + wait_for_panic(); + barrier(); + bust_spinlocks(1); console_verbose(); /* First print corrected ones that are still unlogged */ From 3c0797925f4ef9d55a32059d2af61a9c262e639d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:55 +0200 Subject: [PATCH 114/162] x86, mce: switch x86 machine check handler to Monarch election. On Intel platforms machine check exceptions are always broadcast to all CPUs. This patch makes the machine check handler synchronize all these machine checks, elect a Monarch to handle the event and collect the worst event from all CPUs and then process it first. This has some advantages: - When there is a truly data corrupting error the system panics as quickly as possible. This improves containment of corrupted data and makes sure the corrupted data never hits stable storage. - The panics are synchronized and do not reenter the panic code on multiple CPUs (which currently does not handle this well). - All the errors are reported. Currently it often happens that another CPU happens to do the panic first, but reports useless information (empty machine check) because the real error happened on another CPU which came in later. This is a big advantage on Nehalem where the 8 threads per CPU lead to often the wrong CPU winning the race and dumping useless information on a machine check. The problem also occurs in a less severe form on older CPUs. - The system can detect when no CPUs detected a machine check and shut down the system. This can happen when one CPU is so badly hung that that it cannot process a machine check anymore or when some external agent wants to stop the system by asserting the machine check pin. This follows Intel hardware recommendations. - This matches the recommended error model by the CPU designers. - The events can be output in true severity order - When a panic happens on another CPU it makes sure to be actually be able to process the stop IPI by enabling interrupts. The code is extremly careful to handle timeouts while waiting for other CPUs. It can't rely on the normal timing mechanisms (jiffies, ktime_get) because of its asynchronous/lockless nature, so it uses own timeouts using ndelay() and a "SPINUNIT" The timeout is configurable. By default it waits for upto one second for the other CPUs. This can be also disabled. From some informal testing AMD systems do not see to broadcast machine checks, so right now it's always disabled by default on non Intel CPUs or also on very old Intel systems. Includes fixes from Ying Huang Fixed a "ecception" in a comment (H.Seto) Moved global_nwo reset later based on suggestion from H.Seto v2: Avoid duplicate messages [ Impact: feature, fixes long standing problems. ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 6 +- Documentation/x86/x86_64/machinecheck | 4 + arch/x86/kernel/cpu/mcheck/mce.c | 360 ++++++++++++++++++++-- 3 files changed, 340 insertions(+), 30 deletions(-) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 63fca718256e..0ee5e3b212f3 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -15,13 +15,17 @@ Machine check in a reboot. On Intel systems it is enabled by default. mce=nobootlog Disable boot machine check logging. - mce=tolerancelevel (number) + mce=tolerancelevel[,monarchtimeout] (number,number) + tolerance levels: 0: always panic on uncorrected errors, log corrected errors 1: panic or SIGBUS on uncorrected errors, log corrected errors 2: SIGBUS or log uncorrected errors, log corrected errors 3: never panic or SIGBUS, log all errors (for testing only) Default is 1 Can be also set using sysfs which is preferable. + monarchtimeout: + Sets the time in us to wait for other CPUs on machine checks. 0 + to disable. nomce (for compatibility with i386): same as mce=off diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck index a4fdb25446e0..b1fb30273286 100644 --- a/Documentation/x86/x86_64/machinecheck +++ b/Documentation/x86/x86_64/machinecheck @@ -69,6 +69,10 @@ trigger Program to run when a machine check event is detected. This is an alternative to running mcelog regularly from cron and allows to detect events faster. +monarch_timeout + How long to wait for the other CPUs to machine check too on a + exception. 0 to disable waiting for other CPUs. + Unit: us TBD document entries for AMD threshold interrupt configuration diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 421020f1d7db..ba431893e31d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +62,8 @@ int mce_disabled; #define MISC_MCELOG_MINOR 227 +#define SPINUNIT 100 /* 100ns */ + atomic_t mce_entry; DEFINE_PER_CPU(unsigned, mce_exception_count); @@ -77,6 +81,7 @@ static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; +static int monarch_timeout = -1; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL }; static unsigned long dont_init_banks; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DEFINE_PER_CPU(struct mce, mces_seen); +static int cpu_missing; + /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp) } if (final) print_mce(final); + if (cpu_missing) + printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); panic(msg); @@ -450,6 +460,264 @@ static int mce_no_way_out(struct mce *m, char **msg) return 0; } +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t) +{ + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_paniced)) + wait_for_panic(); + if (!monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + /* CHECKME: Make panic default for 1 too? */ + if (tolerant < 1) + mce_panic("Timeout synchronizing machine check over CPUs", + NULL, NULL); + cpu_missing = 1; + return 1; + } + *t -= SPINUNIT; +out: + touch_nmi_watchdog(); + return 0; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of an machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + char *nmsg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + &nmsg); + if (severity > global_worst) { + msg = nmsg; + global_worst = severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + mce_panic("Fatal machine check", m, msg); + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (!m && tolerant < 3) + mce_panic("Machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int no_way_out, int *order) +{ + int nwo; + int cpus = num_online_cpus(); + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) { + *order = -1; + return no_way_out; + } + + atomic_add(no_way_out, &global_nwo); + + /* + * Wait for everyone. + */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + *order = -1; + return no_way_out; + } + ndelay(SPINUNIT); + } + + /* + * Cache the global no_way_out state. + */ + nwo = atomic_read(&global_nwo); + + /* + * Monarch starts executing now, the others wait. + */ + if (*order == 1) { + atomic_set(&mce_executing, 1); + return nwo; + } + + /* + * Now start the scanning loop one by one + * in the original callin order. + * This way when there are any shared banks it will + * be only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < *order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + *order = -1; + return no_way_out; + } + ndelay(SPINUNIT); + } + return nwo; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ + int ret = -1; + u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + return 0; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + return ret; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -457,12 +725,23 @@ static int mce_no_way_out(struct mce *m, char **msg) * This is executed in NMI context not subject to normal locking rules. This * implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. */ void do_machine_check(struct pt_regs *regs, long error_code) { - struct mce m, panicm; - int panicm_found = 0; + struct mce m, *final; int i; + int worst = 0; + int severity; + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If tolerant is cranked up, we'll try anyway. @@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; + order = atomic_add_return(1, &mce_callin); mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); no_way_out = mce_no_way_out(&m, &msg); + final = &__get_cpu_var(mces_seen); + *final = m; + barrier(); + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + */ + no_way_out = mce_start(no_way_out, &order); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); if (!bank[i]) @@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - /* - * Did this bank cause the exception? - * - * Assume that the bank with uncorrectable errors did it, - * and that there is only a single one: - */ - if ((m.status & MCI_STATUS_UC) && - (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; + severity = mce_severity(&m, tolerant, NULL); + if (severity > worst) { + *final = m; + worst = severity; } } + if (!no_way_out) + mce_clear_state(toclear); + /* - * If we didn't find an uncorrectable error, pick - * the last one (shouldn't happen, just being safe). + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. */ - if (!panicm_found) - panicm = m; + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; /* * If we have decided that we just CAN'T continue, and the user * has not set tolerant to an insane level, give up and die. + * + * This is mainly used in the case when the system doesn't + * support MCE broadcasting or it has been disabled. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, msg); + mce_panic("Machine check", final, msg); /* * If the error seems to be unrecoverable, something should be @@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.ip && (panicm.cs & 3); + user_space = final->ip && (final->cs & 3); /* * If we know that the error was in user space, send a @@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (user_space) { force_sig(SIGBUS, current); } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", &panicm, msg); + mce_panic("Uncorrected machine check", final, msg); } } /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - mce_report_event(regs); - - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) { - if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } + if (worst > 0) + mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: atomic_dec(&mce_entry); @@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 0x1A) __set_bit(0, &dont_init_banks); + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + monarch_timeout < 0) + monarch_timeout = USEC_PER_SEC; } + if (monarch_timeout < 0) + monarch_timeout = 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = { /* * mce=off disables machine check - * mce=TOLERANCELEVEL (number, see above) + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. */ @@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str) mce_disabled = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); - else if (isdigit(str[0])) + else if (isdigit(str[0])) { get_option(&str, &tolerant); - else { + if (*str == ',') { + ++str; + get_option(&str, &monarch_timeout); + } + } else { printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", str); return 0; @@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); +static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = { static struct sysdev_attribute *mce_attrs[] = { &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, + &attr_monarch_timeout.attr, NULL }; From ac9603754dc7e286e62ae4f1067958d5b0075f99 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: [PATCH 115/162] x86, mce: make non Monarch panic message "Fatal machine check" too ... instead of "Machine check". This is for consistency with the Monarch panic message. Based on a report from Ying Huang. v2: But add a descriptive postfix so that the test suite can distingush. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ba431893e31d..d5cb0b4c17ff 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -554,7 +554,7 @@ static void mce_reign(void) * other CPUs. */ if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal Machine check", m, msg); /* * For UC somewhere we let the CPU who detects it handle it. @@ -858,7 +858,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * support MCE broadcasting or it has been disabled. */ if (no_way_out && tolerant < 3) - mce_panic("Machine check", final, msg); + mce_panic("Fatal machine check on current CPU", final, msg); /* * If the error seems to be unrecoverable, something should be From 1b2797dcc9f0ad89bc382ace26c6baafbc7e33c2 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 27 May 2009 21:56:51 +0200 Subject: [PATCH 116/162] x86, mce: improve mce_get_rip Assume IP on the stack is valid when either EIPV or RIPV are set. This influences whether the machine check exception handler decides to return or panic. This fixes a test case in the mce-test suite and is more compliant to the specification. This currently only makes a difference in a artificial testing scenario with the mce-test test suite. Also in addition do not force the EIPV to be valid with the exact register MSRs, and keep in trust the CS value on stack even if MSR is available. [AK: combination of patches from Huang Ying and Hidetoshi Seto, with new description by me] [add some description, no code changed - HS] Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d5cb0b4c17ff..a7dc369a9974 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -306,21 +306,22 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +/* + * Get the address of the instruction at the time of the machine check + * error. + */ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + + if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { m->ip = regs->ip; m->cs = regs->cs; } else { m->ip = 0; m->cs = 0; } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; + if (rip_msr) m->ip = mce_rdmsrl(rip_msr); - m->cs = 0; - } } #ifdef CONFIG_X86_LOCAL_APIC From 29b0f591d678838435fbb3e15ef20266f1a9e01d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:56 +0200 Subject: [PATCH 117/162] x86, mce: default to panic timeout for machine checks Fatal machine checks can be logged to disk after boot, but only if the system did a warm reboot. That's unfortunately difficult with the default panic behaviour, which waits forever and the admin has to press the power button because modern systems usually miss a reset button. This clears the machine checks in the registers and make it impossible to log them. This patch changes the default for machine check panic to always reboot after 30s. Then the mce can be successfully logged after reboot. I believe this will improve machine check experience for any system running the X server. This is dependent on successfull boot logging of MCEs. This currently only works on Intel systems, on AMD there are quite a lot of systems around which leave junk in the machine check registers after boot, so it's disabled here. These systems will continue to default to endless waiting panic. v2: Only force panic timeout when it's shorter (H.Seto) v3: Only force timeout when there is no timeout (based on comment H.Seto) [ Fix changelog - HS ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7dc369a9974..79d243145b8f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -82,6 +82,7 @@ static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; +static int mce_panic_timeout; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -216,6 +217,8 @@ static void wait_for_panic(void) local_irq_enable(); while (timeout-- > 0) udelay(1); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; panic("Panicing machine check CPU died"); } @@ -253,6 +256,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; panic(msg); } @@ -1117,6 +1122,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) } if (monarch_timeout < 0) monarch_timeout = 0; + if (mce_bootlog != 0) + mce_panic_timeout = 30; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) From 86503560e48153aba539ff117450d31ab2ef76d7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: [PATCH 118/162] x86, mce: print header/footer only once for multiple MCEs When multiple MCEs are printed print the "HARDWARE ERROR" header and "This is not a software error" footer only once. This makes the output much more compact with many CPUs. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d243145b8f..ff9c732989de 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -176,11 +176,13 @@ void mce_log(struct mce *mce) set_bit(0, ¬ify_user); } -static void print_mce(struct mce *m) +static void print_mce(struct mce *m, int *first) { - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG + if (*first) { + printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + *first = 0; + } + printk(KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { @@ -200,9 +202,12 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG "Run through mcelog --ascii to decode " - "and contact your hardware vendor\n"); +} + +static void print_mce_tail(void) +{ + printk(KERN_EMERG "This is not a software problem!\n" + KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -225,6 +230,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { int i; + int first = 1; /* * Make sure only one CPU runs in machine check panic @@ -240,7 +246,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) struct mce *m = &mcelog.entry[i]; if ((m->status & MCI_STATUS_VAL) && !(m->status & MCI_STATUS_UC)) - print_mce(m); + print_mce(m, &first); } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -248,12 +254,13 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (!(m->status & MCI_STATUS_VAL)) continue; if (!final || memcmp(m, final, sizeof(struct mce))) - print_mce(m); + print_mce(m, &first); } if (final) - print_mce(final); + print_mce(final, &first); if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); + print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); if (panic_timeout == 0) From ed7290d0ee8f81aa78bfe816f01b012f208cafc5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:57 +0200 Subject: [PATCH 119/162] x86, mce: implement new status bits The x86 architecture recently added some new machine check status bits: S(ignalled) and AR (Action-Required). Signalled allows to check if a specific event caused an exception or was just logged through CMCI. AR allows the kernel to decide if an event needs immediate action or can be delayed or ignored. Implement support for these new status bits. mce_severity() uses the new bits to grade the machine check correctly and decide what to do. The exception handler uses AR to decide to kill or not. The S bit is used to separate events between the poll/CMCI handler and the exception handler. Classical UC always leads to panic. That was true before anyways because the existing CPUs always passed a PCC with it. Also corrects the rules whether to kill in user or kernel context and how to handle missing RIPV. The machine check handler largely uses the mce-severity grading engine now instead of making its own decisions. This means the logic is centralized in one place. This is useful because it has to be evaluated multiple times. v2: Some rule fixes; Add AO events Fix RIPV, RIPV|EIPV order (Ying Huang) Fix UCNA with AR=1 message (Ying Huang) Add comment about panicing in m_c_p. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 +++ arch/x86/kernel/cpu/mcheck/mce-internal.h | 5 ++ arch/x86/kernel/cpu/mcheck/mce-severity.c | 82 ++++++++++++++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 84 ++++++++++++----------- 4 files changed, 137 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ba1f8890cf51..afd3cdf6f8ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -15,6 +15,7 @@ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ @@ -27,6 +28,15 @@ #define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ #define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ +#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ +#define MCI_STATUS_AR (1ULL<<55) /* Action required */ + +/* MISC register defines */ +#define MCM_ADDR_SEGOFF 0 /* segment offset */ +#define MCM_ADDR_LINEAR 1 /* linear address */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCM_ADDR_MEM 3 /* memory address */ +#define MCM_ADDR_GENERIC 7 /* generic */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index f126b4ae7a25..54dcb8ff12e5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -2,9 +2,14 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, MCE_UC_SEVERITY, + MCE_AR_SEVERITY, MCE_PANIC_SEVERITY, }; int mce_severity(struct mce *a, int tolerant, char **msg); + +extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c189e89a89ae..4f4d2caf4043 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -19,43 +19,117 @@ * first. Since there are quite a lot of combinations test the bits in a * table-driven way. The rules are simply processed in order, first * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) */ +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + static struct severity { u64 mask; u64 result; unsigned char sev; unsigned char mcgmask; unsigned char mcgres; + unsigned char ser; + unsigned char context; char *msg; } severities[] = { +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER #define SEV(s) .sev = MCE_ ## s ## _SEVERITY #define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } #define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } #define MCGMASK(x, res, s, m, r...) \ { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } +#define MASK(x, y, s, m, r...) \ + { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCACOD 0xffff + BITCLR(MCI_STATUS_VAL, NO, "Invalid"), BITCLR(MCI_STATUS_EN, NO, "Not enabled"), BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "No restart IP"), + /* When MCIP is not set something is very confused */ + MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, + "Neither restart nor error IP"), + MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", + KERNEL), + BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), + MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, + "Spurious not enabled", SER), + + /* ignore OVER for UCNA */ + MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, + "Uncorrected no action required", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, + "Illegal combination (UCNA with AR=1)", SER), + MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + + /* AR add known MCACODs here */ + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, + "Action required with lost events", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, + "Action required; unknown MCACOD", SER), + + /* known AO MCACODs: */ + MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, + "Action optional: memory scrubbing error", SER), + MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, + "Action optional: last level cache writeback error", SER), + + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, + "Action optional unknown MCACOD", SER), + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, + "Action optional with lost events", SER), BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), BITSET(MCI_STATUS_UC, UC, "Uncorrected"), BITSET(0, SOME, "No match") /* always matches. keep at end */ }; +/* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ +static int error_context(struct mce *m) +{ + if (m->mcgstatus & MCG_STATUS_EIPV) + return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + /* Unknown, assume kernel */ + return IN_KERNEL; +} + int mce_severity(struct mce *a, int tolerant, char **msg) { + enum context ctx = error_context(a); struct severity *s; + for (s = severities;; s++) { if ((a->status & s->mask) != s->result) continue; if ((a->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->sev > MCE_NO_SEVERITY && (a->status & MCI_STATUS_UC) && - tolerant < 1) - return MCE_PANIC_SEVERITY; + if (s->ser == SER_REQUIRED && !mce_ser) + continue; + if (s->ser == NO_SER && mce_ser) + continue; + if (s->context && ctx != s->context) + continue; if (msg) *msg = s->msg; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (panic_on_oops || tolerant < 1) + return MCE_PANIC_SEVERITY; + } return s->sev; } } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ff9c732989de..f051a7807ab4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int rip_msr; static int mce_bootlog = -1; static int monarch_timeout = -1; static int mce_panic_timeout; +int mce_ser; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; @@ -391,6 +392,15 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * Those are just logged through /dev/mcelog. * * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. */ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { @@ -417,13 +427,13 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; /* - * Uncorrected events are handled by the exception handler - * when it is enabled. But when the exception is disabled log - * everything. + * Uncorrected or signalled events are handled by the exception + * handler when it is enabled, so don't process those here. * * TBD do the same check for MCI_STATUS_EN here? */ - if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + if (!(flags & MCP_UC) && + (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; if (m.status & MCI_STATUS_MISCV) @@ -789,6 +799,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) barrier(); + /* + * When no restart IP must always kill or panic. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + /* * Go through all the banks in exclusion of the other CPUs. * This way we don't report duplicated events on shared banks @@ -809,10 +825,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; /* - * Non uncorrected errors are handled by machine_check_poll - * Leave them alone, unless this panics. + * Non uncorrected or non signaled errors are handled by + * machine_check_poll. Leave them alone, unless this panics. */ - if ((m.status & MCI_STATUS_UC) == 0 && !no_way_out) + if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) continue; /* @@ -820,17 +837,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK); - __set_bit(i, toclear); + severity = mce_severity(&m, tolerant, NULL); - if (m.status & MCI_STATUS_EN) { - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) - kill_it = 1; - } else { + /* + * When machine check was for corrected handler don't touch, + * unless we're panicing. + */ + if (severity == MCE_KEEP_SEVERITY && !no_way_out) + continue; + __set_bit(i, toclear); + if (severity == MCE_NO_SEVERITY) { /* * Machine check event was not enabled. Clear, but * ignore. @@ -838,6 +854,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } + /* + * Kill on action required. + */ + if (severity == MCE_AR_SEVERITY) + kill_it = 1; + if (m.status & MCI_STATUS_MISCV) m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); if (m.status & MCI_STATUS_ADDRV) @@ -846,7 +868,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_get_rip(&m, regs); mce_log(&m); - severity = mce_severity(&m, tolerant, NULL); if (severity > worst) { *final = m; worst = severity; @@ -879,29 +900,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) * one task, do that. If the user has set the tolerance very * high, don't try to do anything at all. */ - if (kill_it && tolerant < 3) { - int user_space = 0; - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = final->ip && (final->cs & 3); - - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * force_sig() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - force_sig(SIGBUS, current); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", final, msg); - } - } + if (kill_it && tolerant < 3) + force_sig(SIGBUS, current); /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); @@ -1049,6 +1050,9 @@ static int mce_cap_init(void) if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) rip_msr = MSR_IA32_MCG_EIP; + if (cap & MCG_SER_P) + mce_ser = 1; + return 0; } From 4611a6fa4b37cf6b8b6066ed0d605c994c62a1a0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 27 May 2009 21:56:57 +0200 Subject: [PATCH 120/162] x86, mce: export MCE severities coverage via debugfs The MCE severity judgement code is data-driven, so code coverage tools such as gcov can not be used for measuring coverage. Instead a dedicated coverage mechanism is implemented. The kernel keeps track of rules executed and reports them in debugfs. This is useful for increasing coverage of the mce-test testsuite. Right now it's unconditionally enabled because it's very little code. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 83 +++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4f4d2caf4043..ff0807f97056 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -10,6 +10,9 @@ * Author: Andi Kleen */ #include +#include +#include +#include #include #include "mce-internal.h" @@ -37,6 +40,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char covered; char *msg; } severities[] = { #define KERNEL .context = IN_KERNEL @@ -126,6 +130,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) continue; if (msg) *msg = s->msg; + s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { if (panic_on_oops || tolerant < 1) return MCE_PANIC_SEVERITY; @@ -133,3 +138,81 @@ int mce_severity(struct mce *a, int tolerant, char **msg) return s->sev; } } + +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce = NULL, *fseverities_coverage = NULL; + + dmce = debugfs_create_dir("mce", NULL); + if (dmce == NULL) + goto err_out; + fseverities_coverage = debugfs_create_file("severities-coverage", + 0444, dmce, NULL, + &severities_coverage_fops); + if (fseverities_coverage == NULL) + goto err_out; + + return 0; + +err_out: + if (fseverities_coverage) + debugfs_remove(fseverities_coverage); + if (dmce) + debugfs_remove(dmce); + return -ENOMEM; +} +late_initcall(severities_debugfs_init); From 4ef702c10b5df18ab04921fc252c26421d4d6c75 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:52 +0200 Subject: [PATCH 121/162] x86: fix panic with interrupts off (needed for MCE) For some time each panic() called with interrupts disabled triggered the !irqs_disabled() WARN_ON in smp_call_function(), producing ugly backtraces and confusing users. This is a common situation with machine checks for example which tend to call panic with interrupts disabled, but will also hit in other situations e.g. panic during early boot. In fact it means that panic cannot be called in many circumstances, which would be bad. This all started with the new fancy queued smp_call_function, which is then used by the shutdown path to shut down the other CPUs. On closer examination it turned out that the fancy RCU smp_call_function() does lots of things not suitable in a panic situation anyways, like allocating memory and relying on complex system state. I originally tried to patch this over by checking for panic there, but it was quite complicated and the original patch was also not very popular. This also didn't fix some of the underlying complexity problems. The new code in post 2.6.29 tries to patch around this by checking for oops_in_progress, but that is not enough to make this fully safe and I don't think that's a real solution because panic has to be reliable. So instead use an own vector to reboot. This makes the reboot code extremly straight forward, which is definitely a big plus in a panic situation where it is important to avoid relying on too much kernel state. The new simple code is also safe to be called from interupts off region because it is very very simple. There can be situations where it is important that panic is reliable. For example on a fatal machine check the panic is needed to get the system up again and running as quickly as possible. So it's important that panic is reliable and all function it calls simple. This is why I came up with this simple vector scheme. It's very hard to beat in simplicity. Vectors are not particularly precious anymore since all big systems are using per CPU vectors. Another possibility would have been to use an NMI similar to kdump, but there is still the problem that NMIs don't work reliably on some systems due to BIOS issues. NMIs would have been able to stop CPUs running with interrupts off too. In the sake of universal reliability I opted for using a non NMI vector for now. I put the reboot vector into the highest priority bucket of the APIC vectors and moved the 64bit UV_BAU message down instead into the next lower priority. [ Impact: bug fix, fixes an old regression ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 9 +++------ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irqinit.c | 3 +++ arch/x86/kernel/smp.c | 28 +++++++++++++++++++++++++++- 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 4cdcf5a3c96b..69f886805ecb 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, smp_invalidate_interrupt) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4e59197e29ba..1c8f28a63058 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -45,6 +45,7 @@ extern void invalidate_interrupt6(void); extern void invalidate_interrupt7(void); extern void irq_move_cleanup_interrupt(void); +extern void reboot_interrupt(void); extern void threshold_interrupt(void); extern void call_function_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 68f7cf84a333..28477e4f2d49 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -88,12 +88,7 @@ #define CALL_FUNCTION_SINGLE_VECTOR 0xfb #define THERMAL_APIC_VECTOR 0xfa #define THRESHOLD_APIC_VECTOR 0xf9 - -#ifdef CONFIG_X86_32 -/* 0xf8 : free */ -#else -# define UV_BAU_MESSAGE 0xf8 -#endif +#define REBOOT_VECTOR 0xf8 /* f0-f7 used for spreading out TLB flushes: */ #define INVALIDATE_TLB_VECTOR_END 0xf7 @@ -117,6 +112,8 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +#define UV_BAU_MESSAGE 0xec + /* * Self IPI vector for machine checks */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 711c130a8411..4234b1235652 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -976,6 +976,8 @@ END(\sym) #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 441f6ec6e9d4..4a69ec55be3d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -173,6 +173,9 @@ static void __init smp_intr_init(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + + /* IPI used for rebooting/stopping */ + alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); #endif #endif /* CONFIG_SMP */ } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index f6db48c405b8..bf1831aa14fa 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask) * this function calls the 'stop' function on all other CPUs in the system. */ +asmlinkage void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + static void native_smp_send_stop(void) { unsigned long flags; + unsigned long wait; if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0); + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + * On most systems we could also use an NMI here, + * but there are a few systems around where NMI + * is problematic so stay with an non NMI for now + * (this implies we cannot stop CPUs spinning with irq off + * currently) + */ + if (num_online_cpus() > 1) { + apic->send_IPI_allbutself(REBOOT_VECTOR); + + /* Don't wait longer than a second */ + wait = USEC_PER_SEC; + while (num_online_cpus() > 1 && wait--) + udelay(1); + } + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); From 9ff36ee9668ff41ec3274597c730524645929b0f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: [PATCH 122/162] x86, mce: rename mce_notify_user to mce_notify_irq Rename the mce_notify_user function to mce_notify_irq. The next patch will split the wakeup handling of interrupt context and of process context and it's better to give it a clearer name for this. Contains a fix from Ying Huang [ Impact: cleanup ] Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Cc: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++----- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/signal.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index afd3cdf6f8ad..713926b62cbb 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -159,7 +159,7 @@ enum mcp_flags { }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -int mce_notify_user(void); +int mce_notify_irq(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7d858fb4ce67..a3a235a53f09 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -65,7 +65,7 @@ static void raise_mce(unsigned long data) memset(&b, 0xff, sizeof(mce_banks_t)); printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); machine_check_poll(0, &b); - mce_notify_user(); + mce_notify_irq(); printk(KERN_INFO "Finished machine check poll on CPU %d\n", cpu); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f051a7807ab4..13e1b7ffe73a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -348,7 +348,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) ack_APIC_irq(); exit_idle(); irq_enter(); - mce_notify_user(); + mce_notify_irq(); irq_exit(); } #endif @@ -356,7 +356,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) static void mce_report_event(struct pt_regs *regs) { if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_user(); + mce_notify_irq(); return; } @@ -968,7 +968,7 @@ static void mcheck_timer(unsigned long data) * polling interval, otherwise increase the polling interval. */ n = &__get_cpu_var(next_interval); - if (mce_notify_user()) + if (mce_notify_irq()) *n = max(*n/2, HZ/100); else *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); @@ -989,7 +989,7 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); * Can be called from interrupt context, but not from machine check/NMI * context. */ -int mce_notify_user(void) +int mce_notify_irq(void) { /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); @@ -1014,7 +1014,7 @@ int mce_notify_user(void) } return 0; } -EXPORT_SYMBOL_GPL(mce_notify_user); +EXPORT_SYMBOL_GPL(mce_notify_irq); /* * Initialize Machine Checks for a CPU. diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index eff3740501a3..b7c5a2470b40 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -80,7 +80,7 @@ static int cmci_supported(int *banks) static void intel_threshold_interrupt(void) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); - mce_notify_user(); + mce_notify_irq(); } static void print_update(char *type, int *hdr, int num) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d0851e3f77eb..d5dc15bce005 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); + mce_notify_irq(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ From 8fa8dd9e3aafb7b440b7d54219891615abc6390e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:58 +0200 Subject: [PATCH 123/162] x86, mce: define MCE_VECTOR Add MCE_VECTOR for the #MC exception. Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 28477e4f2d49..1b35c4357ea8 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -25,6 +25,7 @@ */ #define NMI_VECTOR 0x02 +#define MCE_VECTOR 0x12 /* * IDT vectors usable for external interrupt sources start From 9b1beaf2b551a8a1604f104025b24e9c535c8963 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2009 21:56:59 +0200 Subject: [PATCH 124/162] x86, mce: support action-optional machine checks Newer Intel CPUs support a new class of machine checks called recoverable action optional. Action Optional means that the CPU detected some form of corruption in the background and tells the OS about using a machine check exception. The OS can then take appropiate action, like killing the process with the corrupted data or logging the event properly to disk. This is done by the new generic high level memory failure handler added in a earlier patch. The high level handler takes the address with the failed memory and does the appropiate action, like killing the process. In this version of the patch the high level handler is stubbed out with a weak function to not create a direct dependency on the hwpoison branch. The high level handler cannot be directly called from the machine check exception though, because it has to run in a defined process context to be able to sleep when taking VM locks (it is not expected to sleep for a long time, just do so in some exceptional cases like lock contention) Thus the MCE handler has to queue a work item for process context, trigger process context and then call the high level handler from there. This patch adds two path to process context: through a per thread kernel exit notify_user() callback or through a high priority work item. The first runs when the process exits back to user space, the other when it goes to sleep and there is no higher priority process. The machine check handler will schedule both, and whoever runs first will grab the event. This is done because quick reaction to this event is critical to avoid a potential more fatal machine check when the corruption is consumed. There is a simple lock less ring buffer to queue the corrupted addresses between the exception handler and the process context handler. Then in process context it just calls the high level VM code with the corrupted PFNs. The code adds the required code to extract the failed address from the CPU's machine check registers. It doesn't try to handle all possible cases -- the specification has 6 different ways to specify memory address -- but only the linear address. Most of the required checking has been already done earlier in the mce_severity rule checking engine. Following the Intel recommendations Action Optional errors are only enabled for known situations (encoded in MCACODs). The errors are ignored otherwise, because they are action optional. v2: Improve comment, disable preemption while processing ring buffer (reported by Ying Huang) Signed-off-by: Andi Kleen Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 133 +++++++++++++++++++++++++++++++ arch/x86/kernel/signal.c | 2 +- 3 files changed, 135 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 713926b62cbb..82978ad12072 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -160,6 +160,7 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); +void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 13e1b7ffe73a..d4e7b5947a0e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,8 @@ static inline int skip_bank_init(int i) return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); } +static DEFINE_PER_CPU(struct work_struct, mce_work); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -312,6 +315,61 @@ static void mce_wrmsrl(u32 msr, u64 v) wrmsrl(msr, v); } +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. + */ +#define MCE_RING_SIZE 16 /* we use one entry less */ + +struct mce_ring { + unsigned short start; + unsigned short end; + unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + + return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ + struct mce_ring *r; + int ret = 0; + + *pfn = 0; + get_cpu(); + r = &__get_cpu_var(mce_ring); + if (r->start == r->end) + goto out; + *pfn = r->ring[r->start]; + r->start = (r->start + 1) % MCE_RING_SIZE; + ret = 1; +out: + put_cpu(); + return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + unsigned next; + + next = (r->end + 1) % MCE_RING_SIZE; + if (next == r->start) + return -1; + r->ring[r->end] = pfn; + wmb(); + r->end = next; + return 0; +} + int mce_available(struct cpuinfo_x86 *c) { if (mce_disabled) @@ -319,6 +377,15 @@ int mce_available(struct cpuinfo_x86 *c) return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } +static void mce_schedule_work(void) +{ + if (!mce_ring_empty()) { + struct work_struct *work = &__get_cpu_var(mce_work); + if (!work_pending(work)) + schedule_work(work); + } +} + /* * Get the address of the instruction at the time of the machine check * error. @@ -349,6 +416,7 @@ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); mce_notify_irq(); + mce_schedule_work(); irq_exit(); } #endif @@ -357,6 +425,13 @@ static void mce_report_event(struct pt_regs *regs) { if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { mce_notify_irq(); + /* + * Triggering the work queue here is just an insurance + * policy in case the syscall exit notify handler + * doesn't run soon enough or ends up running on the + * wrong CPU (can happen when audit sleeps) + */ + mce_schedule_work(); return; } @@ -731,6 +806,23 @@ reset: return ret; } +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses upto page granuality for now. + */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + if ((m->misc & 0x3f) > PAGE_SHIFT) + return 0; + if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + return 0; + return 1; +} + static void mce_clear_state(unsigned long *toclear) { int i; @@ -865,6 +957,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (m.status & MCI_STATUS_ADDRV) m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + /* + * Action optional error. Queue address for later processing. + * When the ring overflows we just ignore the AO error. + * RED-PEN add some logging mechanism when + * usable_address or mce_add_ring fails. + * RED-PEN don't ignore overflow for tolerant == 0 + */ + if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_get_rip(&m, regs); mce_log(&m); @@ -916,6 +1018,36 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); +/* dummy to break dependency. actual code is in mm/memory-failure.c */ +void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +{ + printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); +} + +/* + * Called after mce notification in process context. This code + * is allowed to sleep. Call the high level VM handler to process + * any corrupted pages. + * Assume that the work queue code only calls this one at a time + * per CPU. + * Note we don't disable preemption, so this code might run on the wrong + * CPU. In this case the event is picked up by the scheduled work queue. + * This is merely a fast path to expedite processing in some common + * cases. + */ +void mce_notify_process(void) +{ + unsigned long pfn; + mce_notify_irq(); + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR); +} + +static void mce_process_work(struct work_struct *dummy) +{ + mce_notify_process(); +} + #ifdef CONFIG_X86_MCE_INTEL /*** * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog @@ -1204,6 +1336,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(); mce_cpu_features(c); mce_init_timer(); + INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); } /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5dc15bce005..4976888094f0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -860,7 +860,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_NEW_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_irq(); + mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ From 8051dbd2dfd1427cc102888d7d96bf39de0be150 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 2 Jun 2009 16:53:23 +0900 Subject: [PATCH 125/162] x86, mce: fix for mce counters Make the MCE counters work on 32bit and add poll count in arch_irq_stat_cpu. Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index eff46b5de62f..9773395aa758 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -95,7 +95,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_NEW_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -172,9 +172,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) { u64 sum = irq_stats(cpu)->__nmi_count; -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - sum += per_cpu(mce_exception_count, cpu); -#endif #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; @@ -191,6 +188,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) # ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; # endif +#endif +#ifdef CONFIG_X86_NEW_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); #endif return sum; } From bbb0a4247aaf1eabbd6d87750eafe99c577920f7 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 16 Jan 2009 09:49:50 -0700 Subject: [PATCH 126/162] Document Reported-by in SubmittingPatches Randy pointed out that the Reported-By tag should be documented with the others in SubmittingPatches. Reported-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/SubmittingPatches | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index f309d3c6221c..6f29e29555c0 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -405,7 +405,14 @@ person it names. This tag documents that potentially interested parties have been included in the discussion -14) Using Tested-by: and Reviewed-by: +14) Using Reported-by:, Tested-by: and Reviewed-by: + +If this patch fixes a problem reported by somebody else, consider adding a +Reported-by: tag to credit the reporter for their contribution. Please +note that this tag should not be added without the reporter's permission, +especially if the problem was not reported in a public forum. That said, +if we diligently credit our bug reporters, they will, hopefully, be +inspired to help us again in the future. A Tested-by: tag indicates that the patch has been successfully tested (in some environment) by the person named. This tag informs maintainers that From 5d98932ab0acb699dc56d9e252f056b9b2cdab25 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 21 Apr 2009 13:33:06 -0600 Subject: [PATCH 127/162] docs: Encourage better changelogs in the development process document Add a couple of paragraphs to the "patch formatting" section on how patches should be described. This text is shamelessly cribbed from suggestions posted by Rusty Russell. Signed-off-by: Jonathan Corbet --- Documentation/development-process/5.Posting | 31 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/Documentation/development-process/5.Posting b/Documentation/development-process/5.Posting index dd48132a74dd..f622c1e9f0f9 100644 --- a/Documentation/development-process/5.Posting +++ b/Documentation/development-process/5.Posting @@ -119,7 +119,7 @@ which takes quite a bit of time and thought after the "real work" has been done. When done properly, though, it is time well spent. -5.4: PATCH FORMATTING +5.4: PATCH FORMATTING AND CHANGELOGS So now you have a perfect series of patches for posting, but the work is not done quite yet. Each patch needs to be formatted into a message which @@ -146,8 +146,33 @@ that end, each patch will be composed of the following: - One or more tag lines, with, at a minimum, one Signed-off-by: line from the author of the patch. Tags will be described in more detail below. -The above three items should, normally, be the text used when committing -the change to a revision control system. They are followed by: +The items above, together, form the changelog for the patch. Writing good +changelogs is a crucial but often-neglected art; it's worth spending +another moment discussing this issue. When writing a changelog, you should +bear in mind that a number of different people will be reading your words. +These include subsystem maintainers and reviewers who need to decide +whether the patch should be included, distributors and other maintainers +trying to decide whether a patch should be backported to other kernels, bug +hunters wondering whether the patch is responsible for a problem they are +chasing, users who want to know how the kernel has changed, and more. A +good changelog conveys the needed information to all of these people in the +most direct and concise way possible. + +To that end, the summary line should describe the effects of and motivation +for the change as well as possible given the one-line constraint. The +detailed description can then amplify on those topics and provide any +needed additional information. If the patch fixes a bug, cite the commit +which introduced the bug if possible. If a problem is associated with +specific log or compiler output, include that output to help others +searching for a solution to the same problem. If the change is meant to +support other changes coming in later patch, say so. If internal APIs are +changed, detail those changes and how other developers should respond. In +general, the more you can put yourself into the shoes of everybody who will +be reading your changelog, the better that changelog (and the kernel as a +whole) will be. + +Needless to say, the changelog should be the text used when committing the +change to a revision control system. It will be followed by: - The patch itself, in the unified ("-u") patch format. Using the "-p" option to diff will associate function names with changes, making the From 5801da1b2f1207da21271ffd6768cd40a6c7f1c4 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 4 Jun 2009 16:26:50 +0200 Subject: [PATCH 128/162] SubmittingPatches: fix typo Fix typo. Signed-off-by: Pavel Machek Signed-off-by: Jonathan Corbet --- Documentation/SubmittingPatches | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 6f29e29555c0..71b6da2b7175 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -451,7 +451,7 @@ offer a Reviewed-by tag for a patch. This tag serves to give credit to reviewers and to inform maintainers of the degree of review which has been done on the patch. Reviewed-by: tags, when supplied by reviewers known to understand the subject area and to perform thorough reviews, will normally -increase the liklihood of your patch getting into the kernel. +increase the likelihood of your patch getting into the kernel. 15) The canonical patch format From 2ae19acaa50a09c1099956efb895c0aca74ab050 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Apr 2009 07:44:45 -0400 Subject: [PATCH 129/162] Documentation: Add "how to write a good patch summary" to SubmittingPatches Unfortunately many patch submissions are arriving with painfully poor patch descriptions. As a result of the discussion on LKML: http://lkml.org/lkml/2009/4/15/296 explain how to submit a better patch description, in the (perhaps vain) hope that maintainers won't end up having to rewrite the git commit logs as often as they do today. Signed-off-by: "Theodore Ts'o" Cc: Ingo Molnar Signed-off-by: Jonathan Corbet --- Documentation/SubmittingPatches | 65 ++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 71b6da2b7175..6c456835c1fd 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -91,6 +91,10 @@ Be as specific as possible. The WORST descriptions possible include things like "update driver X", "bug fix for driver X", or "this patch includes updates for subsystem X. Please apply." +The maintainer will thank you if you write your patch description in a +form which can be easily pulled into Linux's source code management +system, git, as a "commit log". See #15, below. + If your description starts to get long, that's a sign that you probably need to split up your patch. See #3, next. @@ -492,12 +496,33 @@ phrase" should not be a filename. Do not use the same "summary phrase" for every patch in a whole patch series (where a "patch series" is an ordered sequence of multiple, related patches). -Bear in mind that the "summary phrase" of your email becomes -a globally-unique identifier for that patch. It propagates -all the way into the git changelog. The "summary phrase" may -later be used in developer discussions which refer to the patch. -People will want to google for the "summary phrase" to read -discussion regarding that patch. +Bear in mind that the "summary phrase" of your email becomes a +globally-unique identifier for that patch. It propagates all the way +into the git changelog. The "summary phrase" may later be used in +developer discussions which refer to the patch. People will want to +google for the "summary phrase" to read discussion regarding that +patch. It will also be the only thing that people may quickly see +when, two or three months later, they are going through perhaps +thousands of patches using tools such as "gitk" or "git log +--oneline". + +For these reasons, the "summary" must be no more than 70-75 +characters, and it must describe both what the patch changes, as well +as why the patch might be necessary. It is challenging to be both +succinct and descriptive, but that is what a well-written summary +should do. + +The "summary phrase" may be prefixed by tags enclosed in square +brackets: "Subject: [PATCH tag] ". The tags are not +considered part of the summary phrase, but describe how the patch +should be treated. Common tags might include a version descriptor if +the multiple versions of the patch have been sent out in response to +comments (i.e., "v1, v2, v3"), or "RFC" to indicate a request for +comments. If there are four patches in a patch series the individual +patches may be numbered like this: 1/4, 2/4, 3/4, 4/4. This assures +that developers understand the order in which the patches should be +applied and that they have reviewed or applied all of the patches in +the patch series. A couple of example Subjects: @@ -517,19 +542,31 @@ the patch author in the changelog. The explanation body will be committed to the permanent source changelog, so should make sense to a competent reader who has long since forgotten the immediate details of the discussion that might -have led to this patch. +have led to this patch. Including symptoms of the failure which the +patch addresses (kernel log messages, oops messages, etc.) is +especially useful for people who might be searching the commit logs +looking for the applicable patch. If a patch fixes a compile failure, +it may not be necessary to include _all_ of the compile failures; just +enough that it is likely that someone searching for the patch can find +it. As in the "summary phrase", it is important to be both succinct as +well as descriptive. The "---" marker line serves the essential purpose of marking for patch handling tools where the changelog message ends. One good use for the additional comments after the "---" marker is for -a diffstat, to show what files have changed, and the number of inserted -and deleted lines per file. A diffstat is especially useful on bigger -patches. Other comments relevant only to the moment or the maintainer, -not suitable for the permanent changelog, should also go here. -Use diffstat options "-p 1 -w 70" so that filenames are listed from the -top of the kernel source tree and don't use too much horizontal space -(easily fit in 80 columns, maybe with some indentation). +a diffstat, to show what files have changed, and the number of +inserted and deleted lines per file. A diffstat is especially useful +on bigger patches. Other comments relevant only to the moment or the +maintainer, not suitable for the permanent changelog, should also go +here. A good example of such comments might be "patch changelogs" +which describe what has changed between the v1 and v2 version of the +patch. + +If you are going to include a diffstat after the "---" marker, please +use diffstat options "-p 1 -w 70" so that filenames are listed from +the top of the kernel source tree and don't use too much horizontal +space (easily fit in 80 columns, maybe with some indentation). See more details on the proper patch format in the following references. From f89d7eaf6c34828070f407d0e04b73127f176ec5 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 4 Jun 2009 16:35:25 -0600 Subject: [PATCH 130/162] Document the debugfs API This is an updated document covering the internal API for the debugfs filesystem. Thanks to Shen Feng for suggesting that I put this text here and noting that the old LWN version was rather out of date. Acked-by: Greg Kroah-Hartman Reported-by: Shen Feng Signed-off-by: Jonathan Corbet --- Documentation/filesystems/debugfs.txt | 158 ++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 Documentation/filesystems/debugfs.txt diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt new file mode 100644 index 000000000000..ed52af60c2d8 --- /dev/null +++ b/Documentation/filesystems/debugfs.txt @@ -0,0 +1,158 @@ +Copyright 2009 Jonathan Corbet + +Debugfs exists as a simple way for kernel developers to make information +available to user space. Unlike /proc, which is only meant for information +about a process, or sysfs, which has strict one-value-per-file rules, +debugfs has no rules at all. Developers can put any information they want +there. The debugfs filesystem is also intended to not serve as a stable +ABI to user space; in theory, there are no stability constraints placed on +files exported there. The real world is not always so simple, though [1]; +even debugfs interfaces are best designed with the idea that they will need +to be maintained forever. + +Debugfs is typically mounted with a command like: + + mount -t debugfs none /sys/kernel/debug + +(Or an equivalent /etc/fstab line). + +Note that the debugfs API is exported GPL-only to modules. + +Code using debugfs should include . Then, the first order +of business will be to create at least one directory to hold a set of +debugfs files: + + struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); + +This call, if successful, will make a directory called name underneath the +indicated parent directory. If parent is NULL, the directory will be +created in the debugfs root. On success, the return value is a struct +dentry pointer which can be used to create files in the directory (and to +clean it up at the end). A NULL return value indicates that something went +wrong. If ERR_PTR(-ENODEV) is returned, that is an indication that the +kernel has been built without debugfs support and none of the functions +described below will work. + +The most general way to create a file within a debugfs directory is with: + + struct dentry *debugfs_create_file(const char *name, mode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); + +Here, name is the name of the file to create, mode describes the access +permissions the file should have, parent indicates the directory which +should hold the file, data will be stored in the i_private field of the +resulting inode structure, and fops is a set of file operations which +implement the file's behavior. At a minimum, the read() and/or write() +operations should be provided; others can be included as needed. Again, +the return value will be a dentry pointer to the created file, NULL for +error, or ERR_PTR(-ENODEV) if debugfs support is missing. + +In a number of cases, the creation of a set of file operations is not +actually necessary; the debugfs code provides a number of helper functions +for simple situations. Files containing a single integer value can be +created with any of: + + struct dentry *debugfs_create_u8(const char *name, mode_t mode, + struct dentry *parent, u8 *value); + struct dentry *debugfs_create_u16(const char *name, mode_t mode, + struct dentry *parent, u16 *value); + struct dentry *debugfs_create_u32(const char *name, mode_t mode, + struct dentry *parent, u32 *value); + struct dentry *debugfs_create_u64(const char *name, mode_t mode, + struct dentry *parent, u64 *value); + +These files support both reading and writing the given value; if a specific +file should not be written to, simply set the mode bits accordingly. The +values in these files are in decimal; if hexadecimal is more appropriate, +the following functions can be used instead: + + struct dentry *debugfs_create_x8(const char *name, mode_t mode, + struct dentry *parent, u8 *value); + struct dentry *debugfs_create_x16(const char *name, mode_t mode, + struct dentry *parent, u16 *value); + struct dentry *debugfs_create_x32(const char *name, mode_t mode, + struct dentry *parent, u32 *value); + +Note that there is no debugfs_create_x64(). + +These functions are useful as long as the developer knows the size of the +value to be exported. Some types can have different widths on different +architectures, though, complicating the situation somewhat. There is a +function meant to help out in one special case: + + struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *parent, + size_t *value); + +As might be expected, this function will create a debugfs file to represent +a variable of type size_t. + +Boolean values can be placed in debugfs with: + + struct dentry *debugfs_create_bool(const char *name, mode_t mode, + struct dentry *parent, u32 *value); + +A read on the resulting file will yield either Y (for non-zero values) or +N, followed by a newline. If written to, it will accept either upper- or +lower-case values, or 1 or 0. Any other input will be silently ignored. + +Finally, a block of arbitrary binary data can be exported with: + + struct debugfs_blob_wrapper { + void *data; + unsigned long size; + }; + + struct dentry *debugfs_create_blob(const char *name, mode_t mode, + struct dentry *parent, + struct debugfs_blob_wrapper *blob); + +A read of this file will return the data pointed to by the +debugfs_blob_wrapper structure. Some drivers use "blobs" as a simple way +to return several lines of (static) formatted text output. This function +can be used to export binary information, but there does not appear to be +any code which does so in the mainline. Note that all files created with +debugfs_create_blob() are read-only. + +There are a couple of other directory-oriented helper functions: + + struct dentry *debugfs_rename(struct dentry *old_dir, + struct dentry *old_dentry, + struct dentry *new_dir, + const char *new_name); + + struct dentry *debugfs_create_symlink(const char *name, + struct dentry *parent, + const char *target); + +A call to debugfs_rename() will give a new name to an existing debugfs +file, possibly in a different directory. The new_name must not exist prior +to the call; the return value is old_dentry with updated information. +Symbolic links can be created with debugfs_create_symlink(). + +There is one important thing that all debugfs users must take into account: +there is no automatic cleanup of any directories created in debugfs. If a +module is unloaded without explicitly removing debugfs entries, the result +will be a lot of stale pointers and no end of highly antisocial behavior. +So all debugfs users - at least those which can be built as modules - must +be prepared to remove all files and directories they create there. A file +can be removed with: + + void debugfs_remove(struct dentry *dentry); + +The dentry value can be NULL, in which case nothing will be removed. + +Once upon a time, debugfs users were required to remember the dentry +pointer for every debugfs file they created so that all files could be +cleaned up. We live in more civilized times now, though, and debugfs users +can call: + + void debugfs_remove_recursive(struct dentry *dentry); + +If this function is passed a pointer for the dentry corresponding to the +top-level directory, the entire hierarchy below that directory will be +removed. + +Notes: + [1] http://lwn.net/Articles/309298/ From a89ab11454b476d2d9e8a10f903360a62e21bac8 Mon Sep 17 00:00:00 2001 From: Peter Ma Date: Mon, 8 Jun 2009 12:54:02 +0200 Subject: [PATCH 131/162] avr32: Change Atmel ATNGW100 config to add choice of add-on board Signed-off-by: Peter Ma Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/atngw100/Kconfig | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/avr32/boards/atngw100/Kconfig b/arch/avr32/boards/atngw100/Kconfig index b3f99477bbeb..f41c5335e6a9 100644 --- a/arch/avr32/boards/atngw100/Kconfig +++ b/arch/avr32/boards/atngw100/Kconfig @@ -2,8 +2,15 @@ if BOARD_ATNGW100 +choice + prompt "Select an NGW100 add-on board to support" + default BOARD_ATNGW100_ADDON_NONE + +config BOARD_ATNGW100_ADDON_NONE + bool "None" + config BOARD_ATNGW100_EVKLCD10X - bool "Add support for EVKLCD10X addon board" + bool "EVKLCD10X addon board" help This enables support for the EVKLCD100 (QVGA) or EVKLCD101 (VGA) addon board for the NGW100. By enabling this the LCD controller and @@ -14,7 +21,7 @@ config BOARD_ATNGW100_EVKLCD10X The MCI pins can be reenabled by editing the "add device function" but this may break the setup for other displays that use these pins. - Choose 'Y' here if you have a EVKLCD100/101 connected to the NGW100. +endchoice choice prompt "LCD panel resolution on EVKLCD10X" From 4024533e60787a5507818b0c0fdb44ddb522cdf5 Mon Sep 17 00:00:00 2001 From: Peter Ma Date: Mon, 8 Jun 2009 12:55:35 +0200 Subject: [PATCH 132/162] avr32: Add support for Mediama RMTx add-on board for ATNGW100 Signed-off-by: Peter Ma Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/atngw100/Kconfig | 16 + arch/avr32/boards/atngw100/Kconfig_mrmt | 80 ++ arch/avr32/boards/atngw100/Makefile | 1 + arch/avr32/boards/atngw100/mrmt.c | 373 ++++++ arch/avr32/boards/atngw100/setup.c | 5 + arch/avr32/configs/atngw100_mrmt_defconfig | 1363 ++++++++++++++++++++ 6 files changed, 1838 insertions(+) create mode 100644 arch/avr32/boards/atngw100/Kconfig_mrmt create mode 100644 arch/avr32/boards/atngw100/mrmt.c create mode 100644 arch/avr32/configs/atngw100_mrmt_defconfig diff --git a/arch/avr32/boards/atngw100/Kconfig b/arch/avr32/boards/atngw100/Kconfig index f41c5335e6a9..be27a0218ab4 100644 --- a/arch/avr32/boards/atngw100/Kconfig +++ b/arch/avr32/boards/atngw100/Kconfig @@ -21,6 +21,18 @@ config BOARD_ATNGW100_EVKLCD10X The MCI pins can be reenabled by editing the "add device function" but this may break the setup for other displays that use these pins. +config BOARD_ATNGW100_MRMT + bool "Mediama RMT1/2 add-on board" + help + This enables support for the Mediama RMT1 or RMT2 board. + RMT provides LCD support, AC97 codec and other + optional peripherals to the Atmel NGW100. + + This choice disables the detect pin and the write-protect pin for the + MCI platform device, since it conflicts with the LCD platform device. + The MCI pins can be reenabled by editing the "add device function" but + this may break the setup for other displays that use these pins. + endchoice choice @@ -39,4 +51,8 @@ config BOARD_ATNGW100_EVKLCD10X_POW_QVGA endchoice +if BOARD_ATNGW100_MRMT +source "arch/avr32/boards/atngw100/Kconfig_mrmt" +endif + endif # BOARD_ATNGW100 diff --git a/arch/avr32/boards/atngw100/Kconfig_mrmt b/arch/avr32/boards/atngw100/Kconfig_mrmt new file mode 100644 index 000000000000..9a199a207f3c --- /dev/null +++ b/arch/avr32/boards/atngw100/Kconfig_mrmt @@ -0,0 +1,80 @@ +# RMT for NGW100 customization + +choice + prompt "RMT Version" + help + Select the RMTx board version. + +config BOARD_MRMT_REV1 + bool "RMT1" +config BOARD_MRMT_REV2 + bool "RMT2" + +endchoice + +config BOARD_MRMT_AC97 + bool "Enable AC97 CODEC" + help + Enable the UCB1400 AC97 CODEC driver. + +choice + prompt "Touchscreen Driver" + default BOARD_MRMT_ADS7846_TS + +config BOARD_MRMT_UCB1400_TS + bool "Use UCB1400 Touchscreen" + +config BOARD_MRMT_ADS7846_TS + bool "Use ADS7846 Touchscreen" + +endchoice + +choice + prompt "RMTx LCD Selection" + default BOARD_MRMT_LCD_DISABLE + +config BOARD_MRMT_LCD_DISABLE + bool "LCD Disabled" + +config BOARD_MRMT_LCD_LQ043T3DX0X + bool "Sharp LQ043T3DX0x or compatible" + help + If using RMT2, be sure to load the resistor pack selectors accordingly + +if BOARD_MRMT_REV2 +config BOARD_MRMT_LCD_KWH043GM08 + bool "Formike KWH043GM08 or compatible" + help + Be sure to load the RMT2 resistor pack selectors accordingly +endif + +endchoice + +if !BOARD_MRMT_LCD_DISABLE +config BOARD_MRMT_BL_PWM + bool "Use PWM control for LCD Backlight" + help + Use PWM driver for controlling LCD Backlight. + Otherwise, LCD Backlight is always on. +endif + +config BOARD_MRMT_RTC_I2C + bool "Use External RTC on I2C Bus" + help + RMT1 has an optional RTC device on the I2C bus. + It is a SII S35390A. Be sure to select the + matching RTC driver. + +choice + prompt "Wireless Module on ttyS2" + default BOARD_MRMT_WIRELESS_ZB + +config BOARD_MRMT_WIRELESS_ZB + bool "Use ZigBee/802.15.4 Module" + +config BOARD_MRMT_WIRELESS_BT + bool "Use Bluetooth (HCI) Module" + +config BOARD_MRMT_WIRELESS_NONE + bool "Not Installed" +endchoice diff --git a/arch/avr32/boards/atngw100/Makefile b/arch/avr32/boards/atngw100/Makefile index 6376f5322e4d..f4ebe42a8254 100644 --- a/arch/avr32/boards/atngw100/Makefile +++ b/arch/avr32/boards/atngw100/Makefile @@ -1,2 +1,3 @@ obj-y += setup.o flash.o obj-$(CONFIG_BOARD_ATNGW100_EVKLCD10X) += evklcd10x.o +obj-$(CONFIG_BOARD_ATNGW100_MRMT) += mrmt.o diff --git a/arch/avr32/boards/atngw100/mrmt.c b/arch/avr32/boards/atngw100/mrmt.c new file mode 100644 index 000000000000..bf78e516a85f --- /dev/null +++ b/arch/avr32/boards/atngw100/mrmt.c @@ -0,0 +1,373 @@ +/* + * Board-specific setup code for Remote Media Terminal 1 (RMT1) + * add-on board for the ATNGW100 Network Gateway + * + * Copyright (C) 2008 Mediama Technologies + * Based on ATNGW100 Network Gateway (Copyright (C) Atmel) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include