From 5c9afbda911ce20b3f2181d1e440a0222e1027dd Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 22:37:53 +0100 Subject: [PATCH 01/61] dmaengine: ioat: Fix error handling path If the last test in 'ioat_dma_self_test()' fails, we must release all the allocated resources and not just part of them. Signed-off-by: Christophe JAILLET Acked-by: Dave Jiang Signed-off-by: Vinod Koul --- drivers/dma/ioat/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 2f31d3d0caa618..7792a9186f9cf3 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -390,7 +390,7 @@ static int ioat_dma_self_test(struct ioatdma_device *ioat_dma) if (memcmp(src, dest, IOAT_TEST_SIZE)) { dev_err(dev, "Self-test copy failed compare, disabling\n"); err = -ENODEV; - goto free_resources; + goto unmap_dma; } unmap_dma: From 62a277d43d47e74972de44d33bd3763e31992414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 20 Nov 2017 08:28:14 -0600 Subject: [PATCH 02/61] dmaengine: at_hdmac: fix potential NULL pointer dereference in atc_prep_dma_interleaved _xt_ is being dereferenced before it is null checked, hence there is a potential null pointer dereference. Fix this by moving the pointer dereference after _xt_ has been null checked. This issue was detected with the help of Coccinelle. Fixes: 4483320e241c ("dmaengine: Use Pointer xt after NULL check.") Signed-off-by: Gustavo A. R. Silva Acked-by: Ludovic Desroches Signed-off-by: Vinod Koul --- drivers/dma/at_hdmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index fbab271b3bf9f9..a861b5b4d4437d 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -708,7 +708,7 @@ atc_prep_dma_interleaved(struct dma_chan *chan, unsigned long flags) { struct at_dma_chan *atchan = to_at_dma_chan(chan); - struct data_chunk *first = xt->sgl; + struct data_chunk *first; struct at_desc *desc = NULL; size_t xfer_count; unsigned int dwidth; @@ -720,6 +720,8 @@ atc_prep_dma_interleaved(struct dma_chan *chan, if (unlikely(!xt || xt->numf != 1 || !xt->frame_size)) return NULL; + first = xt->sgl; + dev_info(chan2dev(chan), "%s: src=%pad, dest=%pad, numf=%d, frame_size=%d, flags=0x%lx\n", __func__, &xt->src_start, &xt->dst_start, xt->numf, From fc82228a5e3860502dbf3bfa4a9570cb7093cf7f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 3 Dec 2017 20:38:01 -0500 Subject: [PATCH 03/61] ext4: support fast symlinks from ext3 file systems 407cd7fb83c0 (ext4: change fast symlink test to not rely on i_blocks) broke ~10 years old ext3 file systems created by 2.6.17. Any ELF executable fails because the /lib/ld-linux.so.2 fast symlink cannot be read anymore. The patch assumed fast symlinks were created in a specific way, but that's not true on these really old file systems. The new behavior is apparently needed only with the large EA inode feature. Revert to the old behavior if the large EA inode feature is not set. This makes my old VM boot again. Fixes: 407cd7fb83c0 (ext4: change fast symlink test to not rely on i_blocks) Signed-off-by: Andi Kleen Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7df2c5644e59c9..534a9130f62578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,6 +149,15 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_inode_is_fast_symlink(struct inode *inode) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); + } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } From c894aa97577e47d3066b27b32499ecf899bfa8b0 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sun, 3 Dec 2017 22:52:51 -0500 Subject: [PATCH 04/61] ext4: fix fdatasync(2) after fallocate(2) operation Currently, fallocate(2) with KEEP_SIZE followed by a fdatasync(2) then crash, we'll see wrong allocated block number (stat -c %b), the blocks allocated beyond EOF are all lost. fstests generic/468 exposes this bug. Commit 67a7d5f561f4 ("ext4: fix fdatasync(2) after extent manipulation operations") fixed all the other extent manipulation operation paths such as hole punch, zero range, collapse range etc., but forgot the fallocate case. So similarly, fix it by recording the correct journal tid in ext4 inode in fallocate(2) path, so that ext4_sync_file() will wait for the right tid to be committed on fdatasync(2). This addresses the test failure in xfstests test generic/468. Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 07bca11749d406..c941251ac0c008 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4722,6 +4722,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); ret2 = ext4_journal_stop(handle); if (ret2) break; From ca0168e8a77cf833f8c9ac1d26a3a4012bab4f72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Dec 2017 09:32:25 -0500 Subject: [PATCH 05/61] alloc_super(): do ->s_umount initialization earlier ... so that failure exits could count on it having been done. Signed-off-by: Al Viro --- fs/super.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/super.c b/fs/super.c index d4e33e8f1e6fee..7ff1349609e487 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,6 +191,24 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; @@ -218,25 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; - - init_rwsem(&s->s_umount); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distrinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. - */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); From d7ee946942bdd12394809305e3df05aa4c8b7b8f Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Wed, 11 Oct 2017 07:01:31 +0200 Subject: [PATCH 06/61] VFS: Handle lazytime in do_mount() Since commit e462ec50cb5fa ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") the lazytime mount option doesn't get passed on anymore. Fix the issue by handling the option in do_mount(). Reviewed-by: Lukas Czerner Signed-off-by: Markus Trippelsdorf Signed-off-by: Al Viro --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/namespace.c b/fs/namespace.c index e158ec6b527b2d..9d1374ab6e06f2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2826,6 +2826,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_DIRSYNC | SB_SILENT | SB_POSIXACL | + SB_LAZYTIME | SB_I_VERSION); if (flags & MS_REMOUNT) From 6f6a23a213be51728502b88741ba6a10cda2441d Mon Sep 17 00:00:00 2001 From: Adam Wallis Date: Mon, 27 Nov 2017 10:45:01 -0500 Subject: [PATCH 07/61] dmaengine: dmatest: move callback wait queue to thread context Commit adfa543e7314 ("dmatest: don't use set_freezable_with_signal()") introduced a bug (that is in fact documented by the patch commit text) that leaves behind a dangling pointer. Since the done_wait structure is allocated on the stack, future invocations to the DMATEST can produce undesirable results (e.g., corrupted spinlocks). Commit a9df21e34b42 ("dmaengine: dmatest: warn user when dma test times out") attempted to WARN the user that the stack was likely corrupted but did not fix the actual issue. This patch fixes the issue by pushing the wait queue and callback structs into the the thread structure. If a failure occurs due to time, dmaengine_terminate_all will force the callback to safely call wake_up_all() without possibility of using a freed pointer. Cc: stable@vger.kernel.org Bug: https://bugzilla.kernel.org/show_bug.cgi?id=197605 Fixes: adfa543e7314 ("dmatest: don't use set_freezable_with_signal()") Reviewed-by: Sinan Kaya Suggested-by: Shunyong Yang Signed-off-by: Adam Wallis Signed-off-by: Vinod Koul --- drivers/dma/dmatest.c | 55 ++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 47edc7fbf91f52..ec5f9d2bc8202f 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -155,6 +155,12 @@ MODULE_PARM_DESC(run, "Run the test (default: false)"); #define PATTERN_COUNT_MASK 0x1f #define PATTERN_MEMSET_IDX 0x01 +/* poor man's completion - we want to use wait_event_freezable() on it */ +struct dmatest_done { + bool done; + wait_queue_head_t *wait; +}; + struct dmatest_thread { struct list_head node; struct dmatest_info *info; @@ -165,6 +171,8 @@ struct dmatest_thread { u8 **dsts; u8 **udsts; enum dma_transaction_type type; + wait_queue_head_t done_wait; + struct dmatest_done test_done; bool done; }; @@ -342,18 +350,25 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start, return error_count; } -/* poor man's completion - we want to use wait_event_freezable() on it */ -struct dmatest_done { - bool done; - wait_queue_head_t *wait; -}; static void dmatest_callback(void *arg) { struct dmatest_done *done = arg; - - done->done = true; - wake_up_all(done->wait); + struct dmatest_thread *thread = + container_of(arg, struct dmatest_thread, done_wait); + if (!thread->done) { + done->done = true; + wake_up_all(done->wait); + } else { + /* + * If thread->done, it means that this callback occurred + * after the parent thread has cleaned up. This can + * happen in the case that driver doesn't implement + * the terminate_all() functionality and a dma operation + * did not occur within the timeout period + */ + WARN(1, "dmatest: Kernel memory may be corrupted!!\n"); + } } static unsigned int min_odd(unsigned int x, unsigned int y) @@ -424,9 +439,8 @@ static unsigned long long dmatest_KBs(s64 runtime, unsigned long long len) */ static int dmatest_func(void *data) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(done_wait); struct dmatest_thread *thread = data; - struct dmatest_done done = { .wait = &done_wait }; + struct dmatest_done *done = &thread->test_done; struct dmatest_info *info; struct dmatest_params *params; struct dma_chan *chan; @@ -673,9 +687,9 @@ static int dmatest_func(void *data) continue; } - done.done = false; + done->done = false; tx->callback = dmatest_callback; - tx->callback_param = &done; + tx->callback_param = done; cookie = tx->tx_submit(tx); if (dma_submit_error(cookie)) { @@ -688,21 +702,12 @@ static int dmatest_func(void *data) } dma_async_issue_pending(chan); - wait_event_freezable_timeout(done_wait, done.done, + wait_event_freezable_timeout(thread->done_wait, done->done, msecs_to_jiffies(params->timeout)); status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); - if (!done.done) { - /* - * We're leaving the timed out dma operation with - * dangling pointer to done_wait. To make this - * correct, we'll need to allocate wait_done for - * each test iteration and perform "who's gonna - * free it this time?" dancing. For now, just - * leave it dangling. - */ - WARN(1, "dmatest: Kernel stack may be corrupted!!\n"); + if (!done->done) { dmaengine_unmap_put(um); result("test timed out", total_tests, src_off, dst_off, len, 0); @@ -789,7 +794,7 @@ static int dmatest_func(void *data) dmatest_KBs(runtime, total_len), ret); /* terminate all transfers on specified channels */ - if (ret) + if (ret || failed_tests) dmaengine_terminate_all(chan); thread->done = true; @@ -849,6 +854,8 @@ static int dmatest_add_threads(struct dmatest_info *info, thread->info = info; thread->chan = dtc->chan; thread->type = type; + thread->test_done.wait = &thread->done_wait; + init_waitqueue_head(&thread->done_wait); smp_wmb(); thread->task = kthread_create(dmatest_func, thread, "%s-%s%u", dma_chan_name(chan), op, i); From eb9436966fdc84cebdf222952a99898ab46d9bb0 Mon Sep 17 00:00:00 2001 From: Tobias Jordan Date: Wed, 6 Dec 2017 14:28:27 +0100 Subject: [PATCH 08/61] dmaengine: jz4740: disable/unprepare clk if probe fails in error path of jz4740_dma_probe(), call clk_disable_unprepare() to clean up. Found by Linux Driver Verification project (linuxtesting.org). Fixes: 25ce6c35fea0 MIPS: jz4740: Remove custom DMA API Signed-off-by: Tobias Jordan Signed-off-by: Vinod Koul --- drivers/dma/dma-jz4740.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dma-jz4740.c b/drivers/dma/dma-jz4740.c index d50273fed71509..afd5e10f8927cb 100644 --- a/drivers/dma/dma-jz4740.c +++ b/drivers/dma/dma-jz4740.c @@ -555,7 +555,7 @@ static int jz4740_dma_probe(struct platform_device *pdev) ret = dma_async_device_register(dd); if (ret) - return ret; + goto err_clk; irq = platform_get_irq(pdev, 0); ret = request_irq(irq, jz4740_dma_irq, 0, dev_name(&pdev->dev), dmadev); @@ -568,6 +568,8 @@ static int jz4740_dma_probe(struct platform_device *pdev) err_unregister: dma_async_device_unregister(dd); +err_clk: + clk_disable_unprepare(dmadev->clk); return ret; } From 996fc4477a0ea28226b30d175f053fb6f9a4fa36 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Dec 2017 23:44:11 -0500 Subject: [PATCH 09/61] ext4: add missing error check in __ext4_new_inode() It's possible for ext4_get_acl() to return an ERR_PTR. So we need to add a check for this case in __ext4_new_inode(). Otherwise on an error we can end up oops the kernel. This was getting triggered by xfstests generic/388, which is a test which exercises the shutdown code path. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b4267d72f24955..b32cf263750d1d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -816,6 +816,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return ERR_CAST(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); From 9d5afec6b8bd46d6ed821aa1579634437f58ef1f Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 11 Dec 2017 15:00:57 -0500 Subject: [PATCH 10/61] ext4: fix crash when a directory's i_size is too small On a ppc64 machine, when mounting a fuzzed ext2 image (generated by fsfuzzer) the following call trace is seen, VFS: brelse: Trying to free free buffer WARNING: CPU: 1 PID: 6913 at /root/repos/linux/fs/buffer.c:1165 .__brelse.part.6+0x24/0x40 .__brelse.part.6+0x20/0x40 (unreliable) .ext4_find_entry+0x384/0x4f0 .ext4_lookup+0x84/0x250 .lookup_slow+0xdc/0x230 .walk_component+0x268/0x400 .path_lookupat+0xec/0x2d0 .filename_lookup+0x9c/0x1d0 .vfs_statx+0x98/0x140 .SyS_newfstatat+0x48/0x80 system_call+0x58/0x6c This happens because the directory that ext4_find_entry() looks up has inode->i_size that is less than the block size of the filesystem. This causes 'nblocks' to have a value of zero. ext4_bread_batch() ends up not reading any of the directory file's blocks. This renders the entries in bh_use[] array to continue to have garbage data. buffer_uptodate() on bh_use[0] can then return a zero value upon which brelse() function is invoked. This commit fixes the bug by returning -ENOENT when the directory file has no associated blocks. Reported-by: Abdul Haleem Signed-off-by: Chandan Rajendra Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 798b3ac680db1b..e750d68fbcb50c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1399,6 +1399,10 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { + ret = NULL; + goto cleanup_and_exit; + } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; From 10a6a6975691775bbcc677a04c6fd3120b5c1160 Mon Sep 17 00:00:00 2001 From: Cyrille Pitchen Date: Tue, 12 Dec 2017 14:40:12 +0100 Subject: [PATCH 11/61] Revert "dt-bindings: mtd: add sst25wf040b and en25s64 to sip-nor list" This reverts commit b07815d4eaf658b683c345d6e643895a20d92f29. The reverted commit was merged into v4-15-rc1 by mistake: it was taken from the IMX tree but the patch has never been sent to linux-mtd nor reviewed by any spi-nor maintainers. Actually, it would have been rejected since we add new values for the 'compatible' DT property only for SPI NOR memories that don't support the JEDEC READ ID op code (0x9F). Both en25s64 and sst25wf040b support the JEDEC READ ID op code, hence should use the "jedec,spi-nor" string alone as 'compatible' value. See the following link for more details: http://lists.infradead.org/pipermail/linux-mtd/2017-November/077425.html Signed-off-by: Cyrille Pitchen Acked-by: Marek Vasut --- Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt index 376fa2f50e6bc9..956bb046e599d5 100644 --- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt +++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt @@ -13,7 +13,6 @@ Required properties: at25df321a at25df641 at26df081a - en25s64 mr25h128 mr25h256 mr25h10 @@ -33,7 +32,6 @@ Required properties: s25fl008k s25fl064k sst25vf040b - sst25wf040b m25p40 m25p80 m25p16 From 2610acf46b9ed528ec2cacd717bc9d354e452b73 Mon Sep 17 00:00:00 2001 From: Andreas Platschek Date: Thu, 14 Dec 2017 12:50:51 +0100 Subject: [PATCH 12/61] dmaengine: fsl-edma: disable clks on all error paths Previously enabled clks are only disabled if clk_prepare_enable() fails. However, there are other error paths were the previously enabled clocks are not disabled. To fix the problem, fsl_disable_clocks() now takes the number of clocks that shall be disabled + unprepared. For existing calls were all clocks were already successfully prepared + enabled, DMAMUX_NR is passed to disable + unprepare all clocks. In error paths were only some clocks were successfully prepared + enabled the loop counter is passed, in order to disable + unprepare all successfully prepared + enabled clocks. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Andreas Platschek Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c index 6775f2c74e25b7..c7568869284e17 100644 --- a/drivers/dma/fsl-edma.c +++ b/drivers/dma/fsl-edma.c @@ -863,11 +863,11 @@ static void fsl_edma_irq_exit( } } -static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma) +static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma, int nr_clocks) { int i; - for (i = 0; i < DMAMUX_NR; i++) + for (i = 0; i < nr_clocks; i++) clk_disable_unprepare(fsl_edma->muxclk[i]); } @@ -904,25 +904,25 @@ static int fsl_edma_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 1 + i); fsl_edma->muxbase[i] = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(fsl_edma->muxbase[i])) + if (IS_ERR(fsl_edma->muxbase[i])) { + /* on error: disable all previously enabled clks */ + fsl_disable_clocks(fsl_edma, i); return PTR_ERR(fsl_edma->muxbase[i]); + } sprintf(clkname, "dmamux%d", i); fsl_edma->muxclk[i] = devm_clk_get(&pdev->dev, clkname); if (IS_ERR(fsl_edma->muxclk[i])) { dev_err(&pdev->dev, "Missing DMAMUX block clock.\n"); + /* on error: disable all previously enabled clks */ + fsl_disable_clocks(fsl_edma, i); return PTR_ERR(fsl_edma->muxclk[i]); } ret = clk_prepare_enable(fsl_edma->muxclk[i]); - if (ret) { - /* disable only clks which were enabled on error */ - for (; i >= 0; i--) - clk_disable_unprepare(fsl_edma->muxclk[i]); - - dev_err(&pdev->dev, "DMAMUX clk block failed.\n"); - return ret; - } + if (ret) + /* on error: disable all previously enabled clks */ + fsl_disable_clocks(fsl_edma, i); } @@ -976,7 +976,7 @@ static int fsl_edma_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Can't register Freescale eDMA engine. (%d)\n", ret); - fsl_disable_clocks(fsl_edma); + fsl_disable_clocks(fsl_edma, DMAMUX_NR); return ret; } @@ -985,7 +985,7 @@ static int fsl_edma_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Can't register Freescale eDMA of_dma. (%d)\n", ret); dma_async_device_unregister(&fsl_edma->dma_dev); - fsl_disable_clocks(fsl_edma); + fsl_disable_clocks(fsl_edma, DMAMUX_NR); return ret; } @@ -1015,7 +1015,7 @@ static int fsl_edma_remove(struct platform_device *pdev) fsl_edma_cleanup_vchan(&fsl_edma->dma_dev); of_dma_controller_free(np); dma_async_device_unregister(&fsl_edma->dma_dev); - fsl_disable_clocks(fsl_edma); + fsl_disable_clocks(fsl_edma, DMAMUX_NR); return 0; } From cef31d9af908243421258f1df35a4a644604efbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Dec 2017 10:32:03 +0100 Subject: [PATCH 13/61] posix-timer: Properly check sigevent->sigev_notify timer_create() specifies via sigevent->sigev_notify the signal delivery for the new timer. The valid modes are SIGEV_NONE, SIGEV_SIGNAL, SIGEV_THREAD and (SIGEV_SIGNAL | SIGEV_THREAD_ID). The sanity check in good_sigevent() is only checking the valid combination for the SIGEV_THREAD_ID bit, i.e. SIGEV_SIGNAL, but if SIGEV_THREAD_ID is not set it accepts any random value. This has no real effects on the posix timer and signal delivery code, but it affects show_timer() which handles the output of /proc/$PID/timers. That function uses a string array to pretty print sigev_notify. The access to that array has no bound checks, so random sigev_notify cause access beyond the array bounds. Add proper checks for the valid notify modes and remove the SIGEV_THREAD_ID masking from various code pathes as SIGEV_NONE can never be set in combination with SIGEV_THREAD_ID. Reported-by: Eric Biggers Reported-by: Dmitry Vyukov Reported-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: stable@vger.kernel.org --- kernel/time/posix-timers.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b7f..ec999f32c84058 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; From fdf2e821052958a114618a95ab18a300d0b080cb Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 5 Dec 2017 11:51:40 +0100 Subject: [PATCH 14/61] mtd: nand: gpmi: Fix failure when a erased page has a bitflip at BBM When erased subpages are read then the BCH decoder returns STATUS_ERASED if they are all empty, or STATUS_UNCORRECTABLE if there are bitflips. When there are bitflips, we have to set these bits again to show the upper layers a completely erased page. When a bitflip happens in the exact byte where the bad block marker is, then this byte is swapped with another byte in block_mark_swapping(). The correction code then detects a bitflip in another subpage and no longer corrects the bitflip where it really happens. Correct this behaviour by calling block_mark_swapping() after the bitflips have been corrected. In our case UBIFS failed with this bug because it expects erased pages to be really empty: UBIFS error (pid 187): ubifs_scan: corrupt empty space at LEB 36:118735 UBIFS error (pid 187): ubifs_scanned_corruption: corruption at LEB 36:118735 UBIFS error (pid 187): ubifs_scanned_corruption: first 8192 bytes from LEB 36:118735 UBIFS error (pid 187): ubifs_scan: LEB 36 scanning failed UBIFS error (pid 187): do_commit: commit failed, error -117 Signed-off-by: Sascha Hauer Reviewed-by: Richard Weinberger Acked-by: Boris Brezillon Signed-off-by: Richard Weinberger --- drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c index 50f8d4a1b98323..d4d824ef64e9fb 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c @@ -1067,9 +1067,6 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip, return ret; } - /* handle the block mark swapping */ - block_mark_swapping(this, payload_virt, auxiliary_virt); - /* Loop over status bytes, accumulating ECC status. */ status = auxiliary_virt + nfc_geo->auxiliary_status_offset; @@ -1158,6 +1155,9 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip, max_bitflips = max_t(unsigned int, max_bitflips, *status); } + /* handle the block mark swapping */ + block_mark_swapping(this, buf, auxiliary_virt); + if (oob_required) { /* * It's time to deliver the OOB bytes. See gpmi_ecc_read_oob() From e44b9a9c135727f3410e029910275f40681dc8bc Mon Sep 17 00:00:00 2001 From: Albert Hsieh Date: Mon, 20 Nov 2017 11:26:26 +0800 Subject: [PATCH 15/61] mtd: nand: brcmnand: Zero bitflip is not an error A negative return value of brcmstb_nand_verify_erased_page() indicates a real bitflip error of an erased page, and other return values (>= 0) show the corrected bitflip number. Zero return value means no bitflip, but the current driver code treats it as an error, and eventually leads to falsely reported ECC error. Fixes: 02b88eea9f9c ("mtd: brcmnand: Add check for erased page bitflip") Signed-off-by: Albert Hsieh Acked-by: Boris Brezillon Signed-off-by: Richard Weinberger --- drivers/mtd/nand/brcmnand/brcmnand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c index e0eb51d8c01299..dd56a671ea4285 100644 --- a/drivers/mtd/nand/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/brcmnand/brcmnand.c @@ -1763,7 +1763,7 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip, err = brcmstb_nand_verify_erased_page(mtd, chip, buf, addr); /* erased page bitflips corrected */ - if (err > 0) + if (err >= 0) return err; } From bc2fd1b11097ad981478abcc0328784ea131ac29 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 6 Dec 2017 18:27:14 +0100 Subject: [PATCH 16/61] mtd: nand: gpio: Fix ALE gpio configuration Fixes a copy/paste error in commit f3d0d8d938b4d ("mtd: nand: gpio: Convert to use GPIO descriptors") which breaks gpio-nand driver Fixes: f3d0d8d938b4d ("mtd: nand: gpio: Convert to use GPIO descriptors") Cc: Linus Walleij Signed-off-by: Christophe Leroy Reviewed-by: Richard Weinberger Acked-by: Boris Brezillon Reviewed-by: Linus Walleij Signed-off-by: Richard Weinberger --- drivers/mtd/nand/gpio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c index 484f7fbc3f7d2d..a8bde6665c24f7 100644 --- a/drivers/mtd/nand/gpio.c +++ b/drivers/mtd/nand/gpio.c @@ -253,9 +253,9 @@ static int gpio_nand_probe(struct platform_device *pdev) goto out_ce; } - gpiomtd->nwp = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW); - if (IS_ERR(gpiomtd->nwp)) { - ret = PTR_ERR(gpiomtd->nwp); + gpiomtd->ale = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW); + if (IS_ERR(gpiomtd->ale)) { + ret = PTR_ERR(gpiomtd->ale); goto out_ce; } From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 Dec 2017 14:14:47 +0100 Subject: [PATCH 17/61] drivers/misc/intel/pti: Rename the header file to free up the namespace We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the namespace by renaming the driver header to . (Also standardize the header guard name while at it.) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: J Freyensee Cc: Greg Kroah-Hartman Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/misc/pti.c | 2 +- include/linux/{pti.h => intel-pti.h} | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) rename include/linux/{pti.h => intel-pti.h} (94%) diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c index eda38cbe85307e..41f2a9f6851d9e 100644 --- a/drivers/misc/pti.c +++ b/drivers/misc/pti.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/pti.h b/include/linux/intel-pti.h similarity index 94% rename from include/linux/pti.h rename to include/linux/intel-pti.h index b3ea01a3197efa..2710d72de3c926 100644 --- a/include/linux/pti.h +++ b/include/linux/intel-pti.h @@ -22,8 +22,8 @@ * interface to write out it's contents for debugging a mobile system. */ -#ifndef PTI_H_ -#define PTI_H_ +#ifndef LINUX_INTEL_PTI_H_ +#define LINUX_INTEL_PTI_H_ /* offset for last dword of any PTI message. Part of MIPI P1149.7 */ #define PTI_LASTDWORD_DTS 0x30 @@ -40,4 +40,4 @@ struct pti_masterchannel *pti_request_masterchannel(u8 type, const char *thread_name); void pti_release_masterchannel(struct pti_masterchannel *mc); -#endif /*PTI_H_*/ +#endif /* LINUX_INTEL_PTI_H_ */ From a8b4db562e7283a1520f9e9730297ecaab7622ea Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:51 -0800 Subject: [PATCH 18/61] x86/cpufeature: Add User-Mode Instruction Prevention definitions [ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file) 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions") ... for easier x86 PTI code testing and back-porting. ] User-Mode Instruction Prevention is a security feature present in new Intel processors that, when set, prevents the execution of a subset of instructions if such instructions are executed in user mode (CPL > 0). Attempting to execute such instructions causes a general protection exception. The subset of instructions comprises: * SGDT - Store Global Descriptor Table * SIDT - Store Interrupt Descriptor Table * SLDT - Store Local Descriptor Table * SMSW - Store Machine Status Word * STR - Store Task Register This feature is also added to the list of disabled-features to allow a cleaner handling of build-time configuration. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cdf5be8668631f..c0b0e9e8aa66eb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -296,6 +296,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ From f2dbad36c55e5d3a91dccbde6e8cae345fe5632f Mon Sep 17 00:00:00 2001 From: Rudolf Marek Date: Tue, 28 Nov 2017 22:01:06 +0100 Subject: [PATCH 19/61] x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD [ Note, this is a Git cherry-pick of the following commit: 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD") ... for easier x86 PTI code testing and back-porting. ] The latest AMD AMD64 Architecture Programmer's Manual adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]). If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES / FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers, thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs. Signed-Off-By: Rudolf Marek Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/amd.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66eb..800104c8a3edfe 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -266,6 +266,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd4438..bcb75dc97d4407 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x17: init_amd_zn(c); break; } - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); From 2fe1bc1f501d55e5925b4035bcd85781adc76c63 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 14:46:30 -0700 Subject: [PATCH 20/61] perf/x86: Enable free running PEBS for REGS_USER/INTR [ Note, this is a Git cherry-pick of the following commit: a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") ... for easier x86 PTI code testing and back-porting. ] Currently free running PEBS is disabled when user or interrupt registers are requested. Most of the registers are actually available in the PEBS record and can be supported. So we just need to check for the supported registers and then allow it: it is all except for the segment register. For user registers this only works when the counter is limited to ring 3 only, so this also needs to be checked. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 4 ++++ arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9fb9a1f1e47bd0..43445da30ceab1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; + if (!event->attr.exclude_kernel) + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_user & ~PEBS_REGS) + flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4196f81ec0e1b0..f7aaadf9331fb7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -85,13 +85,15 @@ struct amd_nb { * Flags PEBS can handle without an PMI. * * TID can only be handled by flushing at context switch. + * REGS_USER can be handled for events limited to ring 3. * */ #define PEBS_FREERUNNING_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) /* * A debug store configuration. @@ -110,6 +112,26 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +#define PEBS_REGS \ + (PERF_REG_X86_AX | \ + PERF_REG_X86_BX | \ + PERF_REG_X86_CX | \ + PERF_REG_X86_DX | \ + PERF_REG_X86_DI | \ + PERF_REG_X86_SI | \ + PERF_REG_X86_SP | \ + PERF_REG_X86_BP | \ + PERF_REG_X86_IP | \ + PERF_REG_X86_FLAGS | \ + PERF_REG_X86_R8 | \ + PERF_REG_X86_R9 | \ + PERF_REG_X86_R10 | \ + PERF_REG_X86_R11 | \ + PERF_REG_X86_R12 | \ + PERF_REG_X86_R13 | \ + PERF_REG_X86_R14 | \ + PERF_REG_X86_R15) + /* * Per register state. */ From ab95477e7cb35557ecfc837687007b646bab9a9f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:31 +0100 Subject: [PATCH 21/61] bpf: fix build issues on um due to mising bpf_perf_event.h [ Note, this is a Git cherry-pick of the following commit: a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h") ... for easier x86 PTI code testing and back-porting. ] Since c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build on i386 or x86_64: [...] CC init/main.o In file included from ../include/linux/perf_event.h:18:0, from ../include/linux/trace_events.h:10, from ../include/trace/syscall.h:7, from ../include/linux/syscalls.h:82, from ../init/main.c:20: ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: asm/bpf_perf_event.h: No such file or directory #include [...] Lets add missing bpf_perf_event.h also to um arch. This seems to be the only one still missing. Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") Reported-by: Randy Dunlap Suggested-by: Richard Weinberger Signed-off-by: Daniel Borkmann Tested-by: Randy Dunlap Cc: Hendrik Brueckner Cc: Richard Weinberger Acked-by: Alexei Starovoitov Acked-by: Richard Weinberger Signed-off-by: Alexei Starovoitov Signed-off-by: Ingo Molnar --- arch/um/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 50a32c33d729ba..73c57f614c9e06 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += bpf_perf_event.h generic-y += bug.h generic-y += clkdev.h generic-y += current.h From c2bc66082e1048c7573d72e62f597bdc5ce13fea Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:47 +0100 Subject: [PATCH 22/61] locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] In preparation for the removal of lockless_dereference(), which is the same as READ_ONCE() on all architectures other than Alpha, add an implicit smp_read_barrier_depends() to READ_ONCE() so that it can be used to head dependency chains on all architectures. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 202710420d6deb..712cd8bb00b4b9 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -341,6 +341,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __read_once_size(&(x), __u.__c, sizeof(x)); \ else \ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) #define READ_ONCE(x) __READ_ONCE(x, 1) From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: [PATCH 23/61] locking/barriers: Convert users of lockless_dereference() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/mmu_context.h | 4 ++-- arch/x86/kernel/ldt.c | 2 +- drivers/md/dm-mpath.c | 20 ++++++++++---------- fs/dcache.c | 4 ++-- fs/overlayfs/ovl_entry.h | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- mm/slab.h | 2 +- 13 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 80534d3c248001..589af1eec7c1cb 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment) struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); + ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6699fc44164419..6d16d15d09a0da 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -73,8 +73,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - /* lockless_dereference synchronizes with smp_store_release */ - ldt = lockless_dereference(mm->context.ldt); + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ae5615b03def7b..1c1eae9613406b 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -103,7 +103,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) static void install_ldt(struct mm_struct *current_mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(¤t_mm->context.ldt, ldt); /* Activate the LDT for all CPUs using current_mm. */ diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 11f273d2f018e7..3f88c9d32f7ecc 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/fs/dcache.c b/fs/dcache.c index f90141387f01ea..34c852af215c0f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c { /* * Be careful about RCU walk racing with rename: - * use 'lockless_dereference' to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - const unsigned char *cs = lockless_dereference(dentry->d_name.name); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 25d9b5adcd4290..36b49bd09264a5 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { - return lockless_dereference(oi->__upperdentry); + return READ_ONCE(oi->__upperdentry); } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 698b74dd750ee6..c310e3ff7f3f7d 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile = lockless_dereference(od->upperfile); + realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c2cdd45a880aa0..127f534fec94aa 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -275,7 +275,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -368,7 +368,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af954..a6ddc42f87a578 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d12..6eee4ed97af0b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4233,7 +4233,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4330,7 +4330,7 @@ int perf_event_release_kernel(struct perf_event *event) * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 418a1c045933dc..5f0dfb2abb8d39 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5718b3ea202a3f..0fef395662a6ea 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) diff --git a/mm/slab.h b/mm/slab.h index 028cdc7df67ec9..86d7c7d860f92c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -259,7 +259,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) * memcg_caches issues a write barrier to match this (see * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(arr->entries[idx]); + cachep = READ_ONCE(arr->entries[idx]); rcu_read_unlock(); return cachep; From 2aeb07365bcd489620f71390a7d2031cd4dfb83e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:36:35 -0800 Subject: [PATCH 24/61] x86/mm/kasan: Don't use vmemmap_populate() to initialize shadow [ Note, this is a Git cherry-pick of the following commit: d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow") ... for easier x86 PTI code testing and back-porting. ] The KASAN shadow is currently mapped using vmemmap_populate() since that provides a semi-convenient way to map pages into init_top_pgt. However, since that no longer zeroes the mapped pages, it is not suitable for KASAN, which requires zeroed shadow memory. Add kasan_populate_shadow() interface and use it instead of vmemmap_populate(). Besides, this allows us to take advantage of gigantic pages and use them to populate the shadow, which should save us some memory wasted on page tables and reduce TLB pressure. Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com Signed-off-by: Andrey Ryabinin Signed-off-by: Pavel Tatashin Cc: Andy Lutomirski Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Michal Hocko Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/mm/kasan_init_64.c | 143 ++++++++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ae940a0ed3bc6..665eba1b610393 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,7 +108,7 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 2b60dc6e64b1da..99dfed6dfef8b2 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static int __init map_range(struct range *range) +static __init void *early_alloc(size_t size, int nid) +{ + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) { unsigned long start; unsigned long end; @@ -26,7 +155,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - return vmemmap_populate(start, end, NUMA_NO_NODE); + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); } static void __init clear_pgds(unsigned long start, @@ -189,16 +318,16 @@ void __init kasan_init(void) if (pfn_mapped[i].end == 0) break; - if (map_range(&pfn_mapped[i])) - panic("kasan: unable to allocate shadow!"); + map_range(&pfn_mapped[i]); } + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), kasan_mem_to_shadow((void *)__START_KERNEL_map)); - vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), - (unsigned long)kasan_mem_to_shadow(_end), - NUMA_NO_NODE); + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); From e17f8234538d1ff708673f287a42457c4dee720d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 4 Dec 2017 15:07:07 +0100 Subject: [PATCH 25/61] x86/entry/64/paravirt: Use paravirt-safe macro to access eflags Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that acceses eflags using 'pushfq' instruction when testing for IF bit. On PV Xen guests looking at IF flag directly will always see it set, resulting in 'ud2'. Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when running paravirt. Signed-off-by: Boris Ostrovsky Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: xen-devel@lists.xenproject.org Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 7 ++++--- arch/x86/include/asm/irqflags.h | 3 +++ arch/x86/include/asm/paravirt.h | 9 +++++++++ arch/x86/kernel/asm-offsets_64.c | 3 +++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a2b30ec6949727..32306788821c82 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -462,12 +462,13 @@ END(irq_entries_start) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY - pushfq - testl $X86_EFLAGS_IF, (%rsp) + pushq %rax + SAVE_FLAGS(CLBR_RAX) + testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 .Lokay_\@: - addq $8, %rsp + popq %rax #endif .endm diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c8ef23f2c28f17..89f08955fff733 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void) swapgs; \ sysretl +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(x) pushfq; popq %rax +#endif #else #define INTERRUPT_RETURN iret #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 283efcaac8aff8..892df375b6155a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -927,6 +927,15 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ + PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ + PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +#endif + #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 630212fa9b9da3..e3a5175a444bc0 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,6 +23,9 @@ int main(void) #ifdef CONFIG_PARAVIRT OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif BLANK(); #endif From d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:08 +0100 Subject: [PATCH 26/61] x86/unwinder/orc: Dont bail on stack overflow If the stack overflows into a guard page and the ORC unwinder should work well: by construction, there can't be any meaningful data in the guard page because no writes to the guard page will have succeeded. But there is a bug that prevents unwinding from working correctly: if the starting register state has RSP pointing into a stack guard page, the ORC unwinder bails out immediately. Instead of bailing out immediately check whether the next page up is a valid check page and if so analyze that. As a result the ORC unwinder will start the unwind. Tested by intentionally overflowing the task stack. The result is an accurate call trace instead of a trace consisting purely of '?' entries. There are a few other bugs that are triggered if the unwinder encounters a stack overflow after the first step, but they are outside the scope of this fix. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_orc.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a3f973b2c97a03..ff8e1132b2ae41 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -553,8 +553,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } if (get_stack_info((unsigned long *)state->sp, state->task, - &state->stack_info, &state->stack_mask)) - return; + &state->stack_info, &state->stack_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + if (get_stack_info(next_page, state->task, &state->stack_info, + &state->stack_mask)) + return; + } /* * The caller can provide the address of the first frame directly From b02fcf9ba1211097754b286043cd87a8b4907e75 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 4 Dec 2017 15:07:09 +0100 Subject: [PATCH 27/61] x86/unwinder: Handle stack overflows more gracefully There are at least two unwinder bugs hindering the debugging of stack-overflow crashes: - It doesn't deal gracefully with the case where the stack overflows and the stack pointer itself isn't on a valid stack but the to-be-dereferenced data *is*. - The ORC oops dump code doesn't know how to print partial pt_regs, for the case where if we get an interrupt/exception in *early* entry code before the full pt_regs have been saved. Fix both issues. http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 1 + arch/x86/include/asm/unwind.h | 7 ++++ arch/x86/kernel/dumpstack.c | 32 ++++++++++++--- arch/x86/kernel/process_64.c | 11 +++--- arch/x86/kernel/unwind_orc.c | 74 ++++++++++++----------------------- 5 files changed, 65 insertions(+), 60 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index f86a8caa561e88..395c9631e000a3 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); +extern void show_iret_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9cc6fe1fc6f95..c1688c2d0a128f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -7,6 +7,9 @@ #include #include +#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) +#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) + struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; @@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) +/* + * WARNING: The entire pt_regs may not be safe to dereference. In some cases, + * only the iret frame registers are accessible. Use with caution! + */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de4b..0bc95be5c6385c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -50,6 +50,28 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +void show_iret_regs(struct pt_regs *regs) +{ + printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +{ + if (on_stack(info, regs, sizeof(*regs))) + __show_regs(regs, 0); + else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs); + } +} + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { @@ -94,8 +116,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); /* * Scan the stack, printing any text addresses we find. At the @@ -119,7 +141,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by __show_regs() below. + * by show_regs_safe() below. */ if (regs && stack == ®s->ip) goto next; @@ -155,8 +177,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); } if (stack_name) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eeeb34f85c250e..01b119bebb6801 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + show_iret_regs(regs); + if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else @@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); + if (!all) + return; + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - if (!all) - return; - cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index ff8e1132b2ae41..be86a865087a6b 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } -static bool stack_access_ok(struct unwind_state *state, unsigned long addr, +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, size_t len) { struct stack_info *info = &state->stack_info; + void *addr = (void *)_addr; - /* - * If the address isn't on the current stack, switch to the next one. - * - * We may have to traverse multiple stacks to deal with the possibility - * that info->next_sp could point to an empty stack and the address - * could be on a subsequent stack. - */ - while (!on_stack(info, (void *)addr, len)) - if (get_stack_info(info->next_sp, state->task, info, - &state->stack_mask)) - return false; + if (!on_stack(info, addr, len) && + (get_stack_info(addr, state->task, info, &state->stack_mask))) + return false; return true; } @@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, return true; } -#define REGS_SIZE (sizeof(struct pt_regs)) -#define SP_OFFSET (offsetof(struct pt_regs, sp)) -#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) - static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, - unsigned long *ip, unsigned long *sp, bool full) + unsigned long *ip, unsigned long *sp) { - size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; - size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; - struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); - - if (IS_ENABLED(CONFIG_X86_64)) { - if (!stack_access_ok(state, addr, regs_size)) - return false; - - *ip = regs->ip; - *sp = regs->sp; + struct pt_regs *regs = (struct pt_regs *)addr; - return true; - } + /* x86-32 support will be more complicated due to the ®s->sp hack */ + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); - if (!stack_access_ok(state, addr, sp_offset)) + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; *ip = regs->ip; + *sp = regs->sp; + return true; +} - if (user_mode(regs)) { - if (!stack_access_ok(state, addr + sp_offset, - REGS_SIZE - SP_OFFSET)) - return false; +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; - *sp = regs->sp; - } else - *sp = (unsigned long)®s->sp; + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + *ip = regs->ip; + *sp = regs->sp; return true; } @@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; - struct pt_regs *ptregs; bool indirect = false; if (unwind_done(state)) @@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; @@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS_IRET: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } - ptregs = container_of((void *)sp, struct pt_regs, ip); - if ((unsigned long)ptregs >= prev_sp && - on_stack(&state->stack_info, ptregs, REGS_SIZE)) { - state->regs = ptregs; - state->full_regs = false; - } else - state->regs = NULL; - + state->regs = (void *)sp - IRET_FRAME_OFFSET; + state->full_regs = false; state->signal = true; break; From 6669a692605547892a026445e460bf233958bd7f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:10 +0100 Subject: [PATCH 28/61] x86/irq: Remove an old outdated comment about context tracking races That race has been fixed and code cleaned up for a while now. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 52089c043160b5..aa9d51eea9d0e8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - /* - * NB: Unlike exception entries, IRQ entries do not reliably - * handle context tracking in the low-level entry code. This is - * because syscall entries execute briefly with IRQs on before - * updating context tracking state, so we can take an IRQ from - * kernel mode with CONTEXT_USER. The low-level entry code only - * updates the context if we came from user mode, so we won't - * switch to CONTEXT_KERNEL. We'll fix that once the syscall - * code is cleaned up enough that we can cleanly defer enabling - * IRQs. - */ - entering_irq(); /* entering_irq() tells RCU that we're not quiescent. Check it. */ From 4f3789e792296e21405f708cf3cb409d7c7d5683 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:11 +0100 Subject: [PATCH 29/61] x86/irq/64: Print the offending IP in the stack overflow warning In case something goes wrong with unwind (not unlikely in case of overflow), print the offending IP where we detected the overflow. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b35..d86e344f5b3deb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom); + estack_top, estack_bottom, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); From 1a79797b58cddfa948420a7553241c79c013e3ca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:12 +0100 Subject: [PATCH 30/61] x86/entry/64: Allocate and enable the SYSENTER stack This will simplify future changes that want scratch variables early in the SYSENTER handler -- they'll be able to spill registers to the stack. It also lets us get rid of a SWAPGS_UNSAFE_STACK user. This does not depend on CONFIG_IA32_EMULATION=y because we'll want the stack space even without IA32 emulation. As far as I can tell, the reason that this wasn't done from day 1 is that we use IST for #DB and #BP, which is IMO rather nasty and causes a lot more problems than it solves. But, since #DB uses IST, we don't actually need a real stack for SYSENTER (because SYSENTER with TF set will invoke #DB on the IST stack rather than the SYSENTER stack). I want to remove IST usage from these vectors some day, and this patch is a prerequisite for that as well. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/include/asm/processor.h | 3 --- arch/x86/kernel/asm-offsets.c | 5 +++++ arch/x86/kernel/asm-offsets_32.c | 5 ----- arch/x86/kernel/cpu/common.c | 4 +++- arch/x86/kernel/process.c | 2 -- arch/x86/kernel/traps.c | 3 +-- 7 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 568e130d932cd2..dcc6987f9bae16 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -48,7 +48,7 @@ */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + SWAPGS movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2db7cf720b04b2..789dad5da20f47 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -339,14 +339,11 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; -#ifdef CONFIG_X86_32 /* * Space for the temporary SYSENTER stack. */ unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; -#endif - } ____cacheline_aligned; DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8ea78275480daf..b275863128ebf0 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -93,4 +93,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dedf428b20b68b..52ce4ea16e5386 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -50,11 +50,6 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); - #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); OFFSET(stack_canary_offset, stack_canary, canary); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cdf79ab628c244..22f5421701983e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1361,7 +1361,9 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)this_cpu_ptr(&cpu_tss) + + offsetofend(struct tss_struct, SYSENTER_stack)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 97fb3e5737f5d0..35d674157fdae6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -71,9 +71,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif -#ifdef CONFIG_X86_32 .SYSENTER_stack_canary = STACK_END_MAGIC, -#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d366adfc61dabe..d3e3bbd5d3a063 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -794,14 +794,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: -#if defined(CONFIG_X86_32) /* * This is the most likely code path that involves non-trivial use * of the SYSENTER stack. Check that we haven't overrun it. */ WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, "Overran or corrupted SYSENTER stack\n"); -#endif + ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); From 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:13 +0100 Subject: [PATCH 31/61] x86/dumpstack: Add get_stack_info() support for the SYSENTER stack get_stack_info() doesn't currently know about the SYSENTER stack, so unwinding will fail if we entered the kernel on the SYSENTER stack and haven't fully switched off. Teach get_stack_info() about the SYSENTER stack. With future patches applied that run part of the entry code on the SYSENTER stack and introduce an intentional BUG(), I would get: PANIC: double fault, error_code: 0x0 ... RIP: 0010:do_error_trap+0x33/0x1c0 ... Call Trace: Code: ... With this patch, I get: PANIC: double fault, error_code: 0x0 ... Call Trace: ? async_page_fault+0x36/0x60 ? invalid_op+0x22/0x40 ? async_page_fault+0x36/0x60 ? sync_regs+0x3c/0x40 ? sync_regs+0x2e/0x40 ? error_entry+0x6c/0xd0 ? async_page_fault+0x36/0x60 Code: ... which is a lot more informative. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 3 +++ arch/x86/kernel/dumpstack.c | 19 +++++++++++++++++++ arch/x86/kernel/dumpstack_32.c | 6 ++++++ arch/x86/kernel/dumpstack_64.c | 6 ++++++ 4 files changed, 34 insertions(+) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 8da111b3c342bb..f8062bfd43a072 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,6 +16,7 @@ enum stack_type { STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, + STACK_TYPE_SYSENTER, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; @@ -28,6 +29,8 @@ struct stack_info { bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); +bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0bc95be5c6385c..a33a1373a2522f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } +bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss); + + /* Treat the canary as part of the stack for unwinding purposes. */ + void *begin = &tss->SYSENTER_stack_canary; + void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_SYSENTER; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa86..5ff13a6b368069 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; + if (type == STACK_TYPE_SYSENTER) + return "SYSENTER"; + return NULL; } @@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; + if (in_sysenter_stack(stack, info)) + goto recursion_check; + if (in_hardirq_stack(stack, info)) goto recursion_check; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb11030..abc828f8c29785 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_SYSENTER) + return "SYSENTER"; + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; + if (in_sysenter_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: From aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:14 +0100 Subject: [PATCH 32/61] x86/entry/gdt: Put per-CPU GDT remaps in ascending order We currently have CPU 0's GDT at the top of the GDT range and higher-numbered CPUs at lower addresses. This happens because the fixmap is upside down (index 0 is the top of the fixmap). Flip it so that GDTs are in ascending order by virtual address. This will simplify a future patch that will generalize the GDT remap to contain multiple pages. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 0a3e808b91230d..01fd944fd72118 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -63,7 +63,7 @@ static inline struct desc_struct *get_current_gdt_rw(void) /* Get the fixmap index for a specific processor */ static inline unsigned int get_cpu_gdt_ro_index(int cpu) { - return FIX_GDT_REMAP_BEGIN + cpu; + return FIX_GDT_REMAP_END - cpu; } /* Provide the fixmap address of the remapped GDT */ From ef8813ab280507972bb57e4b1b502811ad4411e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:15 +0100 Subject: [PATCH 33/61] x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area Currently, the GDT is an ad-hoc array of pages, one per CPU, in the fixmap. Generalize it to be an array of a new 'struct cpu_entry_area' so that we can cleanly add new things to it. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 9 +-------- arch/x86/include/asm/fixmap.h | 37 +++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 14 ++++++------- arch/x86/xen/mmu_pv.c | 2 +- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 01fd944fd72118..f6f428432a68d3 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void) return this_cpu_ptr(&gdt_page)->gdt; } -/* Get the fixmap index for a specific processor */ -static inline unsigned int get_cpu_gdt_ro_index(int cpu) -{ - return FIX_GDT_REMAP_END - cpu; -} - /* Provide the fixmap address of the remapped GDT */ static inline struct desc_struct *get_cpu_gdt_ro(int cpu) { - unsigned int idx = get_cpu_gdt_ro_index(cpu); - return (struct desc_struct *)__fix_to_virt(idx); + return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; } /* Provide the current read-only GDT */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b0c505fe9a958c..b61f0242f9d057 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP; PAGE_SIZE) #endif +/* + * cpu_entry_area is a percpu region in the fixmap that contains things + * needed by the CPU and early entry/exit code. Real types aren't used + * for all fields here to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { + char gdt[PAGE_SIZE]; +}; + +#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) /* * Here we define all the compile-time 'special' virtual @@ -101,8 +114,8 @@ enum fixed_addresses { FIX_LNW_VRTC, #endif /* Fixmap entries to remap the GDTs, one per processor. */ - FIX_GDT_REMAP_BEGIN, - FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + FIX_CPU_ENTRY_AREA_TOP, + FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ @@ -191,5 +204,25 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, void __early_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags); +static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) +{ + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; +} + +#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ + BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ + __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ + }) + +#define get_cpu_entry_area_index(cpu, field) \ + __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) + +static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22f5421701983e..2cb394dc41537f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,12 +466,12 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) +/* Setup the fixmap mappings only once per-processor */ +static inline void setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t prot = PAGE_KERNEL_RO; + pgprot_t gdt_prot = PAGE_KERNEL_RO; #else /* * On native 32-bit systems, the GDT cannot be read-only because @@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int cpu) * On Xen PV, the GDT must be read-only because the hypervisor requires * it. */ - pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? PAGE_KERNEL_RO : PAGE_KERNEL; #endif - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); } /* Load the original GDT from the per-cpu structure */ @@ -1589,7 +1589,7 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_fixmap_gdt(cpu); + setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } @@ -1651,7 +1651,7 @@ void cpu_init(void) fpu__init_cpu(); - setup_fixmap_gdt(cpu); + setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2ccdaba31a0778..c2454237fa672d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: - case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: + case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: /* All local page mappings */ pte = pfn_pte(phys, prot); break; From 21506525fb8ddb0342f2a2370812d47f6a1f3833 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:16 +0100 Subject: [PATCH 34/61] x86/kasan/64: Teach KASAN about the cpu_entry_area The cpu_entry_area will contain stacks. Make sure that KASAN has appropriate shadow mappings for them. Signed-off-by: Andy Lutomirski Signed-off-by: Andrey Ryabinin Signed-off-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: kasan-dev@googlegroups.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8b2..9ec70d780f1f41 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -277,6 +277,7 @@ void __init kasan_early_init(void) void __init kasan_init(void) { int i; + void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; #ifdef CONFIG_KASAN_INLINE register_die_notifier(&kasan_die_notifier); @@ -329,8 +330,23 @@ void __init kasan_init(void) (unsigned long)kasan_mem_to_shadow(_end), early_pfn_to_nid(__pa(_stext))); + shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + + shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - (void *)KASAN_SHADOW_END); + shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + + kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); load_cr3(init_top_pgt); __flush_tlb_all(); From 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:17 +0100 Subject: [PATCH 35/61] x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss A future patch will move SYSENTER_stack to the beginning of cpu_tss to help detect overflow. Before this can happen, fix several code paths that hardcode assumptions about the old layout. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/processor.h | 9 ++++++-- arch/x86/kernel/cpu/common.c | 8 +++---- arch/x86/kernel/doublefault.c | 36 +++++++++++++++----------------- arch/x86/kvm/vmx.c | 2 +- arch/x86/power/cpu.c | 13 ++++++------ 6 files changed, 37 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index f6f428432a68d3..2ace1f90d13830 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -178,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, #endif } -static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) { struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 789dad5da20f47..555c9478f3df5a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -162,7 +162,7 @@ enum cpuid_regs_idx { extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern struct tss_struct doublefault_tss; +extern struct x86_hw_tss doublefault_tss; extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS]; @@ -252,6 +252,11 @@ static inline void load_cr3(pgd_t *pgdir) write_cr3(__sme_pa(pgdir)); } +/* + * Note that while the legacy 'TSS' name comes from 'Task State Segment', + * on modern x86 CPUs the TSS also holds information important to 64-bit mode, + * unrelated to the task-switch mechanism: + */ #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. */ struct x86_hw_tss { @@ -322,7 +327,7 @@ struct x86_hw_tss { #define IO_BITMAP_BITS 65536 #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 struct tss_struct { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2cb394dc41537f..3f285b973f500f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1557,7 +1557,7 @@ void cpu_init(void) } } - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; /* * <= is required because the CPU will access up to @@ -1576,7 +1576,7 @@ void cpu_init(void) * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &t->x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1634,12 +1634,12 @@ void cpu_init(void) * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &t->x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae902f..0b8cedb20d6d92 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -50,25 +50,23 @@ static void doublefault_fn(void) cpu_relax(); } -struct tss_struct doublefault_tss __cacheline_aligned = { - .x86_tss = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, - - .__cr3 = __pa_nodebug(swapper_pg_dir), - } +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa_nodebug(swapper_pg_dir), }; /* dummy for do_double_fault() call */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6f4f095f8f4eb..2abe0073b573ea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, - (unsigned long)this_cpu_ptr(&cpu_tss)); + (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 84fcfde53f8f3f..50593e13828109 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -165,12 +165,13 @@ static void fix_processor_context(void) struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be - * necessary. But... This is necessary, because - * 386 hardware has concept of busy TSS or some - * similar stupidity. - */ + + /* + * This just modifies memory; should not be necessary. But... This is + * necessary, because 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + set_tss_desc(cpu, &t->x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); From 6e60e583426c2f8751c22c2dfe5c207083b4483a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:18 +0100 Subject: [PATCH 36/61] x86/dumpstack: Handle stack overflow on all stacks We currently special-case stack overflow on the task stack. We're going to start putting special stacks in the fixmap with a custom layout, so they'll have guard pages, too. Teach the unwinder to be able to unwind an overflow of any of the stacks. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index a33a1373a2522f..64f8ed2a4827d3 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) + * - SYSENTER stack * - * x86-32 can have up to three stacks: + * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack + * - SYSENTER stack */ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; - /* - * If we overflowed the task stack into a guard page, jump back - * to the bottom of the usable stack. - */ - if (task_stack_page(task) - (void *)stack < PAGE_SIZE) - stack = task_stack_page(task); - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) - break; + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } stack_name = stack_type_name(stack_info.type); if (stack_name) From 1a935bc3d4ea61556461a9e92a68ca3556232efd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:19 +0100 Subject: [PATCH 37/61] x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct SYSENTER_stack should have reliable overflow detection, which means that it needs to be at the bottom of a page, not the top. Move it to the beginning of struct tss_struct and page-align it. Also add an assertion to make sure that the fixed hardware TSS doesn't cross a page boundary. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 21 ++++++++++++--------- arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 555c9478f3df5a..759051251664c6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -332,7 +332,16 @@ struct x86_hw_tss { struct tss_struct { /* - * The hardware state: + * Space for the temporary SYSENTER stack, used for SYSENTER + * and the entry trampoline as well. + */ + unsigned long SYSENTER_stack_canary; + unsigned long SYSENTER_stack[64]; + + /* + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering + * errata. */ struct x86_hw_tss x86_tss; @@ -343,15 +352,9 @@ struct tss_struct { * be within the limit. */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +} __aligned(PAGE_SIZE); - /* - * Space for the temporary SYSENTER stack. - */ - unsigned long SYSENTER_stack_canary; - unsigned long SYSENTER_stack[64]; -} ____cacheline_aligned; - -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); /* * sizeof(unsigned long) coming from an extra "long" at the end diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f285b973f500f..60b2dfd2a58b0d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(int cpu) #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + } /* Load the original GDT from the per-cpu structure */ From 72f5e08dbba2d01aa90b592cf76c378ea233b00b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:20 +0100 Subject: [PATCH 38/61] x86/entry: Remap the TSS into the CPU entry area This has a secondary purpose: it puts the entry stack into a region with a well-controlled layout. A subsequent patch will take advantage of this to streamline the SYSCALL entry code to be able to find it more easily. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 6 +++-- arch/x86/include/asm/fixmap.h | 7 ++++++ arch/x86/kernel/asm-offsets.c | 3 +++ arch/x86/kernel/cpu/common.c | 41 ++++++++++++++++++++++++++++++----- arch/x86/kernel/dumpstack.c | 3 ++- arch/x86/kvm/vmx.c | 2 +- arch/x86/power/cpu.c | 11 +++++----- 7 files changed, 58 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4838037f97f6ed..0ab316c4680673 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -941,7 +941,8 @@ ENTRY(debug) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -984,7 +985,8 @@ ENTRY(nmi) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b61f0242f9d057..84558b611ad33d 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP; */ struct cpu_entry_area { char gdt[PAGE_SIZE]; + + /* + * The GDT is just below cpu_tss and thus serves (on x86_64) as a + * a read-only guard page for the SYSENTER stack at the bottom + * of the TSS region. + */ + struct tss_struct tss; }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index b275863128ebf0..55858b277cf620 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -98,4 +98,7 @@ void common(void) { OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 60b2dfd2a58b0d..e5837bd6c67251 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,6 +466,22 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, + int pages, pgprot_t prot) +{ + int i; + + for (i = 0; i < pages; i++) { + __set_fixmap(fixmap_index - i, + per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); + } +} + +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +#endif + /* Setup the fixmap mappings only once per-processor */ static inline void setup_cpu_entry_area(int cpu) { @@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(int cpu) */ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), + &per_cpu(cpu_tss, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, + PAGE_KERNEL); +#ifdef CONFIG_X86_32 + this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); +#endif } /* Load the original GDT from the per-cpu structure */ @@ -1257,7 +1281,8 @@ void enable_sep_cpu(void) wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), + (unsigned long)&get_cpu_entry_area(cpu)->tss + + offsetofend(struct tss_struct, SYSENTER_stack), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); @@ -1370,6 +1395,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { + int cpu = smp_processor_id(); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); @@ -1383,7 +1410,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)this_cpu_ptr(&cpu_tss) + + (unsigned long)&get_cpu_entry_area(cpu)->tss + offsetofend(struct tss_struct, SYSENTER_stack)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else @@ -1593,11 +1620,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); + setup_cpu_entry_area(cpu); + /* * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, &t->x86_tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1610,7 +1639,6 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } @@ -1651,11 +1679,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); + setup_cpu_entry_area(cpu); + /* * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, &t->x86_tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1672,7 +1702,6 @@ void cpu_init(void) fpu__init_cpu(); - setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 64f8ed2a4827d3..60267850125e68 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss); + int cpu = smp_processor_id(); + struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; /* Treat the canary as part of the stack for unwinding purposes. */ void *begin = &tss->SYSENTER_stack_canary; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2abe0073b573ea..62ee4362e1c109 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, - (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); + (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 50593e13828109..04d5157fe7f8f3 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -160,18 +160,19 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif /* - * This just modifies memory; should not be necessary. But... This is - * necessary, because 386 hardware has concept of busy TSS or some - * similar stupidity. + * We need to reload TR, which requires that we change the + * GDT entry to indicate "available" first. + * + * XXX: This could probably all be replaced by a call to + * force_reload_TR(). */ - set_tss_desc(cpu, &t->x86_tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); From 9aaefe7b59ae00605256a7d6bd1c1456432495fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:21 +0100 Subject: [PATCH 39/61] x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0 On 64-bit kernels, we used to assume that TSS.sp0 was the current top of stack. With the addition of an entry trampoline, this will no longer be the case. Store the current top of stack in TSS.sp1, which is otherwise unused but shares the same cacheline. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 18 +++++++++++++----- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/asm-offsets_64.c | 1 + arch/x86/kernel/process.c | 10 ++++++++++ arch/x86/kernel/process_64.c | 1 + 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 759051251664c6..b0cf0612a454d6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -309,7 +309,13 @@ struct x86_hw_tss { struct x86_hw_tss { u32 reserved1; u64 sp0; + + /* + * We store cpu_current_top_of_stack in sp1 so it's always accessible. + * Linux does not use ring 1, so sp1 is not otherwise needed. + */ u64 sp1; + u64 sp2; u64 reserved2; u64 ist[7]; @@ -368,6 +374,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#else +#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 #endif /* @@ -539,12 +547,12 @@ static inline void native_swapgs(void) static inline unsigned long current_top_of_stack(void) { -#ifdef CONFIG_X86_64 - return this_cpu_read_stable(cpu_tss.x86_tss.sp0); -#else - /* sp0 on x86_32 is special in and around vm86 mode. */ + /* + * We can't read directly from tss.sp0: sp0 on x86_32 is special in + * and around vm86 mode and sp0 on x86_64 is special because of the + * entry trampoline. + */ return this_cpu_read_stable(cpu_current_top_of_stack); -#endif } static inline bool on_thread_stack(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 70f425947dc50f..44a04999791e05 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) #endif #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e3a5175a444bc0..bf51e51d808dd8 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,6 +66,7 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35d674157fdae6..86e83762e3b378 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, +#endif + #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 01b119bebb6801..157f8181691524 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ update_sp0(next_p); From 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:22 +0100 Subject: [PATCH 40/61] x86/espfix/64: Stop assuming that pt_regs is on the entry stack When we start using an entry trampoline, a #GP from userspace will be delivered on the entry stack, not on the task stack. Fix the espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than assuming that pt_regs + 1 == SP0. This won't change anything without an entry stack, but it will make the code continue to work when an entry stack is added. While we're at it, improve the comments to explain what's actually going on. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d3e3bbd5d3a063..f0029d17b14b1e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) /* * If IRET takes a non-IST fault on the espfix64 stack, then we - * end up promoting it to a doublefault. In that case, modify - * the stack to make it look like we just entered the #GP - * handler from user space, similar to bad_iret. + * end up promoting it to a doublefault. In that case, take + * advantage of the fact that we're not using the normal (TSS.sp0) + * stack right now. We can write a fake #GP(0) frame at TSS.sp0 + * and then modify our own IRET frame so that, when we return, + * we land directly at the #GP(0) vector with the stack already + * set up according to its expectations. + * + * The net result is that our #GP handler will think that we + * entered from usermode with the bad user context. * * No need for ist_enter here because we don't use RCU. */ @@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *normal_regs = task_pt_regs(current); + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + + /* + * regs->sp points to the failing IRET frame on the + * ESPFIX64 stack. Copy it to the entry stack. This fills + * in gpregs->ss through gpregs->ip. + * + */ + memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ - /* Fake a #GP(0) from userspace. */ - memmove(&normal_regs->ip, (void *)regs->sp, 5*8); - normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + /* + * Adjust our frame so that we return straight to the #GP + * vector with the expected RSP value. This is safe because + * we won't enable interupts or schedule before we invoke + * general_protection, so nothing will clobber the stack + * frame we just set up. + */ regs->ip = (unsigned long)general_protection; - regs->sp = (unsigned long)&normal_regs->orig_ax; + regs->sp = (unsigned long)&gpregs->orig_ax; return; } @@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being - * deliv- ered, the faulting linear address of the second fault will + * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a From 7f2590a110b837af5679d08fc25c6227c5a8c497 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:23 +0100 Subject: [PATCH 41/61] x86/entry/64: Use a per-CPU trampoline stack for IDT entries Historically, IDT entries from usermode have always gone directly to the running task's kernel stack. Rearrange it so that we enter on a per-CPU trampoline stack and then manually switch to the task's stack. This touches a couple of extra cachelines, but it gives us a chance to run some code before we touch the kernel stack. The asm isn't exactly beautiful, but I think that fully refactoring it can wait. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 67 ++++++++++++++++++++++++-------- arch/x86/entry/entry_64_compat.S | 5 ++- arch/x86/include/asm/switch_to.h | 4 +- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/cpu/common.c | 6 ++- arch/x86/kernel/traps.c | 21 +++++----- 6 files changed, 72 insertions(+), 32 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 32306788821c82..35b8e949ac2f1b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -560,6 +560,13 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + + testb $3, CS-ORIG_RAX(%rsp) + jz 1f + SWAPGS + call switch_to_thread_stack +1: + ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS @@ -569,12 +576,8 @@ END(irq_entries_start) jz 1f /* - * IRQ from user mode. Switch to kernel gsbase and inform context - * tracking that we're in kernel mode. - */ - SWAPGS - - /* + * IRQ from user mode. + * * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode * (which can take locks). Since TRACE_IRQS_OFF idempotent, @@ -828,6 +831,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +/* + * Switch to the thread stack. This is called with the IRET frame and + * orig_ax on the stack. (That is, RDI..R12 are not on the stack and + * space has not been allocated for them.) + */ +ENTRY(switch_to_thread_stack) + UNWIND_HINT_FUNC + + pushq %rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + UNWIND_HINT_FUNC + + movq (%rdi), %rdi + ret +END(switch_to_thread_stack) + .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -845,11 +874,12 @@ ENTRY(\sym) ALLOC_PT_GPREGS_ON_STACK - .if \paranoid - .if \paranoid == 1 + .if \paranoid < 2 testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ - jnz 1f + jnz .Lfrom_usermode_switch_stack_\@ .endif + + .if \paranoid call paranoid_entry .else call error_entry @@ -891,20 +921,15 @@ ENTRY(\sym) jmp error_exit .endif - .if \paranoid == 1 + .if \paranoid < 2 /* - * Paranoid entry from userspace. Switch stacks and treat it + * Entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). */ -1: +.Lfrom_usermode_switch_stack_\@: call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - call sync_regs - movq %rax, %rsp /* switch stack */ - movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code @@ -1165,6 +1190,14 @@ ENTRY(error_entry) SWAPGS .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ + ENCODE_FRAME_POINTER + pushq %r12 + /* * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index dcc6987f9bae16..95ad40eb7effbd 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat) */ movl %eax, %eax - /* Construct struct pt_regs on stack (iret frame is already on stack) */ pushq %rax /* pt_regs->orig_ax */ + + /* switch to thread stack expects orig_ax to be pushed */ + call switch_to_thread_stack + pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8c6bd6863db9d6..cbc71e73bd3216 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { + /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 load_sp0(task->thread.sp0); #else - load_sp0(task_top_of_stack(task)); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1fadd310ff680e..31051f35cbb768 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); #ifdef CONFIG_X86_64 dotraplinkage void do_double_fault(struct pt_regs *, long); -asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e5837bd6c67251..57968880e39b6d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1623,11 +1623,13 @@ void cpu_init(void) setup_cpu_entry_area(cpu); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + + offsetofend(struct tss_struct, SYSENTER_stack)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f0029d17b14b1e..ee9ca0ad438854 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -619,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* - * Help handler running on IST stack to switch off the IST stack if the - * interrupted code was in user mode. The actual stack switch is done in - * entry_64.S + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = task_pt_regs(current); - *regs = *eregs; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + if (regs != eregs) + *regs = *eregs; return regs; } NOKPROBE_SYMBOL(sync_regs); @@ -642,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault - * correctly, we want move our stack frame to task_pt_regs - * and we want to pretend that the exception came from the - * iret target. + * correctly, we want to move our stack frame to where it would + * be had we entered directly on the entry stack (rather than + * just below the IRET frame) and we want to pretend that the + * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - container_of(task_pt_regs(current), - struct bad_iret_stack, regs); + (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); From 3e3b9293d392c577b62e24e4bc9982320438e749 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:24 +0100 Subject: [PATCH 42/61] x86/entry/64: Return to userspace from the trampoline stack By itself, this is useless. It gives us the ability to run some final code before exit that cannnot run on the kernel stack. This could include a CR3 switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for example. (Or even weird things like *changing* which kernel stack gets used as an ASLR-strengthening mechanism.) The SYSRET32 path is not covered yet. It could be in the future or we could just ignore it and force the slow path if needed. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 55 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 35b8e949ac2f1b..42a9379f7acb7c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -326,8 +326,24 @@ syscall_return_via_sysret: popq %rsi /* skip rcx */ popq %rdx popq %rsi + + /* + * Now all regs are restored except RSP and RDI. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + popq %rdi - movq RSP-ORIG_RAX(%rsp), %rsp + popq %rsp USERGS_SYSRET64 END(entry_SYSCALL_64) @@ -630,10 +646,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) ud2 1: #endif - SWAPGS POP_EXTRA_REGS - POP_C_REGS - addq $8, %rsp /* skip regs->orig_ax */ + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + + /* + * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ + pushq 5*8(%rdi) /* RSP */ + pushq 4*8(%rdi) /* EFLAGS */ + pushq 3*8(%rdi) /* CS */ + pushq 2*8(%rdi) /* RIP */ + + /* Push user RDI on the trampoline stack. */ + pushq (%rdi) + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + + /* Restore RDI. */ + popq %rdi + SWAPGS INTERRUPT_RETURN From 3386bc8aed825e9f1f65ce38df4b109b2019b71a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:25 +0100 Subject: [PATCH 43/61] x86/entry/64: Create a per-CPU SYSCALL entry trampoline Handling SYSCALL is tricky: the SYSCALL handler is entered with every single register (except FLAGS), including RSP, live. It somehow needs to set RSP to point to a valid stack, which means it needs to save the user RSP somewhere and find its own stack pointer. The canonical way to do this is with SWAPGS, which lets us access percpu data using the %gs prefix. With PAGE_TABLE_ISOLATION-like pagetable switching, this is problematic. Without a scratch register, switching CR3 is impossible, so %gs-based percpu memory would need to be mapped in the user pagetables. Doing that without information leaks is difficult or impossible. Instead, use a different sneaky trick. Map a copy of the first part of the SYSCALL asm at a different address for each CPU. Now RIP varies depending on the CPU, so we can use RIP-relative memory access to access percpu memory. By putting the relevant information (one scratch slot and the stack address) at a constant offset relative to RIP, we can make SYSCALL work without relying on %gs. A nice thing about this approach is that we can easily switch it on and off if we want pagetable switching to be configurable. The compat variant of SYSCALL doesn't have this problem in the first place -- there are plenty of scratch registers, since we don't care about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 at all. This patch actually seems to be a small speedup. With this patch, SYSCALL touches an extra cache line and an extra virtual page, but the pipeline no longer stalls waiting for SWAPGS. It seems that, at least in a tight loop, the latter outweights the former. Thanks to David Laight for an optimization tip. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++ arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/cpu/common.c | 15 ++++++++- arch/x86/kernel/vmlinux.lds.S | 9 ++++++ 5 files changed, 84 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 42a9379f7acb7c..2582984ffb4b23 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -136,6 +136,64 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump. We could push the target + * address and then use retq, but this destroys the pipeline on + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, + * spill RDI and restore it in a second-stage trampoline. + */ + pushq %rdi + movq $entry_SYSCALL_64_stage2, %rdi + jmp *%rdi +END(entry_SYSCALL_64_trampoline) + + .popsection + +ENTRY(entry_SYSCALL_64_stage2) + UNWIND_HINT_EMPTY + popq %rdi + jmp entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) + ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 84558b611ad33d..6a699474c2c7dd 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -61,6 +61,8 @@ struct cpu_entry_area { * of the TSS region. */ struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 55858b277cf620..61b1af88ac0707 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -101,4 +101,5 @@ void common(void) { /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 57968880e39b6d..430f950b0b7fa3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); static inline void setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + /* On 64-bit systems, we use a read-only fixmap GDT. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; #else @@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) #ifdef CONFIG_X86_32 this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); #endif + +#ifdef CONFIG_X86_64 + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif } /* Load the original GDT from the per-cpu structure */ @@ -1395,10 +1402,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + int cpu = smp_processor_id(); + unsigned long SYSCALL64_entry_trampoline = + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + + (entry_SYSCALL_64_trampoline - _entry_trampoline); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be8725..d2a8b5a24a44a5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -107,6 +107,15 @@ SECTIONS SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + _entry_trampoline = .; + *(.entry_trampoline) + . = ALIGN(PAGE_SIZE); + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif + /* End of text section */ _etext = .; } :text = 0x9090 From 40e7f949e0d9a33968ebde5d67f7e3a47c97742a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:26 +0100 Subject: [PATCH 44/61] x86/entry/64: Move the IST stacks into struct cpu_entry_area The IST stacks are needed when an IST exception occurs and are accessed before any kernel code at all runs. Move them into struct cpu_entry_area. The IST stacks are unlike the rest of cpu_entry_area: they're used even for entries from kernel mode. This means that they should be set up before we load the final IDT. Move cpu_entry_area setup to trap_init() for the boot CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus(). Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 12 ++++++ arch/x86/kernel/cpu/common.c | 74 ++++++++++++++++++++--------------- arch/x86/kernel/traps.c | 3 ++ 3 files changed, 57 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6a699474c2c7dd..451da7d9a50268 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -63,10 +63,22 @@ struct cpu_entry_area { struct tss_struct tss; char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 + /* + * Exception stacks used for IST entries. + * + * In the future, this should have a separate slot for each stack + * with guard pages between them. + */ + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +extern void setup_cpu_entry_areas(void); + /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 430f950b0b7fa3..fb01a8e5e9b7df 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,24 +466,36 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, - int pages, pgprot_t prot) -{ - int i; - - for (i = 0; i < pages; i++) { - __set_fixmap(fixmap_index - i, - per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); - } -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +static void __init +set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) + __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); +} + /* Setup the fixmap mappings only once per-processor */ -static inline void setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 extern char _entry_trampoline[]; @@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(int cpu) PAGE_KERNEL); #ifdef CONFIG_X86_32 - this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); #endif #ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, + PAGE_KERNEL); + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); #endif } +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); +} + /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1385,20 +1413,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1607,7 +1621,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = per_cpu(exception_stacks, cpu); + char *estacks = get_cpu_entry_area(cpu)->exception_stacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; @@ -1633,8 +1647,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - setup_cpu_entry_area(cpu); - /* * Initialize the TSS. sp0 points to the entry trampoline stack * regardless of what task is running. @@ -1694,8 +1706,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - setup_cpu_entry_area(cpu); - /* * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ee9ca0ad438854..3e29aad5c7cc67 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -947,6 +947,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { + /* Init cpu_entry_area before IST entries are set up */ + setup_cpu_entry_areas(); + idt_setup_traps(); /* From 7fbbd5cbebf118a9e09f5453f686656a167c3d1c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:27 +0100 Subject: [PATCH 45/61] x86/entry/64: Remove the SYSENTER stack canary Now that the SYSENTER stack has a guard page, there's no need for a canary to detect overflow after the fact. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/dumpstack.c | 3 +-- arch/x86/kernel/process.c | 1 - arch/x86/kernel/traps.c | 7 ------- 4 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b0cf0612a454d6..d34ac13c586652 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -341,7 +341,6 @@ struct tss_struct { * Space for the temporary SYSENTER stack, used for SYSENTER * and the entry trampoline as well. */ - unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; /* diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 60267850125e68..ae1ce2e3f13260 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) int cpu = smp_processor_id(); struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; - /* Treat the canary as part of the stack for unwinding purposes. */ - void *begin = &tss->SYSENTER_stack_canary; + void *begin = &tss->SYSENTER_stack; void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); if ((void *)stack < begin || (void *)stack >= end) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 86e83762e3b378..6a04287f222b29 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -81,7 +81,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif - .SYSENTER_stack_canary = STACK_END_MAGIC, }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3e29aad5c7cc67..5ade4f89a6d169 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -814,13 +814,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - /* - * This is the most likely code path that involves non-trivial use - * of the SYSENTER stack. Check that we haven't overrun it. - */ - WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, - "Overran or corrupted SYSENTER stack\n"); - ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); From 0f9a48100fba3f189724ae88a450c2261bf91c80 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:28 +0100 Subject: [PATCH 46/61] x86/entry: Clean up the SYSENTER_stack code The existing code was a mess, mainly because C arrays are nasty. Turn SYSENTER_stack into a struct, add a helper to find it, and do all the obvious cleanups this enables. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/fixmap.h | 5 +++++ arch/x86/include/asm/processor.h | 6 +++++- arch/x86/kernel/asm-offsets.c | 6 ++---- arch/x86/kernel/cpu/common.c | 14 +++----------- arch/x86/kernel/dumpstack.c | 7 +++---- 7 files changed, 21 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0ab316c4680673..3629bcbf85a2dc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,7 +942,7 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -986,7 +986,7 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2582984ffb4b23..575b184f377fe5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,7 +154,7 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 451da7d9a50268..cc5d98bdca37f1 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -245,5 +245,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); } +static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) +{ + return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d34ac13c586652..f933869470b89b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -336,12 +336,16 @@ struct x86_hw_tss { #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 +struct SYSENTER_stack { + unsigned long words[64]; +}; + struct tss_struct { /* * Space for the temporary SYSENTER stack, used for SYSENTER * and the entry trampoline as well. */ - unsigned long SYSENTER_stack[64]; + struct SYSENTER_stack SYSENTER_stack; /* * The fixed hardware portion. This must not cross a page boundary diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 61b1af88ac0707..46c0995344aaad 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -94,10 +94,8 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fb01a8e5e9b7df..3de7480e4f3267 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1314,12 +1314,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1436,9 +1431,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1653,8 +1646,7 @@ void cpu_init(void) */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); - load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); + load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae1ce2e3f13260..bbd6d986e2d0fc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) { - int cpu = smp_processor_id(); - struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); - void *begin = &tss->SYSENTER_stack; - void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + void *begin = ss; + void *end = ss + 1; if ((void *)stack < begin || (void *)stack >= end) return false; From c482feefe1aeb150156248ba0fd3e029bc886605 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:29 +0100 Subject: [PATCH 47/61] x86/entry/64: Make cpu_entry_area.tss read-only The TSS is a fairly juicy target for exploits, and, now that the TSS is in the cpu_entry_area, it's no longer protected by kASLR. Make it read-only on x86_64. On x86_32, it can't be RO because it's written by the CPU during task switches, and we use a task gate for double faults. I'd also be nervous about errata if we tried to make it RO even on configurations without double fault handling. [ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So it's probably safe to assume that it's a non issue, though Intel might have been creative in that area. Still waiting for confirmation. ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/fixmap.h | 13 +++++++++---- arch/x86/include/asm/processor.h | 17 ++++++++--------- arch/x86/include/asm/switch_to.h | 4 ++-- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/asm-offsets.c | 5 ++--- arch/x86/kernel/asm-offsets_32.c | 4 ++-- arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++---------- arch/x86/kernel/ioport.c | 2 +- arch/x86/kernel/process.c | 6 +++--- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- arch/x86/lib/delay.c | 4 ++-- arch/x86/xen/enlighten_pv.c | 2 +- 16 files changed, 60 insertions(+), 48 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3629bcbf85a2dc..bd8b57a5c874bc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,7 +942,7 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -986,7 +986,7 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 575b184f377fe5..2812ce043a7a8c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,7 +154,7 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ +#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) @@ -390,7 +390,7 @@ syscall_return_via_sysret: * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -719,7 +719,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -934,7 +934,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) /* * Switch to the thread stack. This is called with the IRET frame and diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index cc5d98bdca37f1..94fc4fa141275b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -56,9 +56,14 @@ struct cpu_entry_area { char gdt[PAGE_SIZE]; /* - * The GDT is just below cpu_tss and thus serves (on x86_64) as a - * a read-only guard page for the SYSENTER stack at the bottom - * of the TSS region. + * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ + struct SYSENTER_stack_page SYSENTER_stack_page; + + /* + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because + * we need task switches to work, and task switches write to the TSS. */ struct tss_struct tss; @@ -247,7 +252,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) { - return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; + return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f933869470b89b..e8991d7f7034fe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -340,13 +340,11 @@ struct SYSENTER_stack { unsigned long words[64]; }; -struct tss_struct { - /* - * Space for the temporary SYSENTER stack, used for SYSENTER - * and the entry trampoline as well. - */ - struct SYSENTER_stack SYSENTER_stack; +struct SYSENTER_stack_page { + struct SYSENTER_stack stack; +} __aligned(PAGE_SIZE); +struct tss_struct { /* * The fixed hardware portion. This must not cross a page boundary * at risk of violating the SDM's advice and potentially triggering @@ -363,7 +361,7 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; } __aligned(PAGE_SIZE); -DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* * sizeof(unsigned long) coming from an extra "long" at the end @@ -378,7 +376,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else -#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 +/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ +#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif /* @@ -538,7 +537,7 @@ static inline void native_set_iopl_mask(unsigned mask) static inline void native_load_sp0(unsigned long sp0) { - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static inline void native_swapgs(void) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index cbc71e73bd3216..9b6df68d8fd1eb 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,10 +79,10 @@ do { \ static inline void refresh_sysenter_cs(struct thread_struct *thread) { /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) return; - this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 44a04999791e05..00223333821a96 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) +# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) #endif #endif diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 46c0995344aaad..cd360a5e0dca30 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -94,10 +94,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); - DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); - /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 52ce4ea16e5386..7d20d9c0b3d69c 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -47,8 +47,8 @@ void foo(void) BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3de7480e4f3267..c2eada1056de8c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif +static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, + SYSENTER_stack_storage); + static void __init set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) { @@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu) #ifdef CONFIG_X86_64 extern char _entry_trampoline[]; - /* On 64-bit systems, we use a read-only fixmap GDT. */ + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; #else /* * On native 32-bit systems, the GDT cannot be read-only because * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. */ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), + per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, + PAGE_KERNEL); /* * The Intel SDM says (Volume 3, 7.2.1): @@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu) offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), - &per_cpu(cpu_tss, cpu), + &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, - PAGE_KERNEL); + tss_prot); #ifdef CONFIG_X86_32 per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); @@ -1305,7 +1314,7 @@ void enable_sep_cpu(void) return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1575,7 +1584,7 @@ void cpu_init(void) if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1667,7 +1676,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c470..2f723301eb58fc 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6a04287f222b29..51741597840936 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower @@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e151..5224c609918416 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 157f8181691524..c7546623201631 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5ade4f89a6d169..74136fd16f491f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -364,7 +364,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* * regs->sp points to the failing IRET frame on the @@ -649,7 +649,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc473..4846eff7e4c8b1 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops) delay = min_t(u64, MWAITX_MAX_LOOPS, loops); /* - * Use cpu_tss as a cacheline-aligned, seldomly + * Use cpu_tss_rw as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index fbd054d6ac973d..ae3a071e1d0f4e 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) From a035795499ca1c2bd1928808d1a156eda1420383 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:30 +0100 Subject: [PATCH 48/61] x86/paravirt: Dont patch flush_tlb_single native_flush_tlb_single() will be changed with the upcoming PAGE_TABLE_ISOLATION feature. This requires to have more code in there than INVLPG. Remove the paravirt patching for it. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Reviewed-by: Juergen Gross Acked-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Cc: michael.schwarz@iaik.tugraz.at Cc: moritz.lipp@iaik.tugraz.at Cc: richard.fellner@student.tugraz.at Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt_patch_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325ed..9edadabf04f66c 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): From 79cc74155218316b9a5d28577c7077b2adba8e58 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:31 +0100 Subject: [PATCH 49/61] x86/paravirt: Provide a way to check for hypervisors There is no generic way to test whether a kernel is running on a specific hypervisor. But that's required to prevent the upcoming user address space separation feature in certain guest modes. Make the hypervisor type enum unconditionally available and provide a helper function which allows to test for a specific type. Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hypervisor.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 1b0a5abcd8aeb6..96aa6b9884dc5b 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,16 +20,7 @@ #ifndef _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H -#ifdef CONFIG_HYPERVISOR_GUEST - -#include -#include -#include - -/* - * x86 hypervisor information - */ - +/* x86 hypervisor types */ enum x86_hypervisor_type { X86_HYPER_NATIVE = 0, X86_HYPER_VMWARE, @@ -39,6 +30,12 @@ enum x86_hypervisor_type { X86_HYPER_KVM, }; +#ifdef CONFIG_HYPERVISOR_GUEST + +#include +#include +#include + struct hypervisor_x86 { /* Hypervisor name */ const char *name; @@ -58,7 +55,15 @@ struct hypervisor_x86 { extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return x86_hyper_type == type; +} #else static inline void init_hypervisor_platform(void) { } +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return type == X86_HYPER_NATIVE; +} #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ From 6cbd2171e89b13377261d15e64384df60ecb530e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:32 +0100 Subject: [PATCH 50/61] x86/cpufeatures: Make CPU bugs sticky There is currently no way to force CPU bug bits like CPU feature bits. That makes it impossible to set a bug bit once at boot and have it stick for all upcoming CPUs. Extend the force set/clear arrays to handle bug bits as well. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/cpu/common.c | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bf6a76202a779e..ea9a7dde62e5c4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e8991d7f7034fe..da943411d3d88b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -163,8 +163,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct x86_hw_tss doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c2eada1056de8c..034900623adfe8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } From 203c110b39a89b48156c7450504e454fedb7f7f6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 12 Dec 2017 21:32:16 +0100 Subject: [PATCH 51/61] parisc: Fix indenting in puts() Static analysis tools complain that we intended to have curly braces around this indent block. In this case this assumption is wrong, so fix the indenting. Fixes: 2f3c7b8137ef ("parisc: Add core code for self-extracting kernel") Reported-by: Dan Carpenter Signed-off-by: Helge Deller Cc: # v4.14+ --- arch/parisc/boot/compressed/misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 9345b44b86f036..f57118e1f6b426 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -123,8 +123,8 @@ int puts(const char *s) while ((nuline = strchr(s, '\n')) != NULL) { if (nuline != s) pdc_iodc_print(s, nuline - s); - pdc_iodc_print("\r\n", 2); - s = nuline + 1; + pdc_iodc_print("\r\n", 2); + s = nuline + 1; } if (*s != '\0') pdc_iodc_print(s, strlen(s)); From 0ed9d3de5f8f97e6efd5ca0e3377cab5f0451ead Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 12 Dec 2017 21:25:41 +0100 Subject: [PATCH 52/61] parisc: Align os_hpmc_size on word boundary The os_hpmc_size variable sometimes wasn't aligned at word boundary and thus triggered the unaligned fault handler at startup. Fix it by aligning it properly. Signed-off-by: Helge Deller Cc: # v4.14+ --- arch/parisc/kernel/hpmc.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S index e3a8e5e4d5de75..8d072c44f300c1 100644 --- a/arch/parisc/kernel/hpmc.S +++ b/arch/parisc/kernel/hpmc.S @@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc) __INITRODATA + .align 4 .export os_hpmc_size os_hpmc_size: .word .os_hpmc_end-.os_hpmc From bcf3f1752a622f1372d3252d0fea8855d89812e7 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 12 Dec 2017 21:52:26 +0100 Subject: [PATCH 53/61] parisc: Hide Diva-built-in serial aux and graphics card Diva GSP card has built-in serial AUX port and ATI graphic card which simply don't work and which both don't have external connectors. User Guides even mention that those devices shouldn't be used. So, prevent that Linux drivers try to enable those devices. Signed-off-by: Helge Deller Cc: # v3.0+ --- drivers/parisc/lba_pci.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index a25fed52f7e94d..41b740aed3a346 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -1692,3 +1692,36 @@ void lba_set_iregs(struct parisc_device *lba, u32 ibase, u32 imask) iounmap(base_addr); } + +/* + * The design of the Diva management card in rp34x0 machines (rp3410, rp3440) + * seems rushed, so that many built-in components simply don't work. + * The following quirks disable the serial AUX port and the built-in ATI RV100 + * Radeon 7000 graphics card which both don't have any external connectors and + * thus are useless, and even worse, e.g. the AUX port occupies ttyS0 and as + * such makes those machines the only PARISC machines on which we can't use + * ttyS0 as boot console. + */ +static void quirk_diva_ati_card(struct pci_dev *dev) +{ + if (dev->subsystem_vendor != PCI_VENDOR_ID_HP || + dev->subsystem_device != 0x1292) + return; + + dev_info(&dev->dev, "Hiding Diva built-in ATI card"); + dev->device = 0; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RADEON_QY, + quirk_diva_ati_card); + +static void quirk_diva_aux_disable(struct pci_dev *dev) +{ + if (dev->subsystem_vendor != PCI_VENDOR_ID_HP || + dev->subsystem_device != 0x1291) + return; + + dev_info(&dev->dev, "Hiding Diva built-in AUX serial device"); + dev->device = 0; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_DIVA_AUX, + quirk_diva_aux_disable); From 6a16fc322085bb3163d7d6e44856adfda06a8001 Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Sun, 10 Dec 2017 23:54:33 +0530 Subject: [PATCH 54/61] parisc: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Signed-off-by: Helge Deller --- arch/parisc/kernel/unwind.c | 1 - arch/parisc/lib/delay.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 5a657986ebbf4b..143f90e2f9f3c6 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/arch/parisc/lib/delay.c b/arch/parisc/lib/delay.c index 7eab4bb8abe630..66e506520505d8 100644 --- a/arch/parisc/lib/delay.c +++ b/arch/parisc/lib/delay.c @@ -16,9 +16,7 @@ #include #include -#include #include - #include /* for mfctl() */ #include /* for boot_cpu_data */ From 9352aeada4d8d8753fc0e414fbfe8fdfcb68a12c Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Mon, 13 Nov 2017 19:35:33 -0500 Subject: [PATCH 55/61] Revert "parisc: Re-enable interrupts early" This reverts commit 5c38602d83e584047906b41b162ababd4db4106d. Interrupts can't be enabled early because the register saves are done on the thread stack prior to switching to the IRQ stack. This caused stack overflows and the thread stack needed increasing to 32k. Even then, stack overflows still occasionally occurred. Background: Even with a 32 kB thread stack, I have seen instances where the thread stack overflowed on the mx3210 buildd. Detection of stack overflow only occurs when we have an external interrupt. When an external interrupt occurs, we switch to the thread stack if we are not already on a kernel stack. Then, registers and specials are saved to the kernel stack. The bug occurs in intr_return where interrupts are reenabled prior to returning from the interrupt. This was done incase we need to schedule or deliver signals. However, it introduces the possibility that multiple external interrupts may occur on the thread stack and cause a stack overflow. These might not be detected and cause the kernel to misbehave in random ways. This patch changes the code back to only reenable interrupts when we are going to schedule or deliver signals. As a result, we generally return from an interrupt before reenabling interrupts. This minimizes the growth of the thread stack. Fixes: 5c38602d83e5 ("parisc: Re-enable interrupts early") Signed-off-by: John David Anglin Cc: # v4.10+ Signed-off-by: Helge Deller --- arch/parisc/kernel/entry.S | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index a4fd296c958e8e..f3cecf5117cf8a 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi) STREG %r19,PT_SR7(%r16) intr_return: - /* NOTE: Need to enable interrupts incase we schedule. */ - ssm PSW_SM_I, %r0 - /* check for reschedule */ mfctl %cr30,%r1 LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */ @@ -907,6 +904,11 @@ intr_check_sig: LDREG PT_IASQ1(%r16), %r20 cmpib,COND(=),n 0,%r20,intr_restore /* backward */ + /* NOTE: We need to enable interrupts if we have to deliver + * signals. We used to do this earlier but it caused kernel + * stack overflows. */ + ssm PSW_SM_I, %r0 + copy %r0, %r25 /* long in_syscall = 0 */ #ifdef CONFIG_64BIT ldo -16(%r30),%r29 /* Reference param save area */ @@ -958,6 +960,10 @@ intr_do_resched: cmpib,COND(=) 0, %r20, intr_do_preempt nop + /* NOTE: We need to enable interrupts if we schedule. We used + * to do this earlier but it caused kernel stack overflows. */ + ssm PSW_SM_I, %r0 + #ifdef CONFIG_64BIT ldo -16(%r30),%r29 /* Reference param save area */ #endif From da57c5414f49ef9e4bcb9ae0bbafd1d650b31411 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Mon, 13 Nov 2017 19:35:33 -0500 Subject: [PATCH 56/61] parisc: Reduce thread stack to 16 kb In testing, I found that the thread stack can be 16 kB when using an irq stack. Without it, the thread stack needs to be 32 kB. Currently, the irq stack is 32 kB. While it probably could be 16 kB, I would prefer to leave it as is for safety. Signed-off-by: John David Anglin Signed-off-by: Helge Deller --- arch/parisc/include/asm/thread_info.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index c980a02a52bc0d..598c8d60fa5e60 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -35,7 +35,12 @@ struct thread_info { /* thread information allocation */ +#ifdef CONFIG_IRQSTACKS +#define THREAD_SIZE_ORDER 2 /* PA-RISC requires at least 16k stack */ +#else #define THREAD_SIZE_ORDER 3 /* PA-RISC requires at least 32k stack */ +#endif + /* Be sure to hunt all references to this down when you change the size of * the kernel stack */ #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) From b9f5fb1800d8a4a3bc6cd3152c5f3d252986cf79 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 10 Nov 2017 15:57:21 +0100 Subject: [PATCH 57/61] cramfs: fix MTD dependency With CONFIG_MTD=m and CONFIG_CRAMFS=y, we now get a link failure: fs/cramfs/inode.o: In function `cramfs_mount': inode.c:(.text+0x220): undefined reference to `mount_mtd' fs/cramfs/inode.o: In function `cramfs_mtd_fill_super': inode.c:(.text+0x6d8): undefined reference to `mtd_point' inode.c:(.text+0xae4): undefined reference to `mtd_unpoint' This adds a more specific Kconfig dependency to avoid the broken configuration. Alternatively we could make CRAMFS itself depend on "MTD || !MTD" with a similar result. Fixes: 99c18ce580c6 ("cramfs: direct memory access support") Signed-off-by: Arnd Bergmann Signed-off-by: Nicolas Pitre Signed-off-by: Linus Torvalds --- fs/cramfs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index f937082f32449a..58e2fe40b2a044 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -34,6 +34,7 @@ config CRAMFS_BLOCKDEV config CRAMFS_MTD bool "Support CramFs image directly mapped in physical memory" depends on CRAMFS && MTD + depends on CRAMFS=m || MTD=y default y if !CRAMFS_BLOCKDEV help This option allows the CramFs driver to load data directly from From 779f4e1c6c7c661db40dfebd6dd6bda7b5f88aa3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Dec 2017 11:28:38 -0800 Subject: [PATCH 58/61] Revert "exec: avoid RLIMIT_STACK races with prlimit()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 04e35f4495dd560db30c25efca4eecae8ec8c375. SELinux runs with secureexec for all non-"noatsecure" domain transitions, which means lots of processes end up hitting the stack hard-limit change that was introduced in order to fix a race with prlimit(). That race fix will need to be redesigned. Reported-by: Laura Abbott Reported-by: Tomáš Trnka Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 156f56acfe8e7c..5688b5e1b93781 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1339,15 +1339,10 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * races from other threads changing the limits. This also - * must be protected from races with prlimit() calls. + * needing to clean up the change on failure. */ - task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; - if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; - task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); From 1291a0d5049dbc06baaaf66a9ff3f53db493b19b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Dec 2017 18:59:59 -0800 Subject: [PATCH 59/61] Linux 4.15-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3f4d157add5401..7e02f951b28418 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Fearless Coyote # *DOCUMENTATION* From d82c3682168431d29ba1741d0cd5ef45c68bf8e0 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 18 Dec 2017 08:26:28 +0100 Subject: [PATCH 60/61] mtd: Fix mtd_check_oob_ops() The mtd_check_oob_ops() helper verifies if the operation defined by the user is correct. Fix the check that verifies if the entire requested area exists. This check is too restrictive and will fail anytime the last data byte of the very last page is included in an operation. Fixes: 5cdd929da53d ("mtd: Add sanity checks in mtd_write/read_oob()") Signed-off-by: Miquel Raynal Acked-by: Boris Brezillon Signed-off-by: Richard Weinberger --- drivers/mtd/mtdcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index f80e911b884381..73b6055774474e 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1114,7 +1114,7 @@ static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs, if (!ops->oobbuf) ops->ooblen = 0; - if (offs < 0 || offs + ops->len >= mtd->size) + if (offs < 0 || offs + ops->len > mtd->size) return -EINVAL; if (ops->ooblen) { From 10a7e9d849150a2879efc0b04d8a51068c9dd0c5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 19 Dec 2017 13:52:23 -0800 Subject: [PATCH 61/61] Do not hash userspace addresses in fault handlers The hashing of %p was designed to restrict kernel addresses. There is no reason to hash the userspace values seen during a segfault report, so switch these to %px. (Some architectures already use %lx.) Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index be3136f142a999..a8103a84b4ac4a 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->pc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 815c03d7a76552..41363f46797bf9 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->tpc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 4e6fcb32620ffb..42864417595623 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs) if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), (void *)UPT_IP(regs), (void *)UPT_SP(regs), diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index febf6980e65355..06fe3d51d385b8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code);