From d50babbe300eedf33ea5b00a12c5df3a05bd96c7 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 22 Jul 2015 14:40:08 +0800 Subject: [PATCH 01/89] xen-blkfront: introduce blkfront_gather_backend_features() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a bug when migrate from !feature-persistent host to feature-persistent host, because domU still thinks new host/backend doesn't support persistent. Dmesg like: backed has not unmapped grant: 839 backed has not unmapped grant: 773 backed has not unmapped grant: 773 backed has not unmapped grant: 773 backed has not unmapped grant: 839 The fix is to recheck feature-persistent of new backend in blkif_recover(). See: https://lkml.org/lkml/2015/5/25/469 As Roger suggested, we can split the part of blkfront_connect that checks for optional features, like persistent grants, indirect descriptors and flush/barrier features to a separate function and call it from both blkfront_connect and blkif_recover Acked-by: Roger Pau Monné Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 122 +++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fc770b7d3beb19..f45f4e67c5d4a7 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -179,6 +179,7 @@ static DEFINE_SPINLOCK(minor_lock); ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) static int blkfront_setup_indirect(struct blkfront_info *info); +static int blkfront_gather_backend_features(struct blkfront_info *info); static int get_id_from_freelist(struct blkfront_info *info) { @@ -1525,7 +1526,7 @@ static int blkif_recover(struct blkfront_info *info) info->shadow_free = info->ring.req_prod_pvt; info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - rc = blkfront_setup_indirect(info); + rc = blkfront_gather_backend_features(info); if (rc) { kfree(copy); return rc; @@ -1726,20 +1727,13 @@ static void blkfront_setup_discard(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_info *info) { - unsigned int indirect_segments, segs; + unsigned int segs; int err, i; - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-max-indirect-segments", "%u", &indirect_segments, - NULL); - if (err) { - info->max_indirect_segments = 0; + if (info->max_indirect_segments == 0) segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; - } else { - info->max_indirect_segments = min(indirect_segments, - xen_blkif_max_segments); + else segs = info->max_indirect_segments; - } err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info)); if (err) @@ -1802,6 +1796,68 @@ static int blkfront_setup_indirect(struct blkfront_info *info) return -ENOMEM; } +/* + * Gather all backend feature-* + */ +static int blkfront_gather_backend_features(struct blkfront_info *info) +{ + int err; + int barrier, flush, discard, persistent; + unsigned int indirect_segments; + + info->feature_flush = 0; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%d", &barrier, + NULL); + + /* + * If there's no "feature-barrier" defined, then it means + * we're dealing with a very old backend which writes + * synchronously; nothing to do. + * + * If there are barriers, then we use flush. + */ + if (!err && barrier) + info->feature_flush = REQ_FLUSH | REQ_FUA; + /* + * And if there is "feature-flush-cache" use that above + * barriers. 
+ */ + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush, + NULL); + + if (!err && flush) + info->feature_flush = REQ_FLUSH; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard, + NULL); + + if (!err && discard) + blkfront_setup_discard(info); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-persistent", "%u", &persistent, + NULL); + if (err) + info->feature_persistent = 0; + else + info->feature_persistent = persistent; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, + NULL); + if (err) + info->max_indirect_segments = 0; + else + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + + return blkfront_setup_indirect(info); +} + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -1813,7 +1869,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned int physical_sector_size; unsigned int binfo; int err; - int barrier, flush, discard, persistent; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1870,48 +1925,7 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - info->feature_flush = 0; - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-barrier", "%d", &barrier, - NULL); - - /* - * If there's no "feature-barrier" defined, then it means - * we're dealing with a very old backend which writes - * synchronously; nothing to do. - * - * If there are barriers, then we use flush. - */ - if (!err && barrier) - info->feature_flush = REQ_FLUSH | REQ_FUA; - /* - * And if there is "feature-flush-cache" use that above - * barriers. - */ - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-flush-cache", "%d", &flush, - NULL); - - if (!err && flush) - info->feature_flush = REQ_FLUSH; - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-discard", "%d", &discard, - NULL); - - if (!err && discard) - blkfront_setup_discard(info); - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-persistent", "%u", &persistent, - NULL); - if (err) - info->feature_persistent = 0; - else - info->feature_persistent = persistent; - - err = blkfront_setup_indirect(info); + err = blkfront_gather_backend_features(info); if (err) { xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", info->xbdev->otherend); From 7b0767502b5db11cb1f0daef2d01f6d71b1192dc Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 22 Jul 2015 14:40:09 +0800 Subject: [PATCH 02/89] xen-blkfront: don't add indirect pages to list when !feature_persistent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should consider info->feature_persistent when adding indirect page to list info->indirect_pages, else the BUG_ON() in blkif_free() would be triggered. When we are using persistent grants the indirect_pages list should always be empty because blkfront has pre-allocated enough persistent pages to fill all requests on the ring. 
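To make the invariant explicit (an illustrative sketch only, not part of the
patch; example_check_indirect_pages() is a hypothetical helper that assumes the
driver's struct blkfront_info context): with persistent grants every indirect
page is pre-allocated per request, so nothing must ever be parked on
info->indirect_pages, which is what the BUG_ON() in blkif_free() mentioned
above guards against.

  /* Illustrative only: the invariant described above, written as a check. */
  static void example_check_indirect_pages(struct blkfront_info *info)
  {
  	if (info->feature_persistent)
  		BUG_ON(!list_empty(&info->indirect_pages));
  }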
CC: stable@vger.kernel.org
Acked-by: Roger Pau Monné
Signed-off-by: Bob Liu
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkfront.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index f45f4e67c5d4a7..44b33d39441b12 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1135,8 +1135,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			 * Add the used indirect page back to the list of
 			 * available pages for indirect grefs.
 			 */
-			indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
-			list_add(&indirect_page->lru, &info->indirect_pages);
+			if (!info->feature_persistent) {
+				indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
+				list_add(&indirect_page->lru, &info->indirect_pages);
+			}
 			s->indirect_grants[i]->gref = GRANT_INVALID_REF;
 			list_add_tail(&s->indirect_grants[i]->node, &info->grants);
 		}

From 53bc7dc004fecf39e0ba70f2f8d120a1444315d3 Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Wed, 22 Jul 2015 14:40:10 +0800
Subject: [PATCH 03/89] xen-blkback: replace work_pending with work_busy in purge_persistent_gnt()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The BUG_ON() in purge_persistent_gnt() will be triggered when the
previous purge work hasn't finished. There is a work_pending() check
before this BUG_ON(), but work_pending() does not account for work
that is still running.

CC: stable@vger.kernel.org
Acked-by: Roger Pau Monné
Signed-off-by: Bob Liu
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkback/blkback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 9121a2c3e26f26..73c04040c8c8a6 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -382,8 +382,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
 		return;
 	}

-	if (work_pending(&blkif->persistent_purge_work)) {
-		pr_alert_ratelimited("Scheduled work from previous purge is still pending, cannot purge list\n");
+	if (work_busy(&blkif->persistent_purge_work)) {
+		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
 		return;
 	}

From c9fdec9f3970eeaa1b176422f46167f5f5158804 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach
Date: Mon, 20 Jul 2015 12:14:39 +0300
Subject: [PATCH 04/89] iwlwifi: pcie: fix prepare card flow

When the card is not owned by the PCIe bus, we need to acquire
ownership first. This flow is implemented in iwl_pcie_prepare_card_hw.
Because of a hardware bug, we need to disable link power management
before we can request ownership; otherwise the other user of the
device won't get notified that we are requesting the device, which
will prevent us from acquiring ownership. The same holds for the down
flow, where we need to make sure that any other potential user is
notified that the driver is going down.
CC: [4.1] Signed-off-by: Emmanuel Grumbach --- drivers/net/wireless/iwlwifi/pcie/trans.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/pcie/trans.c b/drivers/net/wireless/iwlwifi/pcie/trans.c index 6203c4ad9bba5d..9e144e71da0b59 100644 --- a/drivers/net/wireless/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/iwlwifi/pcie/trans.c @@ -478,10 +478,16 @@ static void iwl_pcie_apm_stop(struct iwl_trans *trans, bool op_mode_leave) if (trans->cfg->device_family == IWL_DEVICE_FAMILY_7000) iwl_set_bits_prph(trans, APMG_PCIDEV_STT_REG, APMG_PCIDEV_STT_VAL_WAKE_ME); - else if (trans->cfg->device_family == IWL_DEVICE_FAMILY_8000) + else if (trans->cfg->device_family == IWL_DEVICE_FAMILY_8000) { + iwl_set_bit(trans, CSR_DBG_LINK_PWR_MGMT_REG, + CSR_RESET_LINK_PWR_MGMT_DISABLED); iwl_set_bit(trans, CSR_HW_IF_CONFIG_REG, CSR_HW_IF_CONFIG_REG_PREPARE | CSR_HW_IF_CONFIG_REG_ENABLE_PME); + mdelay(1); + iwl_clear_bit(trans, CSR_DBG_LINK_PWR_MGMT_REG, + CSR_RESET_LINK_PWR_MGMT_DISABLED); + } mdelay(5); } @@ -575,6 +581,10 @@ static int iwl_pcie_prepare_card_hw(struct iwl_trans *trans) if (ret >= 0) return 0; + iwl_set_bit(trans, CSR_DBG_LINK_PWR_MGMT_REG, + CSR_RESET_LINK_PWR_MGMT_DISABLED); + msleep(1); + for (iter = 0; iter < 10; iter++) { /* If HW is not ready, prepare the conditions to check again */ iwl_set_bit(trans, CSR_HW_IF_CONFIG_REG, @@ -582,8 +592,10 @@ static int iwl_pcie_prepare_card_hw(struct iwl_trans *trans) do { ret = iwl_pcie_set_hw_ready(trans); - if (ret >= 0) - return 0; + if (ret >= 0) { + ret = 0; + goto out; + } usleep_range(200, 1000); t += 200; @@ -593,6 +605,10 @@ static int iwl_pcie_prepare_card_hw(struct iwl_trans *trans) IWL_ERR(trans, "Couldn't prepare the card\n"); +out: + iwl_clear_bit(trans, CSR_DBG_LINK_PWR_MGMT_REG, + CSR_RESET_LINK_PWR_MGMT_DISABLED); + return ret; } From dc9f69b907f3853952d3ffb7918d05a662146712 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Tue, 7 Jul 2015 16:53:42 +0300 Subject: [PATCH 05/89] iwlwifi: mvm: Fix regular scan priority The code checks the total number of iterations to differentiate between regular scan and scheduled scan. However, regular scan has a total of one iteration, not zero. As a result, regular scan will have lower priority than it should have, and in case scheduled scan is already running when regular scan is requested, regular scan will be delayed until scheduled scan is aborted. Fix that by checking for total iterations number of one as an identifier for regular scan. 
Fixes: 133c8259f885 ("iwlwifi: mvm: rename generic_scan_cmd functions to dwell") Signed-off-by: Avraham Stern Signed-off-by: Emmanuel Grumbach --- drivers/net/wireless/iwlwifi/mvm/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/iwlwifi/mvm/scan.c b/drivers/net/wireless/iwlwifi/mvm/scan.c index 5000bfcded617f..5514ad6d4e5437 100644 --- a/drivers/net/wireless/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/iwlwifi/mvm/scan.c @@ -1023,7 +1023,7 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm, cmd->scan_priority = iwl_mvm_scan_priority(mvm, IWL_SCAN_PRIORITY_EXT_6); - if (iwl_mvm_scan_total_iterations(params) == 0) + if (iwl_mvm_scan_total_iterations(params) == 1) cmd->ooc_priority = iwl_mvm_scan_priority(mvm, IWL_SCAN_PRIORITY_EXT_6); else From 8ef9724bf9718af81cfc5132253372f79c71b7e2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 26 Jul 2015 21:34:50 -0700 Subject: [PATCH 06/89] regmap: regcache-rbtree: Clean new present bits on present bitmap resize When inserting a new register into a block, the present bit map size is increased using krealloc. krealloc does not clear the additionally allocated memory, leaving it filled with random values. Result is that some registers are considered cached even though this is not the case. Fix the problem by clearing the additionally allocated memory. Also, if the bitmap size does not increase, do not reallocate the bitmap at all to reduce overhead. Fixes: 3f4ff561bc88 ("regmap: rbtree: Make cache_present bitmap per node") Signed-off-by: Guenter Roeck Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/base/regmap/regcache-rbtree.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c index 81751a49d8bf23..56486d92c4e72b 100644 --- a/drivers/base/regmap/regcache-rbtree.c +++ b/drivers/base/regmap/regcache-rbtree.c @@ -296,11 +296,20 @@ static int regcache_rbtree_insert_to_block(struct regmap *map, if (!blk) return -ENOMEM; - present = krealloc(rbnode->cache_present, - BITS_TO_LONGS(blklen) * sizeof(*present), GFP_KERNEL); - if (!present) { - kfree(blk); - return -ENOMEM; + if (BITS_TO_LONGS(blklen) > BITS_TO_LONGS(rbnode->blklen)) { + present = krealloc(rbnode->cache_present, + BITS_TO_LONGS(blklen) * sizeof(*present), + GFP_KERNEL); + if (!present) { + kfree(blk); + return -ENOMEM; + } + + memset(present + BITS_TO_LONGS(rbnode->blklen), 0, + (BITS_TO_LONGS(blklen) - BITS_TO_LONGS(rbnode->blklen)) + * sizeof(*present)); + } else { + present = rbnode->cache_present; } /* insert the register value in the correct place in the rbnode block */ From f0ad462189cc898aa0ef8ced849533ee03392bcc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 23 Jul 2015 13:06:10 +0200 Subject: [PATCH 07/89] netfilter: nf_conntrack: silence warning on falling back to vmalloc() Since 88eab472ec21 ("netfilter: conntrack: adjust nf_conntrack_buckets default value"), the hashtable can easily hit this warning. We got reports from users that are getting this message in a quite spamming fashion, so better silence this. 
Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- net/netfilter/nf_conntrack_core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 651039ad1681db..f1680995fc490c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1544,10 +1544,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); - if (!hash) { - printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + if (!hash) hash = vzalloc(sz); - } if (hash && nulls) for (i = 0; i < nr_slots; i++) From 1a727c63612fc582370cf3dc01239d3d239743b5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 28 Jul 2015 01:42:28 +0300 Subject: [PATCH 08/89] netfilter: nf_conntrack: checking for IS_ERR() instead of NULL We recently changed this from nf_conntrack_alloc() to nf_ct_tmpl_alloc() so the error handling needs to changed to check for NULL instead of IS_ERR(). Fixes: 0838aa7fcfcd ('netfilter: fix netns dependencies with conntrack templates') Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_synproxy_core.c | 4 +--- net/netfilter/xt_CT.c | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 71f1e9fdfa18fb..d7f1685279034b 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -353,10 +353,8 @@ static int __net_init synproxy_net_init(struct net *net) int err = -ENOMEM; ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); + if (!ct) goto err1; - } if (!nfct_seqadj_ext_add(ct)) goto err2; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index c6630030c9121c..43ddeee404e91f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -202,9 +202,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err1; ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL); - ret = PTR_ERR(ct); - if (IS_ERR(ct)) + if (!ct) { + ret = -ENOMEM; goto err2; + } ret = 0; if ((info->ct_events || info->exp_events) && From aecdc63d87891c75e60906973c7b7c9cd58403d6 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 29 Jul 2015 23:06:41 +0300 Subject: [PATCH 09/89] iwlwifi: pcie: fix stuck queue detection for sleeping clients The stuck queue detection mechanism allows to detect queues that are stuck. For sleeping clients, a queue may rightfully be stuck: if a poor client implementation stays asleep for more than 10s, then we don't want to trigger recovery flows because of that client. In order to cope with this, I added a mechanism that monitors the state of the client: when a client goes to sleep, the timer of his queues is frozen. When he wakes up, the timer is reset to the right value so that if a client was awake for more than 10s and the queues are stuck, only then, the recovery flow will kick in. This is valid only on non-shared queues: A-MPDU queues. There was a bug in case we Tx to a sleeping client that has an empty A-MPDU queue: the timer was armed to now + 10s. This is bad, but pretty harmless. The problem is that when the client wakes up, the timer is modified to be now + remainder. But remainder is 0 since the queue was empty when that client went to sleep... 
Fix this by checking the state of the client before playing with the timer when we add a packet to an empty queue. Signed-off-by: Emmanuel Grumbach --- drivers/net/wireless/iwlwifi/pcie/tx.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/pcie/tx.c b/drivers/net/wireless/iwlwifi/pcie/tx.c index 2b86c2135de36f..607acb53c84755 100644 --- a/drivers/net/wireless/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/iwlwifi/pcie/tx.c @@ -1875,8 +1875,19 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, /* start timer if queue currently empty */ if (q->read_ptr == q->write_ptr) { - if (txq->wd_timeout) - mod_timer(&txq->stuck_timer, jiffies + txq->wd_timeout); + if (txq->wd_timeout) { + /* + * If the TXQ is active, then set the timer, if not, + * set the timer in remainder so that the timer will + * be armed with the right value when the station will + * wake up. + */ + if (!txq->frozen) + mod_timer(&txq->stuck_timer, + jiffies + txq->wd_timeout); + else + txq->frozen_expiry_remainder = txq->wd_timeout; + } IWL_DEBUG_RPM(trans, "Q: %d first tx - take ref\n", q->id); iwl_trans_pcie_ref(trans); } From 5d5cd85ff441534a52f23f821d0a7c644d3b6cce Mon Sep 17 00:00:00 2001 From: Mike Looijmans Date: Tue, 28 Jul 2015 07:51:01 +0200 Subject: [PATCH 10/89] rsi: Fix failure to load firmware after memory leak fix and fix the leak Fixes commit eae79b4f3e82 ("rsi: fix memory leak in rsi_load_ta_instructions()") which stopped the driver from functioning. Firmware data has been allocated using vmalloc(), resulting in memory that cannot be used for DMA. Hence the firmware was first copied to a buffer allocated with kmalloc() in the original code. This patch reverts the commit and only calls "kfree()" to release the buffer after sending the data. This fixes the memory leak without breaking the driver. Add a comment to the kmemdup() calls to explain why this is done, and abort if memory allocation fails. Tested on a Topic Miami-Florida board which contains the rsi SDIO chip. Also added the same kfree() call to the USB glue driver. This was not tested on actual hardware though, as I only have the SDIO version. 
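For context, the pattern the fix follows is: the image returned by
request_firmware() lives in vmalloc()ed memory and must not be handed to the
DMA engine, so a kmalloc-backed copy is made with kmemdup() and released once
the transfer is done. A minimal sketch, assuming a hypothetical
example_send_fw() in place of the driver's real transfer routine
(rsi_copy_to_card):

  #include <linux/firmware.h>
  #include <linux/slab.h>

  static int example_load_fw(struct device *dev)
  {
  	const struct firmware *fw_entry;
  	u8 *fw;
  	int status;

  	status = request_firmware(&fw_entry, "example_fw.bin", dev);
  	if (status < 0)
  		return status;

  	/* fw_entry->data is vmalloc()ed; copy it into DMA-able memory */
  	fw = kmemdup(fw_entry->data, fw_entry->size, GFP_KERNEL);
  	if (!fw) {
  		release_firmware(fw_entry);
  		return -ENOMEM;
  	}

  	status = example_send_fw(dev, fw, fw_entry->size);

  	kfree(fw);		/* free the bounce buffer: no leak */
  	release_firmware(fw_entry);
  	return status;
  }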
Fixes: eae79b4f3e82 ("rsi: fix memory leak in rsi_load_ta_instructions()")
Signed-off-by: Mike Looijmans
Cc: stable@vger.kernel.org
Signed-off-by: Kalle Valo
---
 drivers/net/wireless/rsi/rsi_91x_sdio_ops.c | 8 +++++++-
 drivers/net/wireless/rsi/rsi_91x_usb_ops.c | 4 ++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
index b6cc9ff47fc2e5..1c6788aecc6265 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
@@ -172,6 +172,7 @@ static int rsi_load_ta_instructions(struct rsi_common *common)
 		(struct rsi_91x_sdiodev *)adapter->rsi_dev;
 	u32 len;
 	u32 num_blocks;
+	const u8 *fw;
 	const struct firmware *fw_entry = NULL;
 	u32 block_size = dev->tx_blk_size;
 	int status = 0;
@@ -200,6 +201,10 @@ static int rsi_load_ta_instructions(struct rsi_common *common)
 		return status;
 	}

+	/* Copy firmware into DMA-accessible memory */
+	fw = kmemdup(fw_entry->data, fw_entry->size, GFP_KERNEL);
+	if (!fw)
+		return -ENOMEM;
 	len = fw_entry->size;

 	if (len % 4)
@@ -210,7 +215,8 @@ static int rsi_load_ta_instructions(struct rsi_common *common)
 	rsi_dbg(INIT_ZONE, "%s: Instruction size:%d\n", __func__, len);
 	rsi_dbg(INIT_ZONE, "%s: num blocks: %d\n", __func__, num_blocks);

-	status = rsi_copy_to_card(common, fw_entry->data, len, num_blocks);
+	status = rsi_copy_to_card(common, fw, len, num_blocks);
+	kfree(fw);
 	release_firmware(fw_entry);
 	return status;
 }
diff --git a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
index 1106ce76707e10..30c2cf7fa93b0c 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
@@ -146,7 +146,10 @@ static int rsi_load_ta_instructions(struct rsi_common *common)
 		return status;
 	}

+	/* Copy firmware into DMA-accessible memory */
 	fw = kmemdup(fw_entry->data, fw_entry->size, GFP_KERNEL);
+	if (!fw)
+		return -ENOMEM;
 	len = fw_entry->size;

 	if (len % 4)
@@ -158,6 +161,7 @@ static int rsi_load_ta_instructions(struct rsi_common *common)
 	rsi_dbg(INIT_ZONE, "%s: Instruction size:%d\n", __func__, len);
 	rsi_dbg(INIT_ZONE, "%s: num blocks: %d\n", __func__, num_blocks);
 	status = rsi_copy_to_card(common, fw, len, num_blocks);
+	kfree(fw);
 	release_firmware(fw_entry);
 	return status;
 }

From 098697dbad9070249eb07a0241c4001aa367bb89 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens
Date: Wed, 29 Jul 2015 23:36:30 +0200
Subject: [PATCH 11/89] b43: fix extpa_gain check for 2GHz

On the 2GHz band and on the 5GHz band, only the extpa_gain setting from
the 5GHz band was checked. This patch makes it check the property from
the correct band.
Signed-off-by: Hauke Mehrtens Signed-off-by: Kalle Valo --- drivers/net/wireless/b43/tables_nphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/b43/tables_nphy.c b/drivers/net/wireless/b43/tables_nphy.c index 25d1cbd34306e0..b2f0d245bcf3a0 100644 --- a/drivers/net/wireless/b43/tables_nphy.c +++ b/drivers/net/wireless/b43/tables_nphy.c @@ -3728,7 +3728,7 @@ const u32 *b43_nphy_get_tx_gain_table(struct b43_wldev *dev) switch (phy->rev) { case 6: case 5: - if (sprom->fem.ghz5.extpa_gain == 3) + if (sprom->fem.ghz2.extpa_gain == 3) return b43_ntab_tx_gain_epa_rev3_hi_pwr_2g; /* fall through */ case 4: From 7c62940165e9ae4004ce4e6b5117330bab94df68 Mon Sep 17 00:00:00 2001 From: Luis Felipe Dominguez Vega Date: Wed, 29 Jul 2015 21:11:20 -0500 Subject: [PATCH 12/89] rtlwifi: Fix NULL dereference when PCI driver used as an AP In commit 33511b157bbcebaef853cc1811992b664a2e5862 ("rtlwifi: add support to send beacon frame"), the mechanism for sending beacons was established. That patch works correctly for rtl8192cu, but there is a possibility of getting the following warnings in the PCI drivers: WARNING: CPU: 1 PID: 2439 at net/mac80211/driver-ops.h:12 ieee80211_bss_info_change_notify+0x179/0x1d0 [mac80211]() wlp5s0: Failed check-sdata-in-driver check, flags: 0x0 The warning is followed by a NULL pointer dereference as follows: BUG: unable to handle kernel NULL pointer dereference at 0000000000000006 IP: [] rtl_get_tcb_desc+0x5e/0x760 [rtlwifi] This problem was reported at http://thread.gmane.org/gmane.linux.kernel.wireless.general/138645, but no solution was found at that time. The problem was also reported at https://bugzilla.kernel.org/show_bug.cgi?id=9744 and this solution was developed and tested there. The USB driver works with a NULL final argument in the adapter_tx() callback; however, the PCI drivers need a struct rtl_tcb_desc in that position. Fixes: 33511b157bbc ("rtlwifi: add support to send beacon frame.") Signed-off-by: Luis Felipe Dominguez Vega Signed-off-by: Larry Finger Cc: Stable [3.19+] Signed-off-by: Kalle Valo --- drivers/net/wireless/rtlwifi/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/rtlwifi/core.c b/drivers/net/wireless/rtlwifi/core.c index 3b3a88b53b1199..585d0883c7e587 100644 --- a/drivers/net/wireless/rtlwifi/core.c +++ b/drivers/net/wireless/rtlwifi/core.c @@ -1015,9 +1015,12 @@ static void send_beacon_frame(struct ieee80211_hw *hw, { struct rtl_priv *rtlpriv = rtl_priv(hw); struct sk_buff *skb = ieee80211_beacon_get(hw, vif); + struct rtl_tcb_desc tcb_desc; - if (skb) - rtlpriv->intf_ops->adapter_tx(hw, NULL, skb, NULL); + if (skb) { + memset(&tcb_desc, 0, sizeof(struct rtl_tcb_desc)); + rtlpriv->intf_ops->adapter_tx(hw, NULL, skb, &tcb_desc); + } } static void rtl_op_bss_info_changed(struct ieee80211_hw *hw, From 3473f26592c1c365d376aee29433d7db75f14d1e Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 17 Jul 2015 21:40:28 +0100 Subject: [PATCH 13/89] ARM: 8405/1: VDSO: fix regression with toolchains lacking ld.bfd executable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Sourcery CodeBench Lite 2014.05 toolchain (gcc 4.8.3, binutils 2.24.51) has a GCC which implements -fuse-ld, and it doesn't include the gold linker, but it lacks an ld.bfd executable in its installation. 
This means that passing -fuse-ld=bfd fails with: VDSO arch/arm/vdso/vdso.so.raw collect2: fatal error: cannot find 'ld' Arguably this is a deficiency in the toolchain, but I suspect it's commonly used enough that it's worth accommodating: just use cc-ldoption (to cause a link attempt) instead of cc-option to test whether we can use -fuse-ld. So -fuse-ld=bfd won't be used with this toolchain, but the build will rightly succeed, just as it does for toolchains which don't implement -fuse-ld (and don't use gold as the default linker). Note: this will change the failure mode for a corner case I was trying to handle in d2b30cd4b722, where the toolchain defaults to the gold linker and the BFD linker is not found in PATH, from: VDSO arch/arm/vdso/vdso.so.raw collect2: fatal error: cannot find 'ld' i.e. the BFD linker is not found, to: OBJCOPY arch/arm/vdso/vdso.so BFD: arch/arm/vdso/vdso.so: Not enough room for program headers, try linking with -N that is, we fail to prevent gold from being used as the linker, and it produces an object that objcopy can't digest. Reported-by: Baruch Siach Tested-by: Baruch Siach Tested-by: Raphaël Poggi Fixes: d2b30cd4b722 ("ARM: 8384/1: VDSO: force use of BFD linker") Cc: stable@vger.kernel.org Signed-off-by: Nathan Lynch Signed-off-by: Russell King --- arch/arm/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 9d259d94e429c4..1160434eece050 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -14,7 +14,7 @@ VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 VDSO_LDFLAGS += -nostdlib -shared VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id) -VDSO_LDFLAGS += $(call cc-option, -fuse-ld=bfd) +VDSO_LDFLAGS += $(call cc-ldoption, -fuse-ld=bfd) obj-$(CONFIG_VDSO) += vdso.o extra-$(CONFIG_VDSO) += vdso.lds From 1ebd47efa4e17391dfac8caa349c6a8d35f996d1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 2 Aug 2015 19:29:16 +0200 Subject: [PATCH 14/89] rocker: free netdevice during netdevice removal When removing a port's netdevice in 'rocker_remove_ports', we should also free the allocated 'net_device' structure. Do that by calling 'free_netdev' after unregistering it. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Fixes: 4b8ac9660af ("rocker: introduce rocker switch driver") Acked-by: Scott Feldman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 2d8578cade0379..2e7f9a2834be32 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4821,6 +4821,7 @@ static void rocker_remove_ports(const struct rocker *rocker) rocker_port_ig_tbl(rocker_port, SWITCHDEV_TRANS_NONE, ROCKER_OP_FLAG_REMOVE); unregister_netdev(rocker_port->dev); + free_netdev(rocker_port->dev); } kfree(rocker->ports); } From 3d0e0af40672a0bf16ca0f0591165535138c1f30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 31 Jul 2015 17:53:39 -0700 Subject: [PATCH 15/89] fq_codel: explicitly reset flows in ->reset() Alex reported the following crash when using fq_codel with htb: crash> bt PID: 630839 TASK: ffff8823c990d280 CPU: 14 COMMAND: "tc" [... snip ...] 
#8 [ffff8820ceec17a0] page_fault at ffffffff8160a8c2 [exception RIP: htb_qlen_notify+24] RIP: ffffffffa0841718 RSP: ffff8820ceec1858 RFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88241747b400 RDX: ffff88241747b408 RSI: 0000000000000000 RDI: ffff8811fb27d000 RBP: ffff8820ceec1868 R8: ffff88120cdeff24 R9: ffff88120cdeff30 R10: 0000000000000bd4 R11: ffffffffa0840919 R12: ffffffffa0843340 R13: 0000000000000000 R14: 0000000000000001 R15: ffff8808dae5c2e8 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #9 [...] qdisc_tree_decrease_qlen at ffffffff81565375 #10 [...] fq_codel_dequeue at ffffffffa084e0a0 [sch_fq_codel] #11 [...] fq_codel_reset at ffffffffa084e2f8 [sch_fq_codel] #12 [...] qdisc_destroy at ffffffff81560d2d #13 [...] htb_destroy_class at ffffffffa08408f8 [sch_htb] #14 [...] htb_put at ffffffffa084095c [sch_htb] #15 [...] tc_ctl_tclass at ffffffff815645a3 #16 [...] rtnetlink_rcv_msg at ffffffff81552cb0 [... snip ...] As Jamal pointed out, there is actually no need to call dequeue to purge the queued skb's in reset, data structures can be just reset explicitly. Therefore, we reset everything except config's and stats, so that we would have a fresh start after device flipping. Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Reported-by: Alex Gartrell Cc: Alex Gartrell Cc: Jamal Hadi Salim Signed-off-by: Eric Dumazet [xiyou.wangcong@gmail.com: added codel_vars_init() and qdisc_qstats_backlog_dec()] Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 21ca33c9f0368b..a9ba030435a2a5 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -288,10 +288,26 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) static void fq_codel_reset(struct Qdisc *sch) { - struct sk_buff *skb; + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; - while ((skb = fq_codel_dequeue(sch)) != NULL) - kfree_skb(skb); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + while (flow->head) { + struct sk_buff *skb = dequeue_head(flow); + + qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); + } + + INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); + } + memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); + sch->q.qlen = 0; } static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { From 741e3b9902d11585e18bfc7f8d47e913616bb070 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Sun, 2 Aug 2015 13:24:13 -0500 Subject: [PATCH 16/89] rtlwifi: rtl8723be: Add module parameter for MSI interrupts The driver code allows for the disabling of MSI interrupts; however the module_parm line was missed and the option fails to show with modinfo. 
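As a reminder of how the option becomes visible (an illustrative, hypothetical
sketch, not the driver's actual definitions): a struct field only turns into a
loadable module option once it is bound with module_param_named(), and
MODULE_PARM_DESC() supplies the text modinfo prints.

  #include <linux/module.h>

  struct example_mod_params {
  	bool msi_support;
  };

  static struct example_mod_params example_params = {
  	.msi_support = false,
  };

  /* Without this line the field exists but the "msi" option is invisible
   * to modinfo and cannot be set at load time.
   */
  module_param_named(msi, example_params.msi_support, bool, 0444);
  MODULE_PARM_DESC(msi, "Set to 1 to use MSI interrupts");

With the parameter wired up, something like "modprobe <module> msi=0" can then
flip the flag before probe runs.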
Signed-off-by: Larry Finger Cc: Stable [3.15+] Signed-off-by: Kalle Valo --- drivers/net/wireless/rtlwifi/rtl8723be/sw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/rtlwifi/rtl8723be/sw.c b/drivers/net/wireless/rtlwifi/rtl8723be/sw.c index 1017f02d7bf752..7bf88d9dcdc3fc 100644 --- a/drivers/net/wireless/rtlwifi/rtl8723be/sw.c +++ b/drivers/net/wireless/rtlwifi/rtl8723be/sw.c @@ -385,6 +385,7 @@ module_param_named(debug, rtl8723be_mod_params.debug, int, 0444); module_param_named(ips, rtl8723be_mod_params.inactiveps, bool, 0444); module_param_named(swlps, rtl8723be_mod_params.swctrl_lps, bool, 0444); module_param_named(fwlps, rtl8723be_mod_params.fwctrl_lps, bool, 0444); +module_param_named(msi, rtl8723be_mod_params.msi_support, bool, 0444); module_param_named(disable_watchdog, rtl8723be_mod_params.disable_watchdog, bool, 0444); MODULE_PARM_DESC(swenc, "Set to 1 for software crypto (default 0)\n"); From 3576fd794b38306e196498ac54bb3b21c32e1ae4 Mon Sep 17 00:00:00 2001 From: Glenn Griffin Date: Mon, 3 Aug 2015 09:56:54 -0700 Subject: [PATCH 17/89] openvswitch: Fix L4 checksum handling when dealing with IP fragments openvswitch modifies the L4 checksum of a packet when modifying the ip address. When an IP packet is fragmented only the first fragment contains an L4 header and checksum. Prior to this change openvswitch would modify all fragments, modifying application data in non-first fragments, causing checksum failures in the reassembled packet. Signed-off-by: Glenn Griffin Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a8c0b8b4f63a4..ee34f474ad1465 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -273,28 +273,36 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, - __be32 *addr, __be32 new_addr) +static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, + __be32 addr, __be32 new_addr) { int transport_len = skb->len - skb_transport_offset(skb); + if (nh->frag_off & htons(IP_OFFSET)) + return; + if (nh->protocol == IPPROTO_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, - *addr, new_addr, 1); + addr, new_addr, 1); } else if (nh->protocol == IPPROTO_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&uh->check, skb, - *addr, new_addr, 1); + addr, new_addr, 1); if (!uh->check) uh->check = CSUM_MANGLED_0; } } } +} +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + update_ip_l4_checksum(skb, nh, *addr, new_addr); csum_replace4(&nh->check, *addr, new_addr); skb_clear_hash(skb); *addr = new_addr; From 636dba8e12d797357b2063981476390f11262c08 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 30 Jul 2015 17:12:20 -0700 Subject: [PATCH 18/89] act_mirred: avoid calling tcf_hash_release() when binding When we share an action within a filter, the bind refcnt should increase, therefore we should not call tcf_hash_release(). Cc: Jamal Hadi Salim Cc: Daniel Borkmann Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/act_mirred.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index a42a3b25722617..268545050ddbd6 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -98,6 +98,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; ret = ACT_P_CREATED; } else { + if (bind) + return 0; if (!ovr) { tcf_hash_release(a, bind); return -EEXIST; From 468b732b6f76b138c0926eadf38ac88467dcd271 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 1 Aug 2015 15:33:26 +0300 Subject: [PATCH 19/89] rds: fix an integer overflow test in rds_info_getsockopt() "len" is a signed integer. We check that len is not negative, so it goes from zero to INT_MAX. PAGE_SIZE is unsigned long so the comparison is type promoted to unsigned long. ULONG_MAX - 4095 is a higher than INT_MAX so the condition can never be true. I don't know if this is harmful but it seems safe to limit "len" to INT_MAX - 4095. Fixes: a8c879a7ee98 ('RDS: Info and stats') Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/rds/info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/info.c b/net/rds/info.c index 9a6b4f66187cf3..140a44a5f7b7f1 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -176,7 +176,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, /* check for all kinds of wrapping and the like */ start = (unsigned long)optval; - if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { + if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) { ret = -EINVAL; goto out; } From 2fc09962e24ace45154d0c16024f1eb15700f3e8 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Mon, 3 Aug 2015 11:18:12 +0800 Subject: [PATCH 20/89] 3c59x: Fix resource leaks in vortex_open When vortex_up is failed, the skb buffers allocated by __netdev_alloc_skb in vortex_open are not released, which may cause resource leaks. This bug has been submitted before. This patch modifies the error handling code to fix it. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/3com/3c59x.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index 2d1ce3c5d0dd34..753887d02b46ab 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -1763,16 +1763,9 @@ vortex_open(struct net_device *dev) vp->rx_ring[i].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE)); } if (i != RX_RING_SIZE) { - int j; pr_emerg("%s: no memory for rx ring\n", dev->name); - for (j = 0; j < i; j++) { - if (vp->rx_skbuff[j]) { - dev_kfree_skb(vp->rx_skbuff[j]); - vp->rx_skbuff[j] = NULL; - } - } retval = -ENOMEM; - goto err_free_irq; + goto err_free_skb; } /* Wrap the ring. 
*/ vp->rx_ring[i-1].next = cpu_to_le32(vp->rx_ring_dma); @@ -1782,7 +1775,13 @@ vortex_open(struct net_device *dev) if (!retval) goto out; -err_free_irq: +err_free_skb: + for (i = 0; i < RX_RING_SIZE; i++) { + if (vp->rx_skbuff[i]) { + dev_kfree_skb(vp->rx_skbuff[i]); + vp->rx_skbuff[i] = NULL; + } + } free_irq(dev->irq, dev); err: if (vortex_debug > 1) From 10e2eb878f3ca07ac2f05fa5ca5e6c4c9174a27a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Aug 2015 12:14:33 +0200 Subject: [PATCH 21/89] udp: fix dst races with multicast early demux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multicast dst are not cached. They carry DST_NOCACHE. As mentioned in commit f8864972126899 ("ipv4: fix dst race in sk_dst_get()"), these dst need special care before caching them into a socket. Caching them is allowed only if their refcnt was not 0, ie we must use atomic_inc_not_zero() Also, we must use READ_ONCE() to fetch sk->sk_rx_dst, as mentioned in commit d0c294c53a771 ("tcp: prevent fetching dst twice in early demux code") Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") Tested-by: Gregory Hoggarth Signed-off-by: Eric Dumazet Reported-by: Gregory Hoggarth Reported-by: Alex Gartrell Cc: Michal Kubeček Signed-off-by: David S. Miller --- net/ipv4/udp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 83aa604f9273c3..1b8c5ba7d5f732 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1995,12 +1995,19 @@ void udp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_efree; - dst = sk->sk_rx_dst; + dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); - if (dst) - skb_dst_set_noref(skb, dst); + if (dst) { + /* DST_NOCACHE can not be used without taking a reference */ + if (dst->flags & DST_NOCACHE) { + if (likely(atomic_inc_not_zero(&dst->__refcnt))) + skb_dst_set(skb, dst); + } else { + skb_dst_set_noref(skb, dst); + } + } } int udp_rcv(struct sk_buff *skb) From 2475b22526d70234ecfe4a1ff88aed69badefba9 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Mon, 3 Aug 2015 15:38:03 +0100 Subject: [PATCH 22/89] xen-netback: Allocate fraglist early to avoid complex rollback Determine if a fraglist is needed in the tx path, and allocate it if necessary before setting up the copy and map operations. Otherwise, undoing the copy and map operations is tricky. This fixes a use-after-free: if allocating the fraglist failed, the copy and map operations that had been set up were still executed, writing over the data area of a freed skb. Signed-off-by: Ross Lagerwall Signed-off-by: David S. 
Miller --- drivers/net/xen-netback/netback.c | 61 +++++++++++++++++-------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 7d50711476fe1e..1b406e706a0183 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -810,23 +810,17 @@ static inline struct sk_buff *xenvif_alloc_skb(unsigned int size) static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *queue, struct sk_buff *skb, struct xen_netif_tx_request *txp, - struct gnttab_map_grant_ref *gop) + struct gnttab_map_grant_ref *gop, + unsigned int frag_overflow, + struct sk_buff *nskb) { struct skb_shared_info *shinfo = skb_shinfo(skb); skb_frag_t *frags = shinfo->frags; u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx; int start; pending_ring_idx_t index; - unsigned int nr_slots, frag_overflow = 0; + unsigned int nr_slots; - /* At this point shinfo->nr_frags is in fact the number of - * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX. - */ - if (shinfo->nr_frags > MAX_SKB_FRAGS) { - frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS; - BUG_ON(frag_overflow > MAX_SKB_FRAGS); - shinfo->nr_frags = MAX_SKB_FRAGS; - } nr_slots = shinfo->nr_frags; /* Skip first skb fragment if it is on same page as header fragment. */ @@ -841,13 +835,6 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *que } if (frag_overflow) { - struct sk_buff *nskb = xenvif_alloc_skb(0); - if (unlikely(nskb == NULL)) { - if (net_ratelimit()) - netdev_err(queue->vif->dev, - "Can't allocate the frag_list skb.\n"); - return NULL; - } shinfo = skb_shinfo(nskb); frags = shinfo->frags; @@ -1175,9 +1162,10 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, unsigned *copy_ops, unsigned *map_ops) { - struct gnttab_map_grant_ref *gop = queue->tx_map_ops, *request_gop; - struct sk_buff *skb; + struct gnttab_map_grant_ref *gop = queue->tx_map_ops; + struct sk_buff *skb, *nskb; int ret; + unsigned int frag_overflow; while (skb_queue_len(&queue->tx_queue) < budget) { struct xen_netif_tx_request txreq; @@ -1265,6 +1253,29 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, break; } + skb_shinfo(skb)->nr_frags = ret; + if (data_len < txreq.size) + skb_shinfo(skb)->nr_frags++; + /* At this point shinfo->nr_frags is in fact the number of + * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX. + */ + frag_overflow = 0; + nskb = NULL; + if (skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS) { + frag_overflow = skb_shinfo(skb)->nr_frags - MAX_SKB_FRAGS; + BUG_ON(frag_overflow > MAX_SKB_FRAGS); + skb_shinfo(skb)->nr_frags = MAX_SKB_FRAGS; + nskb = xenvif_alloc_skb(0); + if (unlikely(nskb == NULL)) { + kfree_skb(skb); + xenvif_tx_err(queue, &txreq, idx); + if (net_ratelimit()) + netdev_err(queue->vif->dev, + "Can't allocate the frag_list skb.\n"); + break; + } + } + if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { struct xen_netif_extra_info *gso; gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; @@ -1272,6 +1283,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, if (xenvif_set_skb_gso(queue->vif, skb, gso)) { /* Failure in xenvif_set_skb_gso is fatal. 
*/ kfree_skb(skb); + kfree_skb(nskb); break; } } @@ -1294,9 +1306,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, (*copy_ops)++; - skb_shinfo(skb)->nr_frags = ret; if (data_len < txreq.size) { - skb_shinfo(skb)->nr_frags++; frag_set_pending_idx(&skb_shinfo(skb)->frags[0], pending_idx); xenvif_tx_create_map_op(queue, pending_idx, &txreq, gop); @@ -1310,13 +1320,8 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, queue->pending_cons++; - request_gop = xenvif_get_requests(queue, skb, txfrags, gop); - if (request_gop == NULL) { - kfree_skb(skb); - xenvif_tx_err(queue, &txreq, idx); - break; - } - gop = request_gop; + gop = xenvif_get_requests(queue, skb, txfrags, gop, + frag_overflow, nskb); __skb_queue_tail(&queue->tx_queue, skb); From f202a666e933f3c7557126d63833a6a3b577ac15 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 16 Jun 2015 21:06:24 +0200 Subject: [PATCH 23/89] batman-adv: avoid DAT to mess up LAN state When a node running DAT receives an ARP request from the LAN for the first time, it is likely that this node will request the ARP entry through the distributed ARP table (DAT) in the mesh. Once a DAT reply is received the asking node must check if the MAC address for which the IP address has been asked is local. If it is, the node must drop the ARP reply bceause the client should have replied on its own locally. Forwarding this reply means fooling any L2 bridge (e.g. Ethernet switches) lying between the batman-adv node and the LAN. This happens because the L2 bridge will think that the client sending the ARP reply lies somewhere in the mesh, while this node is sitting in the same LAN. Reported-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/distributed-arp-table.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index fb54e6aed096ed..6d0b471eede863 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1138,6 +1138,9 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * @hdr_size: size of the encapsulation header + * + * Returns true if the packet was snooped and consumed by DAT. 
False if the + * packet has to be delivered to the interface */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -1145,7 +1148,7 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, uint16_t type; __be32 ip_src, ip_dst; uint8_t *hw_src, *hw_dst; - bool ret = false; + bool dropped = false; unsigned short vid; if (!atomic_read(&bat_priv->distributed_arp_table)) @@ -1174,12 +1177,17 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, /* if this REPLY is directed to a client of mine, let's deliver the * packet to the interface */ - ret = !batadv_is_my_client(bat_priv, hw_dst, vid); + dropped = !batadv_is_my_client(bat_priv, hw_dst, vid); + + /* if this REPLY is sent on behalf of a client of mine, let's drop the + * packet because the client will reply by itself + */ + dropped |= batadv_is_my_client(bat_priv, hw_src, vid); out: - if (ret) + if (dropped) kfree_skb(skb); - /* if ret == false -> packet has to be delivered to the interface */ - return ret; + /* if dropped == false -> deliver to the interface */ + return dropped; } /** From 354136bcc3c4f40a2813bba8f57ca5267d812d15 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Tue, 9 Jun 2015 21:24:36 +0800 Subject: [PATCH 24/89] batman-adv: fix kernel crash due to missing NULL checks batadv_softif_vlan_get() may return NULL which has to be verified by the caller. Fixes: 35df3b298fc8 ("batman-adv: fix TT VLAN inconsistency on VLAN re-add") Reported-by: Ryan Thompson Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/soft-interface.c | 3 +++ net/batman-adv/translation-table.c | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c002961da75d65..a2fc843c22432e 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -479,6 +479,9 @@ void batadv_interface_rx(struct net_device *soft_iface, */ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) { + if (!vlan) + return; + if (atomic_dec_and_test(&vlan->refcount)) { spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b4824951010ba6..38b83c50f936f3 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -594,6 +594,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, /* increase the refcounter of the related vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); + if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d", + addr, BATADV_PRINT_VID(vid))) + goto out; batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", @@ -1066,6 +1069,9 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); + if (!vlan) + goto out; + batadv_softif_vlan_free_ref(vlan); batadv_softif_vlan_free_ref(vlan); @@ -1166,8 +1172,10 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, tt_common_entry->vid); - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + if (vlan) { + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + } batadv_tt_local_entry_free_ref(tt_local); } @@ 
-3207,8 +3215,10 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + if (vlan) { + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + } batadv_tt_local_entry_free_ref(tt_local); } From ef72706a0543d0c3a5ab29bd6378fdfb368118d9 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Wed, 17 Jun 2015 20:01:36 +0800 Subject: [PATCH 25/89] batman-adv: protect tt_local_entry from concurrent delete events The tt_local_entry deletion performed in batadv_tt_local_remove() was neither protecting against simultaneous deletes nor checking whether the element was still part of the list before calling hlist_del_rcu(). Replacing the hlist_del_rcu() call with batadv_hash_remove() provides adequate protection via hash spinlocks as well as an is-element-still-in-hash check to avoid 'blind' hash removal. Fixes: 068ee6e204e1 ("batman-adv: roaming handling mechanism redesign") Reported-by: alfonsname@web.de Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 38b83c50f936f3..5e953297d3b2bd 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1037,6 +1037,7 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry; uint16_t flags, curr_flags = BATADV_NO_FLAGS; struct batadv_softif_vlan *vlan; + void *tt_entry_exists; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) @@ -1064,7 +1065,15 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, * immediately purge it */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); - hlist_del_rcu(&tt_local_entry->common.hash_entry); + + tt_entry_exists = batadv_hash_remove(bat_priv->tt.local_hash, + batadv_compare_tt, + batadv_choose_tt, + &tt_local_entry->common); + if (!tt_entry_exists) + goto out; + + /* extra call to free the local tt entry */ batadv_tt_local_entry_free_ref(tt_local_entry); /* decrease the reference held for this vlan */ From 27a4d5efd417b6ef3190e9af357715532d4617a3 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Wed, 24 Jun 2015 14:50:19 +0200 Subject: [PATCH 26/89] batman-adv: initialize up/down values when adding a gateway Without this initialization, gateways which actually announce up/down bandwidth of 0/0 could be added. If these nodes get purged via _batadv_purge_orig() later, the gw_node structure does not get removed since batadv_gw_node_delete() updates the gw_node with up/down bandwidth of 0/0, and the updating function then discards the change and does not free gw_node. This results in leaking the gw_node structures, which references other structures: gw_node -> orig_node -> orig_node_ifinfo -> hardif. When removing the interface later, the open reference on the hardif may cause hangs with the infamous "unregister_netdevice: waiting for mesh1 to become free. Usage count = 1" message. 
Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index bb015862062899..cffa92dd98778b 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -439,6 +439,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, INIT_HLIST_NODE(&gw_node->list); gw_node->orig_node = orig_node; + gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); + gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); atomic_set(&gw_node->refcount, 1); spin_lock_bh(&bat_priv->gw.list_lock); From f58e5aa7b873b8a4376b816993d4b0e903befcba Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 4 Aug 2015 18:34:00 -0700 Subject: [PATCH 27/89] netfilter: conntrack: Use flags in nf_ct_tmpl_alloc() The flags were ignored for this function when it was introduced. Also fix the style problem in kzalloc. Fixes: 0838aa7fc (netfilter: fix netns dependencies with conntrack templates) Signed-off-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f1680995fc490c..3c20d02aee738c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -292,7 +292,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags) { struct nf_conn *tmpl; - tmpl = kzalloc(sizeof(struct nf_conn), GFP_KERNEL); + tmpl = kzalloc(sizeof(*tmpl), flags); if (tmpl == NULL) return NULL; @@ -303,7 +303,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags) if (zone) { struct nf_conntrack_zone *nf_ct_zone; - nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC); + nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, flags); if (!nf_ct_zone) goto out_free; nf_ct_zone->id = zone; From cb92205bad2e4dd630b884142dd707b72504c200 Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Wed, 5 Aug 2015 23:16:29 +0200 Subject: [PATCH 28/89] Bluetooth: fix MGMT_EV_NEW_LONG_TERM_KEY event This patch fixes how MGMT_EV_NEW_LONG_TERM_KEY event is build. Right now val vield is filled with only 1 byte, instead of whole value. This bug was introduced in commit 1fc62c526a57 ("Bluetooth: Fix exposing full value of shortened LTKs") Before that patch, if you paired with device using bluetoothd using simple pairing, and then restarted bluetoothd, you would be able to re-connect, but device would fail to establish encryption and would terminate connection. After this patch connecting after bluetoothd restart works fine. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7998fb27916568..92720f3fe57370 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7820,7 +7820,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) /* Make sure we copy only the significant bytes based on the * encryption key size, and set the rest of the value to zeroes. 
*/ - memcpy(ev.key.val, key->val, sizeof(key->enc_size)); + memcpy(ev.key.val, key->val, key->enc_size); memset(ev.key.val + key->enc_size, 0, sizeof(ev.key.val) - key->enc_size); From 14d2b7c1a96ef37eb571599c73d4a1a606b964d6 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 3 Aug 2015 17:50:11 +0200 Subject: [PATCH 29/89] net: fec: fix initial runtime PM refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clocks are initially active and thus the device is marked active. This still keeps the PM refcount at 0, the pm_runtime_put_autosuspend() call at the end of probe then leaves us with an invalid refcount of -1, which in turn leads to the device staying in suspended state even though netdev open had been called. Fix this by initializing the refcount to be coherent with the initial device status. Fixes: 8fff755e9f8 (net: fec: Ensure clocks are enabled while using mdio bus) Signed-off-by: Lucas Stach Tested-by: Uwe Kleine-König Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 32e3807c650ea7..271bb5862346ed 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3433,6 +3433,7 @@ fec_probe(struct platform_device *pdev) pm_runtime_set_autosuspend_delay(&pdev->dev, FEC_MDIO_PM_TIMEOUT); pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); From a0a2a6602496a45ae838a96db8b8173794b5d398 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 4 Aug 2015 15:42:47 +0800 Subject: [PATCH 30/89] net: Fix skb_set_peeked use-after-free bug The commit 738ac1ebb96d02e0d23bc320302a6ea94c612dec ("net: Clone skb before setting peeked flag") introduced a use-after-free bug in skb_recv_datagram. This is because skb_set_peeked may create a new skb and free the existing one. As it stands the caller will continue to use the old freed skb. This patch fixes it by making skb_set_peeked return the new skb (or the old one if unchanged). Fixes: 738ac1ebb96d ("net: Clone skb before setting peeked flag") Reported-by: Brenden Blanco Signed-off-by: Herbert Xu Tested-by: Brenden Blanco Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/core/datagram.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index 4967262b27076a..617088aee21d41 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -131,12 +131,12 @@ static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, goto out; } -static int skb_set_peeked(struct sk_buff *skb) +static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { struct sk_buff *nskb; if (skb->peeked) - return 0; + return skb; /* We have to unshare an skb before modifying it. 
*/ if (!skb_shared(skb)) @@ -144,7 +144,7 @@ static int skb_set_peeked(struct sk_buff *skb) nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) - return -ENOMEM; + return ERR_PTR(-ENOMEM); skb->prev->next = nskb; skb->next->prev = nskb; @@ -157,7 +157,7 @@ static int skb_set_peeked(struct sk_buff *skb) done: skb->peeked = 1; - return 0; + return skb; } /** @@ -229,8 +229,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, continue; } - error = skb_set_peeked(skb); - if (error) + skb = skb_set_peeked(skb); + error = PTR_ERR(skb); + if (IS_ERR(skb)) goto unlock_err; atomic_inc(&skb->users); From 57b229063ae6dc65036209018dc7f4290cc026bb Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 4 Aug 2015 15:40:59 +0100 Subject: [PATCH 31/89] xen/netback: Wake dealloc thread after completing zerocopy work Waking the dealloc thread before decrementing inflight_packets is racy because it means the thread may go to sleep before inflight_packets is decremented. If kthread_stop() has already been called, the dealloc thread may wait forever with nothing to wake it. Instead, wake the thread only after decrementing inflight_packets. Signed-off-by: Ross Lagerwall Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 6 ++++++ drivers/net/xen-netback/netback.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 1a83e190fc15e4..28577a31549d15 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -61,6 +61,12 @@ void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue, void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue) { atomic_dec(&queue->inflight_packets); + + /* Wake the dealloc thread _after_ decrementing inflight_packets so + * that if kthread_stop() has already been called, the dealloc thread + * does not wait forever with nothing to wake it. + */ + wake_up(&queue->dealloc_wq); } int xenvif_schedulable(struct xenvif *vif) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 1b406e706a0183..3f44b522b8311a 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1541,7 +1541,6 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success) smp_wmb(); queue->dealloc_prod++; } while (ubuf); - wake_up(&queue->dealloc_wq); spin_unlock_irqrestore(&queue->callback_lock, flags); if (likely(zerocopy_success)) From 7ba8bd75ddc6b041b5716dbb29e49df3e9cc2928 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 4 Aug 2015 18:33:34 +0200 Subject: [PATCH 32/89] net: pktgen: don't abuse current->state in pktgen_thread_worker() Commit 1fbe4b46caca "net: pktgen: kill the Wait for kthread_stop code in pktgen_thread_worker()" removed (in particular) the final __set_current_state(TASK_RUNNING) and I didn't notice the previous set_current_state(TASK_INTERRUPTIBLE). This triggers the warning in __might_sleep() after return. Afaics, we can simply remove both set_current_state()'s, and we could do this a long ago right after ef87979c273a2 "pktgen: better scheduler friendliness" which changed pktgen_thread_worker() to use wait_event_interruptible_timeout(). Reported-by: Huang Ying Signed-off-by: Oleg Nesterov Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1ebdf1c0d1188c..1cbd209192eacd 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3514,8 +3514,6 @@ static int pktgen_thread_worker(void *arg) set_freezable(); - __set_current_state(TASK_RUNNING); - while (!kthread_should_stop()) { pkt_dev = next_to_run(t); @@ -3560,7 +3558,6 @@ static int pktgen_thread_worker(void *arg) try_to_freeze(); } - set_current_state(TASK_INTERRUPTIBLE); pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); From 355b9f9df1f0311f20087350aee8ad96eedca8a9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 4 Aug 2015 19:06:32 +0200 Subject: [PATCH 33/89] bridge: netlink: account for the IFLA_BRPORT_PROXYARP attribute size and policy The attribute size wasn't accounted for in the get_slave_size() callback (br_port_get_slave_size) when it was introduced, so fix it now. Also add a policy entry for it in br_port_policy. Signed-off-by: Nikolay Aleksandrov Fixes: 958501163ddd ("bridge: Add support for IEEE 802.11 Proxy ARP") Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3da5525eb8a2dc..5390536d500c6f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -112,6 +112,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + 0; } @@ -506,6 +507,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 }, [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ From 786c2077ec8e9eab37a88fc14aac4309a8061e18 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 4 Aug 2015 19:06:33 +0200 Subject: [PATCH 34/89] bridge: netlink: account for the IFLA_BRPORT_PROXYARP_WIFI attribute size and policy The attribute size wasn't accounted for in the get_slave_size() callback (br_port_get_slave_size) when it was introduced, so fix it now. Also add a policy entry for it in br_port_policy. Signed-off-by: Nikolay Aleksandrov Fixes: 842a9ae08a25 ("bridge: Extend Proxy ARP design to allow optional rules for Wi-Fi") Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5390536d500c6f..4d74a0639c4ccd 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -113,6 +113,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + 0; } @@ -508,6 +509,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, + [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ From 7ebc482202fd54e7cf718619e869f4320b8e3bb6 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 4 Aug 2015 22:11:43 +0200 Subject: [PATCH 35/89] r8169: enforce RX_MULTI_EN on rtl8168ep/8111ep chips Enforcing this flag in RxConfig for the mentioned chips fixes netdev watchdog issues prepended with AMD IOMMU message(s) like: AMD-Vi: Event logged [IO_PAGE_FAULT device=01:00.0 domain=0x001d address=0x0000000000003000 flags=0x0050] Note that this flag is also set in Realtek's own driver for these chips. Signed-off-by: Ivan Vecera Tested-by: Alexander Lindqvist Acked-by: Francois Romieu Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 3df51faf18ae3b..f790f61ea78a2b 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4875,10 +4875,12 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_46: case RTL_GIGA_MAC_VER_47: case RTL_GIGA_MAC_VER_48: + RTL_W32(RxConfig, RX128_INT_EN | RX_DMA_BURST | RX_EARLY_OFF); + break; case RTL_GIGA_MAC_VER_49: case RTL_GIGA_MAC_VER_50: case RTL_GIGA_MAC_VER_51: - RTL_W32(RxConfig, RX128_INT_EN | RX_DMA_BURST | RX_EARLY_OFF); + RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; default: RTL_W32(RxConfig, RX128_INT_EN | RX_DMA_BURST); From 22f54bf932a0eed73134b696b238db4bdcff5cd4 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 4 Aug 2015 20:25:55 +0100 Subject: [PATCH 36/89] net: thunderx: remove effective "default y" from Kconfig if ARCH_THUNDER=y As well as for kernels built only for ThunderX ARCH_THUNDERX is also enabled for kernels which support multiple platforms (such as distro kernels). Thus "default ARCH_THUNDER" is inappropriate. I believe default m is equally frowned upon, so remove the line completely rather than "default m if ARCH_THUNDER". Signed-off-by: Ian Campbell Cc: Sunil Goutham Cc: Robert Richter Cc: Derek Chickles Cc: Satanand Burla Cc: Felix Manlunas Cc: Raghu Vatsavayi Cc: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cavium/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig index c4d6bbe9458dbf..02e23e6f142487 100644 --- a/drivers/net/ethernet/cavium/Kconfig +++ b/drivers/net/ethernet/cavium/Kconfig @@ -16,7 +16,6 @@ if NET_VENDOR_CAVIUM config THUNDER_NIC_PF tristate "Thunder Physical function driver" depends on 64BIT - default ARCH_THUNDER select THUNDER_NIC_BGX ---help--- This driver supports Thunder's NIC physical function. @@ -29,14 +28,12 @@ config THUNDER_NIC_PF config THUNDER_NIC_VF tristate "Thunder Virtual function driver" depends on 64BIT - default ARCH_THUNDER ---help--- This driver supports Thunder's NIC virtual function config THUNDER_NIC_BGX tristate "Thunder MAC interface driver (BGX)" depends on 64BIT - default ARCH_THUNDER ---help--- This driver supports programming and controlling of MAC interface from NIC physical function driver. From 866b8b18e380f810ba96e21d25843b841271bb07 Mon Sep 17 00:00:00 2001 From: WingMan Kwok Date: Tue, 4 Aug 2015 16:56:53 -0400 Subject: [PATCH 37/89] net: netcp: fix unused interface rx buffer size configuration Prior to this patch, rx buffer size for each rx queue of an interface is configurable through dts bindings. But for an interface, the first rx queue's rx buffer size is always the usual MTU size (plus usual overhead) and page size for the remaining rx queues (if they are enabled by specifying a non-zero rx queue depth dts binding of the corresponding interface). This patch removes the rx buffer size configuration capability. Signed-off-by: WingMan Kwok Acked-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp.h | 1 - drivers/net/ethernet/ti/netcp_core.c | 35 +++++++++++----------------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h index a8a730641bbb14..bb1bb72121c047 100644 --- a/drivers/net/ethernet/ti/netcp.h +++ b/drivers/net/ethernet/ti/netcp.h @@ -85,7 +85,6 @@ struct netcp_intf { struct list_head rxhook_list_head; unsigned int rx_queue_id; void *rx_fdq[KNAV_DMA_FDQ_PER_CHAN]; - u32 rx_buffer_sizes[KNAV_DMA_FDQ_PER_CHAN]; struct napi_struct rx_napi; struct napi_struct tx_napi; diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 9749dfd78c4349..4755838c6137b4 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -34,6 +34,7 @@ #define NETCP_SOP_OFFSET (NET_IP_ALIGN + NET_SKB_PAD) #define NETCP_NAPI_WEIGHT 64 #define NETCP_TX_TIMEOUT (5 * HZ) +#define NETCP_PACKET_SIZE (ETH_FRAME_LEN + ETH_FCS_LEN) #define NETCP_MIN_PACKET_SIZE ETH_ZLEN #define NETCP_MAX_MCAST_ADDR 16 @@ -804,30 +805,28 @@ static void netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) if (likely(fdq == 0)) { unsigned int primary_buf_len; /* Allocate a primary receive queue entry */ - buf_len = netcp->rx_buffer_sizes[0] + NETCP_SOP_OFFSET; + buf_len = NETCP_PACKET_SIZE + NETCP_SOP_OFFSET; primary_buf_len = SKB_DATA_ALIGN(buf_len) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - if (primary_buf_len <= PAGE_SIZE) { - bufptr = netdev_alloc_frag(primary_buf_len); - pad[1] = primary_buf_len; - } else { - bufptr = kmalloc(primary_buf_len, GFP_ATOMIC | - GFP_DMA32 | __GFP_COLD); - pad[1] = 0; - } + bufptr = netdev_alloc_frag(primary_buf_len); + pad[1] = primary_buf_len; if (unlikely(!bufptr)) { - dev_warn_ratelimited(netcp->ndev_dev, "Primary RX buffer alloc 
failed\n"); + dev_warn_ratelimited(netcp->ndev_dev, + "Primary RX buffer alloc failed\n"); goto fail; } dma = dma_map_single(netcp->dev, bufptr, buf_len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(netcp->dev, dma))) + goto fail; + pad[0] = (u32)bufptr; } else { /* Allocate a secondary receive queue entry */ - page = alloc_page(GFP_ATOMIC | GFP_DMA32 | __GFP_COLD); + page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD); if (unlikely(!page)) { dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); goto fail; @@ -1010,7 +1009,7 @@ netcp_tx_map_skb(struct sk_buff *skb, struct netcp_intf *netcp) /* Map the linear buffer */ dma_addr = dma_map_single(dev, skb->data, pkt_len, DMA_TO_DEVICE); - if (unlikely(!dma_addr)) { + if (unlikely(dma_mapping_error(dev, dma_addr))) { dev_err(netcp->ndev_dev, "Failed to map skb buffer\n"); return NULL; } @@ -1546,8 +1545,8 @@ static int netcp_setup_navigator_resources(struct net_device *ndev) knav_queue_disable_notify(netcp->rx_queue); /* open Rx FDQs */ - for (i = 0; i < KNAV_DMA_FDQ_PER_CHAN && - netcp->rx_queue_depths[i] && netcp->rx_buffer_sizes[i]; ++i) { + for (i = 0; i < KNAV_DMA_FDQ_PER_CHAN && netcp->rx_queue_depths[i]; + ++i) { snprintf(name, sizeof(name), "rx-fdq-%s-%d", ndev->name, i); netcp->rx_fdq[i] = knav_queue_open(name, KNAV_QUEUE_GP, 0); if (IS_ERR_OR_NULL(netcp->rx_fdq[i])) { @@ -1941,14 +1940,6 @@ static int netcp_create_interface(struct netcp_device *netcp_device, netcp->rx_queue_depths[0] = 128; } - ret = of_property_read_u32_array(node_interface, "rx-buffer-size", - netcp->rx_buffer_sizes, - KNAV_DMA_FDQ_PER_CHAN); - if (ret) { - dev_err(dev, "missing \"rx-buffer-size\" parameter\n"); - netcp->rx_buffer_sizes[0] = 1536; - } - ret = of_property_read_u32_array(node_interface, "rx-pool", temp, 2); if (ret < 0) { dev_err(dev, "missing \"rx-pool\" parameter\n"); From 4f7eb70f7be1099236670f36b2f1f90a63e29699 Mon Sep 17 00:00:00 2001 From: Mathieu Olivari Date: Tue, 4 Aug 2015 17:25:02 -0700 Subject: [PATCH 38/89] stmmac: dwmac-ipq806x: fix static checker warning The patch b1c17215d718: "stmmac: add ipq806x glue layer", leads to the following static checker warning: .../stmmac/dwmac-ipq806x.c:314 ipq806x_gmac_probe() warn: double left shift '1 << (1 << gmac->id)' The NSS_COMMON_CLK_SRC_CTRL_OFFSET macro is used once as an offset, and once as a mask, which is a bug indeed. We'll fix it by defining the offset as the real offset value and computing the mask from it when required. Tested on IPQ806x ref designs AP148 & DB149. Reported-by: Dan Carpenter Signed-off-by: Mathieu Olivari Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 7e3129e7f143a9..f0e4bb4e3ec59f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -42,7 +42,7 @@ #define NSS_COMMON_CLK_DIV_MASK 0x7f #define NSS_COMMON_CLK_SRC_CTRL 0x14 -#define NSS_COMMON_CLK_SRC_CTRL_OFFSET(x) (1 << x) +#define NSS_COMMON_CLK_SRC_CTRL_OFFSET(x) (x) /* Mode is coded on 1 bit but is different depending on the MAC ID: * MAC0: QSGMII=0 RGMII=1 * MAC1: QSGMII=0 SGMII=0 RGMII=1 @@ -291,7 +291,7 @@ static void *ipq806x_gmac_setup(struct platform_device *pdev) /* Configure the clock src according to the mode */ regmap_read(gmac->nss_common, NSS_COMMON_CLK_SRC_CTRL, &val); - val &= ~NSS_COMMON_CLK_SRC_CTRL_OFFSET(gmac->id); + val &= ~(1 << NSS_COMMON_CLK_SRC_CTRL_OFFSET(gmac->id)); switch (gmac->phy_mode) { case PHY_INTERFACE_MODE_RGMII: val |= NSS_COMMON_CLK_SRC_CTRL_RGMII(gmac->id) << From 48900cb6af4282fa0fb6ff4d72a81aa3dadb5c39 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 5 Aug 2015 10:34:04 +0800 Subject: [PATCH 39/89] virtio-net: drop NETIF_F_FRAGLIST virtio declares support for NETIF_F_FRAGLIST, but assumes that there are at most MAX_SKB_FRAGS + 2 fragments which isn't always true with a fraglist. A longer fraglist in the skb will make the call to skb_to_sgvec overflow the sg array, leading to memory corruption. Drop NETIF_F_FRAGLIST so we only get what we can handle. Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7fbca37a1adffe..237f8e5e493dda 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1756,9 +1756,9 @@ static int virtnet_probe(struct virtio_device *vdev) /* Do we support "hardware" checksums? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ - dev->hw_features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; + dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (csum) - dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; + dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO From bcc84140a62c04f522eacceb793e6eef92965c84 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 5 Aug 2015 03:27:48 -0400 Subject: [PATCH 40/89] be2net: enable IFACE filters only after creating RXQs HW issues were observed on Lancer adapters if IFACE filters (flags, mac addrs etc) are enabled *before* creating RXQs. This patch changes the driver design by enabling filters in be_open() -- instead of be_setup() -- after RXQs are created and buffers posted. Two new wrapper functions, be_enable_if_filters() and be_disable_if_filters() are introduced to enable/disable IFACE filters in be_open()/be_close() respectively. In be_setup() the IFACE is now created only with the RSS flag. Signed-off-by: Kalesh AP Signed-off-by: Sathya Perla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_cmds.h | 5 + drivers/net/ethernet/emulex/benet/be_main.c | 127 ++++++++++++-------- 2 files changed, 85 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index 2716e6f30d9a09..00e3a6b6b82254 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -620,6 +620,11 @@ enum be_if_flags { BE_IF_FLAGS_VLAN_PROMISCUOUS |\ BE_IF_FLAGS_MCAST_PROMISCUOUS) +#define BE_IF_EN_FLAGS (BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |\ + BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_UNTAGGED) + +#define BE_IF_ALL_FILT_FLAGS (BE_IF_EN_FLAGS | BE_IF_FLAGS_ALL_PROMISCUOUS) + /* An RX interface is an object with one or more MAC addresses and * filtering capabilities. */ struct be_cmd_req_if_create { diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 6f642426308c67..7730f21b60712d 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -273,6 +273,10 @@ static int be_mac_addr_set(struct net_device *netdev, void *p) if (ether_addr_equal(addr->sa_data, netdev->dev_addr)) return 0; + /* if device is not running, copy MAC to netdev->dev_addr */ + if (!netif_running(netdev)) + goto done; + /* The PMAC_ADD cmd may fail if the VF doesn't have FILTMGMT * privilege or if PF did not provision the new MAC address. * On BE3, this cmd will always fail if the VF doesn't have the @@ -307,9 +311,9 @@ static int be_mac_addr_set(struct net_device *netdev, void *p) status = -EPERM; goto err; } - - memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); - dev_info(dev, "MAC address changed to %pM\n", mac); +done: + ether_addr_copy(netdev->dev_addr, addr->sa_data); + dev_info(dev, "MAC address changed to %pM\n", addr->sa_data); return 0; err: dev_warn(dev, "MAC address change to %pM failed\n", addr->sa_data); @@ -3361,6 +3365,33 @@ static void be_rx_qs_destroy(struct be_adapter *adapter) } } +static void be_disable_if_filters(struct be_adapter *adapter) +{ + be_cmd_pmac_del(adapter, adapter->if_handle, + adapter->pmac_id[0], 0); + + be_clear_uc_list(adapter); + + /* The IFACE flags are enabled in the open path and cleared + * in the close path. When a VF gets detached from the host and + * assigned to a VM the following happens: + * - VF's IFACE flags get cleared in the detach path + * - IFACE create is issued by the VF in the attach path + * Due to a bug in the BE3/Skyhawk-R FW + * (Lancer FW doesn't have the bug), the IFACE capability flags + * specified along with the IFACE create cmd issued by a VF are not + * honoured by FW. As a consequence, if a *new* driver + * (that enables/disables IFACE flags in open/close) + * is loaded in the host and an *old* driver is * used by a VM/VF, + * the IFACE gets created *without* the needed flags. + * To avoid this, disable RX-filter flags only for Lancer. 
+ */ + if (lancer_chip(adapter)) { + be_cmd_rx_filter(adapter, BE_IF_ALL_FILT_FLAGS, OFF); + adapter->if_flags &= ~BE_IF_ALL_FILT_FLAGS; + } +} + static int be_close(struct net_device *netdev) { struct be_adapter *adapter = netdev_priv(netdev); @@ -3373,6 +3404,8 @@ static int be_close(struct net_device *netdev) if (!(adapter->flags & BE_FLAGS_SETUP_DONE)) return 0; + be_disable_if_filters(adapter); + be_roce_dev_close(adapter); if (adapter->flags & BE_FLAGS_NAPI_ENABLED) { @@ -3392,7 +3425,6 @@ static int be_close(struct net_device *netdev) be_tx_compl_clean(adapter); be_rx_qs_destroy(adapter); - be_clear_uc_list(adapter); for_all_evt_queues(adapter, eqo, i) { if (msix_enabled(adapter)) @@ -3477,6 +3509,31 @@ static int be_rx_qs_create(struct be_adapter *adapter) return 0; } +static int be_enable_if_filters(struct be_adapter *adapter) +{ + int status; + + status = be_cmd_rx_filter(adapter, BE_IF_EN_FLAGS, ON); + if (status) + return status; + + /* For BE3 VFs, the PF programs the initial MAC address */ + if (!(BEx_chip(adapter) && be_virtfn(adapter))) { + status = be_cmd_pmac_add(adapter, adapter->netdev->dev_addr, + adapter->if_handle, + &adapter->pmac_id[0], 0); + if (status) + return status; + } + + if (adapter->vlans_added) + be_vid_config(adapter); + + be_set_rx_mode(adapter->netdev); + + return 0; +} + static int be_open(struct net_device *netdev) { struct be_adapter *adapter = netdev_priv(netdev); @@ -3490,6 +3547,10 @@ static int be_open(struct net_device *netdev) if (status) goto err; + status = be_enable_if_filters(adapter); + if (status) + goto err; + status = be_irq_register(adapter); if (status) goto err; @@ -3686,16 +3747,6 @@ static void be_cancel_err_detection(struct be_adapter *adapter) } } -static void be_mac_clear(struct be_adapter *adapter) -{ - if (adapter->pmac_id) { - be_cmd_pmac_del(adapter, adapter->if_handle, - adapter->pmac_id[0], 0); - kfree(adapter->pmac_id); - adapter->pmac_id = NULL; - } -} - #ifdef CONFIG_BE2NET_VXLAN static void be_disable_vxlan_offloads(struct be_adapter *adapter) { @@ -3770,8 +3821,8 @@ static int be_clear(struct be_adapter *adapter) #ifdef CONFIG_BE2NET_VXLAN be_disable_vxlan_offloads(adapter); #endif - /* delete the primary mac along with the uc-mac list */ - be_mac_clear(adapter); + kfree(adapter->pmac_id); + adapter->pmac_id = NULL; be_cmd_if_destroy(adapter, adapter->if_handle, 0); @@ -3782,25 +3833,11 @@ static int be_clear(struct be_adapter *adapter) return 0; } -static int be_if_create(struct be_adapter *adapter, u32 *if_handle, - u32 cap_flags, u32 vf) -{ - u32 en_flags; - - en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | - BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS | - BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS; - - en_flags &= cap_flags; - - return be_cmd_if_create(adapter, cap_flags, en_flags, if_handle, vf); -} - static int be_vfs_if_create(struct be_adapter *adapter) { struct be_resources res = {0}; + u32 cap_flags, en_flags, vf; struct be_vf_cfg *vf_cfg; - u32 cap_flags, vf; int status; /* If a FW profile exists, then cap_flags are updated */ @@ -3821,8 +3858,12 @@ static int be_vfs_if_create(struct be_adapter *adapter) } } - status = be_if_create(adapter, &vf_cfg->if_handle, - cap_flags, vf + 1); + en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED | + BE_IF_FLAGS_BROADCAST | + BE_IF_FLAGS_MULTICAST | + BE_IF_FLAGS_PASS_L3L4_ERRORS); + status = be_cmd_if_create(adapter, cap_flags, en_flags, + &vf_cfg->if_handle, vf + 1); if (status) return status; } @@ -4194,15 +4235,8 @@ static int be_mac_setup(struct 
be_adapter *adapter) memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN); memcpy(adapter->netdev->perm_addr, mac, ETH_ALEN); - } else { - /* Maybe the HW was reset; dev_addr must be re-programmed */ - memcpy(mac, adapter->netdev->dev_addr, ETH_ALEN); } - /* For BE3-R VFs, the PF programs the initial MAC address */ - if (!(BEx_chip(adapter) && be_virtfn(adapter))) - be_cmd_pmac_add(adapter, mac, adapter->if_handle, - &adapter->pmac_id[0], 0); return 0; } @@ -4342,6 +4376,7 @@ static int be_func_init(struct be_adapter *adapter) static int be_setup(struct be_adapter *adapter) { struct device *dev = &adapter->pdev->dev; + u32 en_flags; int status; status = be_func_init(adapter); @@ -4364,8 +4399,11 @@ static int be_setup(struct be_adapter *adapter) if (status) goto err; - status = be_if_create(adapter, &adapter->if_handle, - be_if_cap_flags(adapter), 0); + /* will enable all the needed filter flags in be_open() */ + en_flags = BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS; + en_flags = en_flags & be_if_cap_flags(adapter); + status = be_cmd_if_create(adapter, be_if_cap_flags(adapter), en_flags, + &adapter->if_handle, 0); if (status) goto err; @@ -4391,11 +4429,6 @@ static int be_setup(struct be_adapter *adapter) dev_err(dev, "Please upgrade firmware to version >= 4.0\n"); } - if (adapter->vlans_added) - be_vid_config(adapter); - - be_set_rx_mode(adapter->netdev); - status = be_cmd_set_flow_control(adapter, adapter->tx_fc, adapter->rx_fc); if (status) From 99b44304f205a826501721d41928e87b0b9cf3b3 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 5 Aug 2015 03:27:49 -0400 Subject: [PATCH 41/89] be2net: post buffers before destroying RXQs in Lancer An RX stall issue was seen on Lancer adapters, when RXQs are destroyed while they are in an "out of buffer" state. This patch fixes this issue by posting 64 buffers to each RXQ before destroying them in the close path. This is done after ensuring that no more new packets are selected for transfer to the RXQs by disabling interface filters. Signed-off-by: Kalesh AP Signed-off-by: Sathya Perla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 42 +++++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7730f21b60712d..14ae67a8949e39 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2451,10 +2451,24 @@ static void be_eq_clean(struct be_eq_obj *eqo) be_eq_notify(eqo->adapter, eqo->q.id, false, true, num, 0); } -static void be_rx_cq_clean(struct be_rx_obj *rxo) +/* Free posted rx buffers that were not used */ +static void be_rxq_clean(struct be_rx_obj *rxo) { - struct be_rx_page_info *page_info; struct be_queue_info *rxq = &rxo->q; + struct be_rx_page_info *page_info; + + while (atomic_read(&rxq->used) > 0) { + page_info = get_rx_page_info(rxo); + put_page(page_info->page); + memset(page_info, 0, sizeof(*page_info)); + } + BUG_ON(atomic_read(&rxq->used)); + rxq->tail = 0; + rxq->head = 0; +} + +static void be_rx_cq_clean(struct be_rx_obj *rxo) +{ struct be_queue_info *rx_cq = &rxo->cq; struct be_rx_compl_info *rxcp; struct be_adapter *adapter = rxo->adapter; @@ -2491,16 +2505,6 @@ static void be_rx_cq_clean(struct be_rx_obj *rxo) /* After cleanup, leave the CQ in unarmed state */ be_cq_notify(adapter, rx_cq->id, false, 0); - - /* Then free posted rx buffers that were not used */ - while (atomic_read(&rxq->used) > 0) { - page_info = get_rx_page_info(rxo); - put_page(page_info->page); - memset(page_info, 0, sizeof(*page_info)); - } - BUG_ON(atomic_read(&rxq->used)); - rxq->tail = 0; - rxq->head = 0; } static void be_tx_compl_clean(struct be_adapter *adapter) @@ -3358,8 +3362,22 @@ static void be_rx_qs_destroy(struct be_adapter *adapter) for_all_rx_queues(adapter, rxo, i) { q = &rxo->q; if (q->created) { + /* If RXQs are destroyed while in an "out of buffer" + * state, there is a possibility of an HW stall on + * Lancer. So, post 64 buffers to each queue to relieve + * the "out of buffer" condition. + * Make sure there's space in the RXQ before posting. + */ + if (lancer_chip(adapter)) { + be_rx_cq_clean(rxo); + if (atomic_read(&q->used) == 0) + be_post_rx_frags(rxo, GFP_KERNEL, + MAX_RX_POST); + } + be_cmd_rxq_destroy(adapter, q); be_rx_cq_clean(rxo); + be_rxq_clean(rxo); } be_queue_free(adapter, q); } From 649886a36b5f023811321819eceaa8ba66444e3b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 5 Aug 2015 03:27:50 -0400 Subject: [PATCH 42/89] be2net: protect eqo->affinity_mask from getting freed twice There are paths in the driver such as an unrecoverable error (UE) detection followed by a driver unload wherein be_clear() is invoked twice. Individual data structures are reset so that they are not cleaned/freed twice. This patch does the same for eqo->affinity_mask. It is freed only if EQs haven't yet been destroyed. This fixes a possible crash when affinity_mask is freed twice. Signed-off-by: Kalesh AP Signed-off-by: Sathya Perla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 14ae67a8949e39..c28e3bfdccd75d 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2584,8 +2584,8 @@ static void be_evt_queues_destroy(struct be_adapter *adapter) be_cmd_q_destroy(adapter, &eqo->q, QTYPE_EQ); napi_hash_del(&eqo->napi); netif_napi_del(&eqo->napi); + free_cpumask_var(eqo->affinity_mask); } - free_cpumask_var(eqo->affinity_mask); be_queue_free(adapter, &eqo->q); } } @@ -2602,13 +2602,7 @@ static int be_evt_queues_create(struct be_adapter *adapter) for_all_evt_queues(adapter, eqo, i) { int numa_node = dev_to_node(&adapter->pdev->dev); - if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_set_cpu(cpumask_local_spread(i, numa_node), - eqo->affinity_mask); - netif_napi_add(adapter->netdev, &eqo->napi, be_poll, - BE_NAPI_WEIGHT); - napi_hash_add(&eqo->napi); + aic = &adapter->aic_obj[i]; eqo->adapter = adapter; eqo->idx = i; @@ -2624,6 +2618,14 @@ static int be_evt_queues_create(struct be_adapter *adapter) rc = be_cmd_eq_create(adapter, eqo); if (rc) return rc; + + if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_set_cpu(cpumask_local_spread(i, numa_node), + eqo->affinity_mask); + netif_napi_add(adapter->netdev, &eqo->napi, be_poll, + BE_NAPI_WEIGHT); + napi_hash_add(&eqo->napi); } return 0; } From 998ef5d81c74c752d74c7925bc370909b84adb9d Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Thu, 6 Aug 2015 15:07:04 +0100 Subject: [PATCH 43/89] ARM: 8408/1: Fix the secondary_startup function in Big Endian case Since the commit "b2c3e38a5471 ARM: redo TTBR setup code for LPAE", the setup code had been reworked. As a result the secondary CPUs failed to come online in Big Endian. As explained by Russell, the new code expected the value in r4/r5 to be the least significant 32bits in r4 and the most significant 32bits in r5. However, in the secondary code, we load this using ldrd, which on BE reverses that. This patch swap r4/r5 after the ldrd. It is done using the xor instructions in order to not use a temporary register. Signed-off-by: Gregory CLEMENT Signed-off-by: Russell King --- arch/arm/kernel/head.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index bd755d97e459d7..29e2991465cb27 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -399,6 +399,9 @@ ENTRY(secondary_startup) sub lr, r4, r5 @ mmu has been enabled add r3, r7, lr ldrd r4, [r3, #0] @ get secondary_data.pgdir +ARM_BE8(eor r4, r4, r5) @ Swap r5 and r4 in BE: +ARM_BE8(eor r5, r4, r5) @ it can be done in 3 steps +ARM_BE8(eor r4, r4, r5) @ without using a temp reg. ldr r8, [r3, #8] @ get secondary_data.swapper_pg_dir badr lr, __enable_mmu @ return address mov r13, r12 @ __secondary_switched address From e83dd3770021910293edea6fb2dc2fa306b1bf34 Mon Sep 17 00:00:00 2001 From: Drew Richardson Date: Thu, 6 Aug 2015 18:50:27 +0100 Subject: [PATCH 44/89] ARM: 8409/1: Mark ret_fast_syscall as a function ret_fast_syscall runs when user space makes a syscall. However it needs to be marked as such so the ELF information is correct. 
Before it was: 101: 8000f300 0 NOTYPE LOCAL DEFAULT 2 ret_fast_syscall But with this change it correctly shows as: 101: 8000f300 96 FUNC LOCAL DEFAULT 2 ret_fast_syscall I see this function when using perf to unwind call stacks from kernel space to user space. Without this change I would need to add some special case logic when using the vmlinux ELF information. Signed-off-by: Drew Richardson Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/kernel/entry-common.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 92828a1dec80c1..b48dd4f37f8067 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -61,6 +61,7 @@ work_pending: movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE) ldmia sp, {r0 - r6} @ have to reload r0 - r6 b local_restart @ ... and off we go +ENDPROC(ret_fast_syscall) /* * "slow" syscall return path. "why" tells us if this was a real syscall. From fe1e1876d8f6d8d4b45e3940e6dd43cd3b18d958 Mon Sep 17 00:00:00 2001 From: Carol L Soto Date: Wed, 5 Aug 2015 11:05:32 -0500 Subject: [PATCH 45/89] net/mlx5_core: Set log_uar_page_sz for non 4K page size architecture failed to configure the page size for architectures with page size different than 4K. Fixes: 938fe83 ("net/mlx5_core: New device capabilities handling") Signed-off-by: Carol L Soto Acked-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index afad529838de74..06e3e1e54c35d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -391,6 +391,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) /* disable cmdif checksum */ MLX5_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0); + MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12); + err = set_caps(dev, set_ctx, set_sz); query_ex: From 2701fa0864ecb9e49d47a4aa1c02f172ab79639a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 28 Jul 2015 15:31:12 +0200 Subject: [PATCH 46/89] fbdev: select versatile helpers for the integrator Commit 11c32d7b6274cb0f554943d65bd4a126c4a86dcd "video: move Versatile CLCD helpers" missed the fact that the Integrator/CP is also using the helper, and as a result the platform got only stubs and no graphics. Add this as a default selection to Kconfig so we have graphics again. Fixes: 11c32d7b6274 (video: move Versatile CLCD helpers) Signed-off-by: Linus Walleij Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 2d98de535e0f73..f888561568d917 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -298,7 +298,7 @@ config FB_ARMCLCD # Helper logic selected only by the ARM Versatile platform family. config PLAT_VERSATILE_CLCD - def_bool ARCH_VERSATILE || ARCH_REALVIEW || ARCH_VEXPRESS + def_bool ARCH_VERSATILE || ARCH_REALVIEW || ARCH_VEXPRESS || ARCH_INTEGRATOR depends on ARM depends on FB_ARMCLCD && FB=y From 2b55cb3b04684f131a01faf2eb8e4d822d293f24 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 7 Aug 2015 14:04:29 +0300 Subject: [PATCH 47/89] OMAPDSS: Fix node refcount leak in omapdss_of_get_next_port() Fix node refcount leak in omapdss_of_get_next_port(). 
Signed-off-by: Jyri Sarha Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/omap2/dss/dss-of.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/omap2/dss/dss-of.c b/drivers/video/fbdev/omap2/dss/dss-of.c index 928ee639c0c19b..c8c065d92b5924 100644 --- a/drivers/video/fbdev/omap2/dss/dss-of.c +++ b/drivers/video/fbdev/omap2/dss/dss-of.c @@ -60,6 +60,8 @@ omapdss_of_get_next_port(const struct device_node *parent, } prev = port; } while (of_node_cmp(port->name, "port") != 0); + + of_node_put(ports); } return port; From 6266f4b19d341c531d447d689fb44609daa06c79 Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Fri, 7 Aug 2015 14:04:30 +0300 Subject: [PATCH 48/89] OMAPDSS: Fix omap_dss_find_output_by_port_node() port refcount decrement Fix omap_dss_find_output_by_port_node() port parameter refcount decrementation. The only user of dss_of_port_get_parent_device() function is omap_dss_find_output_by_port_node() and it assumes the refcount of the port parameter is not decremented by the call. Signed-off-by: Jyri Sarha Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/omap2/dss/dss-of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/omap2/dss/dss-of.c b/drivers/video/fbdev/omap2/dss/dss-of.c index c8c065d92b5924..bf407b6ba15ca0 100644 --- a/drivers/video/fbdev/omap2/dss/dss-of.c +++ b/drivers/video/fbdev/omap2/dss/dss-of.c @@ -96,7 +96,7 @@ struct device_node *dss_of_port_get_parent_device(struct device_node *port) if (!port) return NULL; - np = of_get_next_parent(port); + np = of_get_parent(port); for (i = 0; i < 2 && np; ++i) { struct property *prop; From 9e6e35edb330619fbfa3457eff1d15d3672c833a Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 3 Aug 2015 22:15:34 +0200 Subject: [PATCH 49/89] video: fbdev: pxa3xx_gcu: prepare the clocks The clocks need to be prepared before being enabled. 
Without it a warning appears in the drivers probe path : WARNING: CPU: 0 PID: 1 at drivers/clk/clk.c:707 clk_core_enable+0x84/0xa0() Modules linked in: CPU: 0 PID: 1 Comm: swapper Not tainted 4.2.0-rc3-cm-x300+ #804 Hardware name: CM-X300 module [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (warn_slowpath_common+0x7c/0xb4) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x1c/0x24) [] (warn_slowpath_null) from [] (clk_core_enable+0x84/0xa0) [] (clk_core_enable) from [] (clk_enable+0x20/0x34) [] (clk_enable) from [] (pxa3xx_gcu_probe+0x148/0x338) [] (pxa3xx_gcu_probe) from [] (platform_drv_probe+0x30/0x94) Signed-off-by: Robert Jarzmik Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/pxa3xx-gcu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/pxa3xx-gcu.c b/drivers/video/fbdev/pxa3xx-gcu.c index 86bd457d039d2a..50bce45e7f3d47 100644 --- a/drivers/video/fbdev/pxa3xx-gcu.c +++ b/drivers/video/fbdev/pxa3xx-gcu.c @@ -653,7 +653,7 @@ static int pxa3xx_gcu_probe(struct platform_device *pdev) goto err_free_dma; } - ret = clk_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); if (ret < 0) { dev_err(dev, "failed to enable clock\n"); goto err_misc_deregister; @@ -685,7 +685,7 @@ static int pxa3xx_gcu_probe(struct platform_device *pdev) misc_deregister(&priv->misc_dev); err_disable_clk: - clk_disable(priv->clk); + clk_disable_unprepare(priv->clk); return ret; } From 96fffb4f23f124f297d51dedc9cf51d19eb88ee1 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sun, 9 Aug 2015 13:14:15 +0200 Subject: [PATCH 50/89] netfilter: ip6t_SYNPROXY: fix NULL pointer dereference This happens when networking namespaces are enabled. Suggested-by: Patrick McHardy Signed-off-by: Phil Sutter Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_SYNPROXY.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 6edb7b106de769..bcebc24c6f0b42 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -37,12 +37,13 @@ synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, } static void -synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +synproxy_send_tcp(const struct synproxy_net *snet, + const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct ipv6hdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { - struct net *net = nf_ct_net((struct nf_conn *)nfct); + struct net *net = nf_ct_net(snet->tmpl); struct dst_entry *dst; struct flowi6 fl6; @@ -83,7 +84,8 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, } static void -synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +synproxy_send_client_synack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; @@ -119,7 +121,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -163,7 +165,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + 
synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } @@ -203,7 +205,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void @@ -241,7 +243,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static bool @@ -301,7 +303,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(skb, th, &opts); + synproxy_send_client_synack(snet, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { From 3c16241c445303a90529565e7437e1f240acfef2 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 28 Jul 2015 00:53:26 +0200 Subject: [PATCH 51/89] netfilter: SYNPROXY: fix sending window update to client Upon receipt of SYNACK from the server, ipt_SYNPROXY first sends back an ACK to finish the server handshake, then calls nf_ct_seqadj_init() to initiate sequence number adjustment of forwarded packets to the client and finally sends a window update to the client to unblock it's TX queue. Since synproxy_send_client_ack() does not set synproxy_send_tcp()'s nfct parameter, no sequence number adjustment happens and the client receives the window update with incorrect sequence number. Depending on client TCP implementation, this leads to a significant delay (until a window probe is being sent). Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_SYNPROXY.c | 3 ++- net/ipv6/netfilter/ip6t_SYNPROXY.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index fe8cc183411e05..95ea633e8356eb 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -226,7 +226,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index bcebc24c6f0b42..ebbb754c2111b7 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -243,7 +243,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool From 37b617f9be527b1f1181e3ff23df4cdbe3e6441f Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Sat, 11 Jul 2015 19:46:11 +0200 Subject: [PATCH 52/89] video: Fix possible leak in of_get_videomode() In case videomode_from_timings() fails in function of_get_videomode(), the allocated display timing data is not freed in the exit path. Make sure that display_timings_release() is called in any case. Detected by Coverity CID 1309681. 
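For context, the cleanup pattern the fix adopts (release unconditionally, then return the saved status) can be sketched in a few lines of standalone C; the names below are placeholders, not the real videomode/of_videomode API.

#include <stdlib.h>

struct timings { int native_mode; };

static int convert(const struct timings *t, int index)
{
	return (!t || index < 0) ? -22 /* -EINVAL */ : 0;
}

/* Buggy shape: the early return on error skips the release. */
static int get_mode_leaky(int index)
{
	struct timings *t = calloc(1, sizeof(*t));
	int ret;

	if (!t)
		return -12;	/* -ENOMEM */
	ret = convert(t, index);
	if (ret)
		return ret;	/* 't' leaks here */
	free(t);
	return 0;
}

/* Fixed shape: release on both success and failure, then return 'ret'. */
static int get_mode_fixed(int index)
{
	struct timings *t = calloc(1, sizeof(*t));
	int ret;

	if (!t)
		return -12;	/* -ENOMEM */
	ret = convert(t, index);
	free(t);
	return ret;
}

int main(void)
{
	get_mode_leaky(-1);	/* leaks one allocation */
	return get_mode_fixed(-1) == -22 ? 0 : 1;
}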
Signed-off-by: Christian Engelmayer Signed-off-by: Tomi Valkeinen --- drivers/video/of_videomode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/video/of_videomode.c b/drivers/video/of_videomode.c index 111c2d1911d32e..b5102aa6090d11 100644 --- a/drivers/video/of_videomode.c +++ b/drivers/video/of_videomode.c @@ -44,11 +44,9 @@ int of_get_videomode(struct device_node *np, struct videomode *vm, index = disp->native_mode; ret = videomode_from_timings(disp, vm, index); - if (ret) - return ret; display_timings_release(disp); - return 0; + return ret; } EXPORT_SYMBOL_GPL(of_get_videomode); From 2a17d7e80f1df44d6b94c3696d5eda44fe6638a8 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Tue, 4 Aug 2015 12:33:32 +0000 Subject: [PATCH 53/89] fbcon: unconditionally initialize cursor blink interval A sun7i-a20-olinuxino-micro fails to boot when kernel parameter vt.global_cursor_default=0. The value is copied to vc->vc_deccm causing the initialization of ops->cur_blink_jiffies to be skipped. Unconditionally initialize it. Reported-and-tested-by: Jonathan Liu Signed-off-by: Scot Doyle Acked-by: Pavel Machek Signed-off-by: Tomi Valkeinen --- drivers/video/console/fbcon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 658c34bb9076f8..1aaf89300621ab 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -1306,10 +1306,11 @@ static void fbcon_cursor(struct vc_data *vc, int mode) int y; int c = scr_readw((u16 *) vc->vc_pos); + ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); + if (fbcon_is_inactive(vc, info) || vc->vc_deccm != 1) return; - ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); if (vc->vc_cursor_type & 0x10) fbcon_del_cursor_timer(info); else From fc5fee86bdd3d720e2d1d324e4fae0c35845fa63 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 10 Aug 2015 15:40:27 +0200 Subject: [PATCH 54/89] x86/xen: build "Xen PV" APIC driver for domU as well It turns out that a PV domU also requires the "Xen PV" APIC driver. Otherwise, the flat driver is used and we get stuck in busy loops that never exit, such as in this stack trace: (gdb) target remote localhost:9999 Remote debugging using localhost:9999 __xapic_wait_icr_idle () at ./arch/x86/include/asm/ipi.h:56 56 while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) (gdb) bt #0 __xapic_wait_icr_idle () at ./arch/x86/include/asm/ipi.h:56 #1 __default_send_IPI_shortcut (shortcut=, dest=, vector=) at ./arch/x86/include/asm/ipi.h:75 #2 apic_send_IPI_self (vector=246) at arch/x86/kernel/apic/probe_64.c:54 #3 0xffffffff81011336 in arch_irq_work_raise () at arch/x86/kernel/irq_work.c:47 #4 0xffffffff8114990c in irq_work_queue (work=0xffff88000fc0e400) at kernel/irq_work.c:100 #5 0xffffffff8110c29d in wake_up_klogd () at kernel/printk/printk.c:2633 #6 0xffffffff8110ca60 in vprintk_emit (facility=0, level=, dict=0x0 , dictlen=, fmt=, args=) at kernel/printk/printk.c:1778 #7 0xffffffff816010c8 in printk (fmt=) at kernel/printk/printk.c:1868 #8 0xffffffffc00013ea in ?? () #9 0x0000000000000000 in ?? () Mailing-list-thread: https://lkml.org/lkml/2015/8/4/755 Signed-off-by: Jason A. 
Donenfeld Cc: Signed-off-by: David Vrabel --- arch/x86/xen/Makefile | 4 ++-- arch/x86/xen/xen-ops.h | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7322755f337af7..4b6e29ac0968c1 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,13 +13,13 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o + p2m.o apic.o obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += apic.o vga.o +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e195c683549dc..bef30cbb56c477 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -101,17 +101,15 @@ struct dom0_vga_console_info; #ifdef CONFIG_XEN_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); -void __init xen_init_apic(void); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) { } -static inline void __init xen_init_apic(void) -{ -} #endif +void __init xen_init_apic(void); + #ifdef CONFIG_XEN_EFI extern void xen_efi_init(void); #else From 878854a374620a3f5e8c0a3c418e82a429bc2cff Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 7 Aug 2015 21:03:23 -0500 Subject: [PATCH 55/89] arm64: VDSO: fix coarse clock monotonicity regression Since 906c55579a63 ("timekeeping: Copy the shadow-timekeeper over the real timekeeper last") it has become possible on arm64 to: - Obtain a CLOCK_MONOTONIC_COARSE or CLOCK_REALTIME_COARSE timestamp via syscall. - Subsequently obtain a timestamp for the same clock ID via VDSO which predates the first timestamp (by one jiffy). This is because arm64's update_vsyscall is deriving the coarse time using the __current_kernel_time interface, when it should really be using the timekeeper object provided to it by the timekeeping core. It happened to work before only because __current_kernel_time would access the same timekeeper object which had been passed to update_vsyscall. This is no longer the case. 
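To make the fix easier to follow, here is a standalone sketch of the bookkeeping involved: the timekeeper stores monotonic nanoseconds left-shifted by tkr_mono.shift, so the coarse values published to the vDSO should be derived from the timekeeper passed to update_vsyscall() and shifted back down. The field names mirror the kernel's struct timekeeper, but the values and the program itself are illustrative only.

#include <stdio.h>
#include <stdint.h>

struct tkr {
	uint64_t xtime_nsec;	/* nanoseconds << shift */
	uint32_t shift;
};

struct timekeeper {
	int64_t xtime_sec;
	struct tkr tkr_mono;
};

int main(void)
{
	struct timekeeper tk = {
		.xtime_sec = 1439164800,	/* arbitrary example values */
		.tkr_mono = { .xtime_nsec = 123456789ULL << 8, .shift = 8 },
	};

	/* What update_vsyscall() publishes for the coarse clocks after the fix. */
	int64_t coarse_sec = tk.xtime_sec;
	uint64_t coarse_nsec = tk.tkr_mono.xtime_nsec >> tk.tkr_mono.shift;

	printf("coarse time: %lld.%09llu\n",
	       (long long)coarse_sec, (unsigned long long)coarse_nsec);
	return 0;
}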
Signed-off-by: Nathan Lynch Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index ec37ab3f524f30..97bc68f4c689f2 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -199,16 +199,15 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, */ void update_vsyscall(struct timekeeper *tk) { - struct timespec xtime_coarse; u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter"); ++vdso_data->tb_seq_count; smp_wmb(); - xtime_coarse = __current_kernel_time(); vdso_data->use_syscall = use_syscall; - vdso_data->xtime_coarse_sec = xtime_coarse.tv_sec; - vdso_data->xtime_coarse_nsec = xtime_coarse.tv_nsec; + vdso_data->xtime_coarse_sec = tk->xtime_sec; + vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift; vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; From d53793c5d6eb0cfe4175d9c315a30d65adf3478b Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Thu, 6 Aug 2015 19:00:28 +0200 Subject: [PATCH 56/89] net: mvpp2: remove excessive spinlocks from driver initialization Using spinlocks protection during one-time driver initialization is not necessary. Moreover it resulted in invalid GFP_KERNEL allocation under the lock. This commit removes redundant spinlocks from buffer manager part of mvpp2 initialization. Signed-off-by: Marcin Wojtas Reported-by: Alexandre Fournier Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 3e8b1bfb1f2e31..f94bd122f0bd53 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -913,8 +913,6 @@ struct mvpp2_bm_pool { /* Occupied buffers indicator */ atomic_t in_use; int in_use_thresh; - - spinlock_t lock; }; struct mvpp2_buff_hdr { @@ -3376,7 +3374,6 @@ static int mvpp2_bm_pool_create(struct platform_device *pdev, bm_pool->pkt_size = 0; bm_pool->buf_num = 0; atomic_set(&bm_pool->in_use, 0); - spin_lock_init(&bm_pool->lock); return 0; } @@ -3647,7 +3644,6 @@ static struct mvpp2_bm_pool * mvpp2_bm_pool_use(struct mvpp2_port *port, int pool, enum mvpp2_bm_type type, int pkt_size) { - unsigned long flags = 0; struct mvpp2_bm_pool *new_pool = &port->priv->bm_pools[pool]; int num; @@ -3656,8 +3652,6 @@ mvpp2_bm_pool_use(struct mvpp2_port *port, int pool, enum mvpp2_bm_type type, return NULL; } - spin_lock_irqsave(&new_pool->lock, flags); - if (new_pool->type == MVPP2_BM_FREE) new_pool->type = type; @@ -3686,8 +3680,6 @@ mvpp2_bm_pool_use(struct mvpp2_port *port, int pool, enum mvpp2_bm_type type, if (num != pkts_num) { WARN(1, "pool %d: %d of %d allocated\n", new_pool->id, num, pkts_num); - /* We need to undo the bufs_add() allocations */ - spin_unlock_irqrestore(&new_pool->lock, flags); return NULL; } } @@ -3695,15 +3687,12 @@ mvpp2_bm_pool_use(struct mvpp2_port *port, int pool, enum mvpp2_bm_type type, mvpp2_bm_pool_bufsize_set(port->priv, new_pool, MVPP2_RX_BUF_SIZE(new_pool->pkt_size)); - spin_unlock_irqrestore(&new_pool->lock, flags); - return new_pool; } /* Initialize pools for swf */ static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port) { - unsigned long flags = 0; int rxq; if (!port->pool_long) { @@ -3714,9 +3703,7 @@ static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port) if 
(!port->pool_long) return -ENOMEM; - spin_lock_irqsave(&port->pool_long->lock, flags); port->pool_long->port_map |= (1 << port->id); - spin_unlock_irqrestore(&port->pool_long->lock, flags); for (rxq = 0; rxq < rxq_number; rxq++) mvpp2_rxq_long_pool_set(port, rxq, port->pool_long->id); @@ -3730,9 +3717,7 @@ static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port) if (!port->pool_short) return -ENOMEM; - spin_lock_irqsave(&port->pool_short->lock, flags); port->pool_short->port_map |= (1 << port->id); - spin_unlock_irqrestore(&port->pool_short->lock, flags); for (rxq = 0; rxq < rxq_number; rxq++) mvpp2_rxq_short_pool_set(port, rxq, From 71ce391dfb7843f4d31abcd7287967a00cb1b8a1 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Thu, 6 Aug 2015 19:00:29 +0200 Subject: [PATCH 57/89] net: mvpp2: enable proper per-CPU TX buffers unmapping The mvpp2 driver allows usage of per-CPU TX processing. Once the packets are prepared independently on each CPU, the hardware enqueues the descriptors in a common TX queue. After they are sent, the buffers and associated sk_buffs should be released on the corresponding CPU. This is why a special index is maintained in order to point to the right data to be released after transmission takes place. Each per-CPU TX queue comprises an array of sent sk_buffs, freed in the mvpp2_txq_bufs_free() function. However, the index was used there also for obtaining a descriptor (and therefore a buffer to be DMA-unmapped) from the common TX queue, which was wrong, because it was not referring to the current CPU. This commit enables proper unmapping of sent data buffers by indexing them in per-CPU queues using a dedicated array for keeping their physical addresses. Signed-off-by: Marcin Wojtas Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 52 ++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index f94bd122f0bd53..3e25d31414bc55 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -776,6 +776,9 @@ struct mvpp2_txq_pcpu { /* Array of transmitted skb */ struct sk_buff **tx_skb; + /* Array of transmitted buffers' physical addresses */ + dma_addr_t *tx_buffs; + /* Index of last TX DMA descriptor that was inserted */ int txq_put_index; @@ -961,9 +964,13 @@ static void mvpp2_txq_inc_get(struct mvpp2_txq_pcpu *txq_pcpu) } static void mvpp2_txq_inc_put(struct mvpp2_txq_pcpu *txq_pcpu, - struct sk_buff *skb) + struct sk_buff *skb, + struct mvpp2_tx_desc *tx_desc) { txq_pcpu->tx_skb[txq_pcpu->txq_put_index] = skb; + if (skb) + txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] = + tx_desc->buf_phys_addr; txq_pcpu->txq_put_index++; if (txq_pcpu->txq_put_index == txq_pcpu->size) txq_pcpu->txq_put_index = 0; @@ -4392,8 +4399,8 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, int i; for (i = 0; i < num; i++) { - struct mvpp2_tx_desc *tx_desc = txq->descs + - txq_pcpu->txq_get_index; + dma_addr_t buf_phys_addr = + txq_pcpu->tx_buffs[txq_pcpu->txq_get_index]; struct sk_buff *skb = txq_pcpu->tx_skb[txq_pcpu->txq_get_index]; mvpp2_txq_inc_get(txq_pcpu); @@ -4401,8 +4408,8 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, if (!skb) continue; - dma_unmap_single(port->dev->dev.parent, tx_desc->buf_phys_addr, - tx_desc->data_size, DMA_TO_DEVICE); + dma_unmap_single(port->dev->dev.parent, buf_phys_addr, + skb_headlen(skb), DMA_TO_DEVICE); dev_kfree_skb_any(skb); } } @@ -4634,12 +4641,13 @@ static int
mvpp2_txq_init(struct mvpp2_port *port, txq_pcpu->tx_skb = kmalloc(txq_pcpu->size * sizeof(*txq_pcpu->tx_skb), GFP_KERNEL); - if (!txq_pcpu->tx_skb) { - dma_free_coherent(port->dev->dev.parent, - txq->size * MVPP2_DESC_ALIGNED_SIZE, - txq->descs, txq->descs_phys); - return -ENOMEM; - } + if (!txq_pcpu->tx_skb) + goto error; + + txq_pcpu->tx_buffs = kmalloc(txq_pcpu->size * + sizeof(dma_addr_t), GFP_KERNEL); + if (!txq_pcpu->tx_buffs) + goto error; txq_pcpu->count = 0; txq_pcpu->reserved_num = 0; @@ -4648,6 +4656,19 @@ static int mvpp2_txq_init(struct mvpp2_port *port, } return 0; + +error: + for_each_present_cpu(cpu) { + txq_pcpu = per_cpu_ptr(txq->pcpu, cpu); + kfree(txq_pcpu->tx_skb); + kfree(txq_pcpu->tx_buffs); + } + + dma_free_coherent(port->dev->dev.parent, + txq->size * MVPP2_DESC_ALIGNED_SIZE, + txq->descs, txq->descs_phys); + + return -ENOMEM; } /* Free allocated TXQ resources */ @@ -4660,6 +4681,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port, for_each_present_cpu(cpu) { txq_pcpu = per_cpu_ptr(txq->pcpu, cpu); kfree(txq_pcpu->tx_skb); + kfree(txq_pcpu->tx_buffs); } if (txq->descs) @@ -5129,11 +5151,11 @@ static int mvpp2_tx_frag_process(struct mvpp2_port *port, struct sk_buff *skb, if (i == (skb_shinfo(skb)->nr_frags - 1)) { /* Last descriptor */ tx_desc->command = MVPP2_TXD_L_DESC; - mvpp2_txq_inc_put(txq_pcpu, skb); + mvpp2_txq_inc_put(txq_pcpu, skb, tx_desc); } else { /* Descriptor in the middle: Not First, Not Last */ tx_desc->command = 0; - mvpp2_txq_inc_put(txq_pcpu, NULL); + mvpp2_txq_inc_put(txq_pcpu, NULL, tx_desc); } } @@ -5199,12 +5221,12 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev) /* First and Last descriptor */ tx_cmd |= MVPP2_TXD_F_DESC | MVPP2_TXD_L_DESC; tx_desc->command = tx_cmd; - mvpp2_txq_inc_put(txq_pcpu, skb); + mvpp2_txq_inc_put(txq_pcpu, skb, tx_desc); } else { /* First but not Last */ tx_cmd |= MVPP2_TXD_F_DESC | MVPP2_TXD_PADDING_DISABLE; tx_desc->command = tx_cmd; - mvpp2_txq_inc_put(txq_pcpu, NULL); + mvpp2_txq_inc_put(txq_pcpu, NULL, tx_desc); /* Continue with other skb fragments */ if (mvpp2_tx_frag_process(port, skb, aggr_txq, txq)) { From edc660fa09e2295b6ee2d2bf742c2a72dfeb18d2 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Thu, 6 Aug 2015 19:00:30 +0200 Subject: [PATCH 58/89] net: mvpp2: replace TX coalescing interrupts with hrtimer The PP2 controller is capable of per-CPU TX processing, which means there are per-CPU banked register sets and queues. The current version of the driver supports TX packet coalescing - once the number of packets sent on a given CPU reaches a threshold value, an IRQ occurs. However, there is a single interrupt line responsible for CPU0/1 TX and RX events (the latter is not per-CPU, the hardware does not support RSS). When the top half executes, the interrupt cause is not known. This is why, in the NAPI poll function, along with RX processing, the IRQ cause register on both CPUs is accessed in order to determine on which of them the TX coalescing threshold might have been reached. Thus the egress processing and the releasing of buffers can take place on the corresponding CPU. This hitherto approach led to an illegal usage of the on_each_cpu() function in softirq context. The problem is solved by giving up TX coalescing interrupts and separating egress finalization from NAPI processing. For that purpose, an hrtimer-based method is introduced. In the main transmit function (mvpp2_tx), buffers are released once a software coalescing threshold is reached.
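To make the deferral mechanism concrete, here is a minimal, self-contained sketch of the same hrtimer-plus-tasklet pattern; all names (my_pcpu, my_tx_done_cb, ...) are hypothetical and this is not the driver's code - the real implementation is in the diff further below.

/*
 * Minimal sketch of the hrtimer + tasklet deferral pattern described
 * above.  Hypothetical names; see the patch diff below for the real code.
 */
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/ktime.h>

#define MY_TXDONE_PERIOD_NS	1000000UL	/* 1 ms, as in the patch */

struct my_pcpu {
	struct hrtimer tx_done_timer;
	bool timer_scheduled;		/* at most one pending timer per CPU */
	struct tasklet_struct tx_done_tasklet;
};

/* hrtimer callback, hard-irq context: only kick the tasklet. */
static enum hrtimer_restart my_hr_timer_cb(struct hrtimer *t)
{
	struct my_pcpu *pcpu = container_of(t, struct my_pcpu, tx_done_timer);

	tasklet_schedule(&pcpu->tx_done_tasklet);
	return HRTIMER_NORESTART;
}

/* Tasklet, softirq context: finish the egress work. */
static void my_tx_done_cb(unsigned long data)
{
	struct my_pcpu *pcpu = (struct my_pcpu *)data;

	pcpu->timer_scheduled = false;
	/* release sent buffers here; re-arm the timer if work is left over */
}

/* Called from the xmit path when packets remain to be completed. */
static void my_timer_set(struct my_pcpu *pcpu)
{
	if (!pcpu->timer_scheduled) {
		pcpu->timer_scheduled = true;
		hrtimer_start(&pcpu->tx_done_timer,
			      ktime_set(0, MY_TXDONE_PERIOD_NS),
			      HRTIMER_MODE_REL_PINNED);
	}
}

static void my_pcpu_init(struct my_pcpu *pcpu)
{
	hrtimer_init(&pcpu->tx_done_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_PINNED);
	pcpu->tx_done_timer.function = my_hr_timer_cb;
	pcpu->timer_scheduled = false;
	tasklet_init(&pcpu->tx_done_tasklet, my_tx_done_cb,
		     (unsigned long)pcpu);
}

The key design point is that the hrtimer callback runs in hard-interrupt context, so the actual buffer release is pushed to a tasklet running in softirq context, where it is safe.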
In case not all the data is processed a timer is set on this CPU - in its interrupt context a tasklet is scheduled in which all queues are processed. At once only one timer per-CPU can be running, which is controlled by a dedicated flag. This commit removes TX processing from NAPI polling function, disables hardware coalescing and enables hrtimer with tasklet, using new per-CPU port structure (mvpp2_port_pcpu). Signed-off-by: Marcin Wojtas Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 177 ++++++++++++++++++++------- 1 file changed, 130 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 3e25d31414bc55..d9884fd15b453e 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -299,6 +301,7 @@ /* Coalescing */ #define MVPP2_TXDONE_COAL_PKTS_THRESH 15 +#define MVPP2_TXDONE_HRTIMER_PERIOD_NS 1000000UL #define MVPP2_RX_COAL_PKTS 32 #define MVPP2_RX_COAL_USEC 100 @@ -660,6 +663,14 @@ struct mvpp2_pcpu_stats { u64 tx_bytes; }; +/* Per-CPU port control */ +struct mvpp2_port_pcpu { + struct hrtimer tx_done_timer; + bool timer_scheduled; + /* Tasklet for egress finalization */ + struct tasklet_struct tx_done_tasklet; +}; + struct mvpp2_port { u8 id; @@ -679,6 +690,9 @@ struct mvpp2_port { u32 pending_cause_rx; struct napi_struct napi; + /* Per-CPU port control */ + struct mvpp2_port_pcpu __percpu *pcpu; + /* Flags */ unsigned long flags; @@ -3798,7 +3812,6 @@ static void mvpp2_interrupts_unmask(void *arg) mvpp2_write(port->priv, MVPP2_ISR_RX_TX_MASK_REG(port->id), (MVPP2_CAUSE_MISC_SUM_MASK | - MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK | MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK)); } @@ -4374,23 +4387,6 @@ static void mvpp2_rx_time_coal_set(struct mvpp2_port *port, rxq->time_coal = usec; } -/* Set threshold for TX_DONE pkts coalescing */ -static void mvpp2_tx_done_pkts_coal_set(void *arg) -{ - struct mvpp2_port *port = arg; - int queue; - u32 val; - - for (queue = 0; queue < txq_number; queue++) { - struct mvpp2_tx_queue *txq = port->txqs[queue]; - - val = (txq->done_pkts_coal << MVPP2_TRANSMITTED_THRESH_OFFSET) & - MVPP2_TRANSMITTED_THRESH_MASK; - mvpp2_write(port->priv, MVPP2_TXQ_NUM_REG, txq->id); - mvpp2_write(port->priv, MVPP2_TXQ_THRESH_REG, val); - } -} - /* Free Tx queue skbuffs */ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, struct mvpp2_tx_queue *txq, @@ -4425,7 +4421,7 @@ static inline struct mvpp2_rx_queue *mvpp2_get_rx_queue(struct mvpp2_port *port, static inline struct mvpp2_tx_queue *mvpp2_get_tx_queue(struct mvpp2_port *port, u32 cause) { - int queue = fls(cause >> 16) - 1; + int queue = fls(cause) - 1; return port->txqs[queue]; } @@ -4452,6 +4448,29 @@ static void mvpp2_txq_done(struct mvpp2_port *port, struct mvpp2_tx_queue *txq, netif_tx_wake_queue(nq); } +static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause) +{ + struct mvpp2_tx_queue *txq; + struct mvpp2_txq_pcpu *txq_pcpu; + unsigned int tx_todo = 0; + + while (cause) { + txq = mvpp2_get_tx_queue(port, cause); + if (!txq) + break; + + txq_pcpu = this_cpu_ptr(txq->pcpu); + + if (txq_pcpu->count) { + mvpp2_txq_done(port, txq, txq_pcpu); + tx_todo += txq_pcpu->count; + } + + cause &= ~(1 << txq->log_id); + } + return tx_todo; +} + /* Rx/Tx queue initialization/cleanup methods */ /* Allocate and initialize descriptors for aggr TXQ */ @@ -4812,7 +4831,6 @@ static int mvpp2_setup_txqs(struct 
mvpp2_port *port) goto err_cleanup; } - on_each_cpu(mvpp2_tx_done_pkts_coal_set, port, 1); on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1); return 0; @@ -4894,6 +4912,49 @@ static void mvpp2_link_event(struct net_device *dev) } } +static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu) +{ + ktime_t interval; + + if (!port_pcpu->timer_scheduled) { + port_pcpu->timer_scheduled = true; + interval = ktime_set(0, MVPP2_TXDONE_HRTIMER_PERIOD_NS); + hrtimer_start(&port_pcpu->tx_done_timer, interval, + HRTIMER_MODE_REL_PINNED); + } +} + +static void mvpp2_tx_proc_cb(unsigned long data) +{ + struct net_device *dev = (struct net_device *)data; + struct mvpp2_port *port = netdev_priv(dev); + struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu); + unsigned int tx_todo, cause; + + if (!netif_running(dev)) + return; + port_pcpu->timer_scheduled = false; + + /* Process all the Tx queues */ + cause = (1 << txq_number) - 1; + tx_todo = mvpp2_tx_done(port, cause); + + /* Set the timer in case not all the packets were processed */ + if (tx_todo) + mvpp2_timer_set(port_pcpu); +} + +static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer) +{ + struct mvpp2_port_pcpu *port_pcpu = container_of(timer, + struct mvpp2_port_pcpu, + tx_done_timer); + + tasklet_schedule(&port_pcpu->tx_done_tasklet); + + return HRTIMER_NORESTART; +} + /* Main RX/TX processing routines */ /* Display more error info */ @@ -5262,6 +5323,17 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev) dev_kfree_skb_any(skb); } + /* Finalize TX processing */ + if (txq_pcpu->count >= txq->done_pkts_coal) + mvpp2_txq_done(port, txq, txq_pcpu); + + /* Set the timer in case not all frags were processed */ + if (txq_pcpu->count <= frags && txq_pcpu->count > 0) { + struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu); + + mvpp2_timer_set(port_pcpu); + } + return NETDEV_TX_OK; } @@ -5275,10 +5347,11 @@ static inline void mvpp2_cause_error(struct net_device *dev, int cause) netdev_err(dev, "tx fifo underrun error\n"); } -static void mvpp2_txq_done_percpu(void *arg) +static int mvpp2_poll(struct napi_struct *napi, int budget) { - struct mvpp2_port *port = arg; - u32 cause_rx_tx, cause_tx, cause_misc; + u32 cause_rx_tx, cause_rx, cause_misc; + int rx_done = 0; + struct mvpp2_port *port = netdev_priv(napi->dev); /* Rx/Tx cause register * @@ -5292,7 +5365,7 @@ static void mvpp2_txq_done_percpu(void *arg) */ cause_rx_tx = mvpp2_read(port->priv, MVPP2_ISR_RX_TX_CAUSE_REG(port->id)); - cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK; + cause_rx_tx &= ~MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK; cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK; if (cause_misc) { @@ -5304,26 +5377,6 @@ static void mvpp2_txq_done_percpu(void *arg) cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK); } - /* Release TX descriptors */ - if (cause_tx) { - struct mvpp2_tx_queue *txq = mvpp2_get_tx_queue(port, cause_tx); - struct mvpp2_txq_pcpu *txq_pcpu = this_cpu_ptr(txq->pcpu); - - if (txq_pcpu->count) - mvpp2_txq_done(port, txq, txq_pcpu); - } -} - -static int mvpp2_poll(struct napi_struct *napi, int budget) -{ - u32 cause_rx_tx, cause_rx; - int rx_done = 0; - struct mvpp2_port *port = netdev_priv(napi->dev); - - on_each_cpu(mvpp2_txq_done_percpu, port, 1); - - cause_rx_tx = mvpp2_read(port->priv, - MVPP2_ISR_RX_TX_CAUSE_REG(port->id)); cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK; /* Process RX packets */ @@ -5568,6 +5621,8 @@ static int mvpp2_open(struct net_device *dev) static int mvpp2_stop(struct net_device *dev) 
{ struct mvpp2_port *port = netdev_priv(dev); + struct mvpp2_port_pcpu *port_pcpu; + int cpu; mvpp2_stop_dev(port); mvpp2_phy_disconnect(port); @@ -5576,6 +5631,13 @@ static int mvpp2_stop(struct net_device *dev) on_each_cpu(mvpp2_interrupts_mask, port, 1); free_irq(port->irq, port); + for_each_present_cpu(cpu) { + port_pcpu = per_cpu_ptr(port->pcpu, cpu); + + hrtimer_cancel(&port_pcpu->tx_done_timer); + port_pcpu->timer_scheduled = false; + tasklet_kill(&port_pcpu->tx_done_tasklet); + } mvpp2_cleanup_rxqs(port); mvpp2_cleanup_txqs(port); @@ -5791,7 +5853,6 @@ static int mvpp2_ethtool_set_coalesce(struct net_device *dev, txq->done_pkts_coal = c->tx_max_coalesced_frames; } - on_each_cpu(mvpp2_tx_done_pkts_coal_set, port, 1); return 0; } @@ -6042,6 +6103,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, { struct device_node *phy_node; struct mvpp2_port *port; + struct mvpp2_port_pcpu *port_pcpu; struct net_device *dev; struct resource *res; const char *dt_mac_addr; @@ -6051,7 +6113,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, int features; int phy_mode; int priv_common_regs_num = 2; - int err, i; + int err, i, cpu; dev = alloc_etherdev_mqs(sizeof(struct mvpp2_port), txq_number, rxq_number); @@ -6142,6 +6204,24 @@ static int mvpp2_port_probe(struct platform_device *pdev, } mvpp2_port_power_up(port); + port->pcpu = alloc_percpu(struct mvpp2_port_pcpu); + if (!port->pcpu) { + err = -ENOMEM; + goto err_free_txq_pcpu; + } + + for_each_present_cpu(cpu) { + port_pcpu = per_cpu_ptr(port->pcpu, cpu); + + hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb; + port_pcpu->timer_scheduled = false; + + tasklet_init(&port_pcpu->tx_done_tasklet, mvpp2_tx_proc_cb, + (unsigned long)dev); + } + netif_napi_add(dev, &port->napi, mvpp2_poll, NAPI_POLL_WEIGHT); features = NETIF_F_SG | NETIF_F_IP_CSUM; dev->features = features | NETIF_F_RXCSUM; @@ -6151,7 +6231,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, err = register_netdev(dev); if (err < 0) { dev_err(&pdev->dev, "failed to register netdev\n"); - goto err_free_txq_pcpu; + goto err_free_port_pcpu; } netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr); @@ -6160,6 +6240,8 @@ static int mvpp2_port_probe(struct platform_device *pdev, priv->port_list[id] = port; return 0; +err_free_port_pcpu: + free_percpu(port->pcpu); err_free_txq_pcpu: for (i = 0; i < txq_number; i++) free_percpu(port->txqs[i]->pcpu); @@ -6178,6 +6260,7 @@ static void mvpp2_port_remove(struct mvpp2_port *port) int i; unregister_netdev(port->dev); + free_percpu(port->pcpu); free_percpu(port->stats); for (i = 0; i < txq_number; i++) free_percpu(port->txqs[i]->pcpu); From ade4dc3e616e33c80d7e62855fe1b6f9895bc7c3 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 6 Aug 2015 22:48:23 +0200 Subject: [PATCH 59/89] bna: fix interrupts storm caused by erroneous packets The commit "e29aa33 bna: Enable Multi Buffer RX" moved packets counter increment from the beginning of the NAPI processing loop after the check for erroneous packets so they are never accounted. This counter is used to inform firmware about number of processed completions (packets). As these packets are never acked the firmware fires IRQs for them again and again. Fixes: e29aa33 ("bna: Enable Multi Buffer RX") Signed-off-by: Ivan Vecera Acked-by: Rasesh Mody Signed-off-by: David S. 
Miller --- drivers/net/ethernet/brocade/bna/bnad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 0612b19f6313bd..506047c386071d 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -676,6 +676,7 @@ bnad_cq_process(struct bnad *bnad, struct bna_ccb *ccb, int budget) if (!next_cmpl->valid) break; } + packets++; /* TODO: BNA_CQ_EF_LOCAL ? */ if (unlikely(flags & (BNA_CQ_EF_MAC_ERROR | @@ -692,7 +693,6 @@ bnad_cq_process(struct bnad *bnad, struct bna_ccb *ccb, int budget) else bnad_cq_setup_skb_frags(rcb, skb, sop_ci, nvecs, len); - packets++; rcb->rxq->rx_packets++; rcb->rxq->rx_bytes += totlen; ccb->bytes_per_intr += totlen; From 4e7c1330689e27556de407d3fdadc65ffff5eb12 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Aug 2015 00:26:41 +0200 Subject: [PATCH 60/89] netlink: make sure -EBUSY won't escape from netlink_insert Linus reports the following deadlock on rtnl_mutex; triggered only once so far (extract): [12236.694209] NetworkManager D 0000000000013b80 0 1047 1 0x00000000 [12236.694218] ffff88003f902640 0000000000000000 ffffffff815d15a9 0000000000000018 [12236.694224] ffff880119538000 ffff88003f902640 ffffffff81a8ff84 00000000ffffffff [12236.694230] ffffffff81a8ff88 ffff880119c47f00 ffffffff815d133a ffffffff81a8ff80 [12236.694235] Call Trace: [12236.694250] [] ? schedule_preempt_disabled+0x9/0x10 [12236.694257] [] ? schedule+0x2a/0x70 [12236.694263] [] ? schedule_preempt_disabled+0x9/0x10 [12236.694271] [] ? __mutex_lock_slowpath+0x7f/0xf0 [12236.694280] [] ? mutex_lock+0x16/0x30 [12236.694291] [] ? rtnetlink_rcv+0x10/0x30 [12236.694299] [] ? netlink_unicast+0xfb/0x180 [12236.694309] [] ? rtnl_getlink+0x113/0x190 [12236.694319] [] ? rtnetlink_rcv_msg+0x7a/0x210 [12236.694331] [] ? sock_has_perm+0x5c/0x70 [12236.694339] [] ? rtnetlink_rcv+0x30/0x30 [12236.694346] [] ? netlink_rcv_skb+0x9c/0xc0 [12236.694354] [] ? rtnetlink_rcv+0x1f/0x30 [12236.694360] [] ? netlink_unicast+0xfb/0x180 [12236.694367] [] ? netlink_sendmsg+0x484/0x5d0 [12236.694376] [] ? __wake_up+0x2f/0x50 [12236.694387] [] ? sock_sendmsg+0x33/0x40 [12236.694396] [] ? ___sys_sendmsg+0x22e/0x240 [12236.694405] [] ? ___sys_recvmsg+0x135/0x1a0 [12236.694415] [] ? eventfd_write+0x82/0x210 [12236.694423] [] ? fsnotify+0x32e/0x4c0 [12236.694429] [] ? wake_up_q+0x60/0x60 [12236.694434] [] ? __sys_sendmsg+0x39/0x70 [12236.694440] [] ? entry_SYSCALL_64_fastpath+0x12/0x6a It seems so far plausible that the recursive call into rtnetlink_rcv() looks suspicious. One way, where this could trigger is that the senders NETLINK_CB(skb).portid was wrongly 0 (which is rtnetlink socket), so the rtnl_getlink() request's answer would be sent to the kernel instead to the actual user process, thus grabbing rtnl_mutex() twice. One theory would be that netlink_autobind() triggered via netlink_sendmsg() internally overwrites the -EBUSY error to 0, but where it is wrongly originating from __netlink_insert() instead. That would reset the socket's portid to 0, which is then filled into NETLINK_CB(skb).portid later on. As commit d470e3b483dc ("[NETLINK]: Fix two socket hashing bugs.") also puts it, -EBUSY should not be propagated from netlink_insert(). It looks like it's very unlikely to reproduce. 
We need to trigger the rhashtable_insert_rehash() handler under a situation where rehashing currently occurs (one /rare/ way would be to hit ht->elasticity limits while not filled enough to expand the hashtable, but that would rather require a specifically crafted bind() sequence with knowledge about destination slots, seems unlikely). It probably makes sense to guard __netlink_insert() in any case and remap that error. It was suggested that EOVERFLOW might be better than an already overloaded ENOMEM. Reference: http://thread.gmane.org/gmane.linux.network/372676 Reported-by: Linus Torvalds Signed-off-by: Daniel Borkmann Acked-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d8e2e3918ce2fd..67d2104778636c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1096,6 +1096,11 @@ static int netlink_insert(struct sock *sk, u32 portid) err = __netlink_insert(table, sk); if (err) { + /* In case the hashtable backend returns with -EBUSY + * from here, it must not escape to the caller. + */ + if (unlikely(err == -EBUSY)) + err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; nlk_sk(sk)->portid = 0; From 330567b71d8716704b189454553c2696e1eceb6c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Aug 2015 10:54:28 +0200 Subject: [PATCH 61/89] ipv6: don't reject link-local nexthop on other interface 48ed7b26faa7 ("ipv6: reject locally assigned nexthop addresses") is too strict; it rejects following corner-case: ip -6 route add default via fe80::1:2:3 dev eth1 [ where fe80::1:2:3 is assigned to a local interface, but not eth1 ] Fix this by restricting search to given device if nh is linklocal. Joint work with Hannes Frederic Sowa. Fixes: 48ed7b26faa7 ("ipv6: reject locally assigned nexthop addresses") Signed-off-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6090969937f8b6..9de4d2bcd9168c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1831,6 +1831,7 @@ int ip6_route_add(struct fib6_config *cfg) int gwa_type; gw_addr = &cfg->fc_gateway; + gwa_type = ipv6_addr_type(gw_addr); /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() @@ -1838,11 +1839,12 @@ int ip6_route_add(struct fib6_config *cfg) * prefix route was assigned to, which might be non-loopback. */ err = -EINVAL; - if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0)) + if (ipv6_chk_addr_and_flags(net, gw_addr, + gwa_type & IPV6_ADDR_LINKLOCAL ? + dev : NULL, 0, 0)) goto out; rt->rt6i_gateway = *gw_addr; - gwa_type = ipv6_addr_type(gw_addr); if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { struct rt6_info *grt; From 7a76a021cd5a292be875fbc616daf03eab1e6996 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 7 Aug 2015 09:32:21 -0700 Subject: [PATCH 62/89] net-timestamp: Update skb_complete_tx_timestamp comment After "62bccb8 net-timestamp: Make the clone operation stand-alone from phy timestamping" the hwtstamps parameter of skb_complete_tx_timestamp() may no longer be NULL. Signed-off-by: Benjamin Poirier Cc: Alexander Duyck Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53bc..22b6d9ca1654f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2884,11 +2884,11 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers - * must call this function to return the skb back to the stack, with - * or without a timestamp. + * must call this function to return the skb back to the stack with a + * timestamp. * * @skb: clone of the the original outgoing packet - * @hwtstamps: hardware time stamps, may be NULL if not available + * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, From 21a447637d28eb824a1163c1fc5f41ffa4b28e33 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 8 Aug 2015 22:15:25 +0300 Subject: [PATCH 63/89] cxgb4: missing curly braces in t4_setup_debugfs() There were missing curly braces, so we would call add_debugfs_mem() unintentionally. Fixes: 3ccc6cf74d8c ('cxgb4: Adds support for T6 adapter') Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index a11485fbb33f2b..c3c7db41819dfa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -2332,10 +2332,11 @@ int t4_setup_debugfs(struct adapter *adap) EXT_MEM1_SIZE_G(size)); } } else { - if (i & EXT_MEM_ENABLE_F) + if (i & EXT_MEM_ENABLE_F) { size = t4_read_reg(adap, MA_EXT_MEMORY_BAR_A); add_debugfs_mem(adap, "mc", MEM_MC, EXT_MEM_SIZE_G(size)); + } } de = debugfs_create_file_size("flash", S_IRUSR, adap->debugfs_root, adap, From e1615903eb6b5e599396d4b3d8e3e96f6d432a6e Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 10 Aug 2015 12:49:35 +0300 Subject: [PATCH 64/89] bnx2x: Prevent null pointer dereference on SKB release On error flows it's possible to free an SKB even if it was not allocated. Signed-off-by: Yuval Mintz Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index a90d7364334f9d..f7fbdc9d132511 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -262,9 +262,9 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata, if (likely(skb)) { (*pkts_compl)++; (*bytes_compl) += skb->len; + dev_kfree_skb_any(skb); } - dev_kfree_skb_any(skb); tx_buf->first_bd = 0; tx_buf->skb = NULL; From 0ea853dfa93371e651d8b7b27fd2344e973a86ed Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 10 Aug 2015 12:49:36 +0300 Subject: [PATCH 65/89] bnx2x: Free NVRAM lock at end of each page Writing each 4Kb page into flash might take up to ~100 milliseconds, during which time the management firmware cannot access the nvram for its own uses. The firmware upgrade utility uses the ethtool API to burn new flash images for the device, doing so by writing several pages' worth of data on each command.
Such action might create problems for the management firmware, as the nvram might not be accessible for a long time. This patch changes the write implementation, releasing the nvram lock on the completion of each page, allowing the management firmware time to claim it and perform its own required actions. Signed-off-by: Yuval Mintz Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index 76b9052a961c51..5907c821d131ee 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -1718,6 +1718,22 @@ static int bnx2x_nvram_write(struct bnx2x *bp, u32 offset, u8 *data_buf, offset += sizeof(u32); data_buf += sizeof(u32); written_so_far += sizeof(u32); + + /* At end of each 4Kb page, release nvram lock to allow MFW + * chance to take it for its own use. + */ + if ((cmd_flags & MCPR_NVM_COMMAND_LAST) && + (written_so_far < buf_size)) { + DP(BNX2X_MSG_ETHTOOL | BNX2X_MSG_NVM, + "Releasing NVM lock after offset 0x%x\n", + (u32)(offset - sizeof(u32))); + bnx2x_release_nvram_lock(bp); + usleep_range(1000, 2000); + rc = bnx2x_acquire_nvram_lock(bp); + if (rc) + return rc; + } + cmd_flags = 0; } From 9d332d92a95c9c67abe08b5f7cba64d8fc1e3c76 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 10 Aug 2015 14:22:43 -0300 Subject: [PATCH 66/89] mkiss: Fix error handling in mkiss_open() If register_netdev() fails we are not propagating the error and we return success because ax_open() succeeded previously. Fix this by checking the return value of ax_open() and register_netdev() and propagate the error in case of failure. Reported-by: RUC_Soft_Sec Signed-off-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/hamradio/mkiss.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 2ffbf13471d09a..216bfd350169a9 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -728,11 +728,12 @@ static int mkiss_open(struct tty_struct *tty) dev->type = ARPHRD_AX25; /* Perform the low-level AX25 initialization. */ - if ((err = ax_open(ax->dev))) { + err = ax_open(ax->dev); + if (err) goto out_free_netdev; - } - if (register_netdev(dev)) + err = register_netdev(dev); + if (err) goto out_free_buffers; /* after register_netdev() - because else printk smashes the kernel */ From 2235f2ac75fd2501c251b0b699a9632e80239a6d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Aug 2015 09:09:13 -0700 Subject: [PATCH 67/89] inet: fix races with reqsk timers reqsk_queue_destroy() and reqsk_queue_unlink() should use del_timer_sync() instead of del_timer() before calling reqsk_put(), otherwise we could free a req still used by another cpu. But before doing so, reqsk_queue_destroy() must release syn_wait_lock spinlock or risk a dead lock, as reqsk_timer_handler() might need to take this same spinlock from reqsk_queue_unlink() (called from inet_csk_reqsk_queue_drop()) Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/request_sock.c | 8 +++++++- net/ipv4/inet_connection_sock.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 87b22c0bc08c2f..b42f0e26f89e4c 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -103,10 +103,16 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) spin_lock_bh(&queue->syn_wait_lock); while ((req = lopt->syn_table[i]) != NULL) { lopt->syn_table[i] = req->dl_next; + /* Because of following del_timer_sync(), + * we must release the spinlock here + * or risk a dead lock. + */ + spin_unlock_bh(&queue->syn_wait_lock); atomic_inc(&lopt->qlen_dec); - if (del_timer(&req->rsk_timer)) + if (del_timer_sync(&req->rsk_timer)) reqsk_put(req); reqsk_put(req); + spin_lock_bh(&queue->syn_wait_lock); } spin_unlock_bh(&queue->syn_wait_lock); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 60021d0d9326ac..05e3145f7dc346 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -593,7 +593,7 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue, } spin_unlock(&queue->syn_wait_lock); - if (del_timer(&req->rsk_timer)) + if (del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; } From 3257d8b12f954c462d29de6201664a846328a522 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Aug 2015 15:07:34 -0700 Subject: [PATCH 68/89] inet: fix possible request socket leak In commit b357a364c57c9 ("inet: fix possible panic in reqsk_queue_unlink()"), I missed fact that tcp_check_req() can return the listener socket in one case, and that we must release the request socket refcount or we leak it. Tested: Following packetdrill test template shows the issue 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 2920 +0 > S. 0:0(0) ack 1 +.002 < . 1:1(0) ack 21 win 2920 +0 > R 21:21(0) Fixes: b357a364c57c9 ("inet: fix possible panic in reqsk_queue_unlink()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d7d4c2b79cf2f5..0ea2e1c5d395ac 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1348,7 +1348,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); if (req) { nsk = tcp_check_req(sk, skb, req, false); - if (!nsk) + if (!nsk || nsk == sk) reqsk_put(req); return nsk; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6748c4277affad..7a6cea5e427414 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -943,7 +943,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); if (req) { nsk = tcp_check_req(sk, skb, req, false); - if (!nsk) + if (!nsk || nsk == sk) reqsk_put(req); return nsk; } From ad6cd7bafcd2c812ba4200d5938e07304f1e2fcd Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 10 Aug 2015 18:11:06 +0100 Subject: [PATCH 69/89] Revert "xen/events/fifo: Handle linked events when closing a port" This reverts commit fcdf31a7c162de0c93a2bee51df4688ab0a348f8. This was causing a WARNING whenever a PIRQ was closed since shutdown_pirq() is called with irqs disabled. 
Signed-off-by: David Vrabel Cc: --- drivers/xen/events/events_base.c | 10 +++---- drivers/xen/events/events_fifo.c | 45 ++++------------------------ drivers/xen/events/events_internal.h | 7 ----- 3 files changed, 9 insertions(+), 53 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 1495eccb161762..96093ae369a561 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -452,12 +452,10 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } -static void xen_evtchn_close(unsigned int port, unsigned int cpu) +static void xen_evtchn_close(unsigned int port) { struct evtchn_close close; - xen_evtchn_op_close(port, cpu); - close.port = port; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) BUG(); @@ -546,7 +544,7 @@ static unsigned int __startup_pirq(unsigned int irq) err: pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); - xen_evtchn_close(evtchn, NR_CPUS); + xen_evtchn_close(evtchn); return 0; } @@ -567,7 +565,7 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - xen_evtchn_close(evtchn, cpu_from_evtchn(evtchn)); + xen_evtchn_close(evtchn); xen_irq_info_cleanup(info); } @@ -611,7 +609,7 @@ static void __unbind_from_irq(unsigned int irq) if (VALID_EVTCHN(evtchn)) { unsigned int cpu = cpu_from_irq(irq); - xen_evtchn_close(evtchn, cpu); + xen_evtchn_close(evtchn); switch (type_from_irq(irq)) { case IRQT_VIRQ: diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 6df8aac966b909..ed673e1acd6159 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -255,12 +255,6 @@ static void evtchn_fifo_unmask(unsigned port) } } -static bool evtchn_fifo_is_linked(unsigned port) -{ - event_word_t *word = event_word_from_port(port); - return sync_test_bit(EVTCHN_FIFO_BIT(LINKED, word), BM(word)); -} - static uint32_t clear_linked(volatile event_word_t *word) { event_word_t new, old, w; @@ -287,8 +281,7 @@ static void handle_irq_for_port(unsigned port) static void consume_one_event(unsigned cpu, struct evtchn_fifo_control_block *control_block, - unsigned priority, unsigned long *ready, - bool drop) + unsigned priority, unsigned long *ready) { struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); uint32_t head; @@ -320,15 +313,13 @@ static void consume_one_event(unsigned cpu, if (head == 0) clear_bit(priority, ready); - if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) { - if (likely(!drop)) - handle_irq_for_port(port); - } + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); q->head[priority] = head; } -static void __evtchn_fifo_handle_events(unsigned cpu, bool drop) +static void evtchn_fifo_handle_events(unsigned cpu) { struct evtchn_fifo_control_block *control_block; unsigned long ready; @@ -340,16 +331,11 @@ static void __evtchn_fifo_handle_events(unsigned cpu, bool drop) while (ready) { q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES); - consume_one_event(cpu, control_block, q, &ready, drop); + consume_one_event(cpu, control_block, q, &ready); ready |= xchg(&control_block->ready, 0); } } -static void evtchn_fifo_handle_events(unsigned cpu) -{ - __evtchn_fifo_handle_events(cpu, false); -} - static void evtchn_fifo_resume(void) { unsigned cpu; @@ -385,26 +371,6 @@ static void evtchn_fifo_resume(void) event_array_pages = 0; } -static void evtchn_fifo_close(unsigned port, unsigned int cpu) -{ - if (cpu == NR_CPUS) - return; - - 
get_online_cpus(); - if (cpu_online(cpu)) { - if (WARN_ON(irqs_disabled())) - goto out; - - while (evtchn_fifo_is_linked(port)) - cpu_relax(); - } else { - __evtchn_fifo_handle_events(cpu, true); - } - -out: - put_online_cpus(); -} - static const struct evtchn_ops evtchn_ops_fifo = { .max_channels = evtchn_fifo_max_channels, .nr_channels = evtchn_fifo_nr_channels, @@ -418,7 +384,6 @@ static const struct evtchn_ops evtchn_ops_fifo = { .unmask = evtchn_fifo_unmask, .handle_events = evtchn_fifo_handle_events, .resume = evtchn_fifo_resume, - .close = evtchn_fifo_close, }; static int evtchn_fifo_alloc_control_block(unsigned cpu) diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index d18e12315ec083..50c2050a1e3202 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -68,7 +68,6 @@ struct evtchn_ops { bool (*test_and_set_mask)(unsigned port); void (*mask)(unsigned port); void (*unmask)(unsigned port); - void (*close)(unsigned port, unsigned cpu); void (*handle_events)(unsigned cpu); void (*resume)(void); @@ -146,12 +145,6 @@ static inline void xen_evtchn_resume(void) evtchn_ops->resume(); } -static inline void xen_evtchn_op_close(unsigned port, unsigned cpu) -{ - if (evtchn_ops->close) - return evtchn_ops->close(port, cpu); -} - void xen_evtchn_2l_init(void); int xen_evtchn_fifo_init(void); From c22fe519e7e2b94ad173e0ea3b89c1a7d8be8d00 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 10 Aug 2015 19:10:38 +0100 Subject: [PATCH 70/89] xen/xenbus: Don't leak memory when unmapping the ring on HVM backend The commit ccc9d90a9a8b5c4ad7e9708ec41f75ff9e98d61d "xenbus_client: Extend interface to support multi-page ring" removes the call to free_xenballooned_pages() in xenbus_unmap_ring_vfree_hvm(), leaking a page for every shared ring. Only with backends running in HVM domains were affected. Signed-off-by: Julien Grall Cc: Reviewed-by: Boris Ostrovsky Reviewed-by: Wei Liu Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 9ad327238ba931..e30353575d5da1 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -814,8 +814,10 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles, addrs); - if (!rv) + if (!rv) { vunmap(vaddr); + free_xenballooned_pages(node->nr_handles, node->hvm.pages); + } else WARN(1, "Leaking %p, size %u page(s)\n", vaddr, node->nr_handles); From 09edea4f8fdeb4e292b80d493296070f5ec64e6e Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 10 Aug 2015 17:36:06 +0100 Subject: [PATCH 71/89] ARM: 8410/1: VDSO: fix coarse clock monotonicity regression Since 906c55579a63 ("timekeeping: Copy the shadow-timekeeper over the real timekeeper last") it has become possible on ARM to: - Obtain a CLOCK_MONOTONIC_COARSE or CLOCK_REALTIME_COARSE timestamp via syscall. - Subsequently obtain a timestamp for the same clock ID via VDSO which predates the first timestamp (by one jiffy). This is because ARM's update_vsyscall is deriving the coarse time using the __current_kernel_time interface, when it should really be using the timekeeper object provided to it by the timekeeping core. It happened to work before only because __current_kernel_time would access the same timekeeper object which had been passed to update_vsyscall. 
This is no longer the case. Cc: stable@vger.kernel.org Fixes: 906c55579a63 ("timekeeping: Copy the shadow-timekeeper over the real timekeeper last") Signed-off-by: Nathan Lynch Acked-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/vdso.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index efe17dd9b9218b..54a5aeab988d35 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -296,7 +296,6 @@ static bool tk_is_cntvct(const struct timekeeper *tk) */ void update_vsyscall(struct timekeeper *tk) { - struct timespec xtime_coarse; struct timespec64 *wtm = &tk->wall_to_monotonic; if (!cntvct_ok) { @@ -308,10 +307,10 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdso_data); - xtime_coarse = __current_kernel_time(); vdso_data->tk_is_cntvct = tk_is_cntvct(tk); - vdso_data->xtime_coarse_sec = xtime_coarse.tv_sec; - vdso_data->xtime_coarse_nsec = xtime_coarse.tv_nsec; + vdso_data->xtime_coarse_sec = tk->xtime_sec; + vdso_data->xtime_coarse_nsec = (u32)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdso_data->wtm_clock_sec = wtm->tv_sec; vdso_data->wtm_clock_nsec = wtm->tv_nsec; From 8961822c46cc363c239503f998a6d24bbeb346d5 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 11 Aug 2015 12:11:00 +0200 Subject: [PATCH 72/89] net: fs_enet: explicitly remove I flag on TX partial frames We are not interested in interrupts for partially transmitted frames, we have to clear BD_ENET_TX_INTR explicitly otherwise it may remain from a previously used descriptor. Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 56316db6c5a674..cf8e54652df952 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -586,7 +586,8 @@ static int fs_enet_start_xmit(struct sk_buff *skb, struct net_device *dev) frag = skb_shinfo(skb)->frags; while (nr_frags) { CBDC_SC(bdp, - BD_ENET_TX_STATS | BD_ENET_TX_LAST | BD_ENET_TX_TC); + BD_ENET_TX_STATS | BD_ENET_TX_INTR | BD_ENET_TX_LAST | + BD_ENET_TX_TC); CBDS_SC(bdp, BD_ENET_TX_READY); if ((CBDR_SC(bdp) & BD_ENET_TX_WRAP) == 0) From c68875fa82a8ab2f45a32aa8adab059f3cb1ed01 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 11 Aug 2015 12:11:03 +0200 Subject: [PATCH 73/89] net: fs_enet: mask interrupts for TX partial frames. We are not interested in interrupts for partially transmitted frames. Unlike SCC and FCC, the FEC doesn't handle the I bit in buffer descriptors, instead it defines two interrupt bits, TXB and TXF. We have to mask TXB in order to only get interrupts once the frame is fully transmitted. Signed-off-by: Christophe Leroy Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fs_enet/mac-fec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c index b34214e2df5f6e..016743e355de31 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c @@ -110,7 +110,7 @@ static int do_pd_setup(struct fs_enet_private *fep) } #define FEC_NAPI_RX_EVENT_MSK (FEC_ENET_RXF | FEC_ENET_RXB) -#define FEC_NAPI_TX_EVENT_MSK (FEC_ENET_TXF | FEC_ENET_TXB) +#define FEC_NAPI_TX_EVENT_MSK (FEC_ENET_TXF) #define FEC_RX_EVENT (FEC_ENET_RXF) #define FEC_TX_EVENT (FEC_ENET_TXF) #define FEC_ERR_EVENT_MSK (FEC_ENET_HBERR | FEC_ENET_BABR | \ From c0ddc8c745b7f89c50385fd7aa03c78dc543fa7a Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 27 Jul 2015 00:06:55 +0200 Subject: [PATCH 74/89] localmodconfig: Use Kbuild files too In kbuild it is allowed to define objects in files named "Makefile" and "Kbuild". Currently localmodconfig reads objects only from "Makefile"s and misses modules like nouveau. Link: http://lkml.kernel.org/r/1437948415-16290-1-git-send-email-richard@nod.at Cc: stable@vger.kernel.org Reported-and-tested-by: Leonidas Spyropoulos Signed-off-by: Richard Weinberger Signed-off-by: Steven Rostedt --- scripts/kconfig/streamline_config.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl index 9cb8522d8d22a8..f3d3fb42b8735e 100755 --- a/scripts/kconfig/streamline_config.pl +++ b/scripts/kconfig/streamline_config.pl @@ -137,7 +137,7 @@ sub read_config { my $kconfig = $ARGV[1]; my $lsmod_file = $ENV{'LSMOD'}; -my @makefiles = `find $ksource -name Makefile 2>/dev/null`; +my @makefiles = `find $ksource -name Makefile -or -name Kbuild 2>/dev/null`; chomp @makefiles; my %depends; From 7f518ad0a212e2a6fd68630e176af1de395070a7 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 12 Aug 2015 15:10:21 +0100 Subject: [PATCH 75/89] dm thin metadata: delete btrees when releasing metadata snapshot The device details and mapping trees were just being decremented before. Now btree_del() is called to do a deep delete. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 48dfe3c4d6aa79..6ba47cfb144374 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1293,8 +1293,8 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd) return r; disk_super = dm_block_data(copy); - dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); - dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); + dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root)); + dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root)); dm_sm_dec_block(pmd->metadata_sm, held_root); return dm_tm_unlock(pmd->tm, copy); From b0dc3c8bc157c60b1d470163882be8c13e1950af Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 12 Aug 2015 15:12:09 +0100 Subject: [PATCH 76/89] dm btree: add ref counting ops for the leaves of top level btrees When using nested btrees, the top leaves of the top levels contain block addresses for the root of the next tree down. 
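Condensed from the diff further below (no new logic of its own), the essence of the fix is a value type for those upper-level entries whose inc/dec callbacks forward to the transaction manager, so that shadowing a shared leaf also bumps the reference counts of the subtree roots it holds:

/* Sketch, condensed from the patch below: each 64-bit value in an
 * upper-level leaf is the block address of a subtree root, so the
 * value type's inc/dec must go through the transaction manager. */
static void le64_inc(void *context, const void *value_le)
{
	struct dm_transaction_manager *tm = context;
	__le64 v_le;

	memcpy(&v_le, value_le, sizeof(v_le));
	dm_tm_inc(tm, le64_to_cpu(v_le));
}

void init_le64_type(struct dm_transaction_manager *tm,
		    struct dm_btree_value_type *vt)
{
	vt->context = tm;		/* so inc/dec can reach the tm */
	vt->size = sizeof(__le64);
	vt->inc = le64_inc;
	vt->dec = le64_dec;		/* symmetric counterpart, calls dm_tm_dec() */
	vt->equal = le64_equal;		/* compares the two __le64 values */
}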
If we shadow a shared leaf node the leaf values (sub tree roots) should be incremented accordingly. This is only an issue if there is metadata sharing in the top levels. Which only occurs if metadata snapshots are being used (as is possible with dm-thinp). And could result in a block from the thinp metadata snap being reused early, thus corrupting the thinp metadata snap. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- .../md/persistent-data/dm-btree-internal.h | 6 +++ drivers/md/persistent-data/dm-btree-remove.c | 16 +++----- drivers/md/persistent-data/dm-btree-spine.c | 37 +++++++++++++++++++ drivers/md/persistent-data/dm-btree.c | 7 +--- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index bf2b80d5c4707a..8731b6ea026bd9 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key); extern struct dm_block_validator btree_node_validator; +/* + * Value type for upper levels of multi-level btrees. + */ +extern void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + #endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 9ca9eccd512fd9..4222f774cf369b 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -544,14 +544,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { @@ -559,12 +551,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, int index = 0, r = 0; struct shadow_spine spine; struct btree_node *n; + struct dm_btree_value_type le64_vt; + init_le64_type(info->tm, &le64_vt); init_shadow_spine(&spine, info); for (level = 0; level < info->levels; level++) { r = remove_raw(&spine, info, (level == last_level ? 
- &info->value_type : &le64_type), + &info->value_type : &le64_vt), root, keys[level], (unsigned *)&index); if (r < 0) break; @@ -654,11 +648,13 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, int index = 0, r = 0; struct shadow_spine spine; struct btree_node *n; + struct dm_btree_value_type le64_vt; uint64_t k; + init_le64_type(info->tm, &le64_vt); init_shadow_spine(&spine, info); for (level = 0; level < last_level; level++) { - r = remove_raw(&spine, info, &le64_type, + r = remove_raw(&spine, info, &le64_vt, root, keys[level], (unsigned *) &index); if (r < 0) goto out; diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 1b5e13ec7f96a6..0dee514ba4c5f9 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -249,3 +249,40 @@ int shadow_root(struct shadow_spine *s) { return s->root; } + +static void le64_inc(void *context, const void *value_le) +{ + struct dm_transaction_manager *tm = context; + __le64 v_le; + + memcpy(&v_le, value_le, sizeof(v_le)); + dm_tm_inc(tm, le64_to_cpu(v_le)); +} + +static void le64_dec(void *context, const void *value_le) +{ + struct dm_transaction_manager *tm = context; + __le64 v_le; + + memcpy(&v_le, value_le, sizeof(v_le)); + dm_tm_dec(tm, le64_to_cpu(v_le)); +} + +static int le64_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + return v1_le == v2_le; +} + +void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + vt->context = tm; + vt->size = sizeof(__le64); + vt->inc = le64_inc; + vt->dec = le64_dec; + vt->equal = le64_equal; +} diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index fdd3793e22f957..c7726cebc4950c 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -667,12 +667,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, struct btree_node *n; struct dm_btree_value_type le64_type; - le64_type.context = NULL; - le64_type.size = sizeof(__le64); - le64_type.inc = NULL; - le64_type.dec = NULL; - le64_type.equal = NULL; - + init_le64_type(info->tm, &le64_type); init_shadow_spine(&spine, info); for (level = 0; level < (info->levels - 1); level++) { From 34dd051741572859bc1fef525c5ddbc127158b52 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 12 Aug 2015 19:22:43 +0800 Subject: [PATCH 77/89] dm cache policy smq: move 'dm-cache-default' module alias to SMQ When creating dm-cache with the default policy, it will call request_module("dm-cache-default") to register the default policy. But the "dm-cache-default" alias was left referring to the MQ policy. Fix this by moving the module alias to SMQ. 
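As background, a hedged sketch of why the alias placement matters (illustrative only; default_policy_available() and its lookup flow are hypothetical, not the dm-cache source): request_module("dm-cache-default") is resolved by modprobe through MODULE_ALIAS(), so the alias has to live in the module that actually provides the default policy.

#include <linux/module.h>
#include <linux/kmod.h>

/* In the policy module that should answer for the "default" name
 * (dm-cache-policy-smq.c after this patch): */
MODULE_ALIAS("dm-cache-default");

/* Hypothetical condensation of the caller's side: */
static bool default_policy_available(void)
{
	/* modprobe loads whichever module declares the alias ... */
	if (request_module("dm-cache-default") < 0)
		return false;

	/* ... and that module's init registers the policy, so a normal
	 * by-name lookup of "default" now succeeds. */
	return true;
}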
Fixes: bccab6a0 (dm cache: switch the "default" cache replacement policy from mq to smq) Signed-off-by: Yi Zhang Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 2 -- drivers/md/dm-cache-policy-smq.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 32814371b8d304..aa1b41ca40f778 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -1471,5 +1471,3 @@ module_exit(mq_exit); MODULE_AUTHOR("Joe Thornber "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("mq cache policy"); - -MODULE_ALIAS("dm-cache-default"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 48a4a826ae0764..200366c62231dd 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1789,3 +1789,5 @@ module_exit(smq_exit); MODULE_AUTHOR("Joe Thornber "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("smq cache policy"); + +MODULE_ALIAS("dm-cache-default"); From 8c8bac59dda0c41c7dd289d443ac42b7b72d31b0 Mon Sep 17 00:00:00 2001 From: Boyuan Zhang Date: Wed, 5 Aug 2015 14:03:48 -0400 Subject: [PATCH 78/89] drm/amdgpu: add context buffer size check for HEVC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Boyuan Zhang Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 2f7a5efa21c23a..f5c22556ec2c17 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -374,7 +374,7 @@ static int amdgpu_uvd_cs_msg_decode(uint32_t *msg, unsigned buf_sizes[]) unsigned height_in_mb = ALIGN(height / 16, 2); unsigned fs_in_mb = width_in_mb * height_in_mb; - unsigned image_size, tmp, min_dpb_size, num_dpb_buffer; + unsigned image_size, tmp, min_dpb_size, num_dpb_buffer, min_ctx_size; image_size = width * height; image_size += image_size / 2; @@ -466,6 +466,8 @@ static int amdgpu_uvd_cs_msg_decode(uint32_t *msg, unsigned buf_sizes[]) num_dpb_buffer = (le32_to_cpu(msg[59]) & 0xff) + 2; min_dpb_size = image_size * num_dpb_buffer; + min_ctx_size = ((width + 255) / 16) * ((height + 255) / 16) + * 16 * num_dpb_buffer + 52 * 1024; break; default: @@ -486,6 +488,7 @@ static int amdgpu_uvd_cs_msg_decode(uint32_t *msg, unsigned buf_sizes[]) buf_sizes[0x1] = dpb_size; buf_sizes[0x2] = image_size; + buf_sizes[0x4] = min_ctx_size; return 0; } @@ -628,6 +631,13 @@ static int amdgpu_uvd_cs_pass2(struct amdgpu_uvd_cs_ctx *ctx) return -EINVAL; } + } else if (cmd == 0x206) { + if ((end - start) < ctx->buf_sizes[4]) { + DRM_ERROR("buffer (%d) to small (%d / %d)!\n", cmd, + (unsigned)(end - start), + ctx->buf_sizes[4]); + return -EINVAL; + } } else if ((cmd != 0x100) && (cmd != 0x204)) { DRM_ERROR("invalid UVD command %X!\n", cmd); return -EINVAL; @@ -755,9 +765,10 @@ int amdgpu_uvd_ring_parse_cs(struct amdgpu_cs_parser *parser, uint32_t ib_idx) struct amdgpu_uvd_cs_ctx ctx = {}; unsigned buf_sizes[] = { [0x00000000] = 2048, - [0x00000001] = 32 * 1024 * 1024, - [0x00000002] = 2048 * 1152 * 3, + [0x00000001] = 0xFFFFFFFF, + [0x00000002] = 0xFFFFFFFF, [0x00000003] = 2048, + [0x00000004] = 0xFFFFFFFF, }; struct amdgpu_ib *ib = &parser->ibs[ib_idx]; int r; From b8826b0cbf47ecfc9fdefdbf8c7bbb308e117005 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 10 Aug 2015 11:08:31 -0400 Subject: 
[PATCH 79/89] Revert "drm/amdgpu: Configure doorbell to maximum slots" This reverts commit 78ad5cdd21f0d614983fc397338944e797ec70b9. This commit breaks dpm and suspend/resume on CZ. --- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index f5a42ab1f65c07..20e2cfd521d535 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -3135,7 +3135,7 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev) WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER, AMDGPU_DOORBELL_KIQ << 2); WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER, - 0x7FFFF << 2); + AMDGPU_DOORBELL_MEC_RING7 << 2); } tmp = RREG32(mmCP_HQD_PQ_DOORBELL_CONTROL); tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, From e037239e5e7b61007763984aa35a8329596d8c88 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 10 Aug 2015 15:28:49 -0400 Subject: [PATCH 80/89] drm/radeon: add new OLAND pci id Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- include/drm/drm_pciids.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 45c39a37f92495..8bc073d297db2a 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -172,6 +172,7 @@ {0x1002, 0x6610, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6611, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6613, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6617, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6620, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6621, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6623, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ From 211c504a444710b1d8ce3431ac19f2578602ca27 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 8 Aug 2015 12:58:57 -0700 Subject: [PATCH 81/89] net: dsa: Do not override PHY interface if already configured In case we need to divert reads/writes using the slave MII bus, we may have already fetched a valid PHY interface property from Device Tree, and that mode is used by the PHY driver to make configuration decisions. If we could not fetch the "phy-mode" property, we will assign p->phy_interface to PHY_INTERFACE_MODE_NA, such that we can actually check for that condition as to whether or not we should override the interface value. Fixes: 19334920eaf7 ("net: dsa: Set valid phy interface type") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0917123790eaf0..35c47ddd04f0ee 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -756,7 +756,8 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, return -ENODEV; /* Use already configured phy mode */ - p->phy_interface = p->phy->interface; + if (p->phy_interface == PHY_INTERFACE_MODE_NA) + p->phy_interface = p->phy->interface; phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, p->phy_interface); From b02e3e948de6c11fded1821d89012e24d953da12 Mon Sep 17 00:00:00 2001 From: Venkat Venkatsubra Date: Tue, 11 Aug 2015 07:57:23 -0700 Subject: [PATCH 82/89] bonding: Gratuitous ARP gets dropped when first slave added When the first slave is added (such as during bootup) the first gratuitous ARP gets dropped. We don't see this drop during a failover. The packet gets dropped in qdisc (noop_enqueue). The fix is to delay the sending of gratuitous ARPs till the bond dev's carrier is present. It can also be worked around by setting num_grat_arp to more than 1. Signed-off-by: Venkat Venkatsubra Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e1ccefce9a9de6..a98dd4f1b0e331 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -786,6 +786,7 @@ static bool bond_should_notify_peers(struct bonding *bond) slave ? slave->dev->name : "NULL"); if (!slave || !bond->send_peer_notif || + !netif_carrier_ok(bond->dev) || test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; From a898fe040f62a32b90e26dc1f1b5973608054b29 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Aug 2015 02:41:55 +0200 Subject: [PATCH 83/89] gianfar: correct filer table writing MAX_FILER_IDX is the last usable index. Using less-than will already guarantee that one entry for catch-all rule will be left, no need to subtract 1 here. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar_ethtool.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index 3c0a8f825b6301..4a710f3eb5ebc9 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -1583,11 +1583,10 @@ static int gfar_write_filer_table(struct gfar_private *priv, return -EBUSY; /* Fill regular entries */ - for (; i < MAX_FILER_IDX - 1 && (tab->fe[i].ctrl | tab->fe[i].prop); - i++) + for (; i < MAX_FILER_IDX && (tab->fe[i].ctrl | tab->fe[i].prop); i++) gfar_write_filer(priv, i, tab->fe[i].ctrl, tab->fe[i].prop); /* Fill the rest with fall-troughs */ - for (; i < MAX_FILER_IDX - 1; i++) + for (; i < MAX_FILER_IDX; i++) gfar_write_filer(priv, i, 0x60, 0xFFFFFFFF); /* Last entry must be default accept * because that's what people expect From b5c8c8906e425f71efb83291c3837e4b78b769ea Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Aug 2015 02:41:56 +0200 Subject: [PATCH 84/89] gianfar: correct list membership accounting At a cost of one line let's make sure .count is correct when calling gfar_process_filer_changes(). Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/gianfar_ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index 4a710f3eb5ebc9..f477b67730bb81 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -1721,13 +1721,14 @@ static int gfar_add_cls(struct gfar_private *priv, } process: + priv->rx_list.count++; ret = gfar_process_filer_changes(priv); if (ret) goto clean_list; - priv->rx_list.count++; return ret; clean_list: + priv->rx_list.count--; list_del(&temp->list); clean_mem: kfree(temp); From 1f2b72933422dfdaa80b59dc3a4c37eef25c4297 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Aug 2015 02:41:57 +0200 Subject: [PATCH 85/89] gianfar: remove faulty filer optimizer Current filer rule optimization is broken in several ways: (1) Can perform reads/writes beyond end of allocated tables. (gianfar_ethtool.c:1326). (2) It breaks badly for rules with more than 2 specifiers (e.g. matching ip, port, tos). Example: # ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.1 dst-port 1 tos 1 action 1 Added rule with ID 254 # ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.2 dst-port 2 tos 2 action 9 Added rule with ID 253 # ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.3 dst-port 3 tos 3 action 17 Added rule with ID 252 # ./filer_decode /sys/kernel/debug/gfar1/filer_raw 00: MASK == 00000210 AND Q:00 ctrl:00000080 prop:00000210 01: FPR == 00000210 AND CLE Q:00 ctrl:00000281 prop:00000210 02: MASK == ffffffff AND Q:00 ctrl:00000080 prop:ffffffff 03: DPT == 00000003 AND Q:00 ctrl:0000008e prop:00000003 04: TOS == 00000003 AND Q:00 ctrl:0000008a prop:00000003 05: DIA == 0a000003 AND Q:11 ctrl:0000448c prop:0a000003 06: DPT == 00000002 AND Q:00 ctrl:0000008e prop:00000002 07: TOS == 00000002 AND Q:00 ctrl:0000008a prop:00000002 08: DIA == 0a000002 AND Q:09 ctrl:0000248c prop:0a000002 09: DIA == 0a000001 AND Q:00 ctrl:0000008c prop:0a000001 0a: DPT == 00000001 AND Q:00 ctrl:0000008e prop:00000001 0b: TOS == 00000001 CLE Q:01 ctrl:0000060a prop:00000001 ff: MASK >= 00000000 Q:00 ctrl:00000020 prop:00000000 (Entire cluster gets AND-ed together). (3) We observed that the masking rules it generates do not play well with clustering on P2020. Only first rule of the cluster would ever fire. Given that optimizer relies heavily on masking this is very hard to fix. Example: # ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.1 dst-port 1 action 1 Added rule with ID 254 # ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.2 dst-port 2 action 9 Added rule with ID 253 # ethtool -N eth2 flow-type udp4 dst-ip 10.0.0.3 dst-port 3 action 17 Added rule with ID 252 # ./filer_decode /sys/kernel/debug/gfar1/filer_raw 00: MASK == 00000210 AND Q:00 ctrl:00000080 prop:00000210 01: FPR == 00000210 AND CLE Q:00 ctrl:00000281 prop:00000210 02: MASK == ffffffff AND Q:00 ctrl:00000080 prop:ffffffff 03: DPT == 00000003 AND Q:00 ctrl:0000008e prop:00000003 04: DIA == 0a000003 Q:11 ctrl:0000440c prop:0a000003 05: DPT == 00000002 AND Q:00 ctrl:0000008e prop:00000002 06: DIA == 0a000002 Q:09 ctrl:0000240c prop:0a000002 07: DIA == 0a000001 AND Q:00 ctrl:0000008c prop:0a000001 08: DPT == 00000001 CLE Q:01 ctrl:0000060e prop:00000001 ff: MASK >= 00000000 Q:00 ctrl:00000020 prop:00000000 Which looks correct according to the spec but only the first (eth id 252)/last added rule for 10.0.0.3 will ever trigger. 
As if filer did not treat the AND CLE as cluster start but also kept AND-ing the rules. We found no errata covering this. The fact that nobody noticed (2) or (3) makes me think that this feature is not very widely used and we should just remove it. Reported-by: Aleksander Dutkowski Signed-off-by: Jakub Kicinski Acked-by: Claudiu Manoil Signed-off-by: David S. Miller --- .../net/ethernet/freescale/gianfar_ethtool.c | 337 ------------------ 1 file changed, 337 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index f477b67730bb81..5b90fcf96265ae 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -900,27 +900,6 @@ static int gfar_check_filer_hardware(struct gfar_private *priv) return 0; } -static int gfar_comp_asc(const void *a, const void *b) -{ - return memcmp(a, b, 4); -} - -static int gfar_comp_desc(const void *a, const void *b) -{ - return -memcmp(a, b, 4); -} - -static void gfar_swap(void *a, void *b, int size) -{ - u32 *_a = a; - u32 *_b = b; - - swap(_a[0], _b[0]); - swap(_a[1], _b[1]); - swap(_a[2], _b[2]); - swap(_a[3], _b[3]); -} - /* Write a mask to filer cache */ static void gfar_set_mask(u32 mask, struct filer_table *tab) { @@ -1270,310 +1249,6 @@ static int gfar_convert_to_filer(struct ethtool_rx_flow_spec *rule, return 0; } -/* Copy size filer entries */ -static void gfar_copy_filer_entries(struct gfar_filer_entry dst[0], - struct gfar_filer_entry src[0], s32 size) -{ - while (size > 0) { - size--; - dst[size].ctrl = src[size].ctrl; - dst[size].prop = src[size].prop; - } -} - -/* Delete the contents of the filer-table between start and end - * and collapse them - */ -static int gfar_trim_filer_entries(u32 begin, u32 end, struct filer_table *tab) -{ - int length; - - if (end > MAX_FILER_CACHE_IDX || end < begin) - return -EINVAL; - - end++; - length = end - begin; - - /* Copy */ - while (end < tab->index) { - tab->fe[begin].ctrl = tab->fe[end].ctrl; - tab->fe[begin++].prop = tab->fe[end++].prop; - - } - /* Fill up with don't cares */ - while (begin < tab->index) { - tab->fe[begin].ctrl = 0x60; - tab->fe[begin].prop = 0xFFFFFFFF; - begin++; - } - - tab->index -= length; - return 0; -} - -/* Make space on the wanted location */ -static int gfar_expand_filer_entries(u32 begin, u32 length, - struct filer_table *tab) -{ - if (length == 0 || length + tab->index > MAX_FILER_CACHE_IDX || - begin > MAX_FILER_CACHE_IDX) - return -EINVAL; - - gfar_copy_filer_entries(&(tab->fe[begin + length]), &(tab->fe[begin]), - tab->index - length + 1); - - tab->index += length; - return 0; -} - -static int gfar_get_next_cluster_start(int start, struct filer_table *tab) -{ - for (; (start < tab->index) && (start < MAX_FILER_CACHE_IDX - 1); - start++) { - if ((tab->fe[start].ctrl & (RQFCR_AND | RQFCR_CLE)) == - (RQFCR_AND | RQFCR_CLE)) - return start; - } - return -1; -} - -static int gfar_get_next_cluster_end(int start, struct filer_table *tab) -{ - for (; (start < tab->index) && (start < MAX_FILER_CACHE_IDX - 1); - start++) { - if ((tab->fe[start].ctrl & (RQFCR_AND | RQFCR_CLE)) == - (RQFCR_CLE)) - return start; - } - return -1; -} - -/* Uses hardwares clustering option to reduce - * the number of filer table entries - */ -static void gfar_cluster_filer(struct filer_table *tab) -{ - s32 i = -1, j, iend, jend; - - while ((i = gfar_get_next_cluster_start(++i, tab)) != -1) { - j = i; - while ((j = gfar_get_next_cluster_start(++j, tab)) != -1) { - /* The cluster 
entries self and the previous one - * (a mask) must be identical! - */ - if (tab->fe[i].ctrl != tab->fe[j].ctrl) - break; - if (tab->fe[i].prop != tab->fe[j].prop) - break; - if (tab->fe[i - 1].ctrl != tab->fe[j - 1].ctrl) - break; - if (tab->fe[i - 1].prop != tab->fe[j - 1].prop) - break; - iend = gfar_get_next_cluster_end(i, tab); - jend = gfar_get_next_cluster_end(j, tab); - if (jend == -1 || iend == -1) - break; - - /* First we make some free space, where our cluster - * element should be. Then we copy it there and finally - * delete in from its old location. - */ - if (gfar_expand_filer_entries(iend, (jend - j), tab) == - -EINVAL) - break; - - gfar_copy_filer_entries(&(tab->fe[iend + 1]), - &(tab->fe[jend + 1]), jend - j); - - if (gfar_trim_filer_entries(jend - 1, - jend + (jend - j), - tab) == -EINVAL) - return; - - /* Mask out cluster bit */ - tab->fe[iend].ctrl &= ~(RQFCR_CLE); - } - } -} - -/* Swaps the masked bits of a1<>a2 and b1<>b2 */ -static void gfar_swap_bits(struct gfar_filer_entry *a1, - struct gfar_filer_entry *a2, - struct gfar_filer_entry *b1, - struct gfar_filer_entry *b2, u32 mask) -{ - u32 temp[4]; - temp[0] = a1->ctrl & mask; - temp[1] = a2->ctrl & mask; - temp[2] = b1->ctrl & mask; - temp[3] = b2->ctrl & mask; - - a1->ctrl &= ~mask; - a2->ctrl &= ~mask; - b1->ctrl &= ~mask; - b2->ctrl &= ~mask; - - a1->ctrl |= temp[1]; - a2->ctrl |= temp[0]; - b1->ctrl |= temp[3]; - b2->ctrl |= temp[2]; -} - -/* Generate a list consisting of masks values with their start and - * end of validity and block as indicator for parts belonging - * together (glued by ANDs) in mask_table - */ -static u32 gfar_generate_mask_table(struct gfar_mask_entry *mask_table, - struct filer_table *tab) -{ - u32 i, and_index = 0, block_index = 1; - - for (i = 0; i < tab->index; i++) { - - /* LSByte of control = 0 sets a mask */ - if (!(tab->fe[i].ctrl & 0xF)) { - mask_table[and_index].mask = tab->fe[i].prop; - mask_table[and_index].start = i; - mask_table[and_index].block = block_index; - if (and_index >= 1) - mask_table[and_index - 1].end = i - 1; - and_index++; - } - /* cluster starts and ends will be separated because they should - * hold their position - */ - if (tab->fe[i].ctrl & RQFCR_CLE) - block_index++; - /* A not set AND indicates the end of a depended block */ - if (!(tab->fe[i].ctrl & RQFCR_AND)) - block_index++; - } - - mask_table[and_index - 1].end = i - 1; - - return and_index; -} - -/* Sorts the entries of mask_table by the values of the masks. - * Important: The 0xFF80 flags of the first and last entry of a - * block must hold their position (which queue, CLusterEnable, ReJEct, - * AND) - */ -static void gfar_sort_mask_table(struct gfar_mask_entry *mask_table, - struct filer_table *temp_table, u32 and_index) -{ - /* Pointer to compare function (_asc or _desc) */ - int (*gfar_comp)(const void *, const void *); - - u32 i, size = 0, start = 0, prev = 1; - u32 old_first, old_last, new_first, new_last; - - gfar_comp = &gfar_comp_desc; - - for (i = 0; i < and_index; i++) { - if (prev != mask_table[i].block) { - old_first = mask_table[start].start + 1; - old_last = mask_table[i - 1].end; - sort(mask_table + start, size, - sizeof(struct gfar_mask_entry), - gfar_comp, &gfar_swap); - - /* Toggle order for every block. This makes the - * thing more efficient! 
- */ - if (gfar_comp == gfar_comp_desc) - gfar_comp = &gfar_comp_asc; - else - gfar_comp = &gfar_comp_desc; - - new_first = mask_table[start].start + 1; - new_last = mask_table[i - 1].end; - - gfar_swap_bits(&temp_table->fe[new_first], - &temp_table->fe[old_first], - &temp_table->fe[new_last], - &temp_table->fe[old_last], - RQFCR_QUEUE | RQFCR_CLE | - RQFCR_RJE | RQFCR_AND); - - start = i; - size = 0; - } - size++; - prev = mask_table[i].block; - } -} - -/* Reduces the number of masks needed in the filer table to save entries - * This is done by sorting the masks of a depended block. A depended block is - * identified by gluing ANDs or CLE. The sorting order toggles after every - * block. Of course entries in scope of a mask must change their location with - * it. - */ -static int gfar_optimize_filer_masks(struct filer_table *tab) -{ - struct filer_table *temp_table; - struct gfar_mask_entry *mask_table; - - u32 and_index = 0, previous_mask = 0, i = 0, j = 0, size = 0; - s32 ret = 0; - - /* We need a copy of the filer table because - * we want to change its order - */ - temp_table = kmemdup(tab, sizeof(*temp_table), GFP_KERNEL); - if (temp_table == NULL) - return -ENOMEM; - - mask_table = kcalloc(MAX_FILER_CACHE_IDX / 2 + 1, - sizeof(struct gfar_mask_entry), GFP_KERNEL); - - if (mask_table == NULL) { - ret = -ENOMEM; - goto end; - } - - and_index = gfar_generate_mask_table(mask_table, tab); - - gfar_sort_mask_table(mask_table, temp_table, and_index); - - /* Now we can copy the data from our duplicated filer table to - * the real one in the order the mask table says - */ - for (i = 0; i < and_index; i++) { - size = mask_table[i].end - mask_table[i].start + 1; - gfar_copy_filer_entries(&(tab->fe[j]), - &(temp_table->fe[mask_table[i].start]), size); - j += size; - } - - /* And finally we just have to check for duplicated masks and drop the - * second ones - */ - for (i = 0; i < tab->index && i < MAX_FILER_CACHE_IDX; i++) { - if (tab->fe[i].ctrl == 0x80) { - previous_mask = i++; - break; - } - } - for (; i < tab->index && i < MAX_FILER_CACHE_IDX; i++) { - if (tab->fe[i].ctrl == 0x80) { - if (tab->fe[i].prop == tab->fe[previous_mask].prop) { - /* Two identical ones found! - * So drop the second one! - */ - gfar_trim_filer_entries(i, i, tab); - } else - /* Not identical! */ - previous_mask = i; - } - } - - kfree(mask_table); -end: kfree(temp_table); - return ret; -} - /* Write the bit-pattern from software's buffer to hardware registers */ static int gfar_write_filer_table(struct gfar_private *priv, struct filer_table *tab) @@ -1620,7 +1295,6 @@ static int gfar_process_filer_changes(struct gfar_private *priv) { struct ethtool_flow_spec_container *j; struct filer_table *tab; - s32 i = 0; s32 ret = 0; /* So index is set to zero, too! */ @@ -1645,17 +1319,6 @@ static int gfar_process_filer_changes(struct gfar_private *priv) } } - i = tab->index; - - /* Optimizations to save entries */ - gfar_cluster_filer(tab); - gfar_optimize_filer_masks(tab); - - pr_debug("\tSummary:\n" - "\tData on hardware: %d\n" - "\tCompression rate: %d%%\n", - tab->index, 100 - (100 * tab->index) / i); - /* Write everything to hardware */ ret = gfar_write_filer_table(priv, tab); if (ret == -EBUSY) { From e6d006938c9bda7ffd22af9d3e1257fd75941fb7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Aug 2015 00:08:01 +0300 Subject: [PATCH 86/89] cosa: missing error code on failure in probe() If register_hdlc_device() fails, the current code returns 0 but we should return an error code instead. 
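The fix below follows the usual probe-time error propagation pattern; a simplified sketch, not the full cosa_probe(), with everything except the hdlc/netdev calls reduced to the bare minimum:

#include <linux/hdlc.h>
#include <linux/netdevice.h>

static int example_probe(struct net_device *netdev)
{
	int err;

	err = register_hdlc_device(netdev);
	if (err) {
		netdev_warn(netdev, "register_hdlc_device() failed\n");
		free_netdev(netdev);
		/* Without capturing err, the probe fell through and
		 * reported success (0) despite the failed registration.
		 */
		return err;
	}

	return 0;
}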
Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/wan/cosa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 7193b7304fdd3e..848ea6a399f236 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -589,7 +589,8 @@ static int cosa_probe(int base, int irq, int dma) chan->netdev->base_addr = chan->cosa->datareg; chan->netdev->irq = chan->cosa->irq; chan->netdev->dma = chan->cosa->dma; - if (register_hdlc_device(chan->netdev)) { + err = register_hdlc_device(chan->netdev); + if (err) { netdev_warn(chan->netdev, "register_hdlc_device() failed\n"); free_netdev(chan->netdev); From 5c16179b550b9fd8114637a56b153c9768ea06a5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 21 Jul 2015 11:00:53 +0200 Subject: [PATCH 87/89] EDAC, ppc4xx: Access mci->csrows array elements properly The commit de3910eb79ac ("edac: change the mem allocation scheme to make Documentation/kobject.txt happy") changed the memory allocation for the csrows member. But ppc4xx_edac was forgotten in the patch. Fix it. Signed-off-by: Michael Walle Cc: Cc: linux-edac Cc: Mauro Carvalho Chehab Link: http://lkml.kernel.org/r/1437469253-8611-1-git-send-email-michael@walle.cc Signed-off-by: Borislav Petkov --- drivers/edac/ppc4xx_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index 3515b381c13126..711d8ad74f116e 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -920,7 +920,7 @@ static int ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1) */ for (row = 0; row < mci->nr_csrows; row++) { - struct csrow_info *csi = &mci->csrows[row]; + struct csrow_info *csi = mci->csrows[row]; /* * Get the configuration settings for this From ed596cde9425509ec6ce88e19f03e9b13b6f518b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2015 08:25:20 -0700 Subject: [PATCH 88/89] Revert x86 sigcontext cleanups This reverts commits 9a036b93a344 ("x86/signal/64: Remove 'fs' and 'gs' from sigcontext") and c6f2062935c8 ("x86/signal/64: Fix SS handling for signals delivered to 64-bit programs"). They were cleanups, but they break dosemu by changing the signal return behavior (and removing 'fs' and 'gs' from the sigcontext struct - while not actually changing any behavior - causes build problems). Reported-and-tested-by: Stas Sergeev Acked-by: Andy Lutomirski Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/sigcontext.h | 6 +++--- arch/x86/include/uapi/asm/sigcontext.h | 21 +++------------------ arch/x86/kernel/signal.c | 26 +++++++++++--------------- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 6fe6b182c9981d..9dfce4e0417d92 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short __pad2; /* Was called gs, but was always zero. */ - unsigned short __pad1; /* Was called fs, but was always zero. 
*/ - unsigned short ss; + unsigned short gs; + unsigned short fs; + unsigned short __pad0; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 0e8a973de9ee8a..40836a9a7250c9 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -177,24 +177,9 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; - - /* - * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), - * Linux saved and restored fs and gs in these slots. This - * was counterproductive, as fsbase and gsbase were never - * saved, so arch_prctl was presumably unreliable. - * - * If these slots are ever needed for any other purpose, there - * is some risk that very old 64-bit binaries could get - * confused. I doubt that many such binaries still work, - * though, since the same patch in 2.5.64 also removed the - * 64-bit set_thread_area syscall, so it appears that there is - * no TLS API that works in both pre- and post-2.5.64 kernels. - */ - __u16 __pad2; /* Was gs. */ - __u16 __pad1; /* Was fs. */ - - __u16 ss; + __u16 gs; + __u16 fs; + __u16 __pad0; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 206996c1669db3..71820c42b6ce6b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -93,8 +93,15 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. */ + COPY_SEG_CPL3(cs); +#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -154,9 +161,8 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, #else /* !CONFIG_X86_32 */ put_user_ex(regs->flags, &sc->flags); put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->__pad2); - put_user_ex(0, &sc->__pad1); - put_user_ex(regs->ss, &sc->ss); + put_user_ex(0, &sc->gs); + put_user_ex(0, &sc->fs); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -451,19 +457,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* - * Set up the CS and SS registers to run signal handlers in - * 64-bit mode, even if the handler happens to be interrupting - * 32-bit or 16-bit code. - * - * SS is subtle. In 64-bit mode, we don't need any particular - * SS descriptor, but we do need SS to be valid. It's possible - * that the old SS is entirely bogus -- this can happen if the - * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. - */ + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ regs->cs = __USER_CS; - regs->ss = __USER_DS; return 0; } From cd88ec2317015f9ae94fa55149bc6f61e1a460e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Aug 2015 16:19:44 -0700 Subject: [PATCH 89/89] x86: fix error handling for 32-bit compat out-of-range system call numbers Commit 3f5159a9221f ("x86/asm/entry/32: Update -ENOSYS handling to match the 64-bit logic") broke the ENOSYS handling for the 32-bit compat case. 
The proper error return value was never loaded into %rax, except if things just happened to go through the audit paths, which ended up reloading the return value. This moves the loading of %rax into the normal system call path, just to make sure the error case triggers it. It's kind of sad, since it adds a useless instruction to reload the register to the fast path, but it's not like that single load from the stack is going to be noticeable. Reported-by: David Drysdale Tested-by: Kees Cook Acked-by: Andy Lutomirski Cc: Denys Vlasenko Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/entry/entry_64_compat.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 5a1844765a7aba..a7e257d9cb90b9 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -140,6 +140,7 @@ sysexit_from_sys_call: */ andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp), %ecx /* User %eip */ + movq RAX(%rsp), %rax RESTORE_RSI_RDI xorl %edx, %edx /* Do not leak kernel information */ xorq %r8, %r8 @@ -219,7 +220,6 @@ sysexit_from_sys_call: 1: setbe %al /* 1 if error, 0 if not */ movzbl %al, %edi /* zero-extend that into %edi */ call __audit_syscall_exit - movq RAX(%rsp), %rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -368,6 +368,7 @@ sysretl_from_sys_call: RESTORE_RSI_RDI_RDX movl RIP(%rsp), %ecx movl EFLAGS(%rsp), %r11d + movq RAX(%rsp), %rax xorq %r10, %r10 xorq %r9, %r9 xorq %r8, %r8
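The user-visible effect is easy to check from a 32-bit binary (build with -m32 and run on a 64-bit kernel so one of the compat entry paths is exercised; which path depends on how the vDSO enters the kernel). A small test sketch, with the out-of-range syscall number chosen arbitrarily:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* An out-of-range system call number must fail with ENOSYS, i.e.
	 * -ENOSYS has to make it back to user space in the return
	 * register, which is exactly what the reload above ensures.
	 */
	long ret = syscall(0x7fffffff);

	if (ret == -1 && errno == ENOSYS)
		printf("ok: out-of-range syscall returned ENOSYS\n");
	else
		printf("bad: ret=%ld errno=%d\n", ret, errno);

	return 0;
}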