From 30c694fd4a99fbbc4115d180156ca01b60953371 Mon Sep 17 00:00:00 2001
From: Martin Fuzzey
Date: Fri, 4 Aug 2023 10:34:30 +0200
Subject: [PATCH 01/32] regulator: da9063: better fix null deref with partial
 DT

Two versions of the original patch were sent, but V1 was merged instead
of V2 due to a mistake. So update to V2.

The advantage of V2 is that it completely avoids dereferencing the
pointer, even just to take the address, which may fix problems with
some compilers. Both versions work on my gcc 9.4, but use the safer one.

Fixes: 98e2dd5f7a8b ("regulator: da9063: fix null pointer deref with partial DT config")
Signed-off-by: Martin Fuzzey
Tested-by: Benjamin Bara
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230804083514.1887124-1-martin.fuzzey@flowbird.group
Signed-off-by: Mark Brown
---
 drivers/regulator/da9063-regulator.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c
index dfd5ec9f75c90..a0621665a6d22 100644
--- a/drivers/regulator/da9063-regulator.c
+++ b/drivers/regulator/da9063-regulator.c
@@ -778,9 +778,6 @@ static int da9063_check_xvp_constraints(struct regulator_config *config)
 	const struct notification_limit *uv_l = &constr->under_voltage_limits;
 	const struct notification_limit *ov_l = &constr->over_voltage_limits;
 
-	if (!config->init_data) /* No config in DT, pointers will be invalid */
-		return 0;
-
 	/* make sure that only one severity is used to clarify if unchanged, enabled or disabled */
 	if ((!!uv_l->prot + !!uv_l->err + !!uv_l->warn) > 1) {
 		dev_err(config->dev, "%s: at most one voltage monitoring severity allowed!\n",
@@ -1031,9 +1028,12 @@ static int da9063_regulator_probe(struct platform_device *pdev)
 		config.of_node = da9063_reg_matches[id].of_node;
 		config.regmap = da9063->regmap;
 
-		ret = da9063_check_xvp_constraints(&config);
-		if (ret)
-			return ret;
+		/* Checking constraints requires init_data from DT. */
+		if (config.init_data) {
+			ret = da9063_check_xvp_constraints(&config);
+			if (ret)
+				return ret;
+		}
 
 		regl->rdev = devm_regulator_register(&pdev->dev, &regl->desc,
 						     &config);
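As a side note on why V2 is preferable: in C, merely forming a member address
through a NULL pointer is already undefined behavior, even if nothing is
loaded through it. A minimal illustrative sketch; the struct names below are
simplified stand-ins, not the real regulator framework types:

/* Hedged sketch: invented types standing in for the regulator structs. */
struct init_data_sketch {
	struct constraints_sketch {
		int uv_limit;
	} constraints;
};

struct config_sketch {
	struct init_data_sketch *init_data;	/* NULL when DT has no config */
};

/* V1 style: &cfg->init_data->constraints dereferences init_data just to
 * compute a member address.  That is undefined behavior when init_data is
 * NULL, and a compiler may additionally use it to assume init_data != NULL
 * elsewhere, breaking later checks.
 */
struct constraints_sketch *get_v1(struct config_sketch *cfg)
{
	return &cfg->init_data->constraints;
}

/* V2 style, as merged above: never touch the pointer when it is NULL. */
struct constraints_sketch *get_v2(struct config_sketch *cfg)
{
	return cfg->init_data ? &cfg->init_data->constraints : NULL;
}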
From 7cdf55462c5533a1c78ae13ab8563558e30e4130 Mon Sep 17 00:00:00 2001
From: Abel Vesa
Date: Tue, 1 Aug 2023 12:57:02 +0300
Subject: [PATCH 02/32] regulator: qcom-rpmh: Fix LDO 12 regulator for PM8550

The LDO 12 is an NLDO 515 low-voltage type, so fix accordingly.

Fixes: e6e3776d682d ("regulator: qcom-rpmh: Add support for PM8550 regulators")
Signed-off-by: Abel Vesa
Reviewed-by: Neil Armstrong
Link: https://lore.kernel.org/r/20230801095702.2891127-1-abel.vesa@linaro.org
Signed-off-by: Mark Brown
---
 drivers/regulator/qcom-rpmh-regulator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c
index f3b280af07737..cd077b7c4aff3 100644
--- a/drivers/regulator/qcom-rpmh-regulator.c
+++ b/drivers/regulator/qcom-rpmh-regulator.c
@@ -1068,7 +1068,7 @@ static const struct rpmh_vreg_init_data pm8550_vreg_data[] = {
 	RPMH_VREG("ldo9",   "ldo%s9",  &pmic5_pldo,    "vdd-l8-l9"),
 	RPMH_VREG("ldo10",  "ldo%s10", &pmic5_nldo515, "vdd-l1-l4-l10"),
 	RPMH_VREG("ldo11",  "ldo%s11", &pmic5_nldo515, "vdd-l11"),
-	RPMH_VREG("ldo12",  "ldo%s12", &pmic5_pldo,    "vdd-l12"),
+	RPMH_VREG("ldo12",  "ldo%s12", &pmic5_nldo515, "vdd-l12"),
 	RPMH_VREG("ldo13",  "ldo%s13", &pmic5_pldo,    "vdd-l2-l13-l14"),
 	RPMH_VREG("ldo14",  "ldo%s14", &pmic5_pldo,    "vdd-l2-l13-l14"),
 	RPMH_VREG("ldo15",  "ldo%s15", &pmic5_nldo515, "vdd-l15"),

From 55c91fedd03d7b9cf0c5199b2eb12b9b8e95281a Mon Sep 17 00:00:00 2001
From: Wolfram Sang
Date: Thu, 29 Jun 2023 14:05:26 +0200
Subject: [PATCH 03/32] virtio-mmio: don't break lifecycle of vm_dev

vm_dev has a separate lifecycle because it has a 'struct device'
embedded. Thus, having a release callback for it is correct.

Allocating the vm_dev struct with devres totally breaks this
protection, though. Instead of waiting for the vm_dev release callback,
the memory is freed when the platform_device is removed. This results
in a use-after-free when the callback is finally called. To easily see
the problem, compile the kernel with CONFIG_DEBUG_KOBJECT_RELEASE and
unbind with sysfs.

The fix is easy, don't use devres in this case.

Found during my research about object lifetime problems.

Fixes: 7eb781b1bbb7 ("virtio_mmio: add cleanup for virtio_mmio_probe")
Signed-off-by: Wolfram Sang
Message-Id: <20230629120526.7184-1-wsa+renesas@sang-engineering.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/virtio/virtio_mmio.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index a46a4a29e9295..97760f6112959 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -607,9 +607,8 @@ static void virtio_mmio_release_dev(struct device *_d)
 	struct virtio_device *vdev =
 			container_of(_d, struct virtio_device, dev);
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
-	struct platform_device *pdev = vm_dev->pdev;
 
-	devm_kfree(&pdev->dev, vm_dev);
+	kfree(vm_dev);
 }
 
 /* Platform device */
@@ -620,7 +619,7 @@ static int virtio_mmio_probe(struct platform_device *pdev)
 	unsigned long magic;
 	int rc;
 
-	vm_dev = devm_kzalloc(&pdev->dev, sizeof(*vm_dev), GFP_KERNEL);
+	vm_dev = kzalloc(sizeof(*vm_dev), GFP_KERNEL);
 	if (!vm_dev)
 		return -ENOMEM;

From 9ad1a29cb0991e3145996cdce691525e8ac65db7 Mon Sep 17 00:00:00 2001
From: Shannon Nelson
Date: Thu, 6 Jul 2023 16:17:18 -0700
Subject: [PATCH 04/32] pds_vdpa: protect Makefile from unconfigured debugfs

debugfs.h protects itself from an undefined DEBUG_FS, so it is not
necessary to check it in the driver code or the Makefile. The driver
code had been updated for this, but the Makefile had missed the update.
Link: https://lore.kernel.org/linux-next/fec68c3c-8249-7af4-5390-0495386a76f9@infradead.org/
Fixes: a16291b5bcbb ("pds_vdpa: Add new vDPA driver for AMD/Pensando DSC")
Signed-off-by: Shannon Nelson
Message-Id: <20230706231718.54198-1-shannon.nelson@amd.com>
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Randy Dunlap
Tested-by: Randy Dunlap # build-tested
Acked-by: Jason Wang
---
 drivers/vdpa/pds/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vdpa/pds/Makefile b/drivers/vdpa/pds/Makefile
index 2e22418e3ab30..c2d314d4614d3 100644
--- a/drivers/vdpa/pds/Makefile
+++ b/drivers/vdpa/pds/Makefile
@@ -5,6 +5,5 @@ obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o
 
 pds_vdpa-y := aux_drv.o \
 	      cmds.o \
+	      debugfs.o \
 	      vdpa_dev.o
-
-pds_vdpa-$(CONFIG_DEBUG_FS) += debugfs.o

From 5ced58bfa132c8ba0f9c893eb621595a84cfee12 Mon Sep 17 00:00:00 2001
From: Mike Christie
Date: Sun, 9 Jul 2023 15:28:58 -0500
Subject: [PATCH 05/32] vhost-scsi: Fix alignment handling with windows

The Linux block layer requires bios/requests to have lengths with a 512
byte alignment. Some drivers/layers like dm-crypt and the direct IO code
will test for it and just fail. Other drivers like SCSI just assume the
requirement is met and will end up in infinite retry loops.

The problem for drivers like SCSI is that it uses functions like
blk_rq_cur_sectors and blk_rq_sectors which divide the request's length
by 512. If there are leftovers, they just get dropped. But other code in
the block/scsi layer may use blk_rq_bytes/blk_rq_cur_bytes and end up
thinking there is still data left and try to retry the cmd. We can then
end up getting stuck in retry loops where part of the block/scsi layer
thinks there is data left, but other parts think we want to do IOs of
zero length.

Linux will always check for alignment, but Windows will not. When
vhost-scsi then translates the iovec it gets from a Windows guest to a
scatterlist, we can end up with sg items where the sg->length is not
divisible by 512 due to the misaligned offset:

sg[0].offset = 255;
sg[0].length = 3841;
sg...
sg[N].offset = 0;
sg[N].length = 255;

When the lio backends then convert the SG to bios or other iovecs, we
end up sending them with the same misaligned values and can hit the
issues above.

This just has us drop down to allocating a temp page and copying the
data when we detect a misaligned buffer and the IO is large enough that
it will get split into multiple bad IOs.

Signed-off-by: Mike Christie
Message-Id: <20230709202859.138387-2-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Stefan Hajnoczi
---
 drivers/vhost/scsi.c | 186 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 161 insertions(+), 25 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index c83f7f043470d..324e4b3846fa3 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -25,6 +25,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -75,6 +77,9 @@ struct vhost_scsi_cmd {
 	u32 tvc_prot_sgl_count;
 	/* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
 	u32 tvc_lun;
+	u32 copied_iov:1;
+	const void *saved_iter_addr;
+	struct iov_iter saved_iter;
 	/* Pointer to the SGL formatted memory from virtio-scsi */
 	struct scatterlist *tvc_sgl;
 	struct scatterlist *tvc_prot_sgl;
@@ -328,8 +333,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 	int i;
 
 	if (tv_cmd->tvc_sgl_count) {
-		for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
-			put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+		for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
+			if (tv_cmd->copied_iov)
+				__free_page(sg_page(&tv_cmd->tvc_sgl[i]));
+			else
+				put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+		}
+		kfree(tv_cmd->saved_iter_addr);
 	}
 	if (tv_cmd->tvc_prot_sgl_count) {
 		for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
@@ -504,6 +514,28 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
 	mutex_unlock(&vq->mutex);
 }
 
+static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
+{
+	struct iov_iter *iter = &cmd->saved_iter;
+	struct scatterlist *sg = cmd->tvc_sgl;
+	struct page *page;
+	size_t len;
+	int i;
+
+	for (i = 0; i < cmd->tvc_sgl_count; i++) {
+		page = sg_page(&sg[i]);
+		len = sg[i].length;
+
+		if (copy_page_to_iter(page, 0, len, iter) != len) {
+			pr_err("Could not copy data while handling misaligned cmd. Error %zu\n",
+			       len);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
 /* Fill in status and signal that we are done processing this command
  *
  * This is scheduled in the vhost work queue so we are called with the owner
@@ -527,15 +559,20 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
 			 cmd, se_cmd->residual_count, se_cmd->scsi_status);
 
 		memset(&v_rsp, 0, sizeof(v_rsp));
-		v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count);
-		/* TODO is status_qualifier field needed? */
-		v_rsp.status = se_cmd->scsi_status;
-		v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
-						 se_cmd->scsi_sense_length);
-		memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-		       se_cmd->scsi_sense_length);
+
+		if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
+			v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
+		} else {
+			v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
+						     se_cmd->residual_count);
+			/* TODO is status_qualifier field needed? */
+			v_rsp.status = se_cmd->scsi_status;
+			v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+							 se_cmd->scsi_sense_length);
+			memcpy(v_rsp.sense, cmd->tvc_sense_buf,
+			       se_cmd->scsi_sense_length);
+		}
 
 		iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
 			      cmd->tvc_in_iovs, sizeof(v_rsp));
@@ -613,12 +650,12 @@ static int
 vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
 		      struct iov_iter *iter,
 		      struct scatterlist *sgl,
-		      bool write)
+		      bool is_prot)
 {
 	struct page **pages = cmd->tvc_upages;
 	struct scatterlist *sg = sgl;
-	ssize_t bytes;
-	size_t offset;
+	ssize_t bytes, mapped_bytes;
+	size_t offset, mapped_offset;
 	unsigned int npages = 0;
 
 	bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
@@ -627,13 +664,53 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
 	if (bytes <= 0)
 		return bytes < 0 ? bytes : -EFAULT;
 
+	mapped_bytes = bytes;
+	mapped_offset = offset;
+
 	while (bytes) {
 		unsigned n = min_t(unsigned, PAGE_SIZE - offset, bytes);
+
+		/*
+		 * The block layer requires bios/requests to be a multiple of
+		 * 512 bytes, but Windows can send us vecs that are misaligned.
+		 * This can result in bios and later requests with misaligned
+		 * sizes if we have to break up a cmd/scatterlist into multiple
+		 * bios.
+		 *
+		 * We currently only break up a command into multiple bios if
+		 * we hit the vec/seg limit, so check if our sgl_count is
+		 * greater than the max and if a vec in the cmd has a
+		 * misaligned offset/size.
+		 */
+		if (!is_prot &&
+		    (offset & (SECTOR_SIZE - 1) || n & (SECTOR_SIZE - 1)) &&
+		    cmd->tvc_sgl_count > BIO_MAX_VECS) {
+			WARN_ONCE(true,
+				  "vhost-scsi detected misaligned IO. Performance may be degraded.");
+			goto revert_iter_get_pages;
+		}
+
 		sg_set_page(sg++, pages[npages++], n, offset);
 		bytes -= n;
 		offset = 0;
 	}
+
 	return npages;
+
+revert_iter_get_pages:
+	iov_iter_revert(iter, mapped_bytes);
+
+	npages = 0;
+	while (mapped_bytes) {
+		unsigned int n = min_t(unsigned int, PAGE_SIZE - mapped_offset,
+				       mapped_bytes);
+
+		put_page(pages[npages++]);
+
+		mapped_bytes -= n;
+		mapped_offset = 0;
+	}
+
+	return -EINVAL;
 }
 
 static int
@@ -657,25 +734,80 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls)
 }
 
 static int
-vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
-		      struct iov_iter *iter,
-		      struct scatterlist *sg, int sg_count)
+vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+			   struct scatterlist *sg, int sg_count)
+{
+	size_t len = iov_iter_count(iter);
+	unsigned int nbytes = 0;
+	struct page *page;
+	int i;
+
+	if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
+		cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
+						GFP_KERNEL);
+		if (!cmd->saved_iter_addr)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < sg_count; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			i--;
+			goto err;
+		}
+
+		nbytes = min_t(unsigned int, PAGE_SIZE, len);
+		sg_set_page(&sg[i], page, nbytes, 0);
+
+		if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
+		    copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
+			goto err;
+
+		len -= nbytes;
+	}
+
+	cmd->copied_iov = 1;
+	return 0;
+
+err:
+	pr_err("Could not read %u bytes while handling misaligned cmd\n",
+	       nbytes);
+
+	for (; i >= 0; i--)
+		__free_page(sg_page(&sg[i]));
+	kfree(cmd->saved_iter_addr);
+	return -ENOMEM;
+}
+
+static int
+vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+		      struct scatterlist *sg, int sg_count, bool is_prot)
 {
 	struct scatterlist *p = sg;
+	size_t revert_bytes;
 	int ret;
 
 	while (iov_iter_count(iter)) {
-		ret = vhost_scsi_map_to_sgl(cmd, iter, sg, write);
+		ret = vhost_scsi_map_to_sgl(cmd, iter, sg, is_prot);
 		if (ret < 0) {
+			revert_bytes = 0;
+
 			while (p < sg) {
-				struct page *page = sg_page(p++);
-				if (page)
+				struct page *page = sg_page(p);
+
+				if (page) {
 					put_page(page);
+					revert_bytes += p->length;
+				}
+				p++;
 			}
+
+			iov_iter_revert(iter, revert_bytes);
 			return ret;
 		}
 		sg += ret;
 	}
+
 	return 0;
 }
 
@@ -685,7 +817,6 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 		 size_t data_bytes, struct iov_iter *data_iter)
 {
 	int sgl_count, ret;
-	bool write = (cmd->tvc_data_direction == DMA_FROM_DEVICE);
 
 	if (prot_bytes) {
 		sgl_count = vhost_scsi_calc_sgls(prot_iter, prot_bytes,
@@ -698,9 +829,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 		pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__,
 			 cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count);
 
-		ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
-					    cmd->tvc_prot_sgl,
-					    cmd->tvc_prot_sgl_count);
+		ret = vhost_scsi_iov_to_sgl(cmd, prot_iter, cmd->tvc_prot_sgl,
+					    cmd->tvc_prot_sgl_count, true);
 		if (ret < 0) {
 			cmd->tvc_prot_sgl_count = 0;
 			return ret;
@@ -716,8 +846,14 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 	pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
 		 cmd->tvc_sgl, cmd->tvc_sgl_count);
 
-	ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
-				    cmd->tvc_sgl, cmd->tvc_sgl_count);
+	ret = vhost_scsi_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+				    cmd->tvc_sgl_count, false);
+	if (ret == -EINVAL) {
+		sg_init_table(cmd->tvc_sgl, cmd->tvc_sgl_count);
+		ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+						 cmd->tvc_sgl_count);
+	}
+
 	if (ret < 0) {
 		cmd->tvc_sgl_count = 0;
 		return ret;
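For the sg layout shown in the commit message above, both 255 & (512 - 1)
and 3841 & (512 - 1) are non-zero, so such a command takes the new bounce
path. A standalone sketch of the alignment test the patch relies on (not
kernel code; the names and the simplified per-entry check are invented for
illustration):

#include <stdbool.h>
#include <stddef.h>

#define SKETCH_SECTOR_SIZE 512

struct sg_sketch {
	size_t offset;	/* offset into the page */
	size_t length;	/* bytes in this entry */
};

/* Decide whether a command must fall back to copying through temp pages:
 * misalignment only matters if the scatterlist is large enough that the
 * backend will split it into multiple bios.
 */
static bool needs_copy_path(const struct sg_sketch *sg, size_t nents,
			    size_t max_vecs_per_bio)
{
	if (nents <= max_vecs_per_bio)
		return false;	/* single bio, length checked as a whole */

	for (size_t i = 0; i < nents; i++) {
		if ((sg[i].offset & (SKETCH_SECTOR_SIZE - 1)) ||
		    (sg[i].length & (SKETCH_SECTOR_SIZE - 1)))
			return true;	/* a split here yields a non-512 bio */
	}
	return false;
}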
From c5ace19efb0ac884a9a417e2a1499ce9849bdaa5 Mon Sep 17 00:00:00 2001
From: Mike Christie
Date: Sun, 9 Jul 2023 15:28:58 -0500
Subject: [PATCH 06/32] vhost-scsi: Rename vhost_scsi_iov_to_sgl

Rename vhost_scsi_iov_to_sgl to vhost_scsi_map_iov_to_sgl so it matches
the naming style used for vhost_scsi_copy_iov_to_sgl.

Signed-off-by: Mike Christie
Message-Id: <20230709202859.138387-3-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Stefan Hajnoczi
---
 drivers/vhost/scsi.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 324e4b3846fa3..abef0619c7901 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -780,8 +780,8 @@ vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
 }
 
 static int
-vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
-		      struct scatterlist *sg, int sg_count, bool is_prot)
+vhost_scsi_map_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+			  struct scatterlist *sg, int sg_count, bool is_prot)
 {
 	struct scatterlist *p = sg;
 	size_t revert_bytes;
@@ -829,8 +829,9 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 		pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__,
 			 cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count);
 
-		ret = vhost_scsi_iov_to_sgl(cmd, prot_iter, cmd->tvc_prot_sgl,
-					    cmd->tvc_prot_sgl_count, true);
+		ret = vhost_scsi_map_iov_to_sgl(cmd, prot_iter,
+						cmd->tvc_prot_sgl,
+						cmd->tvc_prot_sgl_count, true);
 		if (ret < 0) {
 			cmd->tvc_prot_sgl_count = 0;
 			return ret;
@@ -846,8 +847,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 	pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
 		 cmd->tvc_sgl, cmd->tvc_sgl_count);
 
-	ret = vhost_scsi_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
-				    cmd->tvc_sgl_count, false);
+	ret = vhost_scsi_map_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+					cmd->tvc_sgl_count, false);
 	if (ret == -EINVAL) {
 		sg_init_table(cmd->tvc_sgl, cmd->tvc_sgl_count);
 		ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,

From 8d4bdf11f096e5b343ee0f9aaa8c262dc16d2e1e Mon Sep 17 00:00:00 2001
From: Mike Christie
Date: Sat, 15 Jul 2023 09:20:27 -0500
Subject: [PATCH 07/32] MAINTAINERS: add vhost-scsi entry and myself as a
 co-maintainer

I've been doing a lot of the development on vhost-scsi the last couple
of years, so per Michael T's suggestion this adds me as co-maintainer.

Signed-off-by: Mike Christie
Message-Id: <20230715142027.5572-1-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
Acked-by: Stefano Garzarella
Acked-by: Stefan Hajnoczi
---
 MAINTAINERS | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0f966f05fb0d4..63a31c3bf7d40 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -22476,7 +22476,6 @@ L:	virtualization@lists.linux-foundation.org
 S:	Maintained
 F:	drivers/block/virtio_blk.c
 F:	drivers/scsi/virtio_scsi.c
-F:	drivers/vhost/scsi.c
 F:	include/uapi/linux/virtio_blk.h
 F:	include/uapi/linux/virtio_scsi.h
 
@@ -22575,6 +22574,16 @@ F:	include/linux/vhost_iotlb.h
 F:	include/uapi/linux/vhost.h
 F:	kernel/vhost_task.c
 
+VIRTIO HOST (VHOST-SCSI)
+M:	"Michael S. Tsirkin"
+M:	Jason Wang
+M:	Mike Christie
+R:	Paolo Bonzini
+R:	Stefan Hajnoczi
+L:	virtualization@lists.linux-foundation.org
+S:	Maintained
+F:	drivers/vhost/scsi.c
+
 VIRTIO I2C DRIVER
 M:	Conghui Chen
 M:	Viresh Kumar

From 13f3efaca024e16ccfab0e8b2cf29d66489d8d54 Mon Sep 17 00:00:00 2001
From: Feng Liu
Date: Wed, 19 Jul 2023 11:45:50 -0400
Subject: [PATCH 08/32] virtio-pci: Fix legacy device flag setting error in
 probe

The 'is_legacy' flag is used to differentiate between legacy vs modern
device. Currently, it is based on the value of vp_dev->ldev.ioaddr.
However, due to the shared memory of the union between struct
virtio_pci_legacy_device and struct virtio_pci_modern_device, when
virtio_pci_modern_probe modifies the content of struct
virtio_pci_modern_device, it affects the content of struct
virtio_pci_legacy_device, and ldev.ioaddr is no longer zero, causing
the 'is_legacy' flag to be set as true.

To resolve the issue, when the legacy device is probed, mark
'is_legacy' as true; when the modern device is probed, keep 'is_legacy'
as false.

Fixes: 4f0fc22534e3 ("virtio_pci: Optimize virtio_pci_device structure size")
Signed-off-by: Feng Liu
Reviewed-by: Parav Pandit
Reviewed-by: Jiri Pirko
Message-Id: <20230719154550.79536-1-feliu@nvidia.com>
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Xuan Zhuo
Acked-by: Jason Wang
---
 drivers/virtio/virtio_pci_common.c | 2 --
 drivers/virtio/virtio_pci_legacy.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index a6c86f916dbdf..c2524a7207cfa 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -557,8 +557,6 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 
 	pci_set_master(pci_dev);
 
-	vp_dev->is_legacy = vp_dev->ldev.ioaddr ? true : false;
-
 	rc = register_virtio_device(&vp_dev->vdev);
 	reg_dev = vp_dev;
 	if (rc)
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index 2257f1b3d8ae1..d9cbb02b35a11 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -223,6 +223,7 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
 	vp_dev->config_vector = vp_config_vector;
 	vp_dev->setup_vq = setup_vq;
 	vp_dev->del_vq = del_vq;
+	vp_dev->is_legacy = true;
 
 	return 0;
 }
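A condensed sketch of the aliasing problem the patch above fixes
(illustrative only; field names are simplified stand-ins for the real
structs under drivers/virtio):

struct vp_dev_sketch {
	union {
		struct { unsigned long ioaddr; } ldev;	/* legacy probe state */
		struct { void *base; int bars; } mdev;	/* modern probe state */
	};
	bool is_legacy;
};

/* Because ldev and mdev share storage, a modern probe that writes
 * mdev.base also makes ldev.ioaddr non-zero as a side effect, so
 * "ldev.ioaddr != 0" can no longer mean "legacy was probed".  Setting
 * is_legacy explicitly in the legacy probe path removes the inference.
 */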
From 79c8651587504ba263d2fd67fd4406240fb21f69 Mon Sep 17 00:00:00 2001
From: Lin Ma
Date: Thu, 27 Jul 2023 20:57:48 +0300
Subject: [PATCH 09/32] vdpa: Add features attr to vdpa_nl_policy for nlattr
 length check

The vdpa_nl_policy structure is used to validate the nlattr when
parsing the incoming nlmsg. It will ensure the attribute being
described produces a valid nlattr pointer in info->attrs before
entering into each handler in vdpa_nl_ops.

That is to say, the missing part in vdpa_nl_policy may lead to illegal
nlattr after parsing, which could lead to OOB read just like
CVE-2023-3773.

This patch adds the missing nla_policy for vdpa features attr to avoid
such bugs.

Fixes: 90fea5a800c3 ("vdpa: device feature provisioning")
Signed-off-by: Lin Ma
Cc: stable@vger.kernel.org
Message-Id: <20230727175757.73988-3-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/vdpa/vdpa.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 965e32529eb85..3ad355a2208aa 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -1249,6 +1249,7 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR,
 	/* virtio spec 1.1 section 5.1.4.1 for valid MTU range */
 	[VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68),
+	[VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 },
 };
 
 static const struct genl_ops vdpa_nl_ops[] = {

From b3003e1b54e057f5f3124e437b80c3bef26ed3fe Mon Sep 17 00:00:00 2001
From: Lin Ma
Date: Thu, 27 Jul 2023 20:57:50 +0300
Subject: [PATCH 10/32] vdpa: Add queue index attr to vdpa_nl_policy for
 nlattr length check

The vdpa_nl_policy structure is used to validate the nlattr when
parsing the incoming nlmsg. It will ensure the attribute being
described produces a valid nlattr pointer in info->attrs before
entering into each handler in vdpa_nl_ops.

That is to say, the missing part in vdpa_nl_policy may lead to illegal
nlattr after parsing, which could lead to OOB read just like
CVE-2023-3773.

This patch adds the missing nla_policy for vdpa queue index attr to
avoid such bugs.

Fixes: 13b00b135665 ("vdpa: Add support for querying vendor statistics")
Signed-off-by: Lin Ma
Cc: stable@vger.kernel.org
Message-Id: <20230727175757.73988-5-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/vdpa/vdpa.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 3ad355a2208aa..75f1df2b9d2aa 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -1249,6 +1249,7 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR,
 	/* virtio spec 1.1 section 5.1.4.1 for valid MTU range */
 	[VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68),
+	[VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 },
 	[VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 },
 };
 

From 5d6ba607d6cb5c58a4ddf33381e18c83dbb4098f Mon Sep 17 00:00:00 2001
From: Lin Ma
Date: Thu, 27 Jul 2023 20:57:52 +0300
Subject: [PATCH 11/32] vdpa: Add max vqp attr to vdpa_nl_policy for nlattr
 length check

The vdpa_nl_policy structure is used to validate the nlattr when
parsing the incoming nlmsg. It will ensure the attribute being
described produces a valid nlattr pointer in info->attrs before
entering into each handler in vdpa_nl_ops.

That is to say, the missing part in vdpa_nl_policy may lead to illegal
nlattr after parsing, which could lead to OOB read just like
CVE-2023-3773.

This patch adds the missing nla_policy for vdpa max vqp attr to avoid
such bugs.

Fixes: ad69dd0bf26b ("vdpa: Introduce query of device config layout")
Signed-off-by: Lin Ma
Cc: stable@vger.kernel.org
Message-Id: <20230727175757.73988-7-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/vdpa/vdpa.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 75f1df2b9d2aa..f2f654fd84e5f 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -1247,6 +1247,7 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
 	[VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING },
 	[VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR,
+	[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 },
 	/* virtio spec 1.1 section 5.1.4.1 for valid MTU range */
 	[VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68),
 	[VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 },

From f46c1e1620c6bbc9aad5693082efd1b80822e97c Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Thu, 27 Jul 2023 20:57:54 +0300
Subject: [PATCH 12/32] vdpa: Enable strict validation for netlink ops

The previous patches added the missing nla policies that were required
for validation to work. Now strict validation on netlink ops can be
enabled. This patch does it.

Signed-off-by: Dragos Tatulea
Cc: stable@vger.kernel.org
Message-Id: <20230727175757.73988-9-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/vdpa/vdpa.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index f2f654fd84e5f..a7612e0783b36 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -1257,37 +1257,31 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 static const struct genl_ops vdpa_nl_ops[] = {
 	{
 		.cmd = VDPA_CMD_MGMTDEV_GET,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
 		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
 	},
 	{
 		.cmd = VDPA_CMD_DEV_NEW,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = vdpa_nl_cmd_dev_add_set_doit,
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = VDPA_CMD_DEV_DEL,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = vdpa_nl_cmd_dev_del_set_doit,
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = VDPA_CMD_DEV_GET,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = vdpa_nl_cmd_dev_get_doit,
 		.dumpit = vdpa_nl_cmd_dev_get_dumpit,
 	},
 	{
 		.cmd = VDPA_CMD_DEV_CONFIG_GET,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = vdpa_nl_cmd_dev_config_get_doit,
 		.dumpit = vdpa_nl_cmd_dev_config_get_dumpit,
 	},
 	{
 		.cmd = VDPA_CMD_DEV_VSTATS_GET,
-		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = vdpa_nl_cmd_dev_stats_get_doit,
 		.flags = GENL_ADMIN_PERM,
 	},
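A kernel-style sketch of the pattern this group of patches completes
(attribute names are invented for illustration). With strict validation
enabled, any attribute without a policy entry is rejected up front instead
of reaching a handler unchecked, where a shorter-than-declared payload
could be read out of bounds (the CVE-2023-3773 class of bug):

/* Hypothetical policy table; EXAMPLE_* names are not real vdpa symbols. */
static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_NAME]     = { .type = NLA_STRING },
	[EXAMPLE_ATTR_QUEUE]    = { .type = NLA_U32 },	/* length validated */
	[EXAMPLE_ATTR_FEATURES] = { .type = NLA_U64 },	/* length validated */
};
/* A missing entry means the parser cannot check the attribute's length,
 * so a handler doing nla_get_u64() on a 1-byte attribute reads OOB.
 */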
From 7ca26efb09a1543fddb29308ea3b63b66cb5d3ee Mon Sep 17 00:00:00 2001
From: Maxime Coquelin
Date: Wed, 5 Jul 2023 13:45:05 +0200
Subject: [PATCH 13/32] vduse: Use proper spinlock for IRQ injection

The IRQ injection work used spin_lock_irq() to protect the scheduling
of the softirq, but spin_lock_bh() should be used. With
spin_lock_irq(), we noticed delays of more than 6 seconds between the
time a NAPI polling work is scheduled and the time it is executed.

Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace")
Cc: xieyongji@bytedance.com
Suggested-by: Jason Wang
Signed-off-by: Maxime Coquelin
Message-Id: <20230705114505.63274-1-maxime.coquelin@redhat.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
Reviewed-by: Xie Yongji
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index dc38ed21319da..df7869537ef14 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -935,10 +935,10 @@ static void vduse_dev_irq_inject(struct work_struct *work)
 {
 	struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
 
-	spin_lock_irq(&dev->irq_lock);
+	spin_lock_bh(&dev->irq_lock);
 	if (dev->config_cb.callback)
 		dev->config_cb.callback(dev->config_cb.private);
-	spin_unlock_irq(&dev->irq_lock);
+	spin_unlock_bh(&dev->irq_lock);
 }
 
 static void vduse_vq_irq_inject(struct work_struct *work)
@@ -946,10 +946,10 @@ static void vduse_vq_irq_inject(struct work_struct *work)
 	struct vduse_virtqueue *vq = container_of(work,
 					struct vduse_virtqueue, inject);
 
-	spin_lock_irq(&vq->irq_lock);
+	spin_lock_bh(&vq->irq_lock);
 	if (vq->ready && vq->cb.callback)
 		vq->cb.callback(vq->cb.private);
-	spin_unlock_irq(&vq->irq_lock);
+	spin_unlock_bh(&vq->irq_lock);
 }
 
 static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
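One plausible reading of the delay described above, sketched in comments
(this is an interpretation, not text from the patch): the vq callback
raises a softirq (e.g. scheduling NAPI), and the two lock flavors differ
in when that softirq can run.

/* spin_lock_bh() only disables softirq processing on this CPU, which is
 * enough here because irq_lock is never taken from hardirq context; and,
 * crucially, spin_unlock_bh() runs any softirqs raised inside the
 * critical section immediately on unlock.
 *
 * With spin_lock_irq()/spin_unlock_irq(), a softirq raised while
 * hardirqs were disabled in process context is instead deferred (e.g. to
 * ksoftirqd), which can account for multi-second gaps between scheduling
 * a NAPI poll and executing it.
 */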
From df9557046440b0a62250fee3169a8f6a139f55a6 Mon Sep 17 00:00:00 2001
From: Gal Pressman
Date: Wed, 26 Jul 2023 22:10:07 +0300
Subject: [PATCH 14/32] virtio-vdpa: Fix cpumask memory leak in
 virtio_vdpa_find_vqs()

Free the cpumask allocated by create_affinity_masks() before returning
from the function.

Fixes: 3dad56823b53 ("virtio-vdpa: Support interrupt affinity spreading mechanism")
Signed-off-by: Gal Pressman
Reviewed-by: Dragos Tatulea
Message-Id: <20230726191036.14324-1-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
Reviewed-by: Xie Yongji
---
 drivers/virtio/virtio_vdpa.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 989e2d7184ce4..961161da59000 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -393,11 +393,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 	cb.callback = virtio_vdpa_config_cb;
 	cb.private = vd_dev;
 	ops->set_config_cb(vdpa, &cb);
+	kfree(masks);
 
 	return 0;
 
 err_setup_vq:
 	virtio_vdpa_del_vqs(vdev);
+	kfree(masks);
 	return err;
 }

From 3fe024193340b225d1fd410d78c495434a9d68e0 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Thu, 27 Jul 2023 20:23:46 +0300
Subject: [PATCH 15/32] vdpa/mlx5: Correct default number of queues when MQ is
 on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The standard specifies that the initial number of queues is the
default, which is 1 (1 tx, 1 rx).

Signed-off-by: Dragos Tatulea
Reviewed-by: Eugenio Pérez
Message-Id: <20230727172354.68243-2-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
Tested-by: Lei Yang
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 9138ef2fb2c85..6b6eb69a8a906 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2517,7 +2517,15 @@ static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
 	else
 		ndev->rqt_size = 1;
 
-	ndev->cur_num_vqs = 2 * ndev->rqt_size;
+	/* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
+	 * 5.1.6.5.5 "Device operation in multiqueue mode":
+	 *
+	 * Multiqueue is disabled by default.
+	 * The driver enables multiqueue by sending a command using class
+	 * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
+	 * operation, as follows: ...
+	 */
+	ndev->cur_num_vqs = 2;
 
 	update_cvq_info(mvdev);
 	return err;

From 9ee811009ad8f87982b69e61d07447d12233ad01 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Wed, 2 Aug 2023 20:12:18 +0300
Subject: [PATCH 16/32] vdpa/mlx5: Fix mr->initialized semantics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mr->initialized flag is shared between the control vq and data vq
part of the mr init/uninit. But if the control vq and data vq get
placed in different ASIDs, it can happen that initializing the control
vq will prevent the data vq mr from being initialized.

This patch consolidates the control and data vq init parts into their
own init functions. The mr->initialized will now be used for the data
vq only. The control vq currently doesn't need a flag.

The uninitializing part is also taken care of: mlx5_vdpa_destroy_mr
got split into data and control vq functions which are now also ASID
aware.

Fixes: 8fcd20c30704 ("vdpa/mlx5: Support different address spaces for control and data")
Signed-off-by: Dragos Tatulea
Reviewed-by: Eugenio Pérez
Reviewed-by: Gal Pressman
Message-Id: <20230802171231.11001-3-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  1 +
 drivers/vdpa/mlx5/core/mr.c        | 97 +++++++++++++++++++++---------
 2 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 25fc4120b618d..a0420be5059f4 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -31,6 +31,7 @@ struct mlx5_vdpa_mr {
 	struct list_head head;
 	unsigned long num_directs;
 	unsigned long num_klms;
+	/* state of dvq mr */
 	bool initialized;
 
 	/* serialize mkey creation and destruction */
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 03e5432297912..4ae14a248a4bc 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -489,60 +489,103 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr
 	}
 }
 
-void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
+static void _mlx5_vdpa_destroy_cvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+	if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid)
+		return;
+
+	prune_iotlb(mvdev);
+}
+
+static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
 {
 	struct mlx5_vdpa_mr *mr = &mvdev->mr;
 
-	mutex_lock(&mr->mkey_mtx);
+	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid)
+		return;
+
 	if (!mr->initialized)
-		goto out;
+		return;
 
-	prune_iotlb(mvdev);
 	if (mr->user_mr)
 		destroy_user_mr(mvdev, mr);
 	else
 		destroy_dma_mr(mvdev, mr);
 
 	mr->initialized = false;
-out:
+}
+
+static void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+{
+	struct mlx5_vdpa_mr *mr = &mvdev->mr;
+
+	mutex_lock(&mr->mkey_mtx);
+
+	_mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
+	_mlx5_vdpa_destroy_cvq_mr(mvdev, asid);
+
 	mutex_unlock(&mr->mkey_mtx);
 }
 
-static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
-				struct vhost_iotlb *iotlb, unsigned int asid)
+void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
+{
+	mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]);
+	mlx5_vdpa_destroy_mr_asid(mvdev, mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]);
+}
+
+static int _mlx5_vdpa_create_cvq_mr(struct mlx5_vdpa_dev *mvdev,
+				    struct vhost_iotlb *iotlb,
+				    unsigned int asid)
+{
+	if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid)
+		return 0;
+
+	return dup_iotlb(mvdev, iotlb);
+}
+
+static int _mlx5_vdpa_create_dvq_mr(struct mlx5_vdpa_dev *mvdev,
+				    struct vhost_iotlb *iotlb,
+				    unsigned int asid)
 {
 	struct mlx5_vdpa_mr *mr = &mvdev->mr;
 	int err;
 
-	if (mr->initialized)
+	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid)
 		return 0;
 
-	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
-		if (iotlb)
-			err = create_user_mr(mvdev, iotlb);
-		else
-			err = create_dma_mr(mvdev, mr);
+	if (mr->initialized)
+		return 0;
 
-		if (err)
-			return err;
-	}
+	if (iotlb)
+		err = create_user_mr(mvdev, iotlb);
+	else
+		err = create_dma_mr(mvdev, mr);
 
-	if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) {
-		err = dup_iotlb(mvdev, iotlb);
-		if (err)
-			goto out_err;
-	}
+	if (err)
+		return err;
 
 	mr->initialized = true;
+
+	return 0;
+}
+
+static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
+				struct vhost_iotlb *iotlb, unsigned int asid)
+{
+	int err;
+
+	err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb, asid);
+	if (err)
+		return err;
+
+	err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb, asid);
+	if (err)
+		goto out_err;
+
 	return 0;
 
 out_err:
-	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
-		if (iotlb)
-			destroy_user_mr(mvdev, mr);
-		else
-			destroy_dma_mr(mvdev, mr);
-	}
+	_mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
 
 	return err;
 }

From ad03a0f44cdb97b46e5c84ed353dac9b8ae2c276 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eugenio=20P=C3=A9rez?=
Date: Wed, 2 Aug 2023 20:12:20 +0300
Subject: [PATCH 17/32] vdpa/mlx5: Delete control vq iotlb in destroy_mr only
 when necessary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mlx5_vdpa_destroy_mr can be called from .set_map with data ASID after
the control virtqueue ASID iotlb has been populated. The control vq
iotlb must not be cleared, since it will not be populated again.

So call the ASID aware destroy function which makes sure that the
right vq resource is destroyed.

Fixes: 8fcd20c30704 ("vdpa/mlx5: Support different address spaces for control and data")
Signed-off-by: Eugenio Pérez
Reviewed-by: Gal Pressman
Message-Id: <20230802171231.11001-5-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
 drivers/vdpa/mlx5/core/mr.c        | 2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index a0420be5059f4..b53420e874acb 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -122,6 +122,7 @@ int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *io
 int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
 			unsigned int asid);
 void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid);
 
 #define mlx5_vdpa_warn(__dev, format, ...) \
 	dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__, \
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 4ae14a248a4bc..5a1971fcd87b1 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -515,7 +515,7 @@ static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int
 	mr->initialized = false;
 }
 
-static void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
+void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid)
 {
 	struct mlx5_vdpa_mr *mr = &mvdev->mr;
 
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 6b6eb69a8a906..dbbc82eec7de5 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2644,7 +2644,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 		goto err_mr;
 
 	teardown_driver(ndev);
-	mlx5_vdpa_destroy_mr(mvdev);
+	mlx5_vdpa_destroy_mr_asid(mvdev, asid);
 	err = mlx5_vdpa_create_mr(mvdev, iotlb, asid);
 	if (err)
 		goto err_mr;
@@ -2660,7 +2660,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 	return 0;
 
 err_setup:
-	mlx5_vdpa_destroy_mr(mvdev);
+	mlx5_vdpa_destroy_mr_asid(mvdev, asid);
 err_mr:
 	return err;
 }
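A rough sketch of the .set_map flow after these two mlx5 patches (the
surrounding driver plumbing is omitted; only the two calls shown are the
real functions from the diffs above). Only the resources belonging to the
ASID being remapped are torn down, so a data-VQ remap can no longer wipe
the already-populated control-VQ iotlb:

/* Simplified illustration, assuming the caller holds the needed locks. */
static int set_map_sketch(struct mlx5_vdpa_dev *mvdev,
			  struct vhost_iotlb *iotlb, unsigned int asid)
{
	/* Tears down only this ASID's mr/iotlb state. */
	mlx5_vdpa_destroy_mr_asid(mvdev, asid);

	/* Re-creates only the dvq or cvq side matching this ASID. */
	return mlx5_vdpa_create_mr(mvdev, iotlb, asid);
}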
From 810b0cc1c28a9b8d055dd8f7d85975e3cf9f4430 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Thu, 3 Aug 2023 18:26:33 +0300
Subject: [PATCH 18/32] vdpa/mlx5: Fix crash on shutdown for when no ndev
 exists

The ndev was accessed on shutdown without a check if it actually
exists. This triggered the crash pasted below. Instead of doing the
ndev check, delete the shutdown handler altogether. The irqs will be
released at the parent VF level (mlx5_core).

BUG: kernel NULL pointer dereference, address: 0000000000000300
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] SMP
CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 6.5.0-rc2_for_upstream_min_debug_2023_07_17_15_05 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:mlx5v_shutdown+0xe/0x50 [mlx5_vdpa]
RSP: 0018:ffff8881003bfdc0 EFLAGS: 00010286
RAX: ffff888103befba0 RBX: ffff888109d28008 RCX: 0000000000000017
RDX: 0000000000000001 RSI: 0000000000000212 RDI: ffff888109d28000
RBP: 0000000000000000 R08: 0000000d3a3a3882 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffff888109d28000
R13: ffff888109d28080 R14: 00000000fee1dead R15: 0000000000000000
FS:  00007f4969e0be40(0000) GS:ffff88852c800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000300 CR3: 00000001051cd006 CR4: 0000000000370eb0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 ? __die+0x20/0x60
 ? page_fault_oops+0x14c/0x3c0
 ? exc_page_fault+0x75/0x140
 ? asm_exc_page_fault+0x22/0x30
 ? mlx5v_shutdown+0xe/0x50 [mlx5_vdpa]
 device_shutdown+0x13e/0x1e0
 kernel_restart+0x36/0x90
 __do_sys_reboot+0x141/0x210
 ? vfs_writev+0xcd/0x140
 ? handle_mm_fault+0x161/0x260
 ? do_writev+0x6b/0x110
 do_syscall_64+0x3d/0x90
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7f496990fb56
RSP: 002b:00007fffc7bdde88 EFLAGS: 00000206 ORIG_RAX: 00000000000000a9
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f496990fb56
RDX: 0000000001234567 RSI: 0000000028121969 RDI: fffffffffee1dead
RBP: 00007fffc7bde1d0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
R13: 00007fffc7bddf10 R14: 0000000000000000 R15: 00007fffc7bde2b8
CR2: 0000000000000300
---[ end trace 0000000000000000 ]---

Fixes: bc9a2b3e686e ("vdpa/mlx5: Support interrupt bypassing")
Signed-off-by: Dragos Tatulea
Message-Id: <20230803152648.199297-1-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index dbbc82eec7de5..37be945a02308 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3556,17 +3556,6 @@ static void mlx5v_remove(struct auxiliary_device *adev)
 	kfree(mgtdev);
 }
 
-static void mlx5v_shutdown(struct auxiliary_device *auxdev)
-{
-	struct mlx5_vdpa_mgmtdev *mgtdev;
-	struct mlx5_vdpa_net *ndev;
-
-	mgtdev = auxiliary_get_drvdata(auxdev);
-	ndev = mgtdev->ndev;
-
-	free_irqs(ndev);
-}
-
 static const struct auxiliary_device_id mlx5v_id_table[] = {
 	{ .name = MLX5_ADEV_NAME ".vnet", },
 	{},
@@ -3578,7 +3567,6 @@ static struct auxiliary_driver mlx5v_driver = {
 	.name = "vnet",
 	.probe = mlx5v_probe,
 	.remove = mlx5v_remove,
-	.shutdown = mlx5v_shutdown,
 	.id_table = mlx5v_id_table,
 };

From 2c507ce90e02cd78d00fd4b0fe26c8641873c13f Mon Sep 17 00:00:00 2001
From: Hawkins Jiawei
Date: Thu, 10 Aug 2023 19:04:05 +0800
Subject: [PATCH 19/32] virtio-net: Zero max_tx_vq field for
 VIRTIO_NET_CTRL_MQ_HASH_CONFIG case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kernel uses `struct virtio_net_ctrl_rss` to save command-specific-data
for both the VIRTIO_NET_CTRL_MQ_HASH_CONFIG and
VIRTIO_NET_CTRL_MQ_RSS_CONFIG commands. According to the VirtIO
standard, "Field reserved MUST contain zeroes. It is defined to make
the structure to match the layout of virtio_net_rss_config structure,
defined in 5.1.6.5.7.".

Yet for the VIRTIO_NET_CTRL_MQ_HASH_CONFIG command case, the
`max_tx_vq` field in struct virtio_net_ctrl_rss, which corresponds to
the `reserved` field in struct virtio_net_hash_config, is not zeroed,
thereby violating the VirtIO standard.

This patch solves this problem by zeroing this field in
virtnet_init_default_rss().

Cc: Andrew Melnychenko
Cc: stable@vger.kernel.org
Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.")
Signed-off-by: Hawkins Jiawei
Acked-by: Jason Wang
Acked-by: Eugenio Pérez
Acked-by: Michael S. Tsirkin
Message-Id: <20230810110405.25558-1-yin31149@gmail.com>
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Xuan Zhuo
Acked-by: Jason Wang
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1270c8d23463f..8db38634ae82d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2761,7 +2761,7 @@ static void virtnet_init_default_rss(struct virtnet_info *vi)
 			vi->ctrl->rss.indirection_table[i] = indir_val;
 	}
 
-	vi->ctrl->rss.max_tx_vq = vi->curr_queue_pairs;
+	vi->ctrl->rss.max_tx_vq = vi->has_rss ? vi->curr_queue_pairs : 0;
 	vi->ctrl->rss.hash_key_length = vi->rss_key_size;
 
 	netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size);
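A layout sketch of the overlap the commit message describes, as I read
section 5.1.6.5.7 of the VirtIO spec (field widths are from the spec; the
struct name here carries a _sketch suffix because it is a paraphrase, not
the kernel's definition):

struct virtio_net_hash_config_sketch {
	__le32 hash_types;
	/* Overlays indirection_table_mask, unclassified_queue,
	 * indirection_table[1] and max_tx_vq of virtio_net_rss_config;
	 * the spec says these reserved words MUST contain zeroes.
	 */
	__le16 reserved[4];
	__u8 hash_key_length;
	__u8 hash_key_data[];	/* hash_key_length bytes */
};

Reusing one kernel-side struct for both commands therefore requires
zeroing the max_tx_vq slot whenever only hashing is configured, which is
exactly what the one-line change above does.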
From 0cd2c13b1c15dbbdf1e2ae5b7160537f97df06b5 Mon Sep 17 00:00:00 2001
From: Allen Hubbe
Date: Mon, 10 Jul 2023 21:24:33 -0700
Subject: [PATCH 20/32] pds_vdpa: reset to vdpa specified mac

When the vdpa device is reset, also reinitialize it with the mac
address that was assigned when the device was added.

Fixes: 151cc834f3dd ("pds_vdpa: add support for vdpa and vdpamgmt interfaces")
Signed-off-by: Allen Hubbe
Signed-off-by: Shannon Nelson
Message-Id: <20230711042437.69381-2-shannon.nelson@amd.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
---
 drivers/vdpa/pds/vdpa_dev.c | 16 ++++++++--------
 drivers/vdpa/pds/vdpa_dev.h |  1 +
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index 5071a4d58f8db..e2e99bb0be2b0 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -409,6 +409,8 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
 			pdsv->vqs[i].avail_idx = 0;
 			pdsv->vqs[i].used_idx = 0;
 		}
+
+		pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
 	}
 
 	if (status & ~old_status & VIRTIO_CONFIG_S_FEATURES_OK) {
@@ -532,7 +534,6 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 	struct device *dma_dev;
 	struct pci_dev *pdev;
 	struct device *dev;
-	u8 mac[ETH_ALEN];
 	int err;
 	int i;
 
@@ -617,19 +618,18 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 	 * or set a random mac if default is 00:..:00
 	 */
 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
-		ether_addr_copy(mac, add_config->net.mac);
-		pds_vdpa_cmd_set_mac(pdsv, mac);
+		ether_addr_copy(pdsv->mac, add_config->net.mac);
 	} else {
 		struct virtio_net_config __iomem *vc;
 
 		vc = pdsv->vdpa_aux->vd_mdev.device;
-		memcpy_fromio(mac, vc->mac, sizeof(mac));
-		if (is_zero_ether_addr(mac)) {
-			eth_random_addr(mac);
-			dev_info(dev, "setting random mac %pM\n", mac);
-			pds_vdpa_cmd_set_mac(pdsv, mac);
+		memcpy_fromio(pdsv->mac, vc->mac, sizeof(pdsv->mac));
+		if (is_zero_ether_addr(pdsv->mac)) {
+			eth_random_addr(pdsv->mac);
+			dev_info(dev, "setting random mac %pM\n", pdsv->mac);
 		}
 	}
+	pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
 
 	for (i = 0; i < pdsv->num_vqs; i++) {
 		pdsv->vqs[i].qid = i;
diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h
index a1bc37de95374..cf02df287fc40 100644
--- a/drivers/vdpa/pds/vdpa_dev.h
+++ b/drivers/vdpa/pds/vdpa_dev.h
@@ -39,6 +39,7 @@ struct pds_vdpa_device {
 	u64 req_features;		/* features requested by vdpa */
 	u8 vdpa_index;			/* rsvd for future subdevice use */
 	u8 num_vqs;			/* num vqs in use */
+	u8 mac[ETH_ALEN];		/* mac selected when the device was added */
 	struct vdpa_callback config_cb;
 	struct notifier_block nb;
 };

From abdf31bd91120035172dc58e2e87064a72e9e087 Mon Sep 17 00:00:00 2001
From: Shannon Nelson
Date: Mon, 10 Jul 2023 21:24:34 -0700
Subject: [PATCH 21/32] pds_vdpa: always allow offering VIRTIO_NET_F_MAC

Our driver sets a mac if the HW is 00:..:00 so we need to be sure to
advertise VIRTIO_NET_F_MAC even if the HW doesn't. We also need to be
sure that virtio_net sees the VIRTIO_NET_F_MAC and doesn't rewrite the
mac address that a user may have set with the vdpa utility.

After reading the hw_feature bits, add the VIRTIO_NET_F_MAC to the
driver's supported_features and use that for reporting what is
available. If the HW is not advertising it, be sure to strip the
VIRTIO_NET_F_MAC before finishing the feature negotiation. If the user
specifies a device_features bitpattern in the vdpa utility without the
VIRTIO_NET_F_MAC set, then don't set the mac.

Fixes: 151cc834f3dd ("pds_vdpa: add support for vdpa and vdpamgmt interfaces")
Signed-off-by: Shannon Nelson
Message-Id: <20230711042437.69381-3-shannon.nelson@amd.com>
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
---
 drivers/vdpa/pds/debugfs.c  |  2 --
 drivers/vdpa/pds/vdpa_dev.c | 30 +++++++++++++++++++++---------
 drivers/vdpa/pds/vdpa_dev.h |  4 ++--
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c
index 21a0dc0cb607c..754ccb7a66661 100644
--- a/drivers/vdpa/pds/debugfs.c
+++ b/drivers/vdpa/pds/debugfs.c
@@ -224,8 +224,6 @@ static int config_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "dev_status:       %#x\n", status);
 	print_status_bits(seq, status);
 
-	seq_printf(seq, "req_features:     %#llx\n", pdsv->req_features);
-	print_feature_bits_all(seq, pdsv->req_features);
 	driver_features = vp_modern_get_driver_features(&pdsv->vdpa_aux->vd_mdev);
 	seq_printf(seq, "driver_features:  %#llx\n", driver_features);
 	print_feature_bits_all(seq, driver_features);
diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index e2e99bb0be2b0..5b566e0eef0a0 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -318,6 +318,7 @@ static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 featur
 	struct device *dev = &pdsv->vdpa_dev.dev;
 	u64 driver_features;
 	u64 nego_features;
+	u64 hw_features;
 	u64 missing;
 
 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) {
@@ -325,21 +326,26 @@ static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 featur
 		return -EOPNOTSUPP;
 	}
 
-	pdsv->req_features = features;
-
 	/* Check for valid feature bits */
-	nego_features = features & le64_to_cpu(pdsv->vdpa_aux->ident.hw_features);
-	missing = pdsv->req_features & ~nego_features;
+	nego_features = features & pdsv->supported_features;
+	missing = features & ~nego_features;
 	if (missing) {
 		dev_err(dev, "Can't support all requested features in %#llx, missing %#llx features\n",
-			pdsv->req_features, missing);
+			features, missing);
 		return -EOPNOTSUPP;
 	}
 
+	pdsv->negotiated_features = nego_features;
+
 	driver_features = pds_vdpa_get_driver_features(vdpa_dev);
 	dev_dbg(dev, "%s: %#llx => %#llx\n",
 		__func__, driver_features, nego_features);
 
+	/* if we're faking the F_MAC, strip it before writing to device */
+	hw_features = le64_to_cpu(pdsv->vdpa_aux->ident.hw_features);
+	if (!(hw_features & BIT_ULL(VIRTIO_NET_F_MAC)))
+		nego_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
+
 	if (driver_features == nego_features)
 		return 0;
 
@@ -352,7 +358,7 @@ static u64 pds_vdpa_get_driver_features(struct vdpa_device *vdpa_dev)
 {
 	struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
 
-	return vp_modern_get_driver_features(&pdsv->vdpa_aux->vd_mdev);
+	return pdsv->negotiated_features;
 }
 
 static void pds_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
@@ -564,7 +570,7 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 
 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
 		u64 unsupp_features =
-			add_config->device_features & ~mgmt->supported_features;
+			add_config->device_features & ~pdsv->supported_features;
 
 		if (unsupp_features) {
 			dev_err(dev, "Unsupported features: %#llx\n", unsupp_features);
@@ -615,7 +621,8 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 	}
 
 	/* Set a mac, either from the user config if provided
-	 * or set a random mac if default is 00:..:00
+	 * or use the device's mac if not 00:..:00
+	 * or set a random mac
 	 */
 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
 		ether_addr_copy(pdsv->mac, add_config->net.mac);
 	} else {
 		struct virtio_net_config __iomem *vc;
 
 		vc = pdsv->vdpa_aux->vd_mdev.device;
 		memcpy_fromio(pdsv->mac, vc->mac, sizeof(pdsv->mac));
-		if (is_zero_ether_addr(pdsv->mac)) {
+		if (is_zero_ether_addr(pdsv->mac) &&
+		    (pdsv->supported_features & BIT_ULL(VIRTIO_NET_F_MAC))) {
 			eth_random_addr(pdsv->mac);
 			dev_info(dev, "setting random mac %pM\n", pdsv->mac);
 		}
@@ -752,6 +760,10 @@ int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux)
 	mgmt->id_table = pds_vdpa_id_table;
 	mgmt->device = dev;
 	mgmt->supported_features = le64_to_cpu(vdpa_aux->ident.hw_features);
+
+	/* advertise F_MAC even if the device doesn't */
+	mgmt->supported_features |= BIT_ULL(VIRTIO_NET_F_MAC);
+
 	mgmt->config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR);
 	mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP);
 	mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES);
diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h
index cf02df287fc40..d984ba24a7dae 100644
--- a/drivers/vdpa/pds/vdpa_dev.h
+++ b/drivers/vdpa/pds/vdpa_dev.h
@@ -35,8 +35,8 @@ struct pds_vdpa_device {
 	struct pds_vdpa_aux *vdpa_aux;
 
 	struct pds_vdpa_vq_info vqs[PDS_VDPA_MAX_QUEUES];
-	u64 supported_features;		/* specified device features */
-	u64 req_features;		/* features requested by vdpa */
+	u64 supported_features;		/* supported device features */
+	u64 negotiated_features;	/* negotiated features */
 	u8 vdpa_index;			/* rsvd for future subdevice use */
 	u8 num_vqs;			/* num vqs in use */
 	u8 mac[ETH_ALEN];		/* mac selected when the device was added */
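A condensed sketch of the negotiation flow the patch above sets up
(variable names are invented; only the masking logic is illustrated):

/* Assumed inputs for illustration: hw_features as read from the device,
 * requested as passed in by the vdpa framework.
 */
u64 supported = hw_features | BIT_ULL(VIRTIO_NET_F_MAC);	/* advertised */
u64 nego = requested & supported;				/* negotiated */
u64 to_hw = nego;

/* The faked F_MAC bit stays host-side: strip it before writing features
 * to hardware that never offered it.
 */
if (!(hw_features & BIT_ULL(VIRTIO_NET_F_MAC)))
	to_hw &= ~BIT_ULL(VIRTIO_NET_F_MAC);

This way virtio_net always sees F_MAC (so it won't overwrite a
user-assigned MAC), while the device is only told about bits it really
supports.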
Acked-by: Jason Wang
---
 drivers/vdpa/pds/vdpa_dev.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index 5b566e0eef0a0..04a362648b020 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -428,6 +428,17 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
 	}
 }
 
+static void pds_vdpa_init_vqs_entry(struct pds_vdpa_device *pdsv, int qid,
+				    void __iomem *notify)
+{
+	memset(&pdsv->vqs[qid], 0, sizeof(pdsv->vqs[0]));
+	pdsv->vqs[qid].qid = qid;
+	pdsv->vqs[qid].pdsv = pdsv;
+	pdsv->vqs[qid].ready = false;
+	pdsv->vqs[qid].irq = VIRTIO_MSI_NO_VECTOR;
+	pdsv->vqs[qid].notify = notify;
+}
+
 static int pds_vdpa_reset(struct vdpa_device *vdpa_dev)
@@ -450,8 +461,7 @@ static int pds_vdpa_reset(struct vdpa_device *vdpa_dev)
 				dev_err(dev, "%s: reset_vq failed qid %d: %pe\n",
 					__func__, i, ERR_PTR(err));
 			pds_vdpa_release_irq(pdsv, i);
-			memset(&pdsv->vqs[i], 0, sizeof(pdsv->vqs[0]));
-			pdsv->vqs[i].ready = false;
+			pds_vdpa_init_vqs_entry(pdsv, i, pdsv->vqs[i].notify);
 		}
 	}
 
@@ -640,11 +650,11 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 	pds_vdpa_cmd_set_mac(pdsv, pdsv->mac);
 
 	for (i = 0; i < pdsv->num_vqs; i++) {
-		pdsv->vqs[i].qid = i;
-		pdsv->vqs[i].pdsv = pdsv;
-		pdsv->vqs[i].irq = VIRTIO_MSI_NO_VECTOR;
-		pdsv->vqs[i].notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev,
-							      i, &pdsv->vqs[i].notify_pa);
+		void __iomem *notify;
+
+		notify = vp_modern_map_vq_notify(&pdsv->vdpa_aux->vd_mdev,
+						 i, &pdsv->vqs[i].notify_pa);
+		pds_vdpa_init_vqs_entry(pdsv, i, notify);
 	}
 
 	pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev;

From c0a6c5cbf1a9e49357e942ed393da08a55808a49 Mon Sep 17 00:00:00 2001
From: Allen Hubbe
Date: Mon, 10 Jul 2023 21:24:36 -0700
Subject: [PATCH 23/32] pds_vdpa: alloc irq vectors on DRIVER_OK

We were allocating irq vectors at the time the aux dev was probed, but
that is before the PCI VF is assigned to a separate iommu domain by
vhost_vdpa. Because vhost_vdpa later changes the iommu domain, the
interrupts do not work.

Instead, we can allocate the irq vectors later when we see DRIVER_OK
and know that the reassignment of the PCI VF to an iommu domain has
already happened.

Fixes: 151cc834f3dd ("pds_vdpa: add support for vdpa and vdpamgmt interfaces")
Signed-off-by: Allen Hubbe
Signed-off-by: Shannon Nelson
Acked-by: Jason Wang
Message-Id: <20230711042437.69381-5-shannon.nelson@amd.com>
Signed-off-by: Michael S. Tsirkin
Tsirkin --- drivers/vdpa/pds/vdpa_dev.c | 110 ++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 29 deletions(-) diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 04a362648b020..52b2449182ad7 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -126,11 +126,9 @@ static void pds_vdpa_release_irq(struct pds_vdpa_device *pdsv, int qid) static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready) { struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); - struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev; struct device *dev = &pdsv->vdpa_dev.dev; u64 driver_features; u16 invert_idx = 0; - int irq; int err; dev_dbg(dev, "%s: qid %d ready %d => %d\n", @@ -143,19 +141,6 @@ static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool re invert_idx = PDS_VDPA_PACKED_INVERT_IDX; if (ready) { - irq = pci_irq_vector(pdev, qid); - snprintf(pdsv->vqs[qid].irq_name, sizeof(pdsv->vqs[qid].irq_name), - "vdpa-%s-%d", dev_name(dev), qid); - - err = request_irq(irq, pds_vdpa_isr, 0, - pdsv->vqs[qid].irq_name, &pdsv->vqs[qid]); - if (err) { - dev_err(dev, "%s: no irq for qid %d: %pe\n", - __func__, qid, ERR_PTR(err)); - return; - } - pdsv->vqs[qid].irq = irq; - /* Pass vq setup info to DSC using adminq to gather up and * send all info at once so FW can do its full set up in * one easy operation @@ -164,7 +149,6 @@ static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool re if (err) { dev_err(dev, "Failed to init vq %d: %pe\n", qid, ERR_PTR(err)); - pds_vdpa_release_irq(pdsv, qid); ready = false; } } else { @@ -172,7 +156,6 @@ static void pds_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool re if (err) dev_err(dev, "%s: reset_vq failed qid %d: %pe\n", __func__, qid, ERR_PTR(err)); - pds_vdpa_release_irq(pdsv, qid); } pdsv->vqs[qid].ready = ready; @@ -395,6 +378,72 @@ static u8 pds_vdpa_get_status(struct vdpa_device *vdpa_dev) return vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev); } +static int pds_vdpa_request_irqs(struct pds_vdpa_device *pdsv) +{ + struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev; + struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux; + struct device *dev = &pdsv->vdpa_dev.dev; + int max_vq, nintrs, qid, err; + + max_vq = vdpa_aux->vdpa_mdev.max_supported_vqs; + + nintrs = pci_alloc_irq_vectors(pdev, max_vq, max_vq, PCI_IRQ_MSIX); + if (nintrs < 0) { + dev_err(dev, "Couldn't get %d msix vectors: %pe\n", + max_vq, ERR_PTR(nintrs)); + return nintrs; + } + + for (qid = 0; qid < pdsv->num_vqs; ++qid) { + int irq = pci_irq_vector(pdev, qid); + + snprintf(pdsv->vqs[qid].irq_name, sizeof(pdsv->vqs[qid].irq_name), + "vdpa-%s-%d", dev_name(dev), qid); + + err = request_irq(irq, pds_vdpa_isr, 0, + pdsv->vqs[qid].irq_name, + &pdsv->vqs[qid]); + if (err) { + dev_err(dev, "%s: no irq for qid %d: %pe\n", + __func__, qid, ERR_PTR(err)); + goto err_release; + } + + pdsv->vqs[qid].irq = irq; + } + + vdpa_aux->nintrs = nintrs; + + return 0; + +err_release: + while (qid--) + pds_vdpa_release_irq(pdsv, qid); + + pci_free_irq_vectors(pdev); + + vdpa_aux->nintrs = 0; + + return err; +} + +static void pds_vdpa_release_irqs(struct pds_vdpa_device *pdsv) +{ + struct pci_dev *pdev = pdsv->vdpa_aux->padev->vf_pdev; + struct pds_vdpa_aux *vdpa_aux = pdsv->vdpa_aux; + int qid; + + if (!vdpa_aux->nintrs) + return; + + for (qid = 0; qid < pdsv->num_vqs; qid++) + pds_vdpa_release_irq(pdsv, qid); + + pci_free_irq_vectors(pdev); + + vdpa_aux->nintrs = 0; +} + static void 
pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) { struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); @@ -405,6 +454,11 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) old_status = pds_vdpa_get_status(vdpa_dev); dev_dbg(dev, "%s: old %#x new %#x\n", __func__, old_status, status); + if (status & ~old_status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (pds_vdpa_request_irqs(pdsv)) + status = old_status | VIRTIO_CONFIG_S_FAILED; + } + pds_vdpa_cmd_set_status(pdsv, status); /* Note: still working with FW on the need for this reset cmd */ @@ -426,6 +480,9 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) i, &pdsv->vqs[i].notify_pa); } } + + if (old_status & ~status & VIRTIO_CONFIG_S_DRIVER_OK) + pds_vdpa_release_irqs(pdsv); } static void pds_vdpa_init_vqs_entry(struct pds_vdpa_device *pdsv, int qid, @@ -460,13 +517,17 @@ static int pds_vdpa_reset(struct vdpa_device *vdpa_dev) if (err) dev_err(dev, "%s: reset_vq failed qid %d: %pe\n", __func__, i, ERR_PTR(err)); - pds_vdpa_release_irq(pdsv, i); - pds_vdpa_init_vqs_entry(pdsv, i, pdsv->vqs[i].notify); } } pds_vdpa_set_status(vdpa_dev, 0); + if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + /* Reset the vq info */ + for (i = 0; i < pdsv->num_vqs && !err; i++) + pds_vdpa_init_vqs_entry(pdsv, i, pdsv->vqs[i].notify); + } + return 0; } @@ -764,7 +825,7 @@ int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux) max_vqs = min_t(u16, dev_intrs, max_vqs); mgmt->max_supported_vqs = min_t(u16, PDS_VDPA_MAX_QUEUES, max_vqs); - vdpa_aux->nintrs = mgmt->max_supported_vqs; + vdpa_aux->nintrs = 0; mgmt->ops = &pds_vdpa_mgmt_dev_ops; mgmt->id_table = pds_vdpa_id_table; @@ -778,14 +839,5 @@ int pds_vdpa_get_mgmt_info(struct pds_vdpa_aux *vdpa_aux) mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); mgmt->config_attr_mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); - err = pci_alloc_irq_vectors(pdev, vdpa_aux->nintrs, vdpa_aux->nintrs, - PCI_IRQ_MSIX); - if (err < 0) { - dev_err(dev, "Couldn't get %d msix vectors: %pe\n", - vdpa_aux->nintrs, ERR_PTR(err)); - return err; - } - vdpa_aux->nintrs = err; - return 0; } From 8efc365b20dc9a5b0c8fd0e8a195690bf21cd8be Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 10 Jul 2023 21:24:37 -0700 Subject: [PATCH 24/32] pds_vdpa: fix up debugfs feature bit printing Make clearer in debugfs output the difference between the hw feature bits, the features supported through the driver, and the features that have been negotiated. Signed-off-by: Shannon Nelson Message-Id: <20230711042437.69381-6-shannon.nelson@amd.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/vdpa/pds/debugfs.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c index 754ccb7a66661..9b04aad6ec35d 100644 --- a/drivers/vdpa/pds/debugfs.c +++ b/drivers/vdpa/pds/debugfs.c @@ -176,6 +176,7 @@ static int identity_show(struct seq_file *seq, void *v) { struct pds_vdpa_aux *vdpa_aux = seq->private; struct vdpa_mgmt_dev *mgmt; + u64 hw_features; seq_printf(seq, "aux_dev: %s\n", dev_name(&vdpa_aux->padev->aux_dev.dev)); @@ -183,8 +184,9 @@ static int identity_show(struct seq_file *seq, void *v) mgmt = &vdpa_aux->vdpa_mdev; seq_printf(seq, "max_vqs: %d\n", mgmt->max_supported_vqs); seq_printf(seq, "config_attr_mask: %#llx\n", mgmt->config_attr_mask); - seq_printf(seq, "supported_features: %#llx\n", mgmt->supported_features); - print_feature_bits_all(seq, mgmt->supported_features); + hw_features = le64_to_cpu(vdpa_aux->ident.hw_features); + seq_printf(seq, "hw_features: %#llx\n", hw_features); + print_feature_bits_all(seq, hw_features); return 0; } @@ -200,7 +202,6 @@ static int config_show(struct seq_file *seq, void *v) { struct pds_vdpa_device *pdsv = seq->private; struct virtio_net_config vc; - u64 driver_features; u8 status; memcpy_fromio(&vc, pdsv->vdpa_aux->vd_mdev.device, @@ -223,10 +224,8 @@ static int config_show(struct seq_file *seq, void *v) status = vp_modern_get_status(&pdsv->vdpa_aux->vd_mdev); seq_printf(seq, "dev_status: %#x\n", status); print_status_bits(seq, status); - - driver_features = vp_modern_get_driver_features(&pdsv->vdpa_aux->vd_mdev); - seq_printf(seq, "driver_features: %#llx\n", driver_features); - print_feature_bits_all(seq, driver_features); + seq_printf(seq, "negotiated_features: %#llx\n", pdsv->negotiated_features); + print_feature_bits_all(seq, pdsv->negotiated_features); seq_printf(seq, "vdpa_index: %d\n", pdsv->vdpa_index); seq_printf(seq, "num_vqs: %d\n", pdsv->num_vqs); From f504e15b94eb4e5b47f8715da59c0207f68dffe1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 13 Jul 2023 16:55:48 +0200 Subject: [PATCH 25/32] virtio-mem: remove unsafe unplug in Big Block Mode (BBM) When "unsafe unplug" is enabled, we don't fake-offline all memory ahead of actual memory offlining using alloc_contig_range(). Instead, we rely on offline_pages() to also perform actual page migration, which might fail or take a very long time. In that case, it's possible to easily run into endless loops that cannot be aborted anymore (as offlining is triggered by a workqueue then): For example, a single (accidentally) permanently unmovable page in ZONE_MOVABLE results in an endless loop. For ZONE_NORMAL, races between isolating the pageblock (and checking for unmovable pages) and concurrent page allocation are possible and similarly result in endless loops. The idea of the unsafe unplug mode was to make it possible to more reliably unplug large memory blocks. However, (a) we really should be tackling that differently, by extending the alloc_contig_range()-based mechanism; and (b) this mode is not the default and as far as I know, it's unused either way. So let's simply get rid of it. Signed-off-by: David Hildenbrand Message-Id: <20230713145551.2824980-2-david@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_mem.c | 51 +++++++++++++++---------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 835f6cc2fb664..ed15d2a4bd96b 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -38,11 +38,6 @@ module_param(bbm_block_size, ulong, 0444); MODULE_PARM_DESC(bbm_block_size, "Big Block size in bytes. Default is 0 (auto-detection)."); -static bool bbm_safe_unplug = true; -module_param(bbm_safe_unplug, bool, 0444); -MODULE_PARM_DESC(bbm_safe_unplug, - "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); - /* * virtio-mem currently supports the following modes of operation: * @@ -2111,38 +2106,32 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, VIRTIO_MEM_BBM_BB_ADDED)) return -EINVAL; - if (bbm_safe_unplug) { - /* - * Start by fake-offlining all memory. Once we marked the device - * block as fake-offline, all newly onlined memory will - * automatically be kept fake-offline. Protect from concurrent - * onlining/offlining until we have a consistent state. - */ - mutex_lock(&vm->hotplug_mutex); - virtio_mem_bbm_set_bb_state(vm, bb_id, - VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); + /* + * Start by fake-offlining all memory. Once we marked the device + * block as fake-offline, all newly onlined memory will + * automatically be kept fake-offline. Protect from concurrent + * onlining/offlining until we have a consistent state. + */ + mutex_lock(&vm->hotplug_mutex); + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - page = pfn_to_online_page(pfn); - if (!page) - continue; + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; - rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); - if (rc) { - end_pfn = pfn; - goto rollback_safe_unplug; - } + rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); + if (rc) { + end_pfn = pfn; + goto rollback; } - mutex_unlock(&vm->hotplug_mutex); } + mutex_unlock(&vm->hotplug_mutex); rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); if (rc) { - if (bbm_safe_unplug) { - mutex_lock(&vm->hotplug_mutex); - goto rollback_safe_unplug; - } - return rc; + mutex_lock(&vm->hotplug_mutex); + goto rollback; } rc = virtio_mem_bbm_unplug_bb(vm, bb_id); @@ -2154,7 +2143,7 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, VIRTIO_MEM_BBM_BB_UNUSED); return rc; -rollback_safe_unplug: +rollback: for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { page = pfn_to_online_page(pfn); if (!page) From ddf409851461f515cc32974714b73efe2e012bde Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 13 Jul 2023 16:55:49 +0200 Subject: [PATCH 26/32] virtio-mem: convert most offline_and_remove_memory() errors to -EBUSY Just like we do with alloc_contig_range(), let's convert all unknown errors to -EBUSY, but WARN so we can look into the issue. For example, offline_pages() could fail with -EINTR, which would be unexpected in our case. Signed-off-by: David Hildenbrand Message-Id: <20230713145551.2824980-3-david@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_mem.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index ed15d2a4bd96b..1a76ba2bc118c 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -741,11 +741,15 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, * immediately instead of waiting. */ virtio_mem_retry(vm); - } else { - dev_dbg(&vm->vdev->dev, - "offlining and removing memory failed: %d\n", rc); + return 0; } - return rc; + dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc); + /* + * We don't really expect this to fail, because we fake-offlined all + * memory already. But it could fail in corner cases. + */ + WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY); + return rc == -ENOMEM ? -ENOMEM : -EBUSY; } /* From a31648fd4f96fbe0a4d0aeb16b57a2405c6943c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 13 Jul 2023 16:55:50 +0200 Subject: [PATCH 27/32] virtio-mem: keep retrying on offline_and_remove_memory() errors in Sub Block Mode (SBM) In case offline_and_remove_memory() fails in SBM, we leave a completely unplugged Linux memory block stick around until we try plugging memory again. We won't try removing that memory block again. offline_and_remove_memory() may, for example, fail if we're racing with another alloc_contig_range() user, if allocating temporary memory fails, or if some memory notifier rejected the offlining request. Let's handle that case better, by simple retrying to offline and remove such memory. Tested using CONFIG_MEMORY_NOTIFIER_ERROR_INJECT. Signed-off-by: David Hildenbrand Message-Id: <20230713145551.2824980-4-david@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 92 +++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 19 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 1a76ba2bc118c..a5cf92e3e5af2 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -168,6 +168,13 @@ struct virtio_mem { /* The number of subblocks per Linux memory block. */ uint32_t sbs_per_mb; + /* + * Some of the Linux memory blocks tracked as "partially + * plugged" are completely unplugged and can be offlined + * and removed -- which previously failed. + */ + bool have_unplugged_mb; + /* Summary of all memory block states. */ unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; @@ -765,6 +772,34 @@ static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, return virtio_mem_offline_and_remove_memory(vm, addr, size); } +/* + * Try (offlining and) removing memory from Linux in case all subblocks are + * unplugged. Can be called on online and offline memory blocks. + * + * May modify the state of memory blocks in virtio-mem. + */ +static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm, + unsigned long mb_id) +{ + int rc; + + /* + * Once all subblocks of a memory block were unplugged, offline and + * remove it. + */ + if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + return 0; + + /* offline_and_remove_memory() works for online and offline memory. 
*/ + mutex_unlock(&vm->hotplug_mutex); + rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); + mutex_lock(&vm->hotplug_mutex); + if (!rc) + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); + return rc; +} + /* * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a * all Linux memory blocks covered by the big block. @@ -1988,20 +2023,10 @@ static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, } unplugged: - /* - * Once all subblocks of a memory block were unplugged, offline and - * remove it. This will usually not fail, as no memory is in use - * anymore - however some other notifiers might NACK the request. - */ - if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { - mutex_unlock(&vm->hotplug_mutex); - rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); - mutex_lock(&vm->hotplug_mutex); - if (!rc) - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_UNUSED); - } - + rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id); + if (rc) + vm->sbm.have_unplugged_mb = 1; + /* Ignore errors, this is not critical. We'll retry later. */ return 0; } @@ -2253,12 +2278,13 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) /* * Try to unplug all blocks that couldn't be unplugged before, for example, - * because the hypervisor was busy. + * because the hypervisor was busy. Further, offline and remove any memory + * blocks where we previously failed. */ -static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) +static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) { unsigned long id; - int rc; + int rc = 0; if (!vm->in_sbm) { virtio_mem_bbm_for_each_bb(vm, id, @@ -2280,6 +2306,27 @@ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) VIRTIO_MEM_SBM_MB_UNUSED); } + if (!vm->sbm.have_unplugged_mb) + return 0; + + /* + * Let's retry (offlining and) removing completely unplugged Linux + * memory blocks. + */ + vm->sbm.have_unplugged_mb = false; + + mutex_lock(&vm->hotplug_mutex); + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL) + rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL) + rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) + rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); + mutex_unlock(&vm->hotplug_mutex); + + if (rc) + vm->sbm.have_unplugged_mb = true; + /* Ignore errors, this is not critical. We'll retry later. */ return 0; } @@ -2361,9 +2408,9 @@ static void virtio_mem_run_wq(struct work_struct *work) virtio_mem_refresh_config(vm); } - /* Unplug any leftovers from previous runs */ + /* Cleanup any leftovers from previous runs */ if (!rc) - rc = virtio_mem_unplug_pending_mb(vm); + rc = virtio_mem_cleanup_pending_mb(vm); if (!rc && vm->requested_size != vm->plugged_size) { if (vm->requested_size > vm->plugged_size) { @@ -2375,6 +2422,13 @@ static void virtio_mem_run_wq(struct work_struct *work) } } + /* + * Keep retrying to offline and remove completely unplugged Linux + * memory blocks. 
+ */ + if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb) + rc = -EBUSY; + switch (rc) { case 0: vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; From f55484fd7be923b740e8e1fc304070ba53675cb4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 13 Jul 2023 16:55:51 +0200 Subject: [PATCH 28/32] virtio-mem: check if the config changed before fake offlining memory If we repeatedly fail to fake offline memory to unplug it, we won't be sending any unplug requests to the device. However, we only check if the config changed when sending such (un)plug requests. We could end up trying for a long time to unplug memory, even though the config changed already and we're not supposed to unplug memory anymore. For example, the hypervisor might detect a low-memory situation while unplugging memory and decide to replug some memory. Continuing trying to unplug memory in that case can be problematic. So let's check on a more regular basis. Signed-off-by: David Hildenbrand Message-Id: <20230713145551.2824980-5-david@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index a5cf92e3e5af2..fa5226c198cc6 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1189,7 +1189,8 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) * Try to allocate a range, marking pages fake-offline, effectively * fake-offlining them. */ -static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) +static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn, + unsigned long nr_pages) { const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); int rc, retry_count; @@ -1202,6 +1203,14 @@ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) * some guarantees. */ for (retry_count = 0; retry_count < 5; retry_count++) { + /* + * If the config changed, stop immediately and go back to the + * main loop: avoid trying to keep unplugging if the device + * might have decided to not remove any more memory. + */ + if (atomic_read(&vm->config_changed)) + return -EAGAIN; + rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, GFP_KERNEL); if (rc == -ENOMEM) @@ -1951,7 +1960,7 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->sbm.sb_size); - rc = virtio_mem_fake_offline(start_pfn, nr_pages); + rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages); if (rc) return rc; @@ -2149,7 +2158,7 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, if (!page) continue; - rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); + rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION); if (rc) { end_pfn = pfn; goto rollback; From e8f5f849ffce24490eb9449e98312b66c0dba76f Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 10 Aug 2023 15:34:21 -0500 Subject: [PATCH 29/32] cifs: fix potential oops in cifs_oplock_break With deferred close we can have closes that race with lease breaks, and so with the current checks for whether to send the lease response, oplock_response(), this can mean that an unmount (kill_sb) can occur just before we were checking if the tcon->ses is valid. 
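In outline, the race window looks like this (a simplified sketch of the
old check ordering, not the exact code):

  /*
   * oplock worker                          umount path
   * ------------------------------------   ------------------------
   * cifs_oplock_break()
   *   tcon = tlink_tcon(cfile->tlink);
   *                                        kill_sb()
   *                                          tree disconnect,
   *                                          ses torn down
   *   if (tcon->ses && tcon->ses->server)  <- reads freed memory
   *           ...oplock_response(...);
   */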
See below: [Fri Aug 4 04:12:50 2023] RIP: 0010:cifs_oplock_break+0x1f7/0x5b0 [cifs] [Fri Aug 4 04:12:50 2023] Code: 7d a8 48 8b 7d c0 c0 e9 02 48 89 45 b8 41 89 cf e8 3e f5 ff ff 4c 89 f7 41 83 e7 01 e8 82 b3 03 f2 49 8b 45 50 48 85 c0 74 5e <48> 83 78 60 00 74 57 45 84 ff 75 52 48 8b 43 98 48 83 eb 68 48 39 [Fri Aug 4 04:12:50 2023] RSP: 0018:ffffb30607ddbdf8 EFLAGS: 00010206 [Fri Aug 4 04:12:50 2023] RAX: 632d223d32612022 RBX: ffff97136944b1e0 RCX: 0000000080100009 [Fri Aug 4 04:12:50 2023] RDX: 0000000000000001 RSI: 0000000080100009 RDI: ffff97136944b188 [Fri Aug 4 04:12:50 2023] RBP: ffffb30607ddbe58 R08: 0000000000000001 R09: ffffffffc08e0900 [Fri Aug 4 04:12:50 2023] R10: 0000000000000001 R11: 000000000000000f R12: ffff97136944b138 [Fri Aug 4 04:12:50 2023] R13: ffff97149147c000 R14: ffff97136944b188 R15: 0000000000000000 [Fri Aug 4 04:12:50 2023] FS: 0000000000000000(0000) GS:ffff9714f7c00000(0000) knlGS:0000000000000000 [Fri Aug 4 04:12:50 2023] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [Fri Aug 4 04:12:50 2023] CR2: 00007fd8de9c7590 CR3: 000000011228e000 CR4: 0000000000350ef0 [Fri Aug 4 04:12:50 2023] Call Trace: [Fri Aug 4 04:12:50 2023] [Fri Aug 4 04:12:50 2023] process_one_work+0x225/0x3d0 [Fri Aug 4 04:12:50 2023] worker_thread+0x4d/0x3e0 [Fri Aug 4 04:12:50 2023] ? process_one_work+0x3d0/0x3d0 [Fri Aug 4 04:12:50 2023] kthread+0x12a/0x150 [Fri Aug 4 04:12:50 2023] ? set_kthread_struct+0x50/0x50 [Fri Aug 4 04:12:50 2023] ret_from_fork+0x22/0x30 [Fri Aug 4 04:12:50 2023] To fix this change the ordering of the checks before sending the oplock_response to first check if the openFileList is empty. Fixes: da787d5b7498 ("SMB3: Do not send lease break acknowledgment if all file handles have been closed") Suggested-by: Bharath SM Reviewed-by: Bharath SM Reviewed-by: Shyam Prasad N Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/file.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index fc5acc95cd13f..60a49caf8425a 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -4878,9 +4878,11 @@ void cifs_oplock_break(struct work_struct *work) struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); struct inode *inode = d_inode(cfile->dentry); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cinode = CIFS_I(inode); - struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; + struct tcon_link *tlink; int rc = 0; bool purge_cache = false, oplock_break_cancelled; __u64 persistent_fid, volatile_fid; @@ -4889,6 +4891,12 @@ void cifs_oplock_break(struct work_struct *work) wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + goto out; + tcon = tlink_tcon(tlink); + server = tcon->ses->server; + server->ops->downgrade_oplock(server, cinode, cfile->oplock_level, cfile->oplock_epoch, &purge_cache); @@ -4938,18 +4946,19 @@ void cifs_oplock_break(struct work_struct *work) /* * MS-SMB2 3.2.5.19.1 and 3.2.5.19.2 (and MS-CIFS 3.2.5.42) do not require * an acknowledgment to be sent when the file has already been closed. - * check for server null, since can race with kill_sb calling tree disconnect. 
*/ spin_lock(&cinode->open_file_lock); - if (tcon->ses && tcon->ses->server && !oplock_break_cancelled && - !list_empty(&cinode->openFileList)) { + /* check list empty since can race with kill_sb calling tree disconnect */ + if (!oplock_break_cancelled && !list_empty(&cinode->openFileList)) { spin_unlock(&cinode->open_file_lock); - rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, - volatile_fid, net_fid, cinode); + rc = server->ops->oplock_response(tcon, persistent_fid, + volatile_fid, net_fid, cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } else spin_unlock(&cinode->open_file_lock); + cifs_put_tlink(tlink); +out: cifs_done_oplock_break(cinode); } From 7a894c87374771f3cfb1b8e5453fbe03f1fb8135 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 13 Aug 2023 22:11:19 +0200 Subject: [PATCH 30/32] parisc: Fix CONFIG_TLB_PTLOCK to work with lightweight spinlock checks For the TLB_PTLOCK checks we used an optimization to store the spc register into the spinlock to unlock it. This optimization works as long as the lightweight spinlock checks (CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK) aren't enabled, because they really check if the lock word is zero or __ARCH_SPIN_LOCK_UNLOCKED_VAL and abort with a kernel crash ("Spinlock was trashed") otherwise. Drop that optimization to make it possible to activate both checks at the same time. Noticed-by: Sam James Signed-off-by: Helge Deller Tested-by: Sam James Cc: stable@vger.kernel.org # v6.4+ Fixes: 15e64ef6520e ("parisc: Add lightweight spinlock checks") --- arch/parisc/kernel/entry.S | 47 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 0e5ebfe8d9d29..ae03b8679696e 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -406,7 +407,7 @@ LDREG 0(\ptp),\pte bb,<,n \pte,_PAGE_PRESENT_BIT,3f b \fault - stw \spc,0(\tmp) + stw \tmp1,0(\tmp) 99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif 2: LDREG 0(\ptp),\pte @@ -415,24 +416,22 @@ .endm /* Release page_table_lock without reloading lock address. - Note that the values in the register spc are limited to - NR_SPACE_IDS (262144). Thus, the stw instruction always - stores a nonzero value even when register spc is 64 bits. We use an ordered store to ensure all prior accesses are performed prior to releasing the lock. */ - .macro ptl_unlock0 spc,tmp + .macro ptl_unlock0 spc,tmp,tmp2 #ifdef CONFIG_TLB_PTLOCK -98: or,COND(=) %r0,\spc,%r0 - stw,ma \spc,0(\tmp) +98: ldi __ARCH_SPIN_LOCK_UNLOCKED_VAL, \tmp2 + or,COND(=) %r0,\spc,%r0 + stw,ma \tmp2,0(\tmp) 99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif .endm /* Release page_table_lock. 
*/ - .macro ptl_unlock1 spc,tmp + .macro ptl_unlock1 spc,tmp,tmp2 #ifdef CONFIG_TLB_PTLOCK 98: get_ptl \tmp - ptl_unlock0 \spc,\tmp + ptl_unlock0 \spc,\tmp,\tmp2 99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif .endm @@ -1125,7 +1124,7 @@ dtlb_miss_20w: idtlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1151,7 +1150,7 @@ nadtlb_miss_20w: idtlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1185,7 +1184,7 @@ dtlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1218,7 +1217,7 @@ nadtlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1247,7 +1246,7 @@ dtlb_miss_20: idtlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1275,7 +1274,7 @@ nadtlb_miss_20: idtlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1320,7 +1319,7 @@ itlb_miss_20w: iitlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1344,7 +1343,7 @@ naitlb_miss_20w: iitlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1378,7 +1377,7 @@ itlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1402,7 +1401,7 @@ naitlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1432,7 +1431,7 @@ itlb_miss_20: iitlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1452,7 +1451,7 @@ naitlb_miss_20: iitlbt pte,prot - ptl_unlock1 spc,t0 + ptl_unlock1 spc,t0,t1 rfir nop @@ -1482,7 +1481,7 @@ dbit_trap_20w: idtlbt pte,prot - ptl_unlock0 spc,t0 + ptl_unlock0 spc,t0,t1 rfir nop #else @@ -1508,7 +1507,7 @@ dbit_trap_11: mtsp t1, %sr1 /* Restore sr1 */ - ptl_unlock0 spc,t0 + ptl_unlock0 spc,t0,t1 rfir nop @@ -1528,7 +1527,7 @@ dbit_trap_20: idtlbt pte,prot - ptl_unlock0 spc,t0 + ptl_unlock0 spc,t0,t1 rfir nop #endif From 69513dd669e243928f7450893190915a88f84a2b Mon Sep 17 00:00:00 2001 From: Russell Harmon via samba-technical Date: Thu, 10 Aug 2023 00:19:22 -0700 Subject: [PATCH 31/32] cifs: Release folio lock on fscache read hit. Under the current code, when cifs_readpage_worker is called, the call contract is that the callee should unlock the page. This is documented in the read_folio section of Documentation/filesystems/vfs.rst as: > The filesystem should unlock the folio once the read has completed, > whether it was successful or not. Without this change, when fscache is in use and cache hit occurs during a read, the page lock is leaked, producing the following stack on subsequent reads (via mmap) to the page: $ cat /proc/3890/task/12864/stack [<0>] folio_wait_bit_common+0x124/0x350 [<0>] filemap_read_folio+0xad/0xf0 [<0>] filemap_fault+0x8b1/0xab0 [<0>] __do_fault+0x39/0x150 [<0>] do_fault+0x25c/0x3e0 [<0>] __handle_mm_fault+0x6ca/0xc70 [<0>] handle_mm_fault+0xe9/0x350 [<0>] do_user_addr_fault+0x225/0x6c0 [<0>] exc_page_fault+0x84/0x1b0 [<0>] asm_exc_page_fault+0x27/0x30 This requires a reboot to resolve; it is a deadlock. Note however that the call to cifs_readpage_from_fscache does mark the page clean, but does not free the folio lock. This happens in __cifs_readpage_from_fscache on success. Releasing the lock at that point however is not appropriate as cifs_readahead also calls cifs_readpage_from_fscache and *does* unconditionally release the lock after its return. This change therefore effectively makes cifs_readpage_worker work like cifs_readahead. 
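The shape of the fix, as a minimal sketch (only the two labels from the
diff below; all other code elided):

  /* before: a path that jumps straight to read_complete (the fscache
   * read hit) returned with the folio still locked */
  io_error:
          kunmap(page);
          unlock_page(page);
  read_complete:
          return rc;

  /* after: every path falls through read_complete, so the lock is
   * always released, as the read_folio contract requires */
  io_error:
          kunmap(page);
  read_complete:
          unlock_page(page);
          return rc;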
Signed-off-by: Russell Harmon Acked-by: Paulo Alcantara (SUSE) Reviewed-by: David Howells Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 60a49caf8425a..6bc44f79d2e90 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -4681,9 +4681,9 @@ static int cifs_readpage_worker(struct file *file, struct page *page, io_error: kunmap(page); - unlock_page(page); read_complete: + unlock_page(page); return rc; } From 7b38f6ddc97bf572c3422d3175e8678dd95502fa Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 10 Aug 2023 21:41:03 -0500 Subject: [PATCH 32/32] smb3: display network namespace in debug information We recently had problems where a network namespace was deleted causing hard to debug reconnect problems. To help deal with configuration issues like this it is useful to dump the network namespace to better debug what happened. So add this to information displayed in /proc/fs/cifs/DebugData for the server (and channels if mounted with multichannel). For example: Local Users To Server: 1 SecMode: 0x1 Req On Wire: 0 Net namespace: 4026531840 This can be easily compared with what is displayed for the processes on the system. For example /proc/1/ns/net in this case showed the same thing (see below), and we can see that the namespace is still valid in this example. 'net:[4026531840]' Cc: stable@vger.kernel.org Acked-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index fb4162a52844a..aec6e91374742 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -153,6 +153,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) in_flight(server), atomic_read(&server->in_send), atomic_read(&server->num_waiters)); +#ifdef CONFIG_NET_NS + if (server->net) + seq_printf(m, " Net namespace: %u ", server->net->ns.inum); +#endif /* NET_NS */ + } static inline const char *smb_speed_to_str(size_t bps) @@ -430,10 +435,15 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) server->reconnect_instance, server->srv_count, server->sec_mode, in_flight(server)); +#ifdef CONFIG_NET_NS + if (server->net) + seq_printf(m, " Net namespace: %u ", server->net->ns.inum); +#endif /* NET_NS */ seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", atomic_read(&server->in_send), atomic_read(&server->num_waiters)); + if (server->leaf_fullpath) { seq_printf(m, "\nDFS leaf full path: %s", server->leaf_fullpath);