[PATCH 09/14] staging: media: tegra-vde: Add IOMMU support
Dmitry Osipenko
digetx at gmail.com
Sat Aug 18 12:50:02 UTC 2018
On Monday, 13 August 2018 17:50:22 MSK Thierry Reding wrote:
> From: Thierry Reding <treding at nvidia.com>
>
> Implement support for using an IOMMU to map physically discontiguous
> buffers into contiguous I/O virtual mappings that the VDE can use. This
> allows importing arbitrary DMA-BUFs for use by the VDE.
>
> While at it, make sure that the device is detached from any DMA/IOMMU
> mapping that it might have automatically been attached to at boot. If
> using the IOMMU API explicitly, detaching from any existing mapping is
> required to avoid double mapping of buffers.
>
> Signed-off-by: Thierry Reding <treding at nvidia.com>
> ---
> drivers/staging/media/tegra-vde/tegra-vde.c | 171 +++++++++++++++++---
> 1 file changed, 153 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/staging/media/tegra-vde/tegra-vde.c
> b/drivers/staging/media/tegra-vde/tegra-vde.c index
> 2496a03fd158..3bc0bfcfe34e 100644
> --- a/drivers/staging/media/tegra-vde/tegra-vde.c
> +++ b/drivers/staging/media/tegra-vde/tegra-vde.c
> @@ -13,7 +13,9 @@
> #include <linux/dma-buf.h>
> #include <linux/genalloc.h>
> #include <linux/interrupt.h>
> +#include <linux/iommu.h>
> #include <linux/iopoll.h>
> +#include <linux/iova.h>
> #include <linux/miscdevice.h>
> #include <linux/module.h>
> #include <linux/of_device.h>
> @@ -22,6 +24,10 @@
> #include <linux/slab.h>
> #include <linux/uaccess.h>
>
> +#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)
> +#include <asm/dma-iommu.h>
> +#endif
> +
> #include <soc/tegra/pmc.h>
>
> #include <drm/drm_fourcc.h>
> @@ -61,6 +67,11 @@ struct video_frame {
> u32 frame_num;
> u32 flags;
> u64 modifier;
> +
> + struct iova *y_iova;
> + struct iova *cb_iova;
> + struct iova *cr_iova;
> + struct iova *aux_iova;
> };
>
> struct tegra_vde_soc {
> @@ -93,6 +104,12 @@ struct tegra_vde {
> struct clk *clk_bsev;
> dma_addr_t iram_lists_addr;
> u32 *iram;
> +
> + struct iommu_domain *domain;
> + struct iommu_group *group;
> + struct iova_domain iova;
> + unsigned long limit;
> + unsigned int shift;
> };
>
> static void tegra_vde_set_bits(struct tegra_vde *vde,
> @@ -634,12 +651,22 @@ static void tegra_vde_decode_frame(struct tegra_vde
> *vde, VDE_WR(0x20000000 | (macroblocks_nb - 1), vde->sxe + 0x00);
> }
>
> -static void tegra_vde_detach_and_put_dmabuf(struct dma_buf_attachment *a,
> +static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde,
> + struct dma_buf_attachment *a,
> struct sg_table *sgt,
> + struct iova *iova,
> enum dma_data_direction dma_dir)
> {
> struct dma_buf *dmabuf = a->dmabuf;
>
> + if (vde->domain) {
> + unsigned long size = iova_size(iova) << vde->shift;
Let's make it "size = iova_align(&vde->iova, dmabuf->size)" for better
readability.
> + dma_addr_t addr = iova_dma_addr(&vde->iova, iova);
> +
> + iommu_unmap(vde->domain, addr, size);
> + __free_iova(&vde->iova, iova);
> + }
> +
> dma_buf_unmap_attachment(a, sgt, dma_dir);
> dma_buf_detach(dmabuf, a);
> dma_buf_put(dmabuf);
> @@ -651,14 +678,16 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde
> *vde, size_t min_size,
> size_t align_size,
> struct dma_buf_attachment **a,
> - dma_addr_t *addr,
> + dma_addr_t *addrp,
> struct sg_table **s,
> - size_t *size,
> + struct iova **iovap,
> + size_t *sizep,
> enum dma_data_direction dma_dir)
> {
> struct dma_buf_attachment *attachment;
> struct dma_buf *dmabuf;
> struct sg_table *sgt;
> + size_t size;
> int err;
>
> dmabuf = dma_buf_get(fd);
> @@ -695,18 +724,47 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde
> *vde, goto err_detach;
> }
>
> - if (sgt->nents != 1) {
> + if (sgt->nents > 1 && !vde->domain) {
> dev_err(vde->dev, "Sparse DMA region is unsupported\n");
> err = -EINVAL;
> goto err_unmap;
> }
>
> - *addr = sg_dma_address(sgt->sgl) + offset;
> + if (vde->domain) {
> + int prot = IOMMU_READ | IOMMU_WRITE;
> + struct iova *iova;
> + dma_addr_t addr;
> +
> + size = (dmabuf->size - offset) >> vde->shift;
Offset shall not be subtracted and dmabuf size shall be rounded to IOVA
granule. Also, let's not carry shift within the vde structure as it doesn't
really worth it.
shift = iova_shift(&vde->iova);
size = iova_align(&vde->iova, dmabuf->size) >> shift;
> +
> + iova = alloc_iova(&vde->iova, size, vde->limit - 1, true);
> + if (!iova) {
> + err = -ENOMEM;
> + goto err_unmap;
> + }
> +
> + addr = iova_dma_addr(&vde->iova, iova);
> +
> + size = iommu_map_sg(vde->domain, addr, sgt->sgl, sgt->nents,
> + prot);
> + if (!size) {
> + __free_iova(&vde->iova, iova);
> + err = -ENXIO;
> + goto err_unmap;
> + }
> +
> + *addrp = addr;
Returned address shall point at the beginning of the buffer + offset.
*addrp = addr + offset;
Returned size shall represent the leftover size after the offset subtraction,
like for example it is used for the bitstream_data address counter limit.
size = dmabuf->size - offset;
> + *iovap = iova;
> + } else {
> + *addrp = sg_dma_address(sgt->sgl) + offset;
> + size = dmabuf->size - offset;
> + }
> +
> *a = attachment;
> *s = sgt;
>
> - if (size)
> - *size = dmabuf->size - offset;
> + if (sizep)
> + *sizep = size;
>
> return 0;
>
> @@ -734,6 +792,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct
> tegra_vde *vde, &frame->y_dmabuf_attachment,
> &frame->y_addr,
> &frame->y_sgt,
> + &frame->y_iova,
> NULL, dma_dir);
> if (err)
> return err;
> @@ -743,6 +802,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct
> tegra_vde *vde, &frame->cb_dmabuf_attachment,
> &frame->cb_addr,
> &frame->cb_sgt,
> + &frame->cb_iova,
> NULL, dma_dir);
> if (err)
> goto err_release_y;
> @@ -752,6 +812,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct
> tegra_vde *vde, &frame->cr_dmabuf_attachment,
> &frame->cr_addr,
> &frame->cr_sgt,
> + &frame->cr_iova,
> NULL, dma_dir);
> if (err)
> goto err_release_cb;
> @@ -766,6 +827,7 @@ static int tegra_vde_attach_dmabufs_to_frame(struct
> tegra_vde *vde, &frame->aux_dmabuf_attachment,
> &frame->aux_addr,
> &frame->aux_sgt,
> + &frame->aux_iova,
> NULL, dma_dir);
> if (err)
> goto err_release_cr;
> @@ -774,13 +836,16 @@ static int tegra_vde_attach_dmabufs_to_frame(struct
> tegra_vde *vde,
>
> err_release_cr:
> tegra_vde_detach_and_put_dmabuf(vde, frame->cr_dmabuf_attachment,
> - frame->cr_sgt, dma_dir);
> + frame->cr_sgt, frame->cr_iova,
> + dma_dir);
> err_release_cb:
> tegra_vde_detach_and_put_dmabuf(vde, frame->cb_dmabuf_attachment,
> - frame->cb_sgt, dma_dir);
> + frame->cb_sgt, frame->cb_iova,
> + dma_dir);
> err_release_y:
> tegra_vde_detach_and_put_dmabuf(vde, frame->y_dmabuf_attachment,
> - frame->y_sgt, dma_dir);
> + frame->y_sgt, frame->y_iova,
> + dma_dir);
>
> return err;
> }
> @@ -793,16 +858,20 @@ static void tegra_vde_release_frame_dmabufs(struct
> tegra_vde *vde, if (!baseline_profile)
> tegra_vde_detach_and_put_dmabuf(vde,
> frame->aux_dmabuf_attachment,
> - frame->aux_sgt, dma_dir);
> + frame->aux_sgt,
> + frame->aux_iova, dma_dir);
>
> tegra_vde_detach_and_put_dmabuf(vde, frame->cr_dmabuf_attachment,
> - frame->cr_sgt, dma_dir);
> + frame->cr_sgt, frame->cr_iova,
> + dma_dir);
>
> tegra_vde_detach_and_put_dmabuf(vde, frame->cb_dmabuf_attachment,
> - frame->cb_sgt, dma_dir);
> + frame->cb_sgt, frame->cb_iova,
> + dma_dir);
>
> tegra_vde_detach_and_put_dmabuf(vde, frame->y_dmabuf_attachment,
> - frame->y_sgt, dma_dir);
> + frame->y_sgt, frame->y_iova,
> + dma_dir);
> }
>
> static int tegra_vde_validate_frame(struct device *dev,
> @@ -923,6 +992,8 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde
> *vde, struct sg_table *bitstream_sgt, *secure_sgt;
> enum dma_data_direction dma_dir;
> dma_addr_t bitstream_data_addr;
> + struct iova *bitstream_iova;
> + struct iova *secure_iova;
> dma_addr_t secure_addr;
> dma_addr_t bsev_ptr;
> size_t lsize, csize;
> @@ -948,6 +1019,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde
> *vde, &bitstream_data_dmabuf_attachment,
> &bitstream_data_addr,
> &bitstream_sgt,
> + &bitstream_iova,
> &bitstream_data_size,
> DMA_TO_DEVICE);
> if (ret)
> @@ -959,6 +1031,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde
> *vde, &secure_attachment,
> &secure_addr,
> &secure_sgt,
> + &secure_iova,
> &secure_size,
> DMA_TO_DEVICE);
> if (ret)
> @@ -1095,12 +1168,13 @@ static int tegra_vde_ioctl_decode_h264(struct
> tegra_vde *vde, release_bitstream_dmabuf:
> if (secure_attachment)
> tegra_vde_detach_and_put_dmabuf(vde, secure_attachment,
> - secure_sgt,
> + secure_sgt, secure_iova,
> DMA_TO_DEVICE);
>
> tegra_vde_detach_and_put_dmabuf(vde,
> bitstream_data_dmabuf_attachment,
> - bitstream_sgt, DMA_TO_DEVICE);
> + bitstream_sgt, bitstream_iova,
> + DMA_TO_DEVICE);
>
> return ret;
> }
> @@ -1193,6 +1267,15 @@ static int tegra_vde_probe(struct platform_device
> *pdev) struct tegra_vde *vde;
> int irq, err;
>
> +#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)
> + if (dev->archdata.mapping) {
> + struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
> +
> + arm_iommu_detach_device(dev);
> + arm_iommu_release_mapping(mapping);
> + }
> +#endif
> +
> vde = devm_kzalloc(dev, sizeof(*vde), GFP_KERNEL);
> if (!vde)
> return -ENOMEM;
> @@ -1335,6 +1418,37 @@ static int tegra_vde_probe(struct platform_device
> *pdev) return -ENOMEM;
> }
>
> + vde->group = iommu_group_get(dev);
> + if (vde->group) {
> + unsigned long order;
> +
> + vde->domain = iommu_domain_alloc(&platform_bus_type);
> + if (!vde->domain) {
> + iommu_group_put(vde->group);
> + vde->group = NULL;
> + } else {
> + err = iova_cache_get();
> + if (err < 0)
> + goto free_domain;
> +
> + order = __ffs(vde->domain->pgsize_bitmap);
> +
> + init_iova_domain(&vde->iova, 1UL << order, 0);
The minimum address of IOVA allocations shall be determined by the domains
aperture start address.
> + vde->shift = iova_shift(&vde->iova);
> + vde->limit = 1 << (32 - vde->shift);
IOVA limit shall be determined by the domains aperture size.
struct iommu_domain_geometry *geometry;
order = __ffs(vde->domain->pgsize_bitmap);
geometry = &vde->domain->geometry;
init_iova_domain(&vde->iova, 1UL << order,
geometry->aperture_start >> order);
vde->iova_end = geometry->aperture_end;
Hence let's replace 'limit' with 'iova_end'.
> +
> + /*
> + * VDE doesn't seem to like accessing the last page of
> + * its 32-bit address space.
> + */
> + vde->limit -= 1;
That's probably because some VDE HW address counter is getting wrapped around
and it can't cope with that due to a HW bug / optimization.
Since this only affects the end of AS, let's check if that adjustment is
needed:
if (vde->iova_end == 0xffffffff)
vde->iova_end -= 1UL << order;
> +
> + err = iommu_attach_group(vde->domain, vde->group);
> + if (err < 0)
goto put_iova;
> + goto put_cache;
> + }
> + }
> +
> mutex_init(&vde->lock);
> init_completion(&vde->decode_completion);
>
> @@ -1346,7 +1460,7 @@ static int tegra_vde_probe(struct platform_device
> *pdev) err = misc_register(&vde->miscdev);
> if (err) {
> dev_err(dev, "Failed to register misc device: %d\n", err);
> - goto err_gen_free;
> + goto detach;
> }
>
> pm_runtime_enable(dev);
> @@ -1364,7 +1478,21 @@ static int tegra_vde_probe(struct platform_device
> *pdev) err_misc_unreg:
> misc_deregister(&vde->miscdev);
>
> -err_gen_free:
put_iova:
put_iova_domain(&vde->iova);
> +detach:
> + if (vde->domain)
> + iommu_detach_group(vde->domain, vde->group);
> +
> +put_cache:
> + if (vde->domain)
> + iova_cache_put();
> +
> +free_domain:
> + if (vde->domain)
> + iommu_domain_free(vde->domain);
> +
> + if (vde->group)
> + iommu_group_put(vde->group);
> +
> gen_pool_free(vde->iram_pool, (unsigned long)vde->iram,
> gen_pool_size(vde->iram_pool));
>
> @@ -1388,6 +1516,13 @@ static int tegra_vde_remove(struct platform_device
> *pdev)
>
> misc_deregister(&vde->miscdev);
>
> + if (vde->domain) {
> + iommu_detach_group(vde->domain, vde->group);
put_iova_domain(&vde->iova);
> + iova_cache_put();
> + iommu_domain_free(vde->domain);
> + iommu_group_put(vde->group);
> + }
> +
> gen_pool_free(vde->iram_pool, (unsigned long)vde->iram,
> gen_pool_size(vde->iram_pool));
More information about the devel
mailing list