Showing error 573

User: Jiri Slaby
Error type: Double Lock
Error type description: A lock is unintentionally acquired a second time along a code path that already holds it, with no intervening unlock (see the sketch following this header)
File location: drivers/pci/intel-iommu.c
Line in file: 1300
Project: Linux Kernel
Project version: 2.6.28
Confirmation: Fixed by 00dfff77e7184140dc45724c7232e99302f6bf97
Tools: Stanse (1.2)
Entered: 2011-11-07 22:19:59 UTC
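
The listing under "Source:" suggests the flagged path: get_domain_for_dev() takes device_domain_lock at listing line 1407 and, on the branch where an existing domain is found, calls domain_exit() at line 1416 while still holding the lock; domain_exit() then calls domain_remove_dev_info() (line 1134), which acquires device_domain_lock a second time at the reported line 1300. The kernel-style sketch below shows the general shape of such a double lock on a spinlock; demo_lock, outer_path() and inner_helper() are hypothetical stand-ins for the code in the listing, not the change made by the fixing commit named above.

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);

    /* stands in for domain_remove_dev_info(): takes the lock itself */
    static void inner_helper(void)
    {
            unsigned long flags;

            /* second acquisition: on SMP this spins forever on the lock
             * already held by the caller; CONFIG_DEBUG_SPINLOCK reports
             * it as a recursive lock */
            spin_lock_irqsave(&demo_lock, flags);
            /* ... tear down per-device state ... */
            spin_unlock_irqrestore(&demo_lock, flags);
    }

    /* stands in for get_domain_for_dev(): calls the helper with the
     * lock still held */
    static void outer_path(void)
    {
            unsigned long flags;

            spin_lock_irqsave(&demo_lock, flags);
            /* ... scan the device list ... */
            inner_helper();         /* double lock */
            spin_unlock_irqrestore(&demo_lock, flags);
    }

A typical way to resolve this shape is to drop the lock before calling the helper, or to provide a variant of the helper for callers that already hold the lock; the report does not say which approach the fixing commit took.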


Source:

   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/sysdev.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/intel-iommu.h>
  39#include <asm/cacheflush.h>
  40#include <asm/iommu.h>
  41#include "pci.h"
  42
  43#define ROOT_SIZE                VTD_PAGE_SIZE
  44#define CONTEXT_SIZE                VTD_PAGE_SIZE
  45
  46#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  47#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  48
  49#define IOAPIC_RANGE_START        (0xfee00000)
  50#define IOAPIC_RANGE_END        (0xfeefffff)
  51#define IOVA_START_ADDR                (0x1000)
  52
  53#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  54
  55#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
  56
  57
  58static void flush_unmaps_timeout(unsigned long data);
  59
  60DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
  61
  62#define HIGH_WATER_MARK 250
  63struct deferred_flush_tables {
  64        int next;
  65        struct iova *iova[HIGH_WATER_MARK];
  66        struct dmar_domain *domain[HIGH_WATER_MARK];
  67};
  68
  69static struct deferred_flush_tables *deferred_flush;
  70
  71/* bitmap for indexing intel_iommus */
  72static int g_num_of_iommus;
  73
  74static DEFINE_SPINLOCK(async_umap_flush_lock);
  75static LIST_HEAD(unmaps_to_do);
  76
  77static int timer_on;
  78static long list_size;
  79
  80static void domain_remove_dev_info(struct dmar_domain *domain);
  81
  82int dmar_disabled;
  83static int __initdata dmar_map_gfx = 1;
  84static int dmar_forcedac;
  85static int intel_iommu_strict;
  86
  87#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
  88static DEFINE_SPINLOCK(device_domain_lock);
  89static LIST_HEAD(device_domain_list);
  90
  91static int __init intel_iommu_setup(char *str)
  92{
  93        if (!str)
  94                return -EINVAL;
  95        while (*str) {
  96                if (!strncmp(str, "off", 3)) {
  97                        dmar_disabled = 1;
  98                        printk(KERN_INFO"Intel-IOMMU: disabled\n");
  99                } else if (!strncmp(str, "igfx_off", 8)) {
 100                        dmar_map_gfx = 0;
 101                        printk(KERN_INFO
 102                                "Intel-IOMMU: disable GFX device mapping\n");
 103                } else if (!strncmp(str, "forcedac", 8)) {
 104                        printk(KERN_INFO
 105                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 106                        dmar_forcedac = 1;
 107                } else if (!strncmp(str, "strict", 6)) {
 108                        printk(KERN_INFO
 109                                "Intel-IOMMU: disable batched IOTLB flush\n");
 110                        intel_iommu_strict = 1;
 111                }
 112
 113                str += strcspn(str, ",");
 114                while (*str == ',')
 115                        str++;
 116        }
 117        return 0;
 118}
 119__setup("intel_iommu=", intel_iommu_setup);
 120
 121static struct kmem_cache *iommu_domain_cache;
 122static struct kmem_cache *iommu_devinfo_cache;
 123static struct kmem_cache *iommu_iova_cache;
 124
 125static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
 126{
 127        unsigned int flags;
 128        void *vaddr;
 129
 130        /* trying to avoid low memory issues */
 131        flags = current->flags & PF_MEMALLOC;
 132        current->flags |= PF_MEMALLOC;
 133        vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
 134        current->flags &= (~PF_MEMALLOC | flags);
 135        return vaddr;
 136}
 137
 138
 139static inline void *alloc_pgtable_page(void)
 140{
 141        unsigned int flags;
 142        void *vaddr;
 143
 144        /* trying to avoid low memory issues */
 145        flags = current->flags & PF_MEMALLOC;
 146        current->flags |= PF_MEMALLOC;
 147        vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
 148        current->flags &= (~PF_MEMALLOC | flags);
 149        return vaddr;
 150}
 151
 152static inline void free_pgtable_page(void *vaddr)
 153{
 154        free_page((unsigned long)vaddr);
 155}
 156
 157static inline void *alloc_domain_mem(void)
 158{
 159        return iommu_kmem_cache_alloc(iommu_domain_cache);
 160}
 161
 162static void free_domain_mem(void *vaddr)
 163{
 164        kmem_cache_free(iommu_domain_cache, vaddr);
 165}
 166
 167static inline void * alloc_devinfo_mem(void)
 168{
 169        return iommu_kmem_cache_alloc(iommu_devinfo_cache);
 170}
 171
 172static inline void free_devinfo_mem(void *vaddr)
 173{
 174        kmem_cache_free(iommu_devinfo_cache, vaddr);
 175}
 176
 177struct iova *alloc_iova_mem(void)
 178{
 179        return iommu_kmem_cache_alloc(iommu_iova_cache);
 180}
 181
 182void free_iova_mem(struct iova *iova)
 183{
 184        kmem_cache_free(iommu_iova_cache, iova);
 185}
 186
 187/* Gets context entry for a given bus and devfn */
 188static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 189                u8 bus, u8 devfn)
 190{
 191        struct root_entry *root;
 192        struct context_entry *context;
 193        unsigned long phy_addr;
 194        unsigned long flags;
 195
 196        spin_lock_irqsave(&iommu->lock, flags);
 197        root = &iommu->root_entry[bus];
 198        context = get_context_addr_from_root(root);
 199        if (!context) {
 200                context = (struct context_entry *)alloc_pgtable_page();
 201                if (!context) {
 202                        spin_unlock_irqrestore(&iommu->lock, flags);
 203                        return NULL;
 204                }
 205                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 206                phy_addr = virt_to_phys((void *)context);
 207                set_root_value(root, phy_addr);
 208                set_root_present(root);
 209                __iommu_flush_cache(iommu, root, sizeof(*root));
 210        }
 211        spin_unlock_irqrestore(&iommu->lock, flags);
 212        return &context[devfn];
 213}
 214
 215static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 216{
 217        struct root_entry *root;
 218        struct context_entry *context;
 219        int ret;
 220        unsigned long flags;
 221
 222        spin_lock_irqsave(&iommu->lock, flags);
 223        root = &iommu->root_entry[bus];
 224        context = get_context_addr_from_root(root);
 225        if (!context) {
 226                ret = 0;
 227                goto out;
 228        }
 229        ret = context_present(context[devfn]);
 230out:
 231        spin_unlock_irqrestore(&iommu->lock, flags);
 232        return ret;
 233}
 234
 235static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 236{
 237        struct root_entry *root;
 238        struct context_entry *context;
 239        unsigned long flags;
 240
 241        spin_lock_irqsave(&iommu->lock, flags);
 242        root = &iommu->root_entry[bus];
 243        context = get_context_addr_from_root(root);
 244        if (context) {
 245                context_clear_entry(context[devfn]);
 246                __iommu_flush_cache(iommu, &context[devfn], \
 247                        sizeof(*context));
 248        }
 249        spin_unlock_irqrestore(&iommu->lock, flags);
 250}
 251
 252static void free_context_table(struct intel_iommu *iommu)
 253{
 254        struct root_entry *root;
 255        int i;
 256        unsigned long flags;
 257        struct context_entry *context;
 258
 259        spin_lock_irqsave(&iommu->lock, flags);
 260        if (!iommu->root_entry) {
 261                goto out;
 262        }
 263        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 264                root = &iommu->root_entry[i];
 265                context = get_context_addr_from_root(root);
 266                if (context)
 267                        free_pgtable_page(context);
 268        }
 269        free_pgtable_page(iommu->root_entry);
 270        iommu->root_entry = NULL;
 271out:
 272        spin_unlock_irqrestore(&iommu->lock, flags);
 273}
 274
 275/* page table handling */
 276#define LEVEL_STRIDE                (9)
 277#define LEVEL_MASK                (((u64)1 << LEVEL_STRIDE) - 1)
 278
 279static inline int agaw_to_level(int agaw)
 280{
 281        return agaw + 2;
 282}
 283
 284static inline int agaw_to_width(int agaw)
 285{
 286        return 30 + agaw * LEVEL_STRIDE;
 287
 288}
 289
 290static inline int width_to_agaw(int width)
 291{
 292        return (width - 30) / LEVEL_STRIDE;
 293}
 294
 295static inline unsigned int level_to_offset_bits(int level)
 296{
 297        return (12 + (level - 1) * LEVEL_STRIDE);
 298}
 299
 300static inline int address_level_offset(u64 addr, int level)
 301{
 302        return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
 303}
 304
 305static inline u64 level_mask(int level)
 306{
 307        return ((u64)-1 << level_to_offset_bits(level));
 308}
 309
 310static inline u64 level_size(int level)
 311{
 312        return ((u64)1 << level_to_offset_bits(level));
 313}
 314
 315static inline u64 align_to_level(u64 addr, int level)
 316{
 317        return ((addr + level_size(level) - 1) & level_mask(level));
 318}
 319
 320static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
 321{
 322        int addr_width = agaw_to_width(domain->agaw);
 323        struct dma_pte *parent, *pte = NULL;
 324        int level = agaw_to_level(domain->agaw);
 325        int offset;
 326        unsigned long flags;
 327
 328        BUG_ON(!domain->pgd);
 329
 330        addr &= (((u64)1) << addr_width) - 1;
 331        parent = domain->pgd;
 332
 333        spin_lock_irqsave(&domain->mapping_lock, flags);
 334        while (level > 0) {
 335                void *tmp_page;
 336
 337                offset = address_level_offset(addr, level);
 338                pte = &parent[offset];
 339                if (level == 1)
 340                        break;
 341
 342                if (!dma_pte_present(*pte)) {
 343                        tmp_page = alloc_pgtable_page();
 344
 345                        if (!tmp_page) {
 346                                spin_unlock_irqrestore(&domain->mapping_lock,
 347                                        flags);
 348                                return NULL;
 349                        }
 350                        __iommu_flush_cache(domain->iommu, tmp_page,
 351                                        PAGE_SIZE);
 352                        dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
 353                        /*
 354                         * high level table always sets r/w, last level page
 355                         * table control read/write
 356                         */
 357                        dma_set_pte_readable(*pte);
 358                        dma_set_pte_writable(*pte);
 359                        __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
 360                }
 361                parent = phys_to_virt(dma_pte_addr(*pte));
 362                level--;
 363        }
 364
 365        spin_unlock_irqrestore(&domain->mapping_lock, flags);
 366        return pte;
 367}
 368
 369/* return address's pte at specific level */
 370static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
 371                int level)
 372{
 373        struct dma_pte *parent, *pte = NULL;
 374        int total = agaw_to_level(domain->agaw);
 375        int offset;
 376
 377        parent = domain->pgd;
 378        while (level <= total) {
 379                offset = address_level_offset(addr, total);
 380                pte = &parent[offset];
 381                if (level == total)
 382                        return pte;
 383
 384                if (!dma_pte_present(*pte))
 385                        break;
 386                parent = phys_to_virt(dma_pte_addr(*pte));
 387                total--;
 388        }
 389        return NULL;
 390}
 391
 392/* clear one page's page table */
 393static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
 394{
 395        struct dma_pte *pte = NULL;
 396
 397        /* get last level pte */
 398        pte = dma_addr_level_pte(domain, addr, 1);
 399
 400        if (pte) {
 401                dma_clear_pte(*pte);
 402                __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
 403        }
 404}
 405
 406/* clear last level pte, a tlb flush should be followed */
 407static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
 408{
 409        int addr_width = agaw_to_width(domain->agaw);
 410
 411        start &= (((u64)1) << addr_width) - 1;
 412        end &= (((u64)1) << addr_width) - 1;
 413        /* in case it's partial page */
 414        start = PAGE_ALIGN(start);
 415        end &= PAGE_MASK;
 416
 417        /* we don't need lock here, nobody else touches the iova range */
 418        while (start < end) {
 419                dma_pte_clear_one(domain, start);
 420                start += VTD_PAGE_SIZE;
 421        }
 422}
 423
 424/* free page table pages. last level pte should already be cleared */
 425static void dma_pte_free_pagetable(struct dmar_domain *domain,
 426        u64 start, u64 end)
 427{
 428        int addr_width = agaw_to_width(domain->agaw);
 429        struct dma_pte *pte;
 430        int total = agaw_to_level(domain->agaw);
 431        int level;
 432        u64 tmp;
 433
 434        start &= (((u64)1) << addr_width) - 1;
 435        end &= (((u64)1) << addr_width) - 1;
 436
 437        /* we don't need lock here, nobody else touches the iova range */
 438        level = 2;
 439        while (level <= total) {
 440                tmp = align_to_level(start, level);
 441                if (tmp >= end || (tmp + level_size(level) > end))
 442                        return;
 443
 444                while (tmp < end) {
 445                        pte = dma_addr_level_pte(domain, tmp, level);
 446                        if (pte) {
 447                                free_pgtable_page(
 448                                        phys_to_virt(dma_pte_addr(*pte)));
 449                                dma_clear_pte(*pte);
 450                                __iommu_flush_cache(domain->iommu,
 451                                                pte, sizeof(*pte));
 452                        }
 453                        tmp += level_size(level);
 454                }
 455                level++;
 456        }
 457        /* free pgd */
 458        if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
 459                free_pgtable_page(domain->pgd);
 460                domain->pgd = NULL;
 461        }
 462}
 463
 464/* iommu handling */
 465static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 466{
 467        struct root_entry *root;
 468        unsigned long flags;
 469
 470        root = (struct root_entry *)alloc_pgtable_page();
 471        if (!root)
 472                return -ENOMEM;
 473
 474        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 475
 476        spin_lock_irqsave(&iommu->lock, flags);
 477        iommu->root_entry = root;
 478        spin_unlock_irqrestore(&iommu->lock, flags);
 479
 480        return 0;
 481}
 482
 483static void iommu_set_root_entry(struct intel_iommu *iommu)
 484{
 485        void *addr;
 486        u32 cmd, sts;
 487        unsigned long flag;
 488
 489        addr = iommu->root_entry;
 490
 491        spin_lock_irqsave(&iommu->register_lock, flag);
 492        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 493
 494        cmd = iommu->gcmd | DMA_GCMD_SRTP;
 495        writel(cmd, iommu->reg + DMAR_GCMD_REG);
 496
 497        /* Make sure hardware complete it */
 498        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 499                readl, (sts & DMA_GSTS_RTPS), sts);
 500
 501        spin_unlock_irqrestore(&iommu->register_lock, flag);
 502}
 503
 504static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 505{
 506        u32 val;
 507        unsigned long flag;
 508
 509        if (!cap_rwbf(iommu->cap))
 510                return;
 511        val = iommu->gcmd | DMA_GCMD_WBF;
 512
 513        spin_lock_irqsave(&iommu->register_lock, flag);
 514        writel(val, iommu->reg + DMAR_GCMD_REG);
 515
 516        /* Make sure hardware complete it */
 517        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 518                        readl, (!(val & DMA_GSTS_WBFS)), val);
 519
 520        spin_unlock_irqrestore(&iommu->register_lock, flag);
 521}
 522
 523/* return value determine if we need a write buffer flush */
 524static int __iommu_flush_context(struct intel_iommu *iommu,
 525        u16 did, u16 source_id, u8 function_mask, u64 type,
 526        int non_present_entry_flush)
 527{
 528        u64 val = 0;
 529        unsigned long flag;
 530
 531        /*
 532         * In the non-present entry flush case, if hardware doesn't cache
 533         * non-present entry we do nothing and if hardware cache non-present
 534         * entry, we flush entries of domain 0 (the domain id is used to cache
 535         * any non-present entries)
 536         */
 537        if (non_present_entry_flush) {
 538                if (!cap_caching_mode(iommu->cap))
 539                        return 1;
 540                else
 541                        did = 0;
 542        }
 543
 544        switch (type) {
 545        case DMA_CCMD_GLOBAL_INVL:
 546                val = DMA_CCMD_GLOBAL_INVL;
 547                break;
 548        case DMA_CCMD_DOMAIN_INVL:
 549                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
 550                break;
 551        case DMA_CCMD_DEVICE_INVL:
 552                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
 553                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
 554                break;
 555        default:
 556                BUG();
 557        }
 558        val |= DMA_CCMD_ICC;
 559
 560        spin_lock_irqsave(&iommu->register_lock, flag);
 561        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
 562
 563        /* Make sure hardware complete it */
 564        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
 565                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
 566
 567        spin_unlock_irqrestore(&iommu->register_lock, flag);
 568
 569        /* flush context entry will implicitly flush write buffer */
 570        return 0;
 571}
 572
 573/* return value determine if we need a write buffer flush */
 574static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 575        u64 addr, unsigned int size_order, u64 type,
 576        int non_present_entry_flush)
 577{
 578        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
 579        u64 val = 0, val_iva = 0;
 580        unsigned long flag;
 581
 582        /*
 583         * In the non-present entry flush case, if hardware doesn't cache
 584         * non-present entry we do nothing and if hardware cache non-present
 585         * entry, we flush entries of domain 0 (the domain id is used to cache
 586         * any non-present entries)
 587         */
 588        if (non_present_entry_flush) {
 589                if (!cap_caching_mode(iommu->cap))
 590                        return 1;
 591                else
 592                        did = 0;
 593        }
 594
 595        switch (type) {
 596        case DMA_TLB_GLOBAL_FLUSH:
 597                /* global flush doesn't need set IVA_REG */
 598                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
 599                break;
 600        case DMA_TLB_DSI_FLUSH:
 601                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
 602                break;
 603        case DMA_TLB_PSI_FLUSH:
 604                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
 605                /* Note: always flush non-leaf currently */
 606                val_iva = size_order | addr;
 607                break;
 608        default:
 609                BUG();
 610        }
 611        /* Note: set drain read/write */
 612#if 0
 613        /*
 614         * This is probably to be super secure.. Looks like we can
 615         * ignore it without any impact.
 616         */
 617        if (cap_read_drain(iommu->cap))
 618                val |= DMA_TLB_READ_DRAIN;
 619#endif
 620        if (cap_write_drain(iommu->cap))
 621                val |= DMA_TLB_WRITE_DRAIN;
 622
 623        spin_lock_irqsave(&iommu->register_lock, flag);
 624        /* Note: Only uses first TLB reg currently */
 625        if (val_iva)
 626                dmar_writeq(iommu->reg + tlb_offset, val_iva);
 627        dmar_writeq(iommu->reg + tlb_offset + 8, val);
 628
 629        /* Make sure hardware complete it */
 630        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
 631                dmar_readq, (!(val & DMA_TLB_IVT)), val);
 632
 633        spin_unlock_irqrestore(&iommu->register_lock, flag);
 634
 635        /* check IOTLB invalidation granularity */
 636        if (DMA_TLB_IAIG(val) == 0)
 637                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
 638        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
 639                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
 640                        (unsigned long long)DMA_TLB_IIRG(type),
 641                        (unsigned long long)DMA_TLB_IAIG(val));
 642        /* flush iotlb entry will implicitly flush write buffer */
 643        return 0;
 644}
 645
 646static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 647        u64 addr, unsigned int pages, int non_present_entry_flush)
 648{
 649        unsigned int mask;
 650
 651        BUG_ON(addr & (~VTD_PAGE_MASK));
 652        BUG_ON(pages == 0);
 653
 654        /* Fallback to domain selective flush if no PSI support */
 655        if (!cap_pgsel_inv(iommu->cap))
 656                return iommu->flush.flush_iotlb(iommu, did, 0, 0,
 657                                                DMA_TLB_DSI_FLUSH,
 658                                                non_present_entry_flush);
 659
 660        /*
 661         * PSI requires page size to be 2 ^ x, and the base address is naturally
 662         * aligned to the size
 663         */
 664        mask = ilog2(__roundup_pow_of_two(pages));
 665        /* Fallback to domain selective flush if size is too big */
 666        if (mask > cap_max_amask_val(iommu->cap))
 667                return iommu->flush.flush_iotlb(iommu, did, 0, 0,
 668                        DMA_TLB_DSI_FLUSH, non_present_entry_flush);
 669
 670        return iommu->flush.flush_iotlb(iommu, did, addr, mask,
 671                                        DMA_TLB_PSI_FLUSH,
 672                                        non_present_entry_flush);
 673}
 674
 675static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
 676{
 677        u32 pmen;
 678        unsigned long flags;
 679
 680        spin_lock_irqsave(&iommu->register_lock, flags);
 681        pmen = readl(iommu->reg + DMAR_PMEN_REG);
 682        pmen &= ~DMA_PMEN_EPM;
 683        writel(pmen, iommu->reg + DMAR_PMEN_REG);
 684
 685        /* wait for the protected region status bit to clear */
 686        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
 687                readl, !(pmen & DMA_PMEN_PRS), pmen);
 688
 689        spin_unlock_irqrestore(&iommu->register_lock, flags);
 690}
 691
 692static int iommu_enable_translation(struct intel_iommu *iommu)
 693{
 694        u32 sts;
 695        unsigned long flags;
 696
 697        spin_lock_irqsave(&iommu->register_lock, flags);
 698        writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
 699
 700        /* Make sure hardware complete it */
 701        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 702                readl, (sts & DMA_GSTS_TES), sts);
 703
 704        iommu->gcmd |= DMA_GCMD_TE;
 705        spin_unlock_irqrestore(&iommu->register_lock, flags);
 706        return 0;
 707}
 708
 709static int iommu_disable_translation(struct intel_iommu *iommu)
 710{
 711        u32 sts;
 712        unsigned long flag;
 713
 714        spin_lock_irqsave(&iommu->register_lock, flag);
 715        iommu->gcmd &= ~DMA_GCMD_TE;
 716        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
 717
 718        /* Make sure hardware complete it */
 719        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 720                readl, (!(sts & DMA_GSTS_TES)), sts);
 721
 722        spin_unlock_irqrestore(&iommu->register_lock, flag);
 723        return 0;
 724}
 725
 726/* iommu interrupt handling. Most stuff are MSI-like. */
 727
 728static const char *fault_reason_strings[] =
 729{
 730        "Software",
 731        "Present bit in root entry is clear",
 732        "Present bit in context entry is clear",
 733        "Invalid context entry",
 734        "Access beyond MGAW",
 735        "PTE Write access is not set",
 736        "PTE Read access is not set",
 737        "Next page table ptr is invalid",
 738        "Root table address invalid",
 739        "Context table ptr is invalid",
 740        "non-zero reserved fields in RTP",
 741        "non-zero reserved fields in CTP",
 742        "non-zero reserved fields in PTE",
 743};
 744#define MAX_FAULT_REASON_IDX         (ARRAY_SIZE(fault_reason_strings) - 1)
 745
 746const char *dmar_get_fault_reason(u8 fault_reason)
 747{
 748        if (fault_reason > MAX_FAULT_REASON_IDX)
 749                return "Unknown";
 750        else
 751                return fault_reason_strings[fault_reason];
 752}
 753
 754void dmar_msi_unmask(unsigned int irq)
 755{
 756        struct intel_iommu *iommu = get_irq_data(irq);
 757        unsigned long flag;
 758
 759        /* unmask it */
 760        spin_lock_irqsave(&iommu->register_lock, flag);
 761        writel(0, iommu->reg + DMAR_FECTL_REG);
 762        /* Read a reg to force flush the post write */
 763        readl(iommu->reg + DMAR_FECTL_REG);
 764        spin_unlock_irqrestore(&iommu->register_lock, flag);
 765}
 766
 767void dmar_msi_mask(unsigned int irq)
 768{
 769        unsigned long flag;
 770        struct intel_iommu *iommu = get_irq_data(irq);
 771
 772        /* mask it */
 773        spin_lock_irqsave(&iommu->register_lock, flag);
 774        writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
 775        /* Read a reg to force flush the post write */
 776        readl(iommu->reg + DMAR_FECTL_REG);
 777        spin_unlock_irqrestore(&iommu->register_lock, flag);
 778}
 779
 780void dmar_msi_write(int irq, struct msi_msg *msg)
 781{
 782        struct intel_iommu *iommu = get_irq_data(irq);
 783        unsigned long flag;
 784
 785        spin_lock_irqsave(&iommu->register_lock, flag);
 786        writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
 787        writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
 788        writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
 789        spin_unlock_irqrestore(&iommu->register_lock, flag);
 790}
 791
 792void dmar_msi_read(int irq, struct msi_msg *msg)
 793{
 794        struct intel_iommu *iommu = get_irq_data(irq);
 795        unsigned long flag;
 796
 797        spin_lock_irqsave(&iommu->register_lock, flag);
 798        msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
 799        msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
 800        msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
 801        spin_unlock_irqrestore(&iommu->register_lock, flag);
 802}
 803
 804static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
 805                u8 fault_reason, u16 source_id, unsigned long long addr)
 806{
 807        const char *reason;
 808
 809        reason = dmar_get_fault_reason(fault_reason);
 810
 811        printk(KERN_ERR
 812                "DMAR:[%s] Request device [%02x:%02x.%d] "
 813                "fault addr %llx \n"
 814                "DMAR:[fault reason %02d] %s\n",
 815                (type ? "DMA Read" : "DMA Write"),
 816                (source_id >> 8), PCI_SLOT(source_id & 0xFF),
 817                PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
 818        return 0;
 819}
 820
 821#define PRIMARY_FAULT_REG_LEN (16)
 822static irqreturn_t iommu_page_fault(int irq, void *dev_id)
 823{
 824        struct intel_iommu *iommu = dev_id;
 825        int reg, fault_index;
 826        u32 fault_status;
 827        unsigned long flag;
 828
 829        spin_lock_irqsave(&iommu->register_lock, flag);
 830        fault_status = readl(iommu->reg + DMAR_FSTS_REG);
 831
 832        /* TBD: ignore advanced fault log currently */
 833        if (!(fault_status & DMA_FSTS_PPF))
 834                goto clear_overflow;
 835
 836        fault_index = dma_fsts_fault_record_index(fault_status);
 837        reg = cap_fault_reg_offset(iommu->cap);
 838        while (1) {
 839                u8 fault_reason;
 840                u16 source_id;
 841                u64 guest_addr;
 842                int type;
 843                u32 data;
 844
 845                /* highest 32 bits */
 846                data = readl(iommu->reg + reg +
 847                                fault_index * PRIMARY_FAULT_REG_LEN + 12);
 848                if (!(data & DMA_FRCD_F))
 849                        break;
 850
 851                fault_reason = dma_frcd_fault_reason(data);
 852                type = dma_frcd_type(data);
 853
 854                data = readl(iommu->reg + reg +
 855                                fault_index * PRIMARY_FAULT_REG_LEN + 8);
 856                source_id = dma_frcd_source_id(data);
 857
 858                guest_addr = dmar_readq(iommu->reg + reg +
 859                                fault_index * PRIMARY_FAULT_REG_LEN);
 860                guest_addr = dma_frcd_page_addr(guest_addr);
 861                /* clear the fault */
 862                writel(DMA_FRCD_F, iommu->reg + reg +
 863                        fault_index * PRIMARY_FAULT_REG_LEN + 12);
 864
 865                spin_unlock_irqrestore(&iommu->register_lock, flag);
 866
 867                iommu_page_fault_do_one(iommu, type, fault_reason,
 868                                source_id, guest_addr);
 869
 870                fault_index++;
 871                if (fault_index > cap_num_fault_regs(iommu->cap))
 872                        fault_index = 0;
 873                spin_lock_irqsave(&iommu->register_lock, flag);
 874        }
 875clear_overflow:
 876        /* clear primary fault overflow */
 877        fault_status = readl(iommu->reg + DMAR_FSTS_REG);
 878        if (fault_status & DMA_FSTS_PFO)
 879                writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
 880
 881        spin_unlock_irqrestore(&iommu->register_lock, flag);
 882        return IRQ_HANDLED;
 883}
 884
 885int dmar_set_interrupt(struct intel_iommu *iommu)
 886{
 887        int irq, ret;
 888
 889        irq = create_irq();
 890        if (!irq) {
 891                printk(KERN_ERR "IOMMU: no free vectors\n");
 892                return -EINVAL;
 893        }
 894
 895        set_irq_data(irq, iommu);
 896        iommu->irq = irq;
 897
 898        ret = arch_setup_dmar_msi(irq);
 899        if (ret) {
 900                set_irq_data(irq, NULL);
 901                iommu->irq = 0;
 902                destroy_irq(irq);
 903                return 0;
 904        }
 905
 906        /* Force fault register is cleared */
 907        iommu_page_fault(irq, iommu);
 908
 909        ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
 910        if (ret)
 911                printk(KERN_ERR "IOMMU: can't request irq\n");
 912        return ret;
 913}
 914
 915static int iommu_init_domains(struct intel_iommu *iommu)
 916{
 917        unsigned long ndomains;
 918        unsigned long nlongs;
 919
 920        ndomains = cap_ndoms(iommu->cap);
 921        pr_debug("Number of Domains supportd <%ld>\n", ndomains);
 922        nlongs = BITS_TO_LONGS(ndomains);
 923
 924        /* TBD: there might be 64K domains,
 925         * consider other allocation for future chip
 926         */
 927        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
 928        if (!iommu->domain_ids) {
 929                printk(KERN_ERR "Allocating domain id array failed\n");
 930                return -ENOMEM;
 931        }
 932        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
 933                        GFP_KERNEL);
 934        if (!iommu->domains) {
 935                printk(KERN_ERR "Allocating domain array failed\n");
 936                kfree(iommu->domain_ids);
 937                return -ENOMEM;
 938        }
 939
 940        spin_lock_init(&iommu->lock);
 941
 942        /*
 943         * if Caching mode is set, then invalid translations are tagged
 944         * with domainid 0. Hence we need to pre-allocate it.
 945         */
 946        if (cap_caching_mode(iommu->cap))
 947                set_bit(0, iommu->domain_ids);
 948        return 0;
 949}
 950
 951
 952static void domain_exit(struct dmar_domain *domain);
 953
 954void free_dmar_iommu(struct intel_iommu *iommu)
 955{
 956        struct dmar_domain *domain;
 957        int i;
 958
 959        i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
 960        for (; i < cap_ndoms(iommu->cap); ) {
 961                domain = iommu->domains[i];
 962                clear_bit(i, iommu->domain_ids);
 963                domain_exit(domain);
 964                i = find_next_bit(iommu->domain_ids,
 965                        cap_ndoms(iommu->cap), i+1);
 966        }
 967
 968        if (iommu->gcmd & DMA_GCMD_TE)
 969                iommu_disable_translation(iommu);
 970
 971        if (iommu->irq) {
 972                set_irq_data(iommu->irq, NULL);
 973                /* This will mask the irq */
 974                free_irq(iommu->irq, iommu);
 975                destroy_irq(iommu->irq);
 976        }
 977
 978        kfree(iommu->domains);
 979        kfree(iommu->domain_ids);
 980
 981        /* free context mapping */
 982        free_context_table(iommu);
 983}
 984
 985static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
 986{
 987        unsigned long num;
 988        unsigned long ndomains;
 989        struct dmar_domain *domain;
 990        unsigned long flags;
 991
 992        domain = alloc_domain_mem();
 993        if (!domain)
 994                return NULL;
 995
 996        ndomains = cap_ndoms(iommu->cap);
 997
 998        spin_lock_irqsave(&iommu->lock, flags);
 999        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1000        if (num >= ndomains) {
1001                spin_unlock_irqrestore(&iommu->lock, flags);
1002                free_domain_mem(domain);
1003                printk(KERN_ERR "IOMMU: no free domain ids\n");
1004                return NULL;
1005        }
1006
1007        set_bit(num, iommu->domain_ids);
1008        domain->id = num;
1009        domain->iommu = iommu;
1010        iommu->domains[num] = domain;
1011        spin_unlock_irqrestore(&iommu->lock, flags);
1012
1013        return domain;
1014}
1015
1016static void iommu_free_domain(struct dmar_domain *domain)
1017{
1018        unsigned long flags;
1019
1020        spin_lock_irqsave(&domain->iommu->lock, flags);
1021        clear_bit(domain->id, domain->iommu->domain_ids);
1022        spin_unlock_irqrestore(&domain->iommu->lock, flags);
1023}
1024
1025static struct iova_domain reserved_iova_list;
1026static struct lock_class_key reserved_alloc_key;
1027static struct lock_class_key reserved_rbtree_key;
1028
1029static void dmar_init_reserved_ranges(void)
1030{
1031        struct pci_dev *pdev = NULL;
1032        struct iova *iova;
1033        int i;
1034        u64 addr, size;
1035
1036        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1037
1038        lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1039                &reserved_alloc_key);
1040        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1041                &reserved_rbtree_key);
1042
1043        /* IOAPIC ranges shouldn't be accessed by DMA */
1044        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1045                IOVA_PFN(IOAPIC_RANGE_END));
1046        if (!iova)
1047                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1048
1049        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1050        for_each_pci_dev(pdev) {
1051                struct resource *r;
1052
1053                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1054                        r = &pdev->resource[i];
1055                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1056                                continue;
1057                        addr = r->start;
1058                        addr &= PAGE_MASK;
1059                        size = r->end - addr;
1060                        size = PAGE_ALIGN(size);
1061                        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1062                                IOVA_PFN(size + addr) - 1);
1063                        if (!iova)
1064                                printk(KERN_ERR "Reserve iova failed\n");
1065                }
1066        }
1067
1068}
1069
1070static void domain_reserve_special_ranges(struct dmar_domain *domain)
1071{
1072        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1073}
1074
1075static inline int guestwidth_to_adjustwidth(int gaw)
1076{
1077        int agaw;
1078        int r = (gaw - 12) % 9;
1079
1080        if (r == 0)
1081                agaw = gaw;
1082        else
1083                agaw = gaw + 9 - r;
1084        if (agaw > 64)
1085                agaw = 64;
1086        return agaw;
1087}
1088
1089static int domain_init(struct dmar_domain *domain, int guest_width)
1090{
1091        struct intel_iommu *iommu;
1092        int adjust_width, agaw;
1093        unsigned long sagaw;
1094
1095        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1096        spin_lock_init(&domain->mapping_lock);
1097
1098        domain_reserve_special_ranges(domain);
1099
1100        /* calculate AGAW */
1101        iommu = domain->iommu;
1102        if (guest_width > cap_mgaw(iommu->cap))
1103                guest_width = cap_mgaw(iommu->cap);
1104        domain->gaw = guest_width;
1105        adjust_width = guestwidth_to_adjustwidth(guest_width);
1106        agaw = width_to_agaw(adjust_width);
1107        sagaw = cap_sagaw(iommu->cap);
1108        if (!test_bit(agaw, &sagaw)) {
1109                /* hardware doesn't support it, choose a bigger one */
1110                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1111                agaw = find_next_bit(&sagaw, 5, agaw);
1112                if (agaw >= 5)
1113                        return -ENODEV;
1114        }
1115        domain->agaw = agaw;
1116        INIT_LIST_HEAD(&domain->devices);
1117
1118        /* always allocate the top pgd */
1119        domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1120        if (!domain->pgd)
1121                return -ENOMEM;
1122        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1123        return 0;
1124}
1125
1126static void domain_exit(struct dmar_domain *domain)
1127{
1128        u64 end;
1129
1130        /* Domain 0 is reserved, so dont process it */
1131        if (!domain)
1132                return;
1133
1134        domain_remove_dev_info(domain);
1135        /* destroy iovas */
1136        put_iova_domain(&domain->iovad);
1137        end = DOMAIN_MAX_ADDR(domain->gaw);
1138        end = end & (~PAGE_MASK);
1139
1140        /* clear ptes */
1141        dma_pte_clear_range(domain, 0, end);
1142
1143        /* free page tables */
1144        dma_pte_free_pagetable(domain, 0, end);
1145
1146        iommu_free_domain(domain);
1147        free_domain_mem(domain);
1148}
1149
1150static int domain_context_mapping_one(struct dmar_domain *domain,
1151                u8 bus, u8 devfn)
1152{
1153        struct context_entry *context;
1154        struct intel_iommu *iommu = domain->iommu;
1155        unsigned long flags;
1156
1157        pr_debug("Set context mapping for %02x:%02x.%d\n",
1158                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1159        BUG_ON(!domain->pgd);
1160        context = device_to_context_entry(iommu, bus, devfn);
1161        if (!context)
1162                return -ENOMEM;
1163        spin_lock_irqsave(&iommu->lock, flags);
1164        if (context_present(*context)) {
1165                spin_unlock_irqrestore(&iommu->lock, flags);
1166                return 0;
1167        }
1168
1169        context_set_domain_id(*context, domain->id);
1170        context_set_address_width(*context, domain->agaw);
1171        context_set_address_root(*context, virt_to_phys(domain->pgd));
1172        context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1173        context_set_fault_enable(*context);
1174        context_set_present(*context);
1175        __iommu_flush_cache(iommu, context, sizeof(*context));
1176
1177        /* it's a non-present to present mapping */
1178        if (iommu->flush.flush_context(iommu, domain->id,
1179                (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1180                DMA_CCMD_DEVICE_INVL, 1))
1181                iommu_flush_write_buffer(iommu);
1182        else
1183                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1184
1185        spin_unlock_irqrestore(&iommu->lock, flags);
1186        return 0;
1187}
1188
1189static int
1190domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1191{
1192        int ret;
1193        struct pci_dev *tmp, *parent;
1194
1195        ret = domain_context_mapping_one(domain, pdev->bus->number,
1196                pdev->devfn);
1197        if (ret)
1198                return ret;
1199
1200        /* dependent device mapping */
1201        tmp = pci_find_upstream_pcie_bridge(pdev);
1202        if (!tmp)
1203                return 0;
1204        /* Secondary interface's bus number and devfn 0 */
1205        parent = pdev->bus->self;
1206        while (parent != tmp) {
1207                ret = domain_context_mapping_one(domain, parent->bus->number,
1208                        parent->devfn);
1209                if (ret)
1210                        return ret;
1211                parent = parent->bus->self;
1212        }
1213        if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1214                return domain_context_mapping_one(domain,
1215                        tmp->subordinate->number, 0);
1216        else /* this is a legacy PCI bridge */
1217                return domain_context_mapping_one(domain,
1218                        tmp->bus->number, tmp->devfn);
1219}
1220
1221static int domain_context_mapped(struct dmar_domain *domain,
1222        struct pci_dev *pdev)
1223{
1224        int ret;
1225        struct pci_dev *tmp, *parent;
1226
1227        ret = device_context_mapped(domain->iommu,
1228                pdev->bus->number, pdev->devfn);
1229        if (!ret)
1230                return ret;
1231        /* dependent device mapping */
1232        tmp = pci_find_upstream_pcie_bridge(pdev);
1233        if (!tmp)
1234                return ret;
1235        /* Secondary interface's bus number and devfn 0 */
1236        parent = pdev->bus->self;
1237        while (parent != tmp) {
1238                ret = device_context_mapped(domain->iommu, parent->bus->number,
1239                        parent->devfn);
1240                if (!ret)
1241                        return ret;
1242                parent = parent->bus->self;
1243        }
1244        if (tmp->is_pcie)
1245                return device_context_mapped(domain->iommu,
1246                        tmp->subordinate->number, 0);
1247        else
1248                return device_context_mapped(domain->iommu,
1249                        tmp->bus->number, tmp->devfn);
1250}
1251
1252static int
1253domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1254                        u64 hpa, size_t size, int prot)
1255{
1256        u64 start_pfn, end_pfn;
1257        struct dma_pte *pte;
1258        int index;
1259        int addr_width = agaw_to_width(domain->agaw);
1260
1261        hpa &= (((u64)1) << addr_width) - 1;
1262
1263        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1264                return -EINVAL;
1265        iova &= PAGE_MASK;
1266        start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1267        end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1268        index = 0;
1269        while (start_pfn < end_pfn) {
1270                pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1271                if (!pte)
1272                        return -ENOMEM;
1273                /* We don't need lock here, nobody else
1274                 * touches the iova range
1275                 */
1276                BUG_ON(dma_pte_addr(*pte));
1277                dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1278                dma_set_pte_prot(*pte, prot);
1279                __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1280                start_pfn++;
1281                index++;
1282        }
1283        return 0;
1284}
1285
1286static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1287{
1288        clear_context_table(domain->iommu, bus, devfn);
1289        domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1290                                           DMA_CCMD_GLOBAL_INVL, 0);
1291        domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1292                                         DMA_TLB_GLOBAL_FLUSH, 0);
1293}
1294
1295static void domain_remove_dev_info(struct dmar_domain *domain)
1296{
1297        struct device_domain_info *info;
1298        unsigned long flags;
1299
1300        spin_lock_irqsave(&device_domain_lock, flags);
1301        while (!list_empty(&domain->devices)) {
1302                info = list_entry(domain->devices.next,
1303                        struct device_domain_info, link);
1304                list_del(&info->link);
1305                list_del(&info->global);
1306                if (info->dev)
1307                        info->dev->dev.archdata.iommu = NULL;
1308                spin_unlock_irqrestore(&device_domain_lock, flags);
1309
1310                detach_domain_for_dev(info->domain, info->bus, info->devfn);
1311                free_devinfo_mem(info);
1312
1313                spin_lock_irqsave(&device_domain_lock, flags);
1314        }
1315        spin_unlock_irqrestore(&device_domain_lock, flags);
1316}
1317
1318/*
1319 * find_domain
1320 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1321 */
1322static struct dmar_domain *
1323find_domain(struct pci_dev *pdev)
1324{
1325        struct device_domain_info *info;
1326
1327        /* No lock here, assumes no domain exit in normal case */
1328        info = pdev->dev.archdata.iommu;
1329        if (info)
1330                return info->domain;
1331        return NULL;
1332}
1333
1334/* domain is initialized */
1335static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1336{
1337        struct dmar_domain *domain, *found = NULL;
1338        struct intel_iommu *iommu;
1339        struct dmar_drhd_unit *drhd;
1340        struct device_domain_info *info, *tmp;
1341        struct pci_dev *dev_tmp;
1342        unsigned long flags;
1343        int bus = 0, devfn = 0;
1344
1345        domain = find_domain(pdev);
1346        if (domain)
1347                return domain;
1348
1349        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1350        if (dev_tmp) {
1351                if (dev_tmp->is_pcie) {
1352                        bus = dev_tmp->subordinate->number;
1353                        devfn = 0;
1354                } else {
1355                        bus = dev_tmp->bus->number;
1356                        devfn = dev_tmp->devfn;
1357                }
1358                spin_lock_irqsave(&device_domain_lock, flags);
1359                list_for_each_entry(info, &device_domain_list, global) {
1360                        if (info->bus == bus && info->devfn == devfn) {
1361                                found = info->domain;
1362                                break;
1363                        }
1364                }
1365                spin_unlock_irqrestore(&device_domain_lock, flags);
1366                /* pcie-pci bridge already has a domain, uses it */
1367                if (found) {
1368                        domain = found;
1369                        goto found_domain;
1370                }
1371        }
1372
1373        /* Allocate new domain for the device */
1374        drhd = dmar_find_matched_drhd_unit(pdev);
1375        if (!drhd) {
1376                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1377                        pci_name(pdev));
1378                return NULL;
1379        }
1380        iommu = drhd->iommu;
1381
1382        domain = iommu_alloc_domain(iommu);
1383        if (!domain)
1384                goto error;
1385
1386        if (domain_init(domain, gaw)) {
1387                domain_exit(domain);
1388                goto error;
1389        }
1390
1391        /* register pcie-to-pci device */
1392        if (dev_tmp) {
1393                info = alloc_devinfo_mem();
1394                if (!info) {
1395                        domain_exit(domain);
1396                        goto error;
1397                }
1398                info->bus = bus;
1399                info->devfn = devfn;
1400                info->dev = NULL;
1401                info->domain = domain;
1402                /* This domain is shared by devices under p2p bridge */
1403                domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1404
1405                /* pcie-to-pci bridge already has a domain, uses it */
1406                found = NULL;
1407                spin_lock_irqsave(&device_domain_lock, flags);
1408                list_for_each_entry(tmp, &device_domain_list, global) {
1409                        if (tmp->bus == bus && tmp->devfn == devfn) {
1410                                found = tmp->domain;
1411                                break;
1412                        }
1413                }
1414                if (found) {
1415                        free_devinfo_mem(info);
1416                        domain_exit(domain);
1417                        domain = found;
1418                } else {
1419                        list_add(&info->link, &domain->devices);
1420                        list_add(&info->global, &device_domain_list);
1421                }
1422                spin_unlock_irqrestore(&device_domain_lock, flags);
1423        }
1424
1425found_domain:
1426        info = alloc_devinfo_mem();
1427        if (!info)
1428                goto error;
1429        info->bus = pdev->bus->number;
1430        info->devfn = pdev->devfn;
1431        info->dev = pdev;
1432        info->domain = domain;
1433        spin_lock_irqsave(&device_domain_lock, flags);
1434        /* somebody is fast */
1435        found = find_domain(pdev);
1436        if (found != NULL) {
1437                spin_unlock_irqrestore(&device_domain_lock, flags);
1438                if (found != domain) {
1439                        domain_exit(domain);
1440                        domain = found;
1441                }
1442                free_devinfo_mem(info);
1443                return domain;
1444        }
1445        list_add(&info->link, &domain->devices);
1446        list_add(&info->global, &device_domain_list);
1447        pdev->dev.archdata.iommu = info;
1448        spin_unlock_irqrestore(&device_domain_lock, flags);
1449        return domain;
1450error:
1451        /* recheck it here, maybe others set it */
1452        return find_domain(pdev);
1453}
1454
1455static int iommu_prepare_identity_map(struct pci_dev *pdev,
1456                                      unsigned long long start,
1457                                      unsigned long long end)
1458{
1459        struct dmar_domain *domain;
1460        unsigned long size;
1461        unsigned long long base;
1462        int ret;
1463
1464        printk(KERN_INFO
1465                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1466                pci_name(pdev), start, end);
1467        /* page table init */
1468        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1469        if (!domain)
1470                return -ENOMEM;
1471
1472        /* The address might not be aligned */
1473        base = start & PAGE_MASK;
1474        size = end - base;
1475        size = PAGE_ALIGN(size);
1476        if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1477                        IOVA_PFN(base + size) - 1)) {
1478                printk(KERN_ERR "IOMMU: reserve iova failed\n");
1479                ret = -ENOMEM;
1480                goto error;
1481        }
1482
1483        pr_debug("Mapping reserved region %lx@%llx for %s\n",
1484                size, base, pci_name(pdev));
1485        /*
1486         * RMRR range might have overlap with physical memory range,
1487         * clear it first
1488         */
1489        dma_pte_clear_range(domain, base, base + size);
1490
1491        ret = domain_page_mapping(domain, base, base, size,
1492                DMA_PTE_READ|DMA_PTE_WRITE);
1493        if (ret)
1494                goto error;
1495
1496        /* context entry init */
1497        ret = domain_context_mapping(domain, pdev);
1498        if (!ret)
1499                return 0;
1500error:
1501        domain_exit(domain);
1502        return ret;
1503
1504}
1505
1506static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1507        struct pci_dev *pdev)
1508{
1509        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1510                return 0;
1511        return iommu_prepare_identity_map(pdev, rmrr->base_address,
1512                rmrr->end_address + 1);
1513}
1514
1515#ifdef CONFIG_DMAR_GFX_WA
1516struct iommu_prepare_data {
1517        struct pci_dev *pdev;
1518        int ret;
1519};
1520
1521static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1522                                         unsigned long end_pfn, void *datax)
1523{
1524        struct iommu_prepare_data *data;
1525
1526        data = (struct iommu_prepare_data *)datax;
1527
1528        data->ret = iommu_prepare_identity_map(data->pdev,
1529                                start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1530        return data->ret;
1531
1532}
1533
1534static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1535{
1536        int nid;
1537        struct iommu_prepare_data data;
1538
1539        data.pdev = pdev;
1540        data.ret = 0;
1541
1542        for_each_online_node(nid) {
1543                work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1544                if (data.ret)
1545                        return data.ret;
1546        }
1547        return data.ret;
1548}
1549
1550static void __init iommu_prepare_gfx_mapping(void)
1551{
1552        struct pci_dev *pdev = NULL;
1553        int ret;
1554
1555        for_each_pci_dev(pdev) {
1556                if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1557                                !IS_GFX_DEVICE(pdev))
1558                        continue;
1559                printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1560                        pci_name(pdev));
1561                ret = iommu_prepare_with_active_regions(pdev);
1562                if (ret)
1563                        printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1564        }
1565}
1566#endif
1567
1568#ifdef CONFIG_DMAR_FLOPPY_WA
1569static inline void iommu_prepare_isa(void)
1570{
1571        struct pci_dev *pdev;
1572        int ret;
1573
1574        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1575        if (!pdev)
1576                return;
1577
1578        printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1579        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1580
1581        if (ret)
1582                printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1583                        "floppy might not work\n");
1584
1585}
1586#else
1587static inline void iommu_prepare_isa(void)
1588{
1589        return;
1590}
1591#endif /* !CONFIG_DMAR_FLOPPY_WA */
1592
1593int __init init_dmars(void)
1594{
1595        struct dmar_drhd_unit *drhd;
1596        struct dmar_rmrr_unit *rmrr;
1597        struct pci_dev *pdev;
1598        struct intel_iommu *iommu;
1599        int i, ret, unit = 0;
1600
1601        /*
1602         * for each drhd
1603         *    allocate root
1604         *    initialize and program root entry to not present
1605         * endfor
1606         */
1607        for_each_drhd_unit(drhd) {
1608                g_num_of_iommus++;
1609                /*
1610                 * No lock needed: this is only incremented in the single-
1611                 * threaded kernel __init code path; all other accesses are
1612                 * read only.
1613                 */
1614        }
1615
1616        deferred_flush = kzalloc(g_num_of_iommus *
1617                sizeof(struct deferred_flush_tables), GFP_KERNEL);
1618        if (!deferred_flush) {
1619                ret = -ENOMEM;
1620                goto error;
1621        }
1622
1623        for_each_drhd_unit(drhd) {
1624                if (drhd->ignored)
1625                        continue;
1626
1627                iommu = drhd->iommu;
1628
1629                ret = iommu_init_domains(iommu);
1630                if (ret)
1631                        goto error;
1632
1633                /*
1634                 * TBD:
1635                 * we could share the same root & context tables
1636                 * among all IOMMUs.  Needs to be split out later.
1637                 */
1638                ret = iommu_alloc_root_entry(iommu);
1639                if (ret) {
1640                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1641                        goto error;
1642                }
1643        }
1644
1645        for_each_drhd_unit(drhd) {
1646                if (drhd->ignored)
1647                        continue;
1648
1649                iommu = drhd->iommu;
1650                if (dmar_enable_qi(iommu)) {
1651                        /*
1652                         * Queued Invalidate not enabled, use Register Based
1653                         * Invalidate
1654                         */
1655                        iommu->flush.flush_context = __iommu_flush_context;
1656                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1657                        printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1658                               "invalidation\n",
1659                               (unsigned long long)drhd->reg_base_addr);
1660                } else {
1661                        iommu->flush.flush_context = qi_flush_context;
1662                        iommu->flush.flush_iotlb = qi_flush_iotlb;
1663                        printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1664                               "invalidation\n",
1665                               (unsigned long long)drhd->reg_base_addr);
1666                }
1667        }
1668
1669        /*
1670         * For each rmrr
1671         *   for each dev attached to rmrr
1672         *   do
1673         *     locate drhd for dev, alloc domain for dev
1674         *     allocate free domain
1675         *     allocate page table entries for rmrr
1676         *     if context not allocated for bus
1677         *           allocate and init context
1678         *           set present in root table for this bus
1679         *     init context with domain, translation etc
1680         *    endfor
1681         * endfor
1682         */
1683        for_each_rmrr_units(rmrr) {
1684                for (i = 0; i < rmrr->devices_cnt; i++) {
1685                        pdev = rmrr->devices[i];
1686                        /* some BIOSes list non-existent devices in the DMAR table */
1687                        if (!pdev)
1688                                continue;
1689                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1690                        if (ret)
1691                                printk(KERN_ERR
1692                                 "IOMMU: mapping reserved region failed\n");
1693                }
1694        }
1695
1696        iommu_prepare_gfx_mapping();
1697
1698        iommu_prepare_isa();
1699
1700        /*
1701         * for each drhd
1702         *   enable fault log
1703         *   global invalidate context cache
1704         *   global invalidate iotlb
1705         *   enable translation
1706         */
1707        for_each_drhd_unit(drhd) {
1708                if (drhd->ignored)
1709                        continue;
1710                iommu = drhd->iommu;
1711                sprintf(iommu->name, "dmar%d", unit++);
1712
1713                iommu_flush_write_buffer(iommu);
1714
1715                ret = dmar_set_interrupt(iommu);
1716                if (ret)
1717                        goto error;
1718
1719                iommu_set_root_entry(iommu);
1720
1721                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1722                                           0);
1723                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1724                                         0);
1725                iommu_disable_protect_mem_regions(iommu);
1726
1727                ret = iommu_enable_translation(iommu);
1728                if (ret)
1729                        goto error;
1730        }
1731
1732        return 0;
1733error:
1734        for_each_drhd_unit(drhd) {
1735                if (drhd->ignored)
1736                        continue;
1737                iommu = drhd->iommu;
1738                free_iommu(iommu);
1739        }
1740        return ret;
1741}
1742
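/*
 * aligned_size() returns the number of bytes needed to cover the region
 * host_addr .. host_addr + size with whole pages.  Worked example
 * (illustrative, assuming 4 KiB pages): host_addr = 0x1003, size = 0x2000
 * gives 0x003 + 0x2000 = 0x2003, which PAGE_ALIGN() rounds up to 0x3000,
 * i.e. three pages.
 */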
1743static inline u64 aligned_size(u64 host_addr, size_t size)
1744{
1745        u64 addr;
1746        addr = (host_addr & (~PAGE_MASK)) + size;
1747        return PAGE_ALIGN(addr);
1748}
1749
1750struct iova *
1751iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1752{
1753        struct iova *piova;
1754
1755        /* Make sure it's in range */
1756        end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1757        if (!size || (IOVA_START_ADDR + size > end))
1758                return NULL;
1759
1760        piova = alloc_iova(&domain->iovad,
1761                        size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1762        return piova;
1763}
1764
1765static struct iova *
1766__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1767                   size_t size, u64 dma_mask)
1768{
1769        struct pci_dev *pdev = to_pci_dev(dev);
1770        struct iova *iova = NULL;
1771
1772        if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1773                iova = iommu_alloc_iova(domain, size, dma_mask);
1774        else {
1775                /*
1776                 * First try to allocate an io virtual address in
1777                 * DMA_32BIT_MASK and if that fails then try allocating
1778                 * from higher range
1779                 */
1780                iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1781                if (!iova)
1782                        iova = iommu_alloc_iova(domain, size, dma_mask);
1783        }
1784
1785        if (!iova) {
1786                printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1787                return NULL;
1788        }
1789
1790        return iova;
1791}
1792
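/*
 * Return the DMA-remapping domain for pdev, allocating the domain and
 * installing the context mapping on demand; returns NULL on failure.
 */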
1793static struct dmar_domain *
1794get_valid_domain_for_dev(struct pci_dev *pdev)
1795{
1796        struct dmar_domain *domain;
1797        int ret;
1798
1799        domain = get_domain_for_dev(pdev,
1800                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
1801        if (!domain) {
1802                printk(KERN_ERR
1803                        "Allocating domain for %s failed", pci_name(pdev));
1804                return NULL;
1805        }
1806
1807        /* make sure context mapping is ok */
1808        if (unlikely(!domain_context_mapped(domain, pdev))) {
1809                ret = domain_context_mapping(domain, pdev);
1810                if (ret) {
1811                        printk(KERN_ERR
1812                                "Domain context map for %s failed",
1813                                pci_name(pdev));
1814                        return NULL;
1815                }
1816        }
1817
1818        return domain;
1819}
1820
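/*
 * Core streaming-map path: look up (or create) the device's domain,
 * allocate an IOVA rounded to whole pages, install the PTEs and flush
 * the IOTLB.  Returns the bus address corresponding to paddr, or 0 on
 * failure.
 */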
1821static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1822                                     size_t size, int dir, u64 dma_mask)
1823{
1824        struct pci_dev *pdev = to_pci_dev(hwdev);
1825        struct dmar_domain *domain;
1826        phys_addr_t start_paddr;
1827        struct iova *iova;
1828        int prot = 0;
1829        int ret;
1830
1831        BUG_ON(dir == DMA_NONE);
1832        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1833                return paddr;
1834
1835        domain = get_valid_domain_for_dev(pdev);
1836        if (!domain)
1837                return 0;
1838
1839        size = aligned_size((u64)paddr, size);
1840
1841        iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1842        if (!iova)
1843                goto error;
1844
1845        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1846
1847        /*
1848         * Check if DMAR supports zero-length reads on write only
1849         * mappings.
1850         */
1851        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1852                        !cap_zlr(domain->iommu->cap))
1853                prot |= DMA_PTE_READ;
1854        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1855                prot |= DMA_PTE_WRITE;
1856        /*
1857         * The range paddr .. paddr + size might cover only part of a page,
1858         * so map the whole page.  Note: if two parts of one page are mapped
1859         * separately, we might end up with two guest addresses mapping to
1860         * the same host paddr, but this is not a big problem.
1861         */
1862        ret = domain_page_mapping(domain, start_paddr,
1863                ((u64)paddr) & PAGE_MASK, size, prot);
1864        if (ret)
1865                goto error;
1866
1867        /* it's a non-present to present mapping */
1868        ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1869                        start_paddr, size >> VTD_PAGE_SHIFT, 1);
1870        if (ret)
1871                iommu_flush_write_buffer(domain->iommu);
1872
1873        return start_paddr + ((u64)paddr & (~PAGE_MASK));
1874
1875error:
1876        if (iova)
1877                __free_iova(&domain->iovad, iova);
1878        printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1879                pci_name(pdev), size, (unsigned long long)paddr, dir);
1880        return 0;
1881}
1882
1883dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1884                            size_t size, int dir)
1885{
1886        return __intel_map_single(hwdev, paddr, size, dir,
1887                                  to_pci_dev(hwdev)->dma_mask);
1888}
1889
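/*
 * Drain every per-IOMMU deferred-unmap table: issue one global IOTLB
 * flush per IOMMU with pending entries, then free the queued IOVAs.
 * Callers hold async_umap_flush_lock.
 */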
1890static void flush_unmaps(void)
1891{
1892        int i, j;
1893
1894        timer_on = 0;
1895
1896        /* just flush them all */
1897        for (i = 0; i < g_num_of_iommus; i++) {
1898                if (deferred_flush[i].next) {
1899                        struct intel_iommu *iommu =
1900                                deferred_flush[i].domain[0]->iommu;
1901
1902                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1903                                                 DMA_TLB_GLOBAL_FLUSH, 0);
1904                        for (j = 0; j < deferred_flush[i].next; j++) {
1905                                __free_iova(&deferred_flush[i].domain[j]->iovad,
1906                                                deferred_flush[i].iova[j]);
1907                        }
1908                        deferred_flush[i].next = 0;
1909                }
1910        }
1911
1912        list_size = 0;
1913}
1914
1915static void flush_unmaps_timeout(unsigned long data)
1916{
1917        unsigned long flags;
1918
1919        spin_lock_irqsave(&async_umap_flush_lock, flags);
1920        flush_unmaps();
1921        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1922}
1923
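/*
 * Queue an IOVA for deferred freeing in its IOMMU's flush table and arm
 * a 10 ms timer so the batch is flushed even if the high-water mark is
 * never reached.
 */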
1924static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1925{
1926        unsigned long flags;
1927        int next, iommu_id;
1928
1929        spin_lock_irqsave(&async_umap_flush_lock, flags);
1930        if (list_size == HIGH_WATER_MARK)
1931                flush_unmaps();
1932
1933        iommu_id = dom->iommu->seq_id;
1934
1935        next = deferred_flush[iommu_id].next;
1936        deferred_flush[iommu_id].domain[next] = dom;
1937        deferred_flush[iommu_id].iova[next] = iova;
1938        deferred_flush[iommu_id].next++;
1939
1940        if (!timer_on) {
1941                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1942                timer_on = 1;
1943        }
1944        list_size++;
1945        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1946}
1947
1948void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
1949                        int dir)
1950{
1951        struct pci_dev *pdev = to_pci_dev(dev);
1952        struct dmar_domain *domain;
1953        unsigned long start_addr;
1954        struct iova *iova;
1955
1956        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1957                return;
1958        domain = find_domain(pdev);
1959        BUG_ON(!domain);
1960
1961        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1962        if (!iova)
1963                return;
1964
1965        start_addr = iova->pfn_lo << PAGE_SHIFT;
1966        size = aligned_size((u64)dev_addr, size);
1967
1968        pr_debug("Device %s unmapping: %lx@%llx\n",
1969                pci_name(pdev), size, (unsigned long long)start_addr);
1970
1971        /*  clear the whole page */
1972        dma_pte_clear_range(domain, start_addr, start_addr + size);
1973        /* free page tables */
1974        dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1975        if (intel_iommu_strict) {
1976                if (iommu_flush_iotlb_psi(domain->iommu,
1977                        domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
1978                        iommu_flush_write_buffer(domain->iommu);
1979                /* free iova */
1980                __free_iova(&domain->iovad, iova);
1981        } else {
1982                add_unmap(domain, iova);
1983                /*
1984                 * queue up the release of the unmap to save the 1/6th of the
1985                 * CPU time used up by the iotlb flush operation...
1986                 */
1987        }
1988}
1989
1990void *intel_alloc_coherent(struct device *hwdev, size_t size,
1991                           dma_addr_t *dma_handle, gfp_t flags)
1992{
1993        void *vaddr;
1994        int order;
1995
1996        size = PAGE_ALIGN(size);
1997        order = get_order(size);
1998        flags &= ~(GFP_DMA | GFP_DMA32);
1999
2000        vaddr = (void *)__get_free_pages(flags, order);
2001        if (!vaddr)
2002                return NULL;
2003        memset(vaddr, 0, size);
2004
2005        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2006                                         DMA_BIDIRECTIONAL,
2007                                         hwdev->coherent_dma_mask);
2008        if (*dma_handle)
2009                return vaddr;
2010        free_pages((unsigned long)vaddr, order);
2011        return NULL;
2012}
2013
2014void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2015                         dma_addr_t dma_handle)
2016{
2017        int order;
2018
2019        size = PAGE_ALIGN(size);
2020        order = get_order(size);
2021
2022        intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2023        free_pages((unsigned long)vaddr, order);
2024}
2025
2026#define SG_ENT_VIRT_ADDRESS(sg)        (sg_virt((sg)))
2027
2028void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2029                    int nelems, int dir)
2030{
2031        int i;
2032        struct pci_dev *pdev = to_pci_dev(hwdev);
2033        struct dmar_domain *domain;
2034        unsigned long start_addr;
2035        struct iova *iova;
2036        size_t size = 0;
2037        void *addr;
2038        struct scatterlist *sg;
2039
2040        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2041                return;
2042
2043        domain = find_domain(pdev);
2044
2045        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2046        if (!iova)
2047                return;
2048        for_each_sg(sglist, sg, nelems, i) {
2049                addr = SG_ENT_VIRT_ADDRESS(sg);
2050                size += aligned_size((u64)addr, sg->length);
2051        }
2052
2053        start_addr = iova->pfn_lo << PAGE_SHIFT;
2054
2055        /*  clear the whole page */
2056        dma_pte_clear_range(domain, start_addr, start_addr + size);
2057        /* free page tables */
2058        dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2059
2060        if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2061                        size >> VTD_PAGE_SHIFT, 0))
2062                iommu_flush_write_buffer(domain->iommu);
2063
2064        /* free iova */
2065        __free_iova(&domain->iovad, iova);
2066}
2067
2068static int intel_nontranslate_map_sg(struct device *hddev,
2069        struct scatterlist *sglist, int nelems, int dir)
2070{
2071        int i;
2072        struct scatterlist *sg;
2073
2074        for_each_sg(sglist, sg, nelems, i) {
2075                BUG_ON(!sg_page(sg));
2076                sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2077                sg->dma_length = sg->length;
2078        }
2079        return nelems;
2080}
2081
2082int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2083                 int dir)
2084{
2085        void *addr;
2086        int i;
2087        struct pci_dev *pdev = to_pci_dev(hwdev);
2088        struct dmar_domain *domain;
2089        size_t size = 0;
2090        int prot = 0;
2091        size_t offset = 0;
2092        struct iova *iova = NULL;
2093        int ret;
2094        struct scatterlist *sg;
2095        unsigned long start_addr;
2096
2097        BUG_ON(dir == DMA_NONE);
2098        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2099                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2100
2101        domain = get_valid_domain_for_dev(pdev);
2102        if (!domain)
2103                return 0;
2104
2105        for_each_sg(sglist, sg, nelems, i) {
2106                addr = SG_ENT_VIRT_ADDRESS(sg);
2107                addr = (void *)virt_to_phys(addr);
2108                size += aligned_size((u64)addr, sg->length);
2109        }
2110
2111        iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2112        if (!iova) {
2113                sglist->dma_length = 0;
2114                return 0;
2115        }
2116
2117        /*
2118         * Check if DMAR supports zero-length reads on write only
2119         * mappings.
2120         */
2121        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2122                        !cap_zlr(domain->iommu->cap))
2123                prot |= DMA_PTE_READ;
2124        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2125                prot |= DMA_PTE_WRITE;
2126
2127        start_addr = iova->pfn_lo << PAGE_SHIFT;
2128        offset = 0;
2129        for_each_sg(sglist, sg, nelems, i) {
2130                addr = SG_ENT_VIRT_ADDRESS(sg);
2131                addr = (void *)virt_to_phys(addr);
2132                size = aligned_size((u64)addr, sg->length);
2133                ret = domain_page_mapping(domain, start_addr + offset,
2134                        ((u64)addr) & PAGE_MASK,
2135                        size, prot);
2136                if (ret) {
2137                        /*  clear the page */
2138                        dma_pte_clear_range(domain, start_addr,
2139                                  start_addr + offset);
2140                        /* free page tables */
2141                        dma_pte_free_pagetable(domain, start_addr,
2142                                  start_addr + offset);
2143                        /* free iova */
2144                        __free_iova(&domain->iovad, iova);
2145                        return 0;
2146                }
2147                sg->dma_address = start_addr + offset +
2148                                ((u64)addr & (~PAGE_MASK));
2149                sg->dma_length = sg->length;
2150                offset += size;
2151        }
2152
2153        /* it's a non-present to present mapping */
2154        if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2155                        start_addr, offset >> VTD_PAGE_SHIFT, 1))
2156                iommu_flush_write_buffer(domain->iommu);
2157        return nelems;
2158}
2159
2160static struct dma_mapping_ops intel_dma_ops = {
2161        .alloc_coherent = intel_alloc_coherent,
2162        .free_coherent = intel_free_coherent,
2163        .map_single = intel_map_single,
2164        .unmap_single = intel_unmap_single,
2165        .map_sg = intel_map_sg,
2166        .unmap_sg = intel_unmap_sg,
2167};
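/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * once intel_iommu_init() has installed these ops via dma_ops, an
 * ordinary streaming mapping made through the generic DMA API ends up in
 * intel_map_single():
 *
 *        dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *                                           DMA_TO_DEVICE);
 *        ...
 *        dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 */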
2168
2169static inline int iommu_domain_cache_init(void)
2170{
2171        int ret = 0;
2172
2173        iommu_domain_cache = kmem_cache_create("iommu_domain",
2174                                         sizeof(struct dmar_domain),
2175                                         0,
2176                                         SLAB_HWCACHE_ALIGN,
2177
2178                                         NULL);
2179        if (!iommu_domain_cache) {
2180                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2181                ret = -ENOMEM;
2182        }
2183
2184        return ret;
2185}
2186
2187static inline int iommu_devinfo_cache_init(void)
2188{
2189        int ret = 0;
2190
2191        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2192                                         sizeof(struct device_domain_info),
2193                                         0,
2194                                         SLAB_HWCACHE_ALIGN,
2195                                         NULL);
2196        if (!iommu_devinfo_cache) {
2197                printk(KERN_ERR "Couldn't create devinfo cache\n");
2198                ret = -ENOMEM;
2199        }
2200
2201        return ret;
2202}
2203
2204static inline int iommu_iova_cache_init(void)
2205{
2206        int ret = 0;
2207
2208        iommu_iova_cache = kmem_cache_create("iommu_iova",
2209                                         sizeof(struct iova),
2210                                         0,
2211                                         SLAB_HWCACHE_ALIGN,
2212                                         NULL);
2213        if (!iommu_iova_cache) {
2214                printk(KERN_ERR "Couldn't create iova cache\n");
2215                ret = -ENOMEM;
2216        }
2217
2218        return ret;
2219}
2220
2221static int __init iommu_init_mempool(void)
2222{
2223        int ret;
2224        ret = iommu_iova_cache_init();
2225        if (ret)
2226                return ret;
2227
2228        ret = iommu_domain_cache_init();
2229        if (ret)
2230                goto domain_error;
2231
2232        ret = iommu_devinfo_cache_init();
2233        if (!ret)
2234                return ret;
2235
2236        kmem_cache_destroy(iommu_domain_cache);
2237domain_error:
2238        kmem_cache_destroy(iommu_iova_cache);
2239
2240        return -ENOMEM;
2241}
2242
2243static void __init iommu_exit_mempool(void)
2244{
2245        kmem_cache_destroy(iommu_devinfo_cache);
2246        kmem_cache_destroy(iommu_domain_cache);
2247        kmem_cache_destroy(iommu_iova_cache);
2248
2249}
2250
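/*
 * Mark DRHD units that cover no present PCI devices as ignored.  When
 * gfx mapping is disabled, additionally bypass units that cover only
 * graphics devices and tag those devices with DUMMY_DEVICE_DOMAIN_INFO
 * so the DMA ops leave them untranslated.
 */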
2251static void __init init_no_remapping_devices(void)
2252{
2253        struct dmar_drhd_unit *drhd;
2254
2255        for_each_drhd_unit(drhd) {
2256                if (!drhd->include_all) {
2257                        int i;
2258                        for (i = 0; i < drhd->devices_cnt; i++)
2259                                if (drhd->devices[i] != NULL)
2260                                        break;
2261                        /* ignore DMAR unit if no pci devices exist */
2262                        if (i == drhd->devices_cnt)
2263                                drhd->ignored = 1;
2264                }
2265        }
2266
2267        if (dmar_map_gfx)
2268                return;
2269
2270        for_each_drhd_unit(drhd) {
2271                int i;
2272                if (drhd->ignored || drhd->include_all)
2273                        continue;
2274
2275                for (i = 0; i < drhd->devices_cnt; i++)
2276                        if (drhd->devices[i] &&
2277                                !IS_GFX_DEVICE(drhd->devices[i]))
2278                                break;
2279
2280                if (i < drhd->devices_cnt)
2281                        continue;
2282
2283                /* bypass IOMMU if it is just for gfx devices */
2284                drhd->ignored = 1;
2285                for (i = 0; i < drhd->devices_cnt; i++) {
2286                        if (!drhd->devices[i])
2287                                continue;
2288                        drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2289                }
2290        }
2291}
2292
2293int __init intel_iommu_init(void)
2294{
2295        int ret = 0;
2296
2297        if (dmar_table_init())
2298                return -ENODEV;
2299
2300        if (dmar_dev_scope_init())
2301                return -ENODEV;
2302
2303        /*
2304         * Check the need for DMA-remapping initialization now.
2305         * Above initialization will also be used by Interrupt-remapping.
2306         */
2307        if (no_iommu || swiotlb || dmar_disabled)
2308                return -ENODEV;
2309
2310        iommu_init_mempool();
2311        dmar_init_reserved_ranges();
2312
2313        init_no_remapping_devices();
2314
2315        ret = init_dmars();
2316        if (ret) {
2317                printk(KERN_ERR "IOMMU: dmar init failed\n");
2318                put_iova_domain(&reserved_iova_list);
2319                iommu_exit_mempool();
2320                return ret;
2321        }
2322        printk(KERN_INFO
2323        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2324
2325        init_timer(&unmap_timer);
2326        force_iommu = 1;
2327        dma_ops = &intel_dma_ops;
2328        return 0;
2329}
2330
2331void intel_iommu_domain_exit(struct dmar_domain *domain)
2332{
2333        u64 end;
2334
2335        /* Domain 0 is reserved, so don't process it */
2336        if (!domain)
2337                return;
2338
2339        end = DOMAIN_MAX_ADDR(domain->gaw);
2340        end = end & (~VTD_PAGE_MASK);
2341
2342        /* clear ptes */
2343        dma_pte_clear_range(domain, 0, end);
2344
2345        /* free page tables */
2346        dma_pte_free_pagetable(domain, 0, end);
2347
2348        iommu_free_domain(domain);
2349        free_domain_mem(domain);
2350}
2351EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2352
2353struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2354{
2355        struct dmar_drhd_unit *drhd;
2356        struct dmar_domain *domain;
2357        struct intel_iommu *iommu;
2358
2359        drhd = dmar_find_matched_drhd_unit(pdev);
2360        if (!drhd) {
2361                printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2362                return NULL;
2363        }
2364
2365        iommu = drhd->iommu;
2366        if (!iommu) {
2367                printk(KERN_ERR
2368                        "intel_iommu_domain_alloc: iommu == NULL\n");
2369                return NULL;
2370        }
2371        domain = iommu_alloc_domain(iommu);
2372        if (!domain) {
2373                printk(KERN_ERR
2374                        "intel_iommu_domain_alloc: domain == NULL\n");
2375                return NULL;
2376        }
2377        if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2378                printk(KERN_ERR
2379                        "intel_iommu_domain_alloc: domain_init() failed\n");
2380                intel_iommu_domain_exit(domain);
2381                return NULL;
2382        }
2383        return domain;
2384}
2385EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2386
2387int intel_iommu_context_mapping(
2388        struct dmar_domain *domain, struct pci_dev *pdev)
2389{
2390        int rc;
2391        rc = domain_context_mapping(domain, pdev);
2392        return rc;
2393}
2394EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2395
2396int intel_iommu_page_mapping(
2397        struct dmar_domain *domain, dma_addr_t iova,
2398        u64 hpa, size_t size, int prot)
2399{
2400        int rc;
2401        rc = domain_page_mapping(domain, iova, hpa, size, prot);
2402        return rc;
2403}
2404EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
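/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * virtualization backend could combine the exported helpers above along
 * these lines; the bus and host addresses are made up for the example.
 *
 *        struct dmar_domain *dom = intel_iommu_domain_alloc(pdev);
 *
 *        if (dom && !intel_iommu_context_mapping(dom, pdev))
 *                intel_iommu_page_mapping(dom, 0x100000, 0x7fe00000,
 *                                         PAGE_SIZE,
 *                                         DMA_PTE_READ | DMA_PTE_WRITE);
 *        ...
 *        intel_iommu_domain_exit(dom);
 */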
2405
2406void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2407{
2408        detach_domain_for_dev(domain, bus, devfn);
2409}
2410EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2411
2412struct dmar_domain *
2413intel_iommu_find_domain(struct pci_dev *pdev)
2414{
2415        return find_domain(pdev);
2416}
2417EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2418
2419int intel_iommu_found(void)
2420{
2421        return g_num_of_iommus;
2422}
2423EXPORT_SYMBOL_GPL(intel_iommu_found);
2424
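/*
 * Walk the domain's page table for the given IOVA and return the host
 * page frame number it maps to, or 0 if no mapping is present.
 */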
2425u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2426{
2427        struct dma_pte *pte;
2428        u64 pfn;
2429
2430        pfn = 0;
2431        pte = addr_to_dma_pte(domain, iova);
2432
2433        if (pte)
2434                pfn = dma_pte_addr(*pte);
2435
2436        return pfn >> VTD_PAGE_SHIFT;
2437}
2438EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);