From 0b5afe05377d7993f19292bf49dd13e959000790 Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Thu, 23 May 2024 17:40:55 +0000 Subject: [PATCH 1/4] KVM: arm64: Add early_param to control WFx trapping Add an early_params to control WFI and WFE trapping. This is to control the degree guests can wait for interrupts on their own without being trapped by KVM. Options for each param are trap and notrap. trap enables the trap. notrap disables the trap. Note that when enabled, traps are allowed but not guaranteed by the CPU architecture. Absent an explicitly set policy, default to current behavior: disabling the trap if only a single task is running and enabling otherwise. Signed-off-by: Colton Lewis Reviewed-by: Jing Zhang Link: https://lore.kernel.org/r/20240523174056.1565133-1-coltonlewis@google.com [ oliver: rework kvm_vcpu_should_clear_tw*() for readability ] Signed-off-by: Oliver Upton --- .../admin-guide/kernel-parameters.txt | 18 +++++ arch/arm64/kvm/arm.c | 68 ++++++++++++++++++- 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669d..47249bd99987 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2745,6 +2745,24 @@ [KVM,ARM,EARLY] Allow use of GICv4 for direct injection of LPIs. + kvm-arm.wfe_trap_policy= + [KVM,ARM] Control when to set WFE instruction trap for + KVM VMs. Traps are allowed but not guaranteed by the + CPU architecture. + + trap: set WFE instruction trap + + notrap: clear WFE instruction trap + + kvm-arm.wfi_trap_policy= + [KVM,ARM] Control when to set WFI instruction trap for + KVM VMs. Traps are allowed but not guaranteed by the + CPU architecture. + + trap: set WFI instruction trap + + notrap: clear WFI instruction trap + kvm_cma_resv_ratio=n [PPC,EARLY] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 59716789fe0f..53e23528d2cf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -48,6 +48,15 @@ static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; +enum kvm_wfx_trap_policy { + KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */ + KVM_WFX_NOTRAP, + KVM_WFX_TRAP, +}; + +static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; +static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); @@ -546,6 +555,24 @@ static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) } } +static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfi_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running() && + (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || + vcpu->kvm->arch.vgic.nassgireq); +} + +static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) + return kvm_wfe_trap_policy == KVM_WFX_NOTRAP; + + return single_task_running(); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_s2_mmu *mmu; @@ -579,10 +606,15 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); - if (single_task_running()) - vcpu_clear_wfx_traps(vcpu); + if (kvm_vcpu_should_clear_twe(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWE; else - vcpu_set_wfx_traps(vcpu); + vcpu->arch.hcr_el2 |= HCR_TWE; + + if (kvm_vcpu_should_clear_twi(vcpu)) + vcpu->arch.hcr_el2 &= ~HCR_TWI; + else + vcpu->arch.hcr_el2 |= HCR_TWI; vcpu_set_pauth_traps(vcpu); @@ -2858,6 +2890,36 @@ static int __init early_kvm_mode_cfg(char *arg) } early_param("kvm-arm.mode", early_kvm_mode_cfg); +static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "trap") == 0) { + *p = KVM_WFX_TRAP; + return 0; + } + + if (strcmp(arg, "notrap") == 0) { + *p = KVM_WFX_NOTRAP; + return 0; + } + + return -EINVAL; +} + +static int __init early_kvm_wfi_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy); +} +early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg); + +static int __init early_kvm_wfe_trap_policy_cfg(char *arg) +{ + return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy); +} +early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg); + enum kvm_mode kvm_get_mode(void) { return kvm_mode; From eb9d53d4a949c6d6d7c9f130e537f6b5687fedf9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 14 Jun 2024 13:58:58 +0100 Subject: [PATCH 2/4] KVM: arm64: nv: Fix RESx behaviour of disabled FGTs with negative polarity The Fine Grained Trap extension is pretty messy as it doesn't consistently use the same polarity for all trap bits. A bunch of them, added later in the life of the architecture, have a *negative* priority. So if these bits are disabled, they must be RES1 and not RES0. But that's not what the code implements, making the traps for these negative trap bits being always on instead of disabled. Fix the relevant bits, and stick a brown paper bag on my head for the rest of the day... Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240614125858.78361-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index bae8536cbf00..0acb60273482 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -328,21 +328,21 @@ int kvm_init_nv_sysregs(struct kvm *kvm) HFGxTR_EL2_ERXPFGF_EL1 | HFGxTR_EL2_ERXPFGCTL_EL1 | HFGxTR_EL2_ERXPFGCDN_EL1 | HFGxTR_EL2_ERXADDR_EL1); if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA)) - res0 |= HFGxTR_EL2_nACCDATA_EL1; + res1 |= HFGxTR_EL2_nACCDATA_EL1; if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res0 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1); + res1 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) - res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); + res1 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) - res0 |= HFGxTR_EL2_nRCWMASK_EL1; + res1 |= HFGxTR_EL2_nRCWMASK_EL1; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) - res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + res1 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) - res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); + res1 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - res0 |= HFGxTR_EL2_nS2POR_EL1; + res1 |= HFGxTR_EL2_nS2POR_EL1; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) - res0 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1); + res1 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1); set_sysreg_masks(kvm, HFGRTR_EL2, res0 | __HFGRTR_EL2_RES0, res1); set_sysreg_masks(kvm, HFGWTR_EL2, res0 | __HFGWTR_EL2_RES0, res1); @@ -378,10 +378,10 @@ int kvm_init_nv_sysregs(struct kvm *kvm) HDFGRTR_EL2_TRBPTR_EL1 | HDFGRTR_EL2_TRBSR_EL1 | HDFGRTR_EL2_TRBTRG_EL1); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) - res0 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL | + res1 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL | HDFGRTR_EL2_nBRBDATA); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) - res0 |= HDFGRTR_EL2_nPMSNEVFR_EL1; + res1 |= HDFGRTR_EL2_nPMSNEVFR_EL1; set_sysreg_masks(kvm, HDFGRTR_EL2, res0 | HDFGRTR_EL2_RES0, res1); /* Reuse the bits from the read-side and add the write-specific stuff */ @@ -417,9 +417,9 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HFGITR_EL2_CFPRCTX | HFGITR_EL2_DVPRCTX | HFGITR_EL2_CPPRCTX); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) - res0 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); + res1 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res0 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 | + res1 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 | HFGITR_EL2_nGCSEPP); if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX)) res0 |= HFGITR_EL2_COSPRCTX; From 3dc14eefa504d2fbe8e75113c7bb164a20bc39b0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 17 Jun 2024 18:10:18 +0000 Subject: [PATCH 3/4] KVM: arm64: nv: Use GFP_KERNEL_ACCOUNT for sysreg_masks allocation Of course, userspace is in the driver's seat for struct kvm and associated allocations. Make sure the sysreg_masks allocation participates in kmem accounting. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240617181018.2054332-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 0acb60273482..913018e4cdae 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -198,7 +198,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) goto out; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!kvm->arch.sysreg_masks) { ret = -ENOMEM; goto out; From cb52b5c8b81bfbc34df13537d82cd1849725d6c7 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 8 Jul 2024 19:50:51 +0000 Subject: [PATCH 4/4] Revert "KVM: arm64: nv: Fix RESx behaviour of disabled FGTs with negative polarity" This reverts commit eb9d53d4a949c6d6d7c9f130e537f6b5687fedf9. As Marc pointed out on the list [*], this patch is wrong, and those who find themselves in the SOB chain should have their heads checked. Annoyingly, the architecture has some FGT trap bits that are negative (i.e. 0 implies trap), and there was some confusion how KVM handles this for nested guests. However, it is clear now that KVM honors the RES0-ness of FGT traps already, meaning traps for features never exposed to the guest hypervisor get handled at L0. As they should. Link: https://lore.kernel.org/kvmarm/86bk3c3uss.wl-maz@kernel.org/T/#mb9abb3dd79f6a4544a91cb35676bd637c3a5e836 Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 913018e4cdae..d23618af8291 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -328,21 +328,21 @@ int kvm_init_nv_sysregs(struct kvm *kvm) HFGxTR_EL2_ERXPFGF_EL1 | HFGxTR_EL2_ERXPFGCTL_EL1 | HFGxTR_EL2_ERXPFGCDN_EL1 | HFGxTR_EL2_ERXADDR_EL1); if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA)) - res1 |= HFGxTR_EL2_nACCDATA_EL1; + res0 |= HFGxTR_EL2_nACCDATA_EL1; if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res1 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1); + res0 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) - res1 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); + res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) - res1 |= HFGxTR_EL2_nRCWMASK_EL1; + res0 |= HFGxTR_EL2_nRCWMASK_EL1; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) - res1 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) - res1 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); + res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - res1 |= HFGxTR_EL2_nS2POR_EL1; + res0 |= HFGxTR_EL2_nS2POR_EL1; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) - res1 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1); + res0 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1); set_sysreg_masks(kvm, HFGRTR_EL2, res0 | __HFGRTR_EL2_RES0, res1); set_sysreg_masks(kvm, HFGWTR_EL2, res0 | __HFGWTR_EL2_RES0, res1); @@ -378,10 +378,10 @@ int kvm_init_nv_sysregs(struct kvm *kvm) HDFGRTR_EL2_TRBPTR_EL1 | HDFGRTR_EL2_TRBSR_EL1 | HDFGRTR_EL2_TRBTRG_EL1); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) - res1 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL | + res0 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL | HDFGRTR_EL2_nBRBDATA); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) - res1 |= HDFGRTR_EL2_nPMSNEVFR_EL1; + res0 |= HDFGRTR_EL2_nPMSNEVFR_EL1; set_sysreg_masks(kvm, HDFGRTR_EL2, res0 | HDFGRTR_EL2_RES0, res1); /* Reuse the bits from the read-side and add the write-specific stuff */ @@ -417,9 +417,9 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HFGITR_EL2_CFPRCTX | HFGITR_EL2_DVPRCTX | HFGITR_EL2_CPPRCTX); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) - res1 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); + res0 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res1 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 | + res0 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 | HFGITR_EL2_nGCSEPP); if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX)) res0 |= HFGITR_EL2_COSPRCTX;