diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst
index ffc064c1ec68..24f53c501366 100644
--- a/Documentation/hw-vuln/index.rst
+++ b/Documentation/hw-vuln/index.rst
@@ -11,3 +11,5 @@ are configurable at compile, boot or run time.
 
    l1tf
    mds
+   tsx_async_abort
+   multihit.rst
diff --git a/Documentation/hw-vuln/multihit.rst b/Documentation/hw-vuln/multihit.rst
new file mode 100644
index 000000000000..ba9988d8bce5
--- /dev/null
+++ b/Documentation/hw-vuln/multihit.rst
@@ -0,0 +1,163 @@
+iTLB multihit
+=============
+
+iTLB multihit is an erratum where some processors may incur a machine check
+error, possibly resulting in an unrecoverable CPU lockup, when an
+instruction fetch hits multiple entries in the instruction TLB. This can
+occur when the page size is changed along with either the physical address
+or cache type. A malicious guest running on a virtualized system can
+exploit this erratum to perform a denial of service attack.
+
+
+Affected processors
+-------------------
+
+Variations of this erratum are present on most Intel Core and Xeon processor
+models. The erratum is not present on:
+
+   - non-Intel processors
+
+   - some Atom processors (Airmont, Bonnell, Goldmont, GoldmontPlus,
+     Saltwell, Silvermont)
+
+   - Intel processors that have the PSCHANGE_MC_NO bit set in the
+     IA32_ARCH_CAPABILITIES MSR.
+
+
+Related CVEs
+------------
+
+The following CVE entry is related to this issue:
+
+   ==============  =================================================
+   CVE-2018-12207  Machine Check Error Avoidance on Page Size Change
+   ==============  =================================================
+
+
+Problem
+-------
+
+Privileged software, including the OS and virtual machine managers (VMMs), is
+in charge of memory management. A key component of memory management is the
+control of the page tables. Modern processors use virtual memory, a technique
+that creates the illusion of a very large memory space for each process. This
+virtual space is split into pages of a given size. Page tables translate
+virtual addresses to physical addresses.
+
+To reduce latency when performing a virtual to physical address translation,
+processors include a structure, called the TLB, that caches recent
+translations. There are separate TLBs for instructions (iTLB) and data (dTLB).
+
+Under this erratum, instructions are fetched from a linear address translated
+using a 4 KB translation cached in the iTLB. Privileged software then modifies
+the paging structures so that the same linear address is mapped using a large
+page size (2 MB, 4 MB, 1 GB) with a different physical address or memory type.
+After the page structure modification, but before the software invalidates any
+iTLB entries for the linear address, a code fetch from the same linear address
+may cause a machine-check error which can result in a system hang or shutdown.
+
+
+Attack scenarios
+----------------
+
+Attacks against the iTLB multihit erratum can be mounted from malicious
+guests in a virtualized system.
+
+
+iTLB multihit system information
+--------------------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current iTLB
+multihit status of the system: whether the system is vulnerable and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/itlb_multihit
+
+The possible values in this file are:
+
+.. list-table::
+
+   * - Not affected
+     - The processor is not vulnerable.
+   * - KVM: Mitigation: Split huge pages
+     - Software changes mitigate this issue.
+   * - KVM: Vulnerable
+     - The processor is vulnerable, but no mitigation is enabled.
+
+
+Enumeration of the erratum
+--------------------------
+
+A new bit has been allocated in the IA32_ARCH_CAPABILITIES MSR (PSCHANGE_MC_NO)
+and is set on CPUs which are not affected by this issue.
+
+   =======================================  ===========  ================================
+   IA32_ARCH_CAPABILITIES MSR               Not present  Possibly vulnerable, check model
+   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]   '0'          Likely vulnerable, check model
+   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]   '1'          Not vulnerable
+   =======================================  ===========  ================================
+
+
+Mitigation mechanism
+--------------------
+
+This erratum can be mitigated by restricting the use of large page sizes to
+non-executable pages. This forces all iTLB entries to be 4K, and removes
+the possibility of multiple hits.
+
+In order to mitigate the vulnerability, KVM initially marks all huge pages
+as non-executable. If the guest attempts to execute in one of those pages,
+the page is broken down into 4K pages, which are then marked executable.
+
+If EPT is disabled or not available on the host, KVM is in control of TLB
+flushes and the problematic situation cannot happen. However, the shadow
+EPT paging mechanism used by nested virtualization is vulnerable, because
+the nested guest can trigger multiple iTLB hits by modifying its own
+(non-nested) page tables. For simplicity, KVM makes large pages
+non-executable in all shadow paging modes.
+
+Mitigation control on the kernel command line and KVM module parameter
+-----------------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism for marking huge pages as
+non-executable can be controlled with the module parameter "nx_huge_pages=".
+The kernel command line can be used to control the iTLB multihit mitigations
+at boot time with the option "kvm.nx_huge_pages=".
+
+The valid arguments for these options are:
+
+  ==========  ================================================================
+  force       Mitigation is enabled. In this case, the mitigation implements
+              non-executable huge pages in the Linux kernel KVM module. All
+              huge pages in the EPT are marked as non-executable.
+              If a guest attempts to execute in one of those pages, the page
+              is broken down into 4K pages, which are then marked executable.
+
+  off         Mitigation is disabled.
+
+  auto        Enable mitigation only if the platform is affected and the
+              kernel was not booted with the "mitigations=off" command line
+              parameter. This is the default option.
+  ==========  ================================================================
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The system is protected by the kernel unconditionally and no further
+   action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from a trusted source, you may assume that the guest
+   will not attempt to maliciously exploit this erratum and no further
+   action is required.
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from an untrusted source, the host kernel will need
+   to apply the iTLB multihit mitigation via the kernel command line or the
+   kvm module parameter.
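
The sysfs file documented above can be read like any other CPU vulnerabilities file. A minimal userspace sketch (illustrative only, not part of the patch itself) that reports the status:

    /* Report the iTLB multihit status via the sysfs file added above. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/itlb_multihit", "r");
            char buf[128];

            if (!f) {
                    perror("fopen");  /* file is absent on kernels without this patch */
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("itlb_multihit: %s", buf);
            fclose(f);
            return 0;
    }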
diff --git a/Documentation/hw-vuln/tsx_async_abort.rst b/Documentation/hw-vuln/tsx_async_abort.rst
new file mode 100644
index 000000000000..fddbd7579c53
--- /dev/null
+++ b/Documentation/hw-vuln/tsx_async_abort.rst
@@ -0,0 +1,276 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TAA - TSX Asynchronous Abort
+============================
+
+TAA is a hardware vulnerability that allows unprivileged speculative access to
+data which is available in various CPU internal buffers by using asynchronous
+aborts within an Intel TSX transactional region.
+
+Affected processors
+-------------------
+
+This vulnerability only affects Intel processors that support Intel
+Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
+is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
+(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
+also mitigate against TAA.
+
+Whether a processor is affected can be read from the TAA vulnerability file
+in sysfs. See :ref:`tsx_async_abort_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entry is related to this TAA issue:
+
+   ==============  =====  ===================================================
+   CVE-2019-11135  TAA    TSX Asynchronous Abort (TAA) condition on some
+                          microprocessors utilizing speculative execution may
+                          allow an authenticated user to potentially enable
+                          information disclosure via a side channel with
+                          local access.
+   ==============  =====  ===================================================
+
+Problem
+-------
+
+When performing store, load or L1 refill operations, processors write
+data into temporary microarchitectural structures (buffers). The data in
+those buffers can be forwarded to load operations as an optimization.
+
+Intel TSX is an extension to the x86 instruction set architecture that adds
+hardware transactional memory support to improve the performance of
+multi-threaded software. TSX lets the processor expose and exploit concurrency
+hidden in an application by dynamically avoiding unnecessary synchronization.
+
+TSX supports atomic memory transactions that are either committed (success) or
+aborted. During an abort, operations that happened within the transactional
+region are rolled back. An asynchronous abort takes place, among other reasons,
+when a different thread accesses a cache line that is also used within the
+transactional region and that access might lead to a data race.
+
+Immediately after an uncompleted asynchronous abort, certain speculatively
+executed loads may read data from those internal buffers and pass it to
+dependent operations. This can then be used to infer the value via a cache
+side channel attack.
+
+Because the buffers are potentially shared between Hyper-Threads, cross
+Hyper-Thread attacks are possible.
+
+The victim of a malicious actor does not need to make use of TSX. Only the
+attacker needs to begin a TSX transaction and raise an asynchronous abort,
+which in turn potentially leaks data stored in the buffers.
+
+More detailed technical information is available in the TAA specific x86
+architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
+
+
+Attack scenarios
+----------------
+
+Attacks against the TAA vulnerability can be implemented from unprivileged
+applications running on hosts or guests.
+
+As with MDS, the attacker has no control over the memory addresses that can
+be leaked. Only the victim is responsible for bringing data to the CPU. As
+a result, the malicious actor has to sample as much data as possible and
+then postprocess it to try to infer any useful information from it.
+
+A potential attacker only has read access to the data. Also, there is no direct
+privilege escalation by using this technique.
+
+
+.. _tsx_async_abort_sys_info:
+
+TAA system information
+----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current TAA status
+of the system. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+
+The possible values in this file are:
+
+.. list-table::
+
+   * - 'Vulnerable'
+     - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
+   * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
+     - The system tries to clear the buffers but the microcode might not support the operation.
+   * - 'Mitigation: Clear CPU buffers'
+     - The microcode has been updated to clear the buffers. TSX is still enabled.
+   * - 'Mitigation: TSX disabled'
+     - TSX is disabled.
+   * - 'Not affected'
+     - The CPU is not affected by this issue.
+
+.. _ucode_needed:
+
+Best effort mitigation mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the processor is vulnerable, but the availability of the microcode-based
+mitigation mechanism is not advertised via CPUID, the kernel selects a best
+effort mitigation mode. This mode invokes the mitigation instructions
+without a guarantee that they clear the CPU buffers.
+
+This is done to address virtualization scenarios where the host has the
+microcode update applied, but the hypervisor is not yet updated to expose the
+CPUID to the guest. If the host has updated microcode the protection takes
+effect; otherwise a few CPU cycles are wasted pointlessly.
+
+The state in the tsx_async_abort sysfs file reflects this situation
+accordingly.
+
+
+Mitigation mechanism
+--------------------
+
+The kernel detects the affected CPUs and the presence of the required
+microcode. If a CPU is affected and the microcode is available, then the
+kernel enables the mitigation by default.
+
+The mitigation can be controlled at boot time via a kernel command line option.
+See :ref:`taa_mitigation_control_command_line`.
+
+.. _virt_mechanism:
+
+Virtualization mitigation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Affected systems where the host has the TAA microcode and TAA is mitigated by
+having disabled TSX previously are not vulnerable regardless of the status
+of the VMs.
+
+In all other cases, if the host either does not have the TAA microcode or
+the kernel is not mitigated, the system might be vulnerable.
+
+
+.. _taa_mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line can be used to control the TAA mitigations at boot
+time with the option "tsx_async_abort=". The valid arguments for this option
+are:
+
+  ============  =============================================================
+  off           This option disables the TAA mitigation on affected platforms.
+                If the system has TSX enabled (see next parameter) and the CPU
+                is affected, the system is vulnerable.
+
+  full          TAA mitigation is enabled. If TSX is enabled, on an affected
+                system it will clear CPU buffers on ring transitions. On
+                systems which are MDS-affected and deploy MDS mitigation,
+                TAA is also mitigated. Specifying this option on those
+                systems will have no effect.
+
+  full,nosmt    The same as tsx_async_abort=full, with SMT disabled on
+                vulnerable CPUs that have TSX enabled. This is the complete
+                mitigation. When TSX is disabled, SMT is not disabled because
+                the CPU is not vulnerable to cross-thread TAA attacks.
+  ============  =============================================================
+
+Not specifying this option is equivalent to "tsx_async_abort=full".
+
+The kernel command line can also be used to control the TSX feature itself,
+using the parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL
+is used to control the TSX feature and the enumeration of the TSX feature bits
+(RTM and HLE) in CPUID.
+
+The valid options are:
+
+  ============  =============================================================
+  off           Disables TSX on the system.
+
+                Note that this option takes effect only on newer CPUs which
+                are not vulnerable to MDS, i.e., have
+                MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get the new
+                IA32_TSX_CTRL MSR through a microcode update. This new MSR
+                allows for the reliable deactivation of the TSX functionality.
+
+  on            Enables TSX.
+
+                Although there are mitigations for all known security
+                vulnerabilities, TSX has been known to be an accelerator for
+                several previous speculation-related CVEs, and so there may be
+                unknown security risks associated with leaving it enabled.
+
+  auto          Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
+                on the system.
+  ============  =============================================================
+
+Not specifying this option is equivalent to "tsx=off".
+
+The following combinations of the "tsx_async_abort" and "tsx" options are
+possible. For affected platforms tsx=auto is equivalent to tsx=off and the
+result will be:
+
+  =========  ==========================  =========================================
+  tsx=on     tsx_async_abort=full        The system will use VERW to clear CPU
+                                         buffers. Cross-thread attacks are still
+                                         possible on SMT machines.
+  tsx=on     tsx_async_abort=full,nosmt  As above, but cross-thread attacks on
+                                         SMT are mitigated.
+  tsx=on     tsx_async_abort=off         The system is vulnerable.
+  tsx=off    tsx_async_abort=full        TSX might be disabled if the microcode
+                                         provides a TSX control MSR. If so, the
+                                         system is not vulnerable.
+  tsx=off    tsx_async_abort=full,nosmt  Ditto.
+  tsx=off    tsx_async_abort=off         Ditto.
+  =========  ==========================  =========================================
+
+
+On unaffected platforms "tsx=on" and "tsx_async_abort=full" do not clear CPU
+buffers. On platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
+the "tsx" command line argument has no effect.
+
+For affected platforms, the table below indicates the mitigation status for
+the combinations of the CPUID bit MD_CLEAR and the IA32_ARCH_CAPABILITIES MSR
+bits MDS_NO and TSX_CTRL_MSR.
+
+  =======  =========  =============  ========================================
+  MDS_NO   MD_CLEAR   TSX_CTRL_MSR   Status
+  =======  =========  =============  ========================================
+    0          0            0        Vulnerable (needs microcode)
+    0          1            0        MDS and TAA mitigated via VERW
+    1          1            0        MDS fixed, TAA vulnerable if TSX enabled
+                                     because MD_CLEAR has no meaning and
+                                     VERW is not guaranteed to clear buffers
+    1          X            1        MDS fixed, TAA can be mitigated by
+                                     VERW or TSX_CTRL_MSR
+  =======  =========  =============  ========================================
+
+Mitigation selection guide
+--------------------------
+
+1. Trusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If all user space applications are from a trusted source and do not execute
+untrusted code which is supplied externally, then the mitigation can be
+disabled. The same applies to virtualized environments with trusted guests.
+
+
+2. Untrusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there are untrusted applications or guests on the system, enabling TSX
+might allow a malicious actor to leak data from the host or from other
+processes running on the same physical core.
+
+If the microcode is available and TSX is disabled on the host, attacks
+are prevented in a virtualized environment as well, even if the VMs do not
+explicitly enable the mitigation.
+
+
+.. _taa_default_mitigations:
+
+Default mitigations
+-------------------
+
+The kernel's default action for vulnerable processors is:
+
+  - Deploy the TSX disable mitigation (tsx_async_abort=full tsx=off).
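
The affected-processor rules above reduce to two MSR bits: TAA_NO (bit 8) and MDS_NO (bit 5) of IA32_ARCH_CAPABILITIES. A minimal sketch that decodes them from userspace via the msr driver (assumes the msr module is loaded and root privileges; 0x10a is the architectural number of IA32_ARCH_CAPABILITIES):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define MSR_IA32_ARCH_CAPABILITIES 0x10a

    int main(void)
    {
            uint64_t val;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            /* The MSR may not be enumerated on older CPUs; treat any
             * failure as "not present". */
            if (fd < 0 || pread(fd, &val, sizeof(val),
                                MSR_IA32_ARCH_CAPABILITIES) != sizeof(val)) {
                    perror("IA32_ARCH_CAPABILITIES");
                    return 1;
            }
            printf("MDS_NO=%llu TAA_NO=%llu\n",
                   (unsigned long long)((val >> 5) & 1),
                   (unsigned long long)((val >> 8) & 1));
            close(fd);
            return 0;
    }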
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index e5dd9f4d6100..46ef3680c8ab 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows:
 - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
   them together is quite rare.
 
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
-else is a leaf: no other lock is taken inside the critical sections.
+Everything else is a leaf: no other lock is taken inside the critical
+sections.
 
 2: Exception
 ------------
@@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 ------------
 
 Name:		kvm_lock
-Type:		spinlock_t
+Type:		mutex
 Arch:		any
 Protects:	- vm_list
 
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index ef389dcf1b1d..0780d55c5aa8 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -6,3 +6,4 @@ x86 architecture specifics
    :maxdepth: 1
 
    mds
+   tsx_async_abort
diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
new file mode 100644
index 000000000000..4a4336a89372
--- /dev/null
+++ b/Documentation/x86/tsx_async_abort.rst
@@ -0,0 +1,117 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TSX Async Abort (TAA) mitigation
+================================
+
+.. _tsx_async_abort:
+
+Overview
+--------
+
+TSX Async Abort (TAA) is a side channel attack on internal buffers in some
+Intel processors similar to Microarchitectural Data Sampling (MDS). In this
+case, certain loads may speculatively pass invalid data to dependent operations
+when an asynchronous abort condition is pending in a Transactional
+Synchronization Extensions (TSX) transaction. This includes loads with no
+fault or assist condition. Such loads may speculatively expose stale data from
+the same uarch data structures as in MDS, with the same scope of exposure, i.e.
+same-thread and cross-thread. This issue affects all current processors that
+support TSX.
+
+Mitigation strategy
+-------------------
+
+a) TSX disable - one of the mitigations is to disable TSX. A new MSR,
+IA32_TSX_CTRL, is available on future processors, and on current processors
+after a microcode update, and can be used to disable TSX. In addition, it
+controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
+
+b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
+vulnerability. More details on this approach can be found in
+:ref:`Documentation/hw-vuln/mds.rst <mds>`.
+
+Kernel internal mitigation modes
+--------------------------------
+
+  ============  ============================================================
+  off           Mitigation is disabled. Either the CPU is not affected or
+                tsx_async_abort=off is supplied on the kernel command line.
+
+  tsx disabled  Mitigation is enabled. The TSX feature is disabled by default
+                at bootup on processors that support TSX control.
+
+  verw          Mitigation is enabled. The CPU is affected and MD_CLEAR is
+                advertised in CPUID.
+
+  ucode needed  Mitigation is enabled. The CPU is affected and MD_CLEAR is
+                not advertised in CPUID. That is mainly for virtualization
+                scenarios where the host has the updated microcode but the
+                hypervisor does not expose MD_CLEAR in CPUID. It's a best
+                effort approach without guarantee.
+  ============  ============================================================
+
+If the CPU is affected and the "tsx_async_abort" kernel command line parameter
+is not provided, then the kernel selects an appropriate mitigation depending on
+the status of the RTM and MD_CLEAR CPUID bits.
+
+The tables below indicate the impact of the tsx=on|off|auto command line
+options on the state of the TAA mitigation, VERW behavior and the TSX feature
+for various combinations of MSR_IA32_ARCH_CAPABILITIES bits.
+
+1. "tsx=off"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits     Result with cmdline tsx=off
+----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR  TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                    after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+  0          0            0         HW default    Yes             Same as MDS          Same as MDS
+  0          0            1         Invalid case  Invalid case    Invalid case         Invalid case
+  0          1            0         HW default    No              Need ucode update    Need ucode update
+  0          1            1         Disabled      Yes             TSX disabled         TSX disabled
+  1          X            1         Disabled      X               None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+2. "tsx=on"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits     Result with cmdline tsx=on
+----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR  TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                    after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+  0          0            0         HW default    Yes             Same as MDS          Same as MDS
+  0          0            1         Invalid case  Invalid case    Invalid case         Invalid case
+  0          1            0         HW default    No              Need ucode update    Need ucode update
+  0          1            1         Enabled       Yes             None                 Same as MDS
+  1          X            1         Enabled       X               None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+3. "tsx=auto"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits     Result with cmdline tsx=auto
+----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR  TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                    after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+  0          0            0         HW default    Yes             Same as MDS          Same as MDS
+  0          0            1         Invalid case  Invalid case    Invalid case         Invalid case
+  0          1            0         HW default    No              Need ucode update    Need ucode update
+  0          1            1         Disabled      Yes             TSX disabled         TSX disabled
+  1          X            1         Enabled       X               None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
+indicates whether MSR_IA32_TSX_CTRL is supported.
+
+There are two control bits in the IA32_TSX_CTRL MSR:
+
+      Bit 0: When set, it disables the Restricted Transactional Memory (RTM)
+             sub-feature of TSX (will force all transactions to abort on the
+             XBEGIN instruction).
+
+      Bit 1: When set, it disables the enumeration of the RTM and HLE features
+             (i.e. it will make CPUID(EAX=7).EBX{bit4} and
+             CPUID(EAX=7).EBX{bit11} read as 0).
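
The two IA32_TSX_CTRL bits described above map naturally onto mask constants. A small sketch of how software might decode a raw MSR value (the bit layout comes from the text above; the value 0x122 for the MSR number is an assumption, matching the kernel's later MSR_IA32_TSX_CTRL definition):

    #include <stdint.h>
    #include <stdio.h>

    #define MSR_IA32_TSX_CTRL     0x122        /* assumed MSR number */
    #define TSX_CTRL_RTM_DISABLE  (1ULL << 0)  /* force XBEGIN to abort */
    #define TSX_CTRL_CPUID_CLEAR  (1ULL << 1)  /* hide RTM/HLE in CPUID */

    static void decode_tsx_ctrl(uint64_t val)
    {
            printf("RTM disabled:  %s\n", (val & TSX_CTRL_RTM_DISABLE) ? "yes" : "no");
            printf("CPUID cleared: %s\n", (val & TSX_CTRL_CPUID_CLEAR) ? "yes" : "no");
    }

    int main(void)
    {
            /* sample value: both control bits set, i.e. TSX fully off */
            decode_tsx_ctrl(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
            return 0;
    }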
diff --git a/Makefile b/Makefile
index 4741bbdfaa10..1e322e669301 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 9
-SUBLEVEL = 201
+SUBLEVEL = 202
 EXTRAVERSION =
 NAME = Roaring Lionus
 
diff --git a/arch/mips/bcm63xx/reset.c b/arch/mips/bcm63xx/reset.c
index d1fe51edf5e6..4d411da2497b 100644
--- a/arch/mips/bcm63xx/reset.c
+++ b/arch/mips/bcm63xx/reset.c
@@ -119,7 +119,7 @@
 #define BCM6368_RESET_DSL	0
 #define BCM6368_RESET_SAR	SOFTRESET_6368_SAR_MASK
 #define BCM6368_RESET_EPHY	SOFTRESET_6368_EPHY_MASK
-#define BCM6368_RESET_ENETSW	0
+#define BCM6368_RESET_ENETSW	SOFTRESET_6368_ENETSW_MASK
 #define BCM6368_RESET_PCM	SOFTRESET_6368_PCM_MASK
 #define BCM6368_RESET_MPI	SOFTRESET_6368_MPI_MASK
 #define BCM6368_RESET_PCIE	0
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3dc96b455e0c..37c254677ccd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1422,13 +1422,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
 	if (!kvm->arch.sca)
 		goto out_err;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	sca_offset += 16;
 	if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
 		sca_offset = 0;
 	kvm->arch.sca = (struct bsca_block *)
 			((char *) kvm->arch.sca + sca_offset);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	sprintf(debug_name, "kvm-%u", current->pid);
 
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index a2f6953a86f5..0a21fb86fd67 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -653,15 +653,14 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id)
 		return err;
 
 	hu->proto = p;
-	set_bit(HCI_UART_PROTO_READY, &hu->flags);
 
 	err = hci_uart_register_dev(hu);
 	if (err) {
-		clear_bit(HCI_UART_PROTO_READY, &hu->flags);
 		p->close(hu);
 		return err;
 	}
 
+	set_bit(HCI_UART_PROTO_READY, &hu->flags);
 	return 0;
 }
 
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 95e28ecfde0a..99c7cf4822c3 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -817,6 +817,8 @@ int usb_gadget_map_request_by_dev(struct device *dev,
 			dev_err(dev, "failed to map buffer\n");
 			return -EFAULT;
 		}
+
+		req->dma_mapped = 1;
 	}
 
 	return 0;
@@ -841,9 +843,10 @@ void usb_gadget_unmap_request_by_dev(struct device *dev,
 				is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
 		req->num_mapped_sgs = 0;
-	} else {
+	} else if (req->dma_mapped) {
 		dma_unmap_single(dev, req->dma, req->length,
 				is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		req->dma_mapped = 0;
 	}
 }
 EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev);
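
The core.c hunks above introduce an ownership flag: the unmap path must only undo a mapping that the map path itself created, since a gadget driver may hand in a request whose DMA address was set up elsewhere. Condensed to its essence (a kernel-context sketch, simplified from the code above, not a standalone program):

    /* map side (usb_gadget_map_request_by_dev, simplified) */
    req->dma = dma_map_single(dev, req->buf, req->length,
                              is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
    if (dma_mapping_error(dev, req->dma))
            return -EFAULT;
    req->dma_mapped = 1;    /* remember that the core created this mapping */

    /* unmap side: requests the core never mapped are left untouched */
    if (req->dma_mapped) {
            dma_unmap_single(dev, req->dma, req->length,
                             is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
            req->dma_mapped = 0;
    }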
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b27c9b2e683f..e19bbc38a722 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -56,6 +56,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
 			     struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_mds(struct device *dev,
 			    struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
+					struct device_attribute *attr,
+					char *buf);
+extern ssize_t cpu_show_itlb_multihit(struct device *dev,
+				      struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
@@ -282,28 +287,7 @@ static inline int cpuhp_smt_enable(void) { return 0; }
 static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
 #endif
 
-/*
- * These are used for a global "mitigations=" cmdline option for toggling
- * optional CPU mitigations.
- */
-enum cpu_mitigations {
-	CPU_MITIGATIONS_OFF,
-	CPU_MITIGATIONS_AUTO,
-	CPU_MITIGATIONS_AUTO_NOSMT,
-};
-
-extern enum cpu_mitigations cpu_mitigations;
-
-/* mitigations=off */
-static inline bool cpu_mitigations_off(void)
-{
-	return cpu_mitigations == CPU_MITIGATIONS_OFF;
-}
-
-/* mitigations=auto,nosmt */
-static inline bool cpu_mitigations_auto_nosmt(void)
-{
-	return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
-}
+extern bool cpu_mitigations_off(void);
+extern bool cpu_mitigations_auto_nosmt(void);
 
 #endif /* _LINUX_CPU_H_ */
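
The hunk above replaces the inline cpu_mitigations helpers with extern declarations; the matching definitions are not part of this excerpt. Based on the removed inline bodies, they presumably move out of line (into kernel/cpu.c in the upstream patch) roughly as follows, which also lets modular code such as KVM use them via exported symbols:

    /* Sketch of the out-of-line definitions, inferred from the removed
     * inline versions above; the exact upstream placement is assumed. */
    enum cpu_mitigations {
            CPU_MITIGATIONS_OFF,
            CPU_MITIGATIONS_AUTO,
            CPU_MITIGATIONS_AUTO_NOSMT,
    };

    static enum cpu_mitigations cpu_mitigations __ro_after_init =
            CPU_MITIGATIONS_AUTO;

    /* mitigations=off */
    bool cpu_mitigations_off(void)
    {
            return cpu_mitigations == CPU_MITIGATIONS_OFF;
    }
    EXPORT_SYMBOL_GPL(cpu_mitigations_off);

    /* mitigations=auto,nosmt */
    bool cpu_mitigations_auto_nosmt(void)
    {
            return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
    }
    EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);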
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eb55374b73f3..0590e7d47b02 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -129,7 +129,7 @@ static inline bool is_error_page(struct page *page)
 
 extern struct kmem_cache *kvm_vcpu_cache;
 
-extern spinlock_t kvm_lock;
+extern struct mutex kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_io_range {
@@ -1208,4 +1208,10 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */
 
+typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+				uintptr_t data, const char *name,
+				struct task_struct **thread_ptr);
+
 #endif
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index e4516e9ded0f..4b810bc7ae63 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -48,6 +48,7 @@ struct usb_ep;
  *	by adding a zero length packet as needed;
  * @short_not_ok: When reading data, makes short packets be
  *	treated as errors (queue stops advancing till cleanup).
+ * @dma_mapped: Indicates if request has been mapped to DMA (internal)
  * @complete: Function called when request completes, so this request and
  *	its buffer may be re-used. The function will always be called with
  *	interrupts disabled, and it must not sleep.
@@ -103,6 +104,7 @@ struct usb_request {
 	unsigned		no_interrupt:1;
 	unsigned		zero:1;
 	unsigned		short_not_ok:1;
+	unsigned		dma_mapped:1;
 
 	void			(*complete)(struct usb_ep *ep,
 					struct usb_request *req);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c72586a094ed..0fc93519e63e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -49,6 +49,7 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -87,7 +88,7 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
  *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_MUTEX(kvm_lock);
 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
@@ -612,6 +613,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 	return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
 	int r, i;
@@ -659,22 +677,31 @@ static struct kvm *kvm_create_vm(unsigned long type)
 		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 					GFP_KERNEL);
 		if (!kvm->buses[i])
-			goto out_err;
+			goto out_err_no_mmu_notifier;
 	}
 
 	r = kvm_init_mmu_notifier(kvm);
+	if (r)
+		goto out_err_no_mmu_notifier;
+
+	r = kvm_arch_post_init_vm(kvm);
 	if (r)
 		goto out_err;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	preempt_notifier_inc();
 
 	return kvm;
 
 out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	if (kvm->mmu_notifier.ops)
+		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
 	cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
 	cleanup_srcu_struct(&kvm->srcu);
@@ -724,9 +751,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
 
 	kvm_destroy_vm_debugfs(kvm);
 	kvm_arch_sync_events(kvm);
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
+	kvm_arch_pre_destroy_vm(kvm);
+
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		if (kvm->buses[i])
@@ -3752,13 +3781,13 @@ static int vm_stat_get(void *_offset, u64 *val)
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -3772,13 +3801,13 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		stat_tmp.kvm = kvm;
 		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -3987,3 +4016,86 @@ void kvm_exit(void)
 	kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+	struct kvm *kvm;
+	struct task_struct *parent;
+	struct completion init_done;
+	kvm_vm_thread_fn_t thread_fn;
+	uintptr_t data;
+	int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+	/*
+	 * The init_context is allocated on the stack of the parent thread, so
+	 * we have to locally copy anything that is needed beyond initialization
+	 */
+	struct kvm_vm_worker_thread_context *init_context = context;
+	struct kvm *kvm = init_context->kvm;
+	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+	uintptr_t data = init_context->data;
+	int err;
+
+	err = kthread_park(current);
+	/* kthread_park(current) is never supposed to return an error */
+	WARN_ON(err != 0);
+	if (err)
+		goto init_complete;
+
+	err = cgroup_attach_task_all(init_context->parent, current);
+	if (err) {
+		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+			__func__, err);
+		goto init_complete;
+	}
+
+	set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+	init_context->err = err;
+	complete(&init_context->init_done);
+	init_context = NULL;
+
+	if (err)
+		return err;
+
+	/* Wait to be woken up by the spawner before proceeding. */
+	kthread_parkme();
+
+	if (!kthread_should_stop())
+		err = thread_fn(kvm, data);
+
+	return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+				uintptr_t data, const char *name,
+				struct task_struct **thread_ptr)
+{
+	struct kvm_vm_worker_thread_context init_context = {};
+	struct task_struct *thread;
+
+	*thread_ptr = NULL;
+	init_context.kvm = kvm;
+	init_context.parent = current;
+	init_context.thread_fn = thread_fn;
+	init_context.data = data;
+	init_completion(&init_context.init_done);
+
+	thread = kthread_run(kvm_vm_worker_thread, &init_context,
+			     "%s-%d", name, task_pid_nr(current));
+	if (IS_ERR(thread))
+		return PTR_ERR(thread);
+
+	/* kthread_run is never supposed to return NULL */
+	WARN_ON(thread == NULL);
+
+	wait_for_completion(&init_context.init_done);
+
+	if (!init_context.err)
+		*thread_ptr = thread;
+
+	return init_context.err;
+}
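
kvm_vm_create_worker_thread() returns with the worker parked: the thread attaches itself to the parent's cgroups, inherits its nice level, signals init_done, and then waits in kthread_parkme() until the caller unparks it. A hypothetical arch-side caller (the real user, e.g. the x86 NX huge page recovery thread, is not part of this excerpt) might look roughly like:

    /* Hedged sketch of a caller; names and placement are illustrative. */
    static int my_recovery_worker(struct kvm *kvm, uintptr_t data)
    {
            while (!kthread_should_stop()) {
                    /* ... periodic per-VM housekeeping ... */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    int kvm_arch_post_init_vm(struct kvm *kvm)
    {
            struct task_struct *thread;
            int err;

            err = kvm_vm_create_worker_thread(kvm, my_recovery_worker, 0,
                                              "kvm-recovery", &thread);
            if (!err)
                    kthread_unpark(thread);  /* actually start the parked worker */
            return err;
    }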