On 1/29/2026 7:09 AM, Zide Chen wrote:
> Newer Intel server CPUs support a large number of PMU MSRs. Currently,
> QEMU allocates cpu->kvm_msr_buf as a single-page buffer, which is not
> sufficient to hold all possible MSRs.
>
> Increase MSR_BUF_SIZE to 8192 bytes, providing space for up to 511 MSRs.
> This is sufficient even for the theoretical worst case, such as
> architectural LBR with a depth of 64.
>
> KVM_[GET/SET]_MSRS is limited to 255 MSRs per call. Raising this limit
> to 511 would require changes in KVM and would introduce backward
> compatibility issues. Instead, split requests into multiple
> KVM_[GET/SET]_MSRS calls when the number of MSRs exceeds the API limit.
>
> Signed-off-by: Zide Chen <[email protected]>
> ---
> V2:
> - No changes.
>
> target/i386/kvm/kvm.c | 109 +++++++++++++++++++++++++++++++++++-------
> 1 file changed, 92 insertions(+), 17 deletions(-)
>
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 530f50e4b218..a2cf9b5df35d 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -98,9 +98,12 @@
> #define KVM_APIC_BUS_CYCLE_NS 1
> #define KVM_APIC_BUS_FREQUENCY (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
>
> -/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
> - * 255 kvm_msr_entry structs */
> -#define MSR_BUF_SIZE 4096
> +/* An 8192-byte buffer can hold the 8-byte kvm_msrs header, plus
> + * 511 kvm_msr_entry structs */
> +#define MSR_BUF_SIZE 8192
> +
> +/* Maximum number of MSRs in one single KVM_[GET/SET]_MSRS call. */
> +#define KVM_MAX_IO_MSRS 255
>
> typedef bool QEMURDMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t *val);
> typedef bool QEMUWRMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t val);
> @@ -3878,23 +3881,102 @@ static void kvm_msr_entry_add_perf(X86CPU *cpu,
> FeatureWordArray f)
> }
> }
>
> -static int kvm_buf_set_msrs(X86CPU *cpu)
> +static int __kvm_buf_set_msrs(X86CPU *cpu, struct kvm_msrs *msrs)
> {
> - int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
> + int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, msrs);
> if (ret < 0) {
> return ret;
> }
>
> - if (ret < cpu->kvm_msr_buf->nmsrs) {
> - struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
> + if (ret < msrs->nmsrs) {
> + struct kvm_msr_entry *e = &msrs->entries[ret];
> error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
> (uint32_t)e->index, (uint64_t)e->data);
> }
>
> - assert(ret == cpu->kvm_msr_buf->nmsrs);
> + assert(ret == msrs->nmsrs);
> + return ret;
> +}
> +
> +static int __kvm_buf_get_msrs(X86CPU *cpu, struct kvm_msrs *msrs)
> +{
> + int ret;
> +
> + ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, msrs);
> + if (ret < 0) {
> + return ret;
> + }
> +
> + if (ret < msrs->nmsrs) {
> + struct kvm_msr_entry *e = &msrs->entries[ret];
> + error_report("error: failed to get MSR 0x%" PRIx32,
> + (uint32_t)e->index);
> + }
> +
> + assert(ret == msrs->nmsrs);
> + return ret;
> +}
> +
> +static int kvm_buf_set_or_get_msrs(X86CPU *cpu, bool is_write)
> +{
> + struct kvm_msr_entry *entries = cpu->kvm_msr_buf->entries;
> + struct kvm_msrs *buf = NULL;
> + int current, remaining, ret = 0;
> + size_t buf_size;
> +
> + buf_size = KVM_MAX_IO_MSRS * sizeof(struct kvm_msr_entry) +
> + sizeof(struct kvm_msrs);
> + buf = g_malloc(buf_size);
> +
> + remaining = cpu->kvm_msr_buf->nmsrs;
> + current = 0;
> + while (remaining) {
> + size_t size;
> +
> + memset(buf, 0, buf_size);
> +
> + if (remaining > KVM_MAX_IO_MSRS) {
> + buf->nmsrs = KVM_MAX_IO_MSRS;
> + } else {
> + buf->nmsrs = remaining;
> + }
> +
> + size = buf->nmsrs * sizeof(entries[0]);
> + memcpy(buf->entries, &entries[current], size);
> +
> + if (is_write) {
> + ret = __kvm_buf_set_msrs(cpu, buf);
> + } else {
> + ret = __kvm_buf_get_msrs(cpu, buf);
> + }
> +
> + if (ret < 0) {
> + goto out;
> + }
> +
> + if (!is_write)
> + memcpy(&entries[current], buf->entries, size);
> +
> + current += buf->nmsrs;
> + remaining -= buf->nmsrs;
> + }
> +
> +out:
> + g_free(buf);
> + return ret < 0 ? ret : cpu->kvm_msr_buf->nmsrs;
> +}
> +
> +static int kvm_buf_set_msrs(X86CPU *cpu)
Better to mark this function inline.
> +{
> + kvm_buf_set_or_get_msrs(cpu, true);
> return 0;
Why not directly return the return value of kvm_buf_set_or_get_msrs()?
> }
>
> +static int kvm_buf_get_msrs(X86CPU *cpu)
Same here: better to mark it inline.
Others look good to me.
> +{
> + return kvm_buf_set_or_get_msrs(cpu, false);
> +}
> +
> static void kvm_init_msrs(X86CPU *cpu)
> {
> CPUX86State *env = &cpu->env;
> @@ -3928,7 +4010,7 @@ static void kvm_init_msrs(X86CPU *cpu)
> if (has_msr_ucode_rev) {
> kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
> }
> - assert(kvm_buf_set_msrs(cpu) == 0);
> + kvm_buf_set_msrs(cpu);
> }
>
> static int kvm_put_msrs(X86CPU *cpu, KvmPutState level)
> @@ -4746,18 +4828,11 @@ static int kvm_get_msrs(X86CPU *cpu)
> }
> }
>
> - ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
> + ret = kvm_buf_get_msrs(cpu);
> if (ret < 0) {
> return ret;
> }
>
> - if (ret < cpu->kvm_msr_buf->nmsrs) {
> - struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
> - error_report("error: failed to get MSR 0x%" PRIx32,
> - (uint32_t)e->index);
> - }
> -
> - assert(ret == cpu->kvm_msr_buf->nmsrs);
> /*
> * MTRR masks: Each mask consists of 5 parts
> * a 10..0: must be zero