/*
 * Program the MSRs used for system call entry.
 *
 * IA32_STAR is always written. The remaining SYSCALL/SYSENTER MSRs are
 * only set up, through idt_syscall_init(), when FRED is not enabled.
 */
void syscall_init(void)
{
	/* The default user and kernel segments */
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);

	/*
	 * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and
	 * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED
	 * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit
	 * instruction to return to ring 3 (both sysexit and sysret cause
	 * #UD when FRED is enabled).
	 */
	if (!cpu_feature_enabled(X86_FEATURE_FRED))
		idt_syscall_init();
}
/*
 * Excerpt: SYSCALL/SYSENTER MSR setup. The enclosing function header is
 * not part of this excerpt (presumably idt_syscall_init() -- confirm
 * against the full source); the final brace closes that function.
 */
if (ia32_enabled()) {
	wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
	/*
	 * This only works on Intel CPUs.
	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
	 * This does not cause SYSENTER to jump to the wrong location, because
	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
	 */
	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
		    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
	/* No 32-bit support: route compat SYSCALL to the ignore stub and invalidate SYSENTER. */
	wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}

/*
 * Flags to clear on syscall; clear as much as possible
 * to minimize user space-kernel interference.
 */
wrmsrl(MSR_SYSCALL_MASK,
       X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
       X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
       X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
       X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
       X86_EFLAGS_AC|X86_EFLAGS_ID);
}
其中在第6行,将用户段的 CS 和内核段的 CS 写入 MSR_STAR 寄存器。MSR_STAR 是这里用到的第一个模型特定寄存器(Model Specific Register,MSR),它的第 63:48 位为用户代码的代码段选择子。sysret 指令负责从系统调用返回到相应特权级的用户代码,它会用这些位来加载 CS 和 SS 段选择子。同时从内核代码来看,当用户空间应用程序执行系统调用时,MSR_STAR 的第 47:32 位将作为 CS 和 SS 段寄存器的基准选择子。
1 2
/* The default user and kernel segments */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
if (ia32_enabled()) { wrmsrl_cstar((unsignedlong)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsignedlong)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); } else { wrmsrl_cstar((unsignedlong)entry_SYSCALL32_ignore); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); } wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF| X86_EFLAGS_AC|X86_EFLAGS_ID); //禁止中断和一些别的掩码位,最重要的是禁止中断。也就是说syscall是不允许中断的 }
/*
 * Saved register frame.
 *
 * As described by the per-group notes below: the callee-clobbered
 * registers are always saved on kernel entry, the callee-preserved ones
 * only when a syscall needs a fully filled frame, and the tail of the
 * structure (ip, cs, flags, sp, ss) is the IRETQ return frame. Field
 * order is therefore significant and must not be changed.
 */
struct pt_regs {
	/*
	 * C ABI says these regs are callee-preserved. They aren't saved on
	 * kernel entry unless syscall needs a complete, fully filled
	 * "struct pt_regs".
	 */
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long bp;
	unsigned long bx;

	/* These regs are callee-clobbered. Always saved on kernel entry. */
	unsigned long r11;
	unsigned long r10;
	unsigned long r9;
	unsigned long r8;
	unsigned long ax;
	unsigned long cx;
	unsigned long dx;
	unsigned long si;
	unsigned long di;

	/*
	 * orig_ax is used on entry for:
	 * - the syscall number (syscall, sysenter, int80)
	 * - error_code stored by the CPU on traps and exceptions
	 * - the interrupt number for device interrupts
	 *
	 * A FRED stack frame starts here:
	 *   1) It _always_ includes an error code;
	 *
	 *   2) The return frame for ERET[US] starts here, but
	 *      the content of orig_ax is ignored.
	 */
	unsigned long orig_ax;

	/* The IRETQ return frame starts here */
	unsigned long ip;

	/* Three views of the same 64-bit slot that holds CS. */
	union {
		/* CS selector */
		u16 cs;
		/* The extended 64-bit data slot containing CS */
		u64 csx;
		/* The FRED CS extension */
		struct fred_cs fred_cs;
	};

	unsigned long flags;
	unsigned long sp;

	/* Three views of the same 64-bit slot that holds SS. */
	union {
		/* SS selector */
		u16 ss;
		/* The extended 64-bit data slot containing SS */
		u64 ssx;
		/* The FRED SS extension */
		struct fred_ss fred_ss;
	};

	/*
	 * Top of stack on IDT systems, while FRED systems have extra fields
	 * defined above for storing exception related information, e.g. CR2 or
	 * DR6.
	 */
};
/* IRQs are off. */
/* Set up the two arguments for the do_syscall_64(regs, nr) call below. */
movq	%rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq	%eax, %rsi

/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER			/* restrict indirect branch speculation */
UNTRAIN_RET			/* flush the return-stack predictor */
CLEAR_BRANCH_HISTORY		/* clear branch history to prevent speculative leaks */
call	do_syscall_64		/* returns with IRQs disabled */

__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
__visible noinstr booldo_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); //给内核栈加一个随机偏移 nr = syscall_enter_from_user_mode(regs, nr); // 做一些限制检查,比如syscall 过滤/限制(seccomp 等),然后可能要调整syscall的编号,最终nr才是需要调用的编号。
instrumentation_begin(); //用于kprobe, ftrace的跟踪
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { /* Invalid system call, but still a system call. */ regs->ax = __x64_sys_ni_syscall(regs); }
/* * Check that the register state is valid for using SYSRET to exit * to userspace. Otherwise use the slower but fully capable IRET * exit path. */
/* XEN PV guests always use the IRET path */ if (cpu_feature_enabled(X86_FEATURE_XENPV)) returnfalse;
/* SYSRET requires RCX == RIP and R11 == EFLAGS */ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) returnfalse;
/* CS and SS must match the values set in MSR_STAR */ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) returnfalse;
/* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over * the kernel, since userspace controls RSP. * * TASK_SIZE_MAX covers all user-accessible addresses other than * the deprecated vsyscall page. */ if (unlikely(regs->ip >= TASK_SIZE_MAX)) returnfalse;
/* * SYSRET cannot restore RF. It can restore TF, but unlike IRET, * restoring TF results in a trap from userspace immediately after * SYSRET. */ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) returnfalse;
/* Use SYSRET to exit to userspace */ returntrue; }
/*
 * Dispatch excerpt. The && chain is order-dependent: do_syscall_x64() is
 * tried first, do_syscall_x32() only if that returned false, and the
 * nr != -1 test only if both returned false.
 * NOTE(review): presumably each helper returns true when it handled nr,
 * and nr == -1 means no syscall should be run -- confirm against callers.
 */
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
	/* Invalid system call, but still a system call. */
	regs->ax = __x64_sys_ni_syscall(regs);
}
static __always_inline booldo_syscall_x64(struct pt_regs *regs, int nr) { /* * Convert negative numbers to very high and thus out of range * numbers for comparisons. */ unsignedint unr = nr;
static __always_inline booldo_syscall_x32(struct pt_regs *regs, int nr) { /* * Adjust the starting offset of the table, and convert numbers * < __X32_SYSCALL_BIT to very high and thus out of range * numbers for comparisons. */ unsignedint xnr = nr - __X32_SYSCALL_BIT;
call	do_syscall_64		/* returns with IRQs disabled */

/*
 * Try to use SYSRET instead of IRET if we're returning to
 * a completely clean 64-bit userspace context.  If we're not,
 * go to the slow exit path.
 * In the Xen PV case we must use iret anyway.
 */

/*
 * %al holds the bool returned by do_syscall_64: zero selects the IRET
 * path. With X86_FEATURE_XENPV the jump is unconditional.
 */
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
	"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

/*
 * We win!  This label is here just for ease of understanding
 * perf profiles.  Nothing jumps here.
 */
syscall_return_via_sysret:
IBRS_EXIT			/* presumably the counterpart of IBRS_ENTER: clears the IBRS-related state */
POP_REGS pop_rdi=0		/* restore the pt_regs registers, leaving %rdi unpopped (it is reused below) */

/*
 * Now all regs are restored except RSP and RDI.
 * Save old stack pointer and switch to trampoline stack.
 */
movq	%rsp, %rdi		/* keep the current stack pointer in %rdi */
movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp	/* load the trampoline stack pointer from the per-CPU TSS sp0 slot */
UNWIND_HINT_END_OF_STACK