/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straightforward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 *
 * NOTE(review): only the head of the macro is shown in this excerpt; the
 * code that consumes @cfunc, the matching SYM_CODE_END and the closing
 * .endm are not visible here.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
	/* The hint offset is 8 bytes when has_error_code is 1, 0 otherwise. */
	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
	ASM_CLAC

	.if \has_error_code == 0
		/*
		 * The hardware pushed no error code, so fill the ORIG_RAX
		 * slot with -1 ourselves.
		 */
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		/* Any of the low two bits of the saved CS set: came from user mode. */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		/*
		 * Each push re-reads the word five slots above the (moving)
		 * stack pointer, so six repetitions place a copy of the top six
		 * words below the originals; the originals become the gap.
		 */
		.rept	6
		pushq	5*8(%rsp)
		.endr
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif
/*
 * Since Linux does not use ring 2, the 'sp2' slot is unused by
 * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
 * the user RSP value.
 *
 * NOTE(review): this is a C struct member quoted out of context; the
 * enclosing struct definition is not visible in this excerpt.
 */
u64 sp2;
/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * Calls error_entry first, then loads the C calling-convention argument
 * registers: %rdi gets the pt_regs pointer and, when an error code is
 * present, %rsi gets the error code.
 *
 * NOTE(review): only the head of the macro is shown in this excerpt; the
 * actual call to @cfunc and the closing .endm are not visible here.
 */
.macro idtentry_body cfunc has_error_code:req

	call	error_entry
	UNWIND_HINT_REGS

	/* Read %rsp only after error_entry: it may have switched stacks. */
	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument */

	.if \has_error_code == 1
		/*
		 * The error code occupies the ORIG_RAX slot. Hand it to the C
		 * function, then overwrite the slot with -1.
		 */
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif
/*
 * Save all registers in pt_regs, and switch GS if needed.
 *
 * error_entry is reached via 'call', so the caller's return address sits
 * on top of the stack while the saved frame is inspected. That is why the
 * frame accesses below use CS+8 / RIP+8 (presumably also the reason for
 * save_ret=1 and the 8 passed to ENCODE_FRAME_POINTER - confirm against
 * the macro definitions).
 *
 * Exit paths:
 *  - entry from user mode: swap GS, switch to the kernel CR3, move onto
 *    the stack returned by sync_regs, then return to the caller;
 *  - entry from kernel mode, ordinary case: fence and return;
 *  - entry from kernel mode with the saved RIP at native_irq_return_iret
 *    (or its 32-bit truncation) or at .Lgs_change: fix up GS (and, for the
 *    IRET cases, CR3 and pt_regs) before returning.
 */
SYM_CODE_START_LOCAL(error_entry)
	UNWIND_HINT_FUNC
	cld
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	/* Low two bits of the saved CS clear: the exception hit kernel mode. */
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	/* We have user CR3.  Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
	/* Put us onto the real thread stack. */
	/*
	 * The return address is taken off the stack first so that %rsp points
	 * at pt_regs, and is pushed again once the stack has been switched.
	 */
	popq	%r12				/* save return addr in %r12 */
	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */
	ENCODE_FRAME_POINTER
	pushq	%r12
	ret

.Lerror_entry_done_lfence:
	FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.  B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)		/* faulted exactly on the IRET? */
	je	.Lerror_bad_iret
	/* Same comparison against the address truncated to its low 32 bits. */
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$.Lgs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done_lfence	/* none of the special addresses */

	/*
	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.  We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	jmp	.Lerror_entry_done

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)		/* %rcx still holds the full address */
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
	 */
	SWAPGS
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET.
	 */
	mov	%rsp, %rdi			/* argument for fixup_bad_iret */
	call	fixup_bad_iret
	mov	%rax, %rsp			/* continue on the stack it returned */
	jmp	.Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)
/*
 * entry_INT80_compat - start of the entry stub.
 *
 * NOTE(review): this excerpt ends at the Xen PV alternative; the
 * .Lint80_keep_stack target, the stack switch and the matching
 * SYM_CODE_END are not visible here.
 */
SYM_CODE_START(entry_INT80_compat)
	UNWIND_HINT_EMPTY
	/*
	 * Interrupts are off on entry.
	 */
	ASM_CLAC			/* Do this early to minimize exposure */
	SWAPGS

	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
	 * syscall.  Just in case the high bits are nonzero, zero-extend
	 * the syscall number.  (This could almost certainly be deleted
	 * with no ill effects.)
	 */
	movl	%eax, %eax

	/* switch to thread stack expects orig_ax and rdi to be pushed */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */

	/*
	 * Need to switch before accessing the thread stack.
	 * %rdi was saved by the push above, so it can serve as the scratch
	 * register here.
	 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi

	/* In the Xen PV case we already run on the thread stack. */
	ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
/*
 * Hand the system call over to C code, then start choosing the exit path.
 *
 * NOTE(review): the label that owns these instructions is not visible in
 * this excerpt (presumably entry_SYSCALL_64 - confirm).
 */
	/* IRQs are off. */
	movq	%rsp, %rdi		/* 1st argument of do_syscall_64: regs */
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi		/* 2nd argument of do_syscall_64: nr */
	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 */

	ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
		X86_FEATURE_XENPV
	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 *
	 * Change top bits to match most significant bit (47th or 56th bit
	 * depending on paging mode) in the address.
	 *
	 * Shifting left by (64 - N) and then arithmetic-shifting right by the
	 * same amount copies bit N-1 into every higher bit, with N = 48 or 57
	 * in the ALTERNATIVE variant and N = __VIRTUAL_MASK_SHIFT+1 otherwise.
	 */
#ifdef CONFIG_X86_5LEVEL
	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
	/*
	 * NOTE(review): the instructions that load %rcx and %r11 are not part
	 * of this excerpt. The first compare uses %r11 as the reference value
	 * for the address in %rcx, while the later testq treats %r11 as a flags
	 * image, so %r11 has to be reloaded in between - no such reload is
	 * visible here. Confirm against the complete source.
	 */

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	swapgs_restore_regs_and_return_to_usermode

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET.  This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions.  For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	swapgs_restore_regs_and_return_to_usermode

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode
	/*
	 * We win!  This label is here just for ease of understanding
	 * perf profiles.  Nothing jumps here.
	 *
	 * NOTE(review): the excerpt stops after the stack switch; the remainder
	 * of the return sequence is not visible here.
	 */
syscall_return_via_sysret:
	/*
	 * rcx and r11 are already restored (see code above)
	 * NOTE(review): the restoring instructions are not part of this excerpt.
	 */
	POP_REGS pop_rdi=0 skip_r11rcx=1

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi		/* %rdi keeps the old stack pointer */
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY
__visible noinstr voiddo_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); //ASLR,安全问题,就给栈加了个偏移 nr = syscall_enter_from_user_mode(regs, nr); //一些初始化的处理,检查等操作
instrumentation_begin();//KMSAN相关操作,如果没开选项则为空
//do_syscall_x64处理64位系统调用,里面会调用系统调用表中的函数,x32就是32位的,不过是运行在64位kernel上的32位程序使用的 if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { /* Invalid system call, but still a system call. */ regs->ax = __x64_sys_ni_syscall(regs); }