Instruction Fetch and PCC checks

References:

  • QEMU start to instruction fetch

Some call paths for instruction fetch and decoding:

// cpu_exec callers
cpu_loop // linux-user/mips/cpu_loop.c
=> cpu_exec

qemu_tcg_init_vcpu // thread creation using fn pointer, cpus.c
=> qemu_tcg_rr_cpu_thread_fn
   => tcg_cpu_exec
      => cpu_exec

qemu_tcg_init_vcpu // thread creation using fn pointer, cpus.c
=> qemu_tcg_cpu_thread_fn
   => tcg_cpu_exec
      => cpu_exec

// cpu_exec ---> translator_loop
cpu_exec
   => tb_find
      => if (not found): tb_gen_code // accel/tcg/translate-all.c 
         => gen_intermediate_code // target/mips/translate.c 
            => translator_loop

// translator_loop calling callbacks
translator_loop  // accel/tcg/translator.c
=> ops->translate_insn 
   => mips_tr_translate_insn // target/mips/translate.c
   => riscv_tr_translate_insn // target/riscv/translate.c

// impl of translate_insn

mips_tr_ops.translate_insn
= mips_tr_translate_insn // registered call back // target/mips/translate.c
   => decode_opc         // target/mips/translate.c
   => decode_micromips_opc  // target/mips/translate.c

Instruction fetch and translation

mips_tr_translate_insn calls gen_check_pcc_bounds_next_inst() to emit code that checks the PCC bounds before fetching the next instruction, simulating the PCC check done by CHERI hardware.

static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
{
    CPUMIPSState *env = cs->env_ptr;
    DisasContext *ctx = container_of(dcbase, DisasContext, base);
    int insn_bytes;

    // XXX: we don't support micromips, etc. so we can hardcode 4 bytes as the
    // instruction size (see assert below).
    gen_check_pcc_bounds_next_inst(ctx, 4);

    ...
    else if (!(ctx->hflags & MIPS_HFLAG_M16)) {

        ctx->opcode = cpu_ldl_code(env, ctx->base.pc_next);
        insn_bytes = 4;
        gen_mips_log_instr32(ctx);
        decode_opc(env, ctx);
    } else if (ctx->insn_flags & ASE_MICROMIPS) {
        gen_mips_log_instr_unsupported(ctx, "micromips");
        ctx->opcode = cpu_lduw_code(env, ctx->base.pc_next);
        insn_bytes = decode_micromips_opc(env, ctx);
    } else ...
}

gen_check_pcc_bounds_next_inst is implemented as shown below. The PCC check is based on the base and bounds information stored in the current DisasContext *ctx. So when are ctx->base.pc_first and ctx->base.pcc_top updated with new base and bounds information?

static inline void gen_check_pcc_bounds_next_inst(DisasContext *ctx,
                                                  uint32_t num_bytes)
{
#ifdef TARGET_CHERI
    if (have_cheri_tb_flags(ctx, TB_FLAG_PCC_FULL_AS)) {
        return; // PCC spans the full address space, no need to check
    }

    // Note: PC can only be incremented since a branch exits the TB, so checking
    // for pc_next < pcc.base should not be needed. Add a debug assertion in
    // case this assumption no longer holds in the future.
    // Note: we don't have to check for wraparound here since this case is
    // already handled by the TB_FLAG_PCC_FULL_AS check above. Wraparound is
    // permitted to avoid any differences with non-CHERI enabled CPUs.
    tcg_debug_assert(ctx->base.pc_next >= ctx->base.pc_first);
    if (unlikely(ctx->base.pc_next + num_bytes > ctx->base.pcc_top)) {
        cheri_tcg_prepare_for_unconditional_exception(&ctx->base);
        gen_raise_pcc_violation(&ctx->base, ctx->base.pc_next, num_bytes);
    }
#endif
}
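
To answer this question: the bounds are seeded once per translation block, before the per-instruction loop runs. Below is a hedged sketch of the relevant part of translator_loop (accel/tcg/translator.c); the TB-side names (cs_base, cs_top, cheri_flags) match what tb_gen_code receives in tb_find below, while the exact CHERI fields of DisasContextBase are assumed from this tree, not verbatim source:

// Hedged sketch, not verbatim source: translator_loop() seeds the
// DisasContextBase from the TranslationBlock before translating instructions.
db->pc_first = tb->pc;
db->pc_next  = db->pc_first;
#ifdef TARGET_CHERI
// Assumed field names: PCC base/top captured when the TB was generated.
db->pcc_base    = tb->cs_base;
db->pcc_top     = tb->cs_top;
db->cheri_flags = tb->cheri_flags;
#endif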

Instruction translation cache

tb_find searches for an already-translated basic block so that re-translation can be avoided.

static inline TranslationBlock *tb_find(CPUState *cpu,
                                        TranslationBlock *last_tb,
                                        int tb_exit, uint32_t cf_mask)
{
    TranslationBlock *tb;
    target_ulong cs_base, cs_top = 0, pc;
    uint32_t cheri_flags = 0;
    uint32_t flags;

    tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &cs_top, &cheri_flags, &flags,
                              cf_mask);
    if (tb == NULL) {
        mmap_lock();
        tb = tb_gen_code(cpu, pc, cs_base, cs_top, cheri_flags, flags, cf_mask);
        mmap_unlock();
        /* We add the TB in the virtual pc hash table for the fast lookup */
        atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
    }
    ...
}
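
Before taking the slow path above, tb_lookup__cpu_state first probes a small per-CPU direct-mapped cache. Here is a simplified sketch based on include/exec/tb-lookup.h in QEMU ~4.2; the CHERI additions (cs_top, cheri_flags) are elided:

// Simplified sketch of tb_lookup__cpu_state (include/exec/tb-lookup.h):
static inline TranslationBlock *
tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
                     uint32_t *flags, uint32_t cf_mask)
{
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
    TranslationBlock *tb;
    uint32_t hash;

    cpu_get_tb_cpu_state(env, pc, cs_base, flags);  // read PC/flags from CPU state
    hash = tb_jmp_cache_hash_func(*pc);             // hash of the virtual PC
    tb = atomic_rcu_read(&cpu->tb_jmp_cache[hash]); // fast path: per-CPU cache
    if (likely(tb && tb->pc == *pc && tb->cs_base == *cs_base &&
               tb->flags == *flags)) {
        return tb;                                  // hit: reuse translated block
    }
    // miss: fall back to the global QHT hash table; tb_find() then calls
    // tb_gen_code() if this also returns NULL
    return tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask);
}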

Instruction execution

After a basic block is translated, it is executed via cpu_loop_exec_tb:

// accel/tcg/cpu-exec.c

/* main execution loop */

int cpu_exec(CPUState *cpu)
{
    ...
    /* if an exception is pending, we execute it here */
    while (!cpu_handle_exception(cpu, &ret)) {
        TranslationBlock *last_tb = NULL;
        int tb_exit = 0;

        while (!cpu_handle_interrupt(cpu, &last_tb)) {
            uint32_t cflags = cpu->cflags_next_tb;
            TranslationBlock *tb;
            ...

            tb = tb_find(cpu, last_tb, tb_exit, cflags);
            cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit);
            ...
        }
    }
    ...
}

cpu_loop_exec_tb

// call path:
cpu_loop_exec_tb => cpu_tb_exec => tcg_qemu_tb_exec => (generated code)

// accel/tcg/cpu-exec.c
cpu_tb_exec

// tcg/tci.c
tcg_qemu_tb_exec

tcg_qemu_tb_exec can have two definitions:

// include/tcg/tcg.h
#ifdef HAVE_TCG_QEMU_TB_EXEC
uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
#else
# define tcg_qemu_tb_exec(env, tb_ptr) \
    ((uintptr_t (*)(void *, void *))tcg_ctx->code_gen_prologue)(env, tb_ptr)
#endif

In this CHERI-based version, the second definition is used: execution enters the generated code through a code pointer (callback), tcg_ctx->code_gen_prologue. This pointer is initialized in different places depending on the QEMU mode: system mode, linux-user mode, or bsd-user mode.
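
To make this concrete, here is a minimal standalone C sketch (not QEMU code) of the same pattern: machine code is written into an executable buffer and entered through a function-pointer cast, which is exactly how tcg_qemu_tb_exec enters code_gen_prologue. It assumes an x86-64 Linux host that allows RWX mappings:

// Minimal standalone sketch (not QEMU code): call generated machine code
// through a function pointer, like tcg_ctx->code_gen_prologue.
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    // x86-64 machine code for: mov eax, 42; ret
    static const unsigned char code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };

    // Allocate an executable buffer, analogous to code_gen_alloc().
    void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memcpy(buf, code, sizeof(code));

    // Cast the buffer to a function pointer and jump into it.
    int (*fn)(void) = (int (*)(void))buf;
    printf("generated code returned %d\n", fn());

    munmap(buf, 4096);
    return 0;
}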

In all three modes, the code pointer is initialized by tcg_prologue_init:

// tcg/tcg.c

void tcg_prologue_init(TCGContext *s)
{
    size_t prologue_size, total_size;
    void *buf0, *buf1;

    /* Put the prologue at the beginning of code_gen_buffer.  */
    buf0 = s->code_gen_buffer;
    total_size = s->code_gen_buffer_size;
    s->code_ptr = buf0;
    s->code_buf = buf0;
    s->data_gen_ptr = NULL;
    s->code_gen_prologue = buf0;

    ...

    /* Generate the prologue.  */
    tcg_target_qemu_prologue(s);

    ...
    
    /* Deduct the prologue from the buffer.  */
    prologue_size = tcg_current_code_size(s);
    s->code_gen_ptr = buf1;
    s->code_gen_buffer = buf1;
    s->code_buf = buf1;
    total_size -= prologue_size;
    s->code_gen_buffer_size = total_size;

    tcg_register_jit(s->code_gen_buffer, total_size);

#ifdef DEBUG_DISAS
    if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
        FILE *logfile = qemu_log_lock();
        qemu_log("PROLOGUE: [size=%zu]\n", prologue_size);
        if (s->data_gen_ptr) {
            size_t code_size = s->data_gen_ptr - buf0;
            size_t data_size = prologue_size - code_size;
            size_t i;

            log_disas(buf0, code_size);

            for (i = 0; i < data_size; i += sizeof(tcg_target_ulong)) {
                if (sizeof(tcg_target_ulong) == 8) {
                    qemu_log("0x%08" PRIxPTR ":  .quad  0x%016" PRIx64 "\n",
                             (uintptr_t)s->data_gen_ptr + i,
                             *(uint64_t *)(s->data_gen_ptr + i));
                } else {
                    qemu_log("0x%08" PRIxPTR ":  .long  0x%08x\n",
                             (uintptr_t)s->data_gen_ptr + i,
                             *(uint32_t *)(s->data_gen_ptr + i));
                }
            }
        } else {
            log_disas(buf0, prologue_size);
        }
        qemu_log("\n");
        qemu_log_flush();
        qemu_log_unlock(logfile);
    }
#endif

    /* Assert that goto_ptr is implemented completely.  */
    if (TCG_TARGET_HAS_goto_ptr) {
        tcg_debug_assert(s->code_gen_epilogue != NULL);
    }
}

We can see that tcg_ctx->code_gen_prologue is set to the start of TCGContext.code_gen_buffer. So we need to find out where TCGContext.code_gen_buffer points.

tcg_prologue_init is called from three places, one for each QEMU mode:

  • system mode: accel/tcg/translate-all.c
  • linux-user mode: linux-user/main.c
  • bsd-user mode: bsd-user/main.c

Now we focus on the system mode version:

// accel/tcg/translate-all.c

/* Must be called before using the QEMU cpus. 'tb_size' is the size
   (in bytes) allocated to the translation buffer. Zero means default
   size. */
void tcg_exec_init(unsigned long tb_size)
{
    tcg_allowed = true;
    cpu_gen_init();
    page_init();
    tb_htable_init();
    code_gen_alloc(tb_size);
#if defined(CONFIG_SOFTMMU)
    /* There's no guest base to take into account, so go ahead and
       initialize the prologue now.  */
    tcg_prologue_init(tcg_ctx);
#endif
}

The tcg_ctx is a thread-local variable:

__thread TCGContext *tcg_ctx; // accel/tcg/translate-all.c

Its TCGContext.code_gen_buffer is initialized in code_gen_alloc, where a dynamic translator buffer is allocated using mmap().

// accel/tcg/translate-all.c
static inline void code_gen_alloc(size_t tb_size)
{
    tcg_ctx->code_gen_buffer_size = size_code_gen_buffer(tb_size);
    tcg_ctx->code_gen_buffer = alloc_code_gen_buffer();
    if (tcg_ctx->code_gen_buffer == NULL) {
        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
        exit(1);
    }
}
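
alloc_code_gen_buffer has several build-dependent variants in translate-all.c; in the mmap-based configuration it boils down to the following (a simplified sketch, error handling trimmed):

// Simplified sketch of the mmap-based alloc_code_gen_buffer variant:
static inline void *alloc_code_gen_buffer(void)
{
    size_t size = tcg_ctx->code_gen_buffer_size;
    // The buffer must be writable (to emit code) and executable (to run it).
    void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return buf == MAP_FAILED ? NULL : buf;
}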

tcg_prologue_init assigns this buffer to TCGContext.code_gen_prologue and generates the prologue code into it by calling tcg_target_qemu_prologue(s).

tcg_target_qemu_prologue() is a host-dependent function implemented in tcg/<arch>/tcg-target.inc.c. It generates one process-wide piece of prologue code, emitted only once per QEMU instance, at startup.

QEMU generates a different prologue depending on the host architecture it runs on. For example, when QEMU runs on an x86 host (32- or 64-bit), the prologue-generating code is defined in tcg/i386/tcg-target.inc.c:

// tcg/i386/tcg-target.inc.c

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */
    // we get here when the qemu is running on x86 hardware.
    printf("i386 tcg_target_qemu_prologue\n");

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base_seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base_offset = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base_index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
        }
    }
# endif
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    s->code_gen_epilogue = s->code_ptr;
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = s->code_ptr;

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}

Each tcg_out_xxx function emits instruction bytes that run on the host x86 architecture. For example, a 4-byte unit can be emitted via tcg_out32(TCGContext *s, uint32_t v):

#if TCG_TARGET_INSN_UNIT_SIZE <= 4
static __attribute__((unused)) inline void tcg_out32(TCGContext *s, uint32_t v)
{
    if (TCG_TARGET_INSN_UNIT_SIZE == 4) {
        *s->code_ptr++ = v;
    } else {
        tcg_insn_unit *p = s->code_ptr;
        memcpy(p, &v, sizeof(v));
        s->code_ptr = p + (4 / TCG_TARGET_INSN_UNIT_SIZE);
    }
}
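
For single bytes, the x86 backend's helpers (tcg_out_opc, tcg_out_modrm, ...) ultimately reduce to an even simpler emitter; on hosts where TCG_TARGET_INSN_UNIT_SIZE == 1, such as x86, it is essentially the following (per tcg/tcg.c):

#if TCG_TARGET_INSN_UNIT_SIZE == 1
static inline void tcg_out8(TCGContext *s, uint8_t v)
{
    *s->code_ptr++ = v;   // append one instruction byte to the code buffer
}
#endif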

Translation block chaining

After each translated basic block is executed, QEMU uses the simulated program counter (PC) and other CPU state information (such as the CS segment base value on x86) to find the next basic block.

Direct block chaining. To accelerate the most common case, where the new simulated PC is known at translation time, QEMU can patch a basic block so that it jumps directly to the next one.
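
The patching happens in the part of tb_find that was elided above; roughly (a simplified sketch following QEMU ~4.2 accel/tcg/cpu-exec.c):

// Simplified sketch of the elided tail of tb_find(): if the previous TB
// ended in a goto_tb, patch its exit slot `tb_exit` to jump directly to `tb`.
if (last_tb) {
    tb_add_jump(last_tb, tb_exit, tb);
}
return tb;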

More

  • Find_tb
  • Translation Block Invalidate
  • Inst Trans

Created Aug 10, 2020 // Last Updated Aug 12, 2020
