References: QEMU source ~4.2.93, QEMU Detailed Study, Translator Internals
Some call paths for instruction fetch and decoding:
// cpu_exec callers
cpu_loop // linux-user/mips/cpu_loop.c
=> cpu_exec
qemu_tcg_init_vcpu // thread creation using fn pointer, cpus.c
=> qemu_tcg_rr_cpu_thread_fn
=> tcg_cpu_exec
=> cpu_exec
qemu_tcg_init_vcpu // thread creation using fn pointer, cpus.c
=> qemu_tcg_cpu_thread_fn
=> tcg_cpu_exec
=> cpu_exec
// cpu_exec ---> translator_loop
cpu_exec
=> tb_find
=> if (not found): tb_gen_code // accel/tcg/translate-all.c
=> gen_intermediate_code // target/mips/translate.c
=> translator_loop
// translator_loop calling callbacks
translator_loop // accel/tcg/translator.c
=> ops->translate_insn
=> mips_tr_translate_insn // target/mips/translate.c
=> riscv_tr_translate_insn // target/riscv/translate.c (RISC-V counterpart)
// impl of translate_insn
mips_tr_ops.translate_insn
= mips_tr_translate_insn // registered call back // target/mips/translate.c
=> decode_opc // target/mips/translate.c
=> decode_micromips_opc // target/mips/translate.c
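In the first call path above, each vCPU thread is created from a function pointer and then loops into cpu_exec. Here is a minimal pthread sketch of that pattern (illustrative only; QEMU uses its own qemu_thread_create wrapper and passes a CPUState argument):
#include <pthread.h>
#include <stdio.h>

/* Illustrative only: models how qemu_tcg_init_vcpu spawns a vCPU thread
 * whose start routine eventually loops into cpu_exec. */
static void *vcpu_thread_fn(void *arg)
{
    /* in QEMU: qemu_tcg_rr_cpu_thread_fn / qemu_tcg_cpu_thread_fn,
     * which call tcg_cpu_exec -> cpu_exec in a loop */
    printf("vCPU thread running for cpu %p\n", arg);
    return NULL;
}

int main(void)
{
    pthread_t th;
    void *cpu_state = NULL;  /* stand-in for a CPUState pointer */
    pthread_create(&th, NULL, vcpu_thread_fn, cpu_state);
    pthread_join(th, NULL);
    return 0;
}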
mips_tr_translate_insn calls gen_check_pcc_bounds_next_inst() to emit code that checks the next instruction fetch against the PCC bounds, simulating the PCC check performed by a CHERI CPU.
static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
{
// declarations used below (elided in the original excerpt):
CPUMIPSState *env = cs->env_ptr;
DisasContext *ctx = container_of(dcbase, DisasContext, base);
// XXX: we don't support micromips, etc. so we can hardcode 4 bytes as the
// instruction size (see assert below).
gen_check_pcc_bounds_next_inst(ctx, 4);
...
else if (!(ctx->hflags & MIPS_HFLAG_M16)) {
ctx->opcode = cpu_ldl_code(env, ctx->base.pc_next);
insn_bytes = 4;
gen_mips_log_instr32(ctx);
decode_opc(env, ctx);
} else if (ctx->insn_flags & ASE_MICROMIPS) {
gen_mips_log_instr_unsupported(ctx, "micromips");
ctx->opcode = cpu_lduw_code(env, ctx->base.pc_next);
insn_bytes = decode_micromips_opc(env, ctx);
} else ...
}
gen_check_pcc_bounds_next_inst is implemented as shown below. The PCC check is based on the base and bounds information stored in the current DisasContext *ctx. So when are ctx->base.pc_first and ctx->base.pcc_top updated with new base and bounds information? (As the tb_find code further down shows, they are captured from the CPU state as cs_base/cs_top whenever a TB is looked up or generated.)
static inline void gen_check_pcc_bounds_next_inst(DisasContext *ctx,
uint32_t num_bytes)
{
#ifdef TARGET_CHERI
if (have_cheri_tb_flags(ctx, TB_FLAG_PCC_FULL_AS)) {
return; // PCC spans the full address space, no need to check
}
// Note: PC can only be incremented since a branch exits the TB, so checking
// for pc_next < pcc.base should not be needed. Add a debug assertion in
// case this assumption no longer holds in the future.
// Note: we don't have to check for wraparound here since this case is
// already handled by the TB_FLAG_PCC_FULL_AS check above. Wraparound is
// permitted to avoid any differences with non-CHERI enabled CPUs.
tcg_debug_assert(ctx->base.pc_next >= ctx->base.pc_first);
if (unlikely(ctx->base.pc_next + num_bytes > ctx->base.pcc_top)) {
cheri_tcg_prepare_for_unconditional_exception(&ctx->base);
gen_raise_pcc_violation(&ctx->base, ctx->base.pc_next, num_bytes);
}
#endif
}
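To make the check concrete, here is a minimal self-contained sketch of the same bounds arithmetic (illustrative only, not CHERI-QEMU code; pcc_base/pcc_top mirror the fields used above):
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Models gen_check_pcc_bounds_next_inst: fetching num_bytes at pc_next is
 * allowed iff the whole fetch lies inside [pcc_base, pcc_top). */
static bool pcc_fetch_in_bounds(uint64_t pcc_base, uint64_t pcc_top,
                                uint64_t pc_next, uint32_t num_bytes)
{
    return pc_next >= pcc_base && pc_next + num_bytes <= pcc_top;
}

int main(void)
{
    /* PCC covers [0x1000, 0x2000): a 4-byte fetch at 0x1ffc is the last one
     * allowed; a fetch at 0x1ffe crosses the top and must raise a PCC
     * violation, as in the unlikely() branch above. */
    printf("%d\n", pcc_fetch_in_bounds(0x1000, 0x2000, 0x1ffc, 4)); /* 1 */
    printf("%d\n", pcc_fetch_in_bounds(0x1000, 0x2000, 0x1ffe, 4)); /* 0 */
    return 0;
}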
tb_find searches for an already-translated basic block so it can be reused rather than re-translated; on a miss it calls tb_gen_code.
static inline TranslationBlock *tb_find(CPUState *cpu,
TranslationBlock *last_tb,
int tb_exit, uint32_t cf_mask)
{
TranslationBlock *tb;
target_ulong cs_base, cs_top = 0, pc;
uint32_t cheri_flags = 0;
uint32_t flags;
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &cs_top, &cheri_flags, &flags,
cf_mask);
if (tb == NULL) {
mmap_lock();
tb = tb_gen_code(cpu, pc, cs_base, cs_top, cheri_flags, flags, cf_mask);
mmap_unlock();
/* We add the TB in the virtual pc hash table for the fast lookup */
atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
}
...
}
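The fast path relies on cpu->tb_jmp_cache, a small direct-mapped cache indexed by a hash of the guest PC. A minimal sketch of the idea (cache size and hash are illustrative, not QEMU's actual tb_jmp_cache_hash_func):
#include <stdint.h>

#define TB_JMP_CACHE_BITS 12                 /* illustrative size */
#define TB_JMP_CACHE_SIZE (1u << TB_JMP_CACHE_BITS)

typedef struct TranslationBlock TranslationBlock;  /* opaque here */

static TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];

/* Illustrative hash: QEMU's real function mixes high and low PC bits. */
static unsigned tb_jmp_cache_hash(uint64_t pc)
{
    return (unsigned)((pc >> 2) ^ (pc >> 14)) & (TB_JMP_CACHE_SIZE - 1);
}

/* Fast path of tb_find. Because the cache is direct-mapped, a real
 * implementation must also verify that the returned TB's pc, cs_base and
 * flags match before using it; on a miss, the caller falls back to the
 * global hash table and finally to tb_gen_code, as in the code above. */
static TranslationBlock *tb_cache_lookup(uint64_t pc)
{
    return tb_jmp_cache[tb_jmp_cache_hash(pc)];
}

static void tb_cache_insert(uint64_t pc, TranslationBlock *tb)
{
    tb_jmp_cache[tb_jmp_cache_hash(pc)] = tb;
}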
After a basic block has been translated, it is executed via cpu_loop_exec_tb:
// accel/tcg/cpu-exec.c
/* main execution loop */
int cpu_exec(CPUState *cpu)
{
...
/* if an exception is pending, we execute it here */
while (!cpu_handle_exception(cpu, &ret)) {
TranslationBlock *last_tb = NULL;
int tb_exit = 0;
while (!cpu_handle_interrupt(cpu, &last_tb)) {
uint32_t cflags = cpu->cflags_next_tb;
TranslationBlock *tb;
...
tb = tb_find(cpu, last_tb, tb_exit, cflags);
cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit);
...
}
}
...
}
cpu_loop_exec_tb
// call path:
cpu_loop_exec_tb // accel/tcg/cpu-exec.c
=> cpu_tb_exec // accel/tcg/cpu-exec.c
=> tcg_qemu_tb_exec // tcg/tci.c (TCI interpreter) or the macro below
tcg_qemu_tb_exec can have two definitions:
// include/tcg/tcg.h
#ifdef HAVE_TCG_QEMU_TB_EXEC
uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
#else
# define tcg_qemu_tb_exec(env, tb_ptr) \
((uintptr_t (*)(void *, void *))tcg_ctx->code_gen_prologue)(env, tb_ptr)
#endif
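The second definition is nothing more than a cast of the prologue buffer to a two-argument function plus a call through it. A hedged, expanded equivalent of the macro:
#include <stdint.h>

/* Expanded form of the tcg_qemu_tb_exec macro above: treat the start of
 * the prologue buffer as a function taking (env, tb_ptr) and call it. The
 * prologue saves host registers and then jumps to the translated code at
 * tb_ptr. */
typedef uintptr_t (*tb_exec_fn)(void *env, void *tb_ptr);

static uintptr_t call_generated_code(void *code_gen_prologue,
                                     void *env, void *tb_ptr)
{
    tb_exec_fn fn = (tb_exec_fn)code_gen_prologue;
    return fn(env, tb_ptr);
}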
In this CHERI-based version, this second definition is the one used: tcg_qemu_tb_exec expands to a call through a code pointer, tcg_ctx->code_gen_prologue. This pointer is set up under the different QEMU modes (system mode, linux-user, and bsd-user), but in all of them it is initialized by tcg_prologue_init:
// tcg/tcg.c
void tcg_prologue_init(TCGContext *s)
{
size_t prologue_size, total_size;
void *buf0, *buf1;
/* Put the prologue at the beginning of code_gen_buffer. */
buf0 = s->code_gen_buffer;
total_size = s->code_gen_buffer_size;
s->code_ptr = buf0;
s->code_buf = buf0;
s->data_gen_ptr = NULL;
s->code_gen_prologue = buf0;
...
/* Generate the prologue. */
tcg_target_qemu_prologue(s);
...
/* Deduct the prologue from the buffer. */
prologue_size = tcg_current_code_size(s);
s->code_gen_ptr = buf1;
s->code_gen_buffer = buf1;
s->code_buf = buf1;
total_size -= prologue_size;
s->code_gen_buffer_size = total_size;
tcg_register_jit(s->code_gen_buffer, total_size);
#ifdef DEBUG_DISAS
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
FILE *logfile = qemu_log_lock();
qemu_log("PROLOGUE: [size=%zu]\n", prologue_size);
if (s->data_gen_ptr) {
size_t code_size = s->data_gen_ptr - buf0;
size_t data_size = prologue_size - code_size;
size_t i;
log_disas(buf0, code_size);
for (i = 0; i < data_size; i += sizeof(tcg_target_ulong)) {
if (sizeof(tcg_target_ulong) == 8) {
qemu_log("0x%08" PRIxPTR ": .quad 0x%016" PRIx64 "\n",
(uintptr_t)s->data_gen_ptr + i,
*(uint64_t *)(s->data_gen_ptr + i));
} else {
qemu_log("0x%08" PRIxPTR ": .long 0x%08x\n",
(uintptr_t)s->data_gen_ptr + i,
*(uint32_t *)(s->data_gen_ptr + i));
}
}
} else {
log_disas(buf0, prologue_size);
}
qemu_log("\n");
qemu_log_flush();
qemu_log_unlock(logfile);
}
#endif
/* Assert that goto_ptr is implemented completely. */
if (TCG_TARGET_HAS_goto_ptr) {
tcg_debug_assert(s->code_gen_epilogue != NULL);
}
}
We can see that TCGContext.code_gen_buffer (buf0) is assigned to tcg_ctx->code_gen_prologue. So we need to find out where TCGContext.code_gen_buffer points.
tcg_prologue_init is called in three places, one per QEMU mode, each passing the thread's tcg_ctx:
accel/tcg/translate-all.c (system mode)
linux-user/main.c
bsd-user/main.c
Now we focus on the system mode version:
// accel/tcg/translate-all.c
/* Must be called before using the QEMU cpus. 'tb_size' is the size
(in bytes) allocated to the translation buffer. Zero means default
size. */
void tcg_exec_init(unsigned long tb_size)
{
tcg_allowed = true;
cpu_gen_init();
page_init();
tb_htable_init();
code_gen_alloc(tb_size);
#if defined(CONFIG_SOFTMMU)
/* There's no guest base to take into account, so go ahead and
initialize the prologue now. */
tcg_prologue_init(tcg_ctx);
#endif
}
The tcg_ctx is a thread-local variable:
__thread TCGContext *tcg_ctx; // accel/tcg/translate-all.c
Its TCGContext.code_gen_buffer is initialized in code_gen_alloc, where a dynamic translation buffer is allocated (with mmap() on POSIX hosts).
// accel/tcg/translate-all.c
static inline void code_gen_alloc(size_t tb_size)
{
tcg_ctx->code_gen_buffer_size = size_code_gen_buffer(tb_size);
tcg_ctx->code_gen_buffer = alloc_code_gen_buffer();
if (tcg_ctx->code_gen_buffer == NULL) {
fprintf(stderr, "Could not allocate dynamic translator buffer\n");
exit(1);
}
}
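alloc_code_gen_buffer itself is elided here; at its core it maps an executable buffer. A minimal POSIX sketch of that allocation (illustrative; the real function also handles static buffers, size clamping, guard pages, and Windows VirtualAlloc):
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

/* Illustrative stand-in for alloc_code_gen_buffer(): an anonymous
 * read/write/execute mapping that will hold generated host code. */
static void *alloc_exec_buffer(size_t size)
{
    void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return buf == MAP_FAILED ? NULL : buf;
}

int main(void)
{
    void *buf = alloc_exec_buffer(1024 * 1024);
    if (buf == NULL) {
        fprintf(stderr, "Could not allocate dynamic translator buffer\n");
        exit(1);
    }
    printf("code_gen_buffer at %p\n", buf);
    return 0;
}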
tcg_prologue_init assigns this buffer to TCGContext.code_gen_prologue and generates the prologue code into it by calling tcg_target_qemu_prologue(s). tcg_target_qemu_prologue() is a host-dependent function implemented in tcg/<arch>/tcg-target.inc.c (in TCG terminology, the "target" is the host architecture code is generated for). It generates a single process-wide prologue, once, at QEMU startup; the prologue differs depending on the architecture QEMU runs on.
For example, if QEMU is running on an x86 host (32/64-bit), the prologue-generating code is defined in tcg/i386/tcg-target.inc.c:
// tcg/i386/tcg-target.inc.c
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
int i, stack_addend;
/* TB prologue */
// we get here when QEMU is running on an x86 host
printf("i386 tcg_target_qemu_prologue\n");
/* Reserve some stack space, also for TCG temps. */
stack_addend = FRAME_SIZE - PUSH_SIZE;
tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
CPU_TEMP_BUF_NLONGS * sizeof(long));
/* Save all callee saved registers. */
for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
tcg_out_push(s, tcg_target_callee_save_regs[i]);
}
#if TCG_TARGET_REG_BITS == 32
tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
(ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
/* jmp *tb. */
tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
(ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
+ stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
if (guest_base) {
int seg = setup_guest_base_seg();
if (seg != 0) {
x86_guest_base_seg = seg;
} else if (guest_base == (int32_t)guest_base) {
x86_guest_base_offset = guest_base;
} else {
/* Choose R12 because, as a base, it requires a SIB byte. */
x86_guest_base_index = TCG_REG_R12;
tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
}
}
# endif
tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
/* jmp *tb. */
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif
/*
* Return path for goto_ptr. Set return value to 0, a-la exit_tb,
* and fall through to the rest of the epilogue.
*/
s->code_gen_epilogue = s->code_ptr;
tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
/* TB epilogue */
tb_ret_addr = s->code_ptr;
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
if (have_avx2) {
tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
}
for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
tcg_out_pop(s, tcg_target_callee_save_regs[i]);
}
tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
Each tcg_out_xxx function emits an instruction (or part of one) that can run on the host x86 architecture. For example, a 4-byte unit can be emitted via tcg_out32(TCGContext *s, uint32_t v):
#if TCG_TARGET_INSN_UNIT_SIZE <= 4
static __attribute__((unused)) inline void tcg_out32(TCGContext *s, uint32_t v)
{
if (TCG_TARGET_INSN_UNIT_SIZE == 4) {
*s->code_ptr++ = v;
} else {
tcg_insn_unit *p = s->code_ptr;
memcpy(p, &v, sizeof(v));
s->code_ptr = p + (4 / TCG_TARGET_INSN_UNIT_SIZE);
}
}
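As a usage sketch, here is how emitters in this style compose a real host instruction sequence, e.g. mov eax, imm32; ret on x86, where TCG_TARGET_INSN_UNIT_SIZE is 1 (illustrative, not QEMU code):
#include <stdint.h>
#include <string.h>

typedef uint8_t tcg_insn_unit;          /* x86: one insn unit == one byte */

typedef struct {
    tcg_insn_unit *code_ptr;            /* next free byte in the buffer */
} MiniCtx;

/* Byte emitter in the style of QEMU's tcg_out8(). */
static void emit8(MiniCtx *s, uint8_t v)
{
    *s->code_ptr++ = v;
}

/* 32-bit emitter in the style of tcg_out32() above. */
static void emit32(MiniCtx *s, uint32_t v)
{
    memcpy(s->code_ptr, &v, sizeof(v));
    s->code_ptr += sizeof(v);
}

/* Emit "mov eax, imm32; ret" into the buffer (x86 encoding). */
static void emit_return_imm(MiniCtx *s, uint32_t imm)
{
    emit8(s, 0xB8);                     /* mov eax, imm32 */
    emit32(s, imm);
    emit8(s, 0xC3);                     /* ret */
}
Bytes emitted this way into the RWX buffer from the earlier mmap sketch could then be entered through exactly the kind of function-pointer cast shown for tcg_qemu_tb_exec.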
After each translated basic block is executed, QEMU uses the simulated Program Counter (PC) and other CPU state information (such as the CS segment base value on x86) to find the next basic block.
Direct block chaining: to accelerate the common case where the next simulated PC is known at translation time, QEMU can patch a basic block so that it jumps directly to the next one.
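A minimal model of direct block chaining (illustrative data structures, not QEMU's real TranslationBlock layout): each block's patched next pointer stands in for the rewritten host jump, so execution flows block to block without returning to the dispatch loop:
#include <stdint.h>
#include <stddef.h>

typedef struct Block {
    uint64_t pc;             /* guest PC this block starts at */
    struct Block *next;      /* patched successor; NULL = exit to cpu_exec */
} Block;

/* Patch a block so it jumps straight to its successor. In QEMU this is a
 * rewrite of the host jump instruction at the end of the generated code. */
static void chain(Block *from, Block *to)
{
    from->next = to;
}

/* Chained execution: no tb_find lookup is needed while 'next' is patched;
 * only when it is NULL do we fall back to the dispatcher in cpu_exec. */
static void exec_chained(Block *b, uint64_t *blocks_run)
{
    while (b != NULL) {
        (*blocks_run)++;     /* "execute" the block */
        b = b->next;
    }
}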