parse_notes()
in sys/kern/imgact_elf.c
---
References:
The overview of how an execve system call loads and starts executing a binary was previously discussed for Linux. This post tracks the source code in more detail to show how an ELF file is parsed and mapped into the address space in FreeBSD; the implementation is similar.
FreeBSD has a struct image_params imgp that stores information similar to struct linux_binprm in Linux.
do_execve() handles the syscall; struct image_params imgp carries ->newcred, ->freepath, ->execpath, etc.
// sys/kern/kern_exec.c
/*
 * In-kernel implementation of execve(). All arguments are assumed to be
 * userspace pointers from the passed thread.
 */
static int
do_execve(struct thread *td, struct image_args *args,
void * __capability umac){
...
/*
 * Loop through the list of image activators, calling each one.
 * An activator returns -1 if there is no match, 0 on success,
 * and an error otherwise.
 */
for (i = 0; error == -1 && execsw[i]; ++i) {
/*
 * Skip empty slots and the img_first entry.
 * NOTE(review): img_first is presumably invoked separately,
 * before this loop -- confirm against sys/kern/kern_exec.c.
 */
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
/* Invoke the activator, e.g. exec_elf64_imgact() for ELF64 binaries. */
error = (*execsw[i]->ex_imgact)(imgp);
}
...
}
The struct execsw definition for ELF files:
// in file: sys/kern/imgact_elf.c
/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
/* Activator descriptor for this ELF word size (ELF32 or ELF64). */
static struct execsw __elfN(execsw) = {
.ex_imgact = __CONCAT(exec_, __elfN(imgact)), // -> e.g. `exec_elf64_imgact`
.ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) // -> e.g. "ELF64"
};
/* Add the descriptor to the execsw linker set that do_execve() iterates over. */
EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
// __elfN is defined as a macro:
// in file: sys/sys/elf_generic.h
// __elfN(x) pastes the ELF word size into the name: elf32_x or elf64_x.
#define __elfN(x) __CONCAT(__CONCAT(__CONCAT(elf,__ELF_WORD_SIZE),_),x)
// function __CONCAT(exec_, __elfN(imgact)) is defined as
// in file sys/kern/imgact_elf.c
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
/* Body elided in this excerpt; the parsing loop is shown further below. */
}
References:
An ELF binary is loaded by the function __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp). The function name is built with macros and can compile to exec_elf64_imgact(), for example.
Steps in exec_elfN_imgact():
- Parse the program headers: PT_LOAD, PT_INTERP, PT_GNU_STACK, and PT_PHDR; update image_params accordingly.
- __elfN(get_brandinfo): determine the ABI brand; decide whether the binary is dynamic (dso) and set imgp->map_flags accordingly.
- exec_new_vmspace(imgp, sv): clear the old virtual address space, and set up a new stack.
- __elfN(load_sections): load the contents of the sections.
- __elfN(enforce_limits): enforce resource limits.
// sys/kern/imgact_elf.c
// http://fxr.watson.org/fxr/source/kern/imgact_elf.c?v=FREEBSD-12-STABLE#L1106
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
/// ...
/* First pass over the program headers: gather layout information. */
for (i = 0; i < hdr->e_phnum; i++) {
switch (phdr[i].p_type) {
case PT_LOAD:
/* Remember the virtual address of the first loadable segment. */
if (n == 0)
baddr = phdr[i].p_vaddr;
/* Track the strictest alignment requested by any PT_LOAD segment. */
if (phdr[i].p_align > maxalign)
maxalign = phdr[i].p_align;
/* Accumulate the total in-memory size of loadable segments. */
mapsz += phdr[i].p_memsz;
n++;
/*
 * If this segment contains the program headers,
 * remember their virtual address for the AT_PHDR
 * aux entry. Static binaries don't usually include
 * a PT_PHDR entry.
 */
if (phdr[i].p_offset == 0 &&
hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
<= phdr[i].p_filesz)
proghdr = phdr[i].p_vaddr + hdr->e_phoff;
break;
case PT_INTERP:
/* Path to interpreter */
/* Reject binaries with more than one interpreter entry. */
if (interp != NULL) {
uprintf("Multiple PT_INTERP headers\n");
error = ENOEXEC;
goto ret;
}
error = __elfN(get_interp)(imgp, &phdr[i], &interp,
&free_interp);
if (error != 0)
goto ret;
break;
case PT_GNU_STACK:
/* Honor the requested stack protection only if NX stacks are enabled. */
if (__elfN(nxstack))
imgp->stack_prot =
__elfN(trans_prot)(phdr[i].p_flags);
imgp->stack_sz = phdr[i].p_memsz;
break;
case PT_PHDR: /* Program header table info */
/* Explicit table address; proghdr may also be set by the PT_LOAD case. */
proghdr = phdr[i].p_vaddr;
break;
}
}
/// ...
}
get_brandinfo
—> parse_notes()
:
Call path:
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
=> brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel, &fctl0);
static Elf_Brandinfo *
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
int32_t *osrel, uint32_t *fctl0)
=> ret = __elfN(check_note)(imgp, bi->brand_note, osrel, fctl0);
=> static boolean_t
__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
int32_t *osrel, uint32_t *fctl0){
for (i = 0; i < hdr->e_phnum; i++) {
if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp,
&brandnote->hdr, brandnote->vendor, &phdr[i], brandnote_cb,
&b_arg)) {
for (j = 0; j < hdr->e_phnum; j++) {
if (phdr[j].p_type == PT_NOTE &&
__elfN(parse_notes)(imgp, &fctl_note,
FREEBSD_ABI_VENDOR, &phdr[j],
note_fctl_cb, &f_arg))
break;
}
return (TRUE);
}
}
}
=> __elfN(parse_notes)(imgp, &fctl_note,
FREEBSD_ABI_VENDOR, &phdr[j],
note_fctl_cb, &f_arg);
parse_notes()
:
// sys/kern/imgact_elf.c
/*
 * Scan the PT_NOTE segment described by pnote for a note whose header
 * fields (n_namesz, n_descsz, n_type) match checknote and whose name
 * matches note_vendor.  On a match, cb() is invoked with the note and
 * cb_arg; if cb() returns TRUE the search stops and the value it stored
 * through its boolean_t * argument is returned.  Returns FALSE when no
 * note matched or the segment was unusable.
 */
static boolean_t
__elfN(parse_notes)(struct image_params *imgp, Elf_Note *checknote,
const char *note_vendor, const Elf_Phdr *pnote,
boolean_t (*cb)(const Elf_Note *, void *, boolean_t *), void *cb_arg)
{
const Elf_Note *note, *note0, *note_end;
const char *note_name;
char *buf;
int i, error;
boolean_t res;
/* We need some limit, might as well use PAGE_SIZE. */
if (pnote == NULL || pnote->p_filesz > PAGE_SIZE)
return (FALSE);
/* Caller must hold the vnode lock across the vn_rdwr() below. */
ASSERT_VOP_LOCKED(imgp->vp, "parse_notes");
/*
 * imgp->image_header covers only the first page of the file.  If the
 * note data does not lie entirely within that page, read it into a
 * temporary buffer instead of using the mapped header.
 */
if (pnote->p_offset > PAGE_SIZE ||
pnote->p_filesz > PAGE_SIZE - pnote->p_offset) {
/*
 * Try a non-sleeping allocation first so the vnode lock can be
 * kept; only on failure drop the lock, sleep in malloc(), and
 * re-acquire the lock in shared mode.
 */
buf = malloc(pnote->p_filesz, M_TEMP, M_NOWAIT);
if (buf == NULL) {
VOP_UNLOCK(imgp->vp);
buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK);
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
}
error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz,
pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED,
curthread->td_ucred, NOCRED, NULL, curthread);
if (error != 0) {
uprintf("i/o error PT_NOTE\n");
goto retf;
}
note = note0 = (const Elf_Note *)buf;
note_end = (const Elf_Note *)(buf + pnote->p_filesz);
} else {
/* Note data is already mapped via image_header; no copy needed. */
note = note0 = (const Elf_Note *)(imgp->image_header +
pnote->p_offset);
note_end = (const Elf_Note *)(imgp->image_header +
pnote->p_offset + pnote->p_filesz);
buf = NULL;
}
/* Walk the note list; cap at 100 entries to bound malformed input. */
for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
/*
 * Reject misaligned or truncated note headers.
 * NOTE(review): the pointer difference is a signed ptrdiff_t
 * compared against an unsigned sizeof() -- later FreeBSD casts
 * the difference to avoid sign-conversion surprises; confirm.
 */
if (!aligned(note, Elf32_Addr) || (const char *)note_end -
(const char *)note < sizeof(Elf_Note)) {
goto retf;
}
/* Cheap fixed-header comparison before touching the name string. */
if (note->n_namesz != checknote->n_namesz ||
note->n_descsz != checknote->n_descsz ||
note->n_type != checknote->n_type)
goto nextnote;
/* The vendor name immediately follows the fixed-size note header. */
note_name = (const char *)(note + 1);
if (note_name + checknote->n_namesz >=
(const char *)note_end || strncmp(note_vendor,
note_name, checknote->n_namesz) != 0)
goto nextnote;
/* Matching note: let the callback decide whether to stop scanning. */
if (cb(note, cb_arg, &res))
goto ret;
nextnote:
/* Skip name and descriptor, each rounded up to ELF_NOTE_ROUNDSIZE. */
note = (const Elf_Note *)((const char *)(note + 1) +
roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) +
roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE));
}
retf:
res = FALSE;
ret:
/* buf is NULL on the mapped-page path; free(NULL, ...) is a no-op. */
free(buf, M_TEMP);
return (res);
}
exec_new_vmspace()
: clear old address space, and set up a new stack.
Call path:
do_execve()
=> __CONCAT(exec_, __elfN(imgact))() // sys/kern/imgact_elf.c
=> exec_new_vmspace(imgp, sv) // sys/kern/kern_exec.c
Input is imgp
and struct sysentvec *sv
. struct sysentvec
contains
a pointer to system call table,
a function translating trap-to-signal mapping,
…
steps in exec_new_vmspace()
:
Allocate stack.
// sys/kern/kern_exec.c
/*
 * Destroy old address space, and allocate a new stack.
 * The new stack is only sgrowsiz large because it is grown
 * automatically on a page fault.
 */
int
exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
{
/* Body elided; sv supplies the per-ABI parameters (syscall table, etc.). */
..
}
Call path:
load_sections
=> load_section
=> map_insert
// sys/kern/imgact_elf.c
// load_sections => load_section
/* Walk the program headers and map every loadable segment into the new map. */
static int
__elfN(load_sections)(struct image_params *imgp, const Elf_Ehdr *hdr,
const Elf_Phdr *phdr, u_long rbase, u_long *base_addrp, u_long *max_addrp)
{
for (i = 0; i < hdr->e_phnum; i++) {
/* Only PT_LOAD segments with a non-zero memory size are mapped. */
if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0)
continue;
/* Loadable segment */
/* Translate ELF p_flags (R/W/X) into VM protection bits. */
prot = __elfN(trans_prot)(phdr[i].p_flags);
/* NOTE(review): rbase is presumably the relocation base added to
 * p_vaddr (e.g. for PIE / interpreter loads) -- confirm at callers. */
error = __elfN(load_section)(imgp, phdr[i].p_offset,
(caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
phdr[i].p_memsz, phdr[i].p_filesz, prot);
if (error != 0)
return (error);
...
}
load_section => map_insert
…
/* Map one PT_LOAD segment: file-backed pages for the initialized part. */
static int
__elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
{
object = imgp->object;
map = &imgp->proc->p_vmspace->vm_map;
/* Page-align both the destination address and the file offset. */
map_addr = trunc_page((vm_offset_t)vmaddr);
file_addr = trunc_page(offset);
/*
 * We have two choices. We can either clear the data in the last page
 * of an oversized mapping, or we can start the anon mapping a page
 * early and copy the initialized data into that first page. We
 * choose the second.
 */
if (filsz == 0)
map_len = 0;
else if (memsz > filsz)
/* BSS follows: truncate so the final partial page becomes anonymous. */
map_len = trunc_page(offset + filsz) - file_addr;
else
map_len = round_page(offset + filsz) - file_addr;
...
if (map_len != 0) {
...
rv = __elfN(map_insert)(imgp, map, object, file_addr,
map_addr, map_addr + map_len, prot, cow);
...
}
...
}
map_insert => { map_partial; vm_map_fixed; copyout_implicit_cap; vm_imgact_map_page; vm_imgact_unmap_page; }
/*
 * Insert a file-backed range [start, end) into the target map.  An
 * unaligned file offset forces a copy into anonymous memory; an
 * aligned offset lets the vnode's VM object be mapped directly.
 */
static int
__elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object,
vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot,
int cow)
{
/* Handle any leading sub-page portion up to the next page boundary. */
rv = __elfN(map_partial)(map, object, offset, start,
round_page(start), prot);
...
if ((offset & PAGE_MASK) != 0) {
/*
 * The mapping is not page aligned. This means that we have
 * to copy the data.
 */
/* Install writable anonymous pages over the whole range first. */
rv = vm_map_fixed(map, NULL, 0, start, end - start,
prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
if (rv != KERN_SUCCESS)
return (rv);
/* No backing object: nothing to copy, the anon pages suffice. */
if (object == NULL)
return (KERN_SUCCESS);
/* Copy the file contents page by page through an sf_buf kernel window. */
for (; start < end; start += sz) {
sf = vm_imgact_map_page(object, offset);
if (sf == NULL)
return (KERN_FAILURE);
/* off = byte offset of `offset` within its page. */
off = offset - trunc_page(offset);
sz = end - start;
if (sz > PAGE_SIZE - off)
sz = PAGE_SIZE - off;
error = copyout_implicit_cap((caddr_t)sf_buf_kva(sf) + off,
(caddr_t)start, sz);
vm_imgact_unmap_page(sf);
if (error != 0)
return (KERN_FAILURE);
offset += sz;
}
} else {
/*
 * Page-aligned: map the vnode's VM object directly.
 * NOTE(review): vm_object_reference() is called unconditionally
 * while later code guards object != NULL -- confirm object cannot
 * be NULL on this path.
 */
vm_object_reference(object);
rv = vm_map_fixed(map, object, offset, start, end - start,
prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL |
(object != NULL ? MAP_VN_EXEC : 0));
if (rv != KERN_SUCCESS) {
/* Undo the reference; drop/reacquire the vnode lock around it. */
locked = VOP_ISLOCKED(imgp->vp);
VOP_UNLOCK(imgp->vp);
vm_object_deallocate(object);
vn_lock(imgp->vp, locked | LK_RETRY);
return (rv);
} else if (object != NULL) {
MPASS(imgp->vp->v_object == object);
/* Mark the vnode as backing executable (text) mappings. */
VOP_SET_TEXT_CHECKED(imgp->vp);
}
}
return (KERN_SUCCESS);
}
If you could revise the fundamental principles of computer system design to improve security, what would you change?