~laumann/hadb

31d85f22fc8e29c115835a41cde6fff4b49d3784 — Thomas Bracht Laumann Jespersen 4 months ago af60fd9
format/dwarf: decode .debug_abbrev

This is going in a more sans-io manner where all the library can be
fed is bytes, and API functions return either a tuple containing the
result and how many bytes were read, or an error indicating a
partialread (i.e. not enough bytes were available).
4 files changed, 428 insertions(+), 5 deletions(-)

A format/dwarf/abbrev.ha
M format/dwarf/debuginfo.ha
A format/dwarf/leb128.ha
M format/dwarf/types.ha
A format/dwarf/abbrev.ha => format/dwarf/abbrev.ha +136 -0
@@ 0,0 1,136 @@
// Decoder for the .debug_abbrev section
//
// The .debug_abbrev (abbreviations section) contains the schema for
// all the DIE types in .debug_info.
//
// Ref: https://wiki.osdev.org/DWARF#The_.debug_abbrev_section
// DWARF 5, Section 7.5.3 (page 203)

// Releases a list of abbreviations: the attribute list owned by each entry
// is freed first, followed by the list itself.
export fn freeabbrevs(abbrevs: []abbrev) void = {
	let idx = 0z;
	for (idx < len(abbrevs)) {
		free(abbrevs[idx].attrs);
		idx += 1;
	};
	free(abbrevs);
};

// Decodes one abbreviation table from the start of the input.
//
// A table is a series of abbreviation declarations. Each declaration begins
// with a ULEB128 abbreviation code (the code is what .debug_info refers to),
// followed by a ULEB128 tag, a one-byte "has children" flag and a list of
// (attribute, form) ULEB128 pairs terminated by a (0, 0) pair. The table
// itself is terminated by an abbreviation code of 0.
//
// Returns the abbreviations in a list together with the number of input
// bytes consumed, including the terminating null code. References to
// .debug_str are not resolved. If the input ends before the table is
// complete, partialread is returned and everything allocated so far is
// released. The caller must free the return value with freeabbrevs.
export fn decodeabbrev(input: []u8) (([]abbrev, size) | partialread) = {
	let p = input;

	let ret: []abbrev = [];
	let ok = false;
	// On any early return, release the list and the attribute lists of
	// the declarations that were already decoded.
	defer if (!ok) freeabbrevs(ret);

	// Begin parsing entries
	for (true) {
		const (entry, r) = readuleb128(p)?;
		// Consume the code before testing it so that the terminating
		// null code is accounted for whatever its encoded length.
		p = p[r..];
		if (entry == 0)
			break;
		const (t, r) = readuleb128(p)?;
		p = p[r..];
		if (len(p) == 0)
			return partialread;
		const haschildren = p[0] == 1;
		p = p[1..];

		// parse AT-FORM pairs
		let attrs: [](attr, form) = [];
		// Until the list is handed over to ret it is owned by this
		// iteration and must be freed if decoding stops early.
		let stored = false;
		defer if (!stored) free(attrs);
		for (true) {
			const (at, r) = readuleb128(p)?;
			p = p[r..];
			const (dwform, r) = readuleb128(p)?;
			p = p[r..];
			if (at == 0 && dwform == 0)
				break; // done
			if ((dwform: form) == form::IMPLICIT_CONST) {
				// DW_FORM_implicit_const stores its value
				// as a SLEB128 directly in the declaration.
				// The attribute list has no slot for it, so
				// it is skipped to keep the cursor in sync.
				const skipped = readsleb128(p)?;
				p = p[skipped.1..];
			};
			append(attrs, (at: attr, dwform: form));
		};
		append(ret, abbrev {
			code = entry,
			tag = t: tag,
			haschildren = haschildren,
			attrs = attrs,
		});
		stored = true;
	};
	ok = true;
	return (ret, len(input) - len(p));
};

// Decodes a small hand-written abbreviation table with three declarations
// and compares every field, including each (attribute, form) pair, against
// the expected values. Also checks that the whole input is reported as read.
@test fn simple_abbrev() void = {
	const bin: []u8 = [
		0x01, 0x11, 0x01, 0x25,
		0x0E, 0x13, 0x0B, 0x03,
		0x0E, 0x1B, 0x0E, 0x11,
		0x01, 0x12, 0x06, 0x10,
		0x17, 0x00, 0x00, 0x02,
		0x16, 0x00, 0x03, 0x0E,
		0x3A, 0x0B, 0x3B, 0x0B,
		0x49, 0x13, 0x00, 0x00,
		0x03, 0x24, 0x00, 0x0B,
		0x0B, 0x3E, 0x0B, 0x03,
		0x0E, 0x00, 0x00, 0x00,
	];
	const expected: []abbrev = [
		abbrev {
			code = 0x1,
			tag = tag::COMPILE_UNIT,
			haschildren = true,
			attrs = [
				(attr::PRODUCER, form::STRP),
				(attr::LANGUAGE, form::DATA1),
				(attr::NAME, form::STRP),
				(attr::COMP_DIR, form::STRP),
				(attr::LOW_PC, form::ADDRESS),
				(attr::HIGH_PC, form::DATA4),
				(attr::STMT_LIST, form::SEC_OFFSET),
			],
		},
		abbrev {
			code = 0x2,
			tag = tag::TYPEDEF,
			haschildren = false,
			attrs = [
				(attr::NAME, form::STRP),
				(attr::DECL_FILE, form::DATA1),
				(attr::DECL_LINE, form::DATA1),
				(attr::TYPE, form::REF4),
			],
		},
		abbrev {
			code = 0x3,
			tag = tag::BASE_TYPE,
			haschildren = false,
			attrs = [
				(attr::BYTE_SIZE, form::DATA1),
				(attr::ENCODING, form::DATA1),
				(attr::NAME, form::STRP),
			],
		},
	];

	let (abbrevs, read) = decodeabbrev(bin)!;
	defer freeabbrevs(abbrevs);
	assert(len(abbrevs) == len(expected));
	assert(read == len(bin));
	for (let i = 0z; i < len(abbrevs); i += 1) {
		const a = abbrevs[i];
		const e = expected[i];
		assert(a.code == e.code);
		assert(a.tag == e.tag);
		assert(a.haschildren == e.haschildren);
		assert(len(a.attrs) == len(e.attrs));
		for (let j = 0z; j < len(a.attrs); j += 1) {
			// Index with j (the attribute index), not i (the
			// declaration index), so every pair is compared.
			const aa = a.attrs[j];
			const ee = e.attrs[j];
			assert(aa.0 == ee.0);
			assert(aa.1 == ee.1);
		};
	};
};


M format/dwarf/debuginfo.ha => format/dwarf/debuginfo.ha +7 -5
@@ 5,7 5,7 @@
// Each entry in .debug_info corresponds to a compilation unit

// A single entry in .debug_info
type debuginfo = struct {
export type debuginfo = struct {
	dwarf_version: u16,
	abbrev_offset: u32,
	ptr_size: u8, // size of addresses of target (in bytes)


@@ 14,15 14,17 @@ type debuginfo = struct {

// An entry in .debug_abbrev, commonly referred to as DIE (Debugging
// Information Entry)
type die = struct {
export type die = struct {
	code: uint, // uleb128
	tag: uint,
	children: []die
};

type abbrev = struct {
	code: uint, // uleb128
	tag: tag
export type abbrev = struct {
	code: uleb128,
	tag: tag,
	haschildren: bool,
	attrs: [](attr, form),
};

// FIXME: Convert the following hex to a proper unit test

A format/dwarf/leb128.ha => format/dwarf/leb128.ha +85 -0
@@ 0,0 1,85 @@
use fmt;

// Read an unsigned LEB128 number from the given buffer, returning the
// number and the number of bytes consumed.
//
// Each input byte contributes its low seven bits, least significant group
// first; a clear high bit marks the last byte. If the buffer ends before a
// terminating byte is seen, partialread is returned. Bits that do not fit in
// uleb128 are discarded, so an over-long encoding yields the value truncated
// to the width of uleb128.
fn readuleb128(in: []u8) ((uleb128, size) | partialread) = {
	// Number of bits in the result type; shifting by this much or more
	// is not meaningful, so such groups are skipped.
	const width = (size(uleb128) * 8): uint;
	let result: uleb128 = 0;

	let shift = 0u;
	for (let i = 0z; i < len(in); i += 1) {
		const byte = in[i];
		if (shift < width) {
			let bits: uleb128 = byte & 0x7f;
			result |= bits << shift;
		};
		if (byte & 0x80 == 0)
			return (result, i+1);
		shift += 7;
	};
	return partialread;
};

// Read a signed LEB128 integer, return the resulting number and the
// number of bytes used from the input.
//
// Each input byte contributes its low seven bits, least significant group
// first; a clear high bit marks the last byte, and bit 0x40 of that last
// byte is the sign. If the buffer ends before a terminating byte is seen,
// partialread is returned. Bits that do not fit in the result are discarded.
fn readsleb128(in: []u8) ((sleb128, size) | partialread) = {
	// Number of bits in the accumulator; shifts by this much or more
	// must be avoided.
	const width = (size(uint) * 8): uint;
	let result = 0u;
	let shift = 0u;

	for (let i = 0z; i < len(in); i += 1) {
		const byte = in[i];

		if (shift < width) {
			let bits: uint = byte & 0x7f;
			result |= bits << shift;
		};
		shift += 7;

		if (byte & 0x80 == 0) {
			// Sign-extend only while there are bits left above
			// the decoded groups. Once the groups cover the full
			// width (five-byte encodings) the sign bit is already
			// in place and the mask shift would be out of range.
			if ((byte & 0x40) != 0 && shift < width)
				result |= ~((1u << shift)-1);
			return (result: sleb128, i+1);
		};
	};
	return partialread;
};


// Checks readuleb128 against a table of (input, expected value, bytes read)
// cases. The test is named decodeuleb128 so that it does not share an
// identifier with the readuleb128 function it exercises.
@test fn decodeuleb128() void = {
	const cases: [_]([]u8, uleb128, size) = [
		([2], 2, 1),
		([2, 0x80], 2, 1), // the 0x80 is not read
		([127], 127, 1),
		([0x80, 1], 128, 2),
		([1+0x80, 1], 129, 2),
		([2+0x80, 1], 130, 2),
		([57+0x80, 100], 12857, 2),
	];
	for (let i = 0z; i < len(cases); i += 1) {
		const (input, expected, read) = cases[i];

		const (decoded, n) = readuleb128(input)!;
		assert(decoded == expected);
		assert(n == read);
	};
};

// TODO tests for partialread

// Checks readsleb128 against a table of (input, expected value, bytes read)
// cases covering positive and negative one- and two-byte encodings.
@test fn decodesleb128() void = {
	const table: [_]([]u8, sleb128, size) = [
		([2], 2, 1),
		([0x7e], -2, 1),
		([127 + 0x80, 0], 127, 2),
		([1+0x80, 0x7f], -127, 2),
		([0+0x80, 1], 128, 2),
		([0+0x80, 0x7f], -128, 2),
		([1+0x80, 1], 129, 2),
		([0x7f+0x80, 0x7e], -129, 2),
	];
	for (let k = 0z; k < len(table); k += 1) {
		const tc = table[k];
		const got = readsleb128(tc.0)!;
		assert(got.0 == tc.1);
		assert(got.1 == tc.2);
	};
};

M format/dwarf/types.ha => format/dwarf/types.ha +200 -0
@@ 8,6 8,9 @@ export type sleb128 = int;

// Errors

// Partial read
export type partialread = !void;

// Unexpected EOF
export type unexpectedeof = !void;



@@ 191,3 194,200 @@ export type tag = enum {
};

// Also need something like "attr" and "form"
// Numeric values for DW_AT_* (attribute names).
//
// From DWARF 5. The trailing comment on each member lists the attribute
// classes given for it there; the "page" comments refer to pages of that
// document. Gaps in the numbering are values that have no member here.
export type attr = enum {
	// page 208
	SIBLING = 0x01, // reference
	LOCATION = 0x02, // exprloc, loclist
	NAME = 0x03, // string

	ORDERING = 0x09, // constant

	BYTE_SIZE = 0x0b, // constant, exprloc, reference

	BIT_SIZE = 0x0d, // constant, exprloc, reference

	STMT_LIST = 0x10, // lineptr
	LOW_PC = 0x11, // address
	HIGH_PC = 0x12, // address, constant
	LANGUAGE = 0x13, // constant

	DISCR = 0x15, // reference
	DISCR_VALUE = 0x16, // constant
	VISIBILITY = 0x17, // constant
	IMPORT = 0x18, // reference
	STRING_LENGTH = 0x19, // exprloc, loclist, reference
	COMMON_REFERENCE = 0x1a, // reference
	COMP_DIR = 0x1b, // string
	CONST_VALUE = 0x1c, // block, constant, string
	CONTAINING_TYPE = 0x1d, // reference
	DEFAULT_VALUE = 0x1e, // constant, reference, flag

	INLINE = 0x20, // constant
	IS_OPTIONAL = 0x21, // flag
	LOWER_BOUND = 0x22, // constant, exprloc, reference

	// page 209
	PRODUCER = 0x25, // string

	PROTOTYPED = 0x27, // flag

	RETURN_ADDR = 0x2a, // exprloc, loclist

	START_SCOPE = 0x2c, // constant, rnglist

	BIT_STRIDE = 0x2e, // constant, exprloc, reference
	UPPER_BOUND = 0x2f, // constant, exprloc, reference

	ABSTRACT_ORIGIN = 0x31, // reference
	ACCESSIBILITY = 0x32, // constant
	ADDRESS_CLASS = 0x33, // constant
	ARTIFICIAL = 0x34, // flag
	BASE_TYPES = 0x35, // reference
	CALLING_CONVENTION = 0x36, // constant
	COUNT = 0x37, // constant, exprloc, reference
	DATA_MEMBER_LOCATION = 0x38, // constant, exprloc, loclist
	DECL_COLUMN = 0x39, // constant
	DECL_FILE = 0x3a, // constant
	DECL_LINE = 0x3b, // constant
	DECLARATION = 0x3c, // flag
	DISCR_LIST = 0x3d, // block
	ENCODING = 0x3e, // constant
	EXTERNAL = 0x3f, // flag
	FRAME_BASE = 0x40, // exprloc, loclist
	FRIEND = 0x41, // reference
	IDENTIFIER_CASE = 0x42, // constant

	// page 210
	NAMELIST_ITEM = 0x44, // reference
	PRIORITY = 0x45, // reference
	SEGMENT = 0x46, // exprloc, loclist
	SPECIFICATION = 0x47, // reference
	STATIC_LINK = 0x48, // exprloc, loclist
	TYPE = 0x49, // reference
	USE_LOCATION = 0x4a, // exprloc, loclist
	VARIABLE_PARAMETER = 0x4b, // flag
	VIRTUALITY = 0x4c, // constant
	VTABLE_ELEM_LOCATION = 0x4d, // exprloc, loclist
	ALLOCATED = 0x4e, // constant, exprloc, reference
	ASSOCIATED = 0x4f, // constant, exprloc, reference
	DATA_LOCATION = 0x50, // exprloc
	BYTE_STRIDE = 0x51, // constant, exprloc, reference
	ENTRY_PC = 0x52, // address, constant
	USE_UTF8 = 0x53, // flag
	EXTENSION = 0x54, // reference
	RANGES = 0x55, // rnglist
	TRAMPOLINE = 0x56, // address, flag, reference, string
	CALL_COLUMN = 0x57, // constant
	CALL_FILE = 0x58, // constant
	CALL_LINE = 0x59, // constant
	DESCRIPTION = 0x5a, // string
	BINARY_SCALE = 0x5b, // constant
	DECIMAL_SCALE = 0x5c, // constant
	SMALL = 0x5d, // reference
	DECIMAL_SIGN = 0x5e, // constant
	DIGIT_COUNT = 0x5f, // constant
	PICTURE_STRING = 0x60, // string

	// page 211
	MUTABLE = 0x61, // flag
	THREADS_SCALED = 0x62, // flag
	EXPLICIT = 0x63, // flag
	OBJECT_POINTER = 0x64, // reference
	ENDIANITY = 0x65, // constant
	ELEMENTAL = 0x66, // flag
	PURE = 0x67, // flag
	RECURSIVE = 0x68, // flag
	SIGNATURE = 0x69, // reference
	MAIN_SUBPROGRAM = 0x6a, // flag
	DATA_BIT_OFFSET = 0x6b, // constant
	CONST_EXPR = 0x6c, // flag
	ENUM_CLASS = 0x6d, // flag
	LINKAGE_NAME = 0x6e, // string
	STRING_LENGTH_BIT_SIZE = 0x6f, // constant
	STRING_LENGTH_BYTE_SIZE = 0x70, // constant
	RANK = 0x71, // constant, exprloc
	STR_OFFSETS_BASE = 0x72, // stroffsetsptr
	ADDR_BASE = 0x73, // addrptr
	RNGLISTS_BASE = 0x74, // rnglistsptr

	DWO_NAME = 0x76, // string
	REFERENCE = 0x77, // flag
	// NOTE(review): DWARF 5 names 0x78 DW_AT_rvalue_reference; the member
	// name below looks like a typo for RVALUE_REFERENCE. Left unchanged
	// here because renaming an exported member affects callers.
	RLVALUE_REFERENCE = 0x78, // flag
	MACROS = 0x79, // macptr
	CALL_ALL_CALLS = 0x7a, // flag
	CALL_ALL_SOURCE_CALLS = 0x7b, // flag
	CALL_ALL_TAIL_CALLS = 0x7c, // flag
	CALL_RETURN_PC = 0x7d, // address
	CALL_VALUE = 0x7e, // exprloc
	CALL_ORIGIN = 0x7f, // exprloc
	CALL_PARAMETER = 0x80, // reference

	// page 212
	CALL_PC = 0x81, // address
	CALL_TAIL_CALL = 0x82, // flag
	CALL_TARGET = 0x83, // exprloc
	CALL_TARGET_CLOBBERED = 0x84, // exprloc
	CALL_DATA_LOCATION = 0x85, // exprloc
	CALL_DATA_VALUE = 0x86, // exprloc
	NORETURN = 0x87, // flag
	ALIGNMENT = 0x88, // constant
	EXPORT_SYMBOLS = 0x89, // flag
	DELETED = 0x8a, // flag
	DEFAULTED = 0x8b, // constant
	LOCLISTS_BASE = 0x8c, // loclistsptr
	// Bounds of the range reserved for vendor-specific attributes.
	LO_USER = 0x2000,
	HI_USER = 0x3fff,
};

// Numeric values for DW_FORM_* (attribute forms).
//
// The trailing comment on each member lists the attribute classes the form
// can encode; the "page" comments refer to pages of the DWARF 5 document.
export type form = enum {
	// page 220
	ADDRESS = 0x01, // address

	BLOCK2 = 0x03, // block
	BLOCK4 = 0x04, // block
	DATA2 = 0x05, // constant
	DATA4 = 0x06, // constant
	DATA8 = 0x07, // constant
	STRING = 0x08, // string
	BLOCK = 0x09, // block
	BLOCK1 = 0x0a, // block
	DATA1 = 0x0b, // constant
	FLAG = 0x0c, // flag
	SDATA = 0x0d, // constant
	STRP = 0x0e, // string
	UDATA = 0x0f, // constant
	REF_ADDR = 0x10, // reference
	REF1 = 0x11, // reference
	REF2 = 0x12, // reference
	REF4 = 0x13, // reference
	REF8 = 0x14, // reference
	REF_UDATA = 0x15, // reference
	INDIRECT = 0x16, // see Section 7.5.3, p. 203
	SEC_OFFSET = 0x17, // addrptr, lineptr, loclist, loclistsptr, macptr, rnglist, rnglistsptr, stroffsetsptr
	EXPRLOC = 0x18, // exprloc
	FLAG_PRESENT = 0x19, // flag
	STRX = 0x1a, // string
	ADDRX = 0x1b, // address
	// NOTE(review): DWARF 5 names 0x1c DW_FORM_ref_sup4; the member name
	// below looks like a typo for REF_SUP4 (compare REF_SUP8). Left
	// unchanged here because renaming an exported member affects callers.
	REP_SUP4 = 0x1c, // reference
	STRP_SUP = 0x1d, // string

	// page 221
	DATA16 = 0x1e, // constant
	LINE_STRP = 0x1f, // string
	REF_SIG8 = 0x20, // reference
	IMPLICIT_CONST = 0x21, // constant
	LOCLISTX = 0x22, // loclist
	RNGLISTX = 0x23, // rnglist
	REF_SUP8 = 0x24, // reference
	STRX1 = 0x25, // string
	STRX2 = 0x26, // string
	STRX3 = 0x27, // string
	STRX4 = 0x28, // string
	ADDRX1 = 0x29, // address
	ADDRX2 = 0x2a, // address
	ADDRX3 = 0x2b, // address
	ADDRX4 = 0x2c, // address
};