C++ 基于Capstone实现反汇编器

Capstone是一个反汇编框架，提供了一个简单、轻量级的API接口，可透明地处理大多数流行的指令体系，包括x86/x86-64、ARM及MIPS等。Capstone支持C/C++和Python，并且可以在很多操作系统上运行。

python安装capstone: pip3 install capstone，之后启动python解释器就可以在其中使用该模块。

Linux安装libcapstone: $ sudo apt-get install libcapstone3

安装开发包: $ sudo apt-get install libcapstone-dev

Capstone线性反汇编

Capstone接收一个含有字节块的缓冲区作为输入，并输出这些字节的反汇编指令，其最基本的使用方法是提供一个包含字节块的缓冲区(这些字节都来自二进制文件的.text节)。然后将这些字节序列线性反汇编为人类可读的形式，或是指令助记符形式。除了一些初始化和输出解析的代码之外，Capstone通过调用cs_disasm函数来实现上述功能。

下面实现一个简单的线性反汇编器

此处要用到上篇文章中的头文件和相关函数定义: C++ 基于libbfd实现二进制加载器

#include <stdio.h>

#include <string>

#include <capstone/capstone.h>

#include "loader.h"

int disasm(Binary *bin);

// Entry point of the linear disassembler.
//
// Expects the path of the binary to disassemble as argv[1]. Loads it with
// load_binary(), runs disasm() on it and unloads it again.
//
// Returns 0 on success and 1 on a usage error, a load failure or a
// disassembly failure.
int main(int argc,char *argv[])
{
    if (argc < 2) {
        printf("Usage: %s <binary>\n",argv[0]);
        return 1;
    }

    std::string fname(argv[1]); // path given on the command line

    Binary bin;
    if (load_binary(fname,&bin,Binary::BIN_TYPE_AUTO) < 0) {
        return 1;
    }

    // Remember the outcome instead of returning early, so the loaded binary
    // is released on the failure path as well.
    int rc = (disasm(&bin) < 0) ? 1 : 0;

    unload_binary(&bin);

    return rc;
}

// Linearly disassembles the .text section of `bin` as x86-64 code and prints
// one line per instruction: address, raw bytes (padded to 16 columns),
// mnemonic and operand string.
//
// Returns 0 on success or when the binary has no .text section, and -1 when
// Capstone cannot be opened or the disassembly fails.
int disasm(Binary *bin)
{
    Section *text = bin->get_text_section();
    if (!text) {
        fprintf(stderr,"Nothing to disassemble\n");
        return 0;
    }

    csh dis;
    if (cs_open(CS_ARCH_X86,CS_MODE_64,&dis) != CS_ERR_OK) {
        fprintf(stderr,"Failed to open Capstone\n");
        return -1;
    }

    // A count of 0 asks Capstone to disassemble as many instructions as the
    // buffer holds; the result array is allocated by Capstone.
    cs_insn *insns = NULL;
    size_t n = cs_disasm(dis,text->bytes,text->size,text->vma,0,&insns);

    // cs_disasm returns a size_t, so failure is exactly 0 (never negative).
    if (n == 0) {
        fprintf(stderr,"Disassembly error: %s\n",
            cs_strerror(cs_errno(dis)));
        cs_close(&dis); // do not leak the handle on the error path
        return -1;
    }

    for (size_t i = 0; i < n; i++) {
        printf("0x%016jx: ",insns[i].address);

        // Always emit 16 byte columns so the mnemonics line up.
        for (size_t j = 0; j < 16; j++) {
            if (j < insns[i].size)
                printf("%02x ",insns[i].bytes[j]);
            else
                printf("   ");
        }

        printf("%-12s %s\n",insns[i].mnemonic,insns[i].op_str);
    }

    cs_free(insns,n);
    cs_close(&dis);

    return 0;
}

首先使用load_binary函数将二进制程序加载进Binary对象，然后传递给disasm函数。在disasm函数中首先调用get_text_section获取.text节的数据，然后使用cs_open函数初始化capstone，该函数接收3个参数: 硬件体系结构、硬件模式和一个csh结构的指针，csh类型的变量作为一个句柄存在，将在capstone中的多个API函数中使用。

向cs_disasm函数传递csh句柄，作为第一个参数，第二个参数是缓冲区，即上面得到.text节内容(被加载到Section对象中)，第三个参数是传入的缓冲区的字节数，第四个参数是第一条指令的地址，第五个参数是要反汇编的指令数量(传入0表示尽可能多地反汇编缓冲区中的指令)，最后一个参数是指向cs_insn指针的指针，cs_disasm会分配存放反汇编结果的缓冲区并通过它返回。

该结构定义如下:

// Record describing one disassembled instruction.
// Every field except `detail` is filled in even when CS_OPT_DETAIL = CS_OPT_OFF.
typedef struct cs_insn {
    // Instruction ID (basically a numeric ID for the instruction mnemonic).
    // Find the instruction ID in the '[ARCH]_insn' enum in the header file
    // of the corresponding architecture, such as 'arm_insn' in arm.h for ARM,
    // 'x86_insn' in x86.h for X86, etc.
    // This information is available even when CS_OPT_DETAIL = CS_OPT_OFF.
    // NOTE: in Skipdata mode, a "data" instruction has 0 for this ID field.
    unsigned int id;
    // Address (EIP) of this instruction.
    // This information is available even when CS_OPT_DETAIL = CS_OPT_OFF.
    uint64_t address;
    // Size of this instruction, in bytes.
    // This information is available even when CS_OPT_DETAIL = CS_OPT_OFF.
    uint16_t size;
    // Machine bytes of this instruction; the number of valid bytes is given by @size above.
    // This information is available even when CS_OPT_DETAIL = CS_OPT_OFF.
    uint8_t bytes[16];
    // ASCII text of the instruction mnemonic.
    // This information is available even when CS_OPT_DETAIL = CS_OPT_OFF.
    char mnemonic[32];
    // ASCII text of the instruction operands.
    // This information is available even when CS_OPT_DETAIL = CS_OPT_OFF.
    char op_str[160];
    // Pointer to cs_detail.
    // NOTE: the detail pointer is only valid when both requirements below are met:
    // (1) CS_OPT_DETAIL = CS_OPT_ON
    // (2) The engine is not in Skipdata mode (CS_OPT_SKIPDATA option set to CS_OPT_ON)
    //
    // NOTE 2: in Skipdata mode, or when detail mode is OFF, the content behind this
    //     pointer is irrelevant even if the pointer is not NULL.
    cs_detail *detail;
} cs_insn;

该结构体中，id字段是指令类型(和硬件体系相关)的唯一标识符，可用于检查正在处理的指令类型，而无须与指令助记符进行字符串比较。

x86平台下相关的值如下:

// X86 instruction IDs (excerpt). These are the values stored in cs_insn.id
// when disassembling x86 code.
typedef enum x86_insn {
    X86_INS_INVALID = 0,
    X86_INS_AAA,
    X86_INS_AAD,
    X86_INS_AAM,
    X86_INS_AAS,
    X86_INS_FABS,
    X86_INS_ADC,
    X86_INS_ADCX,
    X86_INS_ADD,
    X86_INS_ADDPD,
    X86_INS_ADDPS,
    X86_INS_ADDSD,
    X86_INS_ADDSS,
    X86_INS_ADDSUBPD,
    // ... remaining instruction IDs omitted ...
} x86_insn;

// X86 instruction groups (excerpt; only the tail of the list is shown, so the
// numeric values of this excerpt do not match the real header).
typedef enum x86_insn_group {
    // ... earlier group IDs omitted ...
    X86_GRP_VLX,
    X86_GRP_SMAP,
    X86_GRP_NOVLX,
    X86_GRP_ENDING
} x86_insn_group;

address、size及bytes字段表示指令的地址、字节数及字节数据。

mnemonic是指令可读形式的指令字符串(不含操作数)，而op_str是指令操作数的可读表示，detail包含了更详细的信息，如下是cs_detail的定义:

// Extra per-instruction information: implicit register reads and writes,
// group membership, and an architecture-specific record.
// NOTE: All information in cs_detail is only available when CS_OPT_DETAIL = CS_OPT_ON
typedef struct cs_detail {
    uint8_t regs_read[12]; // list of implicit registers read by this insn
    uint8_t regs_read_count; // number of implicit registers read by this insn
    uint8_t regs_write[20]; // list of implicit registers modified by this insn
    uint8_t regs_write_count; // number of implicit registers modified by this insn
    uint8_t groups[8]; // list of groups this insn belongs to
    uint8_t groups_count; // number of groups this insn belongs to
    // Architecture-specific instruction info; read the member that matches
    // the architecture being disassembled.
    union {
        cs_x86 x86;    // X86 architecture, including 16-bit, 32-bit & 64-bit mode
        cs_arm64 arm64;    // ARM64 architecture (aka AArch64)
        cs_arm arm;        // ARM architecture (including Thumb/Thumb2)
        cs_mips mips;    // MIPS architecture
        cs_ppc ppc;    // PowerPC architecture
        cs_sparc sparc;    // Sparc architecture
        cs_sysz sysz;    // SystemZ architecture
        cs_xcore xcore;    // XCore architecture
    };
} cs_detail;

只有开启了capstone的详细反汇编模式，才会设置detail指针。

如果cs_disasm函数执行成功，则返回反汇编指令的数量，如果函数执行失败，则返回0。

cs_errno函数可用于检查错误，cs_strerror可将cs_err值转换为字符串来描述错误，之后在循环中不断取出数据，按照“地址、机器码、指令字符串”的格式打印，循环之后调用cs_free函数释放内存，调用cs_close关闭句柄。

终端执行命令编译: g++ -I . loader.cpp liner.cpp -o liner -lbfd -lcapstone

运行测试，反汇编一个hello world程序

$ ./liner ./hello

0x0000000000401040: 31 ed                                           xor          ebp, ebp

0x0000000000401042: 49 89 d1                                        mov          r9, rdx

0x0000000000401045: 5e                                              pop          rsi

0x0000000000401046: 48 89 e2                                        mov          rdx, rsp

0x0000000000401049: 48 83 e4 f0                                     and          rsp, 0xfffffffffffffff0

0x000000000040104d: 50                                              push         rax

0x000000000040104e: 54                                              push         rsp

0x000000000040104f: 49 c7 c0 a0 11 40 00                            mov          r8, 0x4011a0

0x0000000000401056: 48 c7 c1 40 11 40 00                            mov          rcx, 0x401140

0x000000000040105d: 48 c7 c7 22 11 40 00                            mov          rdi, 0x401122

0x0000000000401064: ff 15 86 2f 00 00                               call         qword ptr [rip + 0x2f86]

0x000000000040106a: f4                                              hlt

0x000000000040106b: 0f 1f 44 00 00                                  nop          dword ptr [rax + rax]

0x0000000000401070: c3                                              ret

.....

Capstone递归反汇编

线性反汇编只能显示基本的信息，但缺少更为详细的信息(指令类型、操作数类型等)，想要查看详细的信息只能在Capstone的详细反汇编模式中找到。递归反汇编从已知入口点开始分析，如二进制文件的主入口点或函数符号，并从此处跟踪控制流指令，而线性反汇编器会盲目地按顺序反汇编所有代码。与线性反汇编器相比，递归反汇编器不易被代码中的数据干扰，但可能会错过那些只能通过间接跳转才能到达的指令，这些指令不能被静态解析。

示例代码:

#include <stdio.h>

#include <queue>

#include <map>

#include <string>

#include <capstone/capstone.h>

#include "loader.h"

int disasm(Binary *bin); // 反汇编

void print_ins(cs_insn *ins); // 打印结构

bool is_cs_cflow_group(uint8_t g);

bool is_cs_cflow_ins(cs_insn *ins);

bool is_cs_unconditional_cflow_ins(cs_insn *ins);

uint64_t get_cs_ins_immediate_target(cs_insn *ins);

// Entry point of the recursive disassembler.
//
// Expects the path of the binary to disassemble as argv[1]. Loads it with
// load_binary(), runs disasm() on it and unloads it again.
//
// Returns 0 on success and 1 on a usage error, a load failure or a
// disassembly failure.
int main(int argc,char* argv[])
{
    if (argc < 2) {
        // The "%s" conversion needs the program name; calling printf
        // without it is undefined behavior.
        printf("Usage: %s <binary>\n",argv[0]);
        return 1;
    }

    std::string fname(argv[1]); // path given on the command line

    Binary bin;
    if (load_binary(fname,&bin,Binary::BIN_TYPE_AUTO) < 0) {
        return 1;
    }

    // Remember the outcome instead of returning early, so the loaded binary
    // is released on the failure path as well.
    int rc = (disasm(&bin) < 0) ? 1 : 0;

    unload_binary(&bin);

    return rc;
}

// Recursively disassembles the .text section of `bin` as x86-64 code.
//
// Start addresses (the entry point and every function symbol inside .text)
// are placed in a work queue. Each start address is disassembled linearly,
// one instruction at a time; immediate targets of control-flow instructions
// that lie inside .text are queued as new start addresses. A linear run ends
// at an unconditional control-flow instruction, at hlt, at an invalid
// instruction, or when cs_disasm_iter cannot decode any further.
//
// Returns 0 on success or when the binary has no .text section, and -1 when
// Capstone cannot be set up.
int disasm(Binary *bin)
{
    Section *text = bin->get_text_section();
    if (!text) {
        fprintf(stderr,"Nothing to disassemble\n");
        return 0;
    }

    csh dis;
    if (cs_open(CS_ARCH_X86,CS_MODE_64,&dis) != CS_ERR_OK) {
        fprintf(stderr,"Failed to open Capstone\n");
        return -1;
    }

    // Detail mode is required: the helpers below read cs_ins->detail.
    if (cs_option(dis,CS_OPT_DETAIL,CS_OPT_ON) != CS_ERR_OK) {
        fprintf(stderr,"Failed to enable Capstone detail mode\n");
        cs_close(&dis);
        return -1;
    }

    // Single instruction buffer reused by every cs_disasm_iter call.
    cs_insn *cs_ins = cs_malloc(dis);
    if (!cs_ins) {
        fprintf(stderr,"Out of memory\n");
        cs_close(&dis);
        return -1;
    }

    std::queue<uint64_t> Q;          // start addresses still to be disassembled
    std::map<uint64_t, bool> seen;   // instruction addresses already printed

    uint64_t addr = bin->entry;
    if (text->contains(addr)) Q.push(addr);
    printf("entry point: 0x%016jx\n",addr);

    for (auto &sym: bin->symbols) {
        if (sym.type == Symbol::SYM_TYPE_FUNC && text->contains(sym.addr)) {
            Q.push(sym.addr);
            printf("function symbol: 0x%016jx\n",sym.addr);
        }
    }

    while (!Q.empty()) {
        addr = Q.front();
        Q.pop();

        if (seen[addr]) continue; // already covered by an earlier run

        uint64_t offset = addr - text->vma;        // offset of addr inside .text
        const uint8_t *pc = text->bytes + offset;  // bytes to decode next
        size_t n = text->size - offset;            // bytes left from pc onwards

        // cs_disasm_iter advances pc/addr and decreases n after each
        // successfully decoded instruction.
        while (cs_disasm_iter(dis,&pc,&n,&addr,cs_ins)) {
            if (cs_ins->id == X86_INS_INVALID || cs_ins->size == 0) {
                break;
            }

            // Everything below must run once per decoded instruction, so it
            // stays inside this loop; the breaks end only the current run.
            seen[cs_ins->address] = true;
            print_ins(cs_ins);

            if (is_cs_cflow_ins(cs_ins)) {
                uint64_t target = get_cs_ins_immediate_target(cs_ins);
                if (target && !seen[target] && text->contains(target)) {
                    Q.push(target);
                    printf(" -> new target: 0x%016jx\n",target);
                }

                // What follows an unconditional transfer may be data.
                if (is_cs_unconditional_cflow_ins(cs_ins)) {
                    break;
                }
            } else if (cs_ins->id == X86_INS_HLT) {
                // hlt is not in a control-flow group but still ends the run.
                break;
            }
        }

        printf("--------------\n");
    }

    cs_free(cs_ins,1);
    cs_close(&dis);

    return 0;
}

// 打印指令信息

// Prints one instruction on a single line: its address, its raw bytes in a
// fixed block of 16 three-character columns, then mnemonic and operands.
void print_ins(cs_insn *ins)
{
    const size_t columns = 16;

    printf("0x%016jx: ",ins->address);

    // Number of byte columns that actually carry a byte.
    size_t shown = (ins->size < columns) ? ins->size : columns;

    for (size_t k = 0; k < shown; k++) {
        printf("%02x ",ins->bytes[k]);
    }

    // Pad the remaining columns so the mnemonic always starts at the same place.
    for (size_t k = shown; k < columns; k++) {
        printf("   ");
    }

    printf("%-12s %s\n",ins->mnemonic,ins->op_str);
}

// 根据detail->group来判断控制流指令类型

// Returns true when group id `g` is one of the generic Capstone control-flow
// groups: jump, call, return or interrupt return.
bool is_cs_cflow_group(uint8_t g)
{
    switch (g) {
        case CS_GRP_JUMP:
        case CS_GRP_CALL:
        case CS_GRP_RET:
        case CS_GRP_IRET:
            return true;
        default:
            return false;
    }
}

// Returns true when any group listed in ins->detail is a control-flow group.
// Reads ins->detail, so the instruction must have been decoded with detail
// mode enabled.
bool is_cs_cflow_ins(cs_insn *ins)
{
    const cs_detail *d = ins->detail;
    size_t idx = 0;

    while (idx < d->groups_count) {
        if (is_cs_cflow_group(d->groups[idx])) {
            return true;
        }
        idx++;
    }

    return false;
}

// 判断是否为无条件跳转指令

// Returns true when the instruction id is one of jmp, ljmp, ret, retf or
// retfq, i.e. an instruction after which execution never falls through.
bool is_cs_unconditional_cflow_ins(cs_insn *ins)
{
    const unsigned int id = ins->id;

    return id == X86_INS_JMP
        || id == X86_INS_LJMP
        || id == X86_INS_RET
        || id == X86_INS_RETF
        || id == X86_INS_RETFQ;
}

// Returns the first immediate operand of a control-flow instruction, which
// for direct jumps and calls is the target address.
//
// Returns 0 when the instruction belongs to no control-flow group or has no
// immediate operand (for example an indirect jump). Reads ins->detail, so the
// instruction must have been decoded with detail mode enabled.
uint64_t get_cs_ins_immediate_target(cs_insn *ins)
{
    for (size_t i = 0; i < ins->detail->groups_count; i++) {
        if (!is_cs_cflow_group(ins->detail->groups[i])) {
            continue;
        }

        // Walk the operands: the bound is the operand count, not the group
        // id, which would read the wrong number of operands.
        for (size_t j = 0; j < ins->detail->x86.op_count; j++) {
            cs_x86_op *cs_op = &ins->detail->x86.operands[j];
            if (cs_op->type == X86_OP_IMM) {
                return cs_op->imm;
            }
        }
    }

    return 0;
}

与线性反汇编的程序相比，main函数是相同的，disasm函数的初始化代码也是相似的，都是首先加载.text节并得到一个capstone句柄，额外增加了对cs_option的调用，设置CS_OPT_DETAIL选项开启详细反汇编模式。

程序中创建了一个队列，用于存储地址，便于跟踪指令流，而map结构的seen用于存放已经跟踪过的地址。首先将初始入口点放入该队列，即二进制程序的入口点，然后遍历整个符号表，将函数符号对应的地址放入队列。

之后会循环迭代这个队列，取出存放的地址，即起始点，对每个起始点进行线性反汇编，并将每个新发现的控制流跳转地址增加到队列中，这些新的地址将在后续的循环中再次被反汇编。每次线性扫描只在遇到hlt指令或者无条件分支指令时停止，因为这些指令之后出现的可能是数据而不是代码，所以不能继续进行反汇编。

cs_disasm_iter是cs_disasm函数的迭代版本。cs_disasm_iter一次只反汇编一条指令，而不是整个代码区。在每条指令进行反汇编后，cs_disasm_iter返回true或false，true表示指令已经成功反汇编，而false表示指令反汇编失败。因此创建一个while循环，直到该函数返回false才停止。该函数的第一个参数是capstone句柄，第二个参数是一个二级指针，指向待反汇编的代码，在cs_disasm_iter每次被调用时，会更新这个指针，将其指向上次反汇编字节的下一个位置，就好像程序计数器。第三个参数是反汇编的剩余字节数，在调用cs_disasm_iter时，该字节数会被自动递减，在该程序中其大小总是等于.text节的大小减去已经反汇编的字节数。第四个参数是第二个参数所指向代码的VMA，同样会在每次调用后自动更新，最后一个参数是指向cs_insn对象的指针，该对象作为每个反汇编指令的缓冲区。

offset = addr - text->vma; // offset of addr relative to text->vma
pc = text->bytes + offset; // pointer into the section bytes at that offset (a buffer pointer, not a VMA)
n = text->size - offset; // number of bytes from that offset up to text->size
while (cs_disasm_iter(dis,&pc,&n,&addr,cs_ins)) {
    // Stop as soon as the decoded instruction is invalid or has zero size
    if (cs_ins->id == X86_INS_INVALID || cs_ins->size == 0) {
        break;
    }
}

用cs_disasm_iter代替cs_disasm有两个优点: 其一，cs_disasm_iter支持迭代机制，在每条指令被反汇编后能立即查看，以便检查控制流指令并进行递归遍历；其二，它只使用一个由cs_malloc预先分配、可重复使用的cs_insn缓冲区，而不必像cs_disasm那样为所有指令一次性分配结果数组，因此更节省内存。

在整个对队列的循环中，每次调用cs_disasm_iter函数时得到和指令相关的cs_ins结构后，使用is_cs_cflow_ins确定指令是否为控制流指令。

// Returns true when any group listed in ins->detail is a control-flow group.
// Reads ins->detail, so the instruction must have been decoded with detail
// mode enabled.
bool is_cs_cflow_ins(cs_insn *ins)
{
    const cs_detail *d = ins->detail;
    size_t idx = 0;

    while (idx < d->groups_count) {
        if (is_cs_cflow_group(d->groups[idx])) {
            return true;
        }
        idx++;
    }

    return false;
}

该函数须要访问cs_ins结构体中detail的groups数组，如下是detail结构的详细信息:

// Extra per-instruction information: implicit register reads and writes,
// group membership, and an architecture-specific record.
// NOTE: All information in cs_detail is only available when CS_OPT_DETAIL = CS_OPT_ON
typedef struct cs_detail {
	uint8_t regs_read[12]; // list of implicit registers read by this insn
	uint8_t regs_read_count; // number of implicit registers read by this insn
	uint8_t regs_write[20]; // list of implicit registers modified by this insn
	uint8_t regs_write_count; // number of implicit registers modified by this insn
	uint8_t groups[8]; // list of groups this insn belongs to
	uint8_t groups_count; // number of groups this insn belongs to
	// Architecture-specific instruction info; read the member that matches
	// the architecture being disassembled.
	union {
		cs_x86 x86;	// X86 architecture, including 16-bit, 32-bit & 64-bit mode
		cs_arm64 arm64;	// ARM64 architecture (aka AArch64)
		cs_arm arm;		// ARM architecture (including Thumb/Thumb2)
		cs_mips mips;	// MIPS architecture
		cs_ppc ppc;	// PowerPC architecture
		cs_sparc sparc;	// Sparc architecture
		cs_sysz sysz;	// SystemZ architecture
		cs_xcore xcore;	// XCore architecture
	};
} cs_detail;

is_cs_cflow_ins中调用is_cs_cflow_group，检查指令是否为跳转、调用、返回和中断。

// Returns true when group id `g` is one of the generic Capstone control-flow
// groups: jump, call, return or interrupt return.
bool is_cs_cflow_group(uint8_t g)
{
    switch (g) {
        case CS_GRP_JUMP:
        case CS_GRP_CALL:
        case CS_GRP_RET:
        case CS_GRP_IRET:
            return true;
        default:
            return false;
    }
}

经过上述的一系列判断，如果发现尚未被处理过的控制流指令，则要进一步解析控制流目标地址。如下函数负责获取地址，但只能作用于直接寻址。

// Returns the first immediate operand of a control-flow instruction, which
// for direct jumps and calls is the target address.
//
// Returns 0 when the instruction belongs to no control-flow group or has no
// immediate operand (for example an indirect jump). Reads ins->detail, so the
// instruction must have been decoded with detail mode enabled.
uint64_t get_cs_ins_immediate_target(cs_insn *ins)
{
    for (size_t i = 0; i < ins->detail->groups_count; i++) {
        if (!is_cs_cflow_group(ins->detail->groups[i])) {
            continue;
        }

        // Scan the x86 operands for an immediate.
        for (size_t j = 0; j < ins->detail->x86.op_count; j++) {
            cs_x86_op *cs_op = &ins->detail->x86.operands[j];
            if (cs_op->type == X86_OP_IMM) {
                return cs_op->imm;
            }
        }
    }

    return 0;
}

该函数要检查指令的操作数，而每种指令体系中都有自己的一套操作数类型，因此无法采用通用的解析方法。

访问detail中x86.operands数组，获取指令操作数，类型为cs_x86_op结构体:

// One operand of an x86 instruction. `type` tells which member of the
// anonymous union holds the operand's value.
typedef struct cs_x86_op {
		x86_op_type type;	// operand type
		// Operand value; read the member that matches `type`.
		union {
			x86_reg reg;	// register value for REG operand
			int64_t imm;		// immediate value for IMM operand
			double fp;		// floating point value for FP operand
			x86_op_mem mem;		// base/index/scale/disp value for MEM operand
		};
		// size of this operand (in bytes).
		uint8_t size;
		// AVX broadcast type, or 0 if irrelevant
		x86_avx_bcast avx_bcast;
		// AVX zero opmask {z}
		bool avx_zero_opmask;
} cs_x86_op;

遍历这个数组，判断每个操作数的类型，如果为IMM类型(立即数)，则直接访问imm成员，获取操作数的值。

终端执行命令: g++ -I . recursive.cpp loader.cpp -o recursive -lbfd -lcapstone

$ ./recursive ./hello

entry point: 0x0000000000401040

function symbol: 0x0000000000401080

function symbol: 0x00000000004010b0

function symbol: 0x00000000004010f0

function symbol: 0x0000000000401120

function symbol: 0x00000000004011c0

function symbol: 0x0000000000401160

function symbol: 0x0000000000401070

function symbol: 0x0000000000401040

function symbol: 0x0000000000401133

function symbol: 0x0000000000401122

0x00000000004011c1: c3                                              ret

C++ 基于Capstone实现反汇编器

Capstone线性反汇编

Capstone递归反汇编