diff --git a/libr/arch/p/x86_nz/att2intel.c b/libr/arch/p/x86_nz/att2intel.c index 9233dcdc96061..16286d496cd44 100644 --- a/libr/arch/p/x86_nz/att2intel.c +++ b/libr/arch/p/x86_nz/att2intel.c @@ -1,168 +1,24 @@ -/* radare - LGPL - Copyright 2011-2022 - pancake */ +/* radare - LGPL - Copyright 2011-2025 - pancake */ #include -static int replace(int argc, const char *argv[], char *newstr) { - int i,j,k; - const struct { - const char *op; - const char *str; - } ops[] = { - { "cmpl", "cmp 2, 1" }, - { "testl", "test 2, 1" }, - { "leal", "lea 2, 1" }, - { "movl", "mov 2, 1" }, - { "xorl", "xor 2, 1" }, - { "andl", "and 2, 1" }, - { "orl", "or 2, 1" }, - { "addl", "add 2, 1" }, - { "incl", "inc 1" }, - { "decl", "dec 1" }, - { "subl", "sub 2, 1" }, - { "mull", "mul 2, 1" }, - { "divl", "div 2, 1" }, - { "pushl", "push 1" }, - { "popl", "pop 1" }, - { "ret", "ret" }, - { NULL } - }; - - for (i = 0; ops[i].op; i++) { - if (!strcmp (ops[i].op, argv[0])) { - if (newstr) { - for (j = k = 0; ops[i].str[j] != '\0'; j++, k++) { - if (ops[i].str[j] >= '0' && ops[i].str[j] <= '9') { - const char *w = argv[ops[i].str[j] - '0']; - if (w) { - strcpy (newstr + k, w); - k += strlen (w) - 1; - } - } else { - newstr[k] = ops[i].str[j]; - } - } - newstr[k]='\0'; - } - return true; - } - } - - /* TODO: this is slow */ - if (newstr) { - newstr[0] = '\0'; - for (i = 0; i < argc; i++) { - strcat (newstr, argv[i]); - strcat (newstr, (i == 0 || i== argc - 1)?" ":","); - } - } - - return false; -} +// x86-specific AT&T to Intel parser plugin +// Uses the x86-specific r_str_att2intel() from libr/util/str_att.c +// This plugin is used as a pseudo-disassembler parser to convert AT&T output to Intel static char *parse(RAsmPluginSession *aps, const char *data) { - char w0[32], w1[32], w2[32], w3[32]; - char *optr, *num; - int i, n; - - // malloc can be slow here :? - char *buf = strdup (data); - if (!buf) { - return NULL; - } - r_str_trim_head (buf); - - char *ptr = strchr (buf, '#'); - if (ptr) { - *ptr = 0; - r_str_trim (buf); - } - if (*buf == '.' || buf[strlen(buf)-1] == ':') { - free (buf); - return strdup (data); - } - r_str_replace_char (buf, '$', 0); - r_str_replace_char (buf, '%', 0); - r_str_replace_char (buf, '\t', ' '); - r_str_replace_char (buf, '(', '['); - r_str_replace_char (buf, ')', ']'); - ptr = strchr (buf, '['); - if (ptr) { - *ptr = 0; - num = (char*)r_str_lchr (buf, ' '); - if (!num) { - num = (char *)r_str_lchr (buf, ','); - } - if (num) { - n = atoi (num + 1); - *ptr = '['; - r_str_cpy (num + 1, ptr); - ptr = (char*)r_str_lchr (buf, ']'); - if (n && ptr) { - char *rest = strdup (ptr+1); - size_t dist = strlen (data) + 1 - (ptr - buf); - snprintf (ptr, dist, "%+d]%s", n, rest); - free (rest); - } - } else { - *ptr = '['; - } - } - char *str = NULL; - if (*buf) { - *w0 = *w1 = *w2 = *w3 = 0; - ptr = strchr (buf, ' '); - if (!ptr) { - ptr = strchr (buf, '\t'); - } - if (ptr) { - *ptr = '\0'; - for (ptr++; *ptr == ' '; ptr++) { - ; - } - strncpy (w0, buf, sizeof (w0) - 1); - strncpy (w1, ptr, sizeof (w1) - 1); + return r_str_att2intel (data); +} - optr = ptr; - ptr = strchr (ptr, ','); - if (ptr) { - *ptr = '\0'; - for (ptr++; *ptr == ' '; ptr++) { - ; - } - strncpy (w1, optr, sizeof (w1)-1); - strncpy (w2, ptr, sizeof (w2)-1); - ptr = strchr (ptr, ','); - if (ptr) { - *ptr = '\0'; - for (ptr++; *ptr == ' '; ptr++) { - ; - } - strncpy (w2, optr, sizeof (w2)-1); - strncpy (w3, ptr, sizeof (w3)-1); - } - } - } - { - const char *wa[] = { w0, w1, w2, w3 }; - int nw = 0; - for (i = 0; i < 4; i++) { - if (wa[i][0] != '\0') { - nw++; - } - } - str = malloc (strlen (data) + 128); - strcpy (str, data); - replace (nw, wa, str); - } - } - free (buf); - return str; +// Public API wrapper for backwards compatibility +R_API char *r_asm_att2intel(const char *att_str) { + return r_str_att2intel (att_str); } RAsmPlugin r_asm_plugin_att2intel = { .meta = { .name = "att2intel", - .desc = "X86 att 2 intel plugin", + .desc = "AT&T to Intel syntax converter", .author = "pancake", .license = "LGPL-3.0-only", }, diff --git a/libr/asm/asm.c b/libr/asm/asm.c index f686aa8a61bb4..c9248cda9fce5 100644 --- a/libr/asm/asm.c +++ b/libr/asm/asm.c @@ -619,6 +619,28 @@ static int r_asm_assemble_single(RAsm *a, RAnalOp *op, const char *buf) { if (!b) { return 0; } + // convert AT&T syntax to Intel if needed, but only if the encoder doesn't support ATT natively + // The r_str_att2intel function is x86-specific, so only use it for x86 architectures + if (a->config->syntax == R_ARCH_SYNTAX_ATT) { + const char *arch = R_UNWRAP3 (a, config, arch); + // Get the encoder plugin to check its syntax support + RArchSession *as = R_UNWRAP4 (a, analb.anal, arch, session); + RArchPlugin *encoder_plugin = NULL; + if (as) { + // If encoder is separate, use that + encoder_plugin = as->encoder ? as->encoder->plugin : as->plugin; + } + // Check if encoder supports ATT syntax natively + bool encoder_supports_att = encoder_plugin && (encoder_plugin->encode_syntax & R_ARCH_SYNTAX_MASK_ATT); + // Only convert for x86 if encoder doesn't support ATT natively + if (!encoder_supports_att && arch && r_str_startswith (arch, "x86")) { + char *intel = r_str_att2intel (b); + if (intel) { + free (b); + b = intel; + } + } + } r_str_case (b, false); // to-lower if (a->analb.anal) { ut8 buf[256] = { 0 }; diff --git a/libr/include/r_arch.h b/libr/include/r_arch.h index baad892c48cb8..43060289b5836 100644 --- a/libr/include/r_arch.h +++ b/libr/include/r_arch.h @@ -151,6 +151,11 @@ typedef bool (*RArchPluginInitCallback)(RArchSession *s); typedef bool (*RArchPluginFiniCallback)(RArchSession *s); typedef bool (*RArchPluginEsilCallback)(RArchSession *s, RArchEsilAction action); +// Bitmask for supported syntax modes in arch plugins +#define R_ARCH_SYNTAX_MASK_INTEL (1 << R_ARCH_SYNTAX_INTEL) +#define R_ARCH_SYNTAX_MASK_ATT (1 << R_ARCH_SYNTAX_ATT) +#define R_ARCH_SYNTAX_MASK_MASM (1 << R_ARCH_SYNTAX_MASM) + // TODO: use `const char *const` instead of `char*` typedef struct r_arch_plugin_t { RPluginMeta meta; @@ -161,6 +166,7 @@ typedef struct r_arch_plugin_t { ut32 endian; RSysBits bits; RSysBits addr_bits; + ut32 encode_syntax; // bitmask of R_ARCH_SYNTAX_MASK_* for encoder-supported syntaxes (0 = Intel only, default) const RArchPluginInitCallback init; const RArchPluginFiniCallback fini; diff --git a/libr/include/r_asm.h b/libr/include/r_asm.h index 33287c1ed82ef..c1950ea73e4d8 100644 --- a/libr/include/r_asm.h +++ b/libr/include/r_asm.h @@ -137,6 +137,7 @@ R_API bool r_asm_set_big_endian(RAsm *a, bool big_endian); R_API bool r_asm_set_syntax(RAsm *a, int syntax); // This is in RArchConfig R_API int r_asm_syntax_from_string(const char *name); +R_API char *r_asm_att2intel(const char *att_str); // Convert AT&T to Intel syntax R_API int r_asm_set_pc(RAsm *a, ut64 pc); R_API int r_asm_disassemble(RAsm *a, RAnalOp *op, const ut8 *buf, int len); R_API RAsmCode* r_asm_mdisassemble(RAsm *a, const ut8 *buf, int len); diff --git a/libr/include/r_util/r_str.h b/libr/include/r_util/r_str.h index f68730c7173fe..9775ecb1e3266 100644 --- a/libr/include/r_util/r_str.h +++ b/libr/include/r_util/r_str.h @@ -344,6 +344,8 @@ R_API void r_string_appendf(RString *a, const char * R_NONNULL fmt, ...); R_API char *r_str_pseudo_transform(const char **rules, const char *asm_str); R_API char *r_str_pseudo_subvar(char *pseudo, void *varmap); +// AT&T to Intel syntax conversion +R_API char *r_str_att2intel(const char *att_str); #ifdef __cplusplus } diff --git a/libr/util/Makefile b/libr/util/Makefile index 5588ced194ee7..5f5ef5cb64eb3 100644 --- a/libr/util/Makefile +++ b/libr/util/Makefile @@ -15,7 +15,7 @@ OBJS+=udiff.o bdiff.o stack.o queue.o tree.o idpool.o assert.o bplist.o treemap. OBJS+=punycode.o pkcs7.o x509.o asn1.o asn1_str.o json_parser.o json_indent.o skiplist.o OBJS+=pj.o rbtree.o intervaltree.o qrcode.o vector.o str_constpool.o str_trim.o OBJS+=chartable.o protobuf.o graph_drawable.o axml.o sstext.o new_rbtree.o token.o -OBJS+=rvc.o rvc_git.o rvc_rvc.o bscanf.o rprintf.o base32.o bloom.o mmap.o rxmldom.o +OBJS+=rvc.o rvc_git.o rvc_rvc.o bscanf.o rprintf.o base32.o bloom.o mmap.o rxmldom.o str_att.o ifeq (${HAVE_GPERF},1) OBJS+=d/ascii.o diff --git a/libr/util/meson.build b/libr/util/meson.build index f44ade8a6aa0a..cc7729586f14f 100644 --- a/libr/util/meson.build +++ b/libr/util/meson.build @@ -78,6 +78,7 @@ r_util_sources = [ 'str_pseudo.c', 'str_strip.c', 'str_trim.c', + 'str_att.c', 'str.c', 'strbuf.c', 'strpool.c', diff --git a/libr/util/str_att.c b/libr/util/str_att.c new file mode 100644 index 0000000000000..67e610a206ff9 --- /dev/null +++ b/libr/util/str_att.c @@ -0,0 +1,364 @@ +/* radare - LGPL - Copyright 2011-2025 - pancake */ + +#include + +// x86-specific AT&T to Intel syntax conversion +// This handles x86 AT&T syntax conversion to Intel syntax for assemblers that +// only support Intel syntax. The conversion includes: +// - Remove % prefix from registers +// - Remove $ prefix from immediates +// - Swap operand order (AT&T: src, dst -> Intel: dst, src) +// - Convert memory addressing ( ) to [ ] +// - Strip AT&T size suffixes (l, q, b, w) from instructions +// +// Note: This is NOT generic across architectures. Most non-x86 architectures +// use their own syntax and don't have AT&T/Intel distinction. The encoder +// plugin's encode_syntax field should indicate if it supports ATT natively. + +// instructions that take 2 operands and need swap (src, dst -> dst, src) +static const char *ops_2swap[] = { + "mov", "movl", "movq", "movb", "movw", "movabs", + "add", "addl", "addq", "addb", "addw", + "sub", "subl", "subq", "subb", "subw", + "and", "andl", "andq", "andb", "andw", + "or", "orl", "orq", "orb", "orw", + "xor", "xorl", "xorq", "xorb", "xorw", + "cmp", "cmpl", "cmpq", "cmpb", "cmpw", + "test", "testl", "testq", "testb", "testw", + "lea", "leal", "leaq", + "imul", "imull", "imulq", + "shl", "shll", "shlq", "shlb", "shlw", + "shr", "shrl", "shrq", "shrb", "shrw", + "sar", "sarl", "sarq", "sarb", "sarw", + "sal", "sall", "salq", "salb", "salw", + "rol", "roll", "rolq", "rolb", "rolw", + "ror", "rorl", "rorq", "rorb", "rorw", + "adc", "adcl", "adcq", "adcb", "adcw", + "sbb", "sbbl", "sbbq", "sbbb", "sbbw", + "bt", "btl", "btq", + "bts", "btsl", "btsq", + "btr", "btrl", "btrq", + "btc", "btcl", "btcq", + "xchg", "xchgl", "xchgq", "xchgb", "xchgw", + "movzx", "movzxl", "movzxq", "movzxb", "movzxw", + "movsx", "movsxl", "movsxq", "movsxb", "movsxw", + "movzbl", "movzbq", "movzbw", + "movsbl", "movsbq", "movsbw", + "movzwl", "movzwq", + "movswl", "movswq", + "movslq", + "cmova", "cmovae", "cmovb", "cmovbe", "cmovc", + "cmove", "cmovg", "cmovge", "cmovl", "cmovle", + "cmovna", "cmovnae", "cmovnb", "cmovnbe", "cmovnc", + "cmovne", "cmovng", "cmovnge", "cmovnl", "cmovnle", + "cmovno", "cmovnp", "cmovns", "cmovnz", + "cmovo", "cmovp", "cmovpe", "cmovpo", "cmovs", "cmovz", + NULL +}; + +// instructions that take 1 operand (no swap needed) +static const char *ops_1[] = { + "push", "pushl", "pushq", "pushw", + "pop", "popl", "popq", "popw", + "inc", "incl", "incq", "incb", "incw", + "dec", "decl", "decq", "decb", "decw", + "neg", "negl", "negq", "negb", "negw", + "not", "notl", "notq", "notb", "notw", + "mul", "mull", "mulq", "mulb", "mulw", + "div", "divl", "divq", "divb", "divw", + "idiv", "idivl", "idivq", "idivb", "idivw", + "call", "calll", "callq", + "jmp", "jmpl", "jmpq", + "ja", "jae", "jb", "jbe", "jc", "jcxz", "jecxz", "jrcxz", + "je", "jg", "jge", "jl", "jle", + "jna", "jnae", "jnb", "jnbe", "jnc", + "jne", "jng", "jnge", "jnl", "jnle", + "jno", "jnp", "jns", "jnz", + "jo", "jp", "jpe", "jpo", "js", "jz", + "loop", "loope", "loopne", "loopz", "loopnz", + "seta", "setae", "setb", "setbe", "setc", + "sete", "setg", "setge", "setl", "setle", + "setna", "setnae", "setnb", "setnbe", "setnc", + "setne", "setng", "setnge", "setnl", "setnle", + "setno", "setnp", "setns", "setnz", + "seto", "setp", "setpe", "setpo", "sets", "setz", + "int", + NULL +}; + +// instructions that take 0 operands +static const char *ops_0[] = { + "ret", "retl", "retq", + "leave", "leavel", "leaveq", + "nop", "nopl", "nopq", "nopw", + "hlt", + "int3", + "syscall", + "sysret", + "clc", "stc", "cmc", + "cld", "std", + "cli", "sti", + "pushf", "pushfl", "pushfq", + "popf", "popfl", "popfq", + "cbw", "cwde", "cdqe", + "cwd", "cdq", "cqo", + "lahf", "sahf", + NULL +}; + +static bool is_in_list(const char *op, const char **list) { + int i; + for (i = 0; list[i]; i++) { + if (!strcmp (list[i], op)) { + return true; + } + } + return false; +} + +// strip AT&T size suffix (l, q, b, w) from instruction if not in known lists +static void strip_size_suffix(char *op) { + size_t len = strlen (op); + if (len > 1) { + char last = op[len - 1]; + if (last == 'l' || last == 'q' || last == 'b' || last == 'w') { + // check if stripping would give a valid base instruction + char base[32]; + r_str_ncpy (base, op, sizeof (base)); + base[len - 1] = '\0'; + // only strip if it's a known suffixed instruction pattern + if (is_in_list (op, ops_2swap) || is_in_list (op, ops_1) || is_in_list (op, ops_0)) { + // keep as is, it's already recognized + } else { + // try to see if base form exists + char suffixed[32]; + snprintf (suffixed, sizeof (suffixed), "%sl", base); + if (is_in_list (suffixed, ops_2swap) || is_in_list (suffixed, ops_1)) { + op[len - 1] = '\0'; + } + } + } + } +} + +static int replace(int argc, const char *argv[], char *newstr) { + if (argc < 1 || !argv[0]) { + return false; + } + char op[32]; + r_str_ncpy (op, argv[0], sizeof (op)); + r_str_case (op, false); + strip_size_suffix (op); + + if (is_in_list (argv[0], ops_0) || is_in_list (op, ops_0)) { + // 0 operand instruction + if (newstr) { + strcpy (newstr, op); + } + return true; + } + + if (argc >= 2 && (is_in_list (argv[0], ops_1) || is_in_list (op, ops_1))) { + // 1 operand instruction + if (newstr) { + snprintf (newstr, 256, "%s %s", op, argv[1]); + } + return true; + } + + if (argc >= 3 && (is_in_list (argv[0], ops_2swap) || is_in_list (op, ops_2swap))) { + // 2 operand instruction - swap operands (ATT: src, dst -> Intel: dst, src) + if (newstr) { + snprintf (newstr, 256, "%s %s, %s", op, argv[2], argv[1]); + } + return true; + } + + // fallback: swap operands for unknown 2-operand instructions + if (argc >= 3 && newstr) { + snprintf (newstr, 256, "%s %s, %s", op, argv[2], argv[1]); + return true; + } + + // single operand fallback + if (argc == 2 && newstr) { + snprintf (newstr, 256, "%s %s", op, argv[1]); + return true; + } + + // no operands fallback + if (argc == 1 && newstr) { + strcpy (newstr, op); + return true; + } + + return false; +} + +/** + * \brief Convert AT&T assembly syntax to Intel syntax + * \param att_str The AT&T syntax assembly string + * \return Newly allocated Intel syntax string, or NULL on failure. Caller must free. + * + * This function handles the common AT&T to Intel conversion rules: + * - Removes % prefix from register names + * - Removes $ prefix from immediate values + * - Swaps operand order (AT&T: src, dst -> Intel: dst, src) + * - Converts memory addressing from offset(base) to [base+offset] + * - Strips AT&T size suffixes (l, q, b, w) from instructions + */ +R_API char *r_str_att2intel(const char *att_str) { + char w0[64], w1[64], w2[64], w3[64]; + int i; + + if (R_STR_ISEMPTY (att_str)) { + return NULL; + } + + char *buf = strdup (att_str); + if (!buf) { + return NULL; + } + r_str_trim_head (buf); + + // handle comments + char *ptr = strchr (buf, '#'); + if (ptr) { + *ptr = 0; + r_str_trim (buf); + } + + // skip directives and labels + if (*buf == '.' || (strlen (buf) > 0 && buf[strlen (buf) - 1] == ':')) { + free (buf); + return strdup (att_str); + } + + // remove AT&T prefixes: $ for immediates, % for registers + r_str_replace_char (buf, '$', 0); + r_str_replace_char (buf, '%', 0); + r_str_replace_char (buf, '\t', ' '); + + // handle memory addressing: convert ( ) to [ ] + // ATT: offset(base,index,scale) -> Intel: [base+index*scale+offset] + r_str_replace_char (buf, '(', '['); + r_str_replace_char (buf, ')', ']'); + + // handle displacement in memory operands like 8(%rsp) -> [rsp+8] + ptr = strchr (buf, '['); + if (ptr) { + // find the displacement before the bracket + char *start = ptr; + while (start > buf && (isdigit ((unsigned char)*(start - 1)) || *(start - 1) == '-' || *(start - 1) == '+')) { + start--; + } + if (start < ptr && start > buf && *(start - 1) != ' ' && *(start - 1) != ',') { + // skip this case, it's part of an operand name + } else if (start < ptr) { + // there's a displacement + int disp = atoi (start); + if (disp != 0) { + // find the closing bracket + char *close = strchr (ptr, ']'); + if (close) { + char inner[64] = {0}; + size_t inner_len = close - ptr - 1; + if (inner_len < sizeof (inner)) { + strncpy (inner, ptr + 1, inner_len); + inner[inner_len] = '\0'; + char *rest = strdup (close + 1); + // rebuild: [inner+disp]rest + size_t avail = strlen (att_str) + 64 - (start - buf); + snprintf (start, avail, "[%s%+d]%s", inner, disp, rest ? rest : ""); + free (rest); + } + } + } + } + } + + char *str = NULL; + if (*buf) { + *w0 = *w1 = *w2 = *w3 = 0; + + // find instruction + ptr = buf; + while (*ptr && !isspace ((unsigned char)*ptr)) { + ptr++; + } + size_t oplen = ptr - buf; + if (oplen >= sizeof (w0)) { + oplen = sizeof (w0) - 1; + } + strncpy (w0, buf, oplen); + w0[oplen] = '\0'; + + // skip whitespace after instruction + while (*ptr && isspace ((unsigned char)*ptr)) { + ptr++; + } + + // parse operands + if (*ptr) { + // find first comma + char *comma = strchr (ptr, ','); + if (comma) { + size_t len = comma - ptr; + if (len >= sizeof (w1)) { + len = sizeof (w1) - 1; + } + strncpy (w1, ptr, len); + w1[len] = '\0'; + r_str_trim (w1); + + // move to second operand + ptr = comma + 1; + while (*ptr && isspace ((unsigned char)*ptr)) { + ptr++; + } + + // find second comma for potential third operand + comma = strchr (ptr, ','); + if (comma) { + len = comma - ptr; + if (len >= sizeof (w2)) { + len = sizeof (w2) - 1; + } + strncpy (w2, ptr, len); + w2[len] = '\0'; + r_str_trim (w2); + + // third operand + ptr = comma + 1; + while (*ptr && isspace ((unsigned char)*ptr)) { + ptr++; + } + r_str_ncpy (w3, ptr, sizeof (w3)); + r_str_trim (w3); + } else { + r_str_ncpy (w2, ptr, sizeof (w2)); + r_str_trim (w2); + } + } else { + // single operand + r_str_ncpy (w1, ptr, sizeof (w1)); + r_str_trim (w1); + } + } + + const char *wa[] = { w0, w1, w2, w3 }; + int nw = 0; + for (i = 0; i < 4; i++) { + if (wa[i][0] != '\0') { + nw++; + } + } + str = malloc (strlen (att_str) + 256); + if (str) { + str[0] = '\0'; + replace (nw, wa, str); + } + } + free (buf); + return str; +} diff --git a/test/db/cmd/attasm b/test/db/cmd/attasm new file mode 100644 index 0000000000000..70f61e3f59d7f --- /dev/null +++ b/test/db/cmd/attasm @@ -0,0 +1,449 @@ +NAME=x86 AT&T movl immediate to register +FILE=- +CMDS=<