Skip to content

Improved signedness data accuracy and consistency #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 28, 2022

Conversation

mappzor
Copy link
Contributor

@mappzor mappzor commented May 4, 2022

This PR fixes inconsistencies mentioned in zyantific/zydis#192 and zyantific/zydis#327. Major goal is to have consistent signedness for every instruction. This is especially important for encoder users as specifying signed/unsigned immediates gets more intuitive with those changes. There were a few categories of issues:

  • byteops were typically unsigned while other variants were signed
  • inconsistencies between VEX/EVEX/MVEX variants (especially MVEX)
  • wrong signedness where it actually matters (div/idiv, mul/imul, sar/shr, etc.)
  • everything else

As discussed some months ago I went for "signed by default" policy unless there was a good reason to have things unsigned. This means that e.g. data transfers and bitwise operations are now consistently signed.

I searched for all instruction definitions that mixed signed and unsigned operands across variants and within single definition itself. This resulted in around 1300 definitions which were manually reviewed.

Final conversion was done by this piece of spaghetti:

#!/usr/bin/env python3
import json
from enum import Enum, auto
from utils import *


class RuleType(Enum):
    S_TO_U = auto()
    S_TO_U_FORCE = auto()
    S_TO_U_12 = auto()
    S_TO_U_1 = auto()
    S_TO_U_2 = auto()
    S_TO_U_3 = auto()
    U_TO_S = auto()
    U_TO_S_NOT_LAST = auto()
    C_TO_S1U2 = auto()
    E_TO_S_34 = auto()


rules = {
    'div': RuleType.S_TO_U_FORCE,
    'enter': RuleType.S_TO_U,
    'movzx': RuleType.S_TO_U_FORCE,
    'mul': RuleType.S_TO_U_FORCE,
    'pavgb': RuleType.S_TO_U,
    'pavgw': RuleType.S_TO_U,
    'pmuludq': RuleType.S_TO_U,
    'pslld': RuleType.S_TO_U,
    'psllq': RuleType.S_TO_U,
    'psllw': RuleType.S_TO_U,
    'pextrb': RuleType.S_TO_U_12,
    'pextrw': RuleType.S_TO_U_12,
    'pinsrb': RuleType.S_TO_U_12,
    'pinsrw': RuleType.S_TO_U_12,
    'pinsrd': RuleType.S_TO_U_12,
    'pinsrq': RuleType.S_TO_U_12,
    'psraw': RuleType.S_TO_U_2,
    'psrad': RuleType.S_TO_U_2,
    'psrld': RuleType.S_TO_U,
    'psrlq': RuleType.S_TO_U,
    'psrlw': RuleType.S_TO_U,
    'rcl': RuleType.C_TO_S1U2,
    'rcr': RuleType.C_TO_S1U2,
    'ret': RuleType.S_TO_U,
    'rol': RuleType.C_TO_S1U2,
    'ror': RuleType.C_TO_S1U2,
    'sar': RuleType.C_TO_S1U2,
    'shl': RuleType.S_TO_U_FORCE,
    'shld': RuleType.S_TO_U_FORCE,
    'shr': RuleType.S_TO_U_FORCE,
    'shrd': RuleType.S_TO_U_FORCE,
    'lwpins': RuleType.S_TO_U_3,
    'lwpval': RuleType.S_TO_U_3,
    'vaesdec': RuleType.S_TO_U,
    'vaesdeclast': RuleType.S_TO_U,
    'vaesenc': RuleType.S_TO_U,
    'vaesenclast': RuleType.S_TO_U,
    'vaeskeygenassist': RuleType.S_TO_U,
    'vpabsb': RuleType.S_TO_U_1,
    'vpabsd': RuleType.S_TO_U_1,
    'vpabsw': RuleType.S_TO_U_1,
    'vpackusdw': RuleType.E_TO_S_34,
    'vpackuswb': RuleType.E_TO_S_34,
    'vpextrb': RuleType.S_TO_U_12,
    'vpextrd': RuleType.S_TO_U_12,
    'vpextrq': RuleType.S_TO_U_12,
    'vpextrw': RuleType.S_TO_U_12,
    'vpmovzxbd': RuleType.S_TO_U,
    'vpmovzxbq': RuleType.S_TO_U,
    'vpmovzxbw': RuleType.S_TO_U,
    'vpmovzxdq': RuleType.S_TO_U,
    'vpmovzxwd': RuleType.S_TO_U,
    'vpmovzxwq': RuleType.S_TO_U,
    'vpshufd': RuleType.S_TO_U,
    'vpshufhw': RuleType.S_TO_U,
    'vpshuflw': RuleType.S_TO_U,
    'vpsllvd': RuleType.S_TO_U,
    'vpsllvq': RuleType.S_TO_U,
    'vpsrad': RuleType.U_TO_S_NOT_LAST,
    'vpsravd': RuleType.U_TO_S_NOT_LAST,
    'vpsraw': RuleType.U_TO_S_NOT_LAST,
    'vpsrlvd': RuleType.S_TO_U,
    'vpsrlvq': RuleType.S_TO_U,
}
for mnemonic in ['adc', 'add', 'and', 'cmp', 'cmpxchg', 'crc32', 'dec', 'idiv', 'imul', 'inc', 'maskmovq', 'mov', 'movsx', 'neg', 'not', 'or', 'paddq', 'punpcklbw', 'punpckldq', 'punpcklwd', 'sbb', 'sub', 'test', 'xadd', 'xchg', 'xor',
                 'jknzd', 'jkzd', 'vbroadcasti32x4', 'vbroadcasti64x4', 'vmovd', 'vmovdqa32', 'vmovdqa64', 'vmovntdq', 'vmovntdqa', 'vmovq', 'vpaddb', 'vpaddd', 'vpaddq', 'vpaddw', 'vpblendvb', 'vpbroadcastd', 'vpbroadcastq',
                 'vpcmpeqd', 'vpcmpgtb', 'vpcmpgtw', 'vpgatherdd', 'vpgatherdq', 'vpgatherqd', 'vpgatherqq', 'vpmulhw', 'vpmulld', 'vpmullw', 'vpscatterdd', 'vpscatterdq', 'vpsubb', 'vpsubd', 'vpsubq', 'vpsubw' 'vptestmd']:
    rules[mnemonic] = RuleType.U_TO_S
for mnemonic in ['valignd', 'vpermd', 'vpdpbusd', 'vpdpbusds', 'vpdpwssd', 'vpdpwssds']:
    rules[mnemonic] = None

print(str(list(rules.keys())))

with open('../Data/instructions.json', 'r') as f:
    db = json.load(f)

for insn in db:
    mnemonic = insn['mnemonic']
    if mnemonic not in rules:
        continue
    prefix = insn.get('encoding', '').upper()
    #if prefix in ['VEX', 'EVEX', 'MVEX', 'XOP']:
    #    continue

    rule = rules[mnemonic]
    if rule == RuleType.E_TO_S_34 and prefix != 'EVEX':
        continue
    last_visible = len(get_operands(insn)) - 1
    for i, op in enumerate(get_operands(insn, True)):
        if not op.get('visible', True) and rule != RuleType.S_TO_U_FORCE:
            continue
        if op.get('operand_type', '') == 'mask':
            continue

        if rule in [RuleType.U_TO_S, RuleType.U_TO_S_NOT_LAST, RuleType.C_TO_S1U2, RuleType.E_TO_S_34]:
            convert = not any([
                rule == RuleType.C_TO_S1U2 and i != 0,
                rule == RuleType.E_TO_S_34 and i not in [2, 3],
                rule == RuleType.U_TO_S_NOT_LAST and i == last_visible,
            ])
            if convert:
                if 'element_type' in op and op['element_type'].startswith('uint'):
                    op['element_type'] = op['element_type'][1:]
                if 'encoding' in op and op['encoding'].startswith('uimm'):
                    op['encoding'] = 's' + op['encoding'][1:]

        if rule in [RuleType.S_TO_U, RuleType.S_TO_U_FORCE, RuleType.S_TO_U_12, RuleType.S_TO_U_1, RuleType.S_TO_U_2, RuleType.S_TO_U_3, RuleType.C_TO_S1U2, RuleType.U_TO_S_NOT_LAST]:
            convert = not any([
                rule == RuleType.S_TO_U_12 and i > 1,
                rule == RuleType.S_TO_U_1 and i != 0,
                rule in [RuleType.S_TO_U_2, RuleType.C_TO_S1U2] and i != 1,
                rule == RuleType.S_TO_U_3 and i != 2,
                rule == RuleType.U_TO_S_NOT_LAST and i != last_visible,
            ])
            if convert:
                if 'element_type' in op and op['element_type'].startswith('int'):
                    op['element_type'] = 'u' + op['element_type']
                if 'encoding' in op and op['encoding'].startswith('simm'):
                    op['encoding'] = 'u' + op['encoding'][1:]

                if rule != RuleType.S_TO_U and 'element_type' not in op:
                    op['element_type'] = 'uint'

        if mnemonic in ['valignd', 'vpermd'] and op.get('operand_type', '') == 'mem' and op['element_type'] == 'int32':
            op['element_type'] = 'float32'
        if mnemonic in ['vpdpbusd', 'vpdpbusds']:
            if prefix == 'VEX':
                if i == 1:
                    op['element_type'] = 'uint8'
                elif i == 2:
                    op['element_type'] = 'int8'
            elif prefix == 'EVEX':
                if i == 2:
                    op['element_type'] = 'uint8'
                elif i == 3:
                    if op.get('operand_type', '') == 'mem':
                        op['element_type'] = 'int32'
                    else:
                        op['element_type'] = 'int8'
        if mnemonic in ['vpdpwssd', 'vpdpwssds']:
            if prefix == 'VEX' and i in [1, 2]:
                    op['element_type'] = 'int16'
            elif prefix == 'EVEX':
                if i == 2:
                    op['element_type'] = 'int16'
                elif i == 3:
                    if op.get('operand_type', '') == 'mem':
                        op['element_type'] = 'int32'
                    else:
                        op['element_type'] = 'int16'

with open('../Data/instructions_fixed.json', 'w') as f:
    f.write(json.dumps(db, indent=2).replace('"mask_flags": []', '"mask_flags": [\n      ]'))
    f.write('\n')

Copy link
Member

@flobernd flobernd left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I briefly swiped through the changes and it looks good so far 😃

@athre0z athre0z merged commit 16c114f into zyantific:master May 28, 2022
@mappzor mappzor deleted the signedness branch July 18, 2022 13:14
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants