def get_argument_string(function_shortcut, first=''):
    """Turn the bracketed argument list of an intrinsic shortcut into a call string.

    ``function_shortcut`` has the form ``name[arg, arg, ...]`` and must end
    with the closing bracket. Arguments that are one of the digits ``0``-``5``
    become ``str.format`` placeholders (``{0}``, ``{1}``, ...); every other
    non-empty argument is copied verbatim. Empty arguments are skipped.

    Args:
        function_shortcut: Shortcut such as ``'add[0, 1]'``.
        first: Optional expression placed before all other arguments,
            separated from them by ``', '``.

    Returns:
        The parenthesised argument string, e.g. ``'({0},{1})'`` or
        ``'(pred, {0})'``. An empty argument list yields ``'()'`` (or
        ``'(first)'`` when ``first`` is given).

    Raises:
        ValueError: If ``function_shortcut`` contains no ``'['``.
    """
    # Everything between the first '[' and the final character (the ']').
    raw_args = function_shortcut[function_shortcut.index('[') + 1: -1]

    formatted = []
    for arg in raw_args.split(","):
        arg = arg.strip()
        if not arg:
            continue
        if arg in ('0', '1', '2', '3', '4', '5'):
            formatted.append("{" + arg + "}")
        else:
            formatted.append(arg)

    # Joining avoids blindly chopping the last character, which used to eat
    # the opening parenthesis when the argument list was empty.
    joined = ",".join(formatted)
    if first:
        joined = first + ', ' + joined if joined else first
    return "(" + joined + ")"

16 

17 

def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
    """Build the table of C intrinsic format strings for an ARM vector instruction set.

    Args:
        data_type: Element type of the vectors; one of ``'double'``,
            ``'float'`` or ``'int'``.
        instruction_set: ``'neon'`` (128-bit vectors) or ``'sve<N>'`` where
            ``<N>`` is the fixed vector width in bits, e.g. ``'sve256'``.

    Returns:
        A dict mapping operation identifiers (``'+'``, ``'loadU'``, ``'=='``,
        ``'blendv'``, ...) to ``str.format`` templates whose positional
        placeholders ``{0}``, ``{1}``, ... stand for the operands. It also
        contains the entries ``'bytes'``, ``'width'``, ``'intwidth'``,
        ``'headers'``, the vector type names, ``'compile_flags'`` (SVE only)
        and, when the bit width is a power of two, ``'cachelineSize'`` and
        ``'cachelineZero'``.

    Raises:
        NotImplementedError: For instruction sets other than ``'neon'`` and
            ``'sve<N>'``, including the bare (sizeless) ``'sve'``.
        ValueError: If the text following ``'sve'`` is not an integer.
        KeyError: If ``data_type`` is not one of the supported names.
    """
    if instruction_set != 'neon' and not instruction_set.startswith('sve'):
        raise NotImplementedError(instruction_set)
    if instruction_set == 'sve':
        raise NotImplementedError("sizeless SVE is not implemented")

    is_sve = instruction_set.startswith('sve')
    if is_sve:
        cmp = 'cmp'
        bitwidth = int(instruction_set[3:])
    else:
        cmp = 'c'
        bitwidth = 128

    # Shortcut syntax: intrinsic stem followed by the operand indices in
    # brackets; expanded by get_argument_string below.
    base_names = {
        '+': 'add[0, 1]',
        '-': 'sub[0, 1]',
        '*': 'mul[0, 1]',
        '/': 'div[0, 1]',
        'sqrt': 'sqrt[0]',

        'loadU': 'ld1[0]',
        'loadA': 'ld1[0]',
        'storeU': 'st1[0, 1]',
        'storeA': 'st1[0, 1]',

        'abs': 'abs[0]',
        '==': f'{cmp}eq[0, 1]',
        # Must be the not-equal comparison: on SVE this entry is used as is
        # (svcmpne_*). On NEON it is replaced further down.
        '!=': f'{cmp}ne[0, 1]',
        '<=': f'{cmp}le[0, 1]',
        '<': f'{cmp}lt[0, 1]',
        '>=': f'{cmp}ge[0, 1]',
        '>': f'{cmp}gt[0, 1]',
    }

    bits = {'double': 64,
            'float': 32,
            'int': 32}
    elem_bits = bits[data_type]
    int_bits = bits['int']

    width = bitwidth // elem_bits
    intwidth = bitwidth // int_bits
    if is_sve:
        prefix = 'sv'
        suffix = f'_f{elem_bits}'
    else:
        prefix = 'v'
        suffix = f'q_f{elem_bits}'

    result = {}
    result['bytes'] = bitwidth // 8

    # Predicate expressions covering all lanes; only inserted into SVE strings.
    predicate = f'{prefix}whilelt_b{elem_bits}(0, {width})'
    int_predicate = f'{prefix}whilelt_b{int_bits}(0, {intwidth})'

    for intrinsic_id, function_shortcut in base_names.items():
        function_shortcut = function_shortcut.strip()
        name = function_shortcut[:function_shortcut.index('[')]

        # SVE intrinsics take the governing predicate as their first argument.
        arg_string = get_argument_string(function_shortcut, first=predicate if is_sve else '')
        # SVE arithmetic gets the "_x" suffix; loads, stores and comparisons
        # are emitted without it.
        if is_sve and not name.startswith(('ld', 'st', cmp)):
            undef = '_x'
        else:
            undef = ''

        result[intrinsic_id] = prefix + name + suffix + undef + arg_string

    result['width'] = width
    result['intwidth'] = intwidth

    if is_sve:
        result['makeVecConst'] = f'svdup_f{elem_bits}' + '({0})'
        result['makeVecConstInt'] = f'svdup_s{int_bits}' + '({0})'
        result['makeVecIndex'] = f'svindex_s{int_bits}' + '({0}, {1})'

        # Index vector 0, s, 2s, ... with the stride filled in per use.
        vindex = f'svindex_u{elem_bits}(0, {{0}})'
        result['scatter'] = (f'svst1_scatter_u{elem_bits}index_f{elem_bits}({predicate}, {{0}}, '
                             + vindex.format("{2}") + ', {1})')
        result['gather'] = (f'svld1_gather_u{elem_bits}index_f{elem_bits}({predicate}, {{0}}, '
                            + vindex.format("{1}") + ')')

        result['+int'] = f"svadd_s{int_bits}_x({int_predicate}, " + "{0}, {1})"

        # NOTE(review): the "*_st" names are presumably fixed-size typedefs
        # provided by arm_neon_helpers.h - confirm against that header.
        result['float'] = 'svfloat32_st'
        result['double'] = 'svfloat64_st'
        result['int'] = f'svint{int_bits}_st'
        result['bool'] = 'svbool_st'

        result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"']

        result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
        result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
        result['blendv'] = f'svsel_f{elem_bits}' + '({2}, {1}, {0})'
        result['any'] = f'svptest_any({predicate}, {{0}})'
        result['all'] = f'svcntp_b{elem_bits}({predicate}, {{0}}) == {width}'

        # Masked variants reuse the unmasked strings with the all-lanes
        # predicate swapped for an extra mask operand.
        result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
        result['maskStoreA'] = result['storeA'].replace(predicate, '{2}')
        result['maskScatter'] = result['scatter'].replace(predicate, '{3}')

        result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
    else:
        result['makeVecConst'] = f'vdupq_n_f{elem_bits}' + '({0})'
        result['makeVec'] = (f'makeVec_f{elem_bits}' + '('
                             + ", ".join(['{' + str(i) + '}' for i in range(width)]) + ')')
        result['makeVecConstInt'] = f'vdupq_n_s{int_bits}' + '({0})'
        result['makeVecInt'] = f'makeVec_s{int_bits}' + '({0}, {1}, {2}, {3})'

        result['+int'] = f"vaddq_s{int_bits}" + "({0}, {1})"

        result[data_type] = f'float{elem_bits}x{width}_t'
        result['int'] = f'int{int_bits}x{intwidth}_t'
        result['bool'] = f'uint{elem_bits}x{width}_t'

        result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']

        # '!=' is expressed as the bitwise negation of the '==' mask.
        # vmvnq is only defined for 8/16/32-bit lanes, so a 64-bit mask is
        # negated through a reinterpret to 32-bit lanes and back.
        if elem_bits == 64:
            result['!='] = f'vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64({result["=="]})))'
        else:
            result['!='] = f'vmvnq_u{elem_bits}({result["=="]})'

        result['&'] = f'vandq_u{elem_bits}' + '({0}, {1})'
        result['|'] = f'vorrq_u{elem_bits}' + '({0}, {1})'
        result['blendv'] = f'vbslq_f{elem_bits}' + '({2}, {1}, {0})'
        # Sum all 16 mask bytes: non-zero means some lane is set, 16*0xff
        # means every byte (hence every lane) is set.
        result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{elem_bits}({{0}})) > 0'
        result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{elem_bits}({{0}})) == 16*0xff'

    if bitwidth & (bitwidth - 1) == 0:
        # only power-of-2 vector sizes will evenly divide a cacheline
        result['cachelineSize'] = 'cachelineSize()'
        result['cachelineZero'] = 'cachelineZero((void*) {0})'

    return result