def get_argument_string(intrinsic_id, width, function_shortcut):
    """Build the parenthesised argument template of a vector intrinsic call.

    The returned string is meant to be completed later with ``str.format``:
    ``{0}``, ``{1}``, ... stand for the operands of the operation.

    Args:
        intrinsic_id: Key identifying the operation (e.g. ``'+'``, ``'makeVec'``).
            The ``makeVec*`` ids get dedicated templates; every other id is
            derived from ``function_shortcut``.
        width: Number of entries generated for the ``makeVec*`` templates.
            Ignored for all other ids.
        function_shortcut: String of the form ``name[arg, arg, ...]``. The
            entries ``'0'`` .. ``'5'`` become format placeholders, anything
            else (e.g. a comparison constant) is copied verbatim.

    Returns:
        The argument list including the surrounding parentheses, with
        entries separated by ``,`` and no spaces.

    Raises:
        ValueError: If the generic branch is taken and ``function_shortcut``
            contains no ``'['``.
    """
    if intrinsic_id in ('makeVecConst', 'makeVecConstInt'):
        # Broadcast: the single operand is repeated for every entry.
        params = ['{0}'] * width
    elif intrinsic_id in ('makeVec', 'makeVecInt'):
        # Operands are emitted from the highest index down to 0.
        params = [f"{{{i}}}" for i in reversed(range(width))]
    elif intrinsic_id == 'makeVecBool':
        # The operand is wrapped in its own parentheses (as in the
        # makeVecConstBool template) so the generated expression is balanced.
        params = [f"(({{{i}}}) ? -1.0 : 0.0)" for i in reversed(range(width))]
    elif intrinsic_id == 'makeVecConstBool':
        params = ["(({0}) ? -1.0 : 0.0)"] * width
    else:
        # Everything between '[' and the final character (expected to be ']').
        inner = function_shortcut[function_shortcut.index('[') + 1:-1]
        params = []
        for arg in inner.split(","):
            arg = arg.strip()
            if not arg:
                continue
            if arg in ('0', '1', '2', '3', '4', '5'):
                params.append("{" + arg + "}")
            else:
                params.append(arg)

    # Joining (instead of stripping a trailing comma) keeps an empty
    # argument list well-formed: "()" rather than ")".
    return "(" + ",".join(params) + ")"

26 

27 

def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    """Assemble the table of x86 vector intrinsics for one data type / ISA pair.

    Args:
        data_type: ``'double'``, ``'float'`` or ``'int'``.
        instruction_set: ``'sse'``, ``'avx'`` or ``'avx512'``.

    Returns:
        A dict whose values are mostly ``str.format`` templates of C
        expressions (``{0}``, ``{1}``, ... are the operands). Besides one
        entry per operation (``'+'``, ``'sqrt'``, ``'loadU'``, ``'=='``, ...)
        it contains:

        * ``'width'`` / ``'intwidth'``: lane counts for ``data_type`` / int,
        * ``'bytes'``: ``4 *`` the float lane count,
        * ``'double'``, ``'float'``, ``'int'``, ``'bool'``: C vector type names,
        * ``'dataTypePrefix'``: per-type prefix strings,
        * ``'headers'``: list of header names for the instruction set,
        * ``'any'``, ``'all'``, ``'+int'``, ``'streamFence'``,
        * for ``'avx512'`` additionally ``'rsqrt'``, ``'abs'``, ``'scatter'``,
          ``'maskScatter'``, ``'gather'`` and mask-register based overrides of
          ``'&'``, ``'|'``, ``'any'``, ``'all'``, ``'blendv'``, ``'bool'``,
          ``'makeVecBool'`` and ``'makeVecConstBool'``,
        * ``'rsqrt'`` for the ``'avx'`` / ``'float'`` combination.

    Raises:
        KeyError: If ``data_type`` or ``instruction_set`` is not one of the
            values listed above.
    """
    # Comparison operator -> predicate constant passed as third cmp argument.
    comparisons = {
        '==': '_CMP_EQ_UQ',
        '!=': '_CMP_NEQ_UQ',
        '>=': '_CMP_GE_OQ',
        '<=': '_CMP_LE_OQ',
        '<': '_CMP_NGE_UQ',
        '>': '_CMP_NLE_UQ',
    }
    # Operation -> "name[args]" shortcut; the bracket part is expanded by
    # get_argument_string, the name part becomes the middle of the intrinsic.
    base_names = {
        '+': 'add[0, 1]',
        '-': 'sub[0, 1]',
        '*': 'mul[0, 1]',
        '/': 'div[0, 1]',
        '&': 'and[0, 1]',
        '|': 'or[0, 1]',
        'blendv': 'blendv[0, 1, 2]',

        'sqrt': 'sqrt[0]',

        'makeVecConst': 'set[]',
        'makeVec': 'set[]',
        'makeVecBool': 'set[]',
        'makeVecConstBool': 'set[]',
        'makeVecInt': 'set[]',
        'makeVecConstInt': 'set[]',

        'loadU': 'loadu[0]',
        'loadA': 'load[0]',
        'storeU': 'storeu[0,1]',
        'storeA': 'store[0,1]',
        'stream': 'stream[0,1]',
        'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
        'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
    }

    for comparison_op, constant in comparisons.items():
        base_names[comparison_op] = f'cmp[0, 1, {constant}]'

    headers = {
        'avx512': ['<immintrin.h>'],
        'avx': ['<immintrin.h>'],
        'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
                '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>'],
    }

    suffix = {
        'double': 'pd',
        'float': 'ps',
        'int': 'epi32',
    }
    prefix = {
        'sse': '_mm',
        'avx': '_mm256',
        'avx512': '_mm512',
    }

    # Number of lanes per (data type, instruction set).
    width = {
        ("double", "sse"): 2,
        ("float", "sse"): 4,
        ("int", "sse"): 4,
        ("double", "avx"): 4,
        ("float", "avx"): 8,
        ("int", "avx"): 8,
        ("double", "avx512"): 8,
        ("float", "avx512"): 16,
        ("int", "avx512"): 16,
    }

    result = {
        'width': width[(data_type, instruction_set)],
        'intwidth': width[('int', instruction_set)],
        'bytes': 4 * width[("float", instruction_set)],
    }
    pre = prefix[instruction_set]
    # Suffix of the requested data type; named explicitly so the code after
    # the loop does not depend on whichever entry the loop visited last.
    data_suf = suffix[data_type]

    for intrinsic_id, function_shortcut in base_names.items():
        function_shortcut = function_shortcut.strip()
        name = function_shortcut[:function_shortcut.index('[')]

        # Ids containing 'Int' always operate on integer vectors.
        if 'Int' in intrinsic_id:
            suf = suffix['int']
            lanes = result['intwidth']
        else:
            suf = data_suf
            lanes = result['width']
        arg_string = get_argument_string(intrinsic_id, lanes, function_shortcut)

        # On avx512 the comparison intrinsics carry an extra "_mask" suffix.
        mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons else ''
        result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string

    result['dataTypePrefix'] = {
        'double': "_" + pre + 'd',
        'float': "_" + pre,
    }

    bit_width = result['width'] * (64 if data_type == 'double' else 32)
    result['double'] = f"__m{bit_width}d"
    result['float'] = f"__m{bit_width}"
    result['int'] = f"__m{bit_width}i"
    result['bool'] = result[data_type]

    result['headers'] = headers[instruction_set]
    result['any'] = f"{pre}_movemask_{data_suf}({{0}}) > 0"
    result['all'] = f"{pre}_movemask_{data_suf}({{0}}) == {hex(2**result['width']-1)}"

    if instruction_set == 'avx512':
        size = result['width']
        element_bits = bit_width // size
        # 64 // size equals the element size in bytes here:
        # 8 lanes -> 8, 16 lanes -> 4.
        scale = 64 // size
        mask_type = f"__mmask{size}"

        result['&'] = f'_kand_mask{size}({{0}}, {{1}})'
        result['|'] = f'_kor_mask{size}({{0}}, {{1}})'
        result['any'] = f'!_ktestz_mask{size}_u8({{0}}, {{0}})'
        result['all'] = f'_kortestc_mask{size}_u8({{0}}, {{0}})'
        result['blendv'] = f'{pre}_mask_blend_{data_suf}({{2}}, {{0}}, {{1}})'
        result['rsqrt'] = f"{pre}_rsqrt14_{data_suf}({{0}})"
        result['abs'] = f"{pre}_abs_{data_suf}({{0}})"
        result['bool'] = mask_type

        # One mask bit per lane. Lane count and mask type follow `size`, so
        # 16-lane vectors get all 16 bits and the type matches result['bool'].
        params = " | ".join(f"({{{i}}} ? {2 ** i} : 0)" for i in range(size))
        result['makeVecBool'] = f"{mask_type}(({params}) )"
        params = " | ".join(f"({{0}} ? {2 ** i} : 0)" for i in range(size))
        result['makeVecConstBool'] = f"{mask_type}(({params}) )"

        # Index vector (size-1, ..., 1, 0) multiplied by a broadcast operand;
        # the remaining {0} is re-targeted below via vindex.format(...).
        lane_indices = ', '.join(str(i) for i in reversed(range(size)))
        vindex = f'{pre}_set_epi{element_bits}({lane_indices})'
        vindex = f'{pre}_mullo_epi{element_bits}({vindex}, {pre}_set1_epi{element_bits}({{0}}))'
        result['scatter'] = (f'{pre}_i{element_bits}scatter_{data_suf}({{0}}, ' + vindex.format("{2}")
                             + f', {{1}}, {scale})')
        result['maskScatter'] = (f'{pre}_mask_i{element_bits}scatter_{data_suf}({{0}}, {{3}}, '
                                 + vindex.format("{2}") + f', {{1}}, {scale})')
        result['gather'] = (f'{pre}_i{element_bits}gather_{data_suf}(' + vindex.format("{1}")
                            + f', {{0}}, {scale})')

    if instruction_set == 'avx' and data_type == 'float':
        result['rsqrt'] = f"{pre}_rsqrt_{data_suf}({{0}})"

    result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"

    result['streamFence'] = '_mm_mfence()'

    return result