1import warnings 

2import fcntl 

3from collections import defaultdict 

4from tempfile import TemporaryDirectory 

5import textwrap 

6import itertools 

7import string 

8 

9from jinja2 import Environment, PackageLoader, StrictUndefined, Template 

10import sympy as sp 

11from kerncraft.kerncraft import KernelCode 

12from kerncraft.kernel import symbol_pos_int 

13from kerncraft.machinemodel import MachineModel 

14 

15from pystencils.astnodes import \ 

16 KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment 

17from pystencils.backends.cbackend import generate_c, get_headers 

18from pystencils.field import get_layout_from_strides 

19from pystencils.sympyextensions import count_operations_in_ast 

20from pystencils.transformations import filtered_tree_iteration 

21from pystencils.utils import DotDict 

22from pystencils.cpu.kernelcreation import add_openmp 

23from pystencils.data_types import get_base_type 

24from pystencils.sympyextensions import prod 

25 

26 

27class PyStencilsKerncraftKernel(KernelCode): 

28 """ 

29 Implementation of kerncraft's kernel interface for pystencils CPU kernels. 

30 Analyses a list of equations assuming they will be executed on a CPU 

31 """ 

32 LIKWID_BASE = '/usr/local/likwid' 

33 

34 def __init__(self, ast: KernelFunction, machine: MachineModel, 

35 assumed_layout='SoA', debug_print=False, filename=None): 

36 """Create a kerncraft kernel using a pystencils AST 

37 

38 Args: 

39 ast: pystencils ast 

40 machine: kerncraft machine model - specify this if kernel needs to be compiled 

41 assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index 

42 coordinates is not known. In this case either a structures of array (SoA) or 

43 array of structures (AoS) layout is assumed 

44 debug_print: print debug information 

45 filename: used for caching 

46 """ 

47 super(KernelCode, self).__init__(machine=machine) 

48 

49 # Initialize state 

50 self.asm_block = None 

51 self._filename = filename 

52 self._keep_intermediates = False 

53 

54 self.kernel_ast = ast 

55 self.temporary_dir = TemporaryDirectory() 

56 self._keep_intermediates = debug_print 

57 

58 # Loops 

59 inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment) 

60 if l.is_innermost_loop] 

61 if len(inner_loops) == 0: 

62 raise ValueError("No loop found in pystencils AST") 

63 else: 

64 if len(inner_loops) > 1: 

65 warnings.warn("pystencils AST contains multiple inner loops. " 

66 "Only one can be analyzed - choosing first one") 

67 inner_loop = inner_loops[0] 

68 

69 self._loop_stack = [] 

70 cur_node = inner_loop 

71 while cur_node is not None: 

72 if isinstance(cur_node, LoopOverCoordinate): 

73 loop_counter_sym = cur_node.loop_counter_symbol 

74 loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1) 

75 # If the correct step were to be provided, all access within that step length will 

76 # also need to be passed to kerncraft: cur_node.step) 

77 self._loop_stack.append(loop_info) 

78 cur_node = cur_node.parent 

79 self._loop_stack = list(reversed(self._loop_stack)) 

80 

81 def get_layout_tuple(f): 

82 if f.has_fixed_shape: 

83 return get_layout_from_strides(f.strides) 

84 else: 

85 layout_list = list(f.layout) 

86 for _ in range(f.index_dimensions): 

87 layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1) 

88 return layout_list 

89 

90 # Variables (arrays) and Constants (scalar sizes) 

91 const_names_iter = itertools.product(string.ascii_uppercase, repeat=1) 

92 constants_reversed = {} 

93 fields_accessed = self.kernel_ast.fields_accessed 

94 for field in fields_accessed: 

95 layout = get_layout_tuple(field) 

96 permuted_shape = list(field.shape[i] for i in layout) 

97 # Replace shape dimensions with constant variables (necessary for layer condition 

98 # analysis) 

99 for i, d in enumerate(permuted_shape): 

100 if d not in self.constants.values(): 

101 const_symbol = symbol_pos_int(''.join(next(const_names_iter))) 

102 self.set_constant(const_symbol, d) 

103 constants_reversed[d] = const_symbol 

104 permuted_shape[i] = constants_reversed[d] 

105 self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape)) 

106 

107 # Data sources & destinations 

108 self.sources = defaultdict(list) 

109 self.destinations = defaultdict(list) 

110 

111 reads, writes = search_resolved_field_accesses_in_ast(inner_loop) 

112 for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]: 

113 for fa in accesses: 

114 coord = [symbol_pos_int(LoopOverCoordinate.get_loop_counter_name(i)) + off 

115 for i, off in enumerate(fa.offsets)] 

116 coord += list(fa.idx_coordinate_values) 

117 layout = get_layout_tuple(fa.field) 

118 permuted_coord = [sp.sympify(coord[i]) for i in layout] 

119 target_dict[fa.field.name].append(permuted_coord) 

120 

121 # data type 

122 self.datatype = list(self.variables.values())[0][0] 

123 

124 # flops 

125 operation_count = count_operations_in_ast(inner_loop) 

126 self._flops = { 

127 '+': operation_count['adds'], 

128 '*': operation_count['muls'], 

129 '/': operation_count['divs'], 

130 } 

131 for k in [k for k, v in self._flops.items() if v == 0]: 

132 del self._flops[k] 

133 self.check() 

134 

135 if debug_print: 

136 from pprint import pprint 

137 print("----------------------------- Loop Stack --------------------------") 

138 pprint(self._loop_stack) 

139 print("----------------------------- Sources -----------------------------") 

140 pprint(self.sources) 

141 print("----------------------------- Destinations ------------------------") 

142 pprint(self.destinations) 

143 print("----------------------------- FLOPS -------------------------------") 

144 pprint(self._flops) 

145 

146 def get_kernel_header(self, name='pystencils_kernel'): 

147 file_name = "pystencils_kernel.h" 

148 file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False) 

149 lock_mode, lock_fp = self.lock_intermediate(file_path) 

150 

151 if lock_mode == fcntl.LOCK_SH: 

152 # use cache 

153 pass 

154 else: # lock_mode == fcntl.LOCK_EX: 

155 function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True) 

156 

157 jinja_context = { 

158 'function_signature': function_signature, 

159 } 

160 

161 env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) 

162 file_header = env.get_template('kernel.h').render(**jinja_context) 

163 with open(file_path, 'w') as f: 

164 f.write(file_header) 

165 

166 self.release_exclusive_lock(lock_fp) # degrade to shared lock 

167 return file_path, lock_fp 

168 

169 def get_kernel_code(self, openmp=False, name='pystencils_kernl'): 

170 """ 

171 Generate and return compilable source code from AST. 

172 

173 Args: 

174 openmp: if true, openmp code will be generated 

175 name: kernel name 

176 """ 

177 filename = 'pystencils_kernl' 

178 if openmp: 

179 filename += '-omp' 

180 filename += '.c' 

181 file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False) 

182 lock_mode, lock_fp = self.lock_intermediate(file_path) 

183 

184 if lock_mode == fcntl.LOCK_SH: 

185 # use cache 

186 with open(file_path) as f: 

187 code = f.read() 

188 else: # lock_mode == fcntl.LOCK_EX: 

189 header_list = get_headers(self.kernel_ast) 

190 includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) 

191 

192 if openmp: 

193 add_openmp(self.kernel_ast) 

194 

195 kernel_code = generate_c(self.kernel_ast, dialect='c') 

196 

197 jinja_context = { 

198 'includes': includes, 

199 'kernel_code': kernel_code, 

200 } 

201 

202 env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) 

203 code = env.get_template('kernel.c').render(**jinja_context) 

204 with open(file_path, 'w') as f: 

205 f.write(code) 

206 

207 self.release_exclusive_lock(lock_fp) # degrade to shared lock 

208 return file_path, lock_fp 

209 

210 CODE_TEMPLATE = Template(textwrap.dedent(""" 

211 #include <likwid.h> 

212 #include <stdlib.h> 

213 #include <stdint.h> 

214 #include <stdbool.h> 

215 #include <math.h> 

216 #include "kerncraft.h" 

217 #include "kernel.h" 

218 

219 #define RESTRICT __restrict__ 

220 #define FUNC_PREFIX 

221 void dummy(void *); 

222 extern int var_false; 

223 

224 int main(int argc, char **argv) { 

225 {%- for constantName, dataType in constants %} 

226 // Constant {{constantName}} 

227 {{dataType}} {{constantName}}; 

228 {{constantName}} = 0.23; 

229 {%- endfor %} 

230 

231 // Declaring arrays 

232 {%- for field_name, dataType, size in fields %} 

233 

234 // Initialization {{field_name}} 

235 double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); 

236 // TODO initialize in parallel context in same order as they are touched 

237 for (unsigned long long i = 0; i < {{size}}; ++i) 

238 {{field_name}}[i] = 0.23; 

239 {%- endfor %} 

240 

241 likwid_markerInit(); 

242 #pragma omp parallel 

243 { 

244 likwid_markerRegisterRegion("loop"); 

245 #pragma omp barrier 

246 

247 // Initializing arrays in same order as touched in kernel loop nest 

248 //INIT_ARRAYS; 

249 

250 // Dummy call 

251 {%- for field_name, dataType, size in fields %} 

252 if(var_false) dummy({{field_name}}); 

253 {%- endfor %} 

254 {%- for constantName, dataType in constants %} 

255 if(var_false) dummy(&{{constantName}}); 

256 {%- endfor %} 

257 

258 for(int warmup = 1; warmup >= 0; --warmup) { 

259 int repeat = 2; 

260 if(warmup == 0) { 

261 repeat = atoi(argv[1]); 

262 likwid_markerStartRegion("loop"); 

263 } 

264 

265 for(; repeat > 0; --repeat) { 

266 {{kernelName}}({{call_argument_list}}); 

267 

268 {%- for field_name, dataType, size in fields %} 

269 if(var_false) dummy({{field_name}}); 

270 {%- endfor %} 

271 {%- for constantName, dataType in constants %} 

272 if(var_false) dummy(&{{constantName}}); 

273 {%- endfor %} 

274 } 

275 

276 } 

277 likwid_markerStopRegion("loop"); 

278 } 

279 likwid_markerClose(); 

280 return 0; 

281 } 

282 """)) 

283 

284 def get_main_code(self, kernel_function_name='kernel'): 

285 """ 

286 Generate and return compilable source code from AST. 

287 

288 :return: tuple of filename and shared lock file pointer 

289 """ 

290 # TODO produce nicer code, including help text and other "comfort features". 

291 assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \ 

292 "based on a kernel description rather than code." 

293 

294 file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False) 

295 lock_mode, lock_fp = self.lock_intermediate(file_path) 

296 

297 if lock_mode == fcntl.LOCK_SH: 

298 # use cache 

299 with open(file_path) as f: 

300 code = f.read() 

301 else: # lock_mode == fcntl.LOCK_EX 

302 # needs update 

303 accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed} 

304 constants = [] 

305 fields = [] 

306 call_parameters = [] 

307 for p in self.kernel_ast.get_parameters(): 

308 if not p.is_field_parameter: 

309 constants.append((p.symbol.name, str(p.symbol.dtype))) 

310 call_parameters.append(p.symbol.name) 

311 else: 

312 assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" 

313 field = accessed_fields[p.field_name] 

314 dtype = str(get_base_type(p.symbol.dtype)) 

315 fields.append((p.field_name, dtype, prod(field.shape))) 

316 call_parameters.append(p.field_name) 

317 

318 header_list = get_headers(self.kernel_ast) 

319 includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) 

320 

321 # Generate code 

322 code = self.CODE_TEMPLATE.render( 

323 kernelName=self.kernel_ast.function_name, 

324 fields=fields, 

325 constants=constants, 

326 call_agument_list=','.join(call_parameters), 

327 includes=includes) 

328 

329 # Store to file 

330 with open(file_path, 'w') as f: 

331 f.write(code) 

332 self.release_exclusive_lock(lock_fp) # degrade to shared lock 

333 

334 return file_path, lock_fp 

335 

336 

337class KerncraftParameters(DotDict): 

338 def __init__(self, **kwargs): 

339 super(KerncraftParameters, self).__init__() 

340 self['asm_block'] = 'auto' 

341 self['asm_increment'] = 0 

342 self['cores'] = 1 

343 self['cache_predictor'] = 'SIM' 

344 self['verbose'] = 0 

345 self['pointer_increment'] = 'auto' 

346 self['iterations'] = 10 

347 self['unit'] = 'cy/CL' 

348 self['ignore_warnings'] = True 

349 self['incore_model'] = 'OSACA' 

350 self.update(**kwargs) 

351 

352 

353# ------------------------------------------- Helper functions --------------------------------------------------------- 

354 

355 

356def search_resolved_field_accesses_in_ast(ast): 

357 def visit(node, reads, writes): 

358 if not isinstance(node, SympyAssignment): 

359 for a in node.args: 

360 visit(a, reads, writes) 

361 return 

362 

363 for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]: 

364 accesses.update(expr.atoms(ResolvedFieldAccess)) 

365 

366 read_accesses = set() 

367 write_accesses = set() 

368 visit(ast, read_accesses, write_accesses) 

369 return read_accesses, write_accesses