import subprocess
import warnings
import tempfile
from pathlib import Path

from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.integer_functions import modulo_ceil
from pystencils.sympyextensions import prod

import numpy as np


def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    """Return C code of a benchmark program for the given kernel.

    Args:
        ast: the pystencils AST object as returned by create_kernel
        likwid: if True, likwid markers are added to the code
        openmp: relevant only if likwid=True, to generate correct likwid initialization code
        timing: add timing output to the code, prints time per iteration to stdout

    Returns:
        C code as string
    """

    accessed_fields = {f.name: f for f in ast.fields_accessed}
    constants = []
    fields = []
    call_parameters = []
    for p in ast.get_parameters():
        if not p.is_field_parameter:
            constants.append((p.symbol.name, str(p.symbol.dtype)))
            call_parameters.append(p.symbol.name)
        else:
            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
            field = accessed_fields[p.field_name]
            dtype = str(get_base_type(p.symbol.dtype))
            np_dtype = get_base_type(p.symbol.dtype).numpy_dtype
            size_data_type = np_dtype.itemsize

            dim0_size = field.shape[-1]
            dim1_size = np.prod(field.shape[:-1])
            elements = prod(field.shape)

            if ast.instruction_set:
                # Pad each row to a multiple of the vector width and align
                # allocations to the vector register size
                align = ast.instruction_set['width'] * size_data_type
                padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size
                padding_bytes = padding_elements * size_data_type
                ghost_layers = max(max(ast.ghost_layers))

                size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type

                assert align % np_dtype.itemsize == 0
                # Byte offset so that the first element behind the ghost layers,
                # rather than the allocation start, lands on an aligned address
                offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type

                fields.append((p.field_name, dtype, elements, size, offset, align))
                call_parameters.append(p.field_name)
            else:
                size = elements * size_data_type
                fields.append((p.field_name, dtype, elements, size, 0, 0))
                call_parameters.append(p.field_name)
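
    # Worked example for the SIMD branch above (illustrative numbers, assuming a
    # 4-wide double-precision instruction set and one ghost layer): for
    # dim0_size = 34, modulo_ceil(34, 4) = 36, so padding_elements = 2,
    # padding_bytes = 16, align = 32 and offset = ((34 + 2 + 1) % 4) * 8 = 8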

    header_list = get_headers(ast)
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    # Strip "#pragma omp parallel" from within the kernel, because the main function
    # takes care of the parallel region when likwid and openmp are enabled
    if likwid and openmp:
        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
            ast.body.args[0].pragma_line = ''

    jinja_context = {
        'likwid': likwid,
        'openmp': openmp,
        'kernel_code': generate_c(ast, dialect='c'),
        'kernelName': ast.function_name,
        'fields': fields,
        'constants': constants,
        'call_argument_list': ",".join(call_parameters),
        'includes': includes,
        'timing': timing,
    }

    env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)

    return env.get_template('benchmark.c').render(**jinja_context)

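# A minimal usage sketch (hypothetical kernel: the fields, shapes and assignment
# below are illustrative and not part of this module; fixed-size fields are
# required, see the assertion above):
#
#   import pystencils as ps
#   src, dst = ps.fields('src, dst: double[30, 30]')
#   kernel_ast = ps.create_kernel([ps.Assignment(dst.center, src.center)])
#   print(generate_benchmark(kernel_ast, timing=True))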

def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
    """Runs the given kernel with an outer loop in C.

    Args:
        ast: pystencils AST which is used to compile the benchmark file
        inner_iterations: timings are recorded around this many iterations
        outer_iterations: number of timings recorded
        path: path where the benchmark file is stored. If None, a temporary folder is created

    Returns:
        list of times per iteration for each outer iteration
    """

    import kerncraft

    benchmark_code = generate_benchmark(ast, timing=True)

    if path is None:
        path = tempfile.mkdtemp()

    if isinstance(path, str):
        path = Path(path)

    with open(path / 'bench.c', 'w') as f:
        f.write(benchmark_code)

    kerncraft_path = Path(kerncraft.__file__).parent

    extra_flags = ['-I' + get_pystencils_include_path(),
                   '-I' + str(kerncraft_path / 'headers')]

    compiler_config = get_compiler_config()
    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
    compile_cmd += [*extra_flags,
                    str(kerncraft_path / 'headers' / 'timing.c'),
                    str(kerncraft_path / 'headers' / 'dummy.c'),
                    str(path / 'bench.c'),
                    '-o', str(path / 'bench'),
                    ]
    run_compile_step(compile_cmd)
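
    # The command assembled above resembles, e.g. (illustrative only; the actual
    # compiler binary and flags come from pystencils' compiler configuration):
    #   g++ -O3 -march=native -fopenmp -I<pystencils-include> -I<kerncraft>/headers \
    #       timing.c dummy.c bench.c -o bench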

    # Estimate the time per iteration with a short run first, to warn early
    # if the requested inner_iterations would exceed the time limit
    time_pre_estimation_per_iteration = float(subprocess.check_output([str(path / 'bench'), str(10)]))
    benchmark_time_limit = 20
    if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
        warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
                f"{benchmark_time_limit} seconds for this kernel")
        warnings.warn(warn)

    results = []
    for _ in range(outer_iterations):
        benchmark_time = float(subprocess.check_output([str(path / 'bench'), str(inner_iterations)]))
        results.append(benchmark_time)
    return results
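

# A minimal sketch of a complete benchmark run (hypothetical `kernel_ast` as in
# the sketch above; requires kerncraft and a C compiler configured for pystencils):
#
#   times = run_c_benchmark(kernel_ast, inner_iterations=1000, outer_iterations=3)
#   print(min(times))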