import abc
from functools import partial

import sympy as sp
from sympy.core.cache import cacheit

from pystencils.astnodes import Block, Conditional
from pystencils.data_types import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice
from pystencils.sympyextensions import is_integer_sequence, prod


class ThreadIndexingSymbol(TypedSymbol):
    def __new__(cls, *args, **kwds):
        obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds)
        return obj

    def __new_stage2__(cls, name, dtype, *args, **kwargs):
        obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs)
        return obj

    __xnew__ = staticmethod(__new_stage2__)
    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))


BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
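
# The symbols above are typed sympy symbols, so they compose into ordinary sympy expressions.
# A minimal sketch of the canonical CUDA global index along x (this mirrors what
# BlockIndexing.coordinates constructs below):
#
#     global_x = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]  # blockIdx.x*blockDim.x + threadIdx.x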


class AbstractIndexing(abc.ABC):
    """
    Abstract base class for all indexing classes. An indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system: it calculates indices based on CUDA's thread and
    block indices and computes the number of blocks and threads a kernel is started with.
    An indexing class is created with a pystencils field, a slice to iterate over, and further
    optional parameters that must have default values.
    """

    @property
    @abc.abstractmethod
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x, y, z) depending on the symbolic CUDA
        block and thread indices. These symbolic indices can be obtained via `index_variables`."""

    @property
    def index_variables(self):
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions."""
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM

    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determines the grid and block size for the kernel call.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array
        Returns:
            dict with keys 'block' and 'grid', each mapping to a tuple of (x, y, z) sizes
            the kernel should be launched with
        """

    @abc.abstractmethod
    def guard(self, kernel_content, arr_shape):
        """In some indexing schemes not all threads of a block execute the kernel content.

        This function can return a Conditional ast node, defining this execution guard.

        Args:
            kernel_content: the actual kernel contents, which can e.g. be put into the Conditional
                            node as its true block
            arr_shape: the numeric or symbolic shape of the field

        Returns:
            ast node, which is put inside the kernel function
        """

    @abc.abstractmethod
    def max_threads_per_block(self):
        """Returns the maximal number of threads per block for launch bounds, or None if this cannot
        be determined without knowing the array shape."""

    @abc.abstractmethod
    def symbolic_parameters(self):
        """Set of symbols required in the call_parameters code."""


# -------------------------------------------- Implementations --------------------------------------------------------


class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.

    Args:
        field: pystencils field (common to all indexing classes)
        iteration_slice: slice that defines the rectangular subarea that is iterated over
        block_size: size of a CUDA block, i.e. the number of threads per block in each dimension
        permute_block_size_dependent_on_layout: if True, the block_size is permuted such that the
                                                fastest coordinate gets the largest number of threads
        compile_time_block_size: compile in a concrete block size; otherwise the CUDA variable
                                 'blockDim' is used
        maximum_block_size: maximum CUDA block size per dimension, or 'auto' to query the device limits
    """

    def __init__(self, field, iteration_slice,
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
                 maximum_block_size=(1024, 1024, 64)):
        if field.spatial_dimensions > 3:
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)

        self._block_size = block_size
        if maximum_block_size == 'auto':
            # Get device limits
            import pycuda.driver as cuda
            # noinspection PyUnresolvedReferences
            import pycuda.autoinit  # NOQA
            da = cuda.device_attribute
            device = cuda.Context.get_device()
            maximum_block_size = tuple(device.get_attribute(a)
                                       for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))

        self._maximum_block_size = maximum_block_size
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size

    @property
    def coordinates(self):
        offsets = _get_start_from_slice(self._iterationSlice)
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
        coordinates = [block_index * bs + thread_idx + off
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]

        return coordinates[:self._dim]
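
    # With compile_time_block_size=False the property above uses the symbolic block dimension, so for
    # a 2D field with zero offsets the resulting sympy expressions read (hedged illustration):
    #     [blockIdx.x*blockDim.x + threadIdx.x, blockIdx.y*blockDim.y + threadIdx.y]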

    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}

        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
            assert len(block_size) == 3
            adapted_block_size = []
            for i in range(len(widths)):
                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
            block_size = tuple(adapted_block_size) + extend_bs

        block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
        grid = tuple(div_ceil(length, bs) for length, bs in zip(widths, block_size))
        extend_gr = (1,) * (3 - len(grid))

        return {'block': block_size,
                'grid': grid + extend_gr}
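
    # Minimal usage sketch (hedged): assumes a numpy-backed field so the spatial shape is numeric,
    # and Field.create_from_numpy_array as the pystencils constructor; plain slice objects stand in
    # for pystencils' make_slice here.
    #
    #     import numpy as np
    #     from pystencils import Field
    #     arr = np.zeros((64, 48))
    #     indexing = BlockIndexing(Field.create_from_numpy_array('f', arr),
    #                              (slice(None), slice(None)))
    #     indexing.call_parameters(arr.shape)  # -> {'block': (...), 'grid': (...)}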

    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
        conditions = [c < end
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])
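
    # The guard above wraps the kernel body in a Conditional so that threads past the iteration end
    # do nothing; sketched, the generated condition reads like
    #     blockIdx.x*blockDim.x + threadIdx.x < end_x  &  ...  (one comparison per dimension)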

    @staticmethod
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if too many registers are used per multiprocessor.

        This is not done automatically, since required_registers_per_thread is not known before
        compilation. It can be obtained via ``func.num_regs`` from a pycuda function.

        Returns:
            smaller block_size if too many registers are used, otherwise the unchanged block_size
        """
        import pycuda.driver as cuda
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA

        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)

        block = list(block_size)  # work on a mutable copy, since block_size may be a tuple

        while True:
            num_threads = 1
            for t in block:
                num_threads *= t
            required_registers_per_mt = num_threads * required_registers_per_thread
            if required_registers_per_mt <= available_registers_per_mp:
                return tuple(block)
            else:
                # halve the largest block dimension until the register limit is met
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2
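
    # Hedged usage sketch: register usage is only known after compilation, so the shrinking is done
    # afterwards. `mod` and "kernel" are assumed names for an already compiled pycuda module/kernel:
    #
    #     func = mod.get_function("kernel")
    #     block = BlockIndexing.limit_block_size_by_register_restriction((16, 16, 1), func.num_regs)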

    @staticmethod
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns a modified block_size such that the fastest coordinate gets the biggest block dimension."""
        if not is_integer_sequence(block_size):
            return block_size
        sorted_block_size = list(sorted(block_size, reverse=True))
        while len(sorted_block_size) > len(layout):
            # fold excess entries into the largest one, so the total thread count is preserved
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
        for l, bs in zip(reversed(layout), sorted_block_size):
            result[l] = bs
        return tuple(result[:len(layout)])
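
    # Worked example: for a C layout (0, 1) the fastest coordinate is 1. block_size=(32, 8, 1) is
    # sorted to [32, 8, 1], folded to [32, 8], and assigned fastest-first:
    #     BlockIndexing.permute_block_size_according_to_layout((32, 8, 1), (0, 1))  # -> (8, 32)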

    def max_threads_per_block(self):
        if is_integer_sequence(self._block_size):
            return prod(self._block_size)
        else:
            return None

    def symbolic_parameters(self):
        return set(b for b in self._block_size if isinstance(b, sp.Symbol))


class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a
    1D CUDA block. The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are
    mapped to blockIdx.{x,y,z}.
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not
    be larger than the maximum number of threads allowed in a CUDA block (which depends on the device).
    """

    def __init__(self, field, iteration_slice):
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatial_dimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        coordinates = available_indices[:field.spatial_dimensions]

        fastest_coordinate = field.layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]

        self._coordinates = coordinates
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]

    @property
    def coordinates(self):
        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]

    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}

        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)

        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self._coordinates:
                return 1
            else:
                idx = self._coordinates.index(cuda_idx)
                return widths[idx]

        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}

    def guard(self, kernel_content, arr_shape):
        # every thread computes exactly one element, so no execution guard is needed
        return kernel_content

    def max_threads_per_block(self):
        return None

    def symbolic_parameters(self):
        return set()
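
# Hedged sketch of the launch configuration LineIndexing produces: for a 2D C-layout field of shape
# (ny, nx), each line of nx contiguous elements maps to one block of nx threads, so call_parameters
# returns {'block': (nx, 1, 1), 'grid': (ny, 1, 1)} (up to symbolic shape substitution).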


# -------------------------------------- Helper functions --------------------------------------------------------------

def _get_start_from_slice(iteration_slice):
    res = []
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
        else:
            assert isinstance(slice_component, int)
            res.append(slice_component)
    return res


def _get_end_from_slice(iteration_slice, arr_shape):
    iter_slice = normalize_slice(iteration_slice, arr_shape)
    res = []
    for slice_component in iter_slice:
        if type(slice_component) is slice:
            res.append(slice_component.stop)
        else:
            assert isinstance(slice_component, int)
            res.append(slice_component + 1)
    return res
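
# Example of the two helpers above: an integer slice component selects a single index i, so its
# iteration range is [i, i + 1) (assuming normalize_slice leaves integer components unchanged):
#
#     _get_start_from_slice([slice(2, 10), 5])            # -> [2, 5]
#     _get_end_from_slice([slice(2, 10), 5], (10, 10))    # -> [10, 6]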


def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    if isinstance(gpu_indexing, str):
        if gpu_indexing == 'block':
            indexing_creator = BlockIndexing
        elif gpu_indexing == 'line':
            indexing_creator = LineIndexing
        else:
            raise ValueError(f"Unknown GPU indexing {gpu_indexing}. Valid values are 'block' and 'line'")
        if gpu_indexing_params:
            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
        return indexing_creator
    else:
        return gpu_indexing
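

# Hedged usage sketch: turn a string plus parameter dict into a creator that only still needs the
# field and iteration slice (the field `f` and slice `sl` are assumptions, not defined in this file):
#
#     creator = indexing_creator_from_params('block', {'block_size': (32, 8, 1)})
#     # indexing = creator(field=f, iteration_slice=sl)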