File: tools/polly/lib/External/ppcg/gpu_group.c
Warning: line 336, column 2: Value stored to 'nparam' is never read
1 | /* |
2 | * Copyright 2010-2011 INRIA Saclay |
3 | * Copyright 2012-2014 Ecole Normale Superieure |
4 | * Copyright 2015 Sven Verdoolaege |
5 | * |
6 | * Use of this software is governed by the MIT license |
7 | * |
8 | * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, |
9 | * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, |
10 | * 91893 Orsay, France |
11 | * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France |
12 | */ |
13 | |
14 | #include <isl/constraint.h> |
15 | #include <isl/ilp.h> |
16 | |
17 | #include "gpu_array_tile.h" |
18 | #include "gpu_group.h" |
19 | #include "gpu_tree.h" |
20 | #include "schedule.h" |
21 | |
22 | /* Print the name of the local copy of a given group of array references. |
23 | */ |
24 | __isl_give isl_printer *gpu_array_ref_group_print_name( |
25 | struct gpu_array_ref_group *group, __isl_take isl_printer *p) |
26 | { |
27 | int global = 0; |
28 | enum ppcg_group_access_type type; |
29 | |
30 | type = gpu_array_ref_group_type(group); |
31 | if (type == ppcg_access_private) |
32 | p = isl_printer_print_str(p, "private_"); |
33 | else if (type == ppcg_access_shared) |
34 | p = isl_printer_print_str(p, "shared_"); |
35 | else |
36 | global = 1; |
37 | p = isl_printer_print_str(p, group->array->name); |
38 | if (!global && group->local_array->n_group > 1) { |
39 | p = isl_printer_print_str(p, "_"); |
40 | p = isl_printer_print_int(p, group->nr); |
41 | } |
42 | |
43 | return p; |
44 | } |
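
/* A worked example (hypothetical names, for illustration only):
 * for an array "A" whose local array has more than one reference group,
 * the group with nr == 1 prints as "shared_A_1" when mapped to shared
 * memory and as "private_A_1" when mapped to private memory.
 * A group kept in global memory prints simply as "A", without a suffix,
 * since the global array itself is not duplicated per group.
 */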
45 | |
46 | /* Return the union of all read (read = 1) and/or write (write = 1) |
47 | * access relations in the group. |
48 | */ |
49 | __isl_give isl_union_map *gpu_array_ref_group_access_relation( |
50 | struct gpu_array_ref_group *group, int read, int write) |
51 | { |
52 | int i; |
53 | isl_union_map *access; |
54 | |
55 | access = isl_union_map_empty(isl_map_get_space(group->access)); |
56 | for (i = 0; i < group->n_ref; ++i) { |
57 | isl_map *map_i; |
58 | |
59 | if (!((read && group->refs[i]->read) || |
60 | (write && group->refs[i]->write))) |
61 | continue; |
62 | map_i = isl_map_copy(group->refs[i]->access); |
63 | access = isl_union_map_union(access, |
64 | isl_union_map_from_map(map_i)); |
65 | } |
66 | |
67 | return access; |
68 | } |
69 | |
70 | /* Should this array reference group be mapped to private, shared or global |
71 | * memory? |
72 | * If we have computed both a private and a shared tile, then |
73 | * the tile with the smallest depth is used. If both have the same depth, |
74 | * then the private tile is used. |
75 | */ |
76 | enum ppcg_group_access_type gpu_array_ref_group_type( |
77 | struct gpu_array_ref_group *group) |
78 | { |
79 | if (group->private_tile && group->shared_tile && |
80 | group->shared_tile->depth < group->private_tile->depth) |
81 | return ppcg_access_shared; |
82 | if (group->private_tile) |
83 | return ppcg_access_private; |
84 | if (group->shared_tile) |
85 | return ppcg_access_shared; |
86 | return ppcg_access_global; |
87 | } |
88 | |
89 | |
90 | /* Return the effective gpu_array_tile associated to "group" or |
91 | * NULL if there is no such gpu_array_tile. |
92 | */ |
93 | struct gpu_array_tile *gpu_array_ref_group_tile( |
94 | struct gpu_array_ref_group *group) |
95 | { |
96 | switch (gpu_array_ref_group_type(group)) { |
97 | case ppcg_access_global: |
98 | return NULL; |
99 | case ppcg_access_shared: |
100 | return group->shared_tile; |
101 | case ppcg_access_private: |
102 | return group->private_tile; |
103 | } |
104 | } |
105 | |
106 | /* Does the tile associated to "group" require unrolling of the schedule |
107 | * dimensions mapped to threads? |
108 | * Note that this can only happen for private tiles. |
109 | */ |
110 | int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group) |
111 | { |
112 | struct gpu_array_tile *tile; |
113 | |
114 | tile = gpu_array_ref_group_tile(group); |
115 | if (!tile) |
116 | return 0; |
117 | return tile->requires_unroll; |
118 | } |
119 | |
120 | /* Given a constraint |
121 | * |
122 | * a(p,i) + j = g f(e) |
123 | * |
124 | * or -a(p,i) - j = g f(e) if sign < 0, |
125 | * store a(p,i) in bound->shift and g (stride) in bound->stride. |
126 | * a(p,i) is assumed to be an expression in only the parameters |
127 | * and the input dimensions. |
128 | */ |
129 | static void extract_stride(__isl_keep isl_constraint *c, |
130 | struct gpu_array_bound *bound, __isl_keep isl_val *stride, int sign) |
131 | { |
132 | int i; |
133 | isl_val *v; |
134 | isl_space *space; |
135 | unsigned nparam; |
136 | unsigned nvar; |
137 | isl_aff *aff; |
138 | |
139 | isl_val_free(bound->stride); |
140 | bound->stride = isl_val_copy(stride); |
141 | |
142 | space = isl_constraint_get_space(c); |
143 | space = isl_space_domain(space); |
144 | |
145 | nparam = isl_space_dim(space, isl_dim_param); |
146 | nvar = isl_space_dim(space, isl_dim_set); |
147 | |
148 | v = isl_constraint_get_constant_val(c); |
149 | if (sign < 0) |
150 | v = isl_val_neg(v); |
151 | aff = isl_aff_zero_on_domain(isl_local_space_from_space(space)); |
152 | aff = isl_aff_set_constant_val(aff, v); |
153 | |
154 | for (i = 0; i < nparam; ++i) { |
155 | if (!isl_constraint_involves_dims(c, isl_dim_param, i, 1)) |
156 | continue; |
157 | v = isl_constraint_get_coefficient_val(c, isl_dim_param, i); |
158 | if (sign < 0) |
159 | v = isl_val_neg(v); |
160 | aff = isl_aff_add_coefficient_val(aff, isl_dim_param, i, v); |
161 | } |
162 | |
163 | for (i = 0; i < nvar; ++i) { |
164 | if (!isl_constraint_involves_dims(c, isl_dim_in, i, 1)) |
165 | continue; |
166 | v = isl_constraint_get_coefficient_val(c, isl_dim_in, i); |
167 | if (sign < 0) |
168 | v = isl_val_neg(v); |
169 | aff = isl_aff_add_coefficient_val(aff, isl_dim_in, i, v); |
170 | } |
171 | |
172 | bound->shift = aff; |
173 | } |
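
/* A worked example (hypothetical relation, for illustration):
 * for an access relation whose affine hull contains the equality
 *
 *	-t + j - 4 e = 0
 *
 * i.e., -t + j = 4 e, the constraint matches a(p,i) + j = g f(e)
 * with a(p,i) = -t, g = 4 and f(e) = e, so extract_stride records
 * bound->stride = 4 and bound->shift = -t.
 */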
174 | |
175 | /* Given an equality constraint of a map with a single output dimension j, |
176 | * check if the constraint is of the form |
177 | * |
178 | * a(p,i) + j = g f(e) |
179 | * |
180 | * with a(p,i) an expression in the parameters and input dimensions |
181 | * and f(e) an expression in the existentially quantified variables. |
182 | * If so, and if g is larger than any such g from a previously considered |
183 | * constraint, then call extract_stride to record the stride information |
184 | * in bound. |
185 | */ |
186 | static isl_stat check_stride_constraint(__isl_take isl_constraint *c, |
187 | void *user) |
188 | { |
189 | int i; |
190 | isl_ctx *ctx; |
191 | isl_val *v; |
192 | unsigned n_div; |
193 | struct gpu_array_bound *bound = user; |
194 | |
195 | ctx = isl_constraint_get_ctx(c); |
196 | n_div = isl_constraint_dim(c, isl_dim_div); |
197 | v = isl_constraint_get_coefficient_val(c, isl_dim_out, 0); |
198 | |
199 | if (n_div && (isl_val_is_one(v) || isl_val_is_negone(v))) { |
200 | int s = isl_val_sgn(v); |
201 | isl_val *stride = isl_val_zero(ctx); |
202 | |
203 | isl_val_free(v); |
204 | for (i = 0; i < n_div; ++i) { |
205 | v = isl_constraint_get_coefficient_val(c, |
206 | isl_dim_div, i); |
207 | stride = isl_val_gcd(stride, v); |
208 | } |
209 | if (!isl_val_is_zero(stride) && |
210 | isl_val_gt(stride, bound->stride)) |
211 | extract_stride(c, bound, stride, s); |
212 | |
213 | isl_val_free(stride); |
214 | } else |
215 | isl_val_free(v); |
216 | |
217 | isl_constraint_free(c); |
218 | return isl_stat_ok; |
219 | } |
220 | |
221 | /* Given constraints on an array index i, check if we can find |
222 | * a shift a(p) and a stride g such that |
223 | * |
224 | * a(p) + i = 0 mod g |
225 | * |
226 | * If so, record the information in bound and apply the mapping |
227 | * i -> (i + a(p))/g to the array index in bounds and return |
228 | * the new constraints. |
229 | * If not, simply return the original constraints. |
230 | * |
231 | * If bounds is a subset of the space |
232 | * |
233 | * D -> i |
234 | * |
235 | * then the shift recorded in bound->shift is of the form |
236 | * |
237 | * D -> s(D) |
238 | * |
239 | * with s(D) equal to a(p) above. |
240 | * Next, we construct a mapping of the form |
241 | * |
242 | * [D -> i] -> [D -> (i + s(D))/g] |
243 | * |
244 | * This mapping is computed as follows. |
245 | * We first introduce "i" in the domain through precomposition |
246 | * with [D -> i] -> D obtaining |
247 | * |
248 | * [D -> i] -> s(D) |
249 | * |
250 | * Adding [D -> i] -> i produces |
251 | * |
252 | * [D -> i] -> i + s(D) |
253 | * |
254 | * and the domain product with [D -> i] -> D yields |
255 | * |
256 | * [D -> i] -> [D -> i + s(D)] |
257 | * |
258 | * Composition with [D -> i] -> [D -> i/g] gives the desired result. |
259 | */ |
260 | static __isl_give isl_basic_map *check_stride(struct gpu_array_bound *bound, |
261 | __isl_take isl_basic_map *bounds) |
262 | { |
263 | isl_space *space; |
264 | isl_basic_map *hull; |
265 | isl_basic_map *shift, *id, *bmap, *scale; |
266 | isl_basic_set *bset; |
267 | isl_aff *aff; |
268 | |
269 | bound->stride = NULL; |
270 | |
271 | hull = isl_basic_map_affine_hull(isl_basic_map_copy(bounds)); |
272 | |
273 | isl_basic_map_foreach_constraint(hull, &check_stride_constraint, bound); |
274 | |
275 | isl_basic_map_free(hull); |
276 | |
277 | if (!bound->stride) |
278 | return bounds; |
279 | |
280 | shift = isl_basic_map_from_aff(isl_aff_copy(bound->shift)); |
281 | space = isl_basic_map_get_space(bounds); |
282 | bmap = isl_basic_map_domain_map(isl_basic_map_universe(space)); |
283 | shift = isl_basic_map_apply_range(bmap, shift); |
284 | space = isl_basic_map_get_space(bounds); |
285 | id = isl_basic_map_range_map(isl_basic_map_universe(space)); |
286 | shift = isl_basic_map_sum(id, shift); |
287 | space = isl_basic_map_get_space(bounds); |
288 | id = isl_basic_map_domain_map(isl_basic_map_universe(space)); |
289 | shift = isl_basic_map_range_product(id, shift); |
290 | |
291 | space = isl_space_domain(isl_basic_map_get_space(bounds)); |
292 | id = isl_basic_map_identity(isl_space_map_from_set(space)); |
293 | space = isl_space_range(isl_basic_map_get_space(bounds)); |
294 | aff = isl_aff_zero_on_domain(isl_local_space_from_space(space)); |
295 | aff = isl_aff_add_coefficient_si(aff, isl_dim_in, 0, 1); |
296 | aff = isl_aff_scale_down_val(aff, isl_val_copy(bound->stride)); |
297 | scale = isl_basic_map_from_aff(aff); |
298 | scale = isl_basic_map_product(id, scale); |
299 | |
300 | bmap = isl_basic_map_apply_range(shift, scale); |
301 | bset = isl_basic_set_apply(isl_basic_map_wrap(bounds), bmap); |
302 | bounds = isl_basic_set_unwrap(bset); |
303 | |
304 | return bounds; |
305 | } |
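
/* Continuing the hypothetical example above: with shift s(D[t]) = -t
 * and stride g = 4, the mapping constructed by check_stride is
 *
 *	[D[t] -> [i]] -> [D[t] -> [(i - t)/4]]
 *
 * so the returned constraints describe the array index after
 * the shift and the division by the stride have been applied.
 */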
306 | |
307 | /* Data used in compute_array_dim_size and compute_size_in_direction. |
308 | * |
309 | * pos is the position of the variable representing the array index, |
310 | * i.e., the variable for which we want to compute the size. This variable |
311 | * is also the last variable in the set. |
312 | */ |
313 | struct gpu_size_info { |
314 | isl_basic_set *bset; |
315 | struct gpu_array_bound *bound; |
316 | int pos; |
317 | }; |
318 | |
319 | /* Given a constraint from the basic set describing the bounds on |
320 | * an array index, check if it is a lower bound, say m i >= b(x), and, |
321 | * if so, check whether the expression "i - ceil(b(x)/m) + 1" has a constant |
322 | * upper bound. If so, and if this bound is smaller than any bound |
323 | * derived from earlier constraints, set the size to this bound on |
324 | * the expression and the lower bound to ceil(b(x)/m). |
325 | */ |
326 | static isl_stat compute_size_in_direction(__isl_take isl_constraint *c, |
327 | void *user) |
328 | { |
329 | struct gpu_size_info *size = user; |
330 | unsigned nparam; |
331 | unsigned n_div; |
332 | isl_val *v; |
333 | isl_aff *aff; |
334 | isl_aff *lb; |
335 | |
336 | nparam = isl_basic_set_dim(size->bset, isl_dim_param); |
Value stored to 'nparam' is never read | |
337 | n_div = isl_constraint_dim(c, isl_dim_div); |
338 | |
339 | if (isl_constraint_involves_dims(c, isl_dim_div, 0, n_div) || |
340 | !isl_constraint_is_lower_bound(c, isl_dim_set, size->pos)) { |
341 | isl_constraint_free(c); |
342 | return isl_stat_ok; |
343 | } |
344 | |
345 | aff = isl_constraint_get_bound(c, isl_dim_set, size->pos); |
346 | aff = isl_aff_ceil(aff); |
347 | |
348 | lb = isl_aff_copy(aff); |
349 | |
350 | aff = isl_aff_neg(aff); |
351 | aff = isl_aff_add_coefficient_si(aff, isl_dim_in, size->pos, 1); |
352 | |
353 | v = isl_basic_set_max_val(size->bset, aff); |
354 | isl_aff_free(aff); |
355 | |
356 | if (isl_val_is_int(v)) { |
357 | v = isl_val_add_ui(v, 1); |
358 | if (!size->bound->size || isl_val_lt(v, size->bound->size)) { |
359 | isl_val_free(size->bound->size); |
360 | size->bound->size = isl_val_copy(v); |
361 | lb = isl_aff_drop_dims(lb, isl_dim_in, size->pos, 1); |
362 | isl_aff_free(size->bound->lb); |
363 | size->bound->lb = isl_aff_copy(lb); |
364 | } |
365 | } |
366 | isl_val_free(v); |
367 | isl_aff_free(lb); |
368 | |
369 | isl_constraint_free(c); |
370 | |
371 | return isl_stat_ok; |
372 | } |
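
/* A small worked example (hypothetical constraints, for illustration):
 * if the basic set implies t <= i <= t + 7 for the index variable i,
 * then the constraint i >= t is a lower bound with m = 1 and b(x) = t,
 * the expression i - t + 1 has constant upper bound 8, and the size
 * and lower bound are recorded as 8 and t respectively.
 */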
373 | |
374 | /* Given a basic map "bounds" that maps parameters and input dimensions |
375 | * to a single output dimension, look for an expression in the parameters |
376 | * and input dimensions such that the range of the output dimension shifted |
377 | * by this expression is a constant. |
378 | * |
379 | * In particular, we currently only consider lower bounds on the output |
380 | * dimension as candidate expressions. |
381 | */ |
382 | static int compute_array_dim_size(struct gpu_array_bound *bound, |
383 | __isl_take isl_basic_map *bounds) |
384 | { |
385 | struct gpu_size_info size; |
386 | |
387 | bounds = isl_basic_map_detect_equalities(bounds); |
388 | bounds = check_stride(bound, bounds); |
389 | |
390 | bound->size = NULL; |
391 | bound->lb = NULL; |
392 | |
393 | size.bound = bound; |
394 | size.pos = isl_basic_map_dim(bounds, isl_dim_in); |
395 | size.bset = isl_basic_map_wrap(bounds); |
396 | size.bset = isl_basic_set_flatten(size.bset); |
397 | size.bset = isl_set_simple_hull(isl_basic_set_compute_divs(size.bset)); |
398 | isl_basic_set_foreach_constraint(size.bset, &compute_size_in_direction, |
399 | &size); |
400 | isl_basic_set_free(size.bset); |
401 | |
402 | return bound->size ? 0 : -1; |
403 | } |
404 | |
405 | /* Check if we can find a memory tile for the given array |
406 | * based on the given accesses, and if so, put the results in "tile". |
407 | * |
408 | * We project the accesses on each index in turn and look for a parametric |
409 | * offset such that the size is constant. |
410 | * |
411 | * tile->depth is initialized to the input dimension of the computed bounds. |
412 | */ |
413 | static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile) |
414 | { |
415 | int i; |
416 | |
417 | tile->depth = isl_map_dim(access, isl_dim_in); |
418 | |
419 | for (i = 0; i < tile->n; ++i) { |
420 | isl_map *access_i; |
421 | isl_basic_map *hull; |
422 | |
423 | access_i = isl_map_copy(access); |
424 | access_i = isl_map_project_out(access_i, isl_dim_out, 0, i); |
425 | access_i = isl_map_project_out(access_i, isl_dim_out, |
426 | 1, tile->n - (i + 1)); |
427 | access_i = isl_map_compute_divs(access_i); |
428 | hull = isl_map_simple_hull(access_i); |
429 | if (compute_array_dim_size(&tile->bound[i], hull) < 0) |
430 | return 0; |
431 | } |
432 | |
433 | return 1; |
434 | } |
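
/* For instance (hypothetical access relation, for illustration), given
 *
 *	{ D[t] -> A[a, b] : t <= a < t + 4 and 0 <= b < 32 }
 *
 * projecting onto each index in turn yields bound[0] with lb = t and
 * size 4 and bound[1] with lb = 0 and size 32, so the access fits
 * in a constant-size 4 x 32 tile at the parametric offset (t, 0).
 */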
435 | |
436 | /* Internal data structure for gpu_group_references. |
437 | * |
438 | * scop represents the input scop. |
439 | * kernel_depth is the schedule depth where the kernel launch will |
440 | * be introduced, i.e., it is the depth of the band that is mapped |
441 | * to blocks. |
442 | * shared_depth is the schedule depth at which the copying to/from |
443 | * shared memory is computed. The copy operation may then |
444 | * later be hoisted to a higher level. |
445 | * thread_depth is the schedule depth where the thread mark is located, |
446 | * i.e., it is the depth of the band that is mapped to threads and also |
447 | * the schedule depth at which the copying to/from private memory |
448 | * is computed. The copy operation may then later be hoisted to |
449 | * a higher level. |
450 | * n_thread is the number of schedule dimensions in the band that |
451 | * is mapped to threads. |
452 | * privatization lives in the range of thread_sched (i.e., it is |
453 | * of dimension thread_depth + n_thread) and encodes the mapping |
454 | * to thread identifiers (as parameters). |
455 | * host_sched contains the kernel_depth dimensions of the host schedule. |
456 | * shared_sched contains the first shared_depth dimensions of the |
457 | * kernel schedule. |
458 | * copy_sched contains the first thread_depth dimensions of the |
459 | * kernel schedule. |
460 | * thread_sched contains the first (thread_depth + n_thread) dimensions |
461 | * of the kernel schedule. |
462 | * full_sched is a union_map representation of the entire kernel schedule. |
463 | * The schedules are all formulated in terms of the original statement |
464 | * instances, i.e., those that appear in the domains of the access |
465 | * relations. |
466 | */ |
467 | struct gpu_group_data { |
468 | struct ppcg_scop *scop; |
469 | int kernel_depth; |
470 | int shared_depth; |
471 | int thread_depth; |
472 | int n_thread; |
473 | isl_set *privatization; |
474 | isl_union_map *host_sched; |
475 | isl_union_map *shared_sched; |
476 | isl_union_map *copy_sched; |
477 | isl_union_map *thread_sched; |
478 | isl_union_map *full_sched; |
479 | }; |
480 | |
481 | /* Construct a map from domain_space to domain_space that increments |
482 | * the dimension at position "pos" and leaves all other dimensions |
483 | * constant. |
484 | */ |
485 | static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos) |
486 | { |
487 | isl_space *space; |
488 | isl_aff *aff; |
489 | isl_multi_aff *next; |
490 | |
491 | space = isl_space_map_from_set(domain_space); |
492 | next = isl_multi_aff_identity(space); |
493 | aff = isl_multi_aff_get_aff(next, pos); |
494 | aff = isl_aff_add_constant_si(aff, 1); |
495 | next = isl_multi_aff_set_aff(next, pos, aff); |
496 | |
497 | return isl_map_from_multi_aff(next); |
498 | } |
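
/* For example, on a two-dimensional domain space with pos = 1,
 * next() constructs the map { [i0, i1] -> [i0, i1 + 1] }.
 */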
499 | |
500 | /* Check if the given access is coalesced (or if there is no point |
501 | * in trying to coalesce the access by mapping the array to shared memory). |
502 | * That is, check whether incrementing the dimension that will get |
503 | * wrapped over the last thread index results in incrementing |
504 | * the last array index. |
505 | * |
506 | * If no two consecutive array elements are ever accessed by "access", |
507 | * then mapping the corresponding array to shared memory will not |
508 | * improve coalescing. In fact, the copying will likely be performed |
509 | * by a single thread. Consider the access as coalesced such that |
510 | * the caller will not try to map the array to shared memory just |
511 | * to improve coalescing. |
512 | * |
513 | * This function is only called for access relations without reuse and |
514 | * kernels with at least one thread identifier. |
515 | */ |
516 | static int access_is_coalesced(struct gpu_group_data *data, |
517 | __isl_keep isl_union_map *access) |
518 | { |
519 | int dim; |
520 | isl_space *space; |
521 | isl_set *accessed; |
522 | isl_map *access_map; |
523 | isl_map *next_thread_x; |
524 | isl_map *next_element; |
525 | isl_map *map; |
526 | int coalesced, empty; |
527 | |
528 | access = isl_union_map_copy(access); |
529 | access = isl_union_map_apply_domain(access, |
530 | isl_union_map_copy(data->full_sched)); |
531 | access_map = isl_map_from_union_map(access); |
532 | |
533 | space = isl_map_get_space(access_map); |
534 | space = isl_space_range(space); |
535 | dim = isl_space_dim(space, isl_dim_set); |
536 | if (dim == 0) |
537 | next_element = isl_map_empty(isl_space_map_from_set(space)); |
538 | else |
539 | next_element = next(space, dim - 1); |
540 | |
541 | accessed = isl_map_range(isl_map_copy(access_map)); |
542 | map = isl_map_copy(next_element); |
543 | map = isl_map_intersect_domain(map, isl_set_copy(accessed)); |
544 | map = isl_map_intersect_range(map, accessed); |
545 | empty = isl_map_is_empty(map); |
546 | isl_map_free(map); |
547 | |
548 | if (empty < 0 || empty) { |
549 | isl_map_free(next_element); |
550 | isl_map_free(access_map); |
551 | return empty; |
552 | } |
553 | |
554 | space = isl_map_get_space(access_map); |
555 | space = isl_space_domain(space); |
556 | next_thread_x = next(space, data->thread_depth + data->n_thread - 1); |
557 | |
558 | map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map)); |
559 | map = isl_map_apply_range(map, access_map); |
560 | |
561 | coalesced = isl_map_is_subset(map, next_element); |
562 | |
563 | isl_map_free(next_element); |
564 | isl_map_free(map); |
565 | |
566 | return coalesced; |
567 | } |
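
/* As an illustration (hypothetical scheduled accesses): if the last
 * thread dimension x appears with coefficient 1 in the last array index,
 * as in { [..., x] -> A[..., c + x] }, then incrementing x increments
 * the last array index and the access is considered coalesced.
 * For { [..., x] -> A[..., c + 32 x] }, consecutive threads access
 * elements 32 apart, so the access is not coalesced.
 */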
568 | |
569 | /* Replace the host schedule dimensions in the access relation "access" |
570 | * by parameters, so that they are treated as fixed when checking for reuse |
571 | * (within a kernel) or whether two consecutive elements are accessed |
572 | * (within a kernel). |
573 | */ |
574 | static __isl_give isl_union_map *localize_access(struct gpu_group_data *data, |
575 | __isl_take isl_union_map *access) |
576 | { |
577 | int n; |
578 | isl_space *space; |
579 | isl_set *param; |
580 | isl_union_map *umap; |
581 | isl_id_list *ids; |
582 | |
583 | umap = isl_union_map_copy(data->host_sched); |
584 | space = isl_union_map_get_space(umap); |
585 | n = data->kernel_depth; |
586 | ids = ppcg_scop_generate_names(data->scop, n, "__ppcg_host_"); |
587 | param = parametrization(space, n, 0, ids); |
588 | isl_id_list_free(ids); |
589 | umap = isl_union_map_intersect_range(umap, |
590 | isl_union_set_from_set(param)); |
591 | access = isl_union_map_intersect_domain(access, |
592 | isl_union_map_domain(umap)); |
593 | |
594 | return access; |
595 | } |
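
/* For instance (illustration): with data->kernel_depth equal to 1,
 * the single host schedule dimension is equated to a fresh parameter
 * "__ppcg_host_0", so that accesses performed in different iterations
 * of the host schedule are not mistaken for reuse within one kernel
 * launch.
 */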
596 | |
597 | /* Given an access relation in terms of at least data->thread_depth initial |
598 | * dimensions of the computed schedule, check if it is bijective for |
599 | * fixed values of the first data->thread_depth dimensions. |
600 | * We perform this check by equating these dimensions to parameters. |
601 | */ |
602 | static int access_is_bijective(struct gpu_group_data *data, |
603 | __isl_keep isl_map *access) |
604 | { |
605 | int res; |
606 | int dim; |
607 | isl_set *par; |
608 | isl_space *space; |
609 | isl_id_list *ids; |
610 | |
611 | access = isl_map_copy(access); |
612 | space = isl_space_params(isl_map_get_space(access)); |
613 | ids = ppcg_scop_generate_names(data->scop, data->thread_depth, "s"); |
614 | dim = isl_map_dim(access, isl_dim_in); |
615 | par = parametrization(space, dim, 0, ids); |
616 | isl_id_list_free(ids); |
617 | access = isl_map_intersect_domain(access, par); |
618 | res = isl_map_is_bijective(access); |
619 | isl_map_free(access); |
620 | |
621 | return res; |
622 | } |
623 | |
624 | /* Compute the number of outer schedule tile dimensions that affect |
625 | * the offset of "tile". |
626 | * If there is no such dimension, then return the index |
627 | * of the first kernel dimension, i.e., data->kernel_depth. |
628 | */ |
629 | static int compute_tile_depth(struct gpu_group_data *data, |
630 | struct gpu_array_tile *tile) |
631 | { |
632 | int i, j; |
633 | |
634 | for (j = tile->depth - 1; j >= data->kernel_depth; --j) { |
635 | for (i = 0; i < tile->n; ++i) { |
636 | isl_aff *lb; |
637 | isl_aff *shift; |
638 | |
639 | lb = tile->bound[i].lb; |
640 | if (isl_aff_involves_dims(lb, isl_dim_in, j, 1)) |
641 | break; |
642 | |
643 | shift = tile->bound[i].shift; |
644 | if (!shift) |
645 | continue; |
646 | if (isl_aff_involves_dims(shift, isl_dim_in, j, 1)) |
647 | break; |
648 | } |
649 | if (i < tile->n) |
650 | break; |
651 | } |
652 | |
653 | return ++j; |
654 | } |
655 | |
656 | /* Return the lowest depth between data->kernel_depth and data->thread_depth |
657 | * at which every array element accessed through "acc" is accessed |
658 | * by a single thread. The input dimension of "acc" is |
659 | * data->thread_depth + data->n_thread, where the final data->n_thread |
660 | * dimensions are those that will be mapped to threads. |
661 | * If the values for these dimensions are uniquely determined |
662 | * by the array index and a given number of outer dimensions, then |
663 | * there is only one thread accessing that array element within those |
664 | * outer dimensions. |
665 | * |
666 | * The input space of "acc" is first split up, such that it has the form |
667 | * |
668 | * [O -> T] -> A |
669 | * |
670 | * with O the outer dimensions, T the dimensions that will be mapped to threads |
671 | * and A the array index. |
672 | * |
673 | * Then the positions of T and A are interchanged to simplify the test |
674 | * whether T uniquely depends on O and A. |
675 | * In particular, the above access relation is first combined with |
676 | * |
677 | * [O -> T] -> T |
678 | * |
679 | * to form |
680 | * |
681 | * [O -> T] -> [A -> T] |
682 | * |
683 | * from which |
684 | * |
685 | * O -> [A -> T] |
686 | * |
687 | * is extracted, which is then uncurried to |
688 | * |
689 | * [O -> A] -> T |
690 | * |
691 | * Finally, the final dimensions of O are projected out one by one |
692 | * until T is no longer uniquely determined by A and the remaining |
693 | * dimensions in O. The value returned is that of the last dimension |
694 | * that was successfully projected out. |
695 | * Note that there is no need to test whether [O -> A] -> T itself |
696 | * is single-valued as that was already tested in access_is_bijective. |
697 | */ |
698 | static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data, |
699 | __isl_keep isl_map *acc) |
700 | { |
701 | int i; |
702 | isl_space *space; |
703 | isl_map *map; |
704 | isl_bool sv; |
705 | |
706 | if (data->thread_depth == data->kernel_depth) |
707 | return data->thread_depth; |
708 | |
709 | acc = isl_map_copy(acc); |
710 | |
711 | space = isl_map_get_space(acc); |
712 | space = isl_space_params(space); |
713 | space = isl_space_set_from_params(space); |
714 | space = isl_space_add_dims(space, isl_dim_set, data->thread_depth); |
715 | space = isl_space_from_domain(space); |
716 | space = isl_space_add_dims(space, isl_dim_out, data->n_thread); |
717 | space = isl_space_wrap(space); |
718 | map = isl_set_flatten_map(isl_set_universe(space)); |
719 | acc = isl_map_apply_range(map, acc); |
720 | |
721 | space = isl_space_domain(isl_map_get_space(acc)); |
722 | map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space))); |
723 | acc = isl_map_range_product(acc, map); |
724 | acc = isl_map_domain_factor_domain(acc); |
725 | acc = isl_map_uncurry(acc); |
726 | |
727 | for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) { |
728 | acc = isl_map_project_out(acc, isl_dim_in, i, 1); |
729 | sv = isl_map_is_single_valued(acc); |
730 | if (sv < 0) |
731 | return -1; |
732 | if (!sv) |
733 | break; |
734 | } |
735 | |
736 | isl_map_free(acc); |
737 | |
738 | return ++i; |
739 | } |
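
/* For example (hypothetical access, for illustration): if the thread
 * dimensions T are uniquely determined by the array index A alone,
 * then every outer dimension can be projected out and the function
 * returns data->kernel_depth. If uniqueness is lost as soon as outer
 * dimension o is projected out, the function returns o + 1, the
 * smallest depth at which each element is accessed by a single thread.
 */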
740 | |
741 | /* Adjust the fields of "tile" to reflect the new input dimension "depth". |
742 | * The dimensions beyond "depth" are assumed not to affect the tile, |
743 | * so they can simply be dropped. |
744 | */ |
745 | static int tile_adjust_depth(struct gpu_array_tile *tile, int depth) |
746 | { |
747 | int i; |
748 | |
749 | if (tile->depth == depth) |
750 | return 0; |
751 | |
752 | for (i = 0; i < tile->n; ++i) { |
753 | tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb, |
754 | isl_dim_in, depth, tile->depth - depth); |
755 | if (!tile->bound[i].lb) |
756 | return -1; |
757 | if (!tile->bound[i].shift) |
758 | continue; |
759 | tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift, |
760 | isl_dim_in, depth, tile->depth - depth); |
761 | if (!tile->bound[i].shift) |
762 | return -1; |
763 | } |
764 | |
765 | tile->depth = depth; |
766 | |
767 | return 0; |
768 | } |
769 | |
770 | /* Determine the number of schedule dimensions that affect the offset of the |
771 | * shared or private tile "tile" and store the result in tile->depth, with |
772 | * a lower bound of data->kernel_depth. |
773 | * Also adjust the fields of the tile to only refer to the tile->depth |
774 | * outer schedule dimensions. |
775 | */ |
776 | static isl_stat tile_set_depth(struct gpu_group_data *data, |
777 | struct gpu_array_tile *tile) |
778 | { |
779 | if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0) |
780 | return isl_stat_error; |
781 | |
782 | return isl_stat_ok; |
783 | } |
784 | |
785 | /* Determine the number of schedule dimensions that affect the offset of the |
786 | * shared tile and store the minimum of the private and shared tile depth |
787 | * in group->min_depth, with a lower bound of data->kernel_depth. |
788 | * If there is no tile defined on the array reference group, |
789 | * then set group->min_depth to data->thread_depth. |
790 | */ |
791 | static int set_depth(struct gpu_group_data *data, |
792 | struct gpu_array_ref_group *group) |
793 | { |
794 | group->min_depth = data->thread_depth; |
795 | |
796 | if (group->private_tile) { |
797 | if (group->private_tile->depth < group->min_depth) |
798 | group->min_depth = group->private_tile->depth; |
799 | } |
800 | if (group->shared_tile) { |
801 | if (tile_set_depth(data, group->shared_tile) < 0) |
802 | return -1; |
803 | if (group->shared_tile->depth < group->min_depth) |
804 | group->min_depth = group->shared_tile->depth; |
805 | } |
806 | |
807 | return 0; |
808 | } |
809 | |
810 | /* Fill up the groups array with singleton groups, i.e., one group |
811 | * per reference, initializing the array, access, write, n_ref and refs fields. |
812 | * In particular the access field is initialized to the scheduled |
813 | * access relation of the array reference. |
814 | * |
815 | * Return the number of elements initialized, i.e., the number of |
816 | * active references in the current kernel. |
817 | */ |
818 | static int populate_array_references(struct gpu_local_array_info *local, |
819 | struct gpu_array_ref_group **groups, struct gpu_group_data *data) |
820 | { |
821 | int i; |
822 | int n; |
823 | isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched); |
824 | |
825 | n = 0; |
826 | for (i = 0; i < local->array->n_ref; ++i) { |
827 | isl_union_map *umap; |
828 | isl_map *map; |
829 | struct gpu_array_ref_group *group; |
830 | struct gpu_stmt_access *access = local->array->refs[i]; |
831 | |
832 | map = isl_map_copy(access->access); |
833 | umap = isl_union_map_from_map(map); |
834 | umap = isl_union_map_apply_domain(umap, |
835 | isl_union_map_copy(data->copy_sched)); |
836 | |
837 | if (isl_union_map_is_empty(umap)) { |
838 | isl_union_map_free(umap); |
839 | continue; |
840 | } |
841 | |
842 | map = isl_map_from_union_map(umap); |
843 | map = isl_map_detect_equalities(map); |
844 | |
845 | group = isl_calloc_type(ctx, struct gpu_array_ref_group); |
846 | if (!group) |
847 | return -1; |
848 | group->local_array = local; |
849 | group->array = local->array; |
850 | group->access = map; |
851 | group->write = access->write; |
852 | group->exact_write = access->exact_write; |
853 | group->slice = access->n_index < local->array->n_index; |
854 | group->refs = &local->array->refs[i]; |
855 | group->n_ref = 1; |
856 | |
857 | groups[n++] = group; |
858 | } |
859 | |
860 | return n; |
861 | } |
862 | |
863 | /* If group->n_ref == 1, then group->refs was set by |
864 | * populate_array_references to point directly into |
865 | * group->array->refs and should not be freed. |
866 | * If group->n_ref > 1, then group->refs was set by join_groups |
867 | * to point to a newly allocated array. |
868 | */ |
869 | struct gpu_array_ref_group *gpu_array_ref_group_free( |
870 | struct gpu_array_ref_group *group) |
871 | { |
872 | if (!group) |
873 | return NULL; |
874 | gpu_array_tile_free(group->shared_tile); |
875 | gpu_array_tile_free(group->private_tile); |
876 | isl_map_free(group->access); |
877 | if (group->n_ref > 1) |
878 | free(group->refs); |
879 | free(group); |
880 | return NULL; |
881 | } |
882 | |
883 | /* Check if the access relations of group1 and group2 overlap within |
884 | * copy_sched. |
885 | */ |
886 | static int accesses_overlap(struct gpu_array_ref_group *group1, |
887 | struct gpu_array_ref_group *group2) |
888 | { |
889 | int disjoint; |
890 | |
891 | disjoint = isl_map_is_disjoint(group1->access, group2->access); |
892 | if (disjoint < 0) |
893 | return -1; |
894 | |
895 | return !disjoint; |
896 | } |
897 | |
898 | /* Combine the given two groups into a single group, containing |
899 | * the references of both groups. |
900 | */ |
901 | static struct gpu_array_ref_group *join_groups( |
902 | struct gpu_array_ref_group *group1, |
903 | struct gpu_array_ref_group *group2) |
904 | { |
905 | int i; |
906 | isl_ctx *ctx; |
907 | struct gpu_array_ref_group *group; |
908 | |
909 | if (!group1 || !group2) |
910 | return NULL; |
911 | |
912 | ctx = isl_map_get_ctx(group1->access); |
913 | group = isl_calloc_type(ctx, struct gpu_array_ref_group); |
914 | if (!group) |
915 | return NULL; |
916 | group->local_array = group1->local_array; |
917 | group->array = group1->array; |
918 | group->access = isl_map_union(isl_map_copy(group1->access), |
919 | isl_map_copy(group2->access)); |
920 | group->write = group1->write || group2->write; |
921 | group->exact_write = group1->exact_write && group2->exact_write; |
922 | group->slice = group1->slice || group2->slice; |
923 | group->n_ref = group1->n_ref + group2->n_ref; |
924 | group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *, |
925 | group->n_ref); |
926 | if (!group->refs) |
927 | return gpu_array_ref_group_free(group); |
928 | for (i = 0; i < group1->n_ref; ++i) |
929 | group->refs[i] = group1->refs[i]; |
930 | for (i = 0; i < group2->n_ref; ++i) |
931 | group->refs[group1->n_ref + i] = group2->refs[i]; |
932 | |
933 | return group; |
934 | } |
935 | |
936 | /* Combine the given two groups into a single group and free |
937 | * the original two groups. |
938 | */ |
939 | static struct gpu_array_ref_group *join_groups_and_free( |
940 | struct gpu_array_ref_group *group1, |
941 | struct gpu_array_ref_group *group2) |
942 | { |
943 | struct gpu_array_ref_group *group; |
944 | |
945 | group = join_groups(group1, group2); |
946 | gpu_array_ref_group_free(group1); |
947 | gpu_array_ref_group_free(group2); |
948 | return group; |
949 | } |
950 | |
951 | /* Report that the array reference group with the given access relation |
952 | * is not mapped to shared memory in the given kernel because |
953 | * it does not exhibit any reuse and is considered to be coalesced. |
954 | */ |
955 | static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel, |
956 | __isl_keep isl_union_map *access) |
957 | { |
958 | isl_ctx *ctx; |
959 | isl_printer *p; |
960 | |
961 | ctx = isl_union_map_get_ctx(access); |
962 | p = isl_printer_to_file(ctx, stdout); |
963 | p = isl_printer_print_str(p, "Array reference group "); |
964 | p = isl_printer_print_union_map(p, access); |
965 | p = isl_printer_print_str(p, |
966 | " not considered for mapping to shared memory in kernel"); |
967 | p = isl_printer_print_int(p, kernel->id); |
968 | p = isl_printer_print_str(p, |
969 | " because it exhibits no reuse and is considered to be coalesced"); |
970 | p = isl_printer_end_line(p); |
971 | isl_printer_free(p); |
972 | } |
973 | |
974 | /* Given an access relation in terms of the data->thread_depth initial |
975 | * dimensions of the computed schedule and the thread identifiers |
976 | * (as parameters), check if the use of the corresponding private tile |
977 | * requires unrolling. |
978 | * |
979 | * If we are creating a private tile because we are forced to, |
980 | * then no unrolling is required. |
981 | * Otherwise we check if "access" is bijective and unrolling |
982 | * is required if it is not. Note that the access relation |
983 | * has already been determined to be bijective before the introduction |
984 | * of the thread identifiers and the removal of the schedule dimensions |
985 | * that are mapped to these threads. If the access relation is no longer |
986 | * bijective, then this means that more than one value of one of those |
987 | * schedule dimensions is mapped to the same thread and therefore |
988 | * unrolling is required. |
989 | */ |
990 | static int check_requires_unroll(struct gpu_group_data *data, |
991 | __isl_keep isl_map *access, int force_private) |
992 | { |
993 | int bijective; |
994 | |
995 | if (force_private) |
996 | return 0; |
997 | bijective = access_is_bijective(data, access); |
998 | if (bijective < 0) |
999 | return -1; |
1000 | return !bijective; |
1001 | } |
1002 | |
1003 | /* Map the domain of "access" to the outer data->shared_depth |
1004 | * schedule dimensions. When data->shared_depth is equal to |
1005 | * data->thread_depth, this result is already available in group->access. |
1006 | */ |
1007 | static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group, |
1008 | __isl_keep isl_union_map *access, struct gpu_group_data *data) |
1009 | { |
1010 | isl_union_map *shared; |
1011 | |
1012 | if (data->shared_depth == data->thread_depth) |
1013 | return isl_map_copy(group->access); |
1014 | |
1015 | shared = isl_union_map_copy(access); |
1016 | shared = isl_union_map_apply_domain(shared, |
1017 | isl_union_map_copy(data->shared_sched)); |
1018 | return isl_map_from_union_map(shared); |
1019 | } |
1020 | |
1021 | /* Compute the private and/or shared memory tiles for the array |
1022 | * reference group "group" of array "array". |
1023 | * Return 0 on success and -1 on error. |
1024 | * |
1025 | * If the array is a read-only scalar or if the user requested |
1026 | * not to use shared or private memory, then we do not need to do anything. |
1027 | * |
1028 | * If any reference in the reference group accesses more than one element, |
1029 | * then we would have to make sure that the layout in shared memory |
1030 | * is the same as that in global memory. Since we do not handle this yet |
1031 | * (and it may not even be possible), we refuse to map to private or |
1032 | * shared memory in such cases. |
1033 | * |
1034 | * If the array group involves any may writes (that are not must writes), |
1035 | * then we would have to make sure that we load the data into shared/private |
1036 | * memory first in case the data is not written by the kernel |
1037 | * (but still written back out to global memory). |
1038 | * Since we don't have any such mechanism at the moment, we don't |
1039 | * compute shared/private tiles for groups involving may writes. |
1040 | * |
1041 | * We only try to compute a shared memory tile if there is any reuse |
1042 | * or if the access is not coalesced. |
1043 | * Reuse and coalescing are checked within the given kernel. |
1044 | * |
1045 | * For computing a private memory tile, we also require that there is |
1046 | * some reuse. Moreover, we require that the access is private |
1047 | * to the thread. That is, we check that any given array element |
1048 | * is only accessed by a single thread. |
1049 | * We compute an access relation in terms of the outer |
1050 | * data->thread_depth + data->n_thread schedule dimensions. |
1051 | * The latter data->n_thread will be mapped to thread identifiers. |
1052 | * We actually check that those iterators that will be wrapped |
1053 | * partition the array space. This check is stricter than necessary |
1054 | * since several iterations may be mapped onto the same thread |
1055 | * and then they could be allowed to access the same memory elements, |
1056 | * but our check does not allow this situation. |
1057 | * |
1058 | * For private memory tiles, the number of schedule dimensions that |
1059 | * affect the offset is computed and stored in tile->depth, with |
1060 | * a lower bound of data->kernel_depth. If this depth is smaller |
1061 | * than the minimal depth that still ensures that every element |
1062 | * is accessed by a single thread, then the depth is raised |
1063 | * to this minimal depth. |
1064 | * The fields of the tile are then adjusted to only refer to the tile->depth |
1065 | * outer schedule dimensions. |
1066 | * |
1067 | * We also check that the index expression only depends on parallel |
1068 | * loops. That way, we can move those loops innermost and unroll them. |
1069 | * Again, we use a test that is stricter than necessary. |
1070 | * We actually check whether the index expression only depends |
1071 | * on the iterators that are wrapped over the threads. |
1072 | * These are necessarily parallel, but there may be more parallel loops. |
1073 | * |
1074 | * Combining the injectivity of the first test with the single-valuedness |
1075 | * of the second test, we simply test for bijectivity. |
1076 | * |
1077 | * If the use of the private tile requires unrolling, but some |
1078 | * of the other arrays are forcibly mapped to private memory, |
1079 | * then we do not allow the use of this private tile since |
1080 | * we cannot move the schedule dimensions that need to be unrolled down |
1081 | * without performing some kind of expansion on those arrays |
1082 | * that are forcibly mapped to private memory. |
1083 | * |
1084 | * If the array is marked force_private, then we bypass all checks |
1085 | * and assume we can (and should) use registers only. |
1086 | * |
1087 | * If it turns out we can (or have to) use registers, we compute |
1088 | * the private memory tile size using can_tile, after introducing a dependence |
1089 | * on the thread indices. |
1090 | */ |
1091 | static int compute_group_bounds_core(struct ppcg_kernel *kernel, |
1092 | struct gpu_array_ref_group *group, struct gpu_group_data *data) |
1093 | { |
1094 | isl_ctx *ctx = isl_space_get_ctx(group->array->space); |
1095 | isl_union_map *access, *local; |
1096 | int n_index = group->array->n_index; |
1097 | int no_reuse, coalesced; |
1098 | isl_map *acc; |
1099 | int force_private = group->local_array->force_private; |
1100 | int use_shared = !force_private && kernel->options->use_shared_memory && |
1101 | data->n_thread > 0; |
1102 | int use_private = force_private || kernel->options->use_private_memory; |
1103 | int r = 0; |
1104 | int requires_unroll; |
1105 | int unique_depth; |
1106 | |
1107 | if (!use_shared && !use_private) |
1108 | return 0; |
1109 | if (gpu_array_is_read_only_scalar(group->array)) |
1110 | return 0; |
1111 | if (!force_private && !group->exact_write) |
1112 | return 0; |
1113 | if (group->slice) |
1114 | return 0; |
1115 | |
1116 | access = gpu_array_ref_group_access_relation(group, 1, 1); |
1117 | local = localize_access(data, isl_union_map_copy(access)); |
1118 | no_reuse = isl_union_map_is_injective(local); |
1119 | if (no_reuse < 0) |
1120 | r = -1; |
1121 | if (use_shared && no_reuse) |
1122 | coalesced = access_is_coalesced(data, local); |
1123 | isl_union_map_free(local); |
1124 | |
1125 | if (r >= 0 && kernel->options->debug->verbose && |
1126 | use_shared && no_reuse && coalesced) |
1127 | report_no_reuse_and_coalesced(kernel, access); |
1128 | |
1129 | if (use_shared && (!no_reuse || !coalesced)) { |
1130 | group->shared_tile = gpu_array_tile_create(ctx, |
1131 | group->array->n_index); |
1132 | acc = shared_access(group, access, data); |
1133 | if (!group->shared_tile) |
1134 | r = -1; |
1135 | else if (!can_tile(acc, group->shared_tile)) |
1136 | group->shared_tile = |
1137 | gpu_array_tile_free(group->shared_tile); |
1138 | isl_map_free(acc); |
1139 | } |
1140 | |
1141 | if (r < 0 || (!force_private && (!use_private || no_reuse))) { |
1142 | isl_union_map_free(access); |
1143 | return r; |
1144 | } |
1145 | |
1146 | access = isl_union_map_apply_domain(access, |
1147 | isl_union_map_copy(data->thread_sched)); |
1148 | |
1149 | acc = isl_map_from_union_map(access); |
1150 | |
1151 | if (!force_private && !access_is_bijective(data, acc)) { |
1152 | isl_map_free(acc); |
1153 | return 0; |
1154 | } |
1155 | |
1156 | unique_depth = compute_accessed_by_single_thread_depth(data, acc); |
1157 | |
1158 | acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization)); |
1159 | acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth, |
1160 | data->n_thread); |
1161 | requires_unroll = check_requires_unroll(data, acc, force_private); |
1162 | if (unique_depth < 0 || requires_unroll < 0 || |
1163 | (requires_unroll && kernel->any_force_private)) { |
1164 | isl_map_free(acc); |
1165 | return requires_unroll < 0 ? -1 : 0; |
1166 | } |
1167 | |
1168 | group->private_tile = gpu_array_tile_create(ctx, n_index); |
1169 | if (!group->private_tile) { |
1170 | isl_map_free(acc); |
1171 | return -1; |
1172 | } |
1173 | group->private_tile->requires_unroll = requires_unroll; |
1174 | if (!can_tile(acc, group->private_tile)) |
1175 | group->private_tile = gpu_array_tile_free(group->private_tile); |
1176 | |
1177 | isl_map_free(acc); |
1178 | |
1179 | if (group->private_tile) { |
1180 | struct gpu_array_tile *tile = group->private_tile; |
1181 | int tile_depth = compute_tile_depth(data, tile); |
1182 | if (tile_depth < unique_depth) |
1183 | tile_depth = unique_depth; |
1184 | if (tile_adjust_depth(tile, tile_depth) < 0) |
1185 | return -1; |
1186 | } |
1187 | |
1188 | if (force_private && !group->private_tile) |
1189 | isl_die(ctx, isl_error_internal, |
1190 | "unable to map array reference group to registers", |
1191 | return -1); |
1192 | |
1193 | return 0; |
1194 | } |
1195 | |
1196 | /* Compute the private and/or shared memory tiles for the array |
1197 | * reference group "group" of array "array" and set the tile depth. |
1198 | * Return 0 on success and -1 on error. |
1199 | */ |
1200 | static int compute_group_bounds(struct ppcg_kernel *kernel, |
1201 | struct gpu_array_ref_group *group, struct gpu_group_data *data) |
1202 | { |
1203 | if (!group) |
1204 | return -1; |
1205 | if (compute_group_bounds_core(kernel, group, data) < 0) |
1206 | return -1; |
1207 | if (set_depth(data, group) < 0) |
1208 | return -1; |
1209 | |
1210 | return 0; |
1211 | } |
1212 | |
1213 | /* If two groups have overlapping access relations (as determined by |
1214 | * the "overlap" function) and if one of them involves a write, |
1215 | * then merge the two groups into one. |
1216 | * If "compute_bounds" is set, then call compute_group_bounds |
1217 | * on the merged groups. |
1218 | * |
1219 | * Return the updated number of groups. |
1220 | * Return -1 on error. |
1221 | */ |
1222 | static int group_writes(struct ppcg_kernel *kernel, |
1223 | int n, struct gpu_array_ref_group **groups, |
1224 | int (*overlap)(struct gpu_array_ref_group *group1, |
1225 | struct gpu_array_ref_group *group2), int compute_bounds, |
1226 | struct gpu_group_data *data) |
1227 | { |
1228 | int i, j; |
1229 | |
1230 | for (i = 0; i < n; ++i) { |
1231 | for (j = n - 1; j > i; --j) { |
1232 | if (!groups[i]->write && !groups[j]->write) |
1233 | continue; |
1234 | |
1235 | if (!overlap(groups[i], groups[j])) |
1236 | continue; |
1237 | |
1238 | groups[i] = join_groups_and_free(groups[i], groups[j]); |
1239 | if (j != n - 1) |
1240 | groups[j] = groups[n - 1]; |
1241 | groups[n - 1] = NULL; |
1242 | n--; |
1243 | |
1244 | if (!groups[i]) |
1245 | return -1; |
1246 | if (compute_bounds && |
1247 | compute_group_bounds(kernel, groups[i], data) < 0) |
1248 | return -1; |
1249 | } |
1250 | } |
1251 | |
1252 | return n; |
1253 | } |
1254 | |
1255 | /* If two groups have overlapping access relations (within the innermost |
1256 | * loop) and if one of them involves a write, then merge the two groups |
1257 | * into one. |
1258 | * |
1259 | * Return the updated number of groups. |
1260 | */ |
1261 | static int group_overlapping_writes(struct ppcg_kernel *kernel, |
1262 | int n, struct gpu_array_ref_group **groups, |
1263 | struct gpu_group_data *data) |
1264 | { |
1265 | return group_writes(kernel, n, groups, &accesses_overlap, 0, data); |
1266 | } |
1267 | |
1268 | /* Check if the access relations of group1 and group2 overlap within |
1269 | * the outermost min(group1->min_depth, group2->min_depth) loops. |
1270 | */ |
1271 | static int depth_accesses_overlap(struct gpu_array_ref_group *group1, |
1272 | struct gpu_array_ref_group *group2) |
1273 | { |
1274 | int depth; |
1275 | int dim; |
1276 | int empty; |
1277 | isl_map *map_i, *map_j, *map; |
1278 | |
1279 | depth = group1->min_depth; |
1280 | if (group2->min_depth < depth) |
1281 | depth = group2->min_depth; |
1282 | map_i = isl_map_copy(group1->access); |
1283 | dim = isl_map_dim(map_i, isl_dim_in); |
1284 | map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth); |
1285 | map_j = isl_map_copy(group2->access); |
1286 | map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth); |
1287 | map = isl_map_intersect(map_i, map_j); |
1288 | empty = isl_map_is_empty(map); |
1289 | isl_map_free(map); |
1290 | |
1291 | return !empty; |
1292 | } |
1293 | |
1294 | /* If two groups have overlapping access relations (within the outer |
1295 | * depth loops) and if one of them involves a write, |
1296 | * then merge the two groups into one. |
1297 | * |
1298 | * Return the updated number of groups. |
1299 | */ |
1300 | static int group_depth_overlapping_writes(struct ppcg_kernel *kernel, |
1301 | int n, struct gpu_array_ref_group **groups, struct gpu_group_data *data) |
1302 | { |
1303 | return group_writes(kernel, n, groups, &depth_accesses_overlap, 1, |
1304 | data); |
1305 | } |
1306 | |
1307 | /* Is the size of the tile specified by "tile" smaller than the sum of |
1308 | * the sizes of the tiles specified by "tile1" and "tile2"? |
1309 | */ |
1310 | static int smaller_tile(struct gpu_array_tile *tile, |
1311 | struct gpu_array_tile *tile1, struct gpu_array_tile *tile2) |
1312 | { |
1313 | int smaller; |
1314 | isl_val *size, *size1, *size2; |
1315 | |
1316 | size = gpu_array_tile_size(tile); |
1317 | size1 = gpu_array_tile_size(tile1); |
1318 | size2 = gpu_array_tile_size(tile2); |
1319 | |
1320 | size = isl_val_sub(size, size1); |
1321 | size = isl_val_sub(size, size2); |
1322 | smaller = isl_val_is_neg(size); |
1323 | |
1324 | isl_val_free(size); |
1325 | |
1326 | return smaller; |
1327 | } |
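
/* For example (hypothetical sizes, in elements): tiles of 384 and 256
 * elements whose merged group admits a tile of 512 elements satisfy
 * 512 < 384 + 256, so the merged tile is considered smaller and
 * the merge worthwhile.
 */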
1328 | |
1329 | /* Given an initial grouping of array references and shared memory tiles |
1330 | * for each group that allows for a shared memory tile, merge two groups |
1331 | * if both have a shared memory tile, the merged group also has |
1332 | * a shared memory tile and the size of the tile for the merged group |
1333 | * is smaller than the sum of the tile sizes of the individual groups. |
1334 | * |
1335 | * If merging two groups decreases the depth of the tile of |
1336 | * one or both of the two groups, then we need to check for overlapping |
1337 | * writes again. |
1338 | * |
1339 | * Return the number of groups after merging. |
1340 | * Return -1 on error. |
1341 | */ |
1342 | static int group_common_shared_memory_tile(struct ppcg_kernel *kernel, |
1343 | struct gpu_array_info *array, int n, |
1344 | struct gpu_array_ref_group **groups, struct gpu_group_data *data) |
1345 | { |
1346 | int i, j; |
1347 | int recompute_overlap = 0; |
1348 | |
1349 | for (i = 0; i < n; ++i) { |
1350 | if (!groups[i]->shared_tile) |
1351 | continue; |
1352 | for (j = n - 1; j > i; --j) { |
1353 | struct gpu_array_ref_group *group; |
1354 | |
1355 | if (!groups[j]->shared_tile) |
1356 | continue; |
1357 | |
1358 | if (!depth_accesses_overlap(groups[i], groups[j])) |
1359 | continue; |
1360 | |
1361 | group = join_groups(groups[i], groups[j]); |
1362 | if (compute_group_bounds(kernel, group, data) < 0) { |
1363 | gpu_array_ref_group_free(group); |
1364 | return -1; |
1365 | } |
1366 | if (!group->shared_tile || |
1367 | !smaller_tile(group->shared_tile, |
1368 | groups[i]->shared_tile, |
1369 | groups[j]->shared_tile)) { |
1370 | gpu_array_ref_group_free(group); |
1371 | continue; |
1372 | } |
1373 | |
1374 | if (group->min_depth < groups[i]->min_depth || |
1375 | group->min_depth < groups[j]->min_depth) |
1376 | recompute_overlap = 1; |
1377 | gpu_array_ref_group_free(groups[i]); |
1378 | gpu_array_ref_group_free(groups[j]); |
1379 | groups[i] = group; |
1380 | if (j != n - 1) |
1381 | groups[j] = groups[n - 1]; |
1382 | n--; |
1383 | } |
1384 | } |
1385 | |
1386 | if (recompute_overlap) |
1387 | n = group_depth_overlapping_writes(kernel, n, groups, data); |
1388 | return n; |
1389 | } |
1390 | |
1391 | /* Set array->n_group and array->groups to n and groups. |
1392 | * |
1393 | * Additionally, set the "nr" field of each group. |
1394 | */ |
1395 | static void set_array_groups(struct gpu_local_array_info *array, |
1396 | int n, struct gpu_array_ref_group **groups) |
1397 | { |
1398 | int i; |
1399 | |
1400 | array->n_group = n; |
1401 | array->groups = groups; |
1402 | |
1403 | for (i = 0; i < n; ++i) |
1404 | groups[i]->nr = i; |
1405 | } |
1406 | |
1407 | /* Combine all groups in "groups" into a single group and return |
1408 | * the new number of groups (1 or 0 if there were no groups to start with). |
1409 | */ |
1410 | static int join_all_groups(int n, struct gpu_array_ref_group **groups) |
1411 | { |
1412 | int i; |
1413 | |
1414 | for (i = n - 1; i > 0; --i) { |
1415 | groups[0] = join_groups_and_free(groups[0], groups[i]); |
1416 | groups[i] = NULL; |
1417 | n--; |
1418 | } |
1419 | |
1420 | return n; |
1421 | } |
1422 | |
1423 | /* Group array references that should be considered together when |
1424 | * deciding whether to access them from private, shared or global memory. |
1425 | * Return -1 on error. |
1426 | * |
1427 | * In particular, if two array references overlap and if one of them |
1428 | * is a write, then the two references are grouped together. |
1429 | * We first perform an initial grouping based only on the access relation. |
1430 | * After computing shared and private memory tiles, we check for |
1431 | * overlapping writes again, but this time taking into account |
1432 | * the depth of the effective tile. |
1433 | * |
1434 | * Furthermore, if two groups admit a shared memory tile and if the |
1435 | * combination of the two also admits a shared memory tile, we merge |
1436 | * the two groups. |
1437 | * |
1438 | * If the array contains structures, then we compute a single |
1439 | * reference group without trying to find any tiles |
1440 | * since we do not map such arrays to private or shared |
1441 | * memory. The only exception is when those arrays of structures |
1442 | * are required to be mapped to private memory. |
1443 | */ |
1444 | static int group_array_references(struct ppcg_kernel *kernel, |
1445 | struct gpu_local_array_info *local, struct gpu_group_data *data) |
1446 | { |
1447 | int i; |
1448 | int n; |
1449 | isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched); |
1450 | struct gpu_array_ref_group **groups; |
1451 | |
1452 | groups = isl_calloc_array(ctx, struct gpu_array_ref_group *, |
1453 | local->array->n_ref); |
1454 | if (!groups) |
1455 | return -1; |
1456 | |
1457 | n = populate_array_references(local, groups, data); |
1458 | |
1459 | if (local->array->has_compound_element && !local->force_private) { |
1460 | n = join_all_groups(n, groups); |
1461 | set_array_groups(local, n, groups); |
1462 | return 0; |
1463 | } |
1464 | |
1465 | n = group_overlapping_writes(kernel, n, groups, data); |
1466 | |
1467 | for (i = 0; i < n; ++i) |
1468 | if (compute_group_bounds(kernel, groups[i], data) < 0) |
1469 | n = -1; |
1470 | |
1471 | n = group_depth_overlapping_writes(kernel, n, groups, data); |
1472 | |
1473 | n = group_common_shared_memory_tile(kernel, local->array, |
1474 | n, groups, data); |
1475 | |
1476 | set_array_groups(local, n, groups); |
1477 | |
1478 | if (n >= 0) |
1479 | return 0; |
1480 | |
1481 | for (i = 0; i < local->array->n_ref; ++i) |
1482 | gpu_array_ref_group_free(groups[i]); |
1483 | return -1; |
1484 | } |
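/* Illustrative sketch, not part of the original ppcg source: the
 * overlap criterion driving the grouping above essentially asks
 * whether the array elements written through one access relation
 * intersect the elements accessed through another.  In ppcg the
 * access relations are first composed with the relevant prefix
 * schedule; that step and all error handling are elided here, and
 * the helper name is hypothetical.
 */
static int sketch_accesses_overlap(__isl_keep isl_map *write_access,
	__isl_keep isl_map *other_access)
{
	isl_set *written, *accessed;
	int overlap;

	written = isl_map_range(isl_map_copy(write_access));
	accessed = isl_map_range(isl_map_copy(other_access));
	written = isl_set_intersect(written, accessed);
	overlap = !isl_set_is_empty(written);
	isl_set_free(written);

	return overlap;
}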
1485 | |
1486 | /* For each array in the input program that can be mapped to private memory, |
1487 | * check if there are any order dependences active inside the current kernel, |
1488 | * within the same iteration of the host schedule, i.e., the prefix |
1489 | * schedule at "node". |
1490 | * If so, mark the array as force_private so that its reference groups will be |
1491 | * mapped to registers. |
1492 | * |
1493 | * Note that the arrays that cannot be mapped to private memory have |
1494 | * had their order dependences added to prog->array_order and |
1495 | * subsequently to the coincidence constraints. |
1496 | */ |
1497 | static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel, |
1498 | __isl_keep isl_schedule_node *node) |
1499 | { |
1500 | int i; |
1501 | isl_union_set *domain; |
1502 | isl_multi_union_pw_aff *prefix; |
1503 | isl_union_pw_multi_aff *contraction; |
1504 | |
1505 | if (!kernel->options->live_range_reordering) |
1506 | return; |
1507 | |
1508 | kernel->any_force_private = 0; |
1509 | |
1510 | prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); |
1511 | contraction = isl_union_pw_multi_aff_copy(kernel->contraction); |
1512 | prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, |
1513 | contraction); |
1514 | domain = isl_union_set_copy(kernel->expanded_domain); |
1515 | domain = isl_union_set_universe(domain); |
1516 | |
1517 | for (i = 0; i < kernel->n_array; ++i) { |
1518 | struct gpu_local_array_info *local = &kernel->array[i]; |
1519 | isl_union_map *order; |
1520 | |
1521 | local->force_private = 0; |
1522 | if (!gpu_array_can_be_private(local->array)) |
1523 | continue; |
1524 | order = isl_union_map_copy(local->array->dep_order); |
1525 | order = isl_union_map_intersect_domain(order, |
1526 | isl_union_set_copy(domain)); |
1527 | order = isl_union_map_intersect_range(order, |
1528 | isl_union_set_copy(domain)); |
1529 | order = isl_union_map_eq_at_multi_union_pw_aff(order, |
1530 | isl_multi_union_pw_aff_copy(prefix)); |
1531 | if (!isl_union_map_is_empty(order)) { |
1532 | local->force_private = 1; |
1533 | kernel->any_force_private = 1; |
1534 | } |
1535 | isl_union_map_free(order); |
1536 | } |
1537 | |
1538 | isl_multi_union_pw_aff_free(prefix); |
1539 | isl_union_set_free(domain); |
1540 | } |
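/* Minimal illustration (hypothetical, not from the ppcg source) of the
 * emptiness test above.  isl_union_map_eq_at_multi_union_pw_aff keeps
 * only those pairs that are assigned equal values by the prefix
 * schedule.  With prefix [{ S[i] -> [(i)] }], the order dependence
 * { S[i] -> S[i + 1] } never relates iterations with the same prefix
 * value, so the restriction is empty and the array would not be
 * marked force_private.
 */
static void sketch_eq_at(isl_ctx *ctx)
{
	isl_union_map *order;
	isl_multi_union_pw_aff *prefix;

	order = isl_union_map_read_from_str(ctx, "{ S[i] -> S[i + 1] }");
	prefix = isl_multi_union_pw_aff_read_from_str(ctx,
		"[{ S[i] -> [(i)] }]");
	order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix);
	/* isl_union_map_is_empty(order) now returns isl_bool_true */
	isl_union_map_free(order);
}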
1541 | |
1542 | /* Expand the domain of the schedule "s" by plugging in |
1543 | * the contraction "contraction" and return the result. |
1544 | */ |
1545 | static __isl_give isl_union_map *expand(__isl_take isl_union_map *s, |
1546 | __isl_keep isl_union_pw_multi_aff *contraction) |
1547 | { |
1548 | contraction = isl_union_pw_multi_aff_copy(contraction); |
1549 | s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction); |
1550 | return s; |
1551 | } |
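/* Worked example (not in the original source): plugging the
 * contraction { S[i, j] -> G[(i)] } into the schedule { G[i] -> [i] }
 * yields { S[i, j] -> [i] }, i.e., a schedule expressed on the
 * contracted (grouped) domain now applies to the expanded domain
 * elements.  The helper name is hypothetical.
 */
static __isl_give isl_union_map *sketch_expand(isl_ctx *ctx)
{
	isl_union_map *s;
	isl_union_pw_multi_aff *contraction;

	s = isl_union_map_read_from_str(ctx, "{ G[i] -> [i] }");
	contraction = isl_union_pw_multi_aff_read_from_str(ctx,
		"{ S[i, j] -> G[(i)] }");
	/* result: { S[i, j] -> [i] } */
	return isl_union_map_preimage_domain_union_pw_multi_aff(s,
		contraction);
}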
1552 | |
1553 | /* Create a set of dimension data->thread_depth + data->n_thread |
1554 | * that equates the residue of the final data->n_thread dimensions |
1555 | * modulo the kernel->block_dim sizes to the thread identifiers. |
1556 | * Store the computed set in data->privatization. |
1557 | * |
1558 | * The construction starts with the space of kernel->thread_filter, |
1559 | * which is known to reference all thread identifiers. |
1560 | */ |
1561 | static void compute_privatization(struct gpu_group_data *data, |
1562 | struct ppcg_kernel *kernel) |
1563 | { |
1564 | int i; |
1565 | isl_ctx *ctx; |
1566 | isl_space *space; |
1567 | isl_local_space *ls; |
1568 | isl_set *set; |
1569 | |
1570 | ctx = isl_union_map_get_ctx(data->shared_sched); |
1571 | space = isl_union_set_get_space(kernel->thread_filter); |
1572 | space = isl_space_set_from_params(space); |
1573 | space = isl_space_add_dims(space, isl_dim_set, |
1574 | data->thread_depth + data->n_thread); |
1575 | set = isl_set_universe(space); |
1576 | space = isl_set_get_space(set); |
1577 | ls = isl_local_space_from_space(space); |
1578 | |
1579 | for (i = 0; i < data->n_thread; ++i) { |
1580 | isl_aff *aff, *aff2; |
1581 | isl_constraint *c; |
1582 | isl_val *v; |
1583 | isl_id *id; |
1584 | int pos; |
1585 | |
1586 | aff = isl_aff_var_on_domain(isl_local_space_copy(ls), |
1587 | isl_dim_set, data->thread_depth + i); |
1588 | v = isl_val_int_from_si(ctx, kernel->block_dim[i]); |
1589 | aff = isl_aff_mod_val(aff, v); |
1590 | id = isl_id_list_get_id(kernel->thread_ids, i); |
1591 | pos = isl_set_find_dim_by_id(set, isl_dim_param, id); |
1592 | isl_id_free(id); |
1593 | aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls), |
1594 | isl_dim_param, pos); |
1595 | aff = isl_aff_sub(aff, aff2); |
1596 | c = isl_equality_from_aff(aff); |
1597 | set = isl_set_add_constraint(set, c); |
1598 | } |
1599 | |
1600 | isl_local_space_free(ls); |
1601 | data->privatization = set; |
1602 | } |
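/* Worked instance (illustrative, not from the source): with
 * data->thread_depth = 2, data->n_thread = 2,
 * kernel->block_dim = { 32, 4 } and thread identifiers tx and ty,
 * the set stored in data->privatization is
 *
 *	[tx, ty] -> { [i0, i1, t0, t1] : t0 mod 32 = tx and
 *					 t1 mod 4 = ty }
 */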
1603 | |
1604 | /* Return the prefix schedule at "node" as a relation |
1605 | * between domain elements and schedule dimensions after detecting |
1606 | * equalities in this relation. |
1607 | */ |
1608 | static __isl_give isl_union_map *prefix_with_equalities( |
1609 | __isl_keep isl_schedule_node *node) |
1610 | { |
1611 | isl_union_map *schedule; |
1612 | |
1613 | schedule = isl_schedule_node_get_prefix_schedule_relation(node); |
1614 | schedule = isl_union_map_detect_equalities(schedule); |
1615 | |
1616 | return schedule; |
1617 | } |
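/* Note that detecting equalities does not change the relation itself;
 * it only makes implicit equality constraints explicit in the internal
 * representation, so that later steps can directly recognize schedule
 * dimensions with a fixed value.
 */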
1618 | |
1619 | /* Group references of all arrays in "kernel". |
1620 | * "node" points to the kernel mark. |
1621 | * The mapping to shared memory is computed at the "shared" mark. |
1622 | * |
1623 | * We first extract all required schedule information into |
1624 | * a gpu_group_data structure and then consider each array |
1625 | * in turn. |
1626 | */ |
1627 | int gpu_group_references(struct ppcg_kernel *kernel, |
1628 | __isl_keep isl_schedule_node *node) |
1629 | { |
1630 | int i; |
1631 | int r = 0; |
1632 | isl_union_pw_multi_aff *contraction; |
1633 | struct gpu_group_data data; |
1634 | |
1635 | check_can_be_private_live_ranges(kernel, node); |
1636 | |
1637 | data.scop = kernel->prog->scop; |
1638 | |
1639 | data.kernel_depth = isl_schedule_node_get_schedule_depth(node); |
1640 | data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node); |
1641 | |
1642 | node = isl_schedule_node_copy(node); |
1643 | node = gpu_tree_move_down_to_shared(node, kernel->core); |
1644 | data.shared_depth = isl_schedule_node_get_schedule_depth(node); |
1645 | data.shared_sched = prefix_with_equalities(node); |
1646 | |
1647 | node = gpu_tree_move_down_to_thread(node, kernel->core); |
1648 | node = isl_schedule_node_child(node, 0); |
1649 | data.thread_depth = isl_schedule_node_get_schedule_depth(node); |
1650 | data.n_thread = isl_schedule_node_band_n_member(node); |
1651 | if (data.thread_depth == data.shared_depth) |
1652 | data.copy_sched = isl_union_map_copy(data.shared_sched); |
1653 | else |
1654 | data.copy_sched = prefix_with_equalities(node); |
1655 | data.thread_sched = isl_union_map_copy(data.copy_sched); |
1656 | data.thread_sched = isl_union_map_flat_range_product(data.thread_sched, |
1657 | isl_schedule_node_band_get_partial_schedule_union_map(node)); |
1658 | data.thread_sched = isl_union_map_detect_equalities(data.thread_sched); |
1659 | |
1660 | contraction = isl_union_pw_multi_aff_copy(kernel->contraction); |
1661 | data.host_sched = expand(data.host_sched, contraction); |
1662 | data.shared_sched = expand(data.shared_sched, contraction); |
1663 | if (data.thread_depth == data.shared_depth) { |
1664 | isl_union_map_free(data.copy_sched); |
1665 | data.copy_sched = isl_union_map_copy(data.shared_sched); |
1666 | } else { |
1667 | data.copy_sched = expand(data.copy_sched, contraction); |
1668 | } |
1669 | data.thread_sched = expand(data.thread_sched, contraction); |
1670 | isl_union_pw_multi_aff_free(contraction); |
1671 | |
1672 | node = isl_schedule_node_child(node, 0); |
1673 | data.full_sched = isl_union_map_copy(data.thread_sched); |
1674 | data.full_sched = isl_union_map_flat_range_product(data.full_sched, |
1675 | isl_schedule_node_get_subtree_schedule_union_map(node)); |
1676 | isl_schedule_node_free(node); |
1677 | |
1678 | compute_privatization(&data, kernel); |
1679 | |
1680 | for (i = 0; i < kernel->n_array; ++i) { |
1681 | r = group_array_references(kernel, &kernel->array[i], &data); |
1682 | if (r < 0) |
1683 | break; |
1684 | } |
1685 | |
1686 | isl_union_map_free(data.host_sched); |
1687 | isl_union_map_free(data.shared_sched); |
1688 | isl_union_map_free(data.copy_sched); |
1689 | isl_union_map_free(data.thread_sched); |
1690 | isl_union_map_free(data.full_sched); |
1691 | isl_set_free(data.privatization); |
1692 | |
1693 | return r; |
1694 | } |
1695 | |
1696 | /* Given a description of an array tile "tile" and the "space" |
1697 | * |
1698 | * { D -> A } |
1699 | * |
1700 | * where D represents the first tile->depth schedule dimensions |
1701 | * and A represents the array, construct an isl_multi_aff |
1702 | * |
1703 | * { [D[i] -> A[a]] -> A'[a'] } |
1704 | * |
1705 | * with A' a scaled down copy of A according to the shifts and strides |
1706 | * in "tile". In particular, |
1707 | * |
1708 | * a' = (a + shift(i))/stride |
1709 | * |
1710 | * "insert_array" represents |
1711 | * |
1712 | * { [D -> A] -> D } |
1713 | * |
1714 | * and is used to insert A into the domain of functions that only |
1715 | * reference D. |
1716 | */ |
1717 | static __isl_give isl_multi_aff *strided_tile( |
1718 | struct gpu_array_tile *tile, __isl_keep isl_space *space, |
1719 | __isl_keep isl_multi_aff *insert_array) |
1720 | { |
1721 | int i; |
1722 | isl_ctx *ctx; |
1723 | isl_multi_aff *shift; |
1724 | isl_multi_val *stride; |
1725 | isl_space *space2; |
1726 | isl_local_space *ls; |
1727 | isl_multi_aff *tiling; |
1728 | |
1729 | ctx = isl_space_get_ctx(space); |
1730 | space2 = isl_space_domain(isl_space_copy(space)); |
1731 | ls = isl_local_space_from_space(space2); |
1732 | space2 = isl_space_range(isl_space_copy(space)); |
1733 | stride = isl_multi_val_zero(space2); |
1734 | shift = isl_multi_aff_zero(isl_space_copy(space)); |
1735 | |
1736 | for (i = 0; i < tile->n; ++i) { |
1737 | struct gpu_array_bound *bound = &tile->bound[i]; |
1738 | isl_val *stride_i; |
1739 | isl_aff *shift_i; |
1740 | |
1741 | if (tile->bound[i].shift) { |
1742 | stride_i = isl_val_copy(bound->stride); |
1743 | shift_i = isl_aff_copy(bound->shift); |
1744 | } else { |
1745 | stride_i = isl_val_one(ctx); |
1746 | shift_i = isl_aff_zero_on_domain( |
1747 | isl_local_space_copy(ls)); |
1748 | } |
1749 | |
1750 | stride = isl_multi_val_set_val(stride, i, stride_i); |
1751 | shift = isl_multi_aff_set_aff(shift, i, shift_i); |
1752 | } |
1753 | isl_local_space_free(ls); |
1754 | |
1755 | shift = isl_multi_aff_pullback_multi_aff(shift, |
1756 | isl_multi_aff_copy(insert_array)); |
1757 | |
1758 | tiling = isl_multi_aff_range_map(isl_space_copy(space)); |
1759 | tiling = isl_multi_aff_add(tiling, shift); |
1760 | tiling = isl_multi_aff_scale_down_multi_val(tiling, stride); |
1761 | |
1762 | return tiling; |
1763 | } |
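/* Worked example (illustrative): for a reference A[2i + 1] at depth 1,
 * the bound has stride 2 and shift(i) = 1, so
 *
 *	a' = (a + 1)/2
 *
 * and the element A[2i + 1] accessed at D[i] is mapped to A'[i + 1].
 */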
1764 | |
1765 | /* Compute a tiling for the array reference group "group". |
1766 | * |
1767 | * The tiling is of the form |
1768 | * |
1769 | * { [D[i] -> A[a]] -> T[t] } |
1770 | * |
1771 | * where D represents the first tile->depth schedule dimensions, |
1772 | * A represents the global array and T represents the shared or |
1773 | * private memory tile. The name of T is the name of the local |
1774 | * array. |
1775 | * |
1776 | * If there is any stride in the accesses, then the mapping is |
1777 | * |
1778 | * t = (a + shift(i))/stride - lb(i) |
1779 | * |
1780 | * otherwise, it is simply |
1781 | * |
1782 | * t = a - lb(i) |
1783 | */ |
1784 | void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group) |
1785 | { |
1786 | int i; |
1787 | struct gpu_array_tile *tile; |
1788 | isl_space *space; |
1789 | isl_multi_aff *tiling, *lb, *insert_array; |
1790 | isl_printer *p; |
1791 | char *local_name; |
1792 | |
1793 | tile = gpu_array_ref_group_tile(group); |
1794 | if (!tile) |
1795 | return; |
1796 | |
1797 | space = isl_map_get_space(group->access); |
1798 | space = isl_space_from_range(isl_space_range(space)); |
1799 | space = isl_space_add_dims(space, isl_dim_in, tile->depth); |
1800 | insert_array = isl_multi_aff_domain_map(isl_space_copy(space)); |
1801 | |
1802 | for (i = 0; i < tile->n; ++i) |
1803 | if (tile->bound[i].shift) |
1804 | break; |
1805 | |
1806 | if (i < tile->n) |
1807 | tiling = strided_tile(tile, space, insert_array); |
1808 | else |
1809 | tiling = isl_multi_aff_range_map(isl_space_copy(space)); |
1810 | |
1811 | lb = isl_multi_aff_zero(space); |
1812 | for (i = 0; i < tile->n; ++i) { |
1813 | isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb); |
1814 | lb = isl_multi_aff_set_aff(lb, i, lb_i); |
1815 | } |
1816 | lb = isl_multi_aff_pullback_multi_aff(lb, insert_array); |
1817 | |
1818 | tiling = isl_multi_aff_sub(tiling, lb); |
1819 | |
1820 | p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling)); |
1821 | p = gpu_array_ref_group_print_name(group, p); |
1822 | local_name = isl_printer_get_str(p); |
1823 | isl_printer_free(p); |
1824 | tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name); |
1825 | free(local_name); |
1826 | |
1827 | tile->tiling = tiling; |
1828 | } |
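/* Worked example (illustrative, not from the source): for a group of
 * unstrided accesses A[i + t] with 0 <= t < 32 at depth 1 and
 * lb(i) = i, the computed tiling is
 *
 *	{ [D[i] -> A[a]] -> shared_A[a - i] }
 *
 * so the 32 elements touched at iteration i land in
 * shared_A[0], ..., shared_A[31].
 */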