978 lines
28 KiB
C
978 lines
28 KiB
C
/* workitems.c -- The main runtime entry that performs work-item execution in
|
|
various ways and the builtin functions closely related to the
|
|
implementation.
|
|
|
|
Copyright (C) 2015-2021 Free Software Foundation, Inc.
|
|
Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
|
|
for General Processor Tech.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a
|
|
copy of this software and associated documentation files
|
|
(the "Software"), to deal in the Software without restriction, including
|
|
without limitation the rights to use, copy, modify, merge, publish,
|
|
distribute, sublicense, and/or sell copies of the Software, and to
|
|
permit persons to whom the Software is furnished to do so, subject to
|
|
the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included
|
|
in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
|
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
|
USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
/* The fiber based multiple work-item work-group execution uses ucontext
|
|
based user mode threading. However, if gccbrig is able to optimize the
|
|
kernel to a much faster work-group function that implements the multiple
|
|
WI execution using loops instead of fibers requiring slow context switches,
|
|
the fiber-based implementation won't be called.
|
|
*/
|
|
|
|
#include <stdlib.h>
|
|
#include <signal.h>
|
|
#include <string.h>
|
|
|
|
#include "workitems.h"
|
|
#include "phsa-rt.h"
|
|
|
|
#ifdef HAVE_FIBERS
|
|
#include "fibers.h"
|
|
#endif
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
#include <stdio.h>
|
|
#include <time.h>
|
|
|
|
static uint64_t wi_count = 0;
|
|
static uint64_t wis_skipped = 0;
|
|
static uint64_t wi_total = 0;
|
|
static clock_t start_time;
|
|
|
|
#endif
|
|
|
|
#ifdef DEBUG_PHSA_RT
|
|
#include <stdio.h>
|
|
#endif
|
|
|
|
#define PRIVATE_SEGMENT_ALIGN 256
|
|
#define FIBER_STACK_SIZE (64*1024)
|
|
#define GROUP_SEGMENT_ALIGN 256
|
|
|
|
/* Preserve this amount of additional space in the alloca stack as we need to
|
|
store the alloca frame pointer to the alloca frame, thus must preserve
|
|
space for it. This thus supports at most 1024 functions with allocas in
|
|
a call chain. */
|
|
#define ALLOCA_OVERHEAD 1024*4
|
|
|
|
uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context);
|
|
|
|
uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context);
|
|
|
|
uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context);
|
|
|
|
uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi);
|
|
|
|
uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi);
|
|
|
|
void
|
|
phsa_fatal_error (int code)
|
|
{
|
|
exit (code);
|
|
}
|
|
|
|
#ifdef HAVE_FIBERS
|
|
/* ucontext-based work-item thread implementation. Runs all work-items in
|
|
separate fibers. */
|
|
|
|
static void
|
|
phsa_work_item_thread (int arg0, int arg1)
|
|
{
|
|
void *arg = fiber_int_args_to_ptr (arg0, arg1);
|
|
|
|
PHSAWorkItem *wi = (PHSAWorkItem *) arg;
|
|
volatile PHSAWorkGroup *wg = wi->wg;
|
|
PHSAKernelLaunchData *l_data = wi->launch_data;
|
|
|
|
do
|
|
{
|
|
int retcode
|
|
= fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier);
|
|
|
|
/* At this point the threads can assume that either more_wgs is 0 or
|
|
the current_work_group_* is set to point to the WG executed next. */
|
|
if (!wi->wg->more_wgs)
|
|
break;
|
|
|
|
wi->group_x = wg->x;
|
|
wi->group_y = wg->y;
|
|
wi->group_z = wg->z;
|
|
|
|
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
|
|
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
|
|
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
|
|
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf (
|
|
"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
|
|
wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
|
|
l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
|
|
#endif
|
|
|
|
if (wi->x < __hsail_currentworkgroupsize (0, wi)
|
|
&& wi->y < __hsail_currentworkgroupsize (1, wi)
|
|
&& wi->z < __hsail_currentworkgroupsize (2, wi))
|
|
{
|
|
l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr,
|
|
wg->initial_group_offset, wg->private_base_ptr);
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf ("done.\n");
|
|
#endif
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
wi_count++;
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf ("skipped (partial WG).\n");
|
|
#endif
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
wis_skipped++;
|
|
#endif
|
|
}
|
|
|
|
retcode
|
|
= fiber_barrier_reach ((fiber_barrier_t *)
|
|
l_data->wg_completion_barrier);
|
|
|
|
/* The first thread updates the WG to execute next etc. */
|
|
|
|
if (retcode == 0)
|
|
{
|
|
#ifdef EXECUTE_WGS_BACKWARDS
|
|
if (wg->x == l_data->wg_min_x)
|
|
{
|
|
wg->x = l_data->wg_max_x - 1;
|
|
if (wg->y == l_data->wg_min_y)
|
|
{
|
|
wg->y = l_data->wg_max_y - 1;
|
|
if (wg->z == l_data->wg_min_z)
|
|
wg->more_wgs = 0;
|
|
else
|
|
wg->z--;
|
|
}
|
|
else
|
|
wg->y--;
|
|
}
|
|
else
|
|
wg->x--;
|
|
#else
|
|
if (wg->x + 1 >= l_data->wg_max_x)
|
|
{
|
|
wg->x = l_data->wg_min_x;
|
|
if (wg->y + 1 >= l_data->wg_max_y)
|
|
{
|
|
wg->y = l_data->wg_min_y;
|
|
if (wg->z + 1 >= l_data->wg_max_z)
|
|
wg->more_wgs = 0;
|
|
else
|
|
wg->z++;
|
|
}
|
|
else
|
|
wg->y++;
|
|
}
|
|
else
|
|
wg->x++;
|
|
#endif
|
|
wi->group_x = wg->x;
|
|
wi->group_y = wg->y;
|
|
wi->group_z = wg->z;
|
|
|
|
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
|
|
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
|
|
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
|
|
|
|
/* Reinitialize the work-group barrier according to the new WG's
|
|
size, which might not be the same as the previous ones, due
|
|
to "partial WGs". */
|
|
size_t wg_size = __hsail_currentworkgroupsize (0, wi)
|
|
* __hsail_currentworkgroupsize (1, wi)
|
|
* __hsail_currentworkgroupsize (2, wi);
|
|
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf ("Reinitializing the WG barrier to %lu.\n", wg_size);
|
|
#endif
|
|
fiber_barrier_init ((fiber_barrier_t *)
|
|
wi->launch_data->wg_sync_barrier,
|
|
wg_size);
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
if (wi_count % 1000 == 0)
|
|
{
|
|
clock_t spent_time = clock () - start_time;
|
|
double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
|
|
double wis_per_sec = wi_count / spent_time_sec;
|
|
uint64_t eta_sec
|
|
= (wi_total - wi_count - wis_skipped) / wis_per_sec;
|
|
|
|
printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in "
|
|
"%lu s)\n",
|
|
wi_count, wis_skipped, (uint64_t) spent_time_sec,
|
|
(uint64_t) wis_per_sec, (uint64_t) eta_sec);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
while (1);
|
|
|
|
fiber_exit ();
|
|
}
|
|
#endif
|
|
|
|
#define MIN(a, b) ((a < b) ? a : b)
|
|
#define MAX(a, b) ((a > b) ? a : b)
|
|
|
|
#ifdef HAVE_FIBERS
|
|
/* Spawns a given number of work-items to execute a set of work-groups,
|
|
blocks until their completion. */
|
|
|
|
static void
|
|
phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
uint32_t group_local_offset, size_t wg_size_x,
|
|
size_t wg_size_y, size_t wg_size_z)
|
|
{
|
|
PHSAWorkItem *wi_threads = NULL;
|
|
PHSAWorkGroup wg;
|
|
size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
|
|
uint32_t group_x, group_y, group_z;
|
|
fiber_barrier_t wg_start_barrier;
|
|
fiber_barrier_t wg_completion_barrier;
|
|
fiber_barrier_t wg_sync_barrier;
|
|
|
|
max_x = wg_size_x == 0 ? 1 : wg_size_x;
|
|
max_y = wg_size_y == 0 ? 1 : wg_size_y;
|
|
max_z = wg_size_z == 0 ? 1 : wg_size_z;
|
|
|
|
size_t wg_size = max_x * max_y * max_z;
|
|
if (wg_size > PHSA_MAX_WG_SIZE)
|
|
phsa_fatal_error (2);
|
|
|
|
wg.private_segment_total_size = context->dp->private_segment_size * wg_size;
|
|
if (wg.private_segment_total_size > 0
|
|
&& posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN,
|
|
wg.private_segment_total_size)
|
|
!= 0)
|
|
phsa_fatal_error (3);
|
|
|
|
wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD;
|
|
wg.alloca_frame_p = wg.alloca_stack_p;
|
|
wg.initial_group_offset = group_local_offset;
|
|
|
|
#ifdef EXECUTE_WGS_BACKWARDS
|
|
group_x = context->wg_max_x - 1;
|
|
group_y = context->wg_max_y - 1;
|
|
group_z = context->wg_max_z - 1;
|
|
#else
|
|
group_x = context->wg_min_x;
|
|
group_y = context->wg_min_y;
|
|
group_z = context->wg_min_z;
|
|
#endif
|
|
|
|
fiber_barrier_init (&wg_sync_barrier, wg_size);
|
|
fiber_barrier_init (&wg_start_barrier, wg_size);
|
|
fiber_barrier_init (&wg_completion_barrier, wg_size);
|
|
|
|
context->wg_start_barrier = &wg_start_barrier;
|
|
context->wg_sync_barrier = &wg_sync_barrier;
|
|
context->wg_completion_barrier = &wg_completion_barrier;
|
|
|
|
wg.more_wgs = 1;
|
|
wg.group_base_ptr = group_base_ptr;
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
wi_count = 0;
|
|
wis_skipped = 0;
|
|
start_time = clock ();
|
|
#endif
|
|
wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z);
|
|
for (x = 0; x < max_x; ++x)
|
|
for (y = 0; y < max_y; ++y)
|
|
for (z = 0; z < max_z; ++z)
|
|
{
|
|
PHSAWorkItem *wi = &wi_threads[flat_wi_id];
|
|
wi->launch_data = context;
|
|
wi->wg = &wg;
|
|
|
|
wg.x = wi->group_x = group_x;
|
|
wg.y = wi->group_y = group_y;
|
|
wg.z = wi->group_z = group_z;
|
|
|
|
wi->wg_size_x = context->dp->workgroup_size_x;
|
|
wi->wg_size_y = context->dp->workgroup_size_y;
|
|
wi->wg_size_z = context->dp->workgroup_size_z;
|
|
|
|
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
|
|
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
|
|
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
|
|
|
|
wi->x = x;
|
|
wi->y = y;
|
|
wi->z = z;
|
|
|
|
/* TODO: set the stack size according to the private
|
|
segment size. Too big stack consumes huge amount of
|
|
memory in case of huge number of WIs and a too small stack
|
|
will fail in mysterious and potentially dangerous ways. */
|
|
|
|
fiber_init (&wi->fiber, phsa_work_item_thread, wi,
|
|
FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN);
|
|
++flat_wi_id;
|
|
}
|
|
|
|
do
|
|
{
|
|
--flat_wi_id;
|
|
fiber_join (&wi_threads[flat_wi_id].fiber);
|
|
}
|
|
while (flat_wi_id > 0);
|
|
|
|
if (wg.private_segment_total_size > 0)
|
|
free (wg.private_base_ptr);
|
|
|
|
free (wi_threads);
|
|
}
|
|
|
|
/* Spawn the work-item threads to execute work-groups and let
|
|
them execute all the WGs, including a potential partial WG. */
|
|
|
|
static void
|
|
phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
uint32_t group_local_offset)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = context->dp;
|
|
size_t x, y, z;
|
|
|
|
context->group_segment_start_addr = (size_t) group_base_ptr;
|
|
|
|
/* HSA seems to allow the WG size to be larger than the grid size. We need to
|
|
saturate the effective WG size to the grid size to prevent the extra WIs
|
|
from executing. */
|
|
size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
|
|
sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
|
|
sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
|
|
sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
|
|
sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
wi_total = (uint64_t) dp->grid_size_x
|
|
* (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
|
|
* (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
|
|
#endif
|
|
|
|
/* For now execute all work groups in a single coarse thread (does not utilize
|
|
multicore/multithread). */
|
|
context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
|
|
|
|
int dims = dp->setup & 0x3;
|
|
|
|
context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
|
|
/ dp->workgroup_size_x;
|
|
|
|
context->wg_max_y
|
|
= dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
|
|
/ dp->workgroup_size_y;
|
|
|
|
context->wg_max_z
|
|
= dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
|
|
/ dp->workgroup_size_z;
|
|
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
|
|
"wg size %lu/%lu/%lu grid size %u/%u/%u\n",
|
|
context->wg_min_x, context->wg_min_y, context->wg_min_z,
|
|
context->wg_max_x, context->wg_max_y, context->wg_max_z,
|
|
sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
|
|
dp->grid_size_y, dp->grid_size_z);
|
|
#endif
|
|
|
|
phsa_execute_wi_gang (context, group_base_ptr, group_local_offset,
|
|
sat_wg_size_x, sat_wg_size_y, sat_wg_size_z);
|
|
}
|
|
#endif
|
|
|
|
/* Executes the given work-group function for all work groups in the grid.
|
|
|
|
A work-group function is a version of the original kernel which executes
|
|
the kernel for all work-items in a work-group. It is produced by gccbrig
|
|
if it can handle the kernel's barrier usage and is much faster way to
|
|
execute massive numbers of work-items in a non-SPMD machine than fibers
|
|
(easily 100x faster). */
|
|
static void
|
|
phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
uint32_t group_local_offset)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = context->dp;
|
|
size_t x, y, z, wg_x, wg_y, wg_z;
|
|
|
|
context->group_segment_start_addr = (size_t) group_base_ptr;
|
|
|
|
/* HSA seems to allow the WG size to be larger than the grid size. We need
|
|
to saturate the effective WG size to the grid size to prevent the extra WIs
|
|
from executing. */
|
|
size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size;
|
|
sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x);
|
|
sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y);
|
|
sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z);
|
|
sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z;
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
wi_total = (uint64_t) dp->grid_size_x
|
|
* (dp->grid_size_y > 0 ? dp->grid_size_y : 1)
|
|
* (dp->grid_size_z > 0 ? dp->grid_size_z : 1);
|
|
#endif
|
|
|
|
context->wg_min_x = context->wg_min_y = context->wg_min_z = 0;
|
|
|
|
int dims = dp->setup & 0x3;
|
|
|
|
context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1)
|
|
/ dp->workgroup_size_x;
|
|
|
|
context->wg_max_y
|
|
= dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1)
|
|
/ dp->workgroup_size_y;
|
|
|
|
context->wg_max_z
|
|
= dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1)
|
|
/ dp->workgroup_size_z;
|
|
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with "
|
|
"wg size %lu/%lu/%lu grid size %u/%u/%u\n",
|
|
context->wg_min_x, context->wg_min_y, context->wg_min_z,
|
|
context->wg_max_x, context->wg_max_y, context->wg_max_z,
|
|
sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x,
|
|
dp->grid_size_y, dp->grid_size_z);
|
|
#endif
|
|
|
|
PHSAWorkItem wi;
|
|
PHSAWorkGroup wg;
|
|
wi.wg = &wg;
|
|
wi.x = wi.y = wi.z = 0;
|
|
wi.launch_data = context;
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
start_time = clock ();
|
|
uint64_t wg_count = 0;
|
|
#endif
|
|
|
|
size_t wg_size = __hsail_workgroupsize (0, &wi)
|
|
* __hsail_workgroupsize (1, &wi)
|
|
* __hsail_workgroupsize (2, &wi);
|
|
|
|
void *private_base_ptr = NULL;
|
|
if (dp->private_segment_size > 0
|
|
&& posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN,
|
|
dp->private_segment_size * wg_size)
|
|
!= 0)
|
|
phsa_fatal_error (3);
|
|
|
|
wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD;
|
|
wg.alloca_frame_p = wg.alloca_stack_p;
|
|
|
|
wg.private_base_ptr = private_base_ptr;
|
|
wg.group_base_ptr = group_base_ptr;
|
|
|
|
#ifdef DEBUG_PHSA_RT
|
|
printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size,
|
|
wg_size, private_base_ptr);
|
|
#endif
|
|
|
|
for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z)
|
|
for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
|
|
for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
|
|
{
|
|
wi.group_x = wg_x;
|
|
wi.group_y = wg_y;
|
|
wi.group_z = wg_z;
|
|
|
|
wi.wg_size_x = context->dp->workgroup_size_x;
|
|
wi.wg_size_y = context->dp->workgroup_size_y;
|
|
wi.wg_size_z = context->dp->workgroup_size_z;
|
|
|
|
wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
|
|
wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
|
|
wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
|
|
|
|
context->kernel (context->kernarg_addr, &wi, group_base_ptr,
|
|
group_local_offset, private_base_ptr);
|
|
|
|
#if defined (BENCHMARK_PHSA_RT)
|
|
wg_count++;
|
|
if (wg_count % 1000000 == 0)
|
|
{
|
|
clock_t spent_time = clock () - start_time;
|
|
uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y
|
|
+ wg_z * sat_wg_size_z;
|
|
double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
|
|
double wis_per_sec = wi_count / spent_time_sec;
|
|
uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec;
|
|
|
|
printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n",
|
|
wi_count, (uint64_t) spent_time_sec,
|
|
(uint64_t) wis_per_sec, (uint64_t) eta_sec);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#ifdef BENCHMARK_PHSA_RT
|
|
clock_t spent_time = clock () - start_time;
|
|
double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC;
|
|
double wis_per_sec = wi_total / spent_time_sec;
|
|
|
|
printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total,
|
|
(uint64_t) spent_time_sec, (uint64_t) wis_per_sec);
|
|
#endif
|
|
free (private_base_ptr);
|
|
private_base_ptr = NULL;
|
|
}
|
|
|
|
/* gccbrig generates the following from each HSAIL kernel:
|
|
|
|
1) The actual kernel function (a single work-item kernel or a work-group
|
|
function) generated from HSAIL (BRIG).
|
|
|
|
static void _Kernel (void* args, void* context, void* group_base_ptr)
|
|
{
|
|
...
|
|
}
|
|
|
|
2) A public facing kernel function that is called from the PHSA runtime:
|
|
|
|
a) A single work-item function (that requires fibers for multi-WI):
|
|
|
|
void Kernel (void* context)
|
|
{
|
|
__launch_launch_kernel (_Kernel, context);
|
|
}
|
|
|
|
or
|
|
|
|
b) a when gccbrig could generate a work-group function:
|
|
|
|
void Kernel (void* context)
|
|
{
|
|
__hsail_launch_wg_function (_Kernel, context);
|
|
}
|
|
*/
|
|
|
|
#ifdef HAVE_FIBERS
|
|
|
|
void
|
|
__hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context,
|
|
void *group_base_ptr, uint32_t group_local_offset)
|
|
{
|
|
context->kernel = kernel;
|
|
phsa_spawn_work_items (context, group_base_ptr, group_local_offset);
|
|
}
|
|
#endif
|
|
|
|
void
|
|
__hsail_launch_wg_function (gccbrigKernelFunc kernel,
|
|
PHSAKernelLaunchData *context, void *group_base_ptr,
|
|
uint32_t group_local_offset)
|
|
{
|
|
context->kernel = kernel;
|
|
phsa_execute_work_groups (context, group_base_ptr, group_local_offset);
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
|
|
uint32_t id;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
/* Overflow semantics in the case of WG dim > grid dim. */
|
|
id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
|
|
% dp->grid_size_x;
|
|
break;
|
|
case 1:
|
|
id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
|
|
% dp->grid_size_y;
|
|
break;
|
|
case 2:
|
|
id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
|
|
% dp->grid_size_z;
|
|
break;
|
|
}
|
|
return id;
|
|
}
|
|
|
|
uint64_t
|
|
__hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
|
|
uint64_t id;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
/* Overflow semantics in the case of WG dim > grid dim. */
|
|
id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
|
|
% dp->grid_size_x;
|
|
break;
|
|
case 1:
|
|
id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
|
|
% dp->grid_size_y;
|
|
break;
|
|
case 2:
|
|
id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
|
|
% dp->grid_size_z;
|
|
break;
|
|
}
|
|
return id;
|
|
}
|
|
|
|
|
|
uint32_t
|
|
__hsail_workitemid (uint32_t dim, PHSAWorkItem *context)
|
|
{
|
|
PHSAWorkItem *c = (PHSAWorkItem *) context;
|
|
hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
|
|
/* The number of dimensions is in the two least significant bits. */
|
|
int dims = dp->setup & 0x3;
|
|
|
|
uint32_t id;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
id = c->x;
|
|
break;
|
|
case 1:
|
|
id = dims < 2 ? 0 : c->y;
|
|
break;
|
|
case 2:
|
|
id = dims < 3 ? 0 : c->z;
|
|
break;
|
|
}
|
|
return id;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_gridgroups (uint32_t dim, PHSAWorkItem *context)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
int dims = dp->setup & 0x3;
|
|
|
|
uint32_t id;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x;
|
|
break;
|
|
case 1:
|
|
id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1)
|
|
/ dp->workgroup_size_y;
|
|
break;
|
|
case 2:
|
|
id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1)
|
|
/ dp->workgroup_size_z;
|
|
break;
|
|
}
|
|
return id;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_workitemflatid (PHSAWorkItem *c)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
|
|
|
|
return c->x + c->y * dp->workgroup_size_x
|
|
+ c->z * dp->workgroup_size_x * dp->workgroup_size_y;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_currentworkitemflatid (PHSAWorkItem *c)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp;
|
|
|
|
return c->x + c->y * __hsail_currentworkgroupsize (0, c)
|
|
+ c->z * __hsail_currentworkgroupsize (0, c)
|
|
* __hsail_currentworkgroupsize (1, c);
|
|
}
|
|
|
|
void
|
|
__hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context)
|
|
{
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
context->x = id;
|
|
break;
|
|
case 1:
|
|
context->y = id;
|
|
break;
|
|
case 2:
|
|
context->z = id;
|
|
break;
|
|
}
|
|
}
|
|
|
|
uint64_t
|
|
__hsail_workitemflatabsid_u64 (PHSAWorkItem *context)
|
|
{
|
|
PHSAWorkItem *c = (PHSAWorkItem *) context;
|
|
hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
|
|
/* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
|
|
uint64_t id0 = __hsail_workitemabsid (0, context);
|
|
uint64_t id1 = __hsail_workitemabsid (1, context);
|
|
uint64_t id2 = __hsail_workitemabsid (2, context);
|
|
|
|
uint64_t max0 = dp->grid_size_x;
|
|
uint64_t max1 = dp->grid_size_y;
|
|
uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
|
|
|
|
return id;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_workitemflatabsid_u32 (PHSAWorkItem *context)
|
|
{
|
|
PHSAWorkItem *c = (PHSAWorkItem *) context;
|
|
hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp;
|
|
|
|
/* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */
|
|
uint64_t id0 = __hsail_workitemabsid (0, context);
|
|
uint64_t id1 = __hsail_workitemabsid (1, context);
|
|
uint64_t id2 = __hsail_workitemabsid (2, context);
|
|
|
|
uint64_t max0 = dp->grid_size_x;
|
|
uint64_t max1 = dp->grid_size_y;
|
|
uint64_t id = id0 + id1 * max0 + id2 * max0 * max1;
|
|
return (uint32_t) id;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
uint32_t wg_size = 0;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
|
|
wg_size = dp->workgroup_size_x; /* Full WG. */
|
|
else
|
|
wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
|
|
break;
|
|
case 1:
|
|
if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
|
|
wg_size = dp->workgroup_size_y; /* Full WG. */
|
|
else
|
|
wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
|
|
break;
|
|
case 2:
|
|
if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
|
|
wg_size = dp->workgroup_size_z; /* Full WG. */
|
|
else
|
|
wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
|
|
break;
|
|
}
|
|
return wg_size;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
return dp->workgroup_size_x;
|
|
case 1:
|
|
return dp->workgroup_size_y;
|
|
case 2:
|
|
return dp->workgroup_size_z;
|
|
}
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_gridsize (uint32_t dim, PHSAWorkItem *wi)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
return dp->grid_size_x;
|
|
case 1:
|
|
return dp->grid_size_y;
|
|
case 2:
|
|
return dp->grid_size_z;
|
|
}
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
|
|
{
|
|
switch (dim)
|
|
{
|
|
default:
|
|
case 0:
|
|
return wi->group_x;
|
|
case 1:
|
|
return wi->group_y;
|
|
case 2:
|
|
return wi->group_z;
|
|
}
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_dim (PHSAWorkItem *wi)
|
|
{
|
|
hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp;
|
|
return dp->setup & 0x3;
|
|
}
|
|
|
|
uint64_t
|
|
__hsail_packetid (PHSAWorkItem *wi)
|
|
{
|
|
return wi->launch_data->packet_id;
|
|
}
|
|
|
|
uint32_t
|
|
__hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi)
|
|
{
|
|
return (uint32_t) wi->launch_data->dp->completion_signal.handle;
|
|
}
|
|
|
|
uint64_t
|
|
__hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi)
|
|
{
|
|
return (uint64_t) (wi->launch_data->dp->completion_signal.handle);
|
|
}
|
|
|
|
#ifdef HAVE_FIBERS
|
|
void
|
|
__hsail_barrier (PHSAWorkItem *wi)
|
|
{
|
|
fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier);
|
|
}
|
|
#endif
|
|
|
|
/* Return a 32b private segment address that points to a dynamically
|
|
allocated chunk of 'size' with 'align'.
|
|
|
|
Allocates the space from the end of the private segment allocated
|
|
for the whole work group. In implementations with separate private
|
|
memories per WI, we will need to have a stack pointer per WI. But in
|
|
the current implementation, the segment is shared, so we possibly
|
|
save some space in case all WIs do not call the alloca.
|
|
|
|
The "alloca frames" are organized as follows:
|
|
|
|
wg->alloca_stack_p points to the last allocated data (initially
|
|
outside the private segment)
|
|
wg->alloca_frame_p points to the first address _outside_ the current
|
|
function's allocations (initially to the same as alloca_stack_p)
|
|
|
|
The data is allocated downwards from the end of the private segment.
|
|
|
|
In the beginning of a new function which has allocas, a new alloca
|
|
frame is pushed which adds the current alloca_frame_p (the current
|
|
function's frame starting point) to the top of the alloca stack and
|
|
alloca_frame_p is set to the current stack position.
|
|
|
|
At the exit points of a function with allocas, the alloca frame
|
|
is popped before returning. This involves popping the alloca_frame_p
|
|
to the one of the previous function in the call stack, and alloca_stack_p
|
|
similarly, to the position of the last word alloca'd by the previous
|
|
function.
|
|
*/
|
|
|
|
uint32_t
|
|
__hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi)
|
|
{
|
|
volatile PHSAWorkGroup *wg = wi->wg;
|
|
int64_t new_pos = wg->alloca_stack_p - size;
|
|
while (new_pos % align != 0)
|
|
new_pos--;
|
|
if (new_pos < 0)
|
|
phsa_fatal_error (2);
|
|
|
|
wg->alloca_stack_p = new_pos;
|
|
|
|
#ifdef DEBUG_ALLOCA
|
|
printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align,
|
|
wg->alloca_stack_p, wg->alloca_frame_p);
|
|
#endif
|
|
return new_pos;
|
|
}
|
|
|
|
/* Initializes a new "alloca frame" in the private segment.
|
|
This should be called at all the function entry points in case
|
|
the function contains at least one call to alloca. */
|
|
|
|
void
|
|
__hsail_alloca_push_frame (PHSAWorkItem *wi)
|
|
{
|
|
volatile PHSAWorkGroup *wg = wi->wg;
|
|
|
|
/* Store the alloca_frame_p without any alignment padding so
|
|
we know exactly where the previous frame ended after popping
|
|
it. */
|
|
#ifdef DEBUG_ALLOCA
|
|
printf ("--- push frame ");
|
|
#endif
|
|
uint32_t last_word_offs = __hsail_alloca (4, 1, wi);
|
|
memcpy (wg->private_base_ptr + last_word_offs,
|
|
(const void *) &wg->alloca_frame_p, 4);
|
|
wg->alloca_frame_p = last_word_offs;
|
|
|
|
#ifdef DEBUG_ALLOCA
|
|
printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p);
|
|
#endif
|
|
}
|
|
|
|
/* Frees the current "alloca frame" and restores the frame
|
|
pointer.
|
|
This should be called at all the function return points in case
|
|
the function contains at least one call to alloca. Restores the
|
|
alloca stack to the condition it was before pushing the frame
|
|
the last time. */
|
|
void
|
|
__hsail_alloca_pop_frame (PHSAWorkItem *wi)
|
|
{
|
|
volatile PHSAWorkGroup *wg = wi->wg;
|
|
|
|
wg->alloca_stack_p = wg->alloca_frame_p;
|
|
memcpy ((void *) &wg->alloca_frame_p,
|
|
(const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4);
|
|
/* Now frame_p points to the beginning of the previous function's
|
|
frame and stack_p to its end. */
|
|
|
|
wg->alloca_stack_p += 4;
|
|
|
|
#ifdef DEBUG_ALLOCA
|
|
printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p,
|
|
wg->alloca_frame_p);
|
|
#endif
|
|
}
|