/* brig-basic-inst-handler.cc -- brig basic instruction handling
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   for General Processor Tech.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include <sstream>

#include "brig-code-entry-handler.h"
#include "brig-util.h"

#include "errors.h"
#include "gimple-expr.h"
#include "convert.h"
#include "print-tree.h"
#include "tree-pretty-print.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "diagnostic-core.h"
#include "brig-builtins.h"
#include "fold-const.h"

brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
  : brig_code_entry_handler (parent)
{
}

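/* Expands packed saturating arithmetic by visiting the vector elements
   pairwise.  The constructor selects, from brig-builtins.def, the
   scalar saturating built-in matching the instruction's opcode and
   element type; visit_element then applies it to each element pair.  */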
class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
  scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
    : m_brig_inst (brig_inst)
  {
    BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;

#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN

#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE,		\
			      NAME, TYPE, ATTRS)			\
    if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE)	\
      m_builtin = builtin_decl_explicit (ENUM);				\
    else
#include "brig-builtins.def"
      gcc_unreachable ();
  }

  virtual tree
  visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
  {
    /* Implement saturating arithmetics with scalar built-ins for now.
       TODO: emit GENERIC nodes for the simplest cases or at least
       emit vector built-ins.  */
    return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
			 TREE_TYPE (operand0), operand0,
			 TREE_TYPE (operand1), operand1);
  }
  const BrigInstBase &m_brig_inst;
  tree m_builtin;
};

/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
   OPERANDS[0] is the first vector, OPERANDS[1] the second vector and
   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a
   VEC_PERM_EXPR that implements the shuffle as a GENERIC expression.  */

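/* For example, with four-element vectors the HSAIL mask packs four
   2-bit indices.  A mask selecting elements {1, 0} from the first
   input and {3, 2} from the second becomes the VEC_PERM_EXPR mask
   {1 + 0, 0 + 0, 3 + 4, 2 + 4} = {1, 0, 7, 6}.  */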
tree
brig_basic_inst_handler::build_shuffle (tree arith_type,
					tree_stl_vec &operands)
{
  tree element_type
    = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));

  /* Offsets to add to the mask values to convert from the
     HSAIL mask to VEC_PERM_EXPR masks.  VEC_PERM_EXPR mask
     assumes an index spanning from 0 to 2 times the vec
     width while HSAIL refers separately to two different
     input vectors, thus is not a "full shuffle" where all
     output elements can originate from any input element.  */
  vec<constructor_elt, va_gc> *mask_offset_vals = NULL;

  unsigned int element_count = gccbrig_type_vector_subparts (arith_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  size_t input_mask_element_size = exact_log2 (element_count);

  /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
     from which to construct the mask vector as understood by
     VEC_PERM_EXPR.  */
  tree mask_operand
    = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);

  tree mask_element_type
    = build_nonstandard_integer_type (input_mask_element_size, true);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element
	= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
		  bitsize_int (input_mask_element_size),
		  bitsize_int (i * input_mask_element_size));

      mask_element = convert (element_type, mask_element);

      tree offset;
      if (i < element_count / 2)
	offset = build_int_cst (element_type, 0);
      else
	offset = build_int_cst (element_type, element_count);

      CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
    }
  tree mask_vec_type = build_vector_type (element_type, element_count);

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
  tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);

  tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask);
  return perm;
}

/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
   from the vector expression in OPERANDS[0].  */

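/* For example, unpacking element 2 of a four-element vector uses the
   permutation mask {2, 0, 0, 0} to move that element into lane 0,
   clears the other lanes with the AND mask {-1, 0, 0, 0}, and then
   views the result as a single wide integer.  */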
tree
brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
{
  /* Implement the unpack with a shuffle that stores the unpacked
     element to the lowest bit positions in the dest.  After that
     a bitwise AND is used to clear the uppermost bits.  */
  tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));

  /* Perform the operations with a raw (unsigned int type) type.  */
  tree element_type = get_unsigned_int_type (src_element_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  vec<constructor_elt, va_gc> *and_mask_vals = NULL;

  size_t element_count
    = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
  tree vec_type = build_vector_type (element_type, element_count);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element;
      if (i == 0)
	mask_element = convert (element_type, operands[1]);
      else
	mask_element = build_int_cst (element_type, 0);

      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);

      tree and_mask_element;
      if (i == 0)
	and_mask_element = build_int_cst (element_type, -1);
      else
	and_mask_element = build_int_cst (element_type, 0);
      CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
    }

  tree mask_vec = build_constructor (vec_type, input_mask_vals);

  tree and_mask_vec = build_constructor (vec_type, and_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, vec_type,
		      build_resize_convert_view (vec_type, operands[0]),
		      build_resize_convert_view (vec_type, operands[0]),
		      mask_vec);

  tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);

  size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
  tree raw_type = build_nonstandard_integer_type (s, true);

  tree as_int = build_resize_convert_view (raw_type, cleared);

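  /* Sub-32b integer elements are extended to a 32b result below,
     honoring the signedness of the source element type.  */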
  if (int_size_in_bytes (src_element_type) < 4)
    {
      if (INTEGRAL_TYPE_P (src_element_type))
	return extend_int (as_int, uint32_type_node, src_element_type);
    }
  return as_int;
}

/* Packs (inserts) the scalar element in OPERANDS[1]
   into the vector in OPERANDS[0] at the element position defined by
   OPERANDS[2].  */

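/* For example, inserting a byte into lane 2 of a u8x4 vector viewed as
   a 32-bit integer: the mask 0xff shifted left by 2 * 8 bits selects
   the lane, the old bits are cleared with the inverted mask, and the
   masked scalar is shifted into place and ORed in.  */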
tree
brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
{
  /* Implement using a bit level insertion.
     TODO: Reuse this for implementing 'bitinsert'
     without a builtin call.  */

  size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
  size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
  tree wide_type = build_nonstandard_integer_type (vecsize, 1);

  tree src_vect = build_resize_convert_view (wide_type, operands[0]);
  src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);

  tree scalar = operands[1];
  scalar = m_parent.m_cf->add_temp_var ("scalar",
					convert_to_integer (wide_type, scalar));

  tree pos = operands[2];

  /* The upper bits of the position can contain garbage.
     Zero them for well-defined semantics.  */
  tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
  pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));

  tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
  size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
  tree ewidth = build_int_cstu (wide_type, element_width);

  tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
  bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);

  uint64_t mask_int
    = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;

  tree mask = build_int_cstu (wide_type, mask_int);

  mask = m_parent.m_cf->add_temp_var ("mask",
				      convert_to_integer (wide_type, mask));

  tree clearing_mask
    = build1 (BIT_NOT_EXPR, wide_type,
	      build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));

  tree zeroed_element
    = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);

  /* TODO: Is the AND necessary: does HSA define what
     happens if the upper bits in the inserted element are not
     zero?  */
  tree element_in_position
    = build2 (LSHIFT_EXPR, wide_type,
	      build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);

  tree inserted
    = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
  return inserted;
}

/* Implements the unpack{lo,hi} instructions.  BRIG_OPCODE tells which
   one, and ARITH_TYPE describes the type of the vector arithmetics.
   OPERANDS[0] and OPERANDS[1] are the input vectors.  */

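/* For example, with four-element inputs a = {a0, a1, a2, a3} and
   b = {b0, b1, b2, b3}, unpacklo interleaves the lower halves into
   {a0, b0, a1, b1} while unpackhi interleaves the upper halves into
   {a2, b2, a3, b3}.  */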
tree
brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
						tree arith_type,
						tree_stl_vec &operands)
{
  tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
  tree mask_vec_type
    = build_vector_type (element_type,
			 gccbrig_type_vector_subparts (arith_type));

  size_t element_count = gccbrig_type_vector_subparts (arith_type);
  vec<constructor_elt, va_gc> *input_mask_vals = NULL;

  size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;

  for (size_t i = 0; i < element_count / 2; ++i)
    {
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type, offset + i));
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type,
					     offset + i + element_count));
    }

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask_vec);
  return perm;
}

/* Builds a basic instruction expression from a BRIG instruction.  BRIG_OPCODE
   is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
   desired tree type for the instruction, and OPERANDS the instruction's
   input operands already converted to tree nodes.  */

tree
brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
					  BrigType16_t brig_type,
					  tree arith_type,
					  tree_stl_vec &operands)
{
  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);

  BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  tree instr_inner_type
    = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;

  if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
    {
      /* HSA defines modulo/clipping behavior for shift amounts larger
	 than the bit width, while tree.def leaves it undefined.
	 We need to mask the upper bits to ensure the defined behavior.  */
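      /* E.g. a shift left of a 32-bit element by 33 must behave like a
	 shift by 33 & 31 == 1.  */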
      tree scalar_mask
	= build_int_cst (instr_inner_type,
			 gccbrig_hsa_type_bit_size (inner_type) - 1);

      tree mask = VECTOR_TYPE_P (arith_type)
	? build_vector_from_val (arith_type, scalar_mask)
	: scalar_mask;

      /* The shift amount is a scalar, broadcast it to produce
	 a vector shift.  */
      if (VECTOR_TYPE_P (arith_type))
	operands[1] = build_vector_from_val (arith_type, operands[1]);
      operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
    }

  size_t input_count = operands.size ();
  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ? 1 : 0;

  if (opcode == TREE_LIST)
    {
      /* There was no direct GENERIC opcode for the instruction;
	 try to emulate it with a chain of GENERIC nodes.  */
      if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
	{
	  /* There doesn't seem to be a "standard" MAD built-in in gcc, so
	     let's use a chain of multiply + add for now (double rounding
	     method).  It should be easier for optimizers than a custom
	     built-in call.  WIDEN_MULT_EXPR is close, but requires a
	     double size result type.  */
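	  /* E.g. mad_f32 d, a, b, c becomes d = a * b + c here, with
	     rounding after both the multiply and the add, unlike a fused
	     multiply-add that rounds only once.  */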
	  tree mult_res
	    = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_MAD24HI)
	{
	  tree mult_res
	    = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
	{
	  return build_shuffle (arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACKLO
	       || brig_opcode == BRIG_OPCODE_UNPACKHI)
	{
	  return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACK)
	{
	  return build_unpack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_PACK)
	{
	  return build_pack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_NRSQRT)
	{
	  /* Implement as 1.0 / sqrt (x) and assume gcc's instruction
	     selection maps it to something better than a division in the
	     native ISA, if available.
	     TODO: this will happen only with unsafe math optimizations
	     on, which cannot be used in general to remain HSAIL compliant.
	     Perhaps a builtin call would be a better option here.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 m_parent.m_cf->expand_or_call_builtin
			 (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
	}
      else if (brig_opcode == BRIG_OPCODE_NRCP)
	{
	  /* Implement as 1.0 / x and assume gcc's instruction selection
	     maps it to something better than a division in the native
	     ISA, if available.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 operands[0]);
	}
      else if (brig_opcode == BRIG_OPCODE_LANEID
	       || brig_opcode == BRIG_OPCODE_MAXWAVEID
	       || brig_opcode == BRIG_OPCODE_WAVEID)
	{
	  /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
	     MAXWAVEID always return 0.  */
	  return build_zero_cst (arith_type);
	}
      else
	gcc_unreachable ();
    }
  else if (opcode == CALL_EXPR)
    return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
						  arith_type, operands);
  else if (output_count == 1)
    {
      if (input_count == 1)
	{
	  if (opcode == MODIFY_EXPR)
	    return operands[0];
	  else
	    return build1 (opcode, arith_type, operands[0]);
	}
      else if (input_count == 2)
	return build2 (opcode, arith_type, operands[0], operands[1]);
      else if (input_count == 3)
	return build3 (opcode, arith_type, operands[0], operands[1],
		       operands[2]);
      else
	gcc_unreachable ();
    }
  else
    gcc_unreachable ();

  return NULL_TREE;
}

/* Handles the basic instructions, including packed instructions.  Deals
   with the different packing modes by unpacking/packing the wanted
   elements.  Delegates most of the instruction cases to
   build_inst_expr ().  */

size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
  const BrigInstBase *brig_inst = (const BrigInstBase *) base;
  if (brig_inst->opcode == BRIG_OPCODE_NOP)
    return base->byteCount;

  tree_stl_vec operands = build_operands (*brig_inst);

  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
  size_t input_count
    = operands.size () == 0 ? 0 : (operands.size () - output_count);

  gcc_assert (output_count == 0 || output_count == 1);

  tree_stl_vec::iterator first_input_i = operands.begin ();
  if (output_count > 0 && operands.size () > 0)
    ++first_input_i;

  tree_stl_vec in_operands;
  in_operands.assign (first_input_i, operands.end ());

  BrigType16_t brig_inst_type = brig_inst->type;

  if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
      || brig_inst->opcode == BRIG_OPCODE_LASTBIT
      || brig_inst->opcode == BRIG_OPCODE_SAD)
    /* These instructions are reported to be always 32b in HSAIL, but we want
       to treat them according to their input argument's type to select the
       correct instruction/builtin.  */
    brig_inst_type
      = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));

  tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);

  if (!instr_type)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  bool is_vec_instr = hsa_type_packed_p (brig_inst_type);

  size_t element_size_bits;
  size_t element_count;

  if (is_vec_instr)
    {
      BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
      element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
	/ gccbrig_hsa_type_bit_size (brig_element_type);
    }
  else
    {
      element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
      element_count = 1;
    }

  /* The actual arithmetics type that should be performed with the
     operation.  This is not always the same as the original BRIG
     opcode's type due to implicit conversions of storage-only f16.  */
  tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
    ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
    : get_tree_expr_type_for_hsa_type (brig_inst_type);

  tree instr_expr = NULL_TREE;

  BrigPack8_t p = BRIG_PACK_NONE;
  if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
    p = ((const BrigInstMod *) brig_inst)->pack;
  else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
    p = ((const BrigInstCmp *) brig_inst)->pack;

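  /* In the 'ps' and 'sp' packing modes the scalar input arrives in the
     lowest element of a vector operand; broadcast that element so the
     operation can proceed element-wise on two full vectors.  */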
  if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
    in_operands[1] = build_lower_element_broadcast (in_operands[1]);
  else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
    in_operands[0] = build_lower_element_broadcast (in_operands[0]);

  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
						   brig_inst_type);

  if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
    {
      scalarized_sat_arithmetics sat_arith (*brig_inst);
      gcc_assert (input_count == 2);
      instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
    }
  else if (opcode == RETURN_EXPR)
    {
      if (m_parent.m_cf->m_is_kernel)
	{
	  tree goto_stmt
	    = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
	  m_parent.m_cf->append_statement (goto_stmt);
	  return base->byteCount;
	}
      else
	{
	  m_parent.m_cf->append_return_stmt ();
	  return base->byteCount;
	}
    }
  else if (opcode == MULT_HIGHPART_EXPR
	   && is_vec_instr && element_size_bits < 64)
    {
      /* MULT_HIGHPART_EXPR works only on target dependent vector sizes
	 and even the scalars do not seem to work, at least for char
	 elements.

	 Let's fall back to scalarization and promotion of the vector
	 elements to larger types with the MULHI computed as a regular MUL.
	 MULHI for 2x64b seems to work with the Intel CPUs I've tested, so
	 that case is passed on for vector processing and there is no need
	 for 128b scalar arithmetics.

	 This is not modular, as this sort of thing does not belong to the
	 frontend; there should be a legalization phase before the backend
	 that figures out the best way to compute the MULHI for any
	 integer vector datatype.

	 TODO: promote to larger vector types instead.  For example
	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to,
	 at least with my x86-64.  */
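      /* E.g. for u16 elements each pair is converted to u32, multiplied,
	 shifted right by 16 and truncated back to u16, so the upper half
	 of the full product becomes the result.  */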
      tree_stl_vec operand0_elements;
      if (input_count > 0)
	m_parent.m_cf->unpack (in_operands[0], operand0_elements);

      tree_stl_vec operand1_elements;
      if (input_count > 1)
	m_parent.m_cf->unpack (in_operands[1], operand1_elements);

      tree_stl_vec result_elements;

      tree scalar_type = TREE_TYPE (arith_type);
      BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      tree promoted_type = short_integer_type_node;
      switch (element_type)
	{
	case BRIG_TYPE_S8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
	  break;
	case BRIG_TYPE_U8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
	  break;
	case BRIG_TYPE_S16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
	  break;
	case BRIG_TYPE_U16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
	  break;
	case BRIG_TYPE_S32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
	  break;
	case BRIG_TYPE_U32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
	  break;
	default:
	  gcc_unreachable ();
	}

      size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
      size_t element_count = gccbrig_type_vector_subparts (arith_type);
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree operand0 = convert (promoted_type, operand0_elements.at (i));
	  tree operand1 = convert (promoted_type, operand1_elements.at (i));

	  tree scalar_expr
	    = build2 (MULT_EXPR, promoted_type, operand0, operand1);

	  scalar_expr
	    = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
		      build_int_cstu (promoted_type, promoted_type_size / 2));

	  result_elements.push_back (convert (scalar_type, scalar_expr));
	}
      instr_expr = m_parent.m_cf->pack (result_elements);
    }
  else
    {
      /* 'class' is always of b1 type, let's consider it by its
	 float type when building the instruction to find the
	 correct builtin.  */
      if (brig_inst->opcode == BRIG_OPCODE_CLASS)
	brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
      instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
				    arith_type, in_operands);
    }

  if (instr_expr == NULL_TREE)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
      || p == BRIG_PACK_SSAT)
    {
      /* In case of _s_ or _ss_, select only the lowest element
	 from the new input to the output.  We could extract
	 the element and use a scalar operation, but try
	 to keep data in vector registers as much as possible
	 to avoid copies between scalar and vector datapaths.  */
      tree old_value;
      tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
      bool is_fp16_operation
	= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
	  && !gccbrig_is_bit_operation (brig_inst->opcode);

      if (is_fp16_operation)
	old_value = build_h2f_conversion
	  (build_resize_convert_view (half_storage_type, operands[0]));
      else
	old_value
	  = build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]);

      size_t esize = is_fp16_operation ? 32 : element_size_bits;

      /* Construct a permutation mask where all elements except the
	 lowest one are picked from the old_value.  */
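      /* E.g. for a four-element vector the mask is {4, 1, 2, 3}: lane 0
	 is taken from the new value (the second VEC_PERM_EXPR input) and
	 the remaining lanes from the old value.  */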
      tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
      vec<constructor_elt, va_gc> *constructor_vals = NULL;
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree cst;

	  if (i == 0)
	    cst = build_int_cstu (mask_inner_type, element_count);
	  else
	    cst = build_int_cstu (mask_inner_type, i);
	  CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
	}
      tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
      tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

      tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
      tree assign
	= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
      m_parent.m_cf->append_statement (assign);

      instr_expr
	= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);

      tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
      tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
				  lower_output, instr_expr);
      m_parent.m_cf->append_statement (assign_lower);
      instr_expr = lower_output;
    }

  if (output_count == 1)
    build_output_assignment (*brig_inst, operands[0], instr_expr);
  else
    m_parent.m_cf->append_statement (instr_expr);
  return base->byteCount;
}

/* Create an expression that broadcasts the lowest element of the
   vector in VEC_OPERAND to all elements of the returned vector.  */

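/* The mask is {n, n, ..., n} with n == element_count: in VEC_PERM_EXPR
   terms index n selects lane 0 of the second input vector, which here
   is the same vector, so every output lane receives the lowest
   element.  */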
tree
brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
{
  /* Build the broadcast using shuffle because there's no
     direct broadcast in GENERIC and this way there's no need for
     a separate extract of the lowest element.  */
  tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
  size_t esize = 8 * int_size_in_bytes (element_type);

  size_t element_count
    = gccbrig_type_vector_subparts (TREE_TYPE (vec_operand));
  tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
  vec<constructor_elt, va_gc> *constructor_vals = NULL;

  /* Construct the mask.  */
  for (size_t i = 0; i < element_count; ++i)
    {
      tree cst = build_int_cstu (mask_inner_type, element_count);
      CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
    }
  tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
  tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

  return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
		 vec_operand, mask);
}