/* brig-basic-inst-handler.cc -- brig basic instruction handling
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   for General Processor Tech.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include <sstream>

#include "brig-code-entry-handler.h"
#include "brig-util.h"

#include "errors.h"
#include "gimple-expr.h"
#include "convert.h"
#include "print-tree.h"
#include "tree-pretty-print.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "diagnostic-core.h"
#include "brig-builtins.h"
#include "fold-const.h"

brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
  : brig_code_entry_handler (parent)
{
}

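/* Expands packed saturating arithmetic by visiting the vector elements
   pairwise.  The constructor selects, from brig-builtins.def, the
   scalar saturating built-in matching the instruction's opcode and
   element type; visit_element then applies it to each element pair.  */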
class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
  scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
    : m_brig_inst (brig_inst)
  {
    BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;

#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN

#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE,		\
			      NAME, TYPE, ATTRS)			\
    if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE)	\
      m_builtin = builtin_decl_explicit (ENUM);				\
    else
#include "brig-builtins.def"
      gcc_unreachable ();
  }

  virtual tree
  visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
  {
    /* Implement saturating arithmetics with scalar built-ins for now.
       TODO: emit GENERIC nodes for the simplest cases or at least
       emit vector built-ins.  */
    return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
			 TREE_TYPE (operand0), operand0,
			 TREE_TYPE (operand1), operand1);
  }
  const BrigInstBase &m_brig_inst;
  tree m_builtin;
};

/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
   OPERANDS[0] is the first vector, OPERANDS[1] the second vector and
   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a
   VEC_PERM_EXPR that implements the shuffle as a GENERIC expression.  */

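/* For example, with four-element vectors the HSAIL mask packs four
   2-bit indices.  A mask selecting elements {1, 0} from the first
   input and {3, 2} from the second becomes the VEC_PERM_EXPR mask
   {1 + 0, 0 + 0, 3 + 4, 2 + 4} = {1, 0, 7, 6}.  */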
tree
brig_basic_inst_handler::build_shuffle (tree arith_type,
					tree_stl_vec &operands)
{
  tree element_type
    = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));

  /* Offsets to add to the mask values to convert from the
     HSAIL mask to VEC_PERM_EXPR masks.  VEC_PERM_EXPR mask
     assumes an index spanning from 0 to 2 times the vec
     width while HSAIL refers separately to two different
     input vectors, thus is not a "full shuffle" where all
     output elements can originate from any input element.  */
  vec<constructor_elt, va_gc> *mask_offset_vals = NULL;

  unsigned int element_count = gccbrig_type_vector_subparts (arith_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  size_t input_mask_element_size = exact_log2 (element_count);

  /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
     from which to construct the mask vector as understood by
     VEC_PERM_EXPR.  */
  tree mask_operand
    = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);

  tree mask_element_type
    = build_nonstandard_integer_type (input_mask_element_size, true);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element
	= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
		  bitsize_int (input_mask_element_size),
		  bitsize_int (i * input_mask_element_size));

      mask_element = convert (element_type, mask_element);

      tree offset;
      if (i < element_count / 2)
	offset = build_int_cst (element_type, 0);
      else
	offset = build_int_cst (element_type, element_count);

      CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
    }
  tree mask_vec_type = build_vector_type (element_type, element_count);

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
  tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);

  tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask);
  return perm;
}

/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
   from the vector expression in OPERANDS[0].  */

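/* For example, unpacking element 2 of a four-element vector uses the
   permutation mask {2, 0, 0, 0} to move that element into lane 0,
   clears the other lanes with the AND mask {-1, 0, 0, 0}, and then
   views the result as a single wide integer.  */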
tree
brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
{
  /* Implement the unpack with a shuffle that stores the unpacked
     element to the lowest bit positions in the dest.  After that
     a bitwise AND is used to clear the uppermost bits.  */
  tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));

  /* Perform the operations with a raw (unsigned int type) type.  */
  tree element_type = get_unsigned_int_type (src_element_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  vec<constructor_elt, va_gc> *and_mask_vals = NULL;

  size_t element_count
    = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
  tree vec_type = build_vector_type (element_type, element_count);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element;
      if (i == 0)
	mask_element = convert (element_type, operands[1]);
      else
	mask_element = build_int_cst (element_type, 0);

      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);

      tree and_mask_element;
      if (i == 0)
	and_mask_element = build_int_cst (element_type, -1);
      else
	and_mask_element = build_int_cst (element_type, 0);
      CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
    }

  tree mask_vec = build_constructor (vec_type, input_mask_vals);

  tree and_mask_vec = build_constructor (vec_type, and_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, vec_type,
		      build_resize_convert_view (vec_type, operands[0]),
		      build_resize_convert_view (vec_type, operands[0]),
		      mask_vec);

  tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);

  size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
  tree raw_type = build_nonstandard_integer_type (s, true);

  tree as_int = build_resize_convert_view (raw_type, cleared);

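  /* Sub-32b integer elements are extended to a 32b result below,
     honoring the signedness of the source element type.  */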
  if (int_size_in_bytes (src_element_type) < 4)
    {
      if (INTEGRAL_TYPE_P (src_element_type))
	return extend_int (as_int, uint32_type_node, src_element_type);
    }
  return as_int;
}

/* Packs (inserts) the scalar element in OPERANDS[1]
   into the vector in OPERANDS[0] at the element position defined by
   OPERANDS[2].  */

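/* For example, inserting a byte into lane 2 of a u8x4 vector viewed as
   a 32-bit integer: the mask 0xff shifted left by 2 * 8 bits selects
   the lane, the old bits are cleared with the inverted mask, and the
   masked scalar is shifted into place and ORed in.  */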
tree
brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
{
  /* Implement using a bit level insertion.
     TODO: Reuse this for implementing 'bitinsert'
     without a builtin call.  */

  size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
  size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
  tree wide_type = build_nonstandard_integer_type (vecsize, 1);

  tree src_vect = build_resize_convert_view (wide_type, operands[0]);
  src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);

  tree scalar = operands[1];
  scalar = m_parent.m_cf->add_temp_var ("scalar",
					convert_to_integer (wide_type, scalar));

  tree pos = operands[2];

  /* The upper bits of the position can contain garbage.
     Zero them for well-defined semantics.  */
  tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
  pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));

  tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
  size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
  tree ewidth = build_int_cstu (wide_type, element_width);

  tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
  bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);

  uint64_t mask_int
    = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;

  tree mask = build_int_cstu (wide_type, mask_int);

  mask = m_parent.m_cf->add_temp_var ("mask",
				      convert_to_integer (wide_type, mask));

  tree clearing_mask
    = build1 (BIT_NOT_EXPR, wide_type,
	      build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));

  tree zeroed_element
    = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);

  /* TODO: Is the AND necessary: does HSA define what
     happens if the upper bits in the inserted element are not
     zero?  */
  tree element_in_position
    = build2 (LSHIFT_EXPR, wide_type,
	      build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);

  tree inserted
    = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
  return inserted;
}

/* Implements the unpack{lo,hi} instructions.  BRIG_OPCODE tells which
   one, and ARITH_TYPE describes the type of the vector arithmetics.
   OPERANDS[0] and OPERANDS[1] are the input vectors.  */

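/* For example, with four-element inputs a = {a0, a1, a2, a3} and
   b = {b0, b1, b2, b3}, unpacklo interleaves the lower halves into
   {a0, b0, a1, b1} while unpackhi interleaves the upper halves into
   {a2, b2, a3, b3}.  */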
tree
brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
						tree arith_type,
						tree_stl_vec &operands)
{
  tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
  tree mask_vec_type
    = build_vector_type (element_type,
			 gccbrig_type_vector_subparts (arith_type));

  size_t element_count = gccbrig_type_vector_subparts (arith_type);
  vec<constructor_elt, va_gc> *input_mask_vals = NULL;

  size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;

  for (size_t i = 0; i < element_count / 2; ++i)
    {
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type, offset + i));
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type,
					     offset + i + element_count));
    }

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask_vec);
  return perm;
}

/* Builds a basic instruction expression from a BRIG instruction.  BRIG_OPCODE
   is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
   desired tree type for the instruction, and OPERANDS the instruction's
   input operands already converted to tree nodes.  */

tree
brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
					  BrigType16_t brig_type,
					  tree arith_type,
					  tree_stl_vec &operands)
{
  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);

  BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  tree instr_inner_type
    = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;

  if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
    {
      /* HSA defines modulo/clipping behavior for shift amounts larger
	 than the bit width, while tree.def leaves it undefined.
	 We need to mask the upper bits to ensure the defined behavior.  */
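      /* E.g. a shift left of a 32-bit element by 33 must behave like a
	 shift by 33 & 31 == 1.  */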
      tree scalar_mask
	= build_int_cst (instr_inner_type,
			 gccbrig_hsa_type_bit_size (inner_type) - 1);

      tree mask = VECTOR_TYPE_P (arith_type)
	? build_vector_from_val (arith_type, scalar_mask)
	: scalar_mask;

      /* The shift amount is a scalar, broadcast it to produce
	 a vector shift.  */
      if (VECTOR_TYPE_P (arith_type))
	operands[1] = build_vector_from_val (arith_type, operands[1]);
      operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
    }

  size_t input_count = operands.size ();
  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ? 1 : 0;

  if (opcode == TREE_LIST)
    {
      /* There was no direct GENERIC opcode for the instruction;
	 try to emulate it with a chain of GENERIC nodes.  */
      if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
	{
	  /* There doesn't seem to be a "standard" MAD built-in in gcc, so
	     let's use a chain of multiply + add for now (double rounding
	     method).  It should be easier for optimizers than a custom
	     built-in call.  WIDEN_MULT_EXPR is close, but requires a
	     double size result type.  */
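	  /* E.g. mad_f32 d, a, b, c becomes d = a * b + c here, with
	     rounding after both the multiply and the add, unlike a fused
	     multiply-add that rounds only once.  */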
	  tree mult_res
	    = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_MAD24HI)
	{
	  tree mult_res
	    = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
	{
	  return build_shuffle (arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACKLO
	       || brig_opcode == BRIG_OPCODE_UNPACKHI)
	{
	  return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACK)
	{
	  return build_unpack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_PACK)
	{
	  return build_pack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_NRSQRT)
	{
	  /* Implement as 1.0 / sqrt (x) and assume gcc's instruction
	     selection maps it to something better than a division in the
	     native ISA, if available.
	     TODO: this will happen only with unsafe math optimizations
	     on, which cannot be used in general to remain HSAIL compliant.
	     Perhaps a builtin call would be a better option here.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 m_parent.m_cf->expand_or_call_builtin
			 (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
	}
      else if (brig_opcode == BRIG_OPCODE_NRCP)
	{
	  /* Implement as 1.0 / x and assume gcc's instruction selection
	     maps it to something better than a division in the native
	     ISA, if available.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 operands[0]);
	}
      else if (brig_opcode == BRIG_OPCODE_LANEID
	       || brig_opcode == BRIG_OPCODE_MAXWAVEID
	       || brig_opcode == BRIG_OPCODE_WAVEID)
	{
	  /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
	     MAXWAVEID always return 0.  */
	  return build_zero_cst (arith_type);
	}
      else
	gcc_unreachable ();
    }
  else if (opcode == CALL_EXPR)
    return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
						  arith_type, operands);
  else if (output_count == 1)
    {
      if (input_count == 1)
	{
	  if (opcode == MODIFY_EXPR)
	    return operands[0];
	  else
	    return build1 (opcode, arith_type, operands[0]);
	}
      else if (input_count == 2)
	return build2 (opcode, arith_type, operands[0], operands[1]);
      else if (input_count == 3)
	return build3 (opcode, arith_type, operands[0], operands[1],
		       operands[2]);
      else
	gcc_unreachable ();
    }
  else
    gcc_unreachable ();

  return NULL_TREE;
}

/* Handles the basic instructions, including packed instructions.  Deals
   with the different packing modes by unpacking/packing the wanted
   elements.  Delegates most of the instruction cases to
   build_inst_expr ().  */

size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
  const BrigInstBase *brig_inst = (const BrigInstBase *) base;
  if (brig_inst->opcode == BRIG_OPCODE_NOP)
    return base->byteCount;

  tree_stl_vec operands = build_operands (*brig_inst);

  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
  size_t input_count
    = operands.size () == 0 ? 0 : (operands.size () - output_count);

  gcc_assert (output_count == 0 || output_count == 1);

  tree_stl_vec::iterator first_input_i = operands.begin ();
  if (output_count > 0 && operands.size () > 0)
    ++first_input_i;

  tree_stl_vec in_operands;
  in_operands.assign (first_input_i, operands.end ());

  BrigType16_t brig_inst_type = brig_inst->type;

  if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
      || brig_inst->opcode == BRIG_OPCODE_LASTBIT
      || brig_inst->opcode == BRIG_OPCODE_SAD)
    /* These instructions are reported to be always 32b in HSAIL, but we want
       to treat them according to their input argument's type to select the
       correct instruction/builtin.  */
    brig_inst_type
      = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));

  tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);

  if (!instr_type)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  bool is_vec_instr = hsa_type_packed_p (brig_inst_type);

  size_t element_size_bits;
  size_t element_count;

  if (is_vec_instr)
    {
      BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
      element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
	/ gccbrig_hsa_type_bit_size (brig_element_type);
    }
  else
    {
      element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
      element_count = 1;
    }

  /* The actual arithmetics type that should be performed with the
     operation.  This is not always the same as the original BRIG
     opcode's type due to implicit conversions of storage-only f16.  */
  tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
    ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
    : get_tree_expr_type_for_hsa_type (brig_inst_type);

  tree instr_expr = NULL_TREE;

  BrigPack8_t p = BRIG_PACK_NONE;
  if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
    p = ((const BrigInstMod *) brig_inst)->pack;
  else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
    p = ((const BrigInstCmp *) brig_inst)->pack;

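  /* In the 'ps' and 'sp' packing modes the scalar input arrives in the
     lowest element of a vector operand; broadcast that element so the
     operation can proceed element-wise on two full vectors.  */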
  if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
    in_operands[1] = build_lower_element_broadcast (in_operands[1]);
  else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
    in_operands[0] = build_lower_element_broadcast (in_operands[0]);

  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
						   brig_inst_type);

  if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
    {
      scalarized_sat_arithmetics sat_arith (*brig_inst);
      gcc_assert (input_count == 2);
      instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
    }
  else if (opcode == RETURN_EXPR)
    {
      if (m_parent.m_cf->m_is_kernel)
	{
	  tree goto_stmt
	    = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
	  m_parent.m_cf->append_statement (goto_stmt);
	  return base->byteCount;
	}
      else
	{
	  m_parent.m_cf->append_return_stmt ();
	  return base->byteCount;
	}
    }
  else if (opcode == MULT_HIGHPART_EXPR
	   && is_vec_instr && element_size_bits < 64)
    {
      /* MULT_HIGHPART_EXPR works only on target dependent vector sizes
	 and even the scalars do not seem to work, at least for char
	 elements.

	 Let's fall back to scalarization and promotion of the vector
	 elements to larger types with the MULHI computed as a regular MUL.
	 MULHI for 2x64b seems to work with the Intel CPUs I've tested, so
	 that case is passed on for vector processing and there is no need
	 for 128b scalar arithmetics.

	 This is not modular, as this sort of thing does not belong to the
	 frontend; there should be a legalization phase before the backend
	 that figures out the best way to compute the MULHI for any
	 integer vector datatype.

	 TODO: promote to larger vector types instead.  For example
	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to,
	 at least with my x86-64.  */
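      /* E.g. for u16 elements each pair is converted to u32, multiplied,
	 shifted right by 16 and truncated back to u16, so the upper half
	 of the full product becomes the result.  */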
      tree_stl_vec operand0_elements;
      if (input_count > 0)
	m_parent.m_cf->unpack (in_operands[0], operand0_elements);

      tree_stl_vec operand1_elements;
      if (input_count > 1)
	m_parent.m_cf->unpack (in_operands[1], operand1_elements);

      tree_stl_vec result_elements;

      tree scalar_type = TREE_TYPE (arith_type);
      BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      tree promoted_type = short_integer_type_node;
      switch (element_type)
	{
	case BRIG_TYPE_S8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
	  break;
	case BRIG_TYPE_U8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
	  break;
	case BRIG_TYPE_S16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
	  break;
	case BRIG_TYPE_U16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
	  break;
	case BRIG_TYPE_S32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
	  break;
	case BRIG_TYPE_U32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
	  break;
	default:
	  gcc_unreachable ();
	}

      size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
      size_t element_count = gccbrig_type_vector_subparts (arith_type);
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree operand0 = convert (promoted_type, operand0_elements.at (i));
	  tree operand1 = convert (promoted_type, operand1_elements.at (i));

	  tree scalar_expr
	    = build2 (MULT_EXPR, promoted_type, operand0, operand1);

	  scalar_expr
	    = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
		      build_int_cstu (promoted_type, promoted_type_size / 2));

	  result_elements.push_back (convert (scalar_type, scalar_expr));
	}
      instr_expr = m_parent.m_cf->pack (result_elements);
    }
  else
    {
      /* 'class' is always of b1 type, let's consider it by its
	 float type when building the instruction to find the
	 correct builtin.  */
      if (brig_inst->opcode == BRIG_OPCODE_CLASS)
	brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
      instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
				    arith_type, in_operands);
    }

  if (instr_expr == NULL_TREE)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
      || p == BRIG_PACK_SSAT)
    {
      /* In case of _s_ or _ss_, select only the lowest element
	 from the new input to the output.  We could extract
	 the element and use a scalar operation, but try
	 to keep data in vector registers as much as possible
	 to avoid copies between scalar and vector datapaths.  */
      tree old_value;
      tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
      bool is_fp16_operation
	= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
	  && !gccbrig_is_bit_operation (brig_inst->opcode);

      if (is_fp16_operation)
	old_value = build_h2f_conversion
	  (build_resize_convert_view (half_storage_type, operands[0]));
      else
	old_value
	  = build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]);

      size_t esize = is_fp16_operation ? 32 : element_size_bits;

      /* Construct a permutation mask where all elements except the
	 lowest one are picked from the old_value.  */
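      /* E.g. for a four-element vector the mask is {4, 1, 2, 3}: lane 0
	 is taken from the new value (the second VEC_PERM_EXPR input) and
	 the remaining lanes from the old value.  */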
      tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
      vec<constructor_elt, va_gc> *constructor_vals = NULL;
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree cst;

	  if (i == 0)
	    cst = build_int_cstu (mask_inner_type, element_count);
	  else
	    cst = build_int_cstu (mask_inner_type, i);
	  CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
	}
      tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
      tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

      tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
      tree assign
	= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
      m_parent.m_cf->append_statement (assign);

      instr_expr
	= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);

      tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
      tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
				  lower_output, instr_expr);
      m_parent.m_cf->append_statement (assign_lower);
      instr_expr = lower_output;
    }

  if (output_count == 1)
    build_output_assignment (*brig_inst, operands[0], instr_expr);
  else
    m_parent.m_cf->append_statement (instr_expr);
  return base->byteCount;
}

/* Create an expression that broadcasts the lowest element of the
   vector in VEC_OPERAND to all elements of the returned vector.  */

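/* The mask is {n, n, ..., n} with n == element_count: in VEC_PERM_EXPR
   terms index n selects lane 0 of the second input vector, which here
   is the same vector, so every output lane receives the lowest
   element.  */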
tree
brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
{
  /* Build the broadcast using shuffle because there's no
     direct broadcast in GENERIC and this way there's no need for
     a separate extract of the lowest element.  */
  tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
  size_t esize = 8 * int_size_in_bytes (element_type);

  size_t element_count
    = gccbrig_type_vector_subparts (TREE_TYPE (vec_operand));
  tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
  vec<constructor_elt, va_gc> *constructor_vals = NULL;

  /* Construct the mask.  */
  for (size_t i = 0; i < element_count; ++i)
    {
      tree cst = build_int_cstu (mask_inner_type, element_count);
      CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
    }
  tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
  tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

  return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
		 vec_operand, mask);
}