243 lines
8.3 KiB
Markdown
243 lines
8.3 KiB
Markdown
|
;; Scheduling description for the SPARC M8.
|
||
|
;; Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
||
|
;;
|
||
|
;; This file is part of GCC.
|
||
|
;;
|
||
|
;; GCC is free software; you can redistribute it and/or modify
|
||
|
;; it under the terms of the GNU General Public License as published by
|
||
|
;; the Free Software Foundation; either version 3, or (at your option)
|
||
|
;; any later version.
|
||
|
;;
|
||
|
;; GCC is distributed in the hope that it will be useful,
|
||
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
;; GNU General Public License for more details.
|
||
|
;;
|
||
|
;; You should have received a copy of the GNU General Public License
|
||
|
;; along with GCC; see the file COPYING3. If not see
|
||
|
;; <http://www.gnu.org/licenses/>.
|
||
|
|
||
|
;; Things to improve:
|
||
|
;;
|
||
|
;; - Store instructions are implemented by micro-ops, one of which
|
||
|
;; generates the store address and is executed in the store address
|
||
|
;; generation unit in the slot0. We need to model that.
|
||
|
;;
|
||
|
;; - There are two V3 pipes connected to different slots. The current
|
||
|
;; implementation assumes that all the instructions executing in a
|
||
|
;; V3 pipe are issued to the unit in slot3.
|
||
|
;;
|
||
|
;; - Single-issue ALU operations incur an additional cycle of latency to
|
||
|
;; slot 0 and slot 1 instructions. This is not currently reflected
|
||
|
;; in the DFA.
|
||
|
|
||
|
;; The single DFA automaton to which the M8 issue-slot units declared
;; below are assigned.
(define_automaton "m8_0")
|
||
|
|
||
|
;; The S5 core has two dual-issue queues, PQLS and PQEX. Each queue
|
||
|
;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
|
||
|
;; PQEX corresponds to slots 2 and 3. The core can issue 4
|
||
|
;; instructions per cycle, and up to 4 instructions are committed each
|
||
|
;; cycle.
|
||
|
;;
|
||
|
;;
|
||
|
;; m8_slot0 - Load Unit.
|
||
|
;; - Store address gen. Unit.
|
||
|
;;
|
||
|
;;
|
||
|
;; === PQLS ==> m8_slot1 - Store data unit.
|
||
|
;; - Branch unit.
|
||
|
;;
|
||
|
;;
|
||
|
;; === PQEX ==> m8_slot2 - Integer Unit (EXU2).
|
||
|
;; - 3-cycles Crypto Unit (SPU2).
|
||
|
;;
|
||
|
;; m8_slot3 - Integer Unit (EXU3).
|
||
|
;; - 3-cycles Crypto Unit (SPU3).
|
||
|
;; - Floating-point and graphics unit (FPG).
|
||
|
;; - Long-latency Crypto Unit.
|
||
|
;; - Oracle Numbers Unit (ONU).
|
||
|
|
||
|
;; One CPU unit per issue slot; all four belong to automaton m8_0.
(define_cpu_unit
  "m8_slot0,m8_slot1,m8_slot2,m8_slot3"
  "m8_0")
|
||
|
|
||
|
;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the
;; same for multi-instruction insns.
|
||
|
|
||
|
;; Reserving every slot at once leaves no room for a second insn in
;; the same cycle.
(define_reservation "m8_single_issue"
  "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
|
||
|
|
||
|
;; Insn types that are issued alone: they take the m8_single_issue
;; reservation for one cycle.
(define_insn_reservation "m8_single" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
  "m8_single_issue")
|
||
|
|
||
|
;; Most of the instructions executing in the integer units have a
|
||
|
;; latency of 1.
|
||
|
|
||
|
;; Single-cycle integer operations.  They may be issued to either of
;; the two PQEX slots (slot2 or slot3).
;;
;; `bmask' is intentionally absent from the type list: the m8_single
;; reservation defined earlier in this file already names that type,
;; and an insn type should be claimed by exactly one reservation.
(define_insn_reservation "m8_integer" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  "(m8_slot2 | m8_slot3)")
|
||
|
|
||
|
;; Flushing the instruction memory takes 27 cycles.
|
||
|
|
||
|
|
||
|
;; An issue slot (slot2 or slot3) is reserved for the first cycle only;
;; the remaining 26 cycles of the latency reserve no unit.
(define_insn_reservation "m8_iflush" 27
  (and (eq_attr "cpu" "m8") (eq_attr "type" "iflush"))
  "(m8_slot2 | m8_slot3), nothing*26")
|
||
|
|
||
|
;; The integer multiplication instructions have a latency of 10 cycles
|
||
|
;; and execute in integer units.
|
||
|
;;
|
||
|
;; Likewise for array*, edge* and pdistn instructions.
|
||
|
;;
|
||
|
;; However, the latency is only 9 cycles if the consumer of the
|
||
|
;; operation is also capable of 9 cycles latency. We model this with
|
||
|
;; a bypass.
|
||
|
|
||
|
;; Integer multiply and the array/edge/pdistn families: 10 cycles of
;; latency, issued to slot2 or slot3.  The trailing `nothing' padding
;; is latency - 1 cycles, the same convention the other reservations
;; in this file follow; it reserves no unit.
(define_insn_reservation "m8_imul" 10
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(m8_slot2 | m8_slot3), nothing*9")

;; When the consumer is itself an m8_imul insn the result is available
;; one cycle earlier.
(define_bypass 9 "m8_imul" "m8_imul")
|
||
|
|
||
|
;; The integer division instructions `sdiv' and `udivx' have a latency
|
||
|
;; of 30 cycles and execute in integer units.
|
||
|
|
||
|
;; Only the issue cycle reserves a slot; the other 29 cycles of the
;; latency reserve nothing.
(define_insn_reservation "m8_idiv" 30
  (and (eq_attr "cpu" "m8") (eq_attr "type" "idiv"))
  "(m8_slot2 | m8_slot3), nothing*29")
|
||
|
|
||
|
;; Both integer and floating-point load instructions have a latency of
|
||
|
;; only 3 cycles, and execute in the slot0.
|
||
|
;;
|
||
|
;; Misaligned load instructions feature a latency of 11 cycles.
|
||
|
;;
|
||
|
;; The prefetch instruction also executes in the load unit, but its
|
||
|
;; latency is only 1 cycle.
|
||
|
|
||
|
;; Matches fpload and sload insns, plus `load' insns whose subtype is
;; `regular'.  slot0 is reserved for the issue cycle only.
(define_insn_reservation "m8_load" 3
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpload,sload")
            (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))))
  "m8_slot0, nothing*2")
|
||
|
|
||
|
;; (define_insn_reservation "m8_load_misalign" 11
|
||
|
;; (and (eq_attr "cpu" "m8")
|
||
|
;; (eq_attr "type" "load_mis,fpload_mis"))
|
||
|
;; "m8_slot0, nothing*10")
|
||
|
|
||
|
;; `load' insns of subtype `prefetch': one cycle in slot0.
(define_insn_reservation "m8_prefetch" 1
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "m8_slot0")
|
||
|
|
||
|
;; Both integer and floating-point store instructions have a latency
|
||
|
;; of 1 cycle, and execute in the store data unit in slot1.
|
||
|
;;
|
||
|
;; However, misaligned store instructions feature a latency of 3
|
||
|
;; cycles.
|
||
|
|
||
|
;; Integer and floating-point stores reserve slot1 for a single cycle.
(define_insn_reservation "m8_store" 1
  (and (eq_attr "cpu" "m8") (eq_attr "type" "store,fpstore"))
  "m8_slot1")
|
||
|
|
||
|
;; (define_insn_reservation "m8_store_misalign" 3
|
||
|
;; (and (eq_attr "cpu" "m8")
|
||
|
;; (eq_attr "type" "store_mis,fpstore_mis"))
|
||
|
;; "m8_slot1, nothing*2")
|
||
|
|
||
|
;; Control-transfer instructions execute in the Branch Unit in the
|
||
|
;; slot1.
|
||
|
|
||
|
;; Branches, calls and returns all reserve slot1 for a single cycle.
(define_insn_reservation "m8_cti" 1
  (and
    (eq_attr "cpu" "m8")
    (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
  "m8_slot1")
|
||
|
|
||
|
;; Many instructions executing in the Floating-point and Graphics Unit
|
||
|
;; (FGU) serving slot3 feature a default latency of 9 cycles.
|
||
|
|
||
|
;; Matches the listed floating-point types, plus `fga' insns whose
;; subtype is `fpu'.  slot3 is reserved for the issue cycle only.
(define_insn_reservation "m8_fp" 9
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu"))))
  "m8_slot3, nothing*8")
|
||
|
|
||
|
;; Floating-point division and floating-point square-root instructions
|
||
|
;; have high latencies. They execute in the FGU.
|
||
|
|
||
|
;; fpdivs: 26 cycles, issued through slot3.
(define_insn_reservation "m8_fpdivs" 26
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpdivs"))
  "m8_slot3, nothing*25")
|
||
|
|
||
|
;; fpsqrts: 33 cycles, issued through slot3.
(define_insn_reservation "m8_fpsqrts" 33
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpsqrts"))
  "m8_slot3, nothing*32")
|
||
|
|
||
|
;; fpdivd: 30 cycles, issued through slot3.
(define_insn_reservation "m8_fpdivd" 30
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpdivd"))
  "m8_slot3, nothing*29")
|
||
|
|
||
|
;; fpsqrtd: 41 cycles, issued through slot3.
(define_insn_reservation "m8_fpsqrtd" 41
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpsqrtd"))
  "m8_slot3, nothing*40")
|
||
|
|
||
|
;; SIMD VIS instructions executing in the Floating-point and graphics
|
||
|
;; unit (FPG) in slot3 usually have a latency of 5 cycles.
|
||
|
;;
|
||
|
;; However, the latency for many instructions is only 3 cycles if the
|
||
|
;; consumer can also be executed in 3 cycles. We model this with a
|
||
|
;; bypass. In these cases the instructions are executed in one of the
|
||
|
;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
|
||
|
;; and 3.
|
||
|
|
||
|
;; Matches viscmp and lzd insns, plus selected subtypes of the fga,
;; vismv and visl types.  slot3 is reserved for the issue cycle only.
(define_insn_reservation "m8_vis" 5
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "viscmp,lzd")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "maxmin,cmask,other"))
            (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "single,movstouw"))
            (and (eq_attr "type" "visl")
                 (eq_attr "subtype" "single"))))
  "m8_slot3, nothing*4")

;; An m8_vis producer feeding an m8_vis consumer has a latency of 3
;; cycles instead of 5.
(define_bypass 3 "m8_vis" "m8_vis")
|
||
|
|
||
|
;; `gsr' insns of subtype `alignaddr': 5 cycles, issued through slot3.
(define_insn_reservation "m8_gsr" 5
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "alignaddr")))
  "m8_slot3, nothing*4")
|
||
|
|
||
|
;; A few VIS instructions have a latency of 1.
|
||
|
|
||
|
;; Selected subtypes of the vismv, visl and fga types that complete in
;; a single cycle in slot3.
(define_insn_reservation "m8_vis_1cycle" 1
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "double,movxtod,movdtox"))
            (and (eq_attr "type" "visl")
                 (eq_attr "subtype" "double"))
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "addsub64"))))
  "m8_slot3")
|
||
|
|
||
|
;; Reading and writing to the gsr register takes more than 70 cycles.
|
||
|
|
||
|
;; `gsr' insns of subtype `reg': modeled with a latency of 70 cycles,
;; issued through slot3.
(define_insn_reservation "m8_gsr_reg" 70
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "reg")))
  "m8_slot3, nothing*69")
|