//===- MachineSMEABIPass.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the SME ABI requirements for ZA state. This includes
// implementing the lazy (and agnostic) ZA state save schemes around calls.
//
//===----------------------------------------------------------------------===//
//
// This pass works by collecting instructions that require ZA to be in a
// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state
// transitions to ensure ZA is in the required state before each such
// instruction. State transitions represent actions such as setting up or
// restoring a lazy save. Certain points within a function may also have
// predefined states independent of any instructions, for example, a
// "shared_za" function is always entered and exited in the "ACTIVE" state.
//
// To handle ZA state across control flow, we make use of edge bundling. This
// assigns each block an "incoming" and "outgoing" edge bundle (representing
// incoming and outgoing edges). Initially, these are unique to each block;
// then, in the process of forming bundles, the outgoing bundle of a block is
// joined with the incoming bundle of all successors. The result is that each
// bundle can be assigned a single ZA state, which ensures the state required
// by all of a block's successors is the same, and that each basic block will
// always be entered with the same ZA state. This eliminates the need for
// splitting edges to insert state transitions or "phi" nodes for ZA states.
//
// See below for a simple example of edge bundling.
//
// The following shows a conditionally executed basic block (BB1):
//
// if (cond)
//   BB1
// BB2
//
//          Initial Bundles         Joined Bundles
//
//            ┌──0──┐                 ┌──0──┐
//            │ BB0 │                 │ BB0 │
//            └──1──┘                 └──1──┘
//              ├───────┐               ├───────┐
//              ▼       │               ▼       │
//            ┌──2──┐   │   ─────►    ┌──1──┐   │
//            │ BB1 │   ▼             │ BB1 │   ▼
//            └──3──┘ ┌──4──┐         └──1──┘ ┌──1──┐
//              └───►4│ BB2 │           └───►1│ BB2 │
//                    └──5──┘                 └──2──┘
//
// On the left are the initial per-block bundles, and on the right are the
// joined bundles (which are the result of the EdgeBundles analysis).

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/EdgeBundles.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "aarch64-machine-sme-abi"

namespace {

// Note: For agnostic ZA, we assume the function is always entered/exited in
// the "ACTIVE" state -- this _may_ not be the case (since OFF is also a
// possibility, but for the purpose of placing ZA saves/restores, that does not
// matter).
enum ZAState : uint8_t {
  // Any/unknown state (not valid)
  ANY = 0,

  // ZA is in use and active (i.e. within the accumulator)
  ACTIVE,

  // ZA is active, but ZT0 has been saved.
  // This handles the edge case of sharedZA && !sharesZT0.
  ACTIVE_ZT0_SAVED,

  // A ZA save has been set up or committed (i.e. ZA is dormant or off)
  // If the function uses ZT0 it must also be saved.
  LOCAL_SAVED,

  // ZA has been committed to the lazy save buffer of the current function.
  // If the function uses ZT0 it must also be saved.
  // ZA is off.
  LOCAL_COMMITTED,

  // The ZA/ZT0 state on entry to the function.
  ENTRY,

  // ZA is off.
  OFF,

  // The number of ZA states (not a valid state)
  NUM_ZA_STATE
};

/// A bitmask enum to record live physical registers that the "emit*" routines
/// may need to preserve. Note: This only tracks registers we may clobber.
enum LiveRegs : uint8_t {
  None = 0,
  NZCV = 1 << 0,
  W0 = 1 << 1,
  W0_HI = 1 << 2,
  X0 = W0 | W0_HI,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI)
};
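
// For example, a program point where the condition flags and all 64 bits of
// X0 are live is recorded as (NZCV | W0 | W0_HI), which is (NZCV | X0).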

/// Holds the virtual registers that live physical registers have been saved
/// to.
struct PhysRegSave {
  LiveRegs PhysLiveRegs;
  Register StatusFlags = AArch64::NoRegister;
  Register X0Save = AArch64::NoRegister;
};

/// Contains the needed ZA state (and live registers) at an instruction. That
/// is the state ZA must be in _before_ "InsertPt".
struct InstInfo {
  ZAState NeededState{ZAState::ANY};
  MachineBasicBlock::iterator InsertPt;
  LiveRegs PhysLiveRegs = LiveRegs::None;
};

/// Contains the needed ZA state for each instruction in a block. Instructions
/// that do not require a ZA state are not recorded.
struct BlockInfo {
  SmallVector<InstInfo> Insts;
  ZAState FixedEntryState{ZAState::ANY};
  ZAState DesiredIncomingState{ZAState::ANY};
  ZAState DesiredOutgoingState{ZAState::ANY};
  LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
  LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};

/// Contains the needed ZA state information for all blocks within a function.
struct FunctionInfo {
  SmallVector<BlockInfo> Blocks;
  std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
  LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
};

/// State/helpers that are only needed when emitting code to handle
/// saving/restoring ZA.
class EmitContext {
public:
  EmitContext() = default;

  /// Get or create a TPIDR2 block in \p MF.
  int getTPIDR2Block(MachineFunction &MF) {
    if (TPIDR2BlockFI)
      return *TPIDR2BlockFI;
    MachineFrameInfo &MFI = MF.getFrameInfo();
    TPIDR2BlockFI = MFI.CreateStackObject(16, Align(16), false);
    return *TPIDR2BlockFI;
  }

  /// Get or create the agnostic ZA buffer pointer in \p MF.
  Register getAgnosticZABufferPtr(MachineFunction &MF) {
    if (AgnosticZABufferPtr != AArch64::NoRegister)
      return AgnosticZABufferPtr;
    Register BufferPtr =
        MF.getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
    AgnosticZABufferPtr =
        BufferPtr != AArch64::NoRegister
            ? BufferPtr
            : MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
    return AgnosticZABufferPtr;
  }

  int getZT0SaveSlot(MachineFunction &MF) {
    if (ZT0SaveFI)
      return *ZT0SaveFI;
    MachineFrameInfo &MFI = MF.getFrameInfo();
    ZT0SaveFI = MFI.CreateSpillStackObject(64, Align(16));
    return *ZT0SaveFI;
  }

  /// Returns true if the function must allocate a ZA save buffer on entry.
  /// This will be the case if, at any point in the function, a ZA save was
  /// emitted.
  bool needsSaveBuffer() const {
    assert(!(TPIDR2BlockFI && AgnosticZABufferPtr) &&
           "Cannot have both a TPIDR2 block and agnostic ZA buffer");
    return TPIDR2BlockFI || AgnosticZABufferPtr != AArch64::NoRegister;
  }

private:
  std::optional<int> ZT0SaveFI;
  std::optional<int> TPIDR2BlockFI;
  Register AgnosticZABufferPtr = AArch64::NoRegister;
};

/// Checks if \p State is a legal edge bundle state. For a state to be a legal
/// bundle state, it must be possible to transition from it to any other bundle
/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED,
/// as you can transition between those states by saving/restoring ZA. The OFF
/// state would not be legal, as transitioning to it drops the content of ZA.
static bool isLegalEdgeBundleZAState(ZAState State) {
  switch (State) {
  case ZAState::ACTIVE:           // ZA state within the accumulator/ZT0.
  case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active).
  case ZAState::LOCAL_SAVED:      // ZA state may be saved on the stack.
  case ZAState::LOCAL_COMMITTED:  // ZA state is saved on the stack.
    return true;
  default:
    return false;
  }
}
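
// For example, ACTIVE -> LOCAL_SAVED (setting up a lazy save) and
// LOCAL_SAVED -> ACTIVE (restoring from it) both preserve the contents of ZA,
// so those states may be carried across an edge bundle; OFF may not, as
// entering it discards ZA.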

StringRef getZAStateString(ZAState State) {
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch (State) {
    MAKE_CASE(ZAState::ANY)
    MAKE_CASE(ZAState::ACTIVE)
    MAKE_CASE(ZAState::ACTIVE_ZT0_SAVED)
    MAKE_CASE(ZAState::LOCAL_SAVED)
    MAKE_CASE(ZAState::LOCAL_COMMITTED)
    MAKE_CASE(ZAState::ENTRY)
    MAKE_CASE(ZAState::OFF)
  default:
    llvm_unreachable("Unexpected ZAState");
  }
#undef MAKE_CASE
}

static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
                          const MachineOperand &MO) {
  if (!MO.isReg() || !MO.getReg().isPhysical())
    return false;
  return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
    return AArch64::MPR128RegClass.contains(SR) ||
           AArch64::ZTRRegClass.contains(SR);
  });
}

/// Returns the required ZA state needed before \p MI and an iterator pointing
/// to where any code required to change the ZA state should be inserted.
static std::pair<ZAState, MachineBasicBlock::iterator>
getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
                     SMEAttrs SMEFnAttrs) {
  MachineBasicBlock::iterator InsertPt = MI.getIterator();

  // Note: InOutZAUsePseudo, RequiresZASavePseudo, and RequiresZT0SavePseudo are
  // intended to mark the position immediately before a call. Due to
  // SelectionDAG constraints, these markers occur after the ADJCALLSTACKDOWN,
  // so we use std::prev(InsertPt) to get the position before the call.

  if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
    return {ZAState::ACTIVE, std::prev(InsertPt)};

  // Note: If we need to save both ZA and ZT0 we use RequiresZASavePseudo.
  if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
    return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};

  // If we only need to save ZT0, there are two cases to consider:
  // 1. The function has ZA state (that we don't need to save).
  //    - In this case we switch to the "ACTIVE_ZT0_SAVED" state.
  //      This only saves ZT0.
  // 2. The function does not have ZA state.
  //    - In this case we switch to the "LOCAL_COMMITTED" state.
  //      This saves ZT0 and turns ZA off.
  if (MI.getOpcode() == AArch64::RequiresZT0SavePseudo) {
    return {SMEFnAttrs.hasZAState() ? ZAState::ACTIVE_ZT0_SAVED
                                    : ZAState::LOCAL_COMMITTED,
            std::prev(InsertPt)};
  }

  if (MI.isReturn()) {
    bool ZAOffAtReturn = SMEFnAttrs.hasPrivateZAInterface();
    return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
  }

  for (auto &MO : MI.operands()) {
    if (isZAorZTRegOp(TRI, MO))
      return {ZAState::ACTIVE, InsertPt};
  }

  return {ZAState::ANY, InsertPt};
}
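
// Illustrative sketch of the marker placement described in the note above.
// For a call requiring a ZA save, SelectionDAG produces a sequence such as:
//
//   ADJCALLSTACKDOWN 0, 0
//   RequiresZASavePseudo
//   BL @callee
//   ADJCALLSTACKUP 0, 0
//
// getInstNeededZAState() returns {LOCAL_SAVED, std::prev(marker)}, i.e. the
// ADJCALLSTACKDOWN, so the lazy-save code is inserted before the call
// sequence begins.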

struct MachineSMEABI : public MachineFunctionPass {
  inline static char ID = 0;

  MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
      : MachineFunctionPass(ID), OptLevel(OptLevel) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "Machine SME ABI pass"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<EdgeBundlesWrapperLegacy>();
    AU.addPreservedID(MachineLoopInfoID);
    AU.addPreservedID(MachineDominatorsID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  /// Collects the needed ZA state (and live registers) before each instruction
  /// within the machine function.
  FunctionInfo collectNeededZAStates(SMEAttrs SMEFnAttrs);

  /// Assigns each edge bundle a ZA state based on the needed states of blocks
  /// that have incoming or outgoing edges in that bundle.
  SmallVector<ZAState> assignBundleZAStates(const EdgeBundles &Bundles,
                                            const FunctionInfo &FnInfo);

  /// Inserts code to handle changes between ZA states within the function.
  /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
  void insertStateChanges(EmitContext &, const FunctionInfo &FnInfo,
                          const EdgeBundles &Bundles,
                          ArrayRef<ZAState> BundleStates);

  /// Propagates desired states forwards (from predecessors -> successors) if
  /// \p Forwards, otherwise, propagates backwards (from successors ->
  /// predecessors).
  void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);

  void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsSave);

  // Emission routines for private and shared ZA functions (using lazy saves).
  void emitSMEPrologue(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator MBBI);
  void emitRestoreLazySave(EmitContext &, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           LiveRegs PhysLiveRegs);
  void emitSetupLazySave(EmitContext &, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI);
  void emitAllocateLazySaveBuffer(EmitContext &, MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI);
  void emitZAMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  bool ClearTPIDR2, bool On);

  // Emission routines for agnostic ZA functions.
  void emitSetupFullZASave(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           LiveRegs PhysLiveRegs);
  // Emit a "full" ZA save or restore. It is "full" in the sense that this
  // function will emit a call to __arm_sme_save or __arm_sme_restore, which
  // handles saving and restoring both ZA and ZT0.
  void emitFullZASaveRestore(EmitContext &, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
                             LiveRegs PhysLiveRegs, bool IsSave);
  void emitAllocateFullZASaveBuffer(EmitContext &, MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    LiveRegs PhysLiveRegs);

  /// Attempts to find an insertion point before \p Inst where the status flags
  /// are not live. If \p Inst is `Block.Insts.end()` a point before the end of
  /// the block is found.
  std::pair<MachineBasicBlock::iterator, LiveRegs>
  findStateChangeInsertionPoint(MachineBasicBlock &MBB, const BlockInfo &Block,
                                SmallVectorImpl<InstInfo>::const_iterator Inst);
  void emitStateChange(EmitContext &, MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator MBBI, ZAState From,
                       ZAState To, LiveRegs PhysLiveRegs);

  // Helpers for switching between lazy/full ZA save/restore routines.
  void emitZASave(EmitContext &Context, MachineBasicBlock &MBB,
                  MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) {
    if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
      return emitFullZASaveRestore(Context, MBB, MBBI, PhysLiveRegs,
                                   /*IsSave=*/true);
    return emitSetupLazySave(Context, MBB, MBBI);
  }
  void emitZARestore(EmitContext &Context, MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) {
    if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
      return emitFullZASaveRestore(Context, MBB, MBBI, PhysLiveRegs,
                                   /*IsSave=*/false);
    return emitRestoreLazySave(Context, MBB, MBBI, PhysLiveRegs);
  }
  void emitAllocateZASaveBuffer(EmitContext &Context, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                LiveRegs PhysLiveRegs) {
    if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
      return emitAllocateFullZASaveBuffer(Context, MBB, MBBI, PhysLiveRegs);
    return emitAllocateLazySaveBuffer(Context, MBB, MBBI);
  }

  /// Save live physical registers to virtual registers.
  PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI, DebugLoc DL);
  /// Restore physical registers from a save of their previous values.
  void restorePhyRegSave(const PhysRegSave &RegSave, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, DebugLoc DL);

private:
  CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;

  MachineFunction *MF = nullptr;
  const AArch64Subtarget *Subtarget = nullptr;
  const AArch64RegisterInfo *TRI = nullptr;
  const AArch64FunctionInfo *AFI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
};

static LiveRegs getPhysLiveRegs(LiveRegUnits const &LiveUnits) {
  LiveRegs PhysLiveRegs = LiveRegs::None;
  if (!LiveUnits.available(AArch64::NZCV))
    PhysLiveRegs |= LiveRegs::NZCV;
  // We have to track W0 and X0 separately as otherwise things can get
  // confused if we attempt to preserve X0 but only W0 was defined.
  if (!LiveUnits.available(AArch64::W0))
    PhysLiveRegs |= LiveRegs::W0;
  if (!LiveUnits.available(AArch64::W0_HI))
    PhysLiveRegs |= LiveRegs::W0_HI;
  return PhysLiveRegs;
}
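
// Example (illustrative): if only W0 (the low 32 bits of X0) is live at a
// point, LiveUnits reports W0 live and W0_HI free, so this returns
// LiveRegs::W0 rather than LiveRegs::X0, and createPhysRegSave() only needs
// to preserve a 32-bit value.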

static void setPhysLiveRegs(LiveRegUnits &LiveUnits, LiveRegs PhysLiveRegs) {
  if (PhysLiveRegs & LiveRegs::NZCV)
    LiveUnits.addReg(AArch64::NZCV);
  if (PhysLiveRegs & LiveRegs::W0)
    LiveUnits.addReg(AArch64::W0);
  if (PhysLiveRegs & LiveRegs::W0_HI)
    LiveUnits.addReg(AArch64::W0_HI);
}

[[maybe_unused]] bool isCallStartOpcode(unsigned Opc) {
  switch (Opc) {
  case AArch64::TLSDESC_CALLSEQ:
  case AArch64::TLSDESC_AUTH_CALLSEQ:
  case AArch64::ADJCALLSTACKDOWN:
    return true;
  default:
    return false;
  }
}

FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
  assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
          SMEFnAttrs.hasZAState()) &&
         "Expected function to have ZA/ZT0 state!");

  SmallVector<BlockInfo> Blocks(MF->getNumBlockIDs());
  LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
  std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;

  for (MachineBasicBlock &MBB : *MF) {
    BlockInfo &Block = Blocks[MBB.getNumber()];

    if (MBB.isEntryBlock()) {
      // Entry block:
      Block.FixedEntryState = ZAState::ENTRY;
    } else if (MBB.isEHPad()) {
      // EH entry block:
      Block.FixedEntryState = ZAState::LOCAL_COMMITTED;
    }

    LiveRegUnits LiveUnits(*TRI);
    LiveUnits.addLiveOuts(MBB);

    Block.PhysLiveRegsAtExit = getPhysLiveRegs(LiveUnits);
    auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
    auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
    for (MachineInstr &MI : reverse(MBB)) {
      MachineBasicBlock::iterator MBBI = MI.getIterator();
      LiveUnits.stepBackward(MI);
      LiveRegs PhysLiveRegs = getPhysLiveRegs(LiveUnits);
      // The SMEStateAllocPseudo marker is added to a function if the save
      // buffer was allocated in SelectionDAG. It marks the end of the
      // allocation -- which is a safe point for this pass to insert any TPIDR2
      // block setup.
      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
        AfterSMEProloguePt = MBBI;
        PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
      }
      // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
      auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs);
      assert((InsertPt == MBBI || isCallStartOpcode(InsertPt->getOpcode())) &&
             "Unexpected state change insertion point!");
      // TODO: Do something to avoid state changes where NZCV is live.
      if (MBBI == FirstTerminatorInsertPt)
        Block.PhysLiveRegsAtExit = PhysLiveRegs;
      if (MBBI == FirstNonPhiInsertPt)
        Block.PhysLiveRegsAtEntry = PhysLiveRegs;
      if (NeededState != ZAState::ANY)
        Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
    }

    // Reverse vector (as we had to iterate backwards for liveness).
    std::reverse(Block.Insts.begin(), Block.Insts.end());

    // Record the desired states on entry/exit of this block. These are the
    // states that would not incur a state transition.
    if (!Block.Insts.empty()) {
      Block.DesiredIncomingState = Block.Insts.front().NeededState;
      Block.DesiredOutgoingState = Block.Insts.back().NeededState;
    }
  }

  return FunctionInfo{std::move(Blocks), AfterSMEProloguePt,
                      PhysLiveRegsAfterSMEPrologue};
}

void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo,
                                           bool Forwards) {
  // If `Forwards`, this propagates desired states from predecessors to
  // successors, otherwise, this propagates states from successors to
  // predecessors.
  auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & {
    return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState;
  };

  SmallVector<MachineBasicBlock *> Worklist;
  for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) {
    if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards)))
      Worklist.push_back(MF->getBlockNumbered(BlockID));
  }

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();
    BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()];

    // Pick a legal edge bundle state that matches the majority of
    // predecessors/successors.
    int StateCounts[ZAState::NUM_ZA_STATE] = {0};
    for (MachineBasicBlock *PredOrSucc :
         Forwards ? predecessors(MBB) : successors(MBB)) {
      BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()];
      ZAState State = GetBlockState(PredOrSuccBlock, !Forwards);
      if (isLegalEdgeBundleZAState(State))
        StateCounts[State]++;
    }

    ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts);
    ZAState &CurrentState = GetBlockState(Block, Forwards);
    if (PropagatedState != CurrentState) {
      CurrentState = PropagatedState;
      ZAState &OtherState = GetBlockState(Block, !Forwards);
      // Propagate to the incoming/outgoing state if that is also "ANY".
      if (OtherState == ZAState::ANY)
        OtherState = PropagatedState;
      // Push any successors/predecessors that may need updating to the
      // worklist.
      for (MachineBasicBlock *SuccOrPred :
           Forwards ? successors(MBB) : predecessors(MBB)) {
        BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()];
        if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, Forwards)))
          Worklist.push_back(SuccOrPred);
      }
    }
  }
}
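
// Example (illustrative): in the backwards pass, a block whose
// DesiredOutgoingState is still ANY and whose three successors desire
// {ACTIVE, ACTIVE, LOCAL_SAVED} on entry adopts ACTIVE, the most common legal
// desired state (max_element returns the first maximum, so ties resolve to
// the lowest-numbered state).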

/// Assigns each edge bundle a ZA state based on the needed states of blocks
/// that have incoming or outgoing edges in that bundle.
SmallVector<ZAState>
MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
                                    const FunctionInfo &FnInfo) {
  SmallVector<ZAState> BundleStates(Bundles.getNumBundles());
  for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) {
    LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');

    // Attempt to assign a ZA state for this bundle that minimizes state
    // transitions. Edges within loops are given a higher weight as we assume
    // they will be executed more than once.
    int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
    for (unsigned BlockID : Bundles.getBlocks(I)) {
      LLVM_DEBUG(dbgs() << "- bb." << BlockID);

      const BlockInfo &Block = FnInfo.Blocks[BlockID];
      bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
      bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;

      bool LegalInEdge =
          InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
      bool LegalOutEdge =
          OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
      if (LegalInEdge) {
        LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
                          << getZAStateString(Block.DesiredIncomingState));
        EdgeStateCounts[Block.DesiredIncomingState]++;
      }
      if (LegalOutEdge) {
        LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
                          << getZAStateString(Block.DesiredOutgoingState));
        EdgeStateCounts[Block.DesiredOutgoingState]++;
      }
      if (!LegalInEdge && !LegalOutEdge)
        LLVM_DEBUG(dbgs() << " (no state preference)");
      LLVM_DEBUG(dbgs() << '\n');
    }

    ZAState BundleState =
        ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);

    if (BundleState == ZAState::ANY)
      BundleState = ZAState::ACTIVE;

    LLVM_DEBUG({
      dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
             << "Edge counts:";
      for (auto [State, Count] : enumerate(EdgeStateCounts))
        dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
      dbgs() << "\n\n";
    });

    BundleStates[I] = BundleState;
  }

  return BundleStates;
}

std::pair<MachineBasicBlock::iterator, LiveRegs>
MachineSMEABI::findStateChangeInsertionPoint(
    MachineBasicBlock &MBB, const BlockInfo &Block,
    SmallVectorImpl<InstInfo>::const_iterator Inst) {
  LiveRegs PhysLiveRegs;
  MachineBasicBlock::iterator InsertPt;
  if (Inst != Block.Insts.end()) {
    InsertPt = Inst->InsertPt;
    PhysLiveRegs = Inst->PhysLiveRegs;
  } else {
    InsertPt = MBB.getFirstTerminator();
    PhysLiveRegs = Block.PhysLiveRegsAtExit;
  }

  if (PhysLiveRegs == LiveRegs::None)
    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live regs).

  // Find the previous state change. We can not move before this point.
  MachineBasicBlock::iterator PrevStateChangeI;
  if (Inst == Block.Insts.begin()) {
    PrevStateChangeI = MBB.begin();
  } else {
    // Note: `std::prev(Inst)` is the previous InstInfo. We only create an
    // InstInfo object for instructions that require a specific ZA state, so
    // the InstInfo is the site of the previous state change in the block
    // (which can be several MIs earlier).
    PrevStateChangeI = std::prev(Inst)->InsertPt;
  }

  // Note: LiveUnits will only accurately track X0 and NZCV.
  LiveRegUnits LiveUnits(*TRI);
  setPhysLiveRegs(LiveUnits, PhysLiveRegs);
  auto BestCandidate = std::make_pair(InsertPt, PhysLiveRegs);
  for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
    // Don't move before/into a call (which may have a state change before it).
    if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
      break;
    LiveUnits.stepBackward(*I);
    LiveRegs CurrentPhysLiveRegs = getPhysLiveRegs(LiveUnits);
    // Find places where NZCV is available, but keep looking for locations
    // where both NZCV and X0 are available, which can avoid some copies.
    if (!(CurrentPhysLiveRegs & LiveRegs::NZCV))
      BestCandidate = {I, CurrentPhysLiveRegs};
    if (CurrentPhysLiveRegs == LiveRegs::None)
      break;
  }
  return BestCandidate;
}

void MachineSMEABI::insertStateChanges(EmitContext &Context,
                                       const FunctionInfo &FnInfo,
                                       const EdgeBundles &Bundles,
                                       ArrayRef<ZAState> BundleStates) {
  for (MachineBasicBlock &MBB : *MF) {
    const BlockInfo &Block = FnInfo.Blocks[MBB.getNumber()];
    ZAState InState = BundleStates[Bundles.getBundle(MBB.getNumber(),
                                                     /*Out=*/false)];

    ZAState CurrentState = Block.FixedEntryState;
    if (CurrentState == ZAState::ANY)
      CurrentState = InState;

    for (auto &Inst : Block.Insts) {
      if (CurrentState != Inst.NeededState) {
        auto [InsertPt, PhysLiveRegs] =
            findStateChangeInsertionPoint(MBB, Block, &Inst);
        emitStateChange(Context, MBB, InsertPt, CurrentState, Inst.NeededState,
                        PhysLiveRegs);
        CurrentState = Inst.NeededState;
      }
    }

    if (MBB.succ_empty())
      continue;

    ZAState OutState =
        BundleStates[Bundles.getBundle(MBB.getNumber(), /*Out=*/true)];
    if (CurrentState != OutState) {
      auto [InsertPt, PhysLiveRegs] =
          findStateChangeInsertionPoint(MBB, Block, Block.Insts.end());
      emitStateChange(Context, MBB, InsertPt, CurrentState, OutState,
                      PhysLiveRegs);
    }
  }
}

static DebugLoc getDebugLoc(const MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI) {
  if (MBBI != MBB.end())
    return MBBI->getDebugLoc();
  return DebugLoc();
}

void MachineSMEABI::emitSetupLazySave(EmitContext &Context,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI) {
  DebugLoc DL = getDebugLoc(MBB, MBBI);

  // Get pointer to TPIDR2 block.
  Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
  Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
      .addFrameIndex(Context.getTPIDR2Block(*MF))
      .addImm(0)
      .addImm(0);
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr)
      .addReg(TPIDR2);
  // Set TPIDR2_EL0 to point to TPIDR2 block.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
      .addImm(AArch64SysReg::TPIDR2_EL0)
      .addReg(TPIDR2Ptr);
}

PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs,
                                             MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MBBI,
                                             DebugLoc DL) {
  PhysRegSave RegSave{PhysLiveRegs};
  if (PhysLiveRegs & LiveRegs::NZCV) {
    RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit);
  }
  // Note: Preserving X0 is "free" as this is before register allocation, so
  // the register allocator is still able to optimize these copies.
  if (PhysLiveRegs & LiveRegs::W0) {
    RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI
                                                    ? &AArch64::GPR64RegClass
                                                    : &AArch64::GPR32RegClass);
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save)
        .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0);
  }
  return RegSave;
}

void MachineSMEABI::restorePhyRegSave(const PhysRegSave &RegSave,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI,
                                      DebugLoc DL) {
  if (RegSave.StatusFlags != AArch64::NoRegister)
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(RegSave.StatusFlags)
        .addReg(AArch64::NZCV, RegState::ImplicitDefine);

  if (RegSave.X0Save != AArch64::NoRegister)
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY),
            RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0)
        .addReg(RegSave.X0Save);
}

void MachineSMEABI::emitRestoreLazySave(EmitContext &Context,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        LiveRegs PhysLiveRegs) {
  auto *TLI = Subtarget->getTargetLowering();
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register TPIDR2 = AArch64::X0;

  // TODO: Emit these within the restore MBB to prevent unnecessary saves.
  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Enable ZA.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
      .addImm(AArch64SVCR::SVCRZA)
      .addImm(1);
  // Get current TPIDR2_EL0.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0)
      .addImm(AArch64SysReg::TPIDR2_EL0);
  // Get pointer to TPIDR2 block.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
      .addFrameIndex(Context.getTPIDR2Block(*MF))
      .addImm(0)
      .addImm(0);
  // (Conditionally) restore ZA state.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo))
      .addReg(TPIDR2EL0)
      .addReg(TPIDR2)
      .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE))
      .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
  // Zero TPIDR2_EL0.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
      .addImm(AArch64SysReg::TPIDR2_EL0)
      .addReg(AArch64::XZR);

  restorePhyRegSave(RegSave, MBB, MBBI, DL);
}

void MachineSMEABI::emitZAMode(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               bool ClearTPIDR2, bool On) {
  DebugLoc DL = getDebugLoc(MBB, MBBI);

  if (ClearTPIDR2)
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
        .addImm(AArch64SysReg::TPIDR2_EL0)
        .addReg(AArch64::XZR);

  // Enable or disable ZA.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
      .addImm(AArch64SVCR::SVCRZA)
      .addImm(On ? 1 : 0);
}

void MachineSMEABI::emitAllocateLazySaveBuffer(
    EmitContext &Context, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI) {
  MachineFrameInfo &MFI = MF->getFrameInfo();
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register Buffer = AFI->getEarlyAllocSMESaveBuffer();

  // Calculate SVL.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);

  // 1. Allocate the lazy save buffer.
  if (Buffer == AArch64::NoRegister) {
    // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
    // Buffer != AArch64::NoRegister). This is done to reuse the existing
    // expansions (which can insert stack checks). This works, but it means we
    // will always allocate the lazy save buffer (even if the function contains
    // no lazy saves). If we want to handle Windows here, we'll need to
    // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
    assert(!Subtarget->isTargetWindows() &&
           "Lazy ZA save is not yet supported on Windows");
    Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    // Get original stack pointer.
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
        .addReg(AArch64::SP);
    // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer)
        .addReg(SVL)
        .addReg(SVL)
        .addReg(SP);
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP)
        .addReg(Buffer);
    // We have just allocated a variable sized object, tell this to PEI.
    MFI.CreateVariableSizedObject(Align(16), nullptr);
  }

  // 2. Setup the TPIDR2 block.
  {
    // Note: This case just needs to do `SVL << 48`. It is not implemented as
    // we generally don't support big-endian SVE/SME.
    if (!Subtarget->isLittleEndian())
      reportFatalInternalError(
          "TPIDR2 block initialization is not supported on big-endian targets");

    // Store buffer pointer and num_za_save_slices.
    // Bytes 10-15 are implicitly zeroed.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi))
        .addReg(Buffer)
        .addReg(SVL)
        .addFrameIndex(Context.getTPIDR2Block(*MF))
        .addImm(0);
  }
}

static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111;
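// (All eight mask bits set: with this mask the SME ZERO instruction zeroes
// the entire ZA array; see its use in emitSMEPrologue below.)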

void MachineSMEABI::emitSMEPrologue(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI) {
  auto *TLI = Subtarget->getTargetLowering();
  DebugLoc DL = getDebugLoc(MBB, MBBI);

  bool ZeroZA = AFI->getSMEFnAttrs().isNewZA();
  bool ZeroZT0 = AFI->getSMEFnAttrs().isNewZT0();
  if (AFI->getSMEFnAttrs().hasPrivateZAInterface()) {
    // Get current TPIDR2_EL0.
    Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS))
        .addReg(TPIDR2EL0, RegState::Define)
        .addImm(AArch64SysReg::TPIDR2_EL0);
    // If TPIDR2_EL0 is non-zero, commit the lazy save.
    // NOTE: Functions that only use ZT0 don't need to zero ZA.
    auto CommitZASave =
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
            .addReg(TPIDR2EL0)
            .addImm(ZeroZA)
            .addImm(ZeroZT0)
            .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
            .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
    if (ZeroZA)
      CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine);
    if (ZeroZT0)
      CommitZASave.addDef(AArch64::ZT0, RegState::ImplicitDefine);
    // Enable ZA (as ZA could have previously been in the OFF state).
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
        .addImm(AArch64SVCR::SVCRZA)
        .addImm(1);
  } else if (AFI->getSMEFnAttrs().hasSharedZAInterface()) {
    if (ZeroZA)
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ZERO_M))
          .addImm(ZERO_ALL_ZA_MASK)
          .addDef(AArch64::ZAB0, RegState::ImplicitDefine);
    if (ZeroZT0)
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ZERO_T)).addDef(AArch64::ZT0);
  }
}

void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context,
                                          MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MBBI,
                                          LiveRegs PhysLiveRegs, bool IsSave) {
  auto *TLI = Subtarget->getTargetLowering();
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register BufferPtr = AArch64::X0;

  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Copy the buffer pointer into X0.
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
      .addReg(Context.getAgnosticZABufferPtr(*MF));

  // Call __arm_sme_save/__arm_sme_restore.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
      .addReg(BufferPtr, RegState::Implicit)
      .addExternalSymbol(TLI->getLibcallName(
          IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
      .addRegMask(TRI->getCallPreservedMask(
          *MF,
          CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));

  restorePhyRegSave(RegSave, MBB, MBBI, DL);
}

void MachineSMEABI::emitZT0SaveRestore(EmitContext &Context,
                                       MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MBBI,
                                       bool IsSave) {
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register ZT0Save = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);

  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ZT0Save)
      .addFrameIndex(Context.getZT0SaveSlot(*MF))
      .addImm(0)
      .addImm(0);

  if (IsSave) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STR_TX))
        .addReg(AArch64::ZT0)
        .addReg(ZT0Save);
  } else {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDR_TX), AArch64::ZT0)
        .addReg(ZT0Save);
  }
}

void MachineSMEABI::emitAllocateFullZASaveBuffer(
    EmitContext &Context, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) {
  // Buffer already allocated in SelectionDAG.
  if (AFI->getEarlyAllocSMESaveBuffer() != AArch64::NoRegister)
    return;

  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register BufferPtr = Context.getAgnosticZABufferPtr(*MF);
  Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Calculate the SME state size.
  {
    auto *TLI = Subtarget->getTargetLowering();
    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
        .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
        .addReg(AArch64::X0, RegState::ImplicitDefine)
        .addRegMask(TRI->getCallPreservedMask(
            *MF, CallingConv::
                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
        .addReg(AArch64::X0);
  }

  // Allocate a buffer object of the size given by __arm_sme_state_size.
  {
    MachineFrameInfo &MFI = MF->getFrameInfo();
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP)
        .addReg(BufferSize)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
        .addReg(AArch64::SP);

    // We have just allocated a variable sized object, tell this to PEI.
    MFI.CreateVariableSizedObject(Align(16), nullptr);
  }

  restorePhyRegSave(RegSave, MBB, MBBI, DL);
}

struct FromState {
  ZAState From;

  constexpr uint8_t to(ZAState To) const {
    static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4-bits");
    return uint8_t(From) << 4 | uint8_t(To);
  }
};

constexpr FromState transitionFrom(ZAState From) { return FromState{From}; }
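
// Worked example: with ACTIVE == 1 and OFF == 6 (from the ZAState enum),
// transitionFrom(ZAState::ACTIVE).to(ZAState::OFF) == (1 << 4) | 6 == 0x16,
// giving every (From, To) pair a unique constant usable as a case label below.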

void MachineSMEABI::emitStateChange(EmitContext &Context,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertPt,
                                    ZAState From, ZAState To,
                                    LiveRegs PhysLiveRegs) {
  // ZA not used.
  if (From == ZAState::ANY || To == ZAState::ANY)
    return;

  // If we're exiting from the ENTRY state that means that the function has not
  // used ZA, so in the case of private ZA/ZT0 functions we can omit any set
  // up.
  if (From == ZAState::ENTRY && To == ZAState::OFF)
    return;

  // TODO: Avoid setting up the save buffer if there's no transition to
  // LOCAL_SAVED.
  if (From == ZAState::ENTRY) {
    assert(&MBB == &MBB.getParent()->front() &&
           "ENTRY state only valid in entry block");
    emitSMEPrologue(MBB, MBB.getFirstNonPHI());
    if (To == ZAState::ACTIVE)
      return; // Nothing more to do (ZA is active after the prologue).

    // Note: "emitSMEPrologue" zeros ZA, so we may need to setup a lazy save
    // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this
    // case by changing the placement of the zero instruction.
    From = ZAState::ACTIVE;
  }

  SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
  bool HasZT0State = SMEFnAttrs.hasZT0State();
  bool HasZAState = IsAgnosticZA || SMEFnAttrs.hasZAState();

  switch (transitionFrom(From).to(To)) {
  // This section handles: ACTIVE <-> ACTIVE_ZT0_SAVED
  case transitionFrom(ZAState::ACTIVE).to(ZAState::ACTIVE_ZT0_SAVED):
    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
    break;
  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::ACTIVE):
    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
    break;

  // This section handles: ACTIVE[_ZT0_SAVED] -> LOCAL_SAVED
  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_SAVED):
  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::LOCAL_SAVED):
    if (HasZT0State && From == ZAState::ACTIVE)
      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
    if (HasZAState)
      emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
    break;

  // This section handles: ACTIVE -> LOCAL_COMMITTED
  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_COMMITTED):
    // TODO: We could support ZA state here, but this transition is currently
    // only possible when we _don't_ have ZA state.
    assert(HasZT0State && !HasZAState && "Expect to only have ZT0 state.");
    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/false);
    break;

  // This section handles: LOCAL_COMMITTED -> (OFF|LOCAL_SAVED)
  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::OFF):
  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::LOCAL_SAVED):
    // These transitions are a no-op.
    break;

  // This section handles: LOCAL_(SAVED|COMMITTED) -> ACTIVE[_ZT0_SAVED]
  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE):
  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED):
  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE):
    if (HasZAState)
      emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
    else
      emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/true);
    if (HasZT0State && To == ZAState::ACTIVE)
      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
    break;

  // This section handles transitions to OFF (not previously covered)
  case transitionFrom(ZAState::ACTIVE).to(ZAState::OFF):
  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::OFF):
  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::OFF):
    assert(SMEFnAttrs.hasPrivateZAInterface() &&
           "Did not expect to turn ZA off in shared/agnostic ZA function");
    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED,
               /*On=*/false);
    break;

  default:
    dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
           << getZAStateString(To) << '\n';
    llvm_unreachable("Unimplemented state transition");
  }
}

} // end anonymous namespace

INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI",
                false, false)

bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
  if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
    return false;

  AFI = MF.getInfo<AArch64FunctionInfo>();
  SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
      !SMEFnAttrs.hasAgnosticZAInterface())
    return false;

  assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");

  this->MF = &MF;
  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();
  MRI = &MF.getRegInfo();

  const EdgeBundles &Bundles =
      getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();

  FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs);

  if (OptLevel != CodeGenOptLevel::None) {
    // Propagate desired states forward, then backwards. Most of the
    // propagation should be done in the forward step, and backwards
    // propagation is then used to fill in the gaps. Note: Doing both in one
    // step can give poor results. For example, consider this subgraph:
    //
    //           ┌─────┐
    //         ┌─┤ BB0 ◄───┐
    //         │ └─┬───┘   │
    //         │ ┌─▼───◄──┐│
    //         │ │ BB1 │  ││
    //         │ └─┬┬──┘  ││
    //         │   │└─────┘│
    //         │ ┌─▼───┐   │
    //         │ │ BB2 ├───┘
    //         │ └─┬───┘
    //         │ ┌─▼───┐
    //         └─► BB3 │
    //           └─────┘
    //
    // If:
    // - "BB0" and "BB2" (outer loop) have no state preference
    // - "BB1" (inner loop) desires the ACTIVE state on entry/exit
    // - "BB3" desires the LOCAL_SAVED state on entry
    //
    // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2,
    // then from BB2 to BB0. This results in the inner and outer loops having
    // the "ACTIVE" state, which avoids any state changes in the loops.
    //
    // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from
    // BB3 to BB0, which would result in a transition from ACTIVE ->
    // LOCAL_SAVED in the outer loop.
    for (bool Forwards : {true, false})
      propagateDesiredStates(FnInfo, Forwards);
  }

  SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo);

  EmitContext Context;
  insertStateChanges(Context, FnInfo, Bundles, BundleStates);

  if (Context.needsSaveBuffer()) {
    if (FnInfo.AfterSMEProloguePt) {
      // Note: With inline stack probes the AfterSMEProloguePt may not be in
      // the entry block (due to the probing loop).
      MachineBasicBlock::iterator MBBI = *FnInfo.AfterSMEProloguePt;
      emitAllocateZASaveBuffer(Context, *MBBI->getParent(), MBBI,
                               FnInfo.PhysLiveRegsAfterSMEPrologue);
    } else {
      MachineBasicBlock &EntryBlock = MF.front();
      emitAllocateZASaveBuffer(
          Context, EntryBlock, EntryBlock.getFirstNonPHI(),
          FnInfo.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry);
    }
  }

  return true;
}

FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
  return new MachineSMEABI(OptLevel);
}