LLVM 22.0.0git
MachineSMEABIPass.cpp
Go to the documentation of this file.
1//===- MachineSMEABIPass.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the SME ABI requirements for ZA state. This includes
10// implementing the lazy (and agnostic) ZA state save schemes around calls.
11//
12//===----------------------------------------------------------------------===//
13//
14// This pass works by collecting instructions that require ZA to be in a
15// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state
16// transitions to ensure ZA is in the required state before instructions. State
17// transitions represent actions such as setting up or restoring a lazy save.
18// Certain points within a function may also have predefined states independent
19// of any instructions, for example, a "shared_za" function is always entered
20// and exited in the "ACTIVE" state.
21//
22// To handle ZA state across control flow, we make use of edge bundling. This
23// assigns each block an "incoming" and "outgoing" edge bundle (representing
24// incoming and outgoing edges). Initially, these are unique to each block;
25// then, in the process of forming bundles, the outgoing bundle of a block is
26// joined with the incoming bundle of all successors. The result is that each
27// bundle can be assigned a single ZA state, which ensures the state required by
28// all of a block's successors is the same, and that each basic block will always
29// be entered with the same ZA state. This eliminates the need for splitting
30// edges to insert state transitions or "phi" nodes for ZA states.
31//
32// See below for a simple example of edge bundling.
33//
34// The following shows a conditionally executed basic block (BB1):
35//
36// if (cond)
37// BB1
38// BB2
39//
40// Initial Bundles Joined Bundles
41//
42// ┌──0──┐ ┌──0──┐
43// │ BB0 │ │ BB0 │
44// └──1──┘ └──1──┘
45// ├───────┐ ├───────┐
46// ▼ │ ▼ │
47// ┌──2──┐ │ ─────► ┌──1──┐ │
48// │ BB1 │ ▼ │ BB1 │ ▼
49// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐
50// └───►4 BB2 │ └───►1 BB2 │
51// └──5──┘ └──2──┘
52//
53// On the left are the initial per-block bundles, and on the right are the
54// joined bundles (which are the result of the EdgeBundles analysis).
55
56#include "AArch64InstrInfo.h"
58#include "AArch64Subtarget.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "aarch64-machine-sme-abi"
72
73namespace {
74
/// The abstract ZA states tracked by this pass. The enumerators are ordered
/// and dense so they can index arrays (see NUM_ZA_STATE).
enum ZAState {
  // Any/unknown state (not valid)
  ANY = 0,

  // ZA is in use and active (i.e. within the accumulator)
  ACTIVE,

  // A ZA save has been set up or committed (i.e. ZA is dormant or off)
  LOCAL_SAVED,

  // ZA is off or a lazy save has been set up by the caller
  CALLER_DORMANT,

  // ZA is off
  OFF,

  // The number of ZA states (not a valid state)
  NUM_ZA_STATE
};
94
95/// A bitmask enum to record live physical registers that the "emit*" routines
96/// may need to preserve. Note: This only tracks registers we may clobber.
97enum LiveRegs : uint8_t {
98 None = 0,
99 NZCV = 1 << 0,
100 W0 = 1 << 1,
101 W0_HI = 1 << 2,
102 X0 = W0 | W0_HI,
103 LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI)
104};
105
106/// Holds the virtual registers live physical registers have been saved to.
107struct PhysRegSave {
108 LiveRegs PhysLiveRegs;
109 Register StatusFlags = AArch64::NoRegister;
110 Register X0Save = AArch64::NoRegister;
111};
112
113static bool isLegalEdgeBundleZAState(ZAState State) {
114 switch (State) {
115 case ZAState::ACTIVE:
116 case ZAState::LOCAL_SAVED:
117 return true;
118 default:
119 return false;
120 }
121}
/// Frame index of the 16-byte TPIDR2 block used for lazy ZA saves.
struct TPIDR2State {
  int FrameIndex = -1; // -1 means "not yet allocated".
};
125
126StringRef getZAStateString(ZAState State) {
127#define MAKE_CASE(V) \
128 case V: \
129 return #V;
130 switch (State) {
131 MAKE_CASE(ZAState::ANY)
132 MAKE_CASE(ZAState::ACTIVE)
133 MAKE_CASE(ZAState::LOCAL_SAVED)
134 MAKE_CASE(ZAState::CALLER_DORMANT)
135 MAKE_CASE(ZAState::OFF)
136 default:
137 llvm_unreachable("Unexpected ZAState");
138 }
139#undef MAKE_CASE
140}
141
142static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
143 const MachineOperand &MO) {
144 if (!MO.isReg() || !MO.getReg().isPhysical())
145 return false;
146 return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
147 return AArch64::MPR128RegClass.contains(SR) ||
148 AArch64::ZTRRegClass.contains(SR);
149 });
150}
151
152/// Returns the required ZA state needed before \p MI and an iterator pointing
153/// to where any code required to change the ZA state should be inserted.
154static std::pair<ZAState, MachineBasicBlock::iterator>
155getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
156 bool ZAOffAtReturn) {
// NOTE(review): the declaration of `InsertPt` (original line 157) was lost in
// extraction; it is used throughout the body below -- restore from upstream.
158
// ZA-use pseudos pin the state before the instruction; callers assert the
// std::prev(InsertPt) position is an ADJCALLSTACKDOWN (see
// collectNeededZAStates) -- presumably so the change lands before call setup.
159 if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
160 return {ZAState::ACTIVE, std::prev(InsertPt)};
161
162 if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
163 return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
164
// Returns must leave ZA off for private-ZA functions, active otherwise.
165 if (MI.isReturn())
166 return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
167
// Any instruction that reads/writes ZA or ZT0 requires the ACTIVE state.
168 for (auto &MO : MI.operands()) {
169 if (isZAorZTRegOp(TRI, MO))
170 return {ZAState::ACTIVE, InsertPt};
171 }
172
173 return {ZAState::ANY, InsertPt};
174}
175
// NOTE(review): several declarations in this class were truncated by the
// extraction (missing iterator/DebugLoc parameters and whole lines, e.g.
// original lines 187-190, 207, 209, 212, 214-215, 220, 226, 229, 232, 236,
// 242, 249, 258, 261, 272, 280, 287). Restore from upstream before compiling.
176struct MachineSMEABI : public MachineFunctionPass {
177 inline static char ID = 0;
178
179 MachineSMEABI() : MachineFunctionPass(ID) {}
180
181 bool runOnMachineFunction(MachineFunction &MF) override;
182
183 StringRef getPassName() const override { return "Machine SME ABI pass"; }
184
185 void getAnalysisUsage(AnalysisUsage &AU) const override {
186 AU.setPreservesCFG();
// NOTE(review): required-analysis registrations (original lines 187-190,
// presumably including EdgeBundles) were dropped here.
191 }
192
193 /// Collects the needed ZA state (and live registers) before each instruction
194 /// within the machine function.
195 void collectNeededZAStates(SMEAttrs);
196
197 /// Assigns each edge bundle a ZA state based on the needed states of blocks
198 /// that have incoming or outgoing edges in that bundle.
199 void assignBundleZAStates();
200
201 /// Inserts code to handle changes between ZA states within the function.
202 /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
203 void insertStateChanges();
204
205 // Emission routines for private and shared ZA functions (using lazy saves).
206 void emitNewZAPrologue(MachineBasicBlock &MBB,
208 void emitRestoreLazySave(MachineBasicBlock &MBB,
210 LiveRegs PhysLiveRegs);
211 void emitSetupLazySave(MachineBasicBlock &MBB,
213 void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB,
216 bool ClearTPIDR2);
217
218 // Emission routines for agnostic ZA functions.
219 void emitSetupFullZASave(MachineBasicBlock &MBB,
221 LiveRegs PhysLiveRegs);
222 // Emit a "full" ZA save or restore. It is "full" in the sense that this
223 // function will emit a call to __arm_sme_save or __arm_sme_restore, which
224 // handles saving and restoring both ZA and ZT0.
225 void emitFullZASaveRestore(MachineBasicBlock &MBB,
227 LiveRegs PhysLiveRegs, bool IsSave);
228 void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
230 LiveRegs PhysLiveRegs);
231
// NOTE(review): the emitStateChange declaration head (original line 232) was
// dropped; only its trailing parameters survive below.
233 ZAState From, ZAState To, LiveRegs PhysLiveRegs);
234
235 // Helpers for switching between lazy/full ZA save/restore routines.
// NOTE(review): the emitZASave declaration head (original line 236) was
// dropped here.
237 LiveRegs PhysLiveRegs) {
238 if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
239 return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
240 return emitSetupLazySave(MBB, MBBI);
241 }
// NOTE(review): the emitZARestore declaration head (original line 242) was
// dropped here.
243 LiveRegs PhysLiveRegs) {
244 if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
245 return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
246 return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
247 }
248 void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
250 LiveRegs PhysLiveRegs) {
251 if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
252 return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
253 return emitAllocateLazySaveBuffer(MBB, MBBI);
254 }
255
256 /// Save live physical registers to virtual registers.
257 PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
259 /// Restore physical registers from a save of their previous values.
260 void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB,
262
263 /// Get or create a TPIDR2 block in this function.
264 TPIDR2State getTPIDR2Block();
265
// Lazily creates (or returns) the virtual register holding the agnostic-ZA
// save-buffer pointer.
266 Register getAgnosticZABufferPtr();
267
268private:
269 /// Contains the needed ZA state (and live registers) at an instruction.
270 struct InstInfo {
271 ZAState NeededState{ZAState::ANY};
273 LiveRegs PhysLiveRegs = LiveRegs::None;
274 };
275
276 /// Contains the needed ZA state for each instruction in a block.
277 /// Instructions that do not require a ZA state are not recorded.
278 struct BlockInfo {
279 ZAState FixedEntryState{ZAState::ANY};
281 LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
282 LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
283 };
284
285 // All pass state that must be cleared between functions.
286 struct PassState {
288 SmallVector<ZAState> BundleStates;
289 std::optional<TPIDR2State> TPIDR2Block;
290 std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
291 Register AgnosticZABufferPtr = AArch64::NoRegister;
292 LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
293 } State;
294
// Cached per-function objects, set up in runOnMachineFunction().
295 MachineFunction *MF = nullptr;
296 EdgeBundles *Bundles = nullptr;
297 const AArch64Subtarget *Subtarget = nullptr;
298 const AArch64RegisterInfo *TRI = nullptr;
299 const AArch64FunctionInfo *AFI = nullptr;
300 const TargetInstrInfo *TII = nullptr;
301 MachineRegisterInfo *MRI = nullptr;
302};
303
/// Walks every block backwards (for liveness), recording for each instruction
/// the ZA state it needs, the insertion point for any state change, and which
/// clobberable physical registers (NZCV, W0/W0_HI) are live there.
304void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
305 assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
306 SMEFnAttrs.hasZAState()) &&
307 "Expected function to have ZA/ZT0 state!");
308
309 State.Blocks.resize(MF->getNumBlockIDs());
310 for (MachineBasicBlock &MBB : *MF) {
311 BlockInfo &Block = State.Blocks[MBB.getNumber()];
312 if (MBB.isEntryBlock()) {
313 // Entry block:
314 Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
315 ? ZAState::CALLER_DORMANT
316 : ZAState::ACTIVE;
317 } else if (MBB.isEHPad()) {
318 // EH entry block:
319 Block.FixedEntryState = ZAState::LOCAL_SAVED;
320 }
321
322 LiveRegUnits LiveUnits(*TRI);
323 LiveUnits.addLiveOuts(MBB);
324
325 auto GetPhysLiveRegs = [&] {
326 LiveRegs PhysLiveRegs = LiveRegs::None;
327 if (!LiveUnits.available(AArch64::NZCV))
328 PhysLiveRegs |= LiveRegs::NZCV;
329 // We have to track W0 and X0 separately as otherwise things can get
330 // confused if we attempt to preserve X0 but only W0 was defined.
331 if (!LiveUnits.available(AArch64::W0))
332 PhysLiveRegs |= LiveRegs::W0;
333 if (!LiveUnits.available(AArch64::W0_HI))
334 PhysLiveRegs |= LiveRegs::W0_HI;
335 return PhysLiveRegs;
336 };
337
338 Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
339 auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
340 auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
341 for (MachineInstr &MI : reverse(MBB)) {
// NOTE(review): the declaration of `MBBI` (original line 342) was lost in
// extraction; it is used below -- restore from upstream.
343 LiveUnits.stepBackward(MI);
344 LiveRegs PhysLiveRegs = GetPhysLiveRegs();
345 // The SMEStateAllocPseudo marker is added to a function if the save
346 // buffer was allocated in SelectionDAG. It marks the end of the
347 // allocation -- which is a safe point for this pass to insert any TPIDR2
348 // block setup.
349 if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
350 State.AfterSMEProloguePt = MBBI;
351 State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
352 }
353 // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
354 auto [NeededState, InsertPt] = getZAStateBeforeInst(
355 *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
356 assert((InsertPt == MBBI ||
357 InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
358 "Unexpected state change insertion point!");
359 // TODO: Do something to avoid state changes where NZCV is live.
360 if (MBBI == FirstTerminatorInsertPt)
361 Block.PhysLiveRegsAtExit = PhysLiveRegs;
362 if (MBBI == FirstNonPhiInsertPt)
363 Block.PhysLiveRegsAtEntry = PhysLiveRegs;
364 if (NeededState != ZAState::ANY)
365 Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
366 }
367
368 // Reverse vector (as we had to iterate backwards for liveness).
369 std::reverse(Block.Insts.begin(), Block.Insts.end());
370 }
371}
372
373void MachineSMEABI::assignBundleZAStates() {
374 State.BundleStates.resize(Bundles->getNumBundles());
375 for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) {
376 LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
377
378 // Attempt to assign a ZA state for this bundle that minimizes state
379 // transitions. Edges within loops are given a higher weight as we assume
380 // they will be executed more than once.
381 // TODO: We should propagate desired incoming/outgoing states through blocks
382 // that have the "ANY" state first to make better global decisions.
383 int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
384 for (unsigned BlockID : Bundles->getBlocks(I)) {
385 LLVM_DEBUG(dbgs() << "- bb." << BlockID);
386
387 const BlockInfo &Block = State.Blocks[BlockID];
388 if (Block.Insts.empty()) {
389 LLVM_DEBUG(dbgs() << " (no state preference)\n");
390 continue;
391 }
392 bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
393 bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
394
395 ZAState DesiredIncomingState = Block.Insts.front().NeededState;
396 if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
397 EdgeStateCounts[DesiredIncomingState]++;
398 LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
399 << getZAStateString(DesiredIncomingState));
400 }
401 ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
402 if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
403 EdgeStateCounts[DesiredOutgoingState]++;
404 LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
405 << getZAStateString(DesiredOutgoingState));
406 }
407 LLVM_DEBUG(dbgs() << '\n');
408 }
409
410 ZAState BundleState =
411 ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
412
413 // Force ZA to be active in bundles that don't have a preferred state.
414 // TODO: Something better here (to avoid extra mode switches).
415 if (BundleState == ZAState::ANY)
416 BundleState = ZAState::ACTIVE;
417
418 LLVM_DEBUG({
419 dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
420 << "Edge counts:";
421 for (auto [State, Count] : enumerate(EdgeStateCounts))
422 dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
423 dbgs() << "\n\n";
424 });
425
426 State.BundleStates[I] = BundleState;
427 }
428}
429
430void MachineSMEABI::insertStateChanges() {
431 for (MachineBasicBlock &MBB : *MF) {
432 const BlockInfo &Block = State.Blocks[MBB.getNumber()];
433 ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
434 /*Out=*/false)];
435
436 ZAState CurrentState = Block.FixedEntryState;
437 if (CurrentState == ZAState::ANY)
438 CurrentState = InState;
439
440 for (auto &Inst : Block.Insts) {
441 if (CurrentState != Inst.NeededState)
442 emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
443 Inst.PhysLiveRegs);
444 CurrentState = Inst.NeededState;
445 }
446
447 if (MBB.succ_empty())
448 continue;
449
450 ZAState OutState =
451 State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
452 if (CurrentState != OutState)
453 emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
454 Block.PhysLiveRegsAtExit);
455 }
456}
457
458TPIDR2State MachineSMEABI::getTPIDR2Block() {
459 if (State.TPIDR2Block)
460 return *State.TPIDR2Block;
461 MachineFrameInfo &MFI = MF->getFrameInfo();
462 State.TPIDR2Block = TPIDR2State{MFI.CreateStackObject(16, Align(16), false)};
463 return *State.TPIDR2Block;
464}
465
// NOTE(review): the signature of this helper (original lines 466-467) was
// lost in extraction -- from the body it takes an MBB and an iterator MBBI and
// returns MBBI's DebugLoc when MBBI is a real instruction, otherwise an empty
// DebugLoc. Confirm against upstream.
468 if (MBBI != MBB.end())
469 return MBBI->getDebugLoc();
470 return DebugLoc();
471}
472
/// Sets up a lazy save: materializes the address of this function's TPIDR2
/// block and writes it to TPIDR2_EL0, making ZA dormant across calls.
// NOTE(review): the MBBI parameter (original lines 474-475) and the DebugLoc
// setup were dropped by extraction; `MBBI`/`DL` are used below.
473void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB,
476
477 // Get pointer to TPIDR2 block.
478 Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
479 Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
480 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
481 .addFrameIndex(getTPIDR2Block().FrameIndex)
482 .addImm(0)
483 .addImm(0);
// Copy into a plain GPR64 (MSR cannot take the SP-capable class).
484 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr)
485 .addReg(TPIDR2);
486 // Set TPIDR2_EL0 to point to TPIDR2 block.
487 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
488 .addImm(AArch64SysReg::TPIDR2_EL0)
489 .addReg(TPIDR2Ptr);
490}
491
/// Saves any live clobberable physical registers (NZCV via MRS, W0/X0 via
/// COPY) into fresh virtual registers, returning the saves so they can later
/// be restored with restorePhyRegSave().
// NOTE(review): the MBB/MBBI parameters (original lines 493-494) were dropped
// by extraction.
492PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs,
495 DebugLoc DL) {
496 PhysRegSave RegSave{PhysLiveRegs};
497 if (PhysLiveRegs & LiveRegs::NZCV) {
498 RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
499 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags)
500 .addImm(AArch64SysReg::NZCV)
501 .addReg(AArch64::NZCV, RegState::Implicit);
502 }
503 // Note: Preserving X0 is "free" as this is before register allocation, so
504 // the register allocator is still able to optimize these copies.
505 if (PhysLiveRegs & LiveRegs::W0) {
// Save the full X0 only when the high half is also live; otherwise a W0
// (32-bit) copy suffices.
506 RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI
507 ? &AArch64::GPR64RegClass
508 : &AArch64::GPR32RegClass);
509 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save)
510 .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0);
511 }
512 return RegSave;
513}
514
/// Restores the physical registers saved by createPhysRegSave(): NZCV via MSR
/// and W0/X0 via COPY, mirroring the width chosen at save time.
// NOTE(review): the MBBI parameter (original lines 516-517) was dropped by
// extraction.
515void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave,
518 DebugLoc DL) {
519 if (RegSave.StatusFlags != AArch64::NoRegister)
520 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
521 .addImm(AArch64SysReg::NZCV)
522 .addReg(RegSave.StatusFlags)
523 .addReg(AArch64::NZCV, RegState::ImplicitDefine);
524
525 if (RegSave.X0Save != AArch64::NoRegister)
526 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY),
527 RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0)
528 .addReg(RegSave.X0Save);
529}
530
/// Restores ZA from a lazy save: enables PSTATE.ZA, then emits a
/// RestoreZAPseudo that conditionally calls __arm_tpidr2_restore (based on the
/// current TPIDR2_EL0 value), and finally zeroes TPIDR2_EL0.
// NOTE(review): the MBBI parameter (original line 532) and the DebugLoc setup
// (original line 535) were dropped by extraction.
531void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB,
533 LiveRegs PhysLiveRegs) {
534 auto *TLI = Subtarget->getTargetLowering();
536 Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
// X0 is used directly as the support routine takes its argument in X0.
537 Register TPIDR2 = AArch64::X0;
538
539 // TODO: Emit these within the restore MBB to prevent unnecessary saves.
540 PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
541
542 // Enable ZA.
543 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
544 .addImm(AArch64SVCR::SVCRZA)
545 .addImm(1);
546 // Get current TPIDR2_EL0.
547 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0)
548 .addImm(AArch64SysReg::TPIDR2_EL0);
549 // Get pointer to TPIDR2 block.
550 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
551 .addFrameIndex(getTPIDR2Block().FrameIndex)
552 .addImm(0)
553 .addImm(0);
554 // (Conditionally) restore ZA state.
555 BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo))
556 .addReg(TPIDR2EL0)
557 .addReg(TPIDR2)
558 .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE))
559 .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
560 // Zero TPIDR2_EL0.
561 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
562 .addImm(AArch64SysReg::TPIDR2_EL0)
563 .addReg(AArch64::XZR);
564
565 restorePhyRegSave(RegSave, MBB, MBBI, DL);
566}
567
/// Turns ZA off (PSTATE.ZA := 0), optionally clearing TPIDR2_EL0 first (needed
/// when coming from LOCAL_SAVED so no stale lazy-save pointer remains).
// NOTE(review): the MBBI parameter (original line 569) and the DebugLoc setup
// (original line 571) were dropped by extraction.
568void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
570 bool ClearTPIDR2) {
572
573 if (ClearTPIDR2)
574 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
575 .addImm(AArch64SysReg::TPIDR2_EL0)
576 .addReg(AArch64::XZR);
577
578 // Disable ZA.
579 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
580 .addImm(AArch64SVCR::SVCRZA)
581 .addImm(0);
582}
583
/// Allocates the lazy-save buffer (SVL x SVL bytes, taken from the stack
/// unless SelectionDAG already allocated it) and initializes the TPIDR2 block
/// with the buffer pointer and the number of save slices (SVL).
// NOTE(review): the parameter list (original line 585), the DebugLoc setup
// (original line 587), and the report_fatal_error head (original line 625)
// were dropped by extraction.
584void MachineSMEABI::emitAllocateLazySaveBuffer(
586 MachineFrameInfo &MFI = MF->getFrameInfo();
588 Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
589 Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
590 Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
591
592 // Calculate SVL.
593 BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
594
595 // 1. Allocate the lazy save buffer.
596 if (Buffer == AArch64::NoRegister) {
597 // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
598 // Buffer != AArch64::NoRegister). This is done to reuse the existing
599 // expansions (which can insert stack checks). This works, but it means we
600 // will always allocate the lazy save buffer (even if the function contains
601 // no lazy saves). If we want to handle Windows here, we'll need to
602 // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
603 assert(!Subtarget->isTargetWindows() &&
604 "Lazy ZA save is not yet supported on Windows");
605 Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
606 // Get original stack pointer.
607 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
608 .addReg(AArch64::SP);
609 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
610 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer)
611 .addReg(SVL)
612 .addReg(SVL)
613 .addReg(SP);
614 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP)
615 .addReg(Buffer);
616 // We have just allocated a variable sized object, tell this to PEI.
617 MFI.CreateVariableSizedObject(Align(16), nullptr);
618 }
619
620 // 2. Setup the TPIDR2 block.
621 {
622 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
623 // generally don't support big-endian SVE/SME.
624 if (!Subtarget->isLittleEndian())
626 "TPIDR2 block initialization is not supported on big-endian targets");
627
628 // Store buffer pointer and num_za_save_slices.
629 // Bytes 10-15 are implicitly zeroed.
630 BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi))
631 .addReg(Buffer)
632 .addReg(SVL)
633 .addFrameIndex(getTPIDR2Block().FrameIndex)
634 .addImm(0);
635 }
636}
637
/// Emits the entry prologue for a "new ZA" function: commits any pending
/// caller lazy save (via CommitZASavePseudo / __arm_tpidr2_save), optionally
/// zeroing ZA, then enables PSTATE.ZA.
// NOTE(review): the MBBI parameter (original line 639) and the DebugLoc setup
// (original line 641) were dropped by extraction.
638void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
640 auto *TLI = Subtarget->getTargetLowering();
642
643 // Get current TPIDR2_EL0.
644 Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
645 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS))
646 .addReg(TPIDR2EL0, RegState::Define)
647 .addImm(AArch64SysReg::TPIDR2_EL0);
648 // If TPIDR2_EL0 is non-zero, commit the lazy save.
649 // NOTE: Functions that only use ZT0 don't need to zero ZA.
650 bool ZeroZA = AFI->getSMEFnAttrs().hasZAState();
651 auto CommitZASave =
652 BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
653 .addReg(TPIDR2EL0)
654 .addImm(ZeroZA ? 1 : 0)
655 .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
656 .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
657 if (ZeroZA)
658 CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine);
659 // Enable ZA (as ZA could have previously been in the OFF state).
660 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
661 .addImm(AArch64SVCR::SVCRZA)
662 .addImm(1);
663}
664
665Register MachineSMEABI::getAgnosticZABufferPtr() {
666 if (State.AgnosticZABufferPtr != AArch64::NoRegister)
667 return State.AgnosticZABufferPtr;
668 Register BufferPtr = AFI->getEarlyAllocSMESaveBuffer();
669 State.AgnosticZABufferPtr =
670 BufferPtr != AArch64::NoRegister
671 ? BufferPtr
672 : MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
673 return State.AgnosticZABufferPtr;
674}
675
/// Emits a "full" ZA save or restore for agnostic-ZA functions: copies the
/// save-buffer pointer into X0 and calls __arm_sme_save or __arm_sme_restore
/// (which handle both ZA and ZT0).
// NOTE(review): the MBBI parameter (original line 677), the DebugLoc setup
// (original line 680), and the calling-convention argument of
// getCallPreservedMask (original line 696) were dropped by extraction.
676void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
678 LiveRegs PhysLiveRegs, bool IsSave) {
679 auto *TLI = Subtarget->getTargetLowering();
681 Register BufferPtr = AArch64::X0;
682
683 PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
684
685 // Copy the buffer pointer into X0.
686 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
687 .addReg(getAgnosticZABufferPtr());
688
689 // Call __arm_sme_save/__arm_sme_restore.
690 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
691 .addReg(BufferPtr, RegState::Implicit)
692 .addExternalSymbol(TLI->getLibcallName(
693 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
694 .addRegMask(TRI->getCallPreservedMask(
695 *MF,
697
698 restorePhyRegSave(RegSave, MBB, MBBI, DL);
699}
700
/// Allocates the agnostic-ZA save buffer on the stack: calls
/// __arm_sme_state_size to get the required size, subtracts it from SP, and
/// records the resulting pointer in the buffer-pointer virtual register.
// NOTE(review): the parameter list (original lines 702-703 head), the
// early-return condition (original line 705), the DebugLoc setup (original
// line 708), the calling-convention name (original line 723), and the
// SUBXrx64 extend operand (original line 734) were dropped by extraction.
701void MachineSMEABI::emitAllocateFullZASaveBuffer(
703 LiveRegs PhysLiveRegs) {
704 // Buffer already allocated in SelectionDAG.
706 return;
707
709 Register BufferPtr = getAgnosticZABufferPtr();
710 Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
711
712 PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
713
714 // Calculate the SME state size.
715 {
716 auto *TLI = Subtarget->getTargetLowering();
717 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
718 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
719 .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
720 .addReg(AArch64::X0, RegState::ImplicitDefine)
721 .addRegMask(TRI->getCallPreservedMask(
722 *MF, CallingConv::
724 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
725 .addReg(AArch64::X0);
726 }
727
728 // Allocate a buffer object of the size given __arm_sme_state_size.
729 {
730 MachineFrameInfo &MFI = MF->getFrameInfo();
731 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
732 .addReg(AArch64::SP)
733 .addReg(BufferSize)
735 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
736 .addReg(AArch64::SP);
737
738 // We have just allocated a variable sized object, tell this to PEI.
739 MFI.CreateVariableSizedObject(Align(16), nullptr);
740 }
741
742 restorePhyRegSave(RegSave, MBB, MBBI, DL);
743}
744
/// Emits the code for a single ZA state transition (From -> To) at InsertPt.
/// Handles the CALLER_DORMANT entry case by first emitting the new-ZA
/// prologue, then dispatches to the save/restore/off emitters.
// NOTE(review): the InsertPt parameter (original line 746) and the heads of
// two asserts (original lines 762 and 783) were dropped by extraction.
745void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
747 ZAState From, ZAState To,
748 LiveRegs PhysLiveRegs) {
749
750 // ZA not used.
751 if (From == ZAState::ANY || To == ZAState::ANY)
752 return;
753
754 // If we're exiting from the CALLER_DORMANT state that means this new ZA
755 // function did not touch ZA (so ZA was never turned on).
756 if (From == ZAState::CALLER_DORMANT && To == ZAState::OFF)
757 return;
758
759 // TODO: Avoid setting up the save buffer if there's no transition to
760 // LOCAL_SAVED.
761 if (From == ZAState::CALLER_DORMANT) {
763 "CALLER_DORMANT state requires private ZA interface");
764 assert(&MBB == &MBB.getParent()->front() &&
765 "CALLER_DORMANT state only valid in entry block");
766 emitNewZAPrologue(MBB, MBB.getFirstNonPHI());
767 if (To == ZAState::ACTIVE)
768 return; // Nothing more to do (ZA is active after the prologue).
769
770 // Note: "emitNewZAPrologue" zeros ZA, so we may need to setup a lazy save
771 // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this
772 // case by changing the placement of the zero instruction.
773 From = ZAState::ACTIVE;
774 }
775
776 if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
777 emitZASave(MBB, InsertPt, PhysLiveRegs);
778 else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
779 emitZARestore(MBB, InsertPt, PhysLiveRegs);
780 else if (To == ZAState::OFF) {
781 assert(From != ZAState::CALLER_DORMANT &&
782 "CALLER_DORMANT to OFF should have already been handled");
784 "Should not turn ZA off in agnostic ZA function");
785 emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
786 } else {
787 dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
788 << getZAStateString(To) << '\n';
789 llvm_unreachable("Unimplemented state transition");
790 }
791}
792
793} // end anonymous namespace
794
795INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI",
796 false, false)
797
798bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
799 if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
800 return false;
801
802 AFI = MF.getInfo<AArch64FunctionInfo>();
803 SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
804 if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
805 !SMEFnAttrs.hasAgnosticZAInterface())
806 return false;
807
808 assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
809
810 // Reset pass state.
811 State = PassState{};
812 this->MF = &MF;
813 Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
814 Subtarget = &MF.getSubtarget<AArch64Subtarget>();
815 TII = Subtarget->getInstrInfo();
816 TRI = Subtarget->getRegisterInfo();
817 MRI = &MF.getRegInfo();
818
819 collectNeededZAStates(SMEFnAttrs);
820 assignBundleZAStates();
821 insertStateChanges();
822
823 // Allocate save buffer (if needed).
824 if (State.AgnosticZABufferPtr != AArch64::NoRegister || State.TPIDR2Block) {
825 if (State.AfterSMEProloguePt) {
826 // Note: With inline stack probes the AfterSMEProloguePt may not be in the
827 // entry block (due to the probing loop).
828 emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
829 *State.AfterSMEProloguePt,
830 State.PhysLiveRegsAfterSMEPrologue);
831 } else {
832 MachineBasicBlock &EntryBlock = MF.front();
833 emitAllocateZASaveBuffer(
834 EntryBlock, EntryBlock.getFirstNonPHI(),
835 State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry);
836 }
837 }
838
839 return true;
840}
841
842FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define I(x, y, z)
Definition MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register const TargetRegisterInfo * TRI
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
const AArch64RegisterInfo * getRegisterInfo() const override
const AArch64TargetLowering * getTargetLowering() const override
Represent the analysis usage information of a pass.
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
A debug info location.
Definition DebugLoc.h:124
ArrayRef< unsigned > getBlocks(unsigned Bundle) const
getBlocks - Return an array of blocks that are connected to Bundle.
Definition EdgeBundles.h:53
unsigned getBundle(unsigned N, bool Out) const
getBundle - Return the ingoing (Out = false) or outgoing (Out = true) bundle number for basic block N
Definition EdgeBundles.h:47
unsigned getNumBundles() const
getNumBundles - Return the total number of bundles in the CFG.
Definition EdgeBundles.h:50
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
LLVM_ABI void stepBackward(const MachineInstr &MI)
Updates liveness when stepping backwards over the instruction MI.
LLVM_ABI void addLiveOuts(const MachineBasicBlock &MBB)
Adds registers living out of block MBB.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
unsigned getNumBlockIDs() const
getNumBlockIDs - Return the number of MBB ID's allocated.
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasAgnosticZAInterface() const
bool hasPrivateZAInterface() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
CallingConv Namespace - This namespace contains an enum with a value for the well-known calling conve...
Definition CallingConv.h:21
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createMachineSMEABIPass()
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
LLVM_ABI char & MachineDominatorsID
MachineDominators - This pass is a machine dominators analysis pass.
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
LLVM_ABI char & MachineLoopInfoID
MachineLoopInfo - This pass is a loop analysis pass.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
auto reverse(ContainerTy &&C)
Definition STLExtras.h:420
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2032
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39