Skip to content

Commit bbf847f

Browse files
committed
fix for aarch64
1 parent 7957e62 commit bbf847f

File tree

2 files changed

+32
-25
lines changed

2 files changed

+32
-25
lines changed

Python/jit.c

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -592,18 +592,11 @@ combine_symbol_mask(const symbol_mask src, symbol_mask dest)
592592

593593
// Decode a _LOAD_FAST_BORROW* opcode into register variant and oparg.
594594
// Returns 1 if the opcode is a _LOAD_FAST_BORROW variant, 0 otherwise.
595-
// On AArch64, falls back to stencil for oparg > 4085 (imm12 limit).
596-
// https://developer.arm.com/documentation/ddi0602/2024-06/Base-Instructions/LDR--immediate---Load-register--immediate--?lang=en
597595
static int
598596
_decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
599597
int *reg_variant, int *oparg)
600598
{
601599
if (opcode >= _LOAD_FAST_BORROW_r01 && opcode <= _LOAD_FAST_BORROW_r23) {
602-
#if defined(__aarch64__) || defined(_M_ARM64)
603-
if (insn_oparg > 4085) {
604-
return 0;
605-
}
606-
#endif
607600
*reg_variant = opcode - _LOAD_FAST_BORROW_r01;
608601
*oparg = insn_oparg;
609602
return 1;
@@ -613,34 +606,52 @@ _decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
613606

614607
#if defined(__aarch64__) || defined(_M_ARM64)
615608

616-
// AArch64: ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes, no data)
617-
// preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
609+
// AArch64: preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
610+
// Small oparg (imm12 fits): ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes)
611+
// Large oparg: mov w8, #off ; ldr x8, [x21, x8] ; orr xDST, x8, #1 (12 bytes; #off is encoded as a 16-bit immediate)
612+
// https://developer.arm.com/documentation/ddi0602/2024-06/Base-Instructions/LDR--immediate---Load-register--immediate--?lang=en
613+
614+
static const uint32_t _aarch64_cache_regs[3] = {24, 25, 26};
618615

619616
static int
620617
_load_fast_borrow_code_size(int oparg)
621618
{
622-
(void)oparg;
623-
return 8;
619+
uint32_t byte_offset = (uint32_t)(offsetof(_PyInterpreterFrame, localsplus)
620+
+ (unsigned)oparg * sizeof(_PyStackRef));
621+
uint32_t imm12 = byte_offset >> 3;
622+
return imm12 < 4096 ? 8 : 12;
624623
}
625624

626-
static const uint32_t _aarch64_cache_regs[3] = {24, 25, 26};
627-
628625
static void
629626
_emit_load_fast_borrow(unsigned char *code, int reg_variant, int oparg)
630627
{
631628
uint32_t byte_offset = (uint32_t)(offsetof(_PyInterpreterFrame, localsplus)
632629
+ (unsigned)oparg * sizeof(_PyStackRef));
633630
assert(byte_offset % 8 == 0);
631+
uint32_t dst = _aarch64_cache_regs[reg_variant];
634632
uint32_t imm12 = byte_offset >> 3;
635-
assert(imm12 < 4096);
636633

637-
// ldr x8, [x21, #byte_offset]
638-
uint32_t ldr = 0xF9400000 | (imm12 << 10) | (21 << 5) | 8;
639-
// orr xDST, x8, #0x1
640-
uint32_t orr = 0xB2400000 | (8 << 5) | _aarch64_cache_regs[reg_variant];
634+
if (imm12 < 4096) {
635+
// ldr x8, [x21, #byte_offset]
636+
uint32_t ldr = 0xF9400000 | (imm12 << 10) | (21 << 5) | 8;
637+
// orr xDST, x8, #0x1
638+
uint32_t orr = 0xB2400000 | (8 << 5) | dst;
641639

642-
memcpy(code, &ldr, 4);
643-
memcpy(code + 4, &orr, 4);
640+
memcpy(code, &ldr, 4);
641+
memcpy(code + 4, &orr, 4);
642+
}
643+
else {
644+
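// mov w8, #(byte_offset & 0xFFFF) -- only the low 16 bits are encoded; NOTE(review): confirm byte_offset cannot exceed 0xFFFF here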
645+
uint32_t mov = 0x52800000 | ((byte_offset & 0xFFFF) << 5) | 8;
646+
// ldr x8, [x21, x8]
647+
uint32_t ldr = 0xF8686AA8;
648+
// orr xDST, x8, #0x1
649+
uint32_t orr = 0xB2400000 | (8 << 5) | dst;
650+
651+
memcpy(code, &mov, 4);
652+
memcpy(code + 4, &ldr, 4);
653+
memcpy(code + 8, &orr, 4);
654+
}
644655
}
645656

646657
#elif defined(__x86_64__) || defined(_M_X64)

Tools/jit/_targets.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -217,12 +217,8 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
217217
for case, opname in cases_and_opnames:
218218
# _LOAD_FAST_BORROW uses manual codegen in jit.c,
219219
# so skip stencil generation for its register variants.
220-
# AArch64 keeps stencils as fallback for huge opargs
221-
# (imm12 limit, oparg > 4085).
222-
# https://developer.arm.com/documentation/ddi0602/2024-06/Base-Instructions/LDR--immediate---Load-register--immediate--?lang=en
223220
if opname.startswith("_LOAD_FAST_BORROW_r"):
224-
if not self.triple.startswith("aarch64"):
225-
continue
221+
continue
226222
# Write out a copy of the template with *only* this case
227223
# inserted. This is about twice as fast as #include'ing all
228224
# of executor_cases.c.h each time we compile (since the C

0 commit comments

Comments
 (0)