@@ -592,18 +592,11 @@ combine_symbol_mask(const symbol_mask src, symbol_mask dest)
592592
593593// Decode a _LOAD_FAST_BORROW* opcode into register variant and oparg.
594594// Returns 1 if the opcode is a _LOAD_FAST_BORROW variant, 0 otherwise.
595- // On AArch64, falls back to stencil for oparg > 4085 (imm12 limit).
596- // https://developer.arm.com/documentation/ddi0602/2024-06/Base-Instructions/LDR--immediate---Load-register--immediate--?lang=en
597595static int
598596_decode_load_fast_borrow (uint16_t opcode , uint16_t insn_oparg ,
599597 int * reg_variant , int * oparg )
600598{
601599 if (opcode >= _LOAD_FAST_BORROW_r01 && opcode <= _LOAD_FAST_BORROW_r23 ) {
602- #if defined(__aarch64__ ) || defined(_M_ARM64 )
603- if (insn_oparg > 4085 ) {
604- return 0 ;
605- }
606- #endif
607600 * reg_variant = opcode - _LOAD_FAST_BORROW_r01 ;
608601 * oparg = insn_oparg ;
609602 return 1 ;
@@ -613,34 +606,52 @@ _decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
613606
614607#if defined(__aarch64__ ) || defined(_M_ARM64 )
615608
616- // AArch64: ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes, no data)
617- // preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
609+ // AArch64: preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
610+ // Small oparg (imm12 fits): ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes)
611+ // Large oparg: mov w8, #off ; ldr x8, [x21, x8] ; orr xDST, x8, #1 (12 bytes)
612+ // https://developer.arm.com/documentation/ddi0602/2024-06/Base-Instructions/LDR--immediate---Load-register--immediate--?lang=en
613+
614+ static const uint32_t _aarch64_cache_regs [3 ] = {24 , 25 , 26 };
618615
619616static int
620617_load_fast_borrow_code_size (int oparg )
621618{
622- (void )oparg ;
623- return 8 ;
619+ uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
620+ + (unsigned )oparg * sizeof (_PyStackRef ));
621+ uint32_t imm12 = byte_offset >> 3 ;
622+ return imm12 < 4096 ? 8 : 12 ;
624623}
625624
626- static const uint32_t _aarch64_cache_regs [3 ] = {24 , 25 , 26 };
627-
628625static void
629626_emit_load_fast_borrow (unsigned char * code , int reg_variant , int oparg )
630627{
631628 uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
632629 + (unsigned )oparg * sizeof (_PyStackRef ));
633630 assert (byte_offset % 8 == 0 );
631+ uint32_t dst = _aarch64_cache_regs [reg_variant ];
634632 uint32_t imm12 = byte_offset >> 3 ;
635- assert (imm12 < 4096 );
636633
637- // ldr x8, [x21, #byte_offset]
638- uint32_t ldr = 0xF9400000 | (imm12 << 10 ) | (21 << 5 ) | 8 ;
639- // orr xDST, x8, #0x1
640- uint32_t orr = 0xB2400000 | (8 << 5 ) | _aarch64_cache_regs [reg_variant ];
634+ if (imm12 < 4096 ) {
635+ // ldr x8, [x21, #byte_offset]
636+ uint32_t ldr = 0xF9400000 | (imm12 << 10 ) | (21 << 5 ) | 8 ;
637+ // orr xDST, x8, #0x1
638+ uint32_t orr = 0xB2400000 | (8 << 5 ) | dst ;
641639
642- memcpy (code , & ldr , 4 );
643- memcpy (code + 4 , & orr , 4 );
640+ memcpy (code , & ldr , 4 );
641+ memcpy (code + 4 , & orr , 4 );
642+ }
643+ else {
644+ // mov w8, #byte_offset
645+ uint32_t mov = 0x52800000 | ((byte_offset & 0xFFFF ) << 5 ) | 8 ;
646+ // ldr x8, [x21, x8]
647+ uint32_t ldr = 0xF8686AA8 ;
648+ // orr xDST, x8, #0x1
649+ uint32_t orr = 0xB2400000 | (8 << 5 ) | dst ;
650+
651+ memcpy (code , & mov , 4 );
652+ memcpy (code + 4 , & ldr , 4 );
653+ memcpy (code + 8 , & orr , 4 );
654+ }
644655}
645656
646657#elif defined(__x86_64__ ) || defined(_M_X64 )
0 commit comments