Skip to content

Commit 083b83c

Browse files
author
Tim Hutt
committed
Add a lexer for RISC-V assembly
This adds a basic lexer for RISC-V assembly. Like all assembly (as far as I know) there isn't really a formal grammar, and compilers just kind of do whatever, so this is a best effort. There may be valid assembly it does not highlight properly. I have tested it on several random samples from the internet and it seems to be ok though. The included demo is from the RISC-V ISA manual: https://riscv-specs.timhutt.co.uk/spec/20240411/unpriv-isa-asciidoc.html#_sgemm_example
1 parent 3b461b1 commit 083b83c

4 files changed

Lines changed: 387 additions & 0 deletions

File tree

docs/Languages.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@
171171
- ReasonML (`reasonml`)
172172
- Rego (`rego`)
173173
- ReScript (`rescript`)
174+
- RISC-V Assembly (`riscvasm`)
174175
- RML (`rml`)
175176
- Robot Framework (`robot_framework`)
176177
- Ruby (`ruby`)

lib/rouge/demos/riscvasm

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
# RV64IDV system
2+
#
3+
# void
4+
# sgemm_nn(size_t n,
5+
# size_t m,
6+
# size_t k,
7+
# const float*a, // m * k matrix
8+
# size_t lda,
9+
# const float*b, // k * n matrix
10+
# size_t ldb,
11+
# float*c, // m * n matrix
12+
# size_t ldc)
13+
#
14+
# c += a*b (alpha=1, no transpose on input matrices)
15+
# matrices stored in C row-major order
16+
17+
#define n a0
18+
#define m a1
19+
#define k a2
20+
#define ap a3
21+
#define astride a4
22+
#define bp a5
23+
#define bstride a6
24+
#define cp a7
25+
#define cstride t0
26+
#define kt t1
27+
#define nt t2
28+
#define bnp t3
29+
#define cnp t4
30+
#define akp t5
31+
#define bkp s0
32+
#define nvl s1
33+
#define ccp s2
34+
#define amp s3
35+
36+
# Use args as additional temporaries
37+
#define ft12 fa0
38+
#define ft13 fa1
39+
#define ft14 fa2
40+
#define ft15 fa3
41+
42+
# This version holds a 16*VLMAX block of C matrix in vector registers
43+
# in inner loop, but otherwise does not do cache or TLB tiling.
44+
45+
sgemm_nn:
46+
addi sp, sp, -FRAMESIZE
47+
sd s0, OFFSET(sp)
48+
sd s1, OFFSET(sp)
49+
sd s2, OFFSET(sp)
50+
51+
# Check for zero size matrices
52+
beqz n, exit
53+
beqz m, exit
54+
beqz k, exit
55+
56+
# Convert elements strides to byte strides.
57+
ld cstride, OFFSET(sp) # Get arg from stack frame
58+
slli astride, astride, 2
59+
slli bstride, bstride, 2
60+
slli cstride, cstride, 2
61+
62+
slti t6, m, 16
63+
bnez t6, end_rows
64+
65+
c_row_loop: # Loop across rows of C blocks
66+
67+
mv nt, n # Initialize n counter for next row of C blocks
68+
69+
mv bnp, bp # Initialize B n-loop pointer to start
70+
mv cnp, cp # Initialize C n-loop pointer
71+
72+
c_col_loop: # Loop across one row of C blocks
73+
vsetvli nvl, nt, e32, ta, ma # 32-bit vectors, LMUL=1
74+
75+
mv akp, ap # reset pointer into A to beginning
76+
mv bkp, bnp # step to next column in B matrix
77+
78+
# Initialize current C submatrix block from memory.
79+
vle32.v v0, (cnp); add ccp, cnp, cstride;
80+
vle32.v v1, (ccp); add ccp, ccp, cstride;
81+
vle32.v v2, (ccp); add ccp, ccp, cstride;
82+
vle32.v v3, (ccp); add ccp, ccp, cstride;
83+
vle32.v v4, (ccp); add ccp, ccp, cstride;
84+
vle32.v v5, (ccp); add ccp, ccp, cstride;
85+
vle32.v v6, (ccp); add ccp, ccp, cstride;
86+
vle32.v v7, (ccp); add ccp, ccp, cstride;
87+
vle32.v v8, (ccp); add ccp, ccp, cstride;
88+
vle32.v v9, (ccp); add ccp, ccp, cstride;
89+
vle32.v v10, (ccp); add ccp, ccp, cstride;
90+
vle32.v v11, (ccp); add ccp, ccp, cstride;
91+
vle32.v v12, (ccp); add ccp, ccp, cstride;
92+
vle32.v v13, (ccp); add ccp, ccp, cstride;
93+
vle32.v v14, (ccp); add ccp, ccp, cstride;
94+
vle32.v v15, (ccp)
95+
96+
97+
mv kt, k # Initialize inner loop counter
98+
99+
# Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
100+
# Software pipeline loads
101+
flw ft0, (akp); add amp, akp, astride;
102+
flw ft1, (amp); add amp, amp, astride;
103+
flw ft2, (amp); add amp, amp, astride;
104+
flw ft3, (amp); add amp, amp, astride;
105+
# Get vector from B matrix
106+
vle32.v v16, (bkp)
107+
108+
# Loop on inner dimension for current C block
109+
k_loop:
110+
vfmacc.vf v0, ft0, v16
111+
add bkp, bkp, bstride
112+
flw ft4, (amp)
113+
add amp, amp, astride
114+
vfmacc.vf v1, ft1, v16
115+
addi kt, kt, -1 # Decrement k counter
116+
flw ft5, (amp)
117+
add amp, amp, astride
118+
vfmacc.vf v2, ft2, v16
119+
flw ft6, (amp)
120+
add amp, amp, astride
121+
flw ft7, (amp)
122+
vfmacc.vf v3, ft3, v16
123+
add amp, amp, astride
124+
flw ft8, (amp)
125+
add amp, amp, astride
126+
vfmacc.vf v4, ft4, v16
127+
flw ft9, (amp)
128+
add amp, amp, astride
129+
vfmacc.vf v5, ft5, v16
130+
flw ft10, (amp)
131+
add amp, amp, astride
132+
vfmacc.vf v6, ft6, v16
133+
flw ft11, (amp)
134+
add amp, amp, astride
135+
vfmacc.vf v7, ft7, v16
136+
flw ft12, (amp)
137+
add amp, amp, astride
138+
vfmacc.vf v8, ft8, v16
139+
flw ft13, (amp)
140+
add amp, amp, astride
141+
vfmacc.vf v9, ft9, v16
142+
flw ft14, (amp)
143+
add amp, amp, astride
144+
vfmacc.vf v10, ft10, v16
145+
flw ft15, (amp)
146+
add amp, amp, astride
147+
addi akp, akp, 4 # Move to next column of a
148+
vfmacc.vf v11, ft11, v16
149+
beqz kt, 1f # Don't load past end of matrix
150+
flw ft0, (akp)
151+
add amp, akp, astride
152+
1: vfmacc.vf v12, ft12, v16
153+
beqz kt, 1f
154+
flw ft1, (amp)
155+
add amp, amp, astride
156+
1: vfmacc.vf v13, ft13, v16
157+
beqz kt, 1f
158+
flw ft2, (amp)
159+
add amp, amp, astride
160+
1: vfmacc.vf v14, ft14, v16
161+
beqz kt, 1f # Exit out of loop
162+
flw ft3, (amp)
163+
add amp, amp, astride
164+
vfmacc.vf v15, ft15, v16
165+
vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
166+
j k_loop
167+
168+
1: vfmacc.vf v15, ft15, v16
169+
170+
# Save C matrix block back to memory
171+
vse32.v v0, (cnp); add ccp, cnp, cstride;
172+
vse32.v v1, (ccp); add ccp, ccp, cstride;
173+
vse32.v v2, (ccp); add ccp, ccp, cstride;
174+
vse32.v v3, (ccp); add ccp, ccp, cstride;
175+
vse32.v v4, (ccp); add ccp, ccp, cstride;
176+
vse32.v v5, (ccp); add ccp, ccp, cstride;
177+
vse32.v v6, (ccp); add ccp, ccp, cstride;
178+
vse32.v v7, (ccp); add ccp, ccp, cstride;
179+
vse32.v v8, (ccp); add ccp, ccp, cstride;
180+
vse32.v v9, (ccp); add ccp, ccp, cstride;
181+
vse32.v v10, (ccp); add ccp, ccp, cstride;
182+
vse32.v v11, (ccp); add ccp, ccp, cstride;
183+
vse32.v v12, (ccp); add ccp, ccp, cstride;
184+
vse32.v v13, (ccp); add ccp, ccp, cstride;
185+
vse32.v v14, (ccp); add ccp, ccp, cstride;
186+
vse32.v v15, (ccp)
187+
188+
# Following tail instructions should be scheduled earlier in free slots during C block save.
189+
# Leaving here for clarity.
190+
191+
# Bump pointers for loop across blocks in one row
192+
slli t6, nvl, 2
193+
add cnp, cnp, t6 # Move C block pointer over
194+
add bnp, bnp, t6 # Move B block pointer over
195+
sub nt, nt, nvl # Decrement element count in n dimension
196+
bnez nt, c_col_loop # Any more to do?
197+
198+
# Move to next set of rows
199+
addi m, m, -16 # Did 16 rows above
200+
slli t6, astride, 4 # Multiply astride by 16
201+
add ap, ap, t6 # Move A matrix pointer down 16 rows
202+
slli t6, cstride, 4 # Multiply cstride by 16
203+
add cp, cp, t6 # Move C matrix pointer down 16 rows
204+
205+
slti t6, m, 16
206+
beqz t6, c_row_loop
207+
208+
# Handle end of matrix with fewer than 16 rows.
209+
# Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
210+
end_rows:
211+
# Not done.
212+
213+
exit:
214+
ld s0, OFFSET(sp)
215+
ld s1, OFFSET(sp)
216+
ld s2, OFFSET(sp)
217+
addi sp, sp, FRAMESIZE
218+
ret

lib/rouge/lexers/riscvasm.rb

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# -*- coding: utf-8 -*- #
2+
# frozen_string_literal: true
3+
4+
# Note that like most assembly languages there's no proper grammar for RISC-V assembly.
5+
# It's pretty much "what do GCC and Clang accept". I recommend not trying to read
6+
# their source code because it's a complete mess.
7+
8+
module Rouge
  module Lexers
    # Best-effort lexer for RISC-V assembly source.
    #
    # The lexer is line oriented. The :root state recognises what can start a
    # statement (preprocessor directive, assembler directive, label,
    # instruction mnemonic) and pushes a dedicated state that consumes the
    # rest of the statement and pops again at the end of the line (or at `;`
    # for instruction operands).
    class RiscvAsm < RegexLexer
      title "RiscvAsm"
      desc "RISC-V assembly syntax"
      tag 'riscvasm'
      filenames '*.s', '*.S'

      # C preprocessor directives. These are only processed for .S files - not .s - however
      # the parsing is the same in both cases.
      #
      # @return [Array<String>] directive names without the leading `#`
      def self.preproc_directive
        @preproc_directive ||= %w(
          define elif else endif error if ifdef ifndef include line pragma undef warning
        )
      end

      # Standard register names, including ABI names.
      #
      # @return [Array<String>] integer (x*), float (f*), vector (v*) and ABI register names
      def self.register
        @register ||= %w(
          x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 x16 x17 x18 x19 x20 x21 x22 x23 x24 x25 x26 x27 x28 x29 x30 x31
          f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f23 f24 f25 f26 f27 f28 f29 f30 f31
          v0 v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23 v24 v25 v26 v27 v28 v29 v30 v31
          zero ra sp gp tp t0 t1 t2 s0 fp s1 a0 a1 a2 a3 a4 a5 a6 a7 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 t3 t4 t5 t6
          ft0 ft1 ft2 ft3 ft4 ft5 ft6 ft7 fs0 fs1 fa0 fa1 fa2 fa3 fa4 fa5 fa6 fa7 fs2 fs3 fs4 fs5 fs6 fs7 fs8 fs9 fs10 fs11 ft8 ft9 ft10 ft11
        )
      end

      # These keywords are used for some vector instructions (vsetvli etc.).
      #
      # Note that `v0.t` contains a regex metacharacter, so this list must be
      # escaped (e.g. via Regexp.union) before being embedded in a pattern.
      #
      # @return [Array<String>] keyword spellings
      def self.other_keyword
        @other_keyword ||= %w(
          e8 e16 e32 e64 mf8 mf4 mf2 m1 m2 m4 m8 ta tu ma mu v0.t
        )
      end

      # For %pcrel_hi(...) relocations etc.
      #
      # @return [Array<String>] relocation function names without the leading `%`
      def self.relocation_function
        @relocation_function ||= %w(
          hi lo
          pcrel_hi pcrel_lo
          tprel_hi tprel_lo
          tprel_add
          tls_ie_pcrel_hi
          tls_gd_pcrel_hi
          got_pcrel_hi
        )
      end

      state :comments_and_whitespace do
        # Don't eat newlines because those are significant.
        rule %r/[ \t]+/, Text::Whitespace
        rule %r((//|#).*), Comment::Single
        rule %r(/\*.*?\*/)m, Comment::Multiline
      end

      state :literals do
        # 1f, 2b forward/backward label references.
        rule %r/[0-9]+[fb]\b/, Name::Label

        # Float. RISC-V supposedly supports C float literals but I doubt
        # it really supports all the hex variants etc.
        # This is not quite accurate since you can have e.g. `.3`.
        #
        # This rule has to come before the integer rules: otherwise the
        # decimal rule matches the `1` of `1.5` (there is a word boundary
        # before the `.`) and the float rule can never fire.
        rule %r/\-?[0-9]+\.[0-9]*([eE]-?[0-9]+)?[fFlL]?\b/, Num::Float

        # Octal
        rule %r/\-?0[0-7]+\b/, Num::Oct
        # Binary
        rule %r/\-?0b[01]+\b/, Num::Bin
        # Hex
        rule %r/\-?0x[0-9a-fA-F]+\b/, Num::Hex
        # Decimal
        rule %r/\-?[0-9]+\b/, Num::Integer

        # Strings.
        rule %r/"(\\\\|\\"|[^"])*"/, Str::Double
        rule %r/'(\\\\|\\'|[^'])*'/, Str::Single
      end

      state :root do
        # Preprocessor directive. Awkwardly these are the same as single line comments.
        # It seems like GCC will silently ignore unknown directives so that comments
        # work - even for `.s` files. Yes that means if you have a typo like
        #
        #     #defien DISABLE_DEV_BACKDOOR 1
        #
        # Then it will silently ignore it!
        #
        # The group is non-capturing (`(?:`); it must wrap the whole
        # alternation so that `\b` applies to every directive name.
        rule %r/^[ \t]*#[ \t]*(?:#{RiscvAsm.preproc_directive.join('|')})\b/, Comment::Preproc, :preprocessor_directive

        mixin :comments_and_whitespace

        # End of line.
        rule %r/\n/, Text::Whitespace

        # Label. Checked before the directive rule so that dot-prefixed local
        # labels such as `.L1:` are not mistaken for assembler directives.
        # `\w` already covers purely numeric labels like `1:`.
        rule %r/[\w.]+:/, Name::Label

        # Assembly directive.
        rule %r/\.\w+/, Name::Attribute, :directive

        # Instruction or maybe macro call.
        rule %r/[\w\.]+\b/, Name::Builtin, :args
      end

      # Remainder of a C preprocessor directive line.
      state :preprocessor_directive do
        mixin :comments_and_whitespace
        mixin :literals

        # Escaped newline. This is one case where you can't parse
        # .S and .s the same - if you try to escape a newline in a
        # preprocessor directive in .S it will work but in .s it
        # will be ignored. Here we assume .S.
        rule %r/\\\n/, Text

        # `.` does not match a newline here, so the line end is left for
        # the pop rule below.
        rule %r/./, Text
        rule %r/\n/, Text, :pop!
      end

      # Remainder of an assembler directive line.
      state :directive do
        mixin :comments_and_whitespace
        mixin :literals

        rule %r/./, Text
        rule %r/\n/, Text, :pop!
      end

      # Operands of an instruction (or macro call).
      state :args do
        mixin :comments_and_whitespace
        mixin :literals

        # End of instruction.
        rule %r/[;\n]/, Text::Whitespace, :pop!

        # Other keywords. Tried before register names so that the mask
        # operand `v0.t` is matched as a whole instead of as register `v0`
        # followed by a stray `.t`. Regexp.union escapes the literal dot.
        rule %r/(?:#{Regexp.union(RiscvAsm.other_keyword)})\b/, Name::Constant
        # Register names.
        rule %r/(?:#{RiscvAsm.register.join('|')})\b/, Name::Constant
        # Relocations
        rule %r/%(?:#{RiscvAsm.relocation_function.join('|')})\b/, Name::Builtin

        # Operators
        rule %r/[-~*\/%<>|&\^!+(),]/, Operator
        # Variables. `.` is accepted so that references to dot-prefixed
        # symbols (e.g. `j .L3`) lex as one name; a leading backslash
        # covers macro arguments.
        rule %r/\\?[.$\w]+/, Name::Variable
      end
    end
  end
end

spec/lexers/riscvasm_spec.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*- #
2+
# frozen_string_literal: true
3+
4+
describe Rouge::Lexers::RiscvAsm do
  # Lexer instance under test.
  let(:subject) { Rouge::Lexers::RiscvAsm.new }

  describe 'guessing' do
    include Support::Guessing

    # A lower-case `.s` extension should select this lexer.
    it 'guesses by filename' do
      assert_guess filename: 'foo.s'
    end
  end
end

0 commit comments

Comments
 (0)