From 24d19f6e8c31c038345fb33d3a957fe990222ea2 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss@ledger.fr>
Date: Fri, 4 Apr 2025 17:54:31 +0200
Subject: [PATCH] Add eBPF ISA v4 instructions

In 2023, the eBPF instruction set was modified to add several
instructions related to signed operations (load with sign-extension,
signed division, etc.), a 32-bit jump instruction and some byte-swap
instructions. This became version 4 of eBPF ISA.

Here are some references about this change:

- https://pchaigno.github.io/bpf/2021/10/20/ebpf-instruction-sets.html
  (a blog post about eBPF instruction set extensions)
- https://lore.kernel.org/bpf/4bfe98be-5333-1c7e-2f6d-42486c8ec039@meta.com/
  (documentation sent to Linux Kernel mailing list)
- https://www.rfc-editor.org/rfc/rfc9669.html#name-sign-extension-load-operati
  (IETF's BPF Instruction Set Architecture standard defined the new
  instructions)
- https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/bpf/core.c?h=v6.14#n1859
  (implementation of signed division and remainder in Linux kernel.
  This shows that 32-bit signed DIV and signed MOD are zero-extending
  the result in DST)
- https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/bpf/core.c?h=v6.14#n2135
  (implementation of signed memory load in Linux kernel)
- https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1f9a1ea821ff25353a0e80d971e7958cd55b47a3
  (commit which added signed memory load instructions in Linux kernel)

This can be tested with a recent enough version of clang and LLVM (this
works with clang 19.1.4 on Alpine 3.21).
For example for signed memory load instructions:

    signed int sext_8bit(signed char x) {
        return x;
    }

produces:

    $ clang -O0 -target bpf -mcpu=v4 -c test.c -o test.ebpf
    $ llvm-objdump -rd test.ebpf
    ...
    0000000000000000 <sext_8bit>:
           0:  73 1a ff ff 00 00 00 00  *(u8 *)(r10 - 0x1) = r1
           1:  91 a1 ff ff 00 00 00 00  r1 = *(s8 *)(r10 - 0x1)
           2:  bc 10 00 00 00 00 00 00  w0 = w1
           3:  95 00 00 00 00 00 00 00  exit

(The second instruction is a signed memory load)

Instruction MOVS (Sign extend register MOV) uses offset to encode the
conversion (whether the source register is to be considered as signed
8-bit, 16-bit or 32-bit integer). The mnemonic for these instructions is
quite unclear:

- They are all named MOVS in the proposal
  https://lore.kernel.org/bpf/4bfe98be-5333-1c7e-2f6d-42486c8ec039@meta.com/
- LLVM and Linux disassemblers only display pseudo-code (`r0 = (s8)r1`)
- RFC 9669 (https://datatracker.ietf.org/doc/rfc9669/) uses MOVSX for
  all instructions.
- GCC uses MOVS for all instructions:
  https://github.com/gcc-mirror/gcc/blob/releases/gcc-14.1.0/gcc/config/bpf/bpf.md?plain=1#L326-L365

To make the disassembled code clearer, decode such instructions with a
size suffix: MOVSB, MOVSH, MOVSW.

The decoding of instructions 32-bit JA, BSWAP16, BSWAP32 and BSWAP64 is
straightforward.
---
 .../Processors/eBPF/data/languages/eBPF.sinc  | 54 ++++++++++++++++---
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/Ghidra/Processors/eBPF/data/languages/eBPF.sinc b/Ghidra/Processors/eBPF/data/languages/eBPF.sinc
index 785f104192..dd0e741745 100644
--- a/Ghidra/Processors/eBPF/data/languages/eBPF.sinc
+++ b/Ghidra/Processors/eBPF/data/languages/eBPF.sinc
@@ -65,7 +65,10 @@ SRC4: imm is imm & op_alu_jmp_source=0 { export *[const]:4 imm; }
 
 DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 
-:MOV dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0xb & op_insn_class=0x7 { dst = SRC8; }
+:MOV dst, SRC8  is SRC8 & dst & off=0 & op_alu_jmp_opcode=0xb & op_insn_class=0x7 { dst = SRC8; }
+:MOVSB dst, src  is src & dst & off=8 & op_alu_jmp_opcode=0xb & op_alu_jmp_source=1 & op_insn_class=0x7 { dst = sext(src:1); }
+:MOVSH dst, src  is src & dst & off=16 & op_alu_jmp_opcode=0xb & op_alu_jmp_source=1 & op_insn_class=0x7 { dst = sext(src:2); }
+:MOVSW dst, src  is src & dst & off=32 & op_alu_jmp_opcode=0xb & op_alu_jmp_source=1 & op_insn_class=0x7 { dst = sext(src:4); }
 
 :ADD dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0x0 & op_insn_class=0x7 { dst = dst + SRC8; }
 
@@ -73,7 +76,8 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 
 :MUL dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0x2 & op_insn_class=0x7 { dst = dst * SRC8; }
 
-:DIV dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0x3 & op_insn_class=0x7 { dst = dst / SRC8; }
+:DIV dst, SRC8  is SRC8 & dst & off=0 & op_alu_jmp_opcode=0x3 & op_insn_class=0x7 { dst = dst / SRC8; }
+:SDIV dst, SRC8  is SRC8 & dst & off=1 & op_alu_jmp_opcode=0x3 & op_insn_class=0x7 { dst = dst s/ SRC8; }
 
 :OR dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0x4 & op_insn_class=0x7 { dst = dst | SRC8; }
 
@@ -85,7 +89,8 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 
 :NEG dst  is dst & op_alu_jmp_opcode=0x8 & op_alu_jmp_source=0 & op_insn_class=0x7 { dst = -dst; }
 
-:MOD dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0x9 & op_insn_class=0x7 { dst = dst % SRC8; }
+:MOD dst, SRC8  is SRC8 & dst & off=0 & op_alu_jmp_opcode=0x9 & op_insn_class=0x7 { dst = dst % SRC8; }
+:SMOD dst, SRC8  is SRC8 & dst & off=1 & op_alu_jmp_opcode=0x9 & op_insn_class=0x7 { dst = dst s% SRC8; }
 
 :XOR dst, SRC8  is SRC8 & dst & op_alu_jmp_opcode=0xa & op_insn_class=0x7 { dst = dst ^ SRC8; }
 
@@ -95,7 +100,9 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 #BPF_ALU
 ###############################################################################
 
-:MOV dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0xb & op_insn_class=0x4 { dst = zext(SRC4); }
+:MOV dst, SRC4  is SRC4 & dst & off=0 & op_alu_jmp_opcode=0xb & op_insn_class=0x4 { dst = zext(SRC4); }
+:MOVSB dst, src  is src & dst & off=8 & op_alu_jmp_opcode=0xb & op_alu_jmp_source=1 & op_insn_class=0x4 { local tmp:4 = sext(src:1); dst = zext(tmp); }
+:MOVSH dst, src  is src & dst & off=16 & op_alu_jmp_opcode=0xb & op_alu_jmp_source=1 & op_insn_class=0x4 { local tmp:4 = sext(src:2); dst = zext(tmp); }
 
 :ADD dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0x0 & op_insn_class=0x4 { dst = zext(dst:4 + SRC4); }
 
@@ -103,7 +110,8 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 
 :MUL dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0x2 & op_insn_class=0x4 { dst = zext(dst:4 * SRC4); }
 
-:DIV dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0x3 & op_insn_class=0x4 { dst = zext(dst:4 / SRC4); }
+:DIV dst, SRC4  is SRC4 & dst & off=0 & op_alu_jmp_opcode=0x3 & op_insn_class=0x4 { dst = zext(dst:4 / SRC4); }
+:SDIV dst, SRC4  is SRC4 & dst & off=1 & op_alu_jmp_opcode=0x3 & op_insn_class=0x4 { dst = zext(dst:4 s/ SRC4); }
 
 :OR dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0x4 & op_insn_class=0x4 { dst = zext(dst:4 | SRC4); }
 
@@ -115,7 +123,8 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 
 :NEG dst  is dst & op_alu_jmp_opcode=0x8 & op_alu_jmp_source=0 & op_insn_class=0x4 { dst = zext(-dst:4); }
 
-:MOD dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0x9 & op_insn_class=0x4 { dst = zext(dst:4 % SRC4); }
+:MOD dst, SRC4  is SRC4 & dst & off=0 & op_alu_jmp_opcode=0x9 & op_insn_class=0x4 { dst = zext(dst:4 % SRC4); }
+:SMOD dst, SRC4  is SRC4 & dst & off=1 & op_alu_jmp_opcode=0x9 & op_insn_class=0x4 { dst = zext(dst:4 s% SRC4); }
 
 :XOR dst, SRC4  is SRC4 & dst & op_alu_jmp_opcode=0xa & op_insn_class=0x4 { dst = zext(dst:4 ^ SRC4); }
 
@@ -173,6 +182,24 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 :BE64 dst  is imm=0x40 & dst & op_alu_jmp_opcode=0xd & op_alu_jmp_source=1 & op_insn_class=0x4 {}
 @endif # ENDIAN = "big"
 
+# BPF_ALU64   | BPF_K   | BPF_END
+:BSWAP16 dst  is imm=0x10 & dst & op_alu_jmp_opcode=0xd & op_alu_jmp_source=0 & op_insn_class=0x7 {
+    dst = ((dst & 0xff00) >> 8) | ((dst & 0x00ff) << 8);
+}
+:BSWAP32 dst  is imm=0x20 & dst & op_alu_jmp_opcode=0xd & op_alu_jmp_source=0 & op_insn_class=0x7 {
+    dst = ((dst & 0xff000000) >> 24) | (((dst) & 0x00ff0000) >> 8) | (((dst) & 0x0000ff00) << 8) | ((dst & 0x000000ff) << 24);
+}
+:BSWAP64 dst  is imm=0x40 & dst & op_alu_jmp_opcode=0xd & op_alu_jmp_source=0 & op_insn_class=0x7 {
+    dst = ((dst << 56) & 0xff00000000000000) |
+        ((dst << 40) & 0x00ff000000000000) |
+        ((dst << 24) & 0x0000ff0000000000) |
+        ((dst <<  8) & 0x000000ff00000000) |
+        ((dst >>  8) & 0x00000000ff000000) |
+        ((dst >> 24) & 0x0000000000ff0000) |
+        ((dst >> 40) & 0x000000000000ff00) |
+        ((dst >> 56) & 0x00000000000000ff);
+}
+
 #Memory instructions - Load and Store
 ###############################################################################
 
@@ -231,6 +258,12 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 
 :STXDW [dst + off], src  is off & src & dst & op_ld_st_mode=0x3 & op_ld_st_size=0x3 & op_insn_class=0x3 { *:8 (dst + off)=src:8; }
 
+:LDSXW dst, [src + off]  is off & src & dst & op_ld_st_mode=0x4 & op_ld_st_size=0x0 & op_insn_class=0x1 { dst = sext(*:4 (src + off)); }
+
+:LDSXH dst, [src + off]  is off & src & dst & op_ld_st_mode=0x4 & op_ld_st_size=0x1 & op_insn_class=0x1 { dst = sext(*:2 (src + off)); }
+
+:LDSXB dst, [src + off]  is off & src & dst & op_ld_st_mode=0x4 & op_ld_st_size=0x2 & op_insn_class=0x1 { dst = sext(*:1 (src + off)); }
+
 # BPF_ATOMIC
 # BPF_ADD:
 
@@ -350,6 +383,7 @@ DST4: dst is dst { local tmp:4 = dst:4; export tmp; }
 ###############################################################################
 
 joff: reloc  is off [ reloc = inst_next + off * 8; ] { export *:8 reloc; }
+jimm: reloc  is imm [ reloc = inst_next + imm * 8; ] { export *:8 reloc; }
 
 cond: "EQ" is op_alu_jmp_opcode=0x1 & op_insn_class=0x5 & dst & SRC8 { local cmp = dst  == SRC8; export cmp; }
 cond: "EQ" is op_alu_jmp_opcode=0x1 & op_insn_class=0x6 & DST4 & SRC4 { local cmp = DST4 == SRC4; export cmp; }
@@ -376,11 +410,15 @@ cond: "SLE" is op_alu_jmp_opcode=0xd & op_insn_class=0x5 & dst & SRC8 { local cm
 cond: "SLE" is op_alu_jmp_opcode=0xd & op_insn_class=0x6 & DST4 & SRC4 { local cmp = DST4 s<= SRC4; export cmp; }
 
 
-:JA joff  is joff & op_alu_jmp_opcode=0x0 & op_alu_jmp_source=0 & op_insn_class=0x5 {	
+:JA joff  is joff & op_alu_jmp_opcode=0x0 & op_alu_jmp_source=0 & op_insn_class=0x5 {
     goto joff;
 }
 
-:J^cond dst, SRC8, joff  is joff & SRC8 & dst & cond {	  
+:JA jimm  is jimm & op_alu_jmp_opcode=0x0 & op_alu_jmp_source=0 & op_insn_class=0x6 {
+    goto jimm;
+}
+
+:J^cond dst, SRC8, joff  is joff & SRC8 & dst & cond {
     if (cond) goto joff;
 }