Compilation strategy, prototype JIT

2017-03-09 22:21:50 -07:00 · 2017-03-09 22:21:50 -07:00 · 59a8f58fcd
parent 00e4962605
commit 59a8f58fcd
2 changed files with 182 additions and 13 deletions
--- a/README.md
+++ b/README.md
@ -177,20 +177,31 @@ assembly for `simple.c` to see how it works:
 $ gcc -S simple.c
 ```
-Edited/annotated highlights from the assembly `simple.s` (whose floating-point
+Edited/annotated highlights from the assembly `simple.s`, which is much more
-code is a little circuitous):
+complicated than what we'll end up generating:
 ```s
 interpret:
        // The stack contains local variables referenced to the "base pointer"
        // stored in hardware register %rbp. Here's the layout:
        //
        //   double i  = -8(%rbp)
        //   double r  = -16(%rbp)
        //   src       = -24(%rbp)
        //   dst       = -32(%rbp)
        //   registers = -40(%rbp)      <- comes in as an argument in %rdi
        //   code      = -48(%rbp)      <- comes in as an argument in %rsi
        pushq   %rbp
        movq    %rsp, %rbp              // standard x86-64 function header
-        subq    $48, %rsp               // allocate space for local variables
+        subq    $48, %rsp               // allocate space for six local vars
-        movq    %rdi, -40(%rbp)         // callee saves %rsi and %rdi
+        movq    %rdi, -40(%rbp)         // registers arg -> local var
-        movq    %rsi, -48(%rbp)
+        movq    %rsi, -48(%rbp)         // code arg -> local var
-        jmp     for_loop_condition
+        jmp     for_loop_condition      // commence loopage
 for_loop_body:
-        <a bunch of stuff>
+        // (a bunch of stuff to set up *src and *dst)
        cmpl    $43, %eax               // case '+'
        je      add_branch
        cmpl    $61, %eax               // case '='
@ -272,11 +283,125 @@ for_loop_step:
        addq    $3, -48(%rbp)
 for_loop_condition:
-        movq    -48(%rbp), %rax
+        movq    -48(%rbp), %rax         // %rax = code (the pointer)
-        movzbl  (%rax), %eax
+        movzbl  (%rax), %eax            // %eax = *code (move one byte)
-        testb   %al, %al
+        testb   %al, %al                // is %eax 0?
-        jne     .L8
+        jne     for_loop_body           // if no, then continue
-        nop
+
-        leave                           // reset %rsp
+        leave                           // otherwise rewind stack
        ret                             // pop and jmp
 ```
 #### Compilation strategy
 Most of the above is register-shuffling fluff that we can get rid of. We're
 compiling the code up front, which means all of our register addresses are
 known quantities and we won't need any unknown indirection at runtime. So all
 of the shuffling into and out of `%rax` can be replaced by a much simpler move
 directly to or from `N(%rdi)` -- since `%rdi` is the argument that points to
 the first register's real component.
 If you haven't already, at this point I'd recommend downloading the [Intel
 software developer's
 manual](https://software.intel.com/en-us/articles/intel-sdm), of which volume 2
 describes the semantics and machine code representation of every instruction.
 **NOTE:** GCC uses AT&T assembly syntax, whereas the Intel manuals use Intel
 assembly syntax. An important difference is that AT&T reverses the arguments:
 `mov %rax, %rbx` (AT&T syntax) assigns to `%rbx`, whereas `mov rax, rbx` (Intel
 syntax) assigns to `rax`. All of my code examples use AT&T, and none of this
 will matter once we're working with machine code.
 ##### Example: the Mandelbrot function `*bb+ab`
 ```s
 // Step 1: multiply register B by itself
 movsd 16(%rdi), %xmm0                   // %xmm0 = b.r
 movsd 24(%rdi), %xmm1                   // %xmm1 = b.i
 movsd 16(%rdi), %xmm2                   // %xmm2 = b.r
 movsd 24(%rdi), %xmm3                   // %xmm3 = b.i
 movsd %xmm0, %xmm4                      // %xmm4 = b.r
 mulsd %xmm2, %xmm4                      // %xmm4 = b.r*b.r
 movsd %xmm1, %xmm5                      // %xmm5 = b.i
 mulsd %xmm3, %xmm5                      // %xmm5 = b.i*b.i
 subsd %xmm5, %xmm4                      // %xmm4 = b.r*b.r - b.i*b.i
 movsd %xmm4, 16(%rdi)                   // b.r = %xmm4
 mulsd %xmm0, %xmm3                      // %xmm3 = b.r*b.i
 mulsd %xmm1, %xmm2                      // %xmm2 = b.i*b.r
 addsd %xmm3, %xmm2                      // %xmm2 = b.r*b.i + b.i*b.r
 movsd %xmm2, 24(%rdi)                   // b.i = %xmm2
 // Step 2: add register A to register B
 // SSE arithmetic can only write to an %xmm register, so we load both
 // operands, add in a register, and store the result back. movupd is the
 // unaligned packed move, so no 16-byte alignment of (%rdi) is required.
 movupd (%rdi), %xmm0                    // %xmm0 = (a.r, a.i)
 movupd 16(%rdi), %xmm1                  // %xmm1 = (b.r, b.i)
 addpd %xmm0, %xmm1                      // %xmm1 += %xmm0
 movupd %xmm1, 16(%rdi)                  // (b.r, b.i) = %xmm1
 ```
 The multiplication code isn't optimized for the squaring-a-register use case;
 instead, I left it fully general so we can use it as a template when we start
 generating machine code.
 ### JIT mechanics
 Rather than compiling a real language, let's just get a basic JIT setup.
 ```c
 // jitproto.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 // Type of the function we generate at runtime: takes one long, returns a long.
 typedef long(*fn)(long);
 fn compile_identity(void) {
  // Allocate some memory and set its permissions correctly. In particular, we
  // need PROT_EXEC (which isn't normally enabled for data memory, e.g. from
  // malloc()), which tells the processor it's ok to execute it as machine
  // code.
  char *memory = mmap(NULL,             // address
                      4096,             // size
                      PROT_READ | PROT_WRITE | PROT_EXEC,
                      MAP_PRIVATE | MAP_ANONYMOUS,
                      -1,               // fd (not used here)
                      0);               // offset (not used here)
  if (!memory) {
    perror("failed to allocate memory");
    exit(1);
  }
  int i = 0;
  // mov %rdi, %rax
  memory[i++] = 0x48;           // REX.W prefix
  memory[i++] = 0x8b;           // MOV opcode, register/register
  memory[i++] = 0xc7;           // MOD/RM byte for %rdi -> %rax
  // ret
  memory[i++] = 0xc3;           // RET opcode
  return (long(*)(long)) memory;
 }
 int main() {
  fn f = compile_identity();
  int i;
  for (i = 0; i < 10; ++i)
    printf("f(%d) = %ld\n", i, (*f)(i));
  munmap((void*) f, 4096);
  return 0;
 }
 ```
 This does what we expect: we've just produced an identity function.
 ```sh
 $ gcc jitproto.c -o jitproto
 $ ./jitproto
 f(0) = 0
 f(1) = 1
 f(2) = 2
 f(3) = 3
 f(4) = 4
 f(5) = 5
 f(6) = 6
 f(7) = 7
 f(8) = 8
 f(9) = 9
 ```
--- a/jitproto.c
+++ b/jitproto.c
@ -0,0 +1,44 @@
 // jitproto.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 // Type of the function we generate at runtime: takes one long, returns a long.
 typedef long(*fn)(long);
 fn compile_identity(void) {
  // Allocate some memory and set its permissions correctly. In particular, we
  // need PROT_EXEC (which isn't normally enabled for data memory, e.g. from
  // malloc()), which tells the processor it's ok to execute it as machine
  // code.
  char *memory = mmap(NULL,             // address
                      4096,             // size
                      PROT_READ | PROT_WRITE | PROT_EXEC,
                      MAP_PRIVATE | MAP_ANONYMOUS,
                      -1,               // fd (not used here)
                      0);               // offset (not used here)
  if (!memory) {
    perror("failed to allocate memory");
    exit(1);
  }
  int i = 0;
  // mov %rdi, %rax
  memory[i++] = 0x48;           // REX.W prefix
  memory[i++] = 0x8b;           // MOV opcode, register/register
  memory[i++] = 0xc7;           // MOD/RM byte for %rsi -> %rax
  // ret
  memory[i++] = 0xc3;           // RET opcode
  return (long(*)(long)) memory;
 }
 int main() {
  fn f = compile_identity();
  int i;
  for (i = 0; i < 10; ++i)
    printf("f(%d) = %ld\n", i, (*f)(i));
  munmap((void*) f, 4096);
  return 0;
 }