From a79ecb465348df55551efbc5e227b499463281cd Mon Sep 17 00:00:00 2001
From: Spencer Tipping <spencer.tipping@gmail.com>
Date: Thu, 9 Mar 2017 19:48:17 -0700
Subject: [PATCH] WIP describing the jit stuff

---
 README.md | 117 ++++++++++++++++++++++
 simple.s  | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 410 insertions(+)
 create mode 100644 simple.s
diff --git a/README.md b/README.md
index d4d1691..e574b6f 100644
--- a/README.md
+++ b/README.md
@@ -163,3 +163,120 @@ user	0m1.328s
 sys	0m0.000s
 $
 ```
+
+### JIT design
+The basic strategy is to replace `interpret(registers, code)` with a function
+`compile(code)` that returns a pointer to a function whose signature is this:
+`void compiled(registers*)`. The memory for the function needs to be allocated
+using `mmap` so we can set permission for the processor to execute it.
+
+The easiest way to start with something like this is probably to emit the
+assembly for `simple.c` to see how it works:
+
+```sh
+$ gcc -S simple.c
+```
+
+Edited/annotated highlights from the assembly `simple.s` (whose floating-point
+code is a little circuitous):
+
+```s
+interpret:
+	pushq	%rbp
+	movq	%rsp, %rbp              // standard x86-64 function header
+	subq	$48, %rsp               // allocate space for local variables
+	movq	%rdi, -40(%rbp)         // spill the arguments to the stack: %rdi = registers
+	movq	%rsi, -48(%rbp)         // ... and %rsi = code
+	jmp	for_loop_condition
+
+for_loop_body:
+	<a bunch of stuff>
+	cmpl	$43, %eax               // case '+'
+	je	add_branch
+	cmpl	$61, %eax               // case '='
+	je	assign_branch
+	cmpl	$42, %eax               // case '*'
+	je	mult_branch
+	jmp	switch_default          // default
+
+assign_branch:
+        // the "bunch of stuff" above calculated the pointers src and dst, which
+        // are stored in -24(%rbp) and -32(%rbp).
+	movq	-24(%rbp), %rax         // %rax = src
+	movsd	(%rax), %xmm0           // %xmm0 = src.r
+	movq	-32(%rbp), %rax         // %rax = dst
+	movsd	%xmm0, (%rax)           // dst.r = %xmm0
+
+	movq	-24(%rbp), %rax         // %rax = src
+	movsd	8(%rax), %xmm0          // %xmm0 = src.i
+	movq	-32(%rbp), %rax         // %rax = dst
+	movsd	%xmm0, 8(%rax)          // dst.i = %xmm0
+
+	jmp	for_loop_step
+
+add_branch:
+	movq	-32(%rbp), %rax         // %rax = dst
+	movsd	(%rax), %xmm1           // %xmm1 = dst.r
+	movq	-24(%rbp), %rax         // %rax = src
+	movsd	(%rax), %xmm0           // %xmm0 = src.r
+	addsd	%xmm1, %xmm0            // %xmm0 += %xmm1
+	movq	-32(%rbp), %rax         // %rax = dst
+	movsd	%xmm0, (%rax)           // dst.r = %xmm0
+
+	movq	-32(%rbp), %rax         // same thing for src.i and dst.i
+	movsd	8(%rax), %xmm1
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm0
+	addsd	%xmm1, %xmm0
+	movq	-32(%rbp), %rax
+	movsd	%xmm0, 8(%rax)
+
+	jmp	for_loop_step
+
+mult_branch:
+	movq	-32(%rbp), %rax
+	movsd	(%rax), %xmm1
+	movq	-24(%rbp), %rax
+	movsd	(%rax), %xmm0
+	mulsd	%xmm1, %xmm0
+	movq	-32(%rbp), %rax
+	movsd	8(%rax), %xmm2
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm1
+	mulsd	%xmm2, %xmm1
+	subsd	%xmm1, %xmm0
+	movsd	%xmm0, -16(%rbp)        // double r = src.r*dst.r - src.i*dst.i
+
+	movq	-32(%rbp), %rax
+	movsd	(%rax), %xmm1
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm0
+	mulsd	%xmm0, %xmm1
+	movq	-32(%rbp), %rax
+	movsd	8(%rax), %xmm2
+	movq	-24(%rbp), %rax
+	movsd	(%rax), %xmm0
+	mulsd	%xmm2, %xmm0
+	addsd	%xmm1, %xmm0
+	movsd	%xmm0, -8(%rbp)         // double i = src.r*dst.i + src.i*dst.r
+
+	movq	-32(%rbp), %rax
+	movsd	-16(%rbp), %xmm0
+	movsd	%xmm0, (%rax)           // dst.r = r
+	movq	-32(%rbp), %rax
+	movsd	-8(%rbp), %xmm0
+	movsd	%xmm0, 8(%rax)          // dst.i = i
+	jmp	for_loop_step
+
+for_loop_step:
+	addq	$3, -48(%rbp)
+
+for_loop_condition:
+	movq	-48(%rbp), %rax
+	movzbl	(%rax), %eax
+	testb	%al, %al
+	jne	for_loop_body
+	nop
+	leave                           // restore %rsp and %rbp
+	ret                             // pop and jmp
+```
diff --git a/simple.s b/simple.s
new file mode 100644
index 0000000..5e025f9
--- /dev/null
+++ b/simple.s
@@ -0,0 +1,293 @@
+	.file	"simple.c"
+	.section	.rodata
+	.align 8
+.LC0:
+	.string	"undefined instruction %s (ASCII %x)\n"
+	.text
+	.globl	interpret
+	.type	interpret, @function
+interpret:	# interpret(registers = %rdi, code = %rsi): run 3-byte instructions until a zero opcode byte
+.LFB2:
+	.cfi_startproc
+	pushq	%rbp	# save the previous frame pointer
+	.cfi_def_cfa_offset 16
+	.cfi_offset 6, -16
+	movq	%rsp, %rbp	# set up this frame
+	.cfi_def_cfa_register 6
+	subq	$48, %rsp	# 48 bytes of locals
+	movq	%rdi, -40(%rbp)	# -40(%rbp) = registers (first argument)
+	movq	%rsi, -48(%rbp)	# -48(%rbp) = code (second argument)
+	jmp	.L2	# enter the loop at its condition
+.L8:	# loop body: decode one instruction
+	movq	-48(%rbp), %rax
+	addq	$2, %rax
+	movzbl	(%rax), %eax	# %al = code[2]
+	movsbq	%al, %rax	# sign-extend the byte to 64 bits
+	salq	$4, %rax	# * 16 (each register entry is 16 bytes)
+	leaq	-1552(%rax), %rdx	# - 1552 = 97 * 16, so byte 97 (ASCII 'a') selects entry 0
+	movq	-40(%rbp), %rax
+	addq	%rdx, %rax
+	movq	%rax, -32(%rbp)	# dst = registers + 16 * (code[2] - 97)
+	movq	-48(%rbp), %rax
+	addq	$1, %rax
+	movzbl	(%rax), %eax	# %al = code[1]
+	movsbq	%al, %rax
+	salq	$4, %rax
+	leaq	-1552(%rax), %rdx
+	movq	-40(%rbp), %rax
+	addq	%rdx, %rax
+	movq	%rax, -24(%rbp)	# src = registers + 16 * (code[1] - 97)
+	movq	-48(%rbp), %rax
+	movzbl	(%rax), %eax	# %al = code[0], the opcode
+	movsbl	%al, %eax
+	cmpl	$43, %eax	# 43 = '+'
+	je	.L4
+	cmpl	$61, %eax	# 61 = '='
+	je	.L5
+	cmpl	$42, %eax	# 42 = '*'
+	je	.L6
+	jmp	.L9	# anything else: report and exit
+.L5:	# '=': copy both doubles of *src into *dst
+	movq	-24(%rbp), %rax
+	movsd	(%rax), %xmm0	# %xmm0 = double at src+0
+	movq	-32(%rbp), %rax
+	movsd	%xmm0, (%rax)	# store to dst+0
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm0	# %xmm0 = double at src+8
+	movq	-32(%rbp), %rax
+	movsd	%xmm0, 8(%rax)	# store to dst+8
+	jmp	.L7
+.L4:	# '+': add the doubles of *src into *dst, component by component
+	movq	-32(%rbp), %rax
+	movsd	(%rax), %xmm1	# %xmm1 = double at dst+0
+	movq	-24(%rbp), %rax
+	movsd	(%rax), %xmm0	# %xmm0 = double at src+0
+	addsd	%xmm1, %xmm0	# %xmm0 = src+0 value plus dst+0 value
+	movq	-32(%rbp), %rax
+	movsd	%xmm0, (%rax)	# store the sum to dst+0
+	movq	-32(%rbp), %rax
+	movsd	8(%rax), %xmm1	# same again for the doubles at offset 8
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm0
+	addsd	%xmm1, %xmm0
+	movq	-32(%rbp), %rax
+	movsd	%xmm0, 8(%rax)
+	jmp	.L7
+.L6:	# '*': two-component product of *src and *dst, written to *dst
+	movq	-32(%rbp), %rax
+	movsd	(%rax), %xmm1	# %xmm1 = dst[0]
+	movq	-24(%rbp), %rax
+	movsd	(%rax), %xmm0	# %xmm0 = src[0]
+	mulsd	%xmm1, %xmm0	# %xmm0 = src[0] * dst[0]
+	movq	-32(%rbp), %rax
+	movsd	8(%rax), %xmm2	# %xmm2 = dst[1]
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm1	# %xmm1 = src[1]
+	mulsd	%xmm2, %xmm1	# %xmm1 = src[1] * dst[1]
+	subsd	%xmm1, %xmm0	# %xmm0 = src[0]*dst[0] - src[1]*dst[1]
+	movsd	%xmm0, -16(%rbp)	# temporary at -16(%rbp): new dst[0]
+	movq	-32(%rbp), %rax
+	movsd	(%rax), %xmm1	# %xmm1 = dst[0]
+	movq	-24(%rbp), %rax
+	movsd	8(%rax), %xmm0	# %xmm0 = src[1]
+	mulsd	%xmm0, %xmm1	# %xmm1 = dst[0] * src[1]
+	movq	-32(%rbp), %rax
+	movsd	8(%rax), %xmm2	# %xmm2 = dst[1]
+	movq	-24(%rbp), %rax
+	movsd	(%rax), %xmm0	# %xmm0 = src[0]
+	mulsd	%xmm2, %xmm0	# %xmm0 = src[0] * dst[1]
+	addsd	%xmm1, %xmm0	# %xmm0 = src[0]*dst[1] + src[1]*dst[0]
+	movsd	%xmm0, -8(%rbp)	# temporary at -8(%rbp): new dst[1]
+	movq	-32(%rbp), %rax
+	movsd	-16(%rbp), %xmm0
+	movsd	%xmm0, (%rax)	# dst[0] = first temporary (stored only after both products are done)
+	movq	-32(%rbp), %rax
+	movsd	-8(%rbp), %xmm0
+	movsd	%xmm0, 8(%rax)	# dst[1] = second temporary
+	jmp	.L7
+.L9:	# unknown opcode: fprintf(stderr, .LC0, code, code[0]) then exit(1)
+	movq	-48(%rbp), %rax
+	movzbl	(%rax), %eax
+	movsbl	%al, %ecx	# 4th argument: opcode byte, sign-extended
+	movq	stderr(%rip), %rax
+	movq	-48(%rbp), %rdx	# 3rd argument: the code pointer
+	movl	$.LC0, %esi	# 2nd argument: format string
+	movq	%rax, %rdi	# 1st argument: stderr
+	movl	$0, %eax	# %al = 0: no vector-register arguments for the variadic call
+	call	fprintf
+	movl	$1, %edi
+	call	exit	# exit(1)
+.L7:	# loop step
+	addq	$3, -48(%rbp)	# code += 3: advance to the next instruction
+.L2:	# loop condition
+	movq	-48(%rbp), %rax
+	movzbl	(%rax), %eax	# %al = code[0]
+	testb	%al, %al
+	jne	.L8	# keep going while the opcode byte is nonzero
+	nop
+	leave	# %rsp = %rbp, then pop %rbp
+	.cfi_def_cfa 7, 8
+	ret
+	.cfi_endproc
+.LFE2:
+	.size	interpret, .-interpret
+	.section	.rodata
+.LC1:
+	.string	"P5\n%d %d\n%d\n"
+	.text
+	.globl	main
+	.type	main, @function
+main:	# main(argc = %edi, argv = %rsi): print the .LC1 header, then write 900 rows of 1600 bytes to stdout
+.LFB3:
+	.cfi_startproc
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset 6, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register 6
+	subq	$1712, %rsp	# locals: counters, 4 x 16-byte register entries, 1600-byte row buffer
+	movl	%edi, -1700(%rbp)	# save argc
+	movq	%rsi, -1712(%rbp)	# save argv
+	movq	%fs:40, %rax
+	movq	%rax, -8(%rbp)	# stack-protector value, re-checked before returning
+	xorl	%eax, %eax
+	movl	$255, %ecx
+	movl	$900, %edx
+	movl	$1600, %esi
+	movl	$.LC1, %edi
+	movl	$0, %eax	# %al = 0: no vector-register arguments for the variadic call
+	call	printf	# printf(.LC1, 1600, 900, 255)
+	movl	$0, -1684(%rbp)	# y = 0 (row counter at -1684(%rbp))
+	jmp	.L11
+.L19:	# start of one row
+	movl	$0, -1688(%rbp)	# x = 0 (column counter at -1688(%rbp))
+	jmp	.L12
+.L18:	# one pixel: initialize the register entries, iterate, store a byte
+	pxor	%xmm0, %xmm0
+	cvtsi2sd	-1688(%rbp), %xmm0	# %xmm0 = (double) x
+	movsd	.LC2(%rip), %xmm1
+	divsd	%xmm1, %xmm0	# / 1600.0
+	movsd	.LC3(%rip), %xmm1
+	subsd	%xmm1, %xmm0	# - 0.5
+	movsd	.LC4(%rip), %xmm1
+	mulsd	%xmm1, %xmm0	# * 3.2
+	movsd	%xmm0, -1680(%rbp)	# entry 0, first double = (x/1600 - 0.5) * 3.2
+	pxor	%xmm0, %xmm0
+	cvtsi2sd	-1684(%rbp), %xmm0	# %xmm0 = (double) y
+	movsd	.LC5(%rip), %xmm1
+	divsd	%xmm1, %xmm0	# / 900.0
+	movsd	.LC3(%rip), %xmm1
+	subsd	%xmm1, %xmm0	# - 0.5
+	movsd	.LC6(%rip), %xmm1
+	mulsd	%xmm1, %xmm0	# * 1.8
+	movsd	%xmm0, -1672(%rbp)	# entry 0, second double = (y/900 - 0.5) * 1.8
+	movl	$1, -1692(%rbp)	# i = 1 (counter at -1692(%rbp))
+	jmp	.L13
+.L14:	# zero both doubles of entry i, for i = 1..3
+	movl	-1692(%rbp), %eax
+	cltq
+	salq	$4, %rax
+	addq	%rbp, %rax
+	subq	$1672, %rax	# address of entry i, second double
+	pxor	%xmm0, %xmm0
+	movsd	%xmm0, (%rax)	# second double = 0.0
+	movl	-1692(%rbp), %eax
+	cltq
+	salq	$4, %rax
+	addq	%rbp, %rax
+	subq	$1672, %rax
+	movsd	(%rax), %xmm0	# reload the value just stored
+	movl	-1692(%rbp), %eax
+	cltq
+	salq	$4, %rax
+	addq	%rbp, %rax
+	subq	$1680, %rax	# address of entry i, first double
+	movsd	%xmm0, (%rax)	# first double = that same value
+	addl	$1, -1692(%rbp)
+.L13:
+	cmpl	$3, -1692(%rbp)
+	jle	.L14	# repeat while i <= 3
+	movl	$0, -1692(%rbp)	# i = 0: the counter is reused as the iteration count
+	jmp	.L15
+.L17:	# one iteration: interpret(&entries[0], argv[1])
+	movq	-1712(%rbp), %rax
+	addq	$8, %rax
+	movq	(%rax), %rdx	# %rdx = argv[1]
+	leaq	-1680(%rbp), %rax	# %rax = address of entry 0
+	movq	%rdx, %rsi
+	movq	%rax, %rdi
+	call	interpret
+	addl	$1, -1692(%rbp)	# i++
+.L15:	# iteration condition
+	cmpl	$255, -1692(%rbp)
+	jg	.L16	# stop once i > 255
+	movsd	-1664(%rbp), %xmm1	# entry 1, first double
+	movsd	-1664(%rbp), %xmm0
+	mulsd	%xmm0, %xmm1	# squared
+	movsd	-1656(%rbp), %xmm2	# entry 1, second double
+	movsd	-1656(%rbp), %xmm0
+	mulsd	%xmm2, %xmm0	# squared
+	addsd	%xmm1, %xmm0	# %xmm0 = sum of the two squares
+	movsd	.LC8(%rip), %xmm1	# %xmm1 = 4.0
+	ucomisd	%xmm0, %xmm1
+	ja	.L17	# iterate again while 4.0 > sum of squares
+.L16:	# store the iteration count as this pixel
+	movl	-1692(%rbp), %eax
+	movl	%eax, %edx
+	movl	-1688(%rbp), %eax
+	cltq
+	movb	%dl, -1616(%rbp,%rax)	# row buffer[x] = low byte of i
+	addl	$1, -1688(%rbp)	# x++
+.L12:
+	cmpl	$1599, -1688(%rbp)
+	jle	.L18	# repeat while x <= 1599
+	movq	stdout(%rip), %rdx
+	leaq	-1616(%rbp), %rax	# %rax = row buffer
+	movq	%rdx, %rcx
+	movl	$1600, %edx
+	movl	$1, %esi
+	movq	%rax, %rdi
+	call	fwrite	# fwrite(row buffer, 1, 1600, stdout)
+	addl	$1, -1684(%rbp)	# y++
+.L11:
+	cmpl	$899, -1684(%rbp)
+	jle	.L19	# repeat while y <= 899
+	movl	$0, %eax	# return value 0
+	movq	-8(%rbp), %rcx
+	xorq	%fs:40, %rcx	# compare the saved stack-protector value with %fs:40
+	je	.L21
+	call	__stack_chk_fail	# reached only when the two values differ
+.L21:
+	leave
+	.cfi_def_cfa 7, 8
+	ret
+	.cfi_endproc
+.LFE3:
+	.size	main, .-main
+	.section	.rodata	# 8-byte double constants used by main; each is two .long words, low word first
+	.align 8
+.LC2:	# 0x4099000000000000 = 1600.0
+	.long	0
+	.long	1083768832
+	.align 8
+.LC3:	# 0x3FE0000000000000 = 0.5
+	.long	0
+	.long	1071644672
+	.align 8
+.LC4:	# 0x400999999999999A, approximately 3.2
+	.long	2576980378
+	.long	1074370969
+	.align 8
+.LC5:	# 0x408C200000000000 = 900.0
+	.long	0
+	.long	1082925056
+	.align 8
+.LC6:	# 0x3FFCCCCCCCCCCCCD, approximately 1.8
+	.long	3435973837
+	.long	1073532108
+	.align 8
+.LC8:	# 0x4010000000000000 = 4.0
+	.long	0
+	.long	1074790400
+	.ident	"GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609"
+	.section	.note.GNU-stack,"",@progbits