;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; pmode.asm - quick start with (32-bit) protected mode
; Chris Giese	<ChrisGiese@SisAndHappy.com>	http://SisAndHappy.com/ChrisGiese
; Release date: April 2, 2005
; This code is public domain (no copyright).
; You can do whatever you want with it.
;
; Assemble this code with NASM:
;	nasm -f bin -o pmode.com pmode.asm
;
; then run pmode.com from plain DOS or from
; a bootloader that supports .COM files
;
; My recommended practices for protected mode:
;
; 1. For mixed 16- and 32-bit code, such as code that enters
;    protected mode, use NASM.
;
; 2. Use Bochs to debug protected mode code:
;	http://bochs.sourceforge.net
;    Get the version with the built-in debugger. The debugger has a
;    cryptic command-line user interface, but it's very useful.
;
; 3. Use video memory for debugging. After each crucial step in
;    the process of switching to pmode, poke a character into text
;    video memory, so you can see (literally) how far the code gets.
;
; 4. Configure the code and data segment descriptors so
;    everything has the same address in real mode and pmode
;
; 5. To change the segment base addresses once you're in pmode
;    (e.g. to switch to segments with base address = 0):
;    a. Write the new CS and EIP values into the immediate
;       operand of a JMP FAR instruction
;    b. Load DS, SS, ES, FS, and GS
;    c. Use JMP FAR with an immediate operand to load CS and EIP.
;    This method avoids confusing and error-prone JMP or RETF
;    instructions that depend on the DS or SS registers.
;
; 6. Simplify privilege scheme:
;    a. Use only privilege rings 0 and 3
;    b. Use only CPL = RPL = DPL
;    c. Ignore the 'conforming' bit of code segment descriptors
;
; 7. Don't use LDTs, call gates, or task gates. Changing CPU privilege
;    levels can be done with interrupt or trap gates. Task-switching
;    can be done by manipulating the kernel stack. Both of these
;    approaches are also portable to non-x86 CPUs.
;
; 8. Build the IDT at run-time. The 32-bit EIP value stored in each
;    interrupt gate is broken into two 16-bit values, with the other
;    four bytes of the gate between them. No common object file format
;    supports the type of relocation needed for such a value.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	ORG 100h		; .COM file start address

	BITS 16

; Were we started by DOS or by a bootloader? Look at ES:0 for the opcode
; bytes of "INT 20h" (CDh 20h), the signature of a DOS PSP, and record the
; answer in [dos]: it is assembled as 0 and bumped to 1 when the
; signature is found.
	cmp word [es:0],20CDh	; "INT 20h" at ES:0?
	jne no_dos		; no signature: [dos] stays 0
	inc byte [dos]		; signature found: [dos] = 1
no_dos:

; Check for a 32-bit CPU: try to invert FLAGS bits 14..12 (NT and IOPL)
; and see whether any of them really changed. The caller's FLAGS are
; saved first and restored afterwards. If none of the bits changed,
; execution falls through to msg_and_exit with SI = cpu_msg.
	pushf			; saved copy, restored by the outer POPF
		pushf
		pop bx		; BX = FLAGS as they are now
		mov ax,bx
		xor ax,7000h	; invert b14 (NT) and b13:b12 (IOPL)
		push ax
		popf		; try to load the modified value...
		pushf
		pop ax		; ...and read back what the CPU kept
	popf
	xor ax,bx		; 1 bits = FLAGS bits that did change
	test ax,7000h		; NT or IOPL among them?
	jnz cpu_ok		; yes: 32-bit CPU
	mov si,cpu_msg		; no: report the error

; msg_and_exit - print the 0-terminated message at SI, then leave:
;   [dos] != 0: return to DOS
;   [dos] == 0: wait for a key, then re-start the boot process
msg_and_exit:
	call cputs
	cmp byte [dos],0
	jne dos_exit

; no DOS to return to (we were started by a bootloader), so reboot
	xor ax,ax		; AH=0: await key pressed
	int 16h
	int 19h			; re-start the boot process

; started from DOS: terminate the program
dos_exit:
	mov ax,4C01h		; INT 21h function 4Ch, AL=01h
	int 21h

cpu_ok:
; Only when started from DOS: refuse to continue if the CPU is in
; Virtual 8086 mode (Windows DOS box or EMM386). The test is bit 0 of
; the machine status word.
	cmp byte [dos],0
	je real_mode		; not DOS: skip the check

	mov si,v86_msg		; message for msg_and_exit, should we need it
	smsw ax			; 'SMSW' is a 286+ instruction
	test al,1		; bit 0 set?
	jnz msg_and_exit	; yes: print v86_msg and quit
real_mode:

; point to real-mode text video segment. We will poke characters into
; video memory for debugging purposes. If everything works OK, the top
; line of the display will read "0123456789ABC" (markers '0'..'3' are
; written by this 16-bit code, the rest by the 32-bit code). These DEBUG
; statements can be taken out (or just commented out) after you've
; modified the code to your satisfaction, and everything works properly.
	mov ax,0B800h		; DEBUG
	mov es,ax		; DEBUG
	mov byte [es:0],'0'	; DEBUG: this code loaded OK?

; It's good to have pmode code and data segments with the same base
; addresses as the real-mode code. This greatly simplifies the
; transition to protected mode. To make this happen, though, we must
; "patch" or "fix up" some addresses in the protected mode tables.
;
; This patching isn't strictly necessary if you start out with segment
; base address = 0 (all segment registers = 0 in real mode). However,
; the patching code is small, it makes this code more flexible and
; robust, and it doesn't hurt anything to leave it in place.
;
; Set the pmode code segment base = 16 * CS
; (the 32-bit base is scattered over descriptor bytes 2-3, 4 and 7)
	xor ebx,ebx
	mov byte [es:2],'1'	; DEBUG: 32-bit instruction above worked OK?
	mov bx,cs		; get real-mode segment value
	shl ebx,4		; EBX = CS * 16

	mov eax,ebx
	shr eax,16		; AL = base 23:16, AH = base 31:24
	mov [gdt_cs + 2],bx	; base 15:0
	mov [gdt_cs + 4],al	; base 23:16
	mov [gdt_cs + 7],ah	; base 31:24

; Set the pmode data segment base = 16 * DS. For .COM files like this
; one, CS=DS, but that is not always the case.
	xor ebx,ebx
	mov bx,ds		; get real-mode segment value
	shl ebx,4		; EBX = DS * 16

	mov eax,ebx
	shr eax,16		; AL = base 23:16, AH = base 31:24
	mov [gdt_ds + 2],bx	; base 15:0
	mov [gdt_ds + 4],al	; base 23:16
	mov [gdt_ds + 7],ah	; base 31:24

; the GDT address in the GDT "pseudo-descriptor", stored at [gdt_ptr+2],
; is also a linear address, and must also be "patched":
	;mov eax,gdt
	;add eax,ebx
	lea eax,[ebx + gdt]	; EAX = DS * 16 + offset of gdt
	mov [gdt_ptr + 2],eax

; Intel segmentation is a form of address translation: the address
; generated by your program is a "virtual" address; different from
; the "physical" address that goes out onto the bus, to the RAM chips.
; The conversion value, virt_to_phys, is just the segment base address:
; NOTE: EBX is not changed again by this code; the 32-bit code further
; down relies on EBX still holding this value.
	mov [virt_to_phys],ebx

; Done patching; ready to move to pmode. Disable interrupts by clearing the
; IF bit in EFLAGS register. If the IOPL or NT bits (also in EFLAGS) happen
; to be set, they may cause problems later. So, instead of using CLI, use
; the code below to zero IF, IOPL, and NT
; (the value 2 also leaves the direction flag, DF, clear)
	push dword 2
	popfd
	mov byte [es:4],'2'	; DEBUG: patching and POPFD worked OK?

; load register GDTR with a pointer to the 6-byte GDT pointer
	lgdt [gdt_ptr]

; set the PE bit in register CR0
	mov eax,cr0
	or al,1
	mov cr0,eax

; Setting PE does not by itself change how the already-loaded segment
; registers address memory; each one behaves as it did in real mode until
; it is reloaded. We can still use real-mode addressing with ES, like this:
	mov byte [es:6],'3'	; DEBUG: LGDT and setting PE bit worked OK?

; reload CS with a far jump
; (assembled under BITS 16; the code at 'pmode' is assembled under BITS 32)
	jmp SYS_CODE_SEL:pmode

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			cputs
; action:		displays text on screen
; in:			0-terminated string at DS:SI
; out:			(nothing)
; modifies:		(nothing; AX, BX, SI, BP and FLAGS are restored)
; minimum CPU:		8088
; notes:		The string is read with LODSB, which steps SI in
;			the direction selected by DF. The caller's DF is
;			not known here, so FLAGS are saved, DF is cleared
;			for the duration of the routine, and FLAGS are
;			restored on exit.
;			BP is saved as well: some BIOSes are documented to
;			destroy BP in INT 10h/AH=0Eh when the screen
;			scrolls.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

cputs:
	pushf			; caller's FLAGS, including DF
	push bp
	push si
	push bx
	push ax
		cld		; LODSB must walk forward through the string
		mov ah,0Eh	; INT 10h: teletype output
		xor bx,bx	; video page 0
		jmp short cputs_2
cputs_1:
		int 10h
cputs_2:
		lodsb		; AL = next character, SI advanced
		or al,al	; terminating 0 reached?
		jne cputs_1
	pop ax
	pop bx
	pop si
	pop bp
	popf
	ret

; Error messages for msg_and_exit. Each one is the text followed by
; CR, LF and the terminating 0 that cputs looks for.
cpu_msg:
	db "32-bit CPU required"
	db 0Dh, 0Ah, 0

v86_msg:
	db "CPU is in Virtual 8086 mode (Windows DOS box or EMM386 loaded)", 0Dh, 0Ah, 0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; 32-bit pmode code starts here
;
; Entered by the far jump through SYS_CODE_SEL. On entry:
;   EBX = virt_to_phys (16 * real-mode DS); this code never changes EBX
;         and uses it for address fix-ups below
;   ES  = still the real-mode text video segment (0B800h)
;   interrupts are disabled; no IDT is set up anywhere in this file
; This code never returns; it ends in an infinite loop.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	BITS 32

pmode:
; This is where a lot of pmode code fails. The main problems are:
; 1. The linear address of the GDT, at [gdt_ptr+2], was not
;    set properly, so register GDTR points to a bogus GDT
; 2. The code segment descriptor, pointed to by selector SYS_CODE_SEL,
;    does not have the proper segment base address, so the far jump
;    above goes off into the weeds, instead of coming here
; 3. You left hardware interrupts enabled. Pmode interrupt
;    handling hasn't been set up yet, so your program will crash.
;
; Until we reload it, addressing relative to ES still works
; as it did in real mode
	mov byte [es:8],'4'	; DEBUG: far JMP worked OK?

; load pmode data segment selectors into data segment registers
; NOTE: SS gets SYS_DATA_SEL too, whose base was computed from the
; real-mode DS. The old stack therefore stays reachable only if SS was
; equal to DS in real mode.
	mov ax,SYS_DATA_SEL
	mov ds,ax
	mov byte [es:0Ah],'5'	; DEBUG: loading DS worked OK?
	mov ss,ax
	mov fs,ax
	mov gs,ax
	mov byte [es:0Ch],'6'	; DEBUG: loading SS,FS,GS worked OK?

; now try to access text video memory using LINEAR_DATA_SEL with a 32-bit address
; (ES keeps LINEAR_DATA_SEL from here on)
	mov ax,LINEAR_DATA_SEL
	mov es,ax
	mov byte [es:0B800Eh],'7' ; DEBUG: LINEAR_DATA_SEL works OK?

; because we are lazy, our pmode code will use the same stack used by
; the real mode code. However, the top 16 bits of ESP must be zeroed
; (nothing above has used the stack since SS was reloaded)
	xor eax,eax
	mov ax,sp
	mov esp,eax

; test the stack by calling a subroutine
; demonstrate video memory access with far pointers
	call far_demo
	mov byte [es:0B8012h],'9' ; DEBUG: return (and stack) worked OK?

; demonstrate video memory access with near pointers
	call near_demo

; how to switch to linear code and data segments (base address = 0)
; without crashing:
;
; 1. store new CS and EIP values into the
;    immediate operand of a far JMP instruction:
;    (EBX still contains virt_to_phys)
; The store goes through DS (SYS_DATA_SEL), so it only hits the JMP
; instruction when the data and code segments have the same base.
	lea eax,[ebx + linear]		; EAX = address of 'linear' with base 0
	;mov [cs:linear - 6],eax	; this causes a reboot
	mov [linear - 6],eax		; xxx - TINY memory model only

; 2. load data segment registers
; After SS is reloaded here, ESP is still an offset relative to the old
; stack base. The stack must not be used until ESP is adjusted in step 4.
	mov ax,LINEAR_DATA_SEL
	mov ds,ax
	mov ss,ax
	;mov es,ax			; not needed: ES was loaded with it above
	mov fs,ax
	mov gs,ax
	mov byte [0B8014h],'A' ; DEBUG: linear DS ok?

; 3. use far JMP with immediate operand to load CS and EIP
;
;		offset
; offset	from
; from JMP	'linear'	size	value		description
; --------	--------	----	-----		-----------
; 0		-7		1	0EAh		JMP FAR immediate
; 1		-6		4	-		new EIP value
; 5		-2		2	LINEAR_CODE_SEL	new CS value
; 7		 0		-	-		next instruction
;
; The offset 0 assembled here is a placeholder; step 1 has overwritten
; it at run time. The 'linear' label must stay directly after this JMP.
	jmp LINEAR_CODE_SEL:0
linear:
	mov byte [0B8016h],'B' ; DEBUG: linear CS ok?

; 4. adjust ESP if necessary or desired:
;    (old offset + old segment base = same stack seen through base 0)
	add esp,ebx

; demonstrate video memory access with near pointers,
; and code and data segments with base address = 0
	call inv_demo

; go into an infinite loop
	mov byte [0B8018h],'C' ; DEBUG: now in infinite loop
	jmp $

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			far_demo
; action:		copies far_msg to the 2nd line of text video memory
;			using a far pointer (ES:EDI)
; in:			DS = selector of the segment that contains far_msg
;			ES = selector of a data segment with base address = 0
; out:			(nothing)
; modifies:		(nothing; ECX, ESI and EDI are restored)
; notes:		REP MOVSB copies in the direction selected by DF;
;			the startup code loads EFLAGS with 2, so DF is clear
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

far_demo:
	mov byte [es:0B8010h],'8'	; DEBUG: subroutine call worked OK?
	push ecx
	push esi
	push edi
		mov ecx,far_msg_len	; byte count
		mov edi,0B80A0h		; destination ES:EDI = 2nd line
		mov esi,far_msg		; source DS:ESI = message text
		rep movsb
	pop edi
	pop esi
	pop ecx
	ret

; Character/attribute pairs for text video memory: every second byte is
; a space (20h), which the VGA treats as the attribute byte, so the text
; is black (color 0) on green (color 2).
far_msg:
	db "h e l l o "
far_msg_len equ $ - far_msg

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			near_demo
; action:		copies near_msg to the 3rd line of text video memory
;			using near pointers only: ES is made equal to DS and
;			the video address is translated into a DS-relative
;			offset by subtracting [virt_to_phys]
; in:			DS = selector of the segment that contains near_msg
;			and virt_to_phys
; out:			(nothing)
; modifies:		(nothing; ES, ECX, ESI and EDI are restored)
; notes:		REP MOVSB copies in the direction selected by DF;
;			the startup code loads EFLAGS with 2, so DF is clear
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

near_demo:
	push es
	push ecx
	push esi
	push edi
		push ds
		pop es			; ES = DS for the string copy
		mov ecx,near_msg_len	; byte count
		mov esi,near_msg	; source DS:ESI = message text
		mov edi,0B8140h		; address of the 3rd line...
		sub edi,[virt_to_phys]	; ...as an offset from the segment base
		rep movsb
	pop edi
	pop esi
	pop ecx
	pop es
	ret

; character/attribute pairs, same layout as the other demo messages
near_msg:
	db "g o o d b y e "
near_msg_len equ $ - near_msg

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; name:			inv_demo
; action:		copies inv_near_msg to the 4th line of text video
;			memory after the switch to segments with base
;			address = 0. Here the video address is used as is,
;			and the address of the message is the one that
;			gets translated
; in:			DS, ES = selectors of data segments with base
;			address = 0
;			EBX = virt_to_phys
; out:			(nothing)
; modifies:		(nothing; ECX, ESI and EDI are restored)
; notes:		REP MOVSB copies in the direction selected by DF;
;			the startup code loads EFLAGS with 2, so DF is clear
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

inv_demo:
	push ecx
	push esi
	push edi
		mov ecx,inv_near_msg_len ; byte count
		mov edi,0B81E0h		; destination ES:EDI = 4th line
		mov esi,inv_near_msg	; assembled offset of the message...
		add esi,ebx		; ...plus virt_to_phys (still in EBX)
		rep movsb
	pop edi
	pop esi
	pop ecx
	ret

; character/attribute pairs, same layout as the other demo messages
inv_near_msg:
	db "i n v e r s e "
inv_near_msg_len equ $ - inv_near_msg

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Set to 1 by the startup code when the "INT 20h" signature of a DOS PSP
; is found at ES:0; left at 0 otherwise. Selects between exiting to DOS
; and rebooting, and enables the Virtual 8086 mode check.
dos:
	db 0

; Base address of the patched pmode data segment (16 * real-mode DS),
; stored by the startup code. Adding it to an offset in that segment
; gives the address of the same byte in a segment with base address = 0.
virt_to_phys:
	dd 0

; Global Descriptor Table: the null descriptor plus four 8-byte
; descriptors. Each xxx_SEL constant is the byte offset of its
; descriptor from 'gdt', which is the value loaded into a segment
; register to select it. The startup code patches the base address
; fields of gdt_cs and gdt_ds at offsets +2 (word), +4 and +7 (bytes).
gdt:
; null descriptor
	dw 0			; limit 15:0
	dw 0			; base 15:0
	db 0			; base 23:16
	db 0			; type
	db 0			; limit 19:16, flags
	db 0			; base 31:24

; linear code descriptor
LINEAR_CODE_SEL	equ	$-gdt
	dw 0FFFFh               ; maximum limit 0FFFFFh (1 meg or 4 gig)
	dw 0			; base for LINEAR_CODE_SEL is always 0
	db 0
	db 9Ah			; present,ring 0,code,non-conforming,readable
	db 0CFh                 ; page-granular (4 gig limit), 32-bit
	db 0

; linear data descriptor
LINEAR_DATA_SEL	equ	$-gdt
	dw 0FFFFh               ; maximum limit 0FFFFFh (1 meg or 4 gig)
	dw 0			; base for LINEAR_DATA_SEL is always 0
	db 0
	db 92h			; present, ring 0, data, expand-up, writable
	db 0CFh                 ; page-granular (4 gig limit), 32-bit
	db 0

gdt_cs:
; code descriptor
SYS_CODE_SEL	equ	$-gdt
	dw 0FFFFh		; maximum limit 0FFFFFh (1 meg or 4 gig)
	dw 0			; base address; gets patched above
	db 0
	db 9Ah			; present,ring 0,code,non-conforming,readable
	db 0CFh			; page-granular (4 gig limit), 32-bit
	db 0

gdt_ds:
; data descriptor
SYS_DATA_SEL	equ	$-gdt
	dw 0FFFFh		; maximum limit 0FFFFFh (1 meg or 4 gig)
	dw 0			; base address; gets patched above
	db 0
	db 92h			; present, ring 0, data, expand-up, writable
	db 0CFh			; page-granular (4 gig limit), 32-bit
	db 0
gdt_end:

; 6-byte GDT "pseudo-descriptor" loaded into GDTR by LGDT: a 16-bit limit
; followed by the 32-bit linear address of the table. The address
; assembled here is only the offset of 'gdt'; the startup code replaces
; it with 16 * DS + that offset before executing LGDT.
;
; To save 6 bytes of memory, this structure can be stored at (gdt+0);
; in the NULL descriptor. That can be confusing, however.
gdt_ptr:
	dw gdt_end - gdt - 1	; GDT limit
	dd gdt			; linear address of GDT; gets patched above