;   Title:  Signed and Unsigned Division Subroutines for the
;           PPC: 96 and 64 bits by 32 bits
;    File:  div.s
;  Author:  David N. Williams
; License:  LGPL and GPL
; Started:  March 8, 2006
; Revised:  March 11, 2006
; Revised:  March 12, 2006
; Revised:  March 14, 2006
; Revised:  March 15, 2006
; Revised:  March 17, 2006
; Revised:  March 18, 2006
; Revised:  March 19, 2006
; Revised:  April 9, 2006 (dual license statement)
;
; Copyright (C) 2006 David N. Williams
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or at your option any later
; version.
;
; It is also provided under the terms of the GNU General Purpose
; License. Anyone who modifies this code is requested to follow
; the same dual license policy.
;
; This library is distributed in the hope that it will be
; useful, but WITHOUT ANY WARRANTY; without even the implied
; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
; PURPOSE.  See the GNU Lesser General Public License and the
; GNU General Public License for more details.

; OVERVIEW
;
; This code is written for the assembly language syntax of
; Apple's version of gas.
;
; It is based on the binary restoring shift and subtract
; algorithm as described in IBM's "The PowerPPC Compiler
; Writer's Guide", 1996, pp. 62-65, and implemented there for
; unsigned 64-bit by 64-bit division with 64-bit quotient and
; remainder.  That implementation does not correctly handle the
; special case in which the denominator has 64 significant
; digits.  This code does handle our analogous case, in which
; the denominator has 32 significant digits.
;
; There are four basic subroutines, and four driver subroutines.
; Two of the drivers are for unsigned and two are for signed
; division.  They do significant digit case analysis and call
; the appropriate subroutine.
; 
; The four basic subroutines are:
;
;   udiv96by32to96   udiv64by32to64
;   udiv96by32to64   udiv64by32to32
;
; Each assumes a "full" numerator, with nonzero (binary) digits
; in the most significant 32-bit word, and a nonzero
; denominator.
;
; The four driver routines are:
; 
;   udiv96by32   udiv64by32
;   sdiv96by32   sdiv64by32
;
; The two unsigned driver routines analyze the significant digit
; cases and call the appropriate basic subroutine, or the normal
; ppc assembly language op divwu in case only single precision
; is needed.
;
; The two signed driver routines filter out the most negative
; number overflow case, strip and retain sign information, call
; the appropriate unsigned driver, and attach signs to the
; quotient and remainder corresponding to symmetric division.
;
; Numerator arguments are passed in r3:r4:r5 for the 96-bit
; numerator routines, and in r4:r5 for the 64-bit numerator
; routines.  The quotient is also returned in r3:r4:r5 and
; r4:r5, respectively.
;
; In all cases, the denominator is passed and the remainder
; is returned in r6.
;
; Only registers that are declared as volatile and callee
; available in the Mac OS X ppc ABI are used.

; NOTATION
;
; num   = numerator
; denom = denominator
; quo   = quotient
; rem   = remainder
; sd    = significant digits
; lz    = leading zero digits
; #iter = number of loop iterations
;       = num.sd - denom.sd + 1
;	= number of actual quotient bits produced
; CA    = carry bit

; RESTORING SHIFT AND SUBTRACT ALGORITHM
; (nx32 bits divided by 32 bits)
;
; Everything here is big-endian, with the most significant bits
; to the left.
;
; The numerator num.hi:...:num.lo treated as a combined register
; is initially left-shifted into the full combined register
;
;   tmp:num.hi:...:num.lo
;
; so that its leading significant digit is aligned in tmp one
; bit to the right of the leading significant digit of the
; denominator.  Actually there are cases where tmp can be
; identified with num.hi.
;
; The shift and subtract loop builds the remainder bit by bit
; into the right end of tmp (32 bits), and the quotient into the
; right end of num.hi:...:num.lo.  At the top of the loop, the
; full combined register is shifted one bit to the left, with
; any previous CA shifted into the rightmost bit, and with an
; initial CA of zero.  The denominator is subtracted from tmp. 
; If the result is negative, CA is set to zero, and tmp is
; unchanged.  Otherwise CA is set to one, and tmp is set to the
; difference.  Doing this #iter times produces all bits of the
; remainder in tmp, and #iter - 1 bits of the quotient in the
; rest of the full combined register, called the quotient
; register.  The remaining bit of the quotient is produced after
; leaving the loop, by left-shifting the last CA into the right
; end of the quotient register.
;
; When the denominator has 32 significant bits, tmp needs one
; extra bit of headroom.

; DRIVER SUBROUTINES

udiv96by32:
	; Divide unsigned 96 bits by unsigned 32 bits.
	; This routine assumes that denom is nonzero.
	;
	; in:	r3:r4:r5 = num.hi:num.mi:num.lo
	; 	r6       = denom
	; out:	r3:r4:r5 = quo.hi:quo.mi:quo.lo
	;	r6       = rem
	;	r0,r7-r9 = ???

	cntlzw	r8,r6		; r8 = denom.lz
	cntlzw	r7,r3		; r7 = num.hi.lz
	cmpw	r7,r8		; if num.hi.lz > denom.lz
	bgt	0f		; then try num.hi = 0
	b	udiv96by32to96	; else full 96 divide
0:	cmpwi	r3,0		; if num.hi = 0
	beq	1f		; then try num.mi
	b	udiv96by32to64	; else lesser 96 divide
1:				; r3 = num.hi = 0
	cntlzw	r7,r4		; r7 = num.mi.lz
	cmpw	r7,r8		; if num.mi.lz > denom.lz
	bgt	2f		; then try num.mi = 0
	b	udiv64by32to64	; else full 64 divide
2:	cmpwi	r4,0		; if num.mi = 0
	beq	3f		; then normal divide
	b	udiv64by32to32	; else lesser 64 divide
3:				; r4 = num.mi = 0
	divwu	r7,r5,r6	; r7 = quo.lo
	mullw	r6,r7,r6	; r6 = quo.lo*denom
	subf	r6,r6,r5	; r6 = num.lo - quo.lo*denom = rem
	mr	r5,r7		; r5 = quo.lo
	blr

sdiv96by32:
	; Divide signed 96 bits by signed 32 bits, with
	; symmetric quotient and remainder.  The remainder has
	; the sign of the numerator, and the quotient has the
	; product of signs of the numerator and denominator.
	;
	; This routine assumes that denom <> 0.  It tests for 
	; 0x80000000 00000000 00000000 / -1 overflow.
	;
	; in:	r3:r4:r5 = num.hi:num.mi:num.lo
	; 	r6	 = denom
	; out:	EQ = 0, no overflow
	;	  r3:r4:r5 = quo.hi:quom.mi:quo.lo
	;	  r6    = rem
	;         r0    = 1
	;	  r7-r9, r11 = ???
	;	  cr2,cr3 = ???
	;	EQ = 1, overflow
	;	  r3:r4:r5 = num.hi:num.lo
	;	  r6	   = denom
	;	  r0	   = 0

	rlwimi	r0,r3,1,0,31	; r0 = num.hi rotated left 1 bit
	xori	r0,r0,1		; r0 = 0 iff num.hi = most neg
	or	r0,r0,r4
	or	r0,r0,r5	; r0 = 0 iff num.hi:num.mi:num.lo = most neg
	orc.	r0,r0,r6	; r0 = r0 | ~denom
	beqlr			; if EQ = 1 then err exit
	mflr	r11
	cmpwi	r3,0		; num < 0?
	mcrf	cr2,cr0		; cr2.LT = cr0.LT (sign of num)
	cmpwi	r6,0		; denom < 0?
	crxor	12,0,8		; cr3.LT = cr0.LT xor cr2.LT
	_ABS	r6,r7		; r6 = |denom|, r0 = ???
	srawi	r7,r3,31	; r7 = (d < 0) ? -1 : 0
	xor	r3,r7,r3	; r3 = (d < 0) ? ~d.hi : d.hi
	xor	r4,r7,r4	; r4 = (d < 0) ? ~d.mi : d.mi
	xor 	r5,r7,r5	; r5 = (d < 0) ? ~d.lo : d.lo
	neg	r7,r7		; r7 = (d < 0) ? 1 : 0
	addc	r5,r7,r5
	addze	r4,r4
	addze	r3,r3
	bl	udiv96by32
	mtlr	r11
	bge	cr2,1f		; skip if numer >=0
	neg	r6,r6		; r6 = -|rem|
1:	bge	cr3,2f		; skip if denom*num >=0
	nor	r3,r3,r3
	nor	r4,r4,r4
	nor	r5,r5,r5
	addic	r5,r5,1
	addze	r4,r4
	addze	r3,r3
2:	li	r0,1
	cmpwi	r0,0		; EQ = 0
	blr

udiv64by32:
	; Divide unsigned 64 bits by unsigned 32 bits.
	; This routine assumes that denom is nonzero.
	;
	; in:	r4:r5 = num.hi:num.lo
	; 	r6    = denom
	; out:	r4:r5 = quo.hi:quo.lo
	;	r6    = rem
	;    r0,r7-r9 = ???

	cntlzw	r8,r6		; r8 = denom.lz, if denom = 0
	cntlzw	r7,r4		; r7 = num.hi.lz
	cmpw	r7,r8		; if num.hi.lz > denom.lz
	bgt	0f		; then try num.hi = 0
	b	udiv64by32to64	; else full divide
0:	cmpwi	r4,0		; if num.hi = 0
	beq	3b		; then normal divide
	b	udiv64by32to32	; else lesser divide

sdiv64by32:
	; Divide signed 64 bits by signed 32 bits, with
	; symmetric quotient and remainder.  The remainder has
	; the sign of the numerator, and the quotient has the
	; product of signs of the numerator and denominator.
	;
	; This routine assumes that denom <> 0.  It tests for 
	; 0x80000000 00000000 / -1 overflow.
	;
	; in:	r4:r5 = num.hi:num.lo
	; 	r6    = denom
	; out:	EQ = 0, no overflow
	;	  r4:r5 = quo.hi:quo.lo
	;	  r6    = rem
	;         r0    = 1
	;	  r7-r9, r11 = ???
	;	  cr2,cr3 = ???
	;	EQ = 1, overflow
	;	  r4:r5 = num.hi:num.lo
	;	  r6    = denom
	;	  r0    = 0

	rlwimi	r0,r4,1,0,31	; r0 = num.hi rotated left 1 bit
	xori	r0,r0,1		; r0 = 0 iff num.hi = most neg
	or	r0,r0,r5	; r0 = 0 iff num.hi:num.lo = most neg
	orc.	r0,r0,r6	; r0 = r0 | ~denom
	beqlr			; if EQ = 1 then err exit
	mflr	r11
	cmpwi	r4,0		; num < 0?
	mcrf	cr2,cr0		; cr2.LT = cr0.LT (sign of num)
	cmpwi	r6,0		; denom < 0?
	crxor	12,0,8		; cr3.LT = cr0.LT xor cr2.LT
	_ABS	r6,r7		; r6 = |denom|, r0 = ???
	_DABS	r4,r5,r7	; r4:r5 = |num|, r0 = ???
	bl	udiv64by32
	mtlr	r11
	bge	cr2,1f		; skip if numer >=0
	neg	r6,r6		; r6 = -|rem|
1:	bge	cr3,2f		; skip if denom*num >=0
	_DNEG	r4,r5		; r4:r5 = -|num|
2:	li	r0,1
	cmpwi	r0,0		; EQ = 0
	blr

; BASIC SUBROUTINES

udiv96by32to96:
	; Unsigned 96-bit by 32-bit divide with 96-bit quotient
	; and 32-bit remainder.
	;
	; This routine assumes that denom is nonzero, that
	; numer.sd > 64, and that numer.hi.sd >= denom.sd.
	;
	; If numer.hi.sd > denom.sd or numer.hi = denom, then
	; quo.sd > 64.
	;
	; If numer.hi.sd = denom.sd and numer.hi < denom, then
	; quo.sd <= 64.  This is the only case for which we need
	; this routine when the quotient is required to fit into
	; 64 bits.
	;
	; input:      r3:r4:r5 = num.hi:num.mi:num.lo
	; 	      r6       = denom
	;	      r7       = num.hi.lz = num.lz
	;	      r8       = denom.lz >= num.hi.lz
	; output:     r3:r4:r5 = quo.hi:quo.mi:quo.lo
	; 	      r6       = rem
	;	      r0,r7-r9 = ???

	; Get alignment right shift:
	;   r7 = R
	;      = num.hi.sd - denom.sd + 1
	;      = denom.lz -num.hi.lz + 1
	; Get alignment left shift:
	;   r8 = L = 32 - R
	;      = 31 - denom.lz + num.hi.lz
	; Note that #iter = num.sd - denom.sd + 1 = R + 64.
	;
	subfic	r8,r8,31	; r8 = 31 - denom.lz
	add	r8,r7,r8	; r8 = 31 - denom.lz + num.hi.lz = L
	subfic	r7,r8,32	; r7 = 32 -L = R

	; Let r9 = tmp.  Left shift num.hi:num.mi:num.lo
	; into r9:r3:r4:r5 so that tmp.sd = denom.sd - 1.
	;
	srw	r9,r3,r7	; r9 = num.hi >> R
	slw	r3,r3,r8	; r3 = num.hi << L
	srw	r0,r4,r7	; r0 = num.mi >> R
	or	r3,r0,r3	; r3 = num.hi << L | num.mi >> R
	slw	r4,r4,r8	; r4 = num.mi << L
	srw	r0,r5,r7	; r0 = num.lo >> R
	or	r4,r0,r4	; r4 = num.mi << L | num.lo >> R
	slw	r5,r5,r8	; r5 = num.lo << L

	; Initialize loop count and CA.
	;
	addic	r7,r7,64	; r7 = #iter = num.sd -denom.sd + 1, CA = 0
	mtctr	r7
;	li	r8,-1		; r8 = -1 for setting CA when tmp - denom >= 0

	; If denom.sd = 32, we need one extra bit of headroom
	; for tmp.  Conditionally branch to code which uses CA
	; for that.
	;
	cmpwi	r6,0
	blt	2f

	; Restoring shift and subtract loop for denom.sd < 32.
	;
0:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	adde	r3,r3,r3
	adde	r9,r9,r9
;	subfc.	r0,r6,r9	; r0 = tmp - denom
;	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged	
	cmplw	r9,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r9	; r0 = tmp - denom
	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r9,r0		; else tmp = difference
;	addic	r0,r8,1		; CA = 1
1:	bdnz	0b

	; Now r9 = rem.  Left shift quotient combination once
	; more, including last CA as least significant bit.
	;
7:	adde	r5,r5,r5
	adde	r4,r4,r4
	adde	r3,r3,r3	; r3:r4:r5 = quo
	mr	r6,r9		; r6 = rem
	blr

	; Restoring shift and subtract loop for denom.sd = 32.
	;
2:	li	r7,0		; for detecting CA
	li	r8,-1		; for setting CA = 1
3:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	adde	r3,r3,r3
	adde	r9,r9,r9
	addze.	r0,r7		; if CA = 1
	bne	4f		; then subtract
				; else
	cmplw	r9,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r9	; r0 = tmp - denom
	blt	5f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r9,r0		; else tmp = difference
5:	bdnz	3b
	b	7b	
4:	addic	r0,r8,1		; CA = 1
	subf	r9,r6,r9	; tmp = difference
	bdnz	3b
	b	7b

udiv96by32to64:
	; Unsigned 96-bit by 32-bit divide with 64-bit quotient
	; and 32-bit remainder.
	;
	; This routine assumes that denom is nonzero, that
	; num.sd > 64, and that num.hi.sd < denom.sd.  It
	; follows that quo.sd <= 64.
	;
	; The other case where denom is nonzero, numer.sd > 64,
	; and quo fits into 64 bits, is num.hi.sd = denom.sd.
	; That case is covered by uvdiv96by32to96, which then
	; yields zero for the most significant 32 bits of its
	; quo.
	;
	; input:      r3:r4:r5 = num.hi:num.mi:num.lo
	;             r6       = denom <> 0
	;	      r7       = num.hi.lz = num.lz
	;	      r8       = denom.lz < num.hi.lz
	; output:     r3:r4:r5 = 0:quo.hi:quo.lo
	;             r6       = rem
	;	      r0,r7-r8 = ???

	; Get alignment left shift:
	;  r8 = L
	;     = denom.sd - num.hi.sd - 1
	;     = num.hi.lz - denom.lz - 1
	; Get alignment right shift:
	;   r7 = R = 32 - L
	;      = 33 - num.hi.lz + denom.lz
	;      = 33 + num.hi.sd - denom.sd
	; Note that #iter = num.sd - denom.sd + 1 = 32 + R
	;
	subfic	r7,r7,33	; r7 = 33 - num.hi.lz
	add	r7,r8,r7	; r7 = 33 - num.hi.lz + denom.lz = R
	subfic	r8,r7,32	; r8 = 32 - R = L

	; We don't need an extra register for tmp, just identify
	; it with r3.  Left shift num.hi:num.mi:num.lo into
	; r3:r4:r5 so that tmp.sd = denom.sd - 1.
	;
	slw	r3,r3,r8	; r3 = num.hi << L
	srw	r0,r4,r7	; r0 = num.mi >> R
	or	r3,r0,r3	; r3 = num.hi << L | num.mi >> R
	slw	r4,r4,r8	; r4 = num.mi << L
	srw	r0,r5,r7	; r0 = num.lo >> R
	or	r4,r0,r4	; r4 = num.mi << L | num.lo >> R
	slw	r5,r5,r8	; r5 = num.lo << L

	; Initialize loop count and CA.
	;
	addic	r7,r7,32	; r7 = #iter = num.sd -denom.sd + 1, CA = 0
	mtctr	r7
;	li	r8,-1		; r8 = -1 for setting CA when tmp - denom >= 0

	; If denom.sd = 32, we need one extra bit of headroom
	; for tmp.  Conditionally branch to code which uses CA
	; for that.
	;
	cmpwi	r6,0
	blt	2f

	; Restoring shift and subtract loop for denom.sd < 32.
	;
0:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	adde	r3,r3,r3
;	subfc.	r0,r6,r3	; r0 = tmp - denom
;	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	cmplw	r3,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r3	; r0 = tmp - denom
	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r3,r0		; else tmp = difference
;	addic	r0,r8,1		; CA = 1
1:	bdnz	0b

	; Now r3 = rem.  Left shift quotient combination once
	; more, including last CA as least significant bit.
	;
7:	adde	r5,r5,r5
	adde	r4,r4,r4	; r4:r5 = quo
	mr	r6,r3		; r6 = rem
	li	r3,0		; r3:r4:r4 = quo
	blr

	; Restoring shift and subtract loop for denom.sd = 32.
	;
2:	li	r7,0		; for detecting CA
	li	r8,-1		; for setting CA = 1
3:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	adde	r3,r3,r3
	addze.	r0,r7		; if CA = 1
	bne	4f		; then subtract
				; else
	cmplw	r3,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r3	; r0 = tmp - denom
	blt	5f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r3,r0		; else tmp = difference
5:	bdnz	3b
	b	7b	
4:	addic	r0,r8,1		; CA = 1
	subf	r3,r6,r3	; tmp = difference
	bdnz	3b
	b	7b

udiv64by32to64:
	; Unsigned 64-bit by 32-bit divide with 64-bit quotient
	; and 32-bit remainder.

	; This routine assumes that denom is nonzero, that
	; numer.sd > 32, and that numer.hi.sd >= denom.sd.
	;
	; If numer.hi.sd > denom.sd or numer.hi = denom, then
	; quo.sd > 32.
	;
	; If numer.hi.sd = denom.sd and numer.hi < denom, then
	; quo.sd <= 32.
	;
	; input:      r4:r5  = num.hi:num.lo, numer.hi >= denom
	;             r6     = denom <> 0
	;	      r7     = num.hi.lz = num.lz
	;	      r8     = denom.lz >= num.hi.lz
	; output:     r4:r5  = quo.hi:quo.lo
	;             r6     = rem
	;	    r0,r7-r9 = ???

	; Get alignment right shift:
	;   r7 = R
	;      = num.hi.sd - denom.sd + 1
	;      = denom.lz - num.hi.lz + 1
	; Get alignment left shift:
	;   r8 = L = 32 - R
	;      = 31 - denom.lz + num.hi.lz
	; Note that #iter = num.sd - denom.sd + 1 = R + 32.
	;
	subfic	r8,r8,31	; r8 = 31 - denom.lz
	add	r8,r7,r8	; r8 = 31 - denom.lz + num.hi.lz = L
	subfic	r7,r8,32	; r7 = 32 - L = R

	; Let r9 = tmp.  Left shift num.hi:num.lo into r9:r4:r5
	; so that tmp.sd = denom.sd - 1.
	;
	srw	r9,r4,r7	; r9 = num.hi >> R
	slw	r4,r4,r8	; r4 = num.hi << L
	srw	r0,r5,r7	; r0 = num.lo >> R
	or	r4,r0,r4	; r4 = num.hi << L | num.lo >> R
	slw	r5,r5,r8	; r5 = num.lo << L

	; Initialize loop count and CA.
	;
	addic	r7,r7,32	; r7 = #iter = num.sd -denom.sd + 1, CA = 0
	mtctr	r7
;	li	r8,-1		; r8 = -1 for setting CA when tmp - denom >= 0

	; If denom.sd = 32, we need one extra bit of headroom
	; for tmp.  Conditionally branch to code which uses CA
	; for that.
	;
	cmpwi	r6,0
	blt	2f

	; Restoring shift and subtract loop for denom.sd < 32.
	;
0:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	adde	r9,r9,r9
;	subfc.	r0,r6,r9	; r0 = tmp - denom
;	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged	
	cmplw	r9,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r9	; r0 = tmp - denom
	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r9,r0		; else tmp = difference
;	addic	r0,r8,1		; CA = 1
1:	bdnz	0b

	; Now r9 = rem.  Left shift quotient combination once
	; more, including last CA as least significant bit.
	;
7:	adde	r5,r5,r5
	adde	r4,r4,r4	; r4:r5 = quo
	mr	r6,r9		; r6 = rem
	blr

	; Restoring shift and subtract loop for denom.sd = 32.
	;
2:	li	r7,0		; for detecting CA
	li	r8,-1		; for setting CA = 1
3:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	adde	r9,r9,r9
	addze.	r0,r7		; if CA = 1
	bne	4f		; then subtract
				; else
	cmplw	r9,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r9	; r0 = tmp - denom
	blt	5f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r9,r0		; else tmp = difference
5:	bdnz	3b
	b	7b	
4:	addic	r0,r8,1		; CA = 1
	subf	r9,r6,r9	; tmp = difference
	bdnz	3b
	b	7b

udiv64by32to32:
	; Unsigned 64-bit by 32-bit divide with 32-bit quotient
	; and 32-bit remainder.
	;
	; This routine assumes that denom is nonzero, that
	; numer.sd > 32, and that numer.hi.sd < denom.sd.  It
	; follows that quo.sd <= 32.
	;
	; The other case where denom is nonzero, numer.sd > 32,
	; and quo fits into 32 bits, is numer.hi.sd = denom.sd.
	; and numer.hi < denom.  That case is covered by
	; uvdiv64by32to64, which then yields zero for the most
	; significant 32 bits of its quo.
	;
	; input:      r4:r5 = num.hi:num.lo
	;             r6    = denom <> 0
	;	      r7    = num.hi.lz = num.lz
	;	      r8    = denom.lz < num.hi.lz
	; output:     r4:r5 = 0:quo.lo
	;             r6    = rem
	;	 r0,r7-r8   = ???

	; Get alignment left shift:
	;   r8 = L
	;      = denom.sd - num.hi.sd - 1
	;      = num.hi.lz - denom.lz - 1
	; Get alignment right shift:
	;   r7 = R = 32 - L
	;      = 33 - num.hi.lz + denom.lz
	;      = 33 + num.hi.sd - denom.sd
	; Note that #iter = num.sd - denom.sd + 1 = R
	;
	subfic	r7,r7,33	; r7 = 33 - num.hi.lz
	add	r7,r8,r7	; r7 = 33 - num.hi.lz + denom.lz = R
	subfic	r8,r7,32	; r8 = 32 - R = L

	; We don't need an extra register for tmp, just identify
	; it with r4.  Left shift num.hi:num.lo into r4:r5 so
	; that tmp.sd = denom.sd - 1.
	;
	slw	r4,r4,r8	; r4 = num.hi << L
	srw	r0,r5,r7	; r0 = num.lo >> R
	or	r4,r0,r4	; r4 = num.mi << L | num.lo >> R
	slw	r5,r5,r8	; r5 = num.lo << L

	; Initialize loop count and CA.
	;
	addic	r7,r7,0		; CA = 0
	mtctr	r7		; #iter = num.sd - denom.sd - 1

	; If denom.sd = 32, we need one extra bit of headroom
	; for tmp.  Conditionally branch to code which uses CA
	; for that.
	;
	cmpwi	r6,0
	blt	2f

	; Restoring shift and subtract loop for denom.sd < 32.
	;
0:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	cmplw	r4,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r4	; r0 = tmp - denom
	blt	1f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r4,r0		; else CA = 1, tmp = difference
1:	bdnz	0b

	; Now r4 = rem.  Left shift the quotient once
	; more, including last CA as least significant bit.
	;
7:	adde	r5,r5,r5	; r5 = quo
	mr	r6,r4		; r6 = rem
	li	r4,0		; r4:r5 = quo
	blr

	; Restoring shift and subtract loop for denom.sd = 32.
	;
2:	li	r7,0		; for detecting CA
	li	r8,-1		; for setting CA = 1
3:	adde	r5,r5,r5	; shift full combination left one bit,
	adde	r4,r4,r4	;   adding previous CA
	addze.	r0,r7		; if CA = 1
	bne	4f		; then subtract
				; else
	cmplw	r4,r6		; tmp < denom? (unsigned)
	subfc	r0,r6,r4	; r0 = tmp - denom
	blt	5f		; if tmp < denom, then CA = 0
				;   and tmp is unchanged
	mr	r4,r0		; else tmp = difference
5:	bdnz	3b
	b	7b	
4:	addic	r0,r8,1		; CA = 1
	subf	r4,r6,r4	; tmp = difference
	bdnz	3b
	b	7b