diff --git a/Makefile b/Makefile index 2ae98f6..84d3bce 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ include magic.mk CFLAGS = -march=armv5te -mfloat-abi=soft -Wall \ -Os -ggdb -Iinclude +AFLAGS = -D__ASSEMBLY__ LDFLAGS = --nostdlib -T fernvale.ld LIBS = @@ -12,12 +13,17 @@ SRC_C = \ cmd-irq.c \ irq.c \ main.c \ + scriptic.c \ serial.c \ utils.c \ vectors.c \ vsprintf.c SRC_S = \ + scriptic/set-plls.S \ + scriptic/enable-psram.S \ + _udivsi3.S \ + _divsi3.S \ start.S OBJ = $(addprefix $(BUILD)/, $(SRC_S:.S=.o) $(SRC_C:.c=.o)) @@ -52,9 +58,9 @@ $(OBJ): $(HEADER_BUILD)/generated.h | $(OBJ_DIRS) $(HEADER_BUILD)/generated.h: | $(HEADER_BUILD) touch $@ -OBJ_DIRS = $(sort $(dir $(OBJ))) +OBJ_DIRS = $(sort $(dir $(OBJ))) scriptic $(OBJ_DIRS): - $(MKDIR) -p $@ + $(MKDIR) -p $@ $@/scriptic $(HEADER_BUILD): - $(MKDIR) -p $@ + $(MKDIR) -p $@ build/scriptic -include $(OBJ:.o=.P) diff --git a/_divsi3.S b/_divsi3.S new file mode 100644 index 0000000..cfbadb2 --- /dev/null +++ b/_divsi3.S @@ -0,0 +1,142 @@ + +.macro ARM_DIV_BODY dividend, divisor, result, curbit + +#if __LINUX_ARM_ARCH__ >= 5 + + clz \curbit, \divisor + clz \result, \dividend + sub \result, \curbit, \result + mov \curbit, #1 + mov \divisor, \divisor, lsl \result + mov \curbit, \curbit, lsl \result + mov \result, #0 + +#else + + @ Initially shift the divisor left 3 bits if possible, + @ set curbit accordingly. This allows for curbit to be located + @ at the left end of each 4 bit nibbles in the division loop + @ to save one loop in most cases. + tst \divisor, #0xe0000000 + moveq \divisor, \divisor, lsl #3 + moveq \curbit, #8 + movne \curbit, #1 + + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. +1: cmp \divisor, #0x10000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #4 + movlo \curbit, \curbit, lsl #4 + blo 1b + + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. +1: cmp \divisor, #0x80000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #1 + movlo \curbit, \curbit, lsl #1 + blo 1b + + mov \result, #0 + +#endif + + @ Division loop +1: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + orrhs \result, \result, \curbit + cmp \dividend, \divisor, lsr #1 + subhs \dividend, \dividend, \divisor, lsr #1 + orrhs \result, \result, \curbit, lsr #1 + cmp \dividend, \divisor, lsr #2 + subhs \dividend, \dividend, \divisor, lsr #2 + orrhs \result, \result, \curbit, lsr #2 + cmp \dividend, \divisor, lsr #3 + subhs \dividend, \dividend, \divisor, lsr #3 + orrhs \result, \result, \curbit, lsr #3 + cmp \dividend, #0 @ Early termination? + movnes \curbit, \curbit, lsr #4 @ No, any more bits to do? + movne \divisor, \divisor, lsr #4 + bne 1b + +.endm + +.macro ARM_DIV2_ORDER divisor, order + +#if __LINUX_ARM_ARCH__ >= 5 + + clz \order, \divisor + rsb \order, \order, #31 + +#else + + cmp \divisor, #(1 << 16) + movhs \divisor, \divisor, lsr #16 + movhs \order, #16 + movlo \order, #0 + + cmp \divisor, #(1 << 8) + movhs \divisor, \divisor, lsr #8 + addhs \order, \order, #8 + + cmp \divisor, #(1 << 4) + movhs \divisor, \divisor, lsr #4 + addhs \order, \order, #4 + + cmp \divisor, #(1 << 2) + addhi \order, \order, #3 + addls \order, \order, \divisor, lsr #1 + +#endif + +.endm + + .align 5 +.globl __divsi3 +.globl __aeabi_idiv +__divsi3: +__aeabi_idiv: + cmp r1, #0 + eor ip, r0, r1 @ save the sign of the result. + beq Ldiv0 + rsbmi r1, r1, #0 @ loops below use unsigned. + subs r2, r1, #1 @ division by 1 or -1 ? + beq 10f + movs r3, r0 + rsbmi r3, r0, #0 @ positive dividend value + cmp r3, r1 + bls 11f + tst r1, r2 @ divisor is power of 2 ? + beq 12f + + ARM_DIV_BODY r3, r1, r0, r2 + + cmp ip, #0 + rsbmi r0, r0, #0 + mov pc, lr + +10: teq ip, r0 @ same sign ? + rsbmi r0, r0, #0 + mov pc, lr + +11: movlo r0, #0 + moveq r0, ip, asr #31 + orreq r0, r0, #1 + mov pc, lr + +12: ARM_DIV2_ORDER r1, r2 + + cmp ip, #0 + mov r0, r3, lsr r2 + rsbmi r0, r0, #0 + mov pc, lr + +Ldiv0: + + str lr, [sp, #-4]! + bl __div0 + mov r0, #0 @ About as wrong as it could be. + ldr pc, [sp], #4 diff --git a/_udivsi3.S b/_udivsi3.S new file mode 100644 index 0000000..1309802 --- /dev/null +++ b/_udivsi3.S @@ -0,0 +1,93 @@ +/* # 1 "libgcc1.S" */ +@ libgcc1 routines for ARM cpu. +@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) +dividend .req r0 +divisor .req r1 +result .req r2 +curbit .req r3 +/* ip .req r12 */ +/* sp .req r13 */ +/* lr .req r14 */ +/* pc .req r15 */ + .text + .globl __udivsi3 + .type __udivsi3 ,function + .globl __aeabi_uidiv + .type __aeabi_uidiv ,function + .align 0 + __udivsi3: + __aeabi_uidiv: + cmp divisor, #0 + beq Ldiv0 + mov curbit, #1 + mov result, #0 + cmp dividend, divisor + bcc Lgot_result +Loop1: + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. + cmp divisor, #0x10000000 + cmpcc divisor, dividend + movcc divisor, divisor, lsl #4 + movcc curbit, curbit, lsl #4 + bcc Loop1 +Lbignum: + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. + cmp divisor, #0x80000000 + cmpcc divisor, dividend + movcc divisor, divisor, lsl #1 + movcc curbit, curbit, lsl #1 + bcc Lbignum +Loop3: + @ Test for possible subtractions, and note which bits + @ are done in the result. On the final pass, this may subtract + @ too much from the dividend, but the result will be ok, since the + @ "bit" will have been shifted out at the bottom. + cmp dividend, divisor + subcs dividend, dividend, divisor + orrcs result, result, curbit + cmp dividend, divisor, lsr #1 + subcs dividend, dividend, divisor, lsr #1 + orrcs result, result, curbit, lsr #1 + cmp dividend, divisor, lsr #2 + subcs dividend, dividend, divisor, lsr #2 + orrcs result, result, curbit, lsr #2 + cmp dividend, divisor, lsr #3 + subcs dividend, dividend, divisor, lsr #3 + orrcs result, result, curbit, lsr #3 + cmp dividend, #0 @ Early termination? + movnes curbit, curbit, lsr #4 @ No, any more bits to do? + movne divisor, divisor, lsr #4 + bne Loop3 +Lgot_result: + mov r0, result + mov pc, lr +Ldiv0: + str lr, [sp, #-4]! + bl __div0 (PLT) + mov r0, #0 @ about as wrong as it could be + ldmia sp!, {pc} + .size __udivsi3 , . - __udivsi3 + +.globl __aeabi_uidivmod +__aeabi_uidivmod: + + stmfd sp!, {r0, r1, ip, lr} + bl __aeabi_uidiv + ldmfd sp!, {r1, r2, ip, lr} + mul r3, r0, r2 + sub r1, r1, r3 + mov pc, lr + +.globl __aeabi_idivmod +__aeabi_idivmod: + + stmfd sp!, {r0, r1, ip, lr} + bl __aeabi_idiv + ldmfd sp!, {r1, r2, ip, lr} + mul r3, r0, r2 + sub r1, r1, r3 + mov pc, lr diff --git a/start.S b/start.S index aa77d52..51b9fc6 100644 --- a/start.S +++ b/start.S @@ -89,24 +89,6 @@ rv_end: .text -.global __udiv64 -__udiv64: - adds r0,r0,r0 - adc r1,r1,r1 - - .rept 31 - cmp r1,r2 - subcs r1,r1,r2 - adcs r0,r0,r0 - adc r1,r1,r1 - .endr - - cmp r1,r2 - subcs r1,r1,r2 - adcs r0,r0,r0 - - bx lr - .global ram_memcpy ram_memcpy: ldr r0, =ram_memcpy_addr diff --git a/vsprintf.c b/vsprintf.c index b43ea4d..6d21faf 100644 --- a/vsprintf.c +++ b/vsprintf.c @@ -251,7 +251,7 @@ static noinline char* put_dec(char *buf, unsigned NUM_TYPE num) unsigned rem; if (num < 100000) return put_dec_trunc(buf, num); - rem = _udiv64(num, 100000); + rem = do_div(num, 100000); buf = put_dec_full(buf, rem); } }