diff --git a/include/tonccpy.h b/include/tonccpy.h
new file mode 100644
index 0000000000..1be8f92cdf
--- /dev/null
+++ b/include/tonccpy.h
@@ -0,0 +1,54 @@
+//# Stuff you may not have yet.
+
+#ifndef TONCCPY_H
+#define TONCCPY_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>       // NULL
+#include <nds/ndstypes.h> // u8, u16, u32
+
+typedef unsigned int uint;
+//! Mask with the low \a len bits set. \a len must be less than 31.
+#define BIT_MASK(len) ( (1<<(len))-1 )
+
+//! Replicate a byte into all four bytes of a word (0xAB -> 0xABABABAB).
+/*! The work is done in u32 on purpose: accumulating in a u8 would
+    truncate x<<8 away and leave bytes 1 and 3 of the result zero.
+*/
+static inline u32 quad8(u8 x) { return (u32)x * 0x01010101u; }
+
+
+//# Declarations and inlines.
+
+//! Copy \a size bytes from \a src to \a dst.
+void tonccpy(void *dst, const void *src, uint size);
+
+//! Fill \a size bytes at \a dst with the word pattern \a fill.
+void __toncset(void *dst, u32 fill, uint size);
+static inline void toncset(void *dst, u8 src, uint size);
+static inline void toncset16(void *dst, u16 src, uint size);
+static inline void toncset32(void *dst, u32 src, uint size);
+
+
+//! VRAM-safe memset, byte version. Size in bytes.
+static inline void toncset(void *dst, u8 src, uint size)
+{ __toncset(dst, quad8(src), size); }
+
+//! VRAM-safe memset, halfword version. Size in hwords.
+// src is widened before the shift: a promoted u16 is a signed int, and
+// shifting a value with bit 15 set left by 16 would overflow it.
+static inline void toncset16(void *dst, u16 src, uint size)
+{ __toncset(dst, (u32)src | (u32)src<<16, size*2); }
+
+//! VRAM-safe memset, word version. Size in words.
+static inline void toncset32(void *dst, u32 src, uint size)
+{ __toncset(dst, src, size*4); }
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/lib/src/_Ldtob.c b/lib/src/_Ldtob.c index 3656bd45af..7bf50da2ff 100644 --- a/lib/src/_Ldtob.c +++ b/lib/src/_Ldtob.c @@ -76,7 +76,11 @@ void _Ldtob(printf_struct *args, u8 type) { } err = _Ldunscale(&exp, args); if (err > 0) { - memcpy(args->buff, err == 2 ? "NaN" : "Inf", args->part2_len = 3); + #ifdef TARGET_NDS + tonccpy(args->buff, err == 2 ? "NaN" : "Inf", args->part2_len = 3); + #else + memcpy(args->buff, err == 2 ? 
"NaN" : "Inf", args->part2_len = 3); + #endif return; } if (err == 0) { @@ -212,10 +216,18 @@ static void _Genld(printf_struct *px, u8 code, u8 *p, s16 nsig, s16 xexp) { if (px->precision < nsig) { nsig = px->precision; } - memcpy(&px->buff[px->part2_len], p, px->part3_len = nsig); + #ifdef TARGET_NDS + tonccpy(&px->buff[px->part2_len], p, px->part3_len = nsig); + #else + memcpy(&px->buff[px->part2_len], p, px->part3_len = nsig); + #endif px->num_trailing_zeros = px->precision - nsig; } else if (nsig < xexp) { /* zeros before point */ - memcpy(&px->buff[px->part2_len], p, nsig); + #ifdef TARGET_NDS + tonccpy(&px->buff[px->part2_len], p, nsig); + #else + memcpy(&px->buff[px->part2_len], p, nsig); + #endif px->part2_len += nsig; px->num_mid_zeros = xexp - nsig; if (0 < px->precision || px->flags & FLAGS_HASH) { @@ -223,7 +235,11 @@ static void _Genld(printf_struct *px, u8 code, u8 *p, s16 nsig, s16 xexp) { } px->num_trailing_zeros = px->precision; } else { /* enough digits before point */ - memcpy(&px->buff[px->part2_len], p, xexp); + #ifdef TARGET_NDS + tonccpy(&px->buff[px->part2_len], p, xexp); + #else + memcpy(&px->buff[px->part2_len], p, xexp); + #endif px->part2_len += xexp; nsig -= xexp; if (0 < px->precision || px->flags & FLAGS_HASH) { @@ -232,7 +248,11 @@ static void _Genld(printf_struct *px, u8 code, u8 *p, s16 nsig, s16 xexp) { if (px->precision < nsig) { nsig = px->precision; } - memcpy(&px->buff[px->part2_len], p + xexp, nsig); + #ifdef TARGET_NDS + tonccpy(&px->buff[px->part2_len], p + xexp, nsig); + #else + memcpy(&px->buff[px->part2_len], p + xexp, nsig); + #endif px->part2_len += nsig; px->num_mid_zeros = px->precision - nsig; } @@ -254,7 +274,11 @@ static void _Genld(printf_struct *px, u8 code, u8 *p, s16 nsig, s16 xexp) { if (px->precision < --nsig) { nsig = px->precision; } - memcpy(&px->buff[px->part2_len], p, nsig); + #ifdef TARGET_NDS + tonccpy(&px->buff[px->part2_len], p, nsig); + #else + memcpy(&px->buff[px->part2_len], p, nsig); + #endif 
px->part2_len += nsig; px->num_mid_zeros = px->precision - nsig; } diff --git a/lib/src/_Litob.c b/lib/src/_Litob.c index 58ed081fff..526efefcd4 100644 --- a/lib/src/_Litob.c +++ b/lib/src/_Litob.c @@ -44,7 +44,11 @@ void _Litob(printf_struct *args, u8 type) { args->part2_len = BUFF_LEN - buff_ind; - memcpy(args->buff, buff + buff_ind, args->part2_len); + #ifdef TARGET_NDS + tonccpy(args->buff, buff + buff_ind, args->part2_len); + #else + memcpy(args->buff, buff + buff_ind, args->part2_len); + #endif if (args->part2_len < args->precision) { args->num_leading_zeros = args->precision - args->part2_len; diff --git a/lib/src/guMtxF2L.c b/lib/src/guMtxF2L.c index 2b9d93592c..11fe0da9e1 100644 --- a/lib/src/guMtxF2L.c +++ b/lib/src/guMtxF2L.c @@ -42,7 +42,11 @@ void guMtxL2F(float mf[4][4], Mtx *m) { } #else void guMtxF2L(float mf[4][4], Mtx *m) { - memcpy(m, mf, sizeof(Mtx)); + #ifdef TARGET_NDS + tonccpy(m, mf, sizeof(Mtx)); + #else + memcpy(m, mf, sizeof(Mtx)); + #endif } #endif
diff --git a/lib/src/sprintf.c b/lib/src/sprintf.c
index 8450fbc530..7ce054f532 100644
--- a/lib/src/sprintf.c
+++ b/lib/src/sprintf.c
@@ -17,5 +17,12 @@ int sprintf(char *dst, const char *fmt, ...) {
 }
 
 char *proutSprintf(char *dst, const char *src, size_t count) {
-    return (char *) memcpy((u8 *) dst, (u8 *) src, count) + count;
+    #ifdef TARGET_NDS
+    // tonccpy() returns void (unlike memcpy), so the end pointer is
+    // derived from dst rather than from the call's value.
+    tonccpy(dst, src, count);
+    return dst + count;
+    #else
+    return (char *) memcpy((u8 *) dst, (u8 *) src, count) + count;
+    #endif
 }
diff --git a/src/engine/level_script.c b/src/engine/level_script.c index b0463393d9..92ebbd4318 100644 --- a/src/engine/level_script.c +++ b/src/engine/level_script.c @@ -604,7 +604,11 @@ static void level_cmd_set_terrain_data(void) { data = segmented_to_virtual(CMD_GET(void *, 4)); size = get_area_terrain_size(data) * sizeof(Collision); gAreas[sCurrAreaIndex].terrainData = alloc_only_pool_alloc(sLevelPool, size); - memcpy(gAreas[sCurrAreaIndex].terrainData, data, size); + #ifdef TARGET_NDS + tonccpy(gAreas[sCurrAreaIndex].terrainData, data, size); + #else + memcpy(gAreas[sCurrAreaIndex].terrainData, data, size); + #endif #endif } sCurrentCmd = CMD_NEXT; @@ -630,7 +634,11 @@ static void level_cmd_set_macro_objects(void) { len += 4; } gAreas[sCurrAreaIndex].macroObjects = alloc_only_pool_alloc(sLevelPool, len * sizeof(MacroObject)); - memcpy(gAreas[sCurrAreaIndex].macroObjects, data, len * sizeof(MacroObject)); + #ifdef TARGET_NDS + tonccpy(gAreas[sCurrAreaIndex].macroObjects, data, len * sizeof(MacroObject)); + #else + memcpy(gAreas[sCurrAreaIndex].macroObjects, data, len * sizeof(MacroObject)); + #endif #endif } sCurrentCmd = CMD_NEXT; diff --git a/src/game/memory.c b/src/game/memory.c index 83f55f7c3b..661000f3b8 100644 --- a/src/game/memory.c +++ b/src/game/memory.c @@ -263,7 +263,7 @@ static void dma_read(u8 *dest, u8 *srcStart, u8 *srcEnd) { size -= copySize; } #else - memcpy(dest, srcStart, srcEnd - srcStart); + tonccpy(dest, srcStart, srcEnd - srcStart); #endif }
diff --git a/src/nds/main.c b/src/nds/main.c index 571f94c87d..21e1b6ad9d 100644 --- a/src/nds/main.c +++ b/src/nds/main.c @@ -46,6 +46,8 @@ int main(void) { main_pool_init(pool, pool + sizeof(pool) / sizeof(pool[0])); gEffectsMemoryPool = mem_pool_init(0x4000, MEMORY_POOL_LEFT); + setCpuClock(true); + renderer_init(); #ifdef LIBFAT diff --git a/src/nds/nds_renderer.c b/src/nds/nds_renderer.c index 5d05ce323b..425fabbba5 100644 --- a/src/nds/nds_renderer.c +++ b/src/nds/nds_renderer.c @@ -6,6 +6,8 @@ #include "nds_renderer.h" +#include "tonccpy.h" + struct Color { uint8_t r, g, b, a; }; @@ -417,7 +419,7 @@ static void g_vtx(Gwords *words) { const Vtx *vertices = (const Vtx*)words->w1; // Store vertices in the vertex buffer - memcpy(&vertex_buffer[index - count], vertices, count * sizeof(Vtx)); + tonccpy(&vertex_buffer[index - count], vertices, count * sizeof(Vtx)); if (geometry_mode & G_LIGHTING) { // Recalculate transformed light vectors if the lights or modelview matrix changed @@ -436,9 +438,9 @@ static void g_vtx(Gwords *words) { int s = (lights[i].nx * lights[i].nx + lights[i].ny * lights[i].ny + lights[i].nz * lights[i].nz) >> 8; if (s > 0) { s = sqrt_fixed(s); - lights[i].nx = (lights[i].nx << 16) / s; - lights[i].ny = (lights[i].ny << 16) / s; - lights[i].nz = (lights[i].nz << 16) / s; + lights[i].nx = div32((lights[i].nx << 16), s); + lights[i].ny = div32((lights[i].ny << 16), s); + lights[i].nz = div32((lights[i].nz << 16), s); } } @@ -511,7 +513,7 @@ static void g_texture(Gwords *words) { static void 
g_popmtx(Gwords *words) { // Pop matrices from the modelview stack glMatrixMode(GL_MODELVIEW); - glPopMatrix(words->w1 / 64); + glPopMatrix(div32(words->w1, 64)); } static void g_geometrymode(Gwords *words) { @@ -600,7 +602,7 @@ static void g_moveword(Gwords *words) { switch (index) { case G_MW_NUMLIGHT: // Set the current number of lights, including the lookat vectors - num_lights = (words->w1 / 24) + 2; + num_lights = div32(words->w1, 24) + 2; break; case G_MW_FOG: @@ -758,19 +760,19 @@ static void g_rdphalf_2(Gwords *words) { // Draw one half of the rectangle, using depth hijacking glTexCoord2t16(s1, t1); - glVertex3v16(x1, y1, (--z_depth) / 6); + glVertex3v16(x1, y1, div32((--z_depth), 6)); glTexCoord2t16(s1, t2); - glVertex3v16(x1, y2, (--z_depth) / 6); + glVertex3v16(x1, y2, div32((--z_depth), 6)); glTexCoord2t16(s2, t1); - glVertex3v16(x2, y1, (--z_depth) / 6); + glVertex3v16(x2, y1, div32((--z_depth), 6)); // Draw the other half of the rectangle, using depth hijacking glTexCoord2t16(s2, t1); - glVertex3v16(x2, y1, (--z_depth) / 6); + glVertex3v16(x2, y1, div32((--z_depth), 6)); glTexCoord2t16(s1, t2); - glVertex3v16(x1, y2, (--z_depth) / 6); + glVertex3v16(x1, y2, div32((--z_depth), 6)); glTexCoord2t16(s2, t2); - glVertex3v16(x2, y2, (--z_depth) / 6); + glVertex3v16(x2, y2, div32((--z_depth), 6)); // Restore the original matrices glPopMatrix(1); @@ -849,14 +851,14 @@ static void g_fillrect(Gwords *words) { const int16_t y2 = -((((words->w0 >> 0) & 0xFFF) + (1 << 2)) * (2 << 12) / (240 << 2) - (1 << 12)); // Draw one half of the rectangle, using depth hijacking - glVertex3v16(x1, y1, (--z_depth) / 6); - glVertex3v16(x1, y2, (--z_depth) / 6); - glVertex3v16(x2, y1, (--z_depth) / 6); + glVertex3v16(x1, y1, div32((--z_depth), 6)); + glVertex3v16(x1, y2, div32((--z_depth), 6)); + glVertex3v16(x2, y1, div32((--z_depth), 6)); // Draw the other half of the rectangle, using depth hijacking - glVertex3v16(x2, y1, (--z_depth) / 6); - glVertex3v16(x1, y2, (--z_depth) / 
6); - glVertex3v16(x2, y2, (--z_depth) / 6); + glVertex3v16(x2, y1, div32((--z_depth), 6)); + glVertex3v16(x1, y2, div32((--z_depth), 6)); + glVertex3v16(x2, y2, div32((--z_depth), 6)); // Restore the original matrices glMatrixMode(GL_PROJECTION);
diff --git a/src/nds/tonccpy.c b/src/nds/tonccpy.c
new file mode 100644
index 0000000000..b10e960cb9
--- /dev/null
+++ b/src/nds/tonccpy.c
@@ -0,0 +1,144 @@
+#include "tonccpy.h"
+#include <stddef.h> // NULL
+//# tonccpy.c
+
+//! VRAM-safe cpy.
+/*! This version mimics memcpy in functionality, with
+    the benefit of working for VRAM as well. It is also
+    slightly faster than the original memcpy, but faster
+    implementations can be made.
+    \param dst Destination pointer.
+    \param src Source pointer.
+    \param size Number of bytes to copy.
+    \note The pointers and size need not be word-aligned.
+    \note Does nothing when \a size is 0 or either pointer is NULL.
+    \note The destination is only written with word and halfword
+      stores; a lone byte is merged into its halfword by read-modify-write.
+*/
+void tonccpy(void *dst, const void *src, uint size)
+{
+    if (size==0 || dst==NULL || src==NULL)
+        return;
+
+    uint count;
+    u16 *dst16;         // hword destination
+    const u8 *src8;     // byte source
+
+    // Ideal case: copy by 4x words. Leaves tail for later.
+    if ( ((u32)src|(u32)dst)%4==0 && size>=4)
+    {
+        const u32 *src32= (const u32*)src;
+        u32 *dst32= (u32*)dst;
+
+        count= size/4;
+        uint tmp= count&3;
+        count /= 4;
+
+        // Duff's Device, good friend! The switch jumps into the
+        // unrolled loop so the (words % 4) leftovers are copied first.
+        switch(tmp) {
+            do {    *dst32++ = *src32++;
+        case 3:     *dst32++ = *src32++;
+        case 2:     *dst32++ = *src32++;
+        case 1:     *dst32++ = *src32++;
+        case 0:     ; } while (count--);
+        }
+
+        // Check for tail
+        size &= 3;
+        if (size == 0)
+            return;
+
+        src8= (const u8*)src32;
+        dst16= (u16*)dst32;
+    }
+    else        // Unaligned.
+    {
+        uint dstOfs= (u32)dst&1;
+        src8= (const u8*)src;
+        // Step back to the containing halfword. The u8* cast matters:
+        // arithmetic directly on void* is a GNU extension, not ISO C.
+        dst16= (u16*)((u8*)dst-dstOfs);
+
+        // Head: 1 byte.
+        if (dstOfs != 0)
+        {
+            *dst16= (*dst16 & 0xFF) | *src8++<<8;
+            dst16++;
+            if (--size==0)
+                return;
+        }
+    }
+
+    // Unaligned main: copy by 2x byte.
+    count= size/2;
+    while (count--)
+    {
+        *dst16++ = src8[0] | src8[1]<<8;
+        src8 += 2;
+    }
+
+    // Tail: 1 byte.
+    if (size&1)
+        *dst16= (*dst16 &~ 0xFF) | *src8;
+}
+//# toncset.c
+
+//! 
VRAM-safe memset, internal routine.
+/*! This version mimics memset in functionality, with
+    the benefit of working for VRAM as well. It is also
+    slightly faster than the original memset.
+    \param dst Destination pointer.
+    \param fill Word to fill with.
+    \param size Fill-length in bytes.
+    \note The \a dst pointer and \a size need not be
+      word-aligned. In the case of unaligned fills, \a fill
+      will be masked off to match the situation.
+*/
+void __toncset(void *dst, u32 fill, uint size)
+{
+    if (size==0 || dst==NULL)
+        return;
+
+    uint left= (u32)dst&3;              // byte offset of dst inside its word
+    u32 *dst32= (u32*)((u8*)dst-left);  // cast first: void* arithmetic is a GNU extension
+    u32 count, mask;
+
+    // Unaligned head.
+    if (left != 0)
+    {
+        // Adjust for very small stint.
+        if (left+size<4)
+        {
+            mask= BIT_MASK(size*8)<<(left*8);
+            *dst32= (*dst32 &~ mask) | (fill & mask);
+            return;
+        }
+
+        mask= BIT_MASK(left*8);         // low bytes of the word are kept
+        *dst32= (*dst32 & mask) | (fill&~mask);
+        dst32++;
+        size -= 4-left;
+    }
+
+    // Main stint.
+    count= size/4;
+    uint tmp= count&3;
+    count /= 4;
+
+    switch(tmp) {                       // Duff's Device: jump into the unrolled loop
+        do {    *dst32++ = fill;
+    case 3:     *dst32++ = fill;
+    case 2:     *dst32++ = fill;
+    case 1:     *dst32++ = fill;
+    case 0:     ; } while (count--);
+    }
+
+    // Tail
+    size &= 3;
+    if (size)
+    {
+        mask= BIT_MASK(size*8);
+        *dst32= (*dst32 &~ mask) | (fill & mask);
+    }
+}
diff --git a/src/nds/ultra_reimplementation.c b/src/nds/ultra_reimplementation.c index 2b28ea849e..c3821570c7 100644 --- a/src/nds/ultra_reimplementation.c +++ b/src/nds/ultra_reimplementation.c @@ -14,7 +14,7 @@ u64 osClockRate = 62500000; s32 osPiStartDma(UNUSED OSIoMesg *mb, UNUSED s32 priority, UNUSED s32 direction, uintptr_t devAddr, void *vAddr, size_t nbytes, UNUSED OSMesgQueue *mq) { - memcpy(vAddr, (const void *) devAddr, nbytes); + tonccpy(vAddr, (const void *) devAddr, nbytes); return 0; } @@ -151,7 +151,7 @@ s32 osEepromLongRead(UNUSED OSMesgQueue *mq, u8 address, u8 *buffer, int nbytes) return -1; } if (fread(content, 1, 512, fp) == 512) { - memcpy(buffer, content + address * 8, nbytes); + 
tonccpy(buffer, content + address * 8, nbytes); ret = 0; } fclose(fp); @@ -164,7 +164,7 @@ s32 osEepromLongWrite(UNUSED OSMesgQueue *mq, u8 address, u8 *buffer, int nbytes if (address != 0 || nbytes != 512) { osEepromLongRead(mq, 0, content, 512); } - memcpy(content + address * 8, buffer, nbytes); + tonccpy(content + address * 8, buffer, nbytes); #ifdef TARGET_WEB EM_ASM({