Diffstat (limited to 'src/include/i386/sstring.h')
-rw-r--r-- | src/include/i386/sstring.h | 414
1 files changed, 414 insertions, 0 deletions
diff --git a/src/include/i386/sstring.h b/src/include/i386/sstring.h
new file mode 100644
index 0000000..bd2dd2d
--- /dev/null
+++ b/src/include/i386/sstring.h
@@ -0,0 +1,414 @@
#if 0
#ifndef _I386_STRING_I486_H_
#define _I386_STRING_I486_H_
#if defined(__OPTIMIZE__) && defined(__GNUC__) && defined(__i386__)
/*
 * This string-include defines all string functions as inline
 * functions. Use gcc. It also assumes ds=es=data space, this should be
 * normal. Most of the string functions are rather heavily hand-optimized,
 * see especially strtok,strstr,str[c]spn. They should work, but are not
 * very easy to understand. Everything is done entirely within the register
 * set, making the functions fast and clean.
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Revised and optimized for i486/pentium
 * 1994/03/15 by Alberto Vignani/Davide Parodi @crf.it
 *
 * Split into 2 CPU specific files by Alan Cox to keep #ifdef noise down.
 *
 * Revised and optimized again by Jan Hubicka (1997/11/16)
 * (please report bugs to hubicka@paru.cas.cz)
 *
 * The memset and memcpy routines seem to be consistently faster on the
 * 486 and Pentium, but on the Pentium MMX they are sometimes a bit
 * slower (5-10%); because register allocation is less constrained,
 * they let GCC produce better code.
 */


#define __HAVE_ARCH_MEMCPY
#define memcpy(d,s,count) \
(__builtin_constant_p(count) ? \
 __memcpy_c((d),(s),(count)) : \
 __memcpy_g((d),(s),(count)))

/*
 * These ought to get tweaked to do some cache priming.
 */


/* This implementation of memcpy is designed for moveoldpoints from
 * mkrealloctables. It is expected to work well for both small and large
 * sizes.
 *
 * Small (1-10) and medium (300) sizes seem to be the important ones for
 * XaoS, so the implementation is not super fast for large sizes; my
 * experiments don't show large improvements in speed there anyway.
 *
 * We use rep movsX operations (they work well on the PPro and don't seem
 * to be too bad on the Pentium) and expect the direction flag to be
 * clear (cld). Hopefully that will not cause problems.
 *
 * My attempt was to use C code where possible and let GCC do the rest.
 */
extern inline void *__memcpy_g(void *to, const register void *from,
                               register size_t n);
extern inline void *__memcpy_g(void *to, const register void *from,
                               register size_t n)
{
    register void *tmp = (void *) to;
    if (n >= 7) {
        register int c = (-(int) to) & 3;
        n -= c;
        __asm__ __volatile__( /* Align the destination */
            "rep\n\tmovsb":"=c"(c), "=D"(tmp),
            "=S"(from):"c"(c), "D"((long) tmp),
            "S"((long) from):"memory");
        c = n >> 2;
        __asm__ __volatile__( /* Copy the main body as 32-bit words */
            "rep\n\tmovsl":"=c"(c), "=D"(tmp),
            "=S"(from):"c"(c), "D"((long) tmp),
            "S"((long) from):"memory");
        n &= 3;
    }
    __asm__ __volatile__( /* Copy the remaining tail bytes */
        "rep\n\tmovsb":"=c"(n), "=D"(tmp),
        "=S"(from):"c"(n), "D"((long) tmp),
        "S"((long) from):"memory");
    return (to);
}
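/*
 * A minimal portable C sketch of the technique __memcpy_g uses above
 * (align the destination, move the body as 32-bit words, then copy the
 * tail bytes).  Illustration only; it is not part of the original header,
 * and the name sketch_memcpy is hypothetical.  It assumes the i386 ABI:
 * unsigned long is 32 bits wide, pointers fit in size_t, and unaligned
 * 32-bit loads from the source are permitted.
 */
static void *sketch_memcpy(void *to, const void *from, size_t n)
{
    unsigned char *d = (unsigned char *) to;
    const unsigned char *s = (const unsigned char *) from;
    if (n >= 7) {
        size_t head = (0 - (size_t) d) & 3;  /* bytes until d is 4-aligned */
        n -= head;
        while (head--)                       /* align the destination */
            *d++ = *s++;
        for (; n >= 4; n -= 4) {             /* main body, a word at a time */
            *(unsigned long *) d = *(const unsigned long *) s;
            d += 4;
            s += 4;
        }
    }
    while (n--)                              /* remaining tail bytes */
        *d++ = *s++;
    return to;
}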
/*
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as the count is constant.
 */

#define COMMON(x) \
__asm__ __volatile__ ( \
    "\n.align 4\n" \
    "1:\tmovl (%2),%0\n\t" \
    "addl $4,%2\n\t" \
    "movl %0,(%1)\n\t" \
    "addl $4,%1\n\t" \
    "decl %3\n\t" \
    "jnz 1b\n" \
    x \
    :"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2) \
    :"1" (tmp), "2" (from), "3" (n/4) \
    :"memory"); \
return (to);

extern inline void *__memcpy_c(void *to, const void *from, size_t n);
extern inline void *__memcpy_c(void *to, const void *from, size_t n)
{
    if (n < 24) {
        if (n >= 4)
            ((unsigned long *) to)[0] = ((const unsigned long *) from)[0];
        if (n >= 8)
            ((unsigned long *) to)[1] = ((const unsigned long *) from)[1];
        if (n >= 12)
            ((unsigned long *) to)[2] = ((const unsigned long *) from)[2];
        if (n >= 16)
            ((unsigned long *) to)[3] = ((const unsigned long *) from)[3];
        if (n >= 20)
            ((unsigned long *) to)[4] = ((const unsigned long *) from)[4];
        switch ((unsigned int) (n % 4)) {
        case 3:
            ((unsigned short *) to)[n / 2 - 1] =
                ((const unsigned short *) from)[n / 2 - 1];
            ((unsigned char *) to)[n - 1] =
                ((const unsigned char *) from)[n - 1];
            return to;
        case 2:
            ((unsigned short *) to)[n / 2 - 1] =
                ((const unsigned short *) from)[n / 2 - 1];
            return to;
        case 1:
            ((unsigned char *) to)[n - 1] =
                ((const unsigned char *) from)[n - 1];
        case 0:
            return to;
        }
    }
    {
        register void *tmp = (void *) to;
        register int dummy1, dummy2;
        switch ((unsigned int) (n % 4)) {
        case 0:
            COMMON("");
        case 1:
            COMMON("movb (%2),%b0 ; movb %b0,(%1)");
        case 2:
            COMMON("movw (%2),%w0 ; movw %w0,(%1)");
        case 3:
            COMMON("movw (%2),%w0 ; movw %w0,(%1)\n\t"
                   "movb 2(%2),%b0 ; movb %b0,2(%1)");
        }
    }
    return to;
}

#undef COMMON


#define __HAVE_ARCH_MEMMOVE
extern inline void *memmove(void *dest, const void *src, size_t n);
extern inline void *memmove(void *dest, const void *src, size_t n)
{
    register void *tmp = (void *) dest;
    if (dest < src)
        __asm__ __volatile__("cld\n\t" "rep\n\t" "movsb": /* no output */
                             :"c"(n), "S"(src), "D"(tmp)
                             :"cx", "si", "di", "memory");
    else
        __asm__ __volatile__("std\n\t" "rep\n\t" "movsb\n\t" "cld": /* no output */
                             :"c"(n), "S"(n - 1 + (const char *) src),
                              "D"(n - 1 + (char *) tmp)
                             :"cx", "si", "di", "memory");
    return dest;
}

#define memcmp __builtin_memcmp

#define __HAVE_ARCH_MEMCHR
extern inline void *memchr(const void *cs, int c, size_t count);
extern inline void *memchr(const void *cs, int c, size_t count)
{
    register void *__res;
    if (!count)
        return NULL;
    __asm__ __volatile__("cld\n\t"
                         "repne\n\t"
                         "scasb\n\t"
                         "je 1f\n\t"
                         "movl $1,%0\n"
                         "1:\tdecl %0":"=D"(__res):"a"(c), "D"(cs),
                         "c"(count):"cx");
    return __res;
}
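/*
 * A minimal portable sketch of the overlap rule that memmove above
 * implements with rep movsb: copy forward when the destination starts
 * below the source, backward otherwise, so bytes still to be read are
 * never overwritten first.  Illustration only; sketch_memmove is a
 * hypothetical name and not part of the original header.
 */
static void *sketch_memmove(void *dest, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *) dest;
    const unsigned char *s = (const unsigned char *) src;
    if (d < s) {
        while (n--)             /* forward, like "cld; rep movsb" */
            *d++ = *s++;
    } else {
        while (n--)             /* backward, like "std; rep movsb" */
            d[n] = s[n];
    }
    return dest;
}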

#define __HAVE_ARCH_MEMSET
#define memset(s,c,count) \
(__builtin_constant_p(c) ? \
 (__builtin_constant_p(count) ? \
  __memset_cc((s),(c),(count)) : \
  __memset_cg((s),(c),(count))) : \
 (__builtin_constant_p(count) ? \
  __memset_gc((s),(c),(count)) : \
  __memset_gg((s),(c),(count))))


extern inline void *__memset_cg(void *s, char c, size_t count);
extern inline void *__memset_cg(void *s, char c, size_t count)
{
    int tmp2;
    register void *tmp = (void *) s;
    __asm__ __volatile__("shrl $1,%%ecx\n\t"
                         "rep\n\t"
                         "stosw\n\t"
                         "jnc 1f\n\t"
                         "movb %%al,(%%edi)\n"
                         "1:":"=c"(tmp2), "=D"(tmp):"c"(count), "D"(tmp),
                         "a"(0x0101U * (unsigned char) c):"memory");
    return s;
}

extern inline void *__memset_gg(void *s, char c, size_t count);
extern inline void *__memset_gg(void *s, char c, size_t count)
{
    register void *tmp = (void *) s;
    int tmp2;
    __asm__ __volatile__("movb %%al,%%ah\n\t"
                         "shrl $1,%%ecx\n\t"
                         "rep\n\t"
                         "stosw\n\t"
                         "jnc 1f\n\t"
                         "movb %%al,(%%edi)\n"
                         "1:":"=c"(tmp2), "=D"(tmp):"c"(count), "D"(tmp),
                         "a"(c):"memory");
    return s;
}

/*
 * These non-rep routines are not much faster (they are slower for small
 * strings), but they allow better register allocation.
 */
#define COMMON(x) \
__asm__ __volatile__ ( \
    "\n.align 4\n" \
    "1:\tmovl %k2,(%k0)\n\t" \
    "addl $4,%k0\n\t" \
    "decl %k1\n\t" \
    "jnz 1b\n" \
    x \
    :"=r" (tmp), "=r" (dummy) \
    :"q" ((unsigned) pattern), "0" (tmp), "1" (count/4) \
    :"memory"); \
return s;

extern inline void *__memset_cc(void *s, unsigned long pattern,
                                size_t count);
extern inline void *__memset_cc(void *s, unsigned long pattern,
                                size_t count)
{
    pattern = ((unsigned char) pattern) * 0x01010101UL;
    if (count < 24) {
        /* Handle small sizes manually, since rep is incredibly slow there */
        if (count >= 4)
            *(unsigned long *) s = pattern;
        if (count >= 8)
            ((unsigned long *) s)[1] = pattern;
        if (count >= 12)
            ((unsigned long *) s)[2] = pattern;
        if (count >= 16)
            ((unsigned long *) s)[3] = pattern;
        if (count >= 20)
            ((unsigned long *) s)[4] = pattern;
        switch ((unsigned int) (count % 4)) {
        case 3:
            ((unsigned short *) s)[count / 2 - 1] = pattern;
            ((unsigned char *) s)[count - 1] = pattern;
            return s;
        case 2:
            ((unsigned short *) s)[count / 2 - 1] = pattern;
            return s;
        case 1:
            ((unsigned char *) s)[count - 1] = pattern;
            /* fall through */
        case 0:
            return s;
        }
    } else {
        register void *tmp = (void *) s;
        register int dummy;
        switch ((unsigned int) (count % 4)) {
        case 0:
            COMMON("");
        case 1:
            COMMON("movb %b2,(%0)");
        case 2:
            COMMON("movw %w2,(%0)");
        case 3:
            COMMON("movw %w2,(%0)\n\tmovb %b2,2(%0)");
        }
    }
    return s;
}
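/*
 * A sketch of the fill-pattern trick the memset variants above rely on:
 * multiplying an unsigned byte by 0x01010101 replicates it into all four
 * bytes of a 32-bit word (0xAB * 0x01010101 == 0xABABABAB), so the word
 * can be stored with stosw/stosl instead of one stosb per byte.  The
 * inline asm in __memset_gc below achieves the same effect with
 * movb/shll/pushw/popw.  Illustration only; sketch_replicate is a
 * hypothetical name, and a 32-bit unsigned long (i386) is assumed.
 */
static unsigned long sketch_replicate(unsigned char c)
{
    return c * 0x01010101UL;
}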
extern inline void *__memset_gc(void *s, unsigned long pattern,
                                size_t count);
extern inline void *__memset_gc(void *s, unsigned long pattern,
                                size_t count)
{
    if (count < 4) {
        if (count > 1)
            __asm__("movb %b0,%h0\n\t":"=q"(pattern)
                    :"0"((unsigned) pattern));
        switch ((unsigned int) (count)) {
        case 3:
            ((unsigned short *) s)[0] = pattern;
            ((unsigned char *) s)[2] = pattern;
            return s;
        case 2:
            *((unsigned short *) s) = pattern;
            return s;
        case 1:
            *(unsigned char *) s = pattern;
            /* fall through */
        case 0:
            return s;
        }
    }

    /* Replicate the low byte into all four bytes of the pattern word */
    __asm__("movb %b0,%h0\n\t"
            "pushw %w0\n\t"
            "shll $16,%k0\n\t"
            "popw %w0\n":"=q"(pattern)
            :"0"((unsigned) pattern));

    if (count < 24) {
        /* Handle small sizes manually, since rep is incredibly slow there */
        *(unsigned long *) s = pattern;
        if (count >= 8)
            ((unsigned long *) s)[1] = pattern;
        if (count >= 12)
            ((unsigned long *) s)[2] = pattern;
        if (count >= 16)
            ((unsigned long *) s)[3] = pattern;
        if (count >= 20)
            ((unsigned long *) s)[4] = pattern;
        switch ((unsigned int) (count % 4)) {
        case 3:
            ((unsigned short *) s)[count / 2 - 1] = pattern;
            ((unsigned char *) s)[count - 1] = pattern;
            return s;
        case 2:
            ((unsigned short *) s)[count / 2 - 1] = pattern;
            return s;
        case 1:
            ((unsigned char *) s)[count - 1] = pattern;
            /* fall through */
        case 0:
            return s;
        }
    } else {
        register void *tmp = (void *) s;
        register int dummy;
        switch ((unsigned int) (count % 4)) {
        case 0:
            COMMON("");
        case 1:
            COMMON("movb %b2,(%0)");
        case 2:
            COMMON("movw %w2,(%0)");
        case 3:
            COMMON("movw %w2,(%0)\n\tmovb %b2,2(%0)");
        }
    }
    return s;
}

#undef COMMON


/*
 * find the first occurrence of byte 'c', or 1 past the area if none
 */
#define __HAVE_ARCH_MEMSCAN
extern inline void *memscan(void *addr, int c, size_t size);
extern inline void *memscan(void *addr, int c, size_t size)
{
    if (!size)
        return addr;
    __asm__ __volatile__("cld\n\t"
                         "repnz; scasb\n\t"
                         "jnz 1f\n\t"
                         "dec %%edi\n\t"
                         "1:":"=D"(addr), "=c"(size):"0"(addr), "1"(size),
                         "a"(c));
    return addr;
}

#define memset_long(x,y,z) __memset_long(x,y,z)

extern inline void *__memset_long(void *s, char c, size_t count);
extern inline void *__memset_long(void *s, char c, size_t count)
{
    /* Zero-extend the fill byte; sign extension would corrupt the
       replicated fill word built by the shifts below */
    register unsigned int fill = (unsigned char) c;
    register void *tmp = (void *) s;
    if (count >= 7) {
        register int c = (-(int) s) & 3;
/*      __asm__ __volatile__ ("movb %b0,%h0":"=r"(fill):"r"(fill)); */
        fill |= fill << 8;
        count -= c;
        fill |= fill << 16;
        __asm__ __volatile__( /* Align the destination */
            "rep\n\tstosb":"=c"(c), "=D"(tmp):"c"(c),
            "D"(tmp), "a"(fill):"memory");
        c = count >> 2;
        __asm__ __volatile__( /* Fill the main body with 32-bit stores */
            "rep\n\tstosl":"=c"(c), "=D"(tmp):"c"(c),
            "D"(tmp), "a"(fill):"memory");
        count &= 3;
    }
    __asm__ __volatile__( /* Fill the remaining tail bytes */
        "rep\n\tstosb":"=c"(count), "=D"(tmp):"c"(count),
        "D"(tmp), "a"((char) fill):"memory");
    return s;
}
#endif
#endif
#endif
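The memcpy and memset wrappers at the top of the header dispatch at compile
time through GCC's __builtin_constant_p, which folds to 1 when its argument
is a compile-time constant, so constant sizes reach the unrolled __*_c
variants while runtime sizes fall through to the generic __*_g ones. A
minimal sketch of the idiom, assuming GCC; copy_const and copy_generic are
hypothetical names standing in for the real variants:

#define copy(d,s,n) \
(__builtin_constant_p(n) ? \
 copy_const((d),(s),(n)) : \
 copy_generic((d),(s),(n)))

/* copy(buf, src, 16);   size is a literal      -> expands to copy_const   */
/* copy(buf, src, len);  size known at runtime  -> expands to copy_generic */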