bsf
コードのライセンスは全てPD。計測してなかったり、速度分かって無かったりで片手落ち。bsf命令のクロック数はここにある。テーブル引きは2clk on pentium4、シフトや+は2clk on pentium4。正しいかは知らない。
高速版+gccの最適化ミスの手動最適化(x86)
ポイント: テーブル参照に8ビットは使えないのでmovzbl %b0, %0になる。alignedは必要ないはず。gccがmovb %%al, %%alとか吐くのでまだ無駄あり。テーブルにstatic足りない気がする。bsf(0)=32なので注意。
__attribute__ ((aligned (16))) const unsigned char bsf_tab[256] = {
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
#define l(x) ((unsigned char)x)
#define x(x) ((unsigned short)x)
__attribute__ ((always_inline)) static unsigned char h(unsigned int x){
unsigned char r;
asm ("movb %h1, %b0\n" : "=q" (r) : "q" (x));
return r;
}
__attribute((always_inline)) static unsigned int bsf(unsigned int a){
if(l(a))
return bsf_tab[l(a)];
if(h(a))
return bsf_tab[h(a)]+8;
a >>= 16;
if(l(a))
return bsf_tab[l(a)]+16;
return bsf_tab[h(a)]+24;
/*
unsigned int rv = 0;
if(!x(a)){
a >>= 16;
rv = 16;
}
if(l(a))
return bsf_tab[l(a)]+rv;
return bsf_tab[h(a)]+8+rv;
*/
}
高速版
bsf(0)=32なので注意
__attribute__ ((aligned (16))) const unsigned char bsf_tab[256] = {
8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
__attribute((always_inline)) static unsigned int bsf(unsigned int x){
unsigned int i;
if((i = x&0xff)))
return bsf_tab[i];
if((i = x&0xff00))
return bsf_tab[i>>8]+8;
if((i = x&0xff0000))
return bsf_tab[i>>16]+16;
return bsf_tab[x>>24]+24;
}
多分最高速、ただし一番多そうなケースが遅い
gccの最適化しょぼすぎ。gcc -O3 -fomit-frame-pointer -Wall bsf.cでコンパイルしても、jmp retがretにならないのが残念。レジスタ一つしか使っていない。テーブル4つ用意すれば&を3つ減らせる。やっぱり&じゃなくて+の方が良いと思うのと、分岐も小さい方を優先にすべき。bsf(0)=24なので注意。movzbl %%al,%%eaxはand $0xff,%%eaxと多分同じ。andlはandbの方が良さげ?
__attribute__ ((aligned (16))) const unsigned char bsf_tab[256] = {
0+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
5+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
6+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
5+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
7+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
5+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
6+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
5+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
4+24, 24, 1+24, 24, 2+24, 24, 1+24, 24, 3+24, 24, 1+24, 24, 2+24, 24, 1+24, 24,
};
__attribute__((noinline)) __attribute__((cdecl)) static unsigned int bsf(unsigned int a){
asm (
"test %%al, %%al\n"
"jne e1\n"
"test %%ah, %%ah\n"
"jne e2\n"
"shrl $0x10,%%eax\n"
"test %%al, %%al\n"
"jne e3\n"
"movzbl %%ah,%%eax\n"
"movb bsf_tab(%%eax),%%al\n"
"jmp ret\n"
"e1:\n"
"movzbl %%al,%%eax\n"
"movb bsf_tab(%%eax),%%al\n"
"andl $0x7,%%eax\n"
"jmp ret\n"
"e2:\n"
"movzbl %%ah,%%eax\n"
"movb bsf_tab(%%eax),%%al\n"
"andl $0xf,%%eax\n"
"jmp ret\n"
"e3:\n"
"movzbl %%al,%%eax\n"
"movb bsf_tab(%%eax),%%al\n"
"andl $0x17,%%eax\n"
"ret:\n"
::);
}
小さなテーブル版
bsf(0)=32なので注意
__attribute((always_inline)) static unsigned int bsf(unsigned int x){
return "\x20\x0\x1\x6\x2\x19\x7\x0\x3\x0\xc\x1a\x8\x13\x0\x0\x4\x17\x0\x11\xf\xd\x1b\x0\x1d\x9\x14\x0\x0\x0\x0\x0\x1f\x5\x18\x0\x0\xb\x12\x0\x16\x10\xe\x0\x1c\x0\x0\x0\x1e\x0\xa\x0\x15"[(x&-x)*70428299>>26];
}