Maximus
January 10th, 2011, 09:05
Hi,
I was optimizing a small memset for speed, where I need to optimize the 128-512 byte case. This is my code:
To my surprise, it runs SLOWER than filling it manually with a mov [edi],eax / mov [edi+4],eax loop!!
even sadder, it runs SLOWER than:
any suggestion/comment?
I was optimizing a small memset for speed, where I need to optimize the 128-512 byte case. This is my code:
Code:
_asm {
    // Zero-fill `len` bytes starting at TempEndPtr (MSVC inline asm, 32-bit x86).
    //
    // Register roles:
    //   mm0 = 0   8-byte fill pattern for the bulk loop
    //   eax = 0   4-byte fill pattern for the tail loop
    //   edi       write cursor
    //   ecx       bytes remaining (treated as signed, as in the original)
    //
    // A length that is not a multiple of 4 is rounded up to the next dword
    // by the tail loop, so the buffer must have room for that.
    //
    // NOTE(review): movntq is a non-temporal (cache-bypassing) store. For a
    // 128-512 byte buffer that is touched again soon afterwards, ordinary
    // stores may well be faster - measure before keeping it.
    pxor    mm0, mm0
    mov     edi, TempEndPtr
    mov     ecx, len
    xor     eax, eax

    cmp     ecx, 128
    jl      _4_tail                 // not even one full 128-byte chunk

_128_loop:
    // Invariant: ecx >= 128 here, so a full chunk fits.
    movntq  [edi], mm0
    movntq  [edi+8], mm0
    movntq  [edi+16], mm0
    movntq  [edi+24], mm0           // this store was missing: left an 8-byte hole
    movntq  [edi+32], mm0
    movntq  [edi+40], mm0
    movntq  [edi+48], mm0
    movntq  [edi+56], mm0
    //
    movntq  [edi+64], mm0
    movntq  [edi+72], mm0
    movntq  [edi+80], mm0
    movntq  [edi+88], mm0
    movntq  [edi+96], mm0
    movntq  [edi+104], mm0
    movntq  [edi+112], mm0
    movntq  [edi+120], mm0
    add     edi, 128
    sub     ecx, 128
    cmp     ecx, 128
    jge     _128_loop               // exactly 128 left is still one more chunk

_4_tail:
    test    ecx, ecx
    jle     _the_end                // nothing left: do not write past the end

_4_loop:
    // Invariant: ecx > 0 here.
    mov     [edi], eax
    add     edi, 4
    sub     ecx, 4                  // flags from sub drive the branch
    jg      _4_loop

_the_end:
    sfence                          // make the weakly-ordered movntq stores visible
    emms                            // leave MMX state so x87 code works afterwards
}
To my surprise, it runs SLOWER than filling it manually with a mov [edi],eax / mov [edi+4],eax loop!!
even sadder, it runs SLOWER than:
Code:
// Zero-fill `len` bytes at TempEndPtr with rep stosd.
// ecx = number of dwords = (len + 3) / 4, i.e. len rounded UP to a whole
// dword. The previous `shr ecx,2` / `inc ecx` always added one dword, which
// wrote 4 bytes past the end whenever len was already a multiple of 4.
// Assumes the direction flag is clear (stores ascend) - TODO confirm.
mov     ecx, len
mov     edi, TempEndPtr
add     ecx, 3
xor     eax, eax                    // eax = 0: the dword value to store
shr     ecx, 2
rep     stosd
Any suggestions/comments?