4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
10 ;; Yes, I tried prefetch-ing. It makes no difference or makes
14 ; This program is free software; you can redistribute it and/or
15 ; modify it under the terms of the GNU General Public License
16 ; as published by the Free Software Foundation; either version 2
17 ; of the License, or (at your option) any later version.
19 ; This program is distributed in the hope that it will be useful,
20 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ; GNU General Public License for more details.
24 ; You should have received a copy of the GNU General Public License
25 ; along with this program; if not, write to the Free Software
26 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
34 ; int dist1_00(char *blk1,char *blk2,int lx,int h,int distlim);
35 ; distlim unused - costs more to check than the savings of
36 ; aborting the computation early from time to time...
;
; Accumulates in mm0 the psadbw sums of absolute differences between
; 16-byte rows of p1 (eax) and p2 (ebx); every row is processed as two
; 8-byte halves. The low dword of mm0 is moved to eax as the result.
; NOTE(review): this listing has gaps - the entry label, the frame setup
; after `push ebp`, any per-row update of ebx, the loop branch and the
; epilogue are not visible here. Confirm against the complete source
; before changing anything.
42 ; mm0 = distance accumulator
53 push ebp ; save caller's frame pointer
60 pxor mm0, mm0 ; zero accumulator
62 mov eax, [ebp+8] ; get p1
64 mov ebx, [ebp+12] ; get p2
65 mov edx, [ebp+16] ; get lx (added to eax to step to the next row)
67 mov ecx, [ebp+20] ; get rowsleft
71 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
72 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2 (row 1)
73 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1), before eax moves on
74 add eax, edx ; advance p1 to the next row
75 paddd mm0, mm4 ; accumulate difference
77 psadbw mm5, [ebx+8] ; SAD against next 8 bytes of p2 (row 1)
79 paddd mm0, mm5 ; accumulate difference
82 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
83 psadbw mm6, [ebx] ; SAD against first 8 bytes of p2 (row 2)
84 movq mm4, [eax+8] ; load next 8 bytes of p1 (row 2), before eax moves on
85 add eax, edx ; advance p1 to the next row
86 paddd mm0, mm6 ; accumulate difference
88 psadbw mm4, [ebx+8] ; SAD against next 8 bytes of p2 (row 2)
90 paddd mm0, mm4 ; accumulate difference
; The commented-out lines below are the disabled distlim early-abort check
; described in the header comment.
92 ;psubd mm2, mm3 ; decrease rowsleft
93 ;movq mm5, mm1 ; copy distlim
94 ;pcmpgtd mm5, mm0 ; distlim > dist?
95 ;pand mm2, mm5 ; mask rowsleft with answer
96 ;movd ecx, mm2 ; move rowsleft to ecx
98 ;add eax, edx ; update pointer to next row
101 ;test ecx, ecx ; check rowsleft
105 movd eax, mm0 ; eax = low 32 bits of the accumulator (return value)
118 global dist1_00_Ammxe
119 ;; This is a special version that only does aligned accesses...
120 ;; Wonder if it'll make it faster on a P-III
121 ;; ANSWER: NO its slower hence no longer used.
123 ; int dist1_00(char *blk1,char *blk2,int lx,int h,int distlim);
124 ; distlim unused - costs more to check than the savings of
125 ; aborting the computation early from time to time...
;
; NOTE(review): the prototype comment above names dist1_00 while the
; exported symbol is dist1_00_Ammxe - confirm the intended signature.
; NOTE(review): this listing has gaps. The entry label, the instruction
; that loads ebx before it is masked, the subtraction that would make
; ecx "8-misalignment", the loads/shifts that build mm6, the
; dist1_00_0misalign path, the loop branch and the epilogue are not
; visible here. Confirm against the complete source before editing.
131 ; mm0 = distance accumulator
133 ; mm2 = right shift to adjust for mis-align
134 ; mm3 = left shift to adjust for mis-align
142 push ebp ; save caller's frame pointer
149 pxor mm0, mm0 ; zero accumulator
151 mov eax, [ebp+8] ; get p1
153 and ebx, 7 ; keep the low 3 bits: byte misalignment within 8 bytes
155 jz near dist1_00_0misalign ; zero misalignment: take the separate path
156 sub eax, ebx ; round eax down by the misalignment
157 mov ecx, 8 ; ecx = 8 here; original comment says "8-misalignment"
159 shl ebx, 3 ; convert byte misalignment to a bit count
161 movd mm2, ebx ; mm2 = right-shift count in bits
162 movd mm3, ecx ; mm3 = left-shift count (see header comment)
164 mov ebx, [ebp+12] ; get p2
165 mov edx, [ebp+16] ; get lx
166 mov ecx, [ebp+20] ; get rowsleft
170 movq mm4, [eax] ; load first 8 bytes of aligned p1 (row 1)
171 movq mm5, [eax+8] ; load next 8 bytes of aligned p1 (row 1)
173 psrlq mm4, mm2 ; shift the first quadword right by the misalignment
176 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2
178 movq mm7, [eax+16] ; load last 8 bytes of aligned p1
179 add eax, edx ; advance p1 to the next row
180 psrlq mm6, mm2 ; shift the second quadword right by the misalignment
185 paddd mm0, mm4 ; accumulate difference
187 psadbw mm6, [ebx+8] ; SAD against next 8 bytes of p2 (row 1)
189 paddd mm0, mm6 ; accumulate difference
194 movd eax, mm0 ; eax = low 32 bits of the accumulator (return value)
207 ; int dist1_01(char *blk1,char *blk2,int lx,int h);
;
; SAD of p2 against p1 averaged with its right-hand neighbour: each
; 8-byte group of p1 is combined with the group one byte further on
; using pavgb before psadbw compares it with p2. Two rows are handled
; per loop iteration.
; The loop exits only when ecx reaches exactly zero after `sub ecx, 2`,
; so rowsleft has to be a non-zero multiple of 2.
; NOTE(review): this listing has gaps - the entry label, the prologue,
; the nextrow01 label, any per-row update of ebx, the pavgb for the
; second half of row 2 (mm7) and the register restores before
; `pop ebp` are not visible here. Confirm against the complete source.
214 ; mm0 = distance accumulator
; NOTE(review): mm3 is not referenced by any visible instruction in this
; routine; the row counter is ecx. The next comment may be stale.
217 ; mm3 = 2 (rows per loop)
232 pxor mm0, mm0 ; zero accumulator
234 mov eax, [ebp+8] ; get p1
235 mov ebx, [ebp+12] ; get p2
236 mov edx, [ebp+16] ; get lx
238 mov ecx, [ebp+20] ; get rowsleft
239 jmp nextrow01 ; enter the row loop
242 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
243 pavgb mm4, [eax+1] ; average with the bytes one position to the right
244 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2 (row 1)
245 paddd mm0, mm4 ; accumulate difference
247 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
248 pavgb mm5, [eax+9] ; average with the bytes one position to the right
249 psadbw mm5, [ebx+8] ; SAD against next 8 bytes of p2 (row 1)
250 paddd mm0, mm5 ; accumulate difference
252 add eax, edx ; advance p1 to the next row
255 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
256 pavgb mm6, [eax+1] ; average with the bytes one position to the right
257 psadbw mm6, [ebx] ; SAD against first 8 bytes of p2 (row 2)
258 paddd mm0, mm6 ; accumulate difference
260 movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
262 psadbw mm7, [ebx+8] ; SAD against next 8 bytes of p2 (row 2)
263 paddd mm0, mm7 ; accumulate difference
265 add eax, edx ; advance p1 to the next row
268 sub ecx, 2 ; two rows consumed; sets ZF when none are left
269 jnz nextrow01 ; loop while rows remain
271 movd eax, mm0 ; eax = low 32 bits of the accumulator (return value)
277 pop ebp ; restore caller's frame pointer
279 emms ; leave MMX state so x87 code can run again
280 ret ; return to caller with the result in eax
285 ; int dist1_10(char *blk1,char *blk2,int lx,int h);
;
; SAD of p2 against p1 averaged (pavgb) with the bytes addressed by edi.
; Two rows are handled per loop iteration. The loop exits only when ecx
; reaches exactly zero after `sub ecx, 2`, so rowsleft has to be a
; non-zero multiple of 2.
; NOTE(review): this listing has gaps - the entry label, the rest of the
; prologue, the initialisation and per-row update of edi (presumably the
; row below eax - verify), any per-row update of ebx, the nextrow10
; label, the pavgb for the second half of row 2 (mm7) and the register
; restores are not visible here. Confirm against the complete source.
293 ; mm0 = distance accumulator
295 ; mm3 = 2 (rows per loop)
303 push ebp ; save caller's frame pointer
311 pxor mm0, mm0 ; zero accumulator
313 mov eax, [ebp+8] ; get p1
314 mov ebx, [ebp+12] ; get p2
315 mov edx, [ebp+16] ; get lx
318 mov ecx, [ebp+20] ; get rowsleft
319 jmp nextrow10 ; enter the row loop
322 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
323 pavgb mm4, [edi] ; average with the 8 bytes at edi
324 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2 (row 1)
325 paddd mm0, mm4 ; accumulate difference
327 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
328 pavgb mm5, [edi+8] ; average with the next 8 bytes at edi
329 psadbw mm5, [ebx+8] ; SAD against next 8 bytes of p2 (row 1)
330 paddd mm0, mm5 ; accumulate difference
332 add eax, edx ; advance p1 to the next row
336 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
337 pavgb mm6, [edi] ; average with the 8 bytes at edi
338 psadbw mm6, [ebx] ; SAD against first 8 bytes of p2 (row 2)
339 paddd mm0, mm6 ; accumulate difference
341 movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
343 psadbw mm7, [ebx+8] ; SAD against next 8 bytes of p2 (row 2)
344 paddd mm0, mm7 ; accumulate difference
; NOTE(review): mm2 and mm3 are never initialised or read by any visible
; instruction, and loop control below uses ecx. The psubd looks like a
; leftover from an MMX-based row counter - confirm before removing it.
346 psubd mm2, mm3 ; decrease rowsleft (result not used in the visible code)
348 add eax, edx ; advance p1 to the next row
352 sub ecx, 2 ; two rows consumed; sets ZF when none are left
353 jnz nextrow10 ; loop while rows remain
355 movd eax, mm0 ; eax = low 32 bits of the accumulator (return value)
362 pop ebp ; restore caller's frame pointer
364 emms ; leave MMX state so x87 code can run again
365 ret ; return to caller with the result in eax
370 ; int dist1_11(char *blk1,char *blk2,int lx,int h);
;
; SAD of p2 against p1 averaged (pavgb) with the bytes addressed by edi.
; Two rows are handled per loop iteration. The loop exits only when ecx
; reaches exactly zero after `sub ecx, 2`, so rowsleft has to be a
; non-zero multiple of 2.
; NOTE(review): this listing has gaps - the entry label, the
; initialisation and per-row update of edi, any further interpolation
; steps between each pavgb and psadbw, any per-row update of ebx, the
; nextrow11 label and the restores of edx/ecx/ebx are not visible here.
; Confirm against the complete source before editing.
379 ; mm0 = distance accumulator
; NOTE(review): mm3 is not referenced by any visible instruction in this
; routine; the row counter is ecx. The next comment may be stale.
381 ; mm3 = 2 (rows per loop)
389 push ebp ; save caller's frame pointer
390 mov ebp, esp ; set up the frame so arguments sit at [ebp+8] and up
392 push ebx ; save ebx (overwritten with p2 below)
393 push ecx ; save ecx (overwritten with rowsleft below)
394 push edx ; save edx (overwritten with lx below)
397 pxor mm0, mm0 ; zero accumulator
399 mov eax, [ebp+8] ; get p1
400 mov ebx, [ebp+12] ; get p2
401 mov edx, [ebp+16] ; get lx
404 mov ecx, [ebp+20] ; get rowsleft
405 jmp nextrow11 ; enter the row loop
408 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
409 pavgb mm4, [edi] ; average with the 8 bytes at edi
413 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2 (row 1)
414 paddd mm0, mm4 ; accumulate difference
416 movq mm6, [eax+8] ; load next 8 bytes of p1 (row 1)
417 pavgb mm6, [edi+8] ; average with the next 8 bytes at edi
421 psadbw mm6, [ebx+8] ; SAD against next 8 bytes of p2 (row 1)
422 paddd mm0, mm6 ; accumulate difference
424 add eax, edx ; advance p1 to the next row
428 movq mm4, [eax] ; load first 8 bytes of p1 (row 2)
429 pavgb mm4, [edi] ; average with the 8 bytes at edi
433 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2 (row 2)
434 paddd mm0, mm4 ; accumulate difference
436 movq mm6, [eax+8] ; load next 8 bytes of p1 (row 2)
437 pavgb mm6, [edi+8] ; average with the next 8 bytes at edi
441 psadbw mm6, [ebx+8] ; SAD against next 8 bytes of p2 (row 2)
442 paddd mm0, mm6 ; accumulate difference
444 add eax, edx ; advance p1 to the next row
449 sub ecx, 2 ; two rows consumed; sets ZF when none are left
450 jnz near nextrow11 ; loop while rows remain
452 movd eax, mm0 ; eax = low 32 bits of the accumulator (return value)
459 pop ebp ; restore caller's frame pointer
461 emms ; leave MMX state so x87 code can run again
462 ret ; return to caller with the result in eax
466 ; int dist22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh);
;
; Accumulates in mm0 the psadbw sums of absolute differences between
; 8-byte rows of p1 (eax) and p2 (ebx), two rows per visible pass.
; NOTE(review): this listing has gaps - the entry label, the rest of the
; prologue, the load of the row count, any per-row update of ebx, the
; loop control, the move of the result to eax and the epilogue are not
; visible here. Confirm against the complete source before editing.
473 ; mm0 = distance accumulator
; NOTE(review): mm3 is not referenced by any visible instruction in this
; routine. The next comment may be stale.
475 ; mm3 = 2 (rows per loop)
483 push ebp ; save caller's frame pointer
490 pxor mm0, mm0 ; zero accumulator
492 mov eax, [ebp+8] ; get p1
493 mov ebx, [ebp+12] ; get p2
494 mov edx, [ebp+16] ; get lx (flx in the prototype)
500 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
501 add eax, edx ; advance p1 to the next row
502 psadbw mm4, [ebx] ; SAD against first 8 bytes of p2 (row 1)
504 paddd mm0, mm4 ; accumulate difference
507 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
508 add eax, edx ; advance p1 to the next row
509 psadbw mm6, [ebx] ; SAD against first 8 bytes of p2 (row 2)
511 paddd mm0, mm6 ; accumulate difference
534 ; int dist44_mmxe(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
;
; Per row, the low 4 bytes of p1 are interleaved with the bytes of mm2,
; compared with the similarly interleaved bytes in mm5 using psadbw, and
; the result is added to mm0. The right-block path (mm6/mm1) is
; commented out.
; NOTE(review): this listing has gaps - the entry label, the prologue,
; the initialisation of mm2 (presumably zero, so the interleave spaces
; the bytes out - verify), the transfer of ecx into mm5, any per-row
; update of ebx, the nextrowqd label, the loop control on esi, the move
; of the result to eax and the register restores are not visible here.
; Confirm against the complete source before editing.
542 ; mm0 = distance accumulator left block p1
543 ; mm1 = distance accumulator right block p1
561 pxor mm0, mm0 ; zero accumulator
564 mov eax, [ebp+8] ; get p1
565 mov ebx, [ebp+12] ; get p2
566 mov edx, [ebp+16] ; get qlx
568 mov esi, [ebp+20] ; get rowsleft (kept in esi; ecx is used for p2 data)
569 jmp nextrowqd ; enter the row loop
572 movq mm4, [eax] ; load 8 bytes of p1 (two blocks!)
573 add eax, edx ; advance p1 to the next row
575 mov ecx, [ebx] ; load 4 bytes of p2
576 punpcklbw mm4, mm2 ; interleave low 4 bytes of p1 with mm2 (spaced out)
578 punpcklbw mm5, mm2 ; interleave low 4 bytes of mm5 with mm2 (spaced out)
579 psadbw mm4, mm5 ; SAD for the left block
582 ; punpckhbw mm6, mm2 ; mm6 = bytes 4..7 p1 (spaced out)
584 paddd mm0, mm4 ; accumulate difference left block
586 ; psadbw mm6,mm5 ; compare to right block
589 ; paddd mm1, mm6 ; accumulate difference right block
604 pop ebp ; restore caller's frame pointer
606 emms ; leave MMX state so x87 code can run again
607 ret ; return to caller