SDL_gfx
2.0.25
|
00001 /* 00002 00003 SDL_imageFilter.c: byte-image "filter" routines 00004 00005 Copyright (C) 2001-2012 Andreas Schiffler 00006 Copyright (C) 2013 Sylvain Beucler 00007 00008 This software is provided 'as-is', without any express or implied 00009 warranty. In no event will the authors be held liable for any damages 00010 arising from the use of this software. 00011 00012 Permission is granted to anyone to use this software for any purpose, 00013 including commercial applications, and to alter it and redistribute it 00014 freely, subject to the following restrictions: 00015 00016 1. The origin of this software must not be misrepresented; you must not 00017 claim that you wrote the original software. If you use this software 00018 in a product, an acknowledgment in the product documentation would be 00019 appreciated but is not required. 00020 00021 2. Altered source versions must be plainly marked as such, and must not be 00022 misrepresented as being the original software. 00023 00024 3. This notice may not be removed or altered from any source 00025 distribution. 00026 00027 Andreas Schiffler -- aschiffler at ferzkopp dot net 00028 00029 */ 00030 00031 /* 00032 00033 Note: Uses inline x86 MMX or ASM optimizations if available and enabled. 00034 00035 Note: Most of the MMX code is based on published routines 00036 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 00037 him for his work. 00038 00039 */ 00040 00041 #include <stdio.h> 00042 #include <stdlib.h> 00043 #include <string.h> 00044 00045 /* Use GCC intrinsics if available: they support both i386 and x86_64, 00046 provide ASM-grade performances, and lift the PUSHA/POPA issues. 
*/ 00047 #ifdef __GNUC__ 00048 # ifdef USE_MMX 00049 # include <mmintrin.h> 00050 # endif 00051 #endif 00052 #include <SDL_cpuinfo.h> 00053 #include "SDL_imageFilter.h" 00054 00058 #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24)) 00059 00060 /* ------ Static variables ----- */ 00061 00065 static int SDL_imageFilterUseMMX = 1; 00066 00067 /* Detect GCC */ 00068 #if defined(__GNUC__) 00069 #define GCC__ 00070 #endif 00071 00077 int SDL_imageFilterMMXdetect(void) 00078 { 00079 /* Check override flag */ 00080 if (SDL_imageFilterUseMMX == 0) { 00081 return (0); 00082 } 00083 00084 return SDL_HasMMX(); 00085 } 00086 00090 void SDL_imageFilterMMXoff() 00091 { 00092 SDL_imageFilterUseMMX = 0; 00093 } 00094 00098 void SDL_imageFilterMMXon() 00099 { 00100 SDL_imageFilterUseMMX = 1; 00101 } 00102 00103 /* ------------------------------------------------------------------------------------ */ 00104 00115 static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00116 { 00117 #ifdef USE_MMX 00118 #if !defined(GCC__) 00119 __asm 00120 { 00121 pusha 00122 mov eax, Src1 /* load Src1 address into eax */ 00123 mov ebx, Src2 /* load Src2 address into ebx */ 00124 mov edi, Dest /* load Dest address into edi */ 00125 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00126 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00127 align 16 /* 16 byte alignment of the loop entry */ 00128 L1010: 00129 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00130 paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */ 00131 movq [edi], mm1 /* store result in Dest */ 00132 add eax, 8 /* increase Src1, Src2 and Dest */ 00133 add ebx, 8 /* register pointers by 8 */ 00134 add edi, 8 00135 dec ecx /* decrease loop counter */ 00136 jnz L1010 /* check loop termination, proceed if required */ 00137 emms /* exit MMX state */ 00138 popa 00139 } 00140 #else 00141 
/* i386 and x86_64 */ 00142 __m64 *mSrc1 = (__m64*)Src1; 00143 __m64 *mSrc2 = (__m64*)Src2; 00144 __m64 *mDest = (__m64*)Dest; 00145 int i; 00146 for (i = 0; i < SrcLength/8; i++) { 00147 *mDest = _m_paddusb(*mSrc1, *mSrc2); /* Src1+Src2 (add 8 bytes with saturation) */ 00148 mSrc1++; 00149 mSrc2++; 00150 mDest++; 00151 } 00152 _m_empty(); /* clean MMX state */ 00153 #endif 00154 return (0); 00155 #else 00156 return (-1); 00157 #endif 00158 } 00159 00170 int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00171 { 00172 unsigned int i, istart; 00173 unsigned char *cursrc1, *cursrc2, *curdst; 00174 int result; 00175 00176 /* Validate input parameters */ 00177 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00178 return(-1); 00179 if (length == 0) 00180 return(0); 00181 00182 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00183 00184 /* Use MMX assembly routine */ 00185 SDL_imageFilterAddMMX(Src1, Src2, Dest, length); 00186 00187 /* Check for unaligned bytes */ 00188 if ((length & 7) > 0) { 00189 /* Setup to process unaligned bytes */ 00190 istart = length & 0xfffffff8; 00191 cursrc1 = &Src1[istart]; 00192 cursrc2 = &Src2[istart]; 00193 curdst = &Dest[istart]; 00194 } else { 00195 /* No unaligned bytes - we are done */ 00196 return (0); 00197 } 00198 } else { 00199 /* Setup to process whole image */ 00200 istart = 0; 00201 cursrc1 = Src1; 00202 cursrc2 = Src2; 00203 curdst = Dest; 00204 } 00205 00206 /* C routine to process image */ 00207 for (i = istart; i < length; i++) { 00208 result = (int) *cursrc1 + (int) *cursrc2; 00209 if (result > 255) 00210 result = 255; 00211 *curdst = (unsigned char) result; 00212 /* Advance pointers */ 00213 cursrc1++; 00214 cursrc2++; 00215 curdst++; 00216 } 00217 00218 return (0); 00219 } 00220 00232 static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength, 00233 unsigned char *Mask) 00234 { 00235 #ifdef 
USE_MMX 00236 #if !defined(GCC__) 00237 __asm 00238 { 00239 pusha 00240 mov edx, Mask /* load Mask address into edx */ 00241 movq mm0, [edx] /* load Mask into mm0 */ 00242 mov eax, Src1 /* load Src1 address into eax */ 00243 mov ebx, Src2 /* load Src2 address into ebx */ 00244 mov edi, Dest /* load Dest address into edi */ 00245 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00246 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00247 align 16 /* 16 byte alignment of the loop entry */ 00248 L21011: 00249 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00250 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */ 00251 /* --- Byte shift via Word shift --- */ 00252 psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */ 00253 psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */ 00254 pand mm1, mm0 // apply Mask to 8 BYTES of mm1 */ 00255 /* byte 0x0f, 0xdb, 0xc8 */ 00256 pand mm2, mm0 // apply Mask to 8 BYTES of mm2 */ 00257 /* byte 0x0f, 0xdb, 0xd0 */ 00258 paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */ 00259 movq [edi], mm1 /* store result in Dest */ 00260 add eax, 8 /* increase Src1, Src2 and Dest */ 00261 add ebx, 8 /* register pointers by 8 */ 00262 add edi, 8 00263 dec ecx /* decrease loop counter */ 00264 jnz L21011 /* check loop termination, proceed if required */ 00265 emms /* exit MMX state */ 00266 popa 00267 } 00268 #else 00269 /* i386 and x86_64 */ 00270 __m64 *mSrc1 = (__m64*)Src1; 00271 __m64 *mSrc2 = (__m64*)Src2; 00272 __m64 *mDest = (__m64*)Dest; 00273 __m64 *mMask = (__m64*)Mask; 00274 int i; 00275 for (i = 0; i < SrcLength/8; i++) { 00276 __m64 mm1 = *mSrc1, 00277 mm2 = *mSrc2; 00278 mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of mm1 1 bit to the right */ 00279 mm2 = _m_psrlwi(mm2, 1); /* shift 4 WORDS of mm2 1 bit to the right */ 00280 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of mm1 */ 00281 mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of mm2 */ 00282 *mDest = _m_paddusb(mm1, 
mm2); /* mm1+mm2 (add 8 bytes with saturation) */ 00283 mSrc1++; 00284 mSrc2++; 00285 mDest++; 00286 } 00287 _m_empty(); /* clean MMX state */ 00288 #endif 00289 return (0); 00290 #else 00291 return (-1); 00292 #endif 00293 } 00294 00305 int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00306 { 00307 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }; 00308 unsigned int i, istart; 00309 unsigned char *cursrc1, *cursrc2, *curdst; 00310 int result; 00311 00312 /* Validate input parameters */ 00313 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00314 return(-1); 00315 if (length == 0) 00316 return(0); 00317 00318 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00319 /* MMX routine */ 00320 SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask); 00321 00322 /* Check for unaligned bytes */ 00323 if ((length & 7) > 0) { 00324 /* Setup to process unaligned bytes */ 00325 istart = length & 0xfffffff8; 00326 cursrc1 = &Src1[istart]; 00327 cursrc2 = &Src2[istart]; 00328 curdst = &Dest[istart]; 00329 } else { 00330 /* No unaligned bytes - we are done */ 00331 return (0); 00332 } 00333 } else { 00334 /* Setup to process whole image */ 00335 istart = 0; 00336 cursrc1 = Src1; 00337 cursrc2 = Src2; 00338 curdst = Dest; 00339 } 00340 00341 /* C routine to process image */ 00342 for (i = istart; i < length; i++) { 00343 result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2; 00344 *curdst = (unsigned char) result; 00345 /* Advance pointers */ 00346 cursrc1++; 00347 cursrc2++; 00348 curdst++; 00349 } 00350 00351 return (0); 00352 } 00353 00364 static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00365 { 00366 #ifdef USE_MMX 00367 #if !defined(GCC__) 00368 __asm 00369 { 00370 pusha 00371 mov eax, Src1 /* load Src1 address into eax */ 00372 mov ebx, Src2 /* load Src2 address into ebx */ 00373 mov edi, Dest /* load Dest 
address into edi */ 00374 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00375 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00376 align 16 /* 16 byte alignment of the loop entry */ 00377 L1012: 00378 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00379 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00380 movq [edi], mm1 /* store result in Dest */ 00381 add eax, 8 /* increase Src1, Src2 and Dest */ 00382 add ebx, 8 /* register pointers by 8 */ 00383 add edi, 8 00384 dec ecx /* decrease loop counter */ 00385 jnz L1012 /* check loop termination, proceed if required */ 00386 emms /* exit MMX state */ 00387 popa 00388 } 00389 #else 00390 /* i386 and x86_64 */ 00391 __m64 *mSrc1 = (__m64*)Src1; 00392 __m64 *mSrc2 = (__m64*)Src2; 00393 __m64 *mDest = (__m64*)Dest; 00394 int i; 00395 for (i = 0; i < SrcLength/8; i++) { 00396 *mDest = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */ 00397 mSrc1++; 00398 mSrc2++; 00399 mDest++; 00400 } 00401 _m_empty(); /* clean MMX state */ 00402 #endif 00403 return (0); 00404 #else 00405 return (-1); 00406 #endif 00407 } 00408 00419 int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00420 { 00421 unsigned int i, istart; 00422 unsigned char *cursrc1, *cursrc2, *curdst; 00423 int result; 00424 00425 /* Validate input parameters */ 00426 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00427 return(-1); 00428 if (length == 0) 00429 return(0); 00430 00431 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00432 /* MMX routine */ 00433 SDL_imageFilterSubMMX(Src1, Src2, Dest, length); 00434 00435 /* Check for unaligned bytes */ 00436 if ((length & 7) > 0) { 00437 /* Setup to process unaligned bytes */ 00438 istart = length & 0xfffffff8; 00439 cursrc1 = &Src1[istart]; 00440 cursrc2 = &Src2[istart]; 00441 curdst = &Dest[istart]; 00442 } else { 00443 /* No unaligned bytes - we are done */ 00444 return (0); 
00445 } 00446 } else { 00447 /* Setup to process whole image */ 00448 istart = 0; 00449 cursrc1 = Src1; 00450 cursrc2 = Src2; 00451 curdst = Dest; 00452 } 00453 00454 /* C routine to process image */ 00455 for (i = istart; i < length; i++) { 00456 result = (int) *cursrc1 - (int) *cursrc2; 00457 if (result < 0) 00458 result = 0; 00459 *curdst = (unsigned char) result; 00460 /* Advance pointers */ 00461 cursrc1++; 00462 cursrc2++; 00463 curdst++; 00464 } 00465 00466 return (0); 00467 } 00468 00479 static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00480 { 00481 #ifdef USE_MMX 00482 #if !defined(GCC__) 00483 __asm 00484 { 00485 pusha 00486 mov eax, Src1 /* load Src1 address into eax */ 00487 mov ebx, Src2 /* load Src2 address into ebx */ 00488 mov edi, Dest /* load Dest address into edi */ 00489 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00490 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00491 align 16 /* 16 byte alignment of the loop entry */ 00492 L1013: 00493 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00494 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */ 00495 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00496 psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */ 00497 por mm1, mm2 /* combine both mm2 and mm1 results */ 00498 movq [edi], mm1 /* store result in Dest */ 00499 add eax, 8 /* increase Src1, Src2 and Dest */ 00500 add ebx, 8 /* register pointers by 8 */ 00501 add edi, 8 00502 dec ecx /* decrease loop counter */ 00503 jnz L1013 /* check loop termination, proceed if required */ 00504 emms /* exit MMX state */ 00505 popa 00506 } 00507 #else 00508 /* i386 and x86_64 */ 00509 __m64 *mSrc1 = (__m64*)Src1; 00510 __m64 *mSrc2 = (__m64*)Src2; 00511 __m64 *mDest = (__m64*)Dest; 00512 int i; 00513 for (i = 0; i < SrcLength/8; i++) { 00514 __m64 mm1 = _m_psubusb(*mSrc2, *mSrc1); /* Src1-Src2 (sub 8 bytes with 
saturation) */ 00515 __m64 mm2 = _m_psubusb(*mSrc1, *mSrc2); /* Src2-Src1 (sub 8 bytes with saturation) */ 00516 *mDest = _m_por(mm1, mm2); /* combine both mm2 and mm1 results */ 00517 mSrc1++; 00518 mSrc2++; 00519 mDest++; 00520 } 00521 _m_empty(); /* clean MMX state */ 00522 #endif 00523 return (0); 00524 #else 00525 return (-1); 00526 #endif 00527 } 00528 00539 int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00540 { 00541 unsigned int i, istart; 00542 unsigned char *cursrc1, *cursrc2, *curdst; 00543 int result; 00544 00545 /* Validate input parameters */ 00546 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00547 return(-1); 00548 if (length == 0) 00549 return(0); 00550 00551 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00552 /* MMX routine */ 00553 SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length); 00554 00555 /* Check for unaligned bytes */ 00556 if ((length & 7) > 0) { 00557 /* Setup to process unaligned bytes */ 00558 istart = length & 0xfffffff8; 00559 cursrc1 = &Src1[istart]; 00560 cursrc2 = &Src2[istart]; 00561 curdst = &Dest[istart]; 00562 } else { 00563 /* No unaligned bytes - we are done */ 00564 return (0); 00565 } 00566 } else { 00567 /* Setup to process whole image */ 00568 istart = 0; 00569 cursrc1 = Src1; 00570 cursrc2 = Src2; 00571 curdst = Dest; 00572 } 00573 00574 /* C routine to process image */ 00575 for (i = istart; i < length; i++) { 00576 result = abs((int) *cursrc1 - (int) *cursrc2); 00577 *curdst = (unsigned char) result; 00578 /* Advance pointers */ 00579 cursrc1++; 00580 cursrc2++; 00581 curdst++; 00582 } 00583 00584 return (0); 00585 } 00586 00597 static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00598 { 00599 #ifdef USE_MMX 00600 #if !defined(GCC__) 00601 __asm 00602 { 00603 pusha 00604 mov eax, Src1 /* load Src1 address into eax */ 00605 mov ebx, Src2 /* load Src2 address into ebx */ 
00606 mov edi, Dest /* load Dest address into edi */ 00607 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00608 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00609 pxor mm0, mm0 /* zero mm0 register */ 00610 align 16 /* 16 byte alignment of the loop entry */ 00611 L1014: 00612 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00613 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 00614 movq mm2, mm1 /* copy mm1 into mm2 */ 00615 movq mm4, mm3 /* copy mm3 into mm4 */ 00616 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 00617 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 00618 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 00619 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 00620 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 00621 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 00622 /* Take abs value of the results (signed words) */ 00623 movq mm5, mm1 /* copy mm1 into mm5 */ 00624 movq mm6, mm2 /* copy mm2 into mm6 */ 00625 psraw mm5, 15 /* fill mm5 words with word sign bit */ 00626 psraw mm6, 15 /* fill mm6 words with word sign bit */ 00627 pxor mm1, mm5 /* take 1's compliment of only neg. words */ 00628 pxor mm2, mm6 /* take 1's compliment of only neg. words */ 00629 psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */ 00630 psubsw mm2, mm6 /* add 1 to only neg. 
words, W-(-1) or W-0 */ 00631 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 00632 movq [edi], mm1 /* store result in Dest */ 00633 add eax, 8 /* increase Src1, Src2 and Dest */ 00634 add ebx, 8 /* register pointers by 8 */ 00635 add edi, 8 00636 dec ecx /* decrease loop counter */ 00637 jnz L1014 /* check loop termination, proceed if required */ 00638 emms /* exit MMX state */ 00639 popa 00640 } 00641 #else 00642 /* i386 ASM with constraints: */ 00643 /* asm volatile ( */ 00644 /* "shr $3, %%ecx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */ 00645 /* "pxor %%mm0, %%mm0 \n\t" /\* zero mm0 register *\/ */ 00646 /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */ 00647 /* "1: movq (%%eax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */ 00648 /* "movq (%%ebx), %%mm3 \n\t" /\* load 8 bytes from Src2 into mm3 *\/ */ 00649 /* "movq %%mm1, %%mm2 \n\t" /\* copy mm1 into mm2 *\/ */ 00650 /* "movq %%mm3, %%mm4 \n\t" /\* copy mm3 into mm4 *\/ */ 00651 /* "punpcklbw %%mm0, %%mm1 \n\t" /\* unpack low bytes of Src1 into words *\/ */ 00652 /* "punpckhbw %%mm0, %%mm2 \n\t" /\* unpack high bytes of Src1 into words *\/ */ 00653 /* "punpcklbw %%mm0, %%mm3 \n\t" /\* unpack low bytes of Src2 into words *\/ */ 00654 /* "punpckhbw %%mm0, %%mm4 \n\t" /\* unpack high bytes of Src2 into words *\/ */ 00655 /* "pmullw %%mm3, %%mm1 \n\t" /\* mul low bytes of Src1 and Src2 *\/ */ 00656 /* "pmullw %%mm4, %%mm2 \n\t" /\* mul high bytes of Src1 and Src2 *\/ */ 00657 /* /\* Take abs value of the results (signed words) *\/ */ 00658 /* "movq %%mm1, %%mm5 \n\t" /\* copy mm1 into mm5 *\/ */ 00659 /* "movq %%mm2, %%mm6 \n\t" /\* copy mm2 into mm6 *\/ */ 00660 /* "psraw $15, %%mm5 \n\t" /\* fill mm5 words with word sign bit *\/ */ 00661 /* "psraw $15, %%mm6 \n\t" /\* fill mm6 words with word sign bit *\/ */ 00662 /* "pxor %%mm5, %%mm1 \n\t" /\* take 1's compliment of only neg. 
words *\/ */ 00663 /* "pxor %%mm6, %%mm2 \n\t" /\* take 1's compliment of only neg. words *\/ */ 00664 /* "psubsw %%mm5, %%mm1 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */ 00665 /* "psubsw %%mm6, %%mm2 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */ 00666 /* "packuswb %%mm2, %%mm1 \n\t" /\* pack words back into bytes with saturation *\/ */ 00667 /* "movq %%mm1, (%%edi) \n\t" /\* store result in Dest *\/ */ 00668 /* "add $8, %%eax \n\t" /\* increase Src1, Src2 and Dest *\/ */ 00669 /* "add $8, %%ebx \n\t" /\* register pointers by 8 *\/ */ 00670 /* "add $8, %%edi \n\t" */ 00671 /* "dec %%ecx \n\t" /\* decrease loop counter *\/ */ 00672 /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */ 00673 /* "emms \n\t" /\* exit MMX state *\/ */ 00674 /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */ 00675 /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */ 00676 /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */ 00677 /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */ 00678 /* : */ 00679 /* : "memory", /\* *Dest is modified *\/ */ 00680 /* "mm0","mm1","mm2","mm3","mm4","mm5","mm6" /\* registers modified *\/ */ 00681 /* ); */ 00682 00683 /* i386 and x86_64 */ 00684 __m64 *mSrc1 = (__m64*)Src1; 00685 __m64 *mSrc2 = (__m64*)Src2; 00686 __m64 *mDest = (__m64*)Dest; 00687 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 00688 int i; 00689 for (i = 0; i < SrcLength/8; i++) { 00690 __m64 mm1, mm2, mm3, mm4, mm5, mm6; 00691 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 00692 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 00693 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */ 00694 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */ 00695 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */ 00696 mm2 = _m_pmullw(mm2, 
mm4); /* mul high bytes of Src1 and Src2 */ 00697 mm5 = _m_psrawi(mm1, 15); /* fill mm5 words with word sign bit */ 00698 mm6 = _m_psrawi(mm2, 15); /* fill mm6 words with word sign bit */ 00699 mm1 = _m_pxor(mm1, mm5); /* take 1's compliment of only neg. words */ 00700 mm2 = _m_pxor(mm2, mm6); /* take 1's compliment of only neg. words */ 00701 mm1 = _m_psubsw(mm1, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */ 00702 mm2 = _m_psubsw(mm2, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */ 00703 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */ 00704 mSrc1++; 00705 mSrc2++; 00706 mDest++; 00707 } 00708 _m_empty(); /* clean MMX state */ 00709 #endif 00710 return (0); 00711 #else 00712 return (-1); 00713 #endif 00714 } 00715 00726 int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00727 { 00728 unsigned int i, istart; 00729 unsigned char *cursrc1, *cursrc2, *curdst; 00730 int result; 00731 00732 /* Validate input parameters */ 00733 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00734 return(-1); 00735 if (length == 0) 00736 return(0); 00737 00738 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00739 /* MMX routine */ 00740 SDL_imageFilterMultMMX(Src1, Src2, Dest, length); 00741 00742 /* Check for unaligned bytes */ 00743 if ((length & 7) > 0) { 00744 /* Setup to process unaligned bytes */ 00745 istart = length & 0xfffffff8; 00746 cursrc1 = &Src1[istart]; 00747 cursrc2 = &Src2[istart]; 00748 curdst = &Dest[istart]; 00749 } else { 00750 /* No unaligned bytes - we are done */ 00751 return (0); 00752 } 00753 } else { 00754 /* Setup to process whole image */ 00755 istart = 0; 00756 cursrc1 = Src1; 00757 cursrc2 = Src2; 00758 curdst = Dest; 00759 } 00760 00761 /* C routine to process image */ 00762 for (i = istart; i < length; i++) { 00763 00764 /* NOTE: this is probably wrong - dunno what the MMX code does */ 00765 00766 result = (int) *cursrc1 * (int) *cursrc2; 00767 
if (result > 255) 00768 result = 255; 00769 *curdst = (unsigned char) result; 00770 /* Advance pointers */ 00771 cursrc1++; 00772 cursrc2++; 00773 curdst++; 00774 } 00775 00776 return (0); 00777 } 00778 00789 int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00790 { 00791 #ifdef USE_MMX 00792 #if !defined(GCC__) 00793 __asm 00794 { 00795 pusha 00796 mov edx, Src1 /* load Src1 address into edx */ 00797 mov esi, Src2 /* load Src2 address into esi */ 00798 mov edi, Dest /* load Dest address into edi */ 00799 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00800 align 16 /* 16 byte alignment of the loop entry */ 00801 L10141: 00802 mov al, [edx] /* load a byte from Src1 */ 00803 mul [esi] /* mul with a byte from Src2 */ 00804 mov [edi], al /* move a byte result to Dest */ 00805 inc edx /* increment Src1, Src2, Dest */ 00806 inc esi /* pointer registers by one */ 00807 inc edi 00808 dec ecx /* decrease loop counter */ 00809 jnz L10141 /* check loop termination, proceed if required */ 00810 popa 00811 } 00812 #else 00813 /* Note: ~5% gain on i386, less efficient than C on x86_64 */ 00814 /* Also depends on whether this function is static (?!) 
*/ 00815 asm volatile ( 00816 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00817 # if defined(i386) 00818 "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */ 00819 "mulb (%%esi) \n\t" /* mul with a byte from Src2 */ 00820 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ 00821 "inc %%edx \n\t" /* increment Src1, Src2, Dest */ 00822 "inc %%esi \n\t" /* pointer registers by one */ 00823 "inc %%edi \n\t" 00824 "dec %%ecx \n\t" /* decrease loop counter */ 00825 # elif defined(__x86_64__) 00826 "1:mov (%%rdx), %%al \n\t" /* load a byte from Src1 */ 00827 "mulb (%%rsi) \n\t" /* mul with a byte from Src2 */ 00828 "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */ 00829 "inc %%rdx \n\t" /* increment Src1, Src2, Dest */ 00830 "inc %%rsi \n\t" /* pointer registers by one */ 00831 "inc %%rdi \n\t" 00832 "dec %%rcx \n\t" /* decrease loop counter */ 00833 # endif 00834 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00835 : "+d" (Src1), /* load Src1 address into edx */ 00836 "+S" (Src2), /* load Src2 address into esi */ 00837 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */ 00838 "+D" (Dest) /* load Dest address into edi */ 00839 : 00840 : "memory", "rax" 00841 ); 00842 #endif 00843 return (0); 00844 #else 00845 return (-1); 00846 #endif 00847 } 00848 00859 int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00860 { 00861 unsigned int i, istart; 00862 unsigned char *cursrc1, *cursrc2, *curdst; 00863 00864 /* Validate input parameters */ 00865 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00866 return(-1); 00867 if (length == 0) 00868 return(0); 00869 00870 if (SDL_imageFilterMMXdetect()) { 00871 if (length > 0) { 00872 /* ASM routine */ 00873 SDL_imageFilterMultNorASM(Src1, Src2, Dest, length); 00874 00875 /* Check for unaligned bytes */ 00876 if ((length & 7) > 0) { 00877 /* Setup to process unaligned bytes */ 00878 istart = length & 0xfffffff8; 00879 
cursrc1 = &Src1[istart]; 00880 cursrc2 = &Src2[istart]; 00881 curdst = &Dest[istart]; 00882 } else { 00883 /* No unaligned bytes - we are done */ 00884 return (0); 00885 } 00886 } else { 00887 /* No bytes - we are done */ 00888 return (0); 00889 } 00890 } else { 00891 /* Setup to process whole image */ 00892 istart = 0; 00893 cursrc1 = Src1; 00894 cursrc2 = Src2; 00895 curdst = Dest; 00896 } 00897 00898 /* C routine to process image */ 00899 for (i = istart; i < length; i++) { 00900 *curdst = (int)*cursrc1 * (int)*cursrc2; // (int) for efficiency 00901 /* Advance pointers */ 00902 cursrc1++; 00903 cursrc2++; 00904 curdst++; 00905 } 00906 00907 return (0); 00908 } 00909 00920 static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00921 { 00922 #ifdef USE_MMX 00923 #if !defined(GCC__) 00924 __asm 00925 { 00926 pusha 00927 mov eax, Src1 /* load Src1 address into eax */ 00928 mov ebx, Src2 /* load Src2 address into ebx */ 00929 mov edi, Dest /* load Dest address into edi */ 00930 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00931 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00932 pxor mm0, mm0 /* zero mm0 register */ 00933 align 16 /* 16 byte alignment of the loop entry */ 00934 L1015: 00935 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00936 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 00937 movq mm2, mm1 /* copy mm1 into mm2 */ 00938 movq mm4, mm3 /* copy mm3 into mm4 */ 00939 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 00940 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 00941 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 00942 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 00943 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */ 00944 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */ 00945 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 00946 pmullw mm2, mm4 /* mul high 
bytes of Src1 and Src2 */ 00947 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 00948 movq [edi], mm1 /* store result in Dest */ 00949 add eax, 8 /* increase Src1, Src2 and Dest */ 00950 add ebx, 8 /* register pointers by 8 */ 00951 add edi, 8 00952 dec ecx /* decrease loop counter */ 00953 jnz L1015 /* check loop termination, proceed if required */ 00954 emms /* exit MMX state */ 00955 popa 00956 } 00957 #else 00958 /* i386 and x86_64 */ 00959 __m64 *mSrc1 = (__m64*)Src1; 00960 __m64 *mSrc2 = (__m64*)Src2; 00961 __m64 *mDest = (__m64*)Dest; 00962 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 00963 int i; 00964 for (i = 0; i < SrcLength/8; i++) { 00965 __m64 mm1, mm2, mm3, mm4, mm5, mm6; 00966 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 00967 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 00968 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */ 00969 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */ 00970 mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */ 00971 mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */ 00972 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */ 00973 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */ 00974 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */ 00975 mSrc1++; 00976 mSrc2++; 00977 mDest++; 00978 } 00979 _m_empty(); /* clean MMX state */ 00980 #endif 00981 return (0); 00982 #else 00983 return (-1); 00984 #endif 00985 } 00986 00997 int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00998 { 00999 unsigned int i, istart; 01000 unsigned char *cursrc1, *cursrc2, *curdst; 01001 int result; 01002 01003 /* Validate input parameters */ 01004 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01005 return(-1); 01006 if (length == 0) 01007 return(0); 
01008 01009 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01010 /* MMX routine */ 01011 SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length); 01012 01013 /* Check for unaligned bytes */ 01014 if ((length & 7) > 0) { 01015 /* Setup to process unaligned bytes */ 01016 istart = length & 0xfffffff8; 01017 cursrc1 = &Src1[istart]; 01018 cursrc2 = &Src2[istart]; 01019 curdst = &Dest[istart]; 01020 } else { 01021 /* No unaligned bytes - we are done */ 01022 return (0); 01023 } 01024 } else { 01025 /* Setup to process whole image */ 01026 istart = 0; 01027 cursrc1 = Src1; 01028 cursrc2 = Src2; 01029 curdst = Dest; 01030 } 01031 01032 /* C routine to process image */ 01033 for (i = istart; i < length; i++) { 01034 result = ((int) *cursrc1 / 2) * (int) *cursrc2; 01035 if (result > 255) 01036 result = 255; 01037 *curdst = (unsigned char) result; 01038 /* Advance pointers */ 01039 cursrc1++; 01040 cursrc2++; 01041 curdst++; 01042 } 01043 01044 return (0); 01045 } 01046 01057 static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01058 { 01059 #ifdef USE_MMX 01060 #if !defined(GCC__) 01061 __asm 01062 { 01063 pusha 01064 mov eax, Src1 /* load Src1 address into eax */ 01065 mov ebx, Src2 /* load Src2 address into ebx */ 01066 mov edi, Dest /* load Dest address into edi */ 01067 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01068 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01069 pxor mm0, mm0 /* zero mm0 register */ 01070 align 16 /* 16 byte alignment of the loop entry */ 01071 L1016: 01072 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01073 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 01074 movq mm2, mm1 /* copy mm1 into mm2 */ 01075 movq mm4, mm3 /* copy mm3 into mm4 */ 01076 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 01077 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 01078 punpcklbw mm3, mm0 /* unpack low bytes of Src2 
into words */ 01079 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 01080 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */ 01081 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */ 01082 psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */ 01083 psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */ 01084 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 01085 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 01086 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 01087 movq [edi], mm1 /* store result in Dest */ 01088 add eax, 8 /* increase Src1, Src2 and Dest */ 01089 add ebx, 8 /* register pointers by 8 */ 01090 add edi, 8 01091 dec ecx /* decrease loop counter */ 01092 jnz L1016 /* check loop termination, proceed if required */ 01093 emms /* exit MMX state */ 01094 popa 01095 } 01096 #else 01097 /* i386 and x86_64 */ 01098 __m64 *mSrc1 = (__m64*)Src1; 01099 __m64 *mSrc2 = (__m64*)Src2; 01100 __m64 *mDest = (__m64*)Dest; 01101 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 01102 int i; 01103 for (i = 0; i < SrcLength/8; i++) { 01104 __m64 mm1, mm2, mm3, mm4, mm5, mm6; 01105 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 01106 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 01107 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */ 01108 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */ 01109 mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */ 01110 mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */ 01111 mm3 = _m_psrlwi(mm3, 1); /* divide mm3 words by 2, Src2 low bytes */ 01112 mm4 = _m_psrlwi(mm4, 1); /* divide mm4 words by 2, Src2 high bytes */ 01113 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */ 01114 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */ 01115 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with 
saturation */ 01116 mSrc1++; 01117 mSrc2++; 01118 mDest++; 01119 } 01120 _m_empty(); /* clean MMX state */ 01121 #endif 01122 return (0); 01123 #else 01124 return (-1); 01125 #endif 01126 } 01127 01138 int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01139 { 01140 unsigned int i, istart; 01141 unsigned char *cursrc1, *cursrc2, *curdst; 01142 int result; 01143 01144 /* Validate input parameters */ 01145 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01146 return(-1); 01147 if (length == 0) 01148 return(0); 01149 01150 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01151 /* MMX routine */ 01152 SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length); 01153 01154 /* Check for unaligned bytes */ 01155 if ((length & 7) > 0) { 01156 /* Setup to process unaligned bytes */ 01157 istart = length & 0xfffffff8; 01158 cursrc1 = &Src1[istart]; 01159 cursrc2 = &Src2[istart]; 01160 curdst = &Dest[istart]; 01161 } else { 01162 /* No unaligned bytes - we are done */ 01163 return (0); 01164 } 01165 } else { 01166 /* Setup to process whole image */ 01167 istart = 0; 01168 cursrc1 = Src1; 01169 cursrc2 = Src2; 01170 curdst = Dest; 01171 } 01172 01173 /* C routine to process image */ 01174 for (i = istart; i < length; i++) { 01175 result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2); 01176 if (result > 255) 01177 result = 255; 01178 *curdst = (unsigned char) result; 01179 /* Advance pointers */ 01180 cursrc1++; 01181 cursrc2++; 01182 curdst++; 01183 } 01184 01185 return (0); 01186 } 01187 01198 static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01199 { 01200 #ifdef USE_MMX 01201 #if !defined(GCC__) 01202 __asm 01203 { 01204 pusha 01205 mov eax, Src1 /* load Src1 address into eax */ 01206 mov ebx, Src2 /* load Src2 address into ebx */ 01207 mov edi, Dest /* load Dest address into edi */ 01208 mov ecx, SrcLength /* load loop counter (SIZE) 
into ecx */ 01209 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01210 align 16 /* 16 byte alignment of the loop entry */ 01211 L1017: 01212 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01213 pand mm1, [ebx] /* mm1=Src1&Src2 */ 01214 movq [edi], mm1 /* store result in Dest */ 01215 add eax, 8 /* increase Src1, Src2 and Dest */ 01216 add ebx, 8 /* register pointers by 8 */ 01217 add edi, 8 01218 dec ecx /* decrease loop counter */ 01219 jnz L1017 /* check loop termination, proceed if required */ 01220 emms /* exit MMX state */ 01221 popa 01222 } 01223 #else 01224 /* x86_64 ASM with constraints: */ 01225 /* asm volatile ( */ 01226 /* "shr $3, %%rcx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */ 01227 /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */ 01228 /* "1: movq (%%rax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */ 01229 /* "pand (%%rbx), %%mm1 \n\t" /\* mm1=Src1&Src2 *\/ */ 01230 /* "movq %%mm1, (%%rdi) \n\t" /\* store result in Dest *\/ */ 01231 /* "add $8, %%rax \n\t" /\* increase Src1, Src2 and Dest *\/ */ 01232 /* "add $8, %%rbx \n\t" /\* register pointers by 8 *\/ */ 01233 /* "add $8, %%rdi \n\t" */ 01234 /* "dec %%rcx \n\t" /\* decrease loop counter *\/ */ 01235 /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */ 01236 /* "emms \n\t" /\* exit MMX state *\/ */ 01237 /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */ 01238 /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */ 01239 /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */ 01240 /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */ 01241 /* : */ 01242 /* : "memory", /\* *Dest is modified *\/ */ 01243 /* "mm1" /\* register mm1 modified *\/ */ 01244 /* ); */ 01245 01246 /* i386 and x86_64 */ 01247 __m64 *mSrc1 = (__m64*)Src1; 01248 __m64 *mSrc2 = (__m64*)Src2; 01249 __m64 *mDest = (__m64*)Dest; 01250 int i; 01251 for (i 
= 0; i < SrcLength/8; i++) { 01252 *mDest = _m_pand(*mSrc1, *mSrc2); /* Src1&Src2 */ 01253 mSrc1++; 01254 mSrc2++; 01255 mDest++; 01256 } 01257 _m_empty(); /* clean MMX state */ 01258 #endif 01259 return (0); 01260 #else 01261 return (-1); 01262 #endif 01263 } 01264 01275 int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01276 { 01277 unsigned int i, istart; 01278 unsigned char *cursrc1, *cursrc2, *curdst; 01279 01280 /* Validate input parameters */ 01281 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01282 return(-1); 01283 if (length == 0) 01284 return(0); 01285 01286 if ((SDL_imageFilterMMXdetect()>0) && (length>7)) { 01287 /* if (length > 7) { */ 01288 /* Call MMX routine */ 01289 01290 SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length); 01291 01292 /* Check for unaligned bytes */ 01293 if ((length & 7) > 0) { 01294 01295 /* Setup to process unaligned bytes */ 01296 istart = length & 0xfffffff8; 01297 cursrc1 = &Src1[istart]; 01298 cursrc2 = &Src2[istart]; 01299 curdst = &Dest[istart]; 01300 } else { 01301 /* No unaligned bytes - we are done */ 01302 return (0); 01303 } 01304 } else { 01305 /* Setup to process whole image */ 01306 istart = 0; 01307 cursrc1 = Src1; 01308 cursrc2 = Src2; 01309 curdst = Dest; 01310 } 01311 01312 /* C routine to process image */ 01313 for (i = istart; i < length; i++) { 01314 *curdst = (*cursrc1) & (*cursrc2); 01315 /* Advance pointers */ 01316 cursrc1++; 01317 cursrc2++; 01318 curdst++; 01319 } 01320 01321 return (0); 01322 } 01323 01334 static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01335 { 01336 #ifdef USE_MMX 01337 #if !defined(GCC__) 01338 __asm 01339 { 01340 pusha 01341 mov eax, Src1 /* load Src1 address into eax */ 01342 mov ebx, Src2 /* load Src2 address into ebx */ 01343 mov edi, Dest /* load Dest address into edi */ 01344 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 
01345 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01346 align 16 /* 16 byte alignment of the loop entry */ 01347 L91017: 01348 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01349 por mm1, [ebx] /* mm1=Src1|Src2 */ 01350 movq [edi], mm1 /* store result in Dest */ 01351 add eax, 8 /* increase Src1, Src2 and Dest */ 01352 add ebx, 8 /* register pointers by 8 */ 01353 add edi, 8 01354 dec ecx /* decrease loop counter */ 01355 jnz L91017 /* check loop termination, proceed if required */ 01356 emms /* exit MMX state */ 01357 popa 01358 } 01359 #else 01360 /* i386 and x86_64 */ 01361 __m64 *mSrc1 = (__m64*)Src1; 01362 __m64 *mSrc2 = (__m64*)Src2; 01363 __m64 *mDest = (__m64*)Dest; 01364 int i; 01365 for (i = 0; i < SrcLength/8; i++) { 01366 *mDest = _m_por(*mSrc1, *mSrc2); /* Src1|Src2 */ 01367 mSrc1++; 01368 mSrc2++; 01369 mDest++; 01370 } 01371 _m_empty(); /* clean MMX state */ 01372 #endif 01373 return (0); 01374 #else 01375 return (-1); 01376 #endif 01377 } 01378 01389 int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01390 { 01391 unsigned int i, istart; 01392 unsigned char *cursrc1, *cursrc2, *curdst; 01393 01394 /* Validate input parameters */ 01395 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01396 return(-1); 01397 if (length == 0) 01398 return(0); 01399 01400 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01401 01402 /* MMX routine */ 01403 SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length); 01404 01405 /* Check for unaligned bytes */ 01406 if ((length & 7) > 0) { 01407 /* Setup to process unaligned bytes */ 01408 istart = length & 0xfffffff8; 01409 cursrc1 = &Src1[istart]; 01410 cursrc2 = &Src2[istart]; 01411 curdst = &Dest[istart]; 01412 } else { 01413 /* No unaligned bytes - we are done */ 01414 return (0); 01415 } 01416 } else { 01417 /* Setup to process whole image */ 01418 istart = 0; 01419 cursrc1 = Src1; 01420 cursrc2 = Src2; 01421 curdst = Dest; 01422 } 01423 
01424 /* C routine to process image */ 01425 for (i = istart; i < length; i++) { 01426 *curdst = *cursrc1 | *cursrc2; 01427 /* Advance pointers */ 01428 cursrc1++; 01429 cursrc2++; 01430 curdst++; 01431 } 01432 return (0); 01433 } 01434 01445 static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01446 { 01447 #ifdef USE_MMX 01448 #if !defined(GCC__) 01449 __asm 01450 { 01451 pusha 01452 mov edx, Src1 /* load Src1 address into edx */ 01453 mov esi, Src2 /* load Src2 address into esi */ 01454 mov edi, Dest /* load Dest address into edi */ 01455 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01456 align 16 /* 16 byte alignment of the loop entry */ 01457 L10191: 01458 mov bl, [esi] /* load a byte from Src2 */ 01459 cmp bl, 0 /* check if it zero */ 01460 jnz L10192 01461 mov [edi], 255 /* division by zero = 255 !!! */ 01462 jmp L10193 01463 L10192: 01464 xor ah, ah /* prepare AX, zero AH register */ 01465 mov al, [edx] /* load a byte from Src1 into AL */ 01466 div bl /* divide AL by BL */ 01467 mov [edi], al /* move a byte result to Dest */ 01468 L10193: 01469 inc edx /* increment Src1, Src2, Dest */ 01470 inc esi /* pointer registers by one */ 01471 inc edi 01472 dec ecx /* decrease loop counter */ 01473 jnz L10191 /* check loop termination, proceed if required */ 01474 popa 01475 } 01476 #else 01477 /* Note: ~15% gain on i386, less efficient than C on x86_64 */ 01478 /* Also depends on whether the function is static (?!) */ 01479 /* Also depends on whether we work on malloc() or static char[] */ 01480 asm volatile ( 01481 # if defined(i386) 01482 "pushl %%ebx \n\t" /* %ebx may be the PIC register. */ 01483 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01484 "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */ 01485 "cmp $0, %%bl \n\t" /* check if it zero */ 01486 "jnz 2f \n\t" 01487 "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! 
*/ 01488 "jmp 3f \n\t" 01489 "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ 01490 "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */ 01491 "div %%bl \n\t" /* divide AL by BL */ 01492 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ 01493 "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */ 01494 "inc %%esi \n\t" /* pointer registers by one */ 01495 "inc %%edi \n\t" 01496 "dec %%ecx \n\t" /* decrease loop counter */ 01497 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01498 "popl %%ebx \n\t" /* restore %ebx */ 01499 : "+d" (Src1), /* load Src1 address into edx */ 01500 "+S" (Src2), /* load Src2 address into esi */ 01501 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */ 01502 "+D" (Dest) /* load Dest address into edi */ 01503 : 01504 : "memory", "rax" 01505 # elif defined(__x86_64__) 01506 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01507 "1: mov (%%rsi), %%bl \n\t" /* load a byte from Src2 */ 01508 "cmp $0, %%bl \n\t" /* check if it zero */ 01509 "jnz 2f \n\t" 01510 "movb $255, (%%rdi) \n\t" /* division by zero = 255 !!! 
*/ 01511 "jmp 3f \n\t" 01512 "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ 01513 "mov (%%rdx), %%al \n\t" /* load a byte from Src1 into AL */ 01514 "div %%bl \n\t" /* divide AL by BL */ 01515 "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */ 01516 "3: inc %%rdx \n\t" /* increment Src1, Src2, Dest */ 01517 "inc %%rsi \n\t" /* pointer registers by one */ 01518 "inc %%rdi \n\t" 01519 "dec %%rcx \n\t" /* decrease loop counter */ 01520 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01521 : "+d" (Src1), /* load Src1 address into edx */ 01522 "+S" (Src2), /* load Src2 address into esi */ 01523 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */ 01524 "+D" (Dest) /* load Dest address into edi */ 01525 : 01526 : "memory", "rax", "rbx" 01527 # endif 01528 ); 01529 #endif 01530 return (0); 01531 #else 01532 return (-1); 01533 #endif 01534 } 01535 01546 int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01547 { 01548 unsigned int i, istart; 01549 unsigned char *cursrc1, *cursrc2, *curdst; 01550 01551 /* Validate input parameters */ 01552 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01553 return(-1); 01554 if (length == 0) 01555 return(0); 01556 01557 if (SDL_imageFilterMMXdetect()) { 01558 if (length > 0) { 01559 /* Call ASM routine */ 01560 SDL_imageFilterDivASM(Src1, Src2, Dest, length); 01561 01562 /* Never unaligned bytes - we are done */ 01563 return (0); 01564 } else { 01565 return (-1); 01566 } 01567 } 01568 01569 /* Setup to process whole image */ 01570 istart = 0; 01571 cursrc1 = Src1; 01572 cursrc2 = Src2; 01573 curdst = Dest; 01574 01575 /* C routine to process image */ 01576 /* for (i = istart; i < length; i++) { */ 01577 /* if (*cursrc2 == 0) { */ 01578 /* *curdst = 255; */ 01579 /* } else { */ 01580 /* result = (int) *cursrc1 / (int) *cursrc2; */ 01581 /* *curdst = (unsigned char) result; */ 01582 /* } */ 01583 /* /\* Advance pointers *\/ */ 01584 /* 
cursrc1++; */ 01585 /* cursrc2++; */ 01586 /* curdst++; */ 01587 /* } */ 01588 for (i = istart; i < length; i++) { 01589 if (*cursrc2 == 0) { 01590 *curdst = 255; 01591 } else { 01592 *curdst = (int)*cursrc1 / (int)*cursrc2; // (int) for efficiency 01593 } 01594 /* Advance pointers */ 01595 cursrc1++; 01596 cursrc2++; 01597 curdst++; 01598 } 01599 01600 return (0); 01601 } 01602 01603 /* ------------------------------------------------------------------------------------ */ 01604 01614 static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength) 01615 { 01616 #ifdef USE_MMX 01617 #if !defined(GCC__) 01618 __asm 01619 { 01620 pusha 01621 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 01622 mov eax, Src1 /* load Src1 address into eax */ 01623 mov edi, Dest /* load Dest address into edi */ 01624 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01625 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01626 align 16 /* 16 byte alignment of the loop entry */ 01627 L91117: 01628 movq mm0, [eax] /* load 8 bytes from Src1 into mm1 */ 01629 pxor mm0, mm1 /* negate mm0 by xoring with mm1 */ 01630 movq [edi], mm0 /* store result in Dest */ 01631 add eax, 8 /* increase Src1, Src2 and Dest */ 01632 add edi, 8 01633 dec ecx /* decrease loop counter */ 01634 jnz L91117 /* check loop termination, proceed if required */ 01635 emms /* exit MMX state */ 01636 popa 01637 } 01638 #else 01639 /* i386 and x86_64 */ 01640 __m64 *mSrc1 = (__m64*)Src1; 01641 __m64 *mDest = (__m64*)Dest; 01642 __m64 mm1; 01643 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 01644 int i; 01645 for (i = 0; i < SrcLength/8; i++) { 01646 *mDest = _m_pxor(*mSrc1, mm1); /* negate mm0 by xoring with mm1 */ 01647 mSrc1++; 01648 mDest++; 01649 } 01650 _m_empty(); /* clean MMX state */ 01651 01652 #endif 01653 return (0); 01654 #else 01655 return (-1); 01656 #endif 01657 } 01658 01668 int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned 
char *Dest, unsigned int length) 01669 { 01670 unsigned int i, istart; 01671 unsigned char *cursrc1, *curdst; 01672 01673 /* Validate input parameters */ 01674 if ((Src1 == NULL) || (Dest == NULL)) 01675 return(-1); 01676 if (length == 0) 01677 return(0); 01678 01679 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01680 /* MMX routine */ 01681 SDL_imageFilterBitNegationMMX(Src1, Dest, length); 01682 01683 /* Check for unaligned bytes */ 01684 if ((length & 7) > 0) { 01685 /* Setup to process unaligned bytes */ 01686 istart = length & 0xfffffff8; 01687 cursrc1 = &Src1[istart]; 01688 curdst = &Dest[istart]; 01689 } else { 01690 /* No unaligned bytes - we are done */ 01691 return (0); 01692 } 01693 } else { 01694 /* Setup to process whole image */ 01695 istart = 0; 01696 cursrc1 = Src1; 01697 curdst = Dest; 01698 } 01699 01700 /* C routine to process image */ 01701 for (i = istart; i < length; i++) { 01702 *curdst = ~(*cursrc1); 01703 /* Advance pointers */ 01704 cursrc1++; 01705 curdst++; 01706 } 01707 01708 return (0); 01709 } 01710 01721 static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 01722 { 01723 #ifdef USE_MMX 01724 #if !defined(GCC__) 01725 __asm 01726 { 01727 pusha 01728 /* ** Duplicate C in 8 bytes of MM1 ** */ 01729 mov al, C /* load C into AL */ 01730 mov ah, al /* copy AL into AH */ 01731 mov bx, ax /* copy AX into BX */ 01732 shl eax, 16 /* shift 2 bytes of EAX left */ 01733 mov ax, bx /* copy BX into AX */ 01734 movd mm1, eax /* copy EAX into MM1 */ 01735 movd mm2, eax /* copy EAX into MM2 */ 01736 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 01737 mov eax, Src1 /* load Src1 address into eax */ 01738 mov edi, Dest /* load Dest address into edi */ 01739 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01740 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01741 align 16 /* 16 byte alignment of the loop entry */ 01742 L1021: 01743 movq mm0, [eax] /* 
load 8 bytes from Src1 into MM0 */ 01744 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01745 movq [edi], mm0 /* store result in Dest */ 01746 add eax, 8 /* increase Dest register pointer by 8 */ 01747 add edi, 8 /* increase Dest register pointer by 8 */ 01748 dec ecx /* decrease loop counter */ 01749 jnz L1021 /* check loop termination, proceed if required */ 01750 emms /* exit MMX state */ 01751 popa 01752 } 01753 #else 01754 /* i386 and x86_64 */ 01755 __m64 *mSrc1 = (__m64*)Src1; 01756 __m64 *mDest = (__m64*)Dest; 01757 /* Duplicate C in 8 bytes of MM1 */ 01758 int i; 01759 memset(&i, C, 4); 01760 __m64 mm1 = _m_from_int(i); 01761 __m64 mm2 = _m_from_int(i); 01762 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 01763 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 01764 for (i = 0; i < SrcLength/8; i++) { 01765 *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */ 01766 mSrc1++; 01767 mDest++; 01768 } 01769 _m_empty(); /* clean MMX state */ 01770 #endif 01771 return (0); 01772 #else 01773 return (-1); 01774 #endif 01775 } 01776 01788 int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 01789 { 01790 unsigned int i, istart; 01791 int iC; 01792 unsigned char *cursrc1, *curdest; 01793 int result; 01794 01795 /* Validate input parameters */ 01796 if ((Src1 == NULL) || (Dest == NULL)) 01797 return(-1); 01798 if (length == 0) 01799 return(0); 01800 01801 /* Special case: C==0 */ 01802 if (C == 0) { 01803 memcpy(Src1, Dest, length); 01804 return (0); 01805 } 01806 01807 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01808 01809 /* MMX routine */ 01810 SDL_imageFilterAddByteMMX(Src1, Dest, length, C); 01811 01812 /* Check for unaligned bytes */ 01813 if ((length & 7) > 0) { 01814 /* Setup to process unaligned bytes */ 01815 istart = length & 0xfffffff8; 01816 cursrc1 = &Src1[istart]; 01817 curdest = &Dest[istart]; 01818 } else { 01819 /* No 
unaligned bytes - we are done */ 01820 return (0); 01821 } 01822 } else { 01823 /* Setup to process whole image */ 01824 istart = 0; 01825 cursrc1 = Src1; 01826 curdest = Dest; 01827 } 01828 01829 /* C routine to process image */ 01830 iC = (int) C; 01831 for (i = istart; i < length; i++) { 01832 result = (int) *cursrc1 + iC; 01833 if (result > 255) 01834 result = 255; 01835 *curdest = (unsigned char) result; 01836 /* Advance pointers */ 01837 cursrc1++; 01838 curdest++; 01839 } 01840 return (0); 01841 } 01842 01854 static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D) 01855 { 01856 #ifdef USE_MMX 01857 #if !defined(GCC__) 01858 __asm 01859 { 01860 pusha 01861 /* ** Duplicate (int)C in 8 bytes of MM1 ** */ 01862 mov eax, C /* load C into EAX */ 01863 movd mm1, eax /* copy EAX into MM1 */ 01864 mov eax, D /* load D into EAX */ 01865 movd mm2, eax /* copy EAX into MM2 */ 01866 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 01867 mov eax, Src1 /* load Src1 address into eax */ 01868 mov edi, Dest /* load Dest address into edi */ 01869 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01870 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01871 align 16 /* 16 byte alignment of the loop entry */ 01872 L11023: 01873 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 01874 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01875 movq [edi], mm0 /* store result in SrcDest */ 01876 add eax, 8 /* increase Src1 register pointer by 8 */ 01877 add edi, 8 /* increase Dest register pointer by 8 */ 01878 dec ecx /* decrease loop counter */ 01879 jnz L11023 /* check loop termination, proceed if required */ 01880 emms /* exit MMX state */ 01881 popa 01882 } 01883 #else 01884 /* i386 and x86_64 */ 01885 __m64 *mSrc1 = (__m64*)Src1; 01886 __m64 *mDest = (__m64*)Dest; 01887 /* Duplicate (int)C in 8 bytes of MM1 */ 01888 __m64 mm1 = _m_from_int(C); 01889 __m64 
mm2 = _m_from_int(C); 01890 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 01891 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 01892 int i; 01893 for (i = 0; i < SrcLength/8; i++) { 01894 *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */ 01895 mSrc1++; 01896 mDest++; 01897 } 01898 _m_empty(); /* clean MMX state */ 01899 #endif 01900 return (0); 01901 #else 01902 return (-1); 01903 #endif 01904 } 01905 01916 int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C) 01917 { 01918 unsigned int i, j, istart, D; 01919 int iC[4]; 01920 unsigned char *cursrc1; 01921 unsigned char *curdest; 01922 int result; 01923 01924 /* Validate input parameters */ 01925 if ((Src1 == NULL) || (Dest == NULL)) 01926 return(-1); 01927 if (length == 0) 01928 return(0); 01929 01930 /* Special case: C==0 */ 01931 if (C == 0) { 01932 memcpy(Src1, Dest, length); 01933 return (0); 01934 } 01935 01936 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01937 01938 /* MMX routine */ 01939 D=SWAP_32(C); 01940 SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D); 01941 01942 /* Check for unaligned bytes */ 01943 if ((length & 7) > 0) { 01944 /* Setup to process unaligned bytes */ 01945 istart = length & 0xfffffff8; 01946 cursrc1 = &Src1[istart]; 01947 curdest = &Dest[istart]; 01948 } else { 01949 /* No unaligned bytes - we are done */ 01950 return (0); 01951 } 01952 } else { 01953 /* Setup to process whole image */ 01954 istart = 0; 01955 cursrc1 = Src1; 01956 curdest = Dest; 01957 } 01958 01959 /* C routine to process bytes */ 01960 iC[3] = (int) ((C >> 24) & 0xff); 01961 iC[2] = (int) ((C >> 16) & 0xff); 01962 iC[1] = (int) ((C >> 8) & 0xff); 01963 iC[0] = (int) ((C >> 0) & 0xff); 01964 for (i = istart; i < length; i += 4) { 01965 for (j = 0; j < 4; j++) { 01966 if ((i+j)<length) { 01967 result = (int) *cursrc1 + iC[j]; 01968 if (result > 255) result = 255; 01969 *curdest = (unsigned char) result; 
01970 /* Advance pointers */ 01971 cursrc1++; 01972 curdest++; 01973 } 01974 } 01975 } 01976 return (0); 01977 } 01978 01990 static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C, 01991 unsigned char *Mask) 01992 { 01993 #ifdef USE_MMX 01994 #if !defined(GCC__) 01995 __asm 01996 { 01997 pusha 01998 /* ** Duplicate C in 8 bytes of MM1 ** */ 01999 mov al, C /* load C into AL */ 02000 mov ah, al /* copy AL into AH */ 02001 mov bx, ax /* copy AX into BX */ 02002 shl eax, 16 /* shift 2 bytes of EAX left */ 02003 mov ax, bx /* copy BX into AX */ 02004 movd mm1, eax /* copy EAX into MM1 */ 02005 movd mm2, eax /* copy EAX into MM2 */ 02006 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02007 mov edx, Mask /* load Mask address into edx */ 02008 movq mm0, [edx] /* load Mask into mm0 */ 02009 mov eax, Src1 /* load Src1 address into eax */ 02010 mov edi, Dest /* load Dest address into edi */ 02011 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02012 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02013 align 16 /* 16 byte alignment of the loop entry */ 02014 L1022: 02015 movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */ 02016 psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */ 02017 pand mm2, mm0 // apply Mask to 8 BYTES of MM2 */ 02018 paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */ 02019 movq [edi], mm2 /* store result in Dest */ 02020 add eax, 8 /* increase Src1 register pointer by 8 */ 02021 add edi, 8 /* increase Dest register pointer by 8 */ 02022 dec ecx /* decrease loop counter */ 02023 jnz L1022 /* check loop termination, proceed if required */ 02024 emms /* exit MMX state */ 02025 popa 02026 } 02027 #else 02028 /* i386 and x86_64 */ 02029 __m64 *mSrc1 = (__m64*)Src1; 02030 __m64 *mDest = (__m64*)Dest; 02031 __m64 *mMask = (__m64*)Mask; 02032 /* Duplicate C in 8 bytes of MM1 */ 02033 int i; 02034 memset(&i, C, 4); 02035 __m64 mm1 = 
_m_from_int(i); 02036 __m64 mm2 = _m_from_int(i); 02037 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 02038 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02039 for (i = 0; i < SrcLength/8; i++) { 02040 __m64 mm2 = _m_psrlwi(*mSrc1, 1); /* shift 4 WORDS of MM2 1 bit to the right */ 02041 mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of MM2 */ 02042 /* byte 0x0f, 0xdb, 0xd0 */ 02043 *mDest = _m_paddusb(mm1, mm2); /* Src1+C (add 8 bytes with saturation) */ 02044 mSrc1++; 02045 mDest++; 02046 } 02047 _m_empty(); /* clean MMX state */ 02048 #endif 02049 return (0); 02050 #else 02051 return (-1); 02052 #endif 02053 } 02054 02065 int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02066 { 02067 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }; 02068 unsigned int i, istart; 02069 int iC; 02070 unsigned char *cursrc1; 02071 unsigned char *curdest; 02072 int result; 02073 02074 /* Validate input parameters */ 02075 if ((Src1 == NULL) || (Dest == NULL)) 02076 return(-1); 02077 if (length == 0) 02078 return(0); 02079 02080 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02081 02082 /* MMX routine */ 02083 SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask); 02084 02085 /* Check for unaligned bytes */ 02086 if ((length & 7) > 0) { 02087 /* Setup to process unaligned bytes */ 02088 istart = length & 0xfffffff8; 02089 cursrc1 = &Src1[istart]; 02090 curdest = &Dest[istart]; 02091 } else { 02092 /* No unaligned bytes - we are done */ 02093 return (0); 02094 } 02095 } else { 02096 /* Setup to process whole image */ 02097 istart = 0; 02098 cursrc1 = Src1; 02099 curdest = Dest; 02100 } 02101 02102 /* C routine to process image */ 02103 iC = (int) C; 02104 for (i = istart; i < length; i++) { 02105 result = (int) (*cursrc1 / 2) + iC; 02106 if (result > 255) 02107 result = 255; 02108 *curdest = (unsigned char) result; 02109 /* Advance pointers */ 
02110 cursrc1++; 02111 curdest++; 02112 } 02113 02114 return (0); 02115 } 02116 02127 int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 02128 { 02129 #ifdef USE_MMX 02130 #if !defined(GCC__) 02131 __asm 02132 { 02133 pusha 02134 /* ** Duplicate C in 8 bytes of MM1 ** */ 02135 mov al, C /* load C into AL */ 02136 mov ah, al /* copy AL into AH */ 02137 mov bx, ax /* copy AX into BX */ 02138 shl eax, 16 /* shift 2 bytes of EAX left */ 02139 mov ax, bx /* copy BX into AX */ 02140 movd mm1, eax /* copy EAX into MM1 */ 02141 movd mm2, eax /* copy EAX into MM2 */ 02142 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02143 mov eax, Src1 /* load Src1 address into eax */ 02144 mov edi, Dest /* load Dest address into edi */ 02145 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02146 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02147 align 16 /* 16 byte alignment of the loop entry */ 02148 L1023: 02149 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02150 psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */ 02151 movq [edi], mm0 /* store result in SrcDest */ 02152 add eax, 8 /* increase Src1 register pointer by 8 */ 02153 add edi, 8 /* increase Dest register pointer by 8 */ 02154 dec ecx /* decrease loop counter */ 02155 jnz L1023 /* check loop termination, proceed if required */ 02156 emms /* exit MMX state */ 02157 popa 02158 } 02159 #else 02160 /* i386 and x86_64 */ 02161 __m64 *mSrc1 = (__m64*)Src1; 02162 __m64 *mDest = (__m64*)Dest; 02163 /* Duplicate C in 8 bytes of MM1 */ 02164 int i; 02165 memset(&i, C, 4); 02166 __m64 mm1 = _m_from_int(i); 02167 __m64 mm2 = _m_from_int(i); 02168 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 02169 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02170 for (i = 0; i < SrcLength/8; i++) { 02171 *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */ 02172 mSrc1++; 02173 mDest++; 
02174 } 02175 _m_empty(); /* clean MMX state */ 02176 #endif 02177 return (0); 02178 #else 02179 return (-1); 02180 #endif 02181 } 02182 02193 int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02194 { 02195 unsigned int i, istart; 02196 int iC; 02197 unsigned char *cursrc1; 02198 unsigned char *curdest; 02199 int result; 02200 02201 /* Validate input parameters */ 02202 if ((Src1 == NULL) || (Dest == NULL)) 02203 return(-1); 02204 if (length == 0) 02205 return(0); 02206 02207 /* Special case: C==0 */ 02208 if (C == 0) { 02209 memcpy(Src1, Dest, length); 02210 return (0); 02211 } 02212 02213 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02214 02215 /* MMX routine */ 02216 SDL_imageFilterSubByteMMX(Src1, Dest, length, C); 02217 02218 /* Check for unaligned bytes */ 02219 if ((length & 7) > 0) { 02220 /* Setup to process unaligned bytes */ 02221 istart = length & 0xfffffff8; 02222 cursrc1 = &Src1[istart]; 02223 curdest = &Dest[istart]; 02224 } else { 02225 /* No unaligned bytes - we are done */ 02226 return (0); 02227 } 02228 } else { 02229 /* Setup to process whole image */ 02230 istart = 0; 02231 cursrc1 = Src1; 02232 curdest = Dest; 02233 } 02234 02235 /* C routine to process image */ 02236 iC = (int) C; 02237 for (i = istart; i < length; i++) { 02238 result = (int) *cursrc1 - iC; 02239 if (result < 0) 02240 result = 0; 02241 *curdest = (unsigned char) result; 02242 /* Advance pointers */ 02243 cursrc1++; 02244 curdest++; 02245 } 02246 return (0); 02247 } 02248 02260 static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D) 02261 { 02262 #ifdef USE_MMX 02263 #if !defined(GCC__) 02264 __asm 02265 { 02266 pusha 02267 /* ** Duplicate (int)C in 8 bytes of MM1 ** */ 02268 mov eax, C /* load C into EAX */ 02269 movd mm1, eax /* copy EAX into MM1 */ 02270 mov eax, D /* load D into EAX */ 02271 movd mm2, eax /* copy EAX into MM2 */ 
02272 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02273 mov eax, Src1 /* load Src1 address into eax */ 02274 mov edi, Dest /* load Dest address into edi */ 02275 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02276 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02277 align 16 /* 16 byte alignment of the loop entry */ 02278 L11024: 02279 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02280 psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */ 02281 movq [edi], mm0 /* store result in SrcDest */ 02282 add eax, 8 /* increase Src1 register pointer by 8 */ 02283 add edi, 8 /* increase Dest register pointer by 8 */ 02284 dec ecx /* decrease loop counter */ 02285 jnz L11024 /* check loop termination, proceed if required */ 02286 emms /* exit MMX state */ 02287 popa 02288 } 02289 #else 02290 /* i386 and x86_64 */ 02291 __m64 *mSrc1 = (__m64*)Src1; 02292 __m64 *mDest = (__m64*)Dest; 02293 /* Duplicate (int)C in 8 bytes of MM1 */ 02294 __m64 mm1 = _m_from_int(C); 02295 __m64 mm2 = _m_from_int(C); 02296 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 02297 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02298 int i; 02299 for (i = 0; i < SrcLength/8; i++) { 02300 *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */ 02301 mSrc1++; 02302 mDest++; 02303 } 02304 _m_empty(); /* clean MMX state */ 02305 #endif 02306 return (0); 02307 #else 02308 return (-1); 02309 #endif 02310 } 02311
/*
 * Filter using SubUint: subtracts the four bytes of C from the source bytes,
 * cycling through them every 4 bytes, with each result clamped at 0:
 *   Dest[i] = max(0, Src1[i] - ((C >> (8 * (i % 4))) & 0xff))
 *
 * Src1   - pointer to the source byte array (S).
 * Dest   - pointer to the destination byte array (D); may be the same as Src1.
 * length - number of bytes in each array.
 * C      - 32-bit constant whose bytes are subtracted (lowest byte first).
 *
 * Returns 0 on success, -1 if Src1 or Dest is NULL.
 */
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be handled by the C loop */
	unsigned int D;
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL)) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	/* Special case: C==0 leaves every byte unchanged, so copy source to
	   destination. memmove is used because both pointers may be identical. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole 8-byte groups. It returns -1 when the
		   library was built without USE_MMX; in that case nothing was written
		   and the C loop below must process the whole buffer. */
		D = SWAP_32(C);
		if (SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* Split C into the per-byte constants, lowest byte first */
	iC[0] = (int) ((C >> 0) & 0xff);
	iC[1] = (int) ((C >> 8) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[3] = (int) ((C >> 24) & 0xff);

	/* C routine: whole image, or the bytes left over after the MMX pass.
	   istart is always a multiple of 4, so (i & 3) selects the byte of C
	   that belongs to position i. */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - iC[i & 3];
		if (result < 0) {
			result = 0;
		}
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
02384 02396 static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N, 02397 unsigned char *Mask) 02398 { 02399 #ifdef USE_MMX 02400 #if !defined(GCC__) 02401 __asm 02402 { 02403 pusha 02404 mov edx, Mask /* load Mask address into edx */ 02405 movq mm0, [edx] /* load Mask into mm0 */ 02406 xor ecx, ecx /* zero ECX */ 02407 mov cl, N /* load loop counter (N) into CL */ 02408 movd mm3, ecx /* copy (N) into MM3 */ 02409 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 02410 L10240: /* ** Prepare proper bit-Mask in MM1 ** */ 02411 psrlw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the right */ 02412 pand mm1, mm0 // apply Mask to 8 BYTES of MM1 */ 02413 /* byte 0x0f, 0xdb, 0xc8 */ 02414 dec cl /* decrease loop counter */ 02415 jnz L10240 /* check loop termination, proceed if required */ 02416
/* ** Shift all bytes of the image ** */ 02417 mov eax, Src1 /* load Src1 address into eax */ 02418 mov edi, Dest /* load Dest address into edi */ 02419 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02420 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02421 align 16 /* 16 byte alignment of the loop entry */ 02422 L10241: 02423 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02424 psrlw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the right */ 02425 pand mm0, mm1 // apply proper bit-Mask to 8 BYTES of MM0 */ 02426 /* byte 0x0f, 0xdb, 0xc1 */ 02427 movq [edi], mm0 /* store result in SrcDest */ 02428 add eax, 8 /* increase Src1 register pointer by 8 */ 02429 add edi, 8 /* increase Dest register pointer by 8 */ 02430 dec ecx /* decrease loop counter */ 02431 jnz L10241 /* check loop termination, proceed if required */ 02432 emms /* exit MMX state */ 02433 popa 02434 } 02435 #else 02436 /* i386 and x86_64 */ 02437 __m64 *mSrc1 = (__m64*)Src1; 02438 __m64 *mDest = (__m64*)Dest; 02439 __m64 *mMask = (__m64*)Mask; 02440 __m64 mm1; 02441 int i; 02442 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 02443 /* Prepare proper bit-Mask in MM1 */ 02444 for (i = 0; i < N; i++) { 02445 mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the right */ 02446 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */ 02447 } 02448 /* Shift all bytes of the image */ 02449 for (i = 0; i < SrcLength/8; i++) { 02450 __m64 mm0 = _m_psrlwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the right */ 02451 *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */ 02452 mSrc1++; 02453 mDest++; 02454 } 02455 _m_empty(); /* clean MMX state */ 02456 #endif 02457 return (0); 02458 #else 02459 return (-1); 02460 #endif 02461 } 02462 02473 int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 02474 { 02475 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 
0x7F, 0x7F }; 02476 unsigned int i, istart; 02477 unsigned char *cursrc1; 02478 unsigned char *curdest; 02479 02480 /* Validate input parameters */ 02481 if ((Src1 == NULL) || (Dest == NULL)) 02482 return(-1); 02483 if (length == 0) 02484 return(0); 02485 02486 /* Check shift */ 02487 if (N > 8) { 02488 return (-1); 02489 } 02490 02491 /* Special case: N==0 */ 02492 if (N == 0) { 02493 memcpy(Src1, Dest, length); 02494 return (0); 02495 } 02496 02497 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02498 02499 /* MMX routine */ 02500 SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask); 02501 02502 /* Check for unaligned bytes */ 02503 if ((length & 7) > 0) { 02504 /* Setup to process unaligned bytes */ 02505 istart = length & 0xfffffff8; 02506 cursrc1 = &Src1[istart]; 02507 curdest = &Dest[istart]; 02508 } else { 02509 /* No unaligned bytes - we are done */ 02510 return (0); 02511 } 02512 } else { 02513 /* Setup to process whole image */ 02514 istart = 0; 02515 cursrc1 = Src1; 02516 curdest = Dest; 02517 } 02518 02519 /* C routine to process image */ 02520 for (i = istart; i < length; i++) { 02521 *curdest = (unsigned char) *cursrc1 >> N; 02522 /* Advance pointers */ 02523 cursrc1++; 02524 curdest++; 02525 } 02526 02527 return (0); 02528 } 02529 02540 static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N) 02541 { 02542 #ifdef USE_MMX 02543 #if !defined(GCC__) 02544 __asm 02545 { 02546 pusha 02547 mov eax, Src1 /* load Src1 address into eax */ 02548 mov edi, Dest /* load Dest address into edi */ 02549 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02550 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02551 align 16 /* 16 byte alignment of the loop entry */ 02552 L13023: 02553 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02554 psrld mm0, N 02555 movq [edi], mm0 /* store result in SrcDest */ 02556 add eax, 8 /* increase Src1 register pointer by 8 */ 02557 
add edi, 8 /* increase Dest register pointer by 8 */ 02558 dec ecx /* decrease loop counter */ 02559 jnz L13023 /* check loop termination, proceed if required */ 02560 emms /* exit MMX state */ 02561 popa 02562 } 02563 #else 02564 /* i386 and x86_64 */ 02565 __m64 *mSrc1 = (__m64*)Src1; 02566 __m64 *mDest = (__m64*)Dest; 02567 int i; 02568 for (i = 0; i < SrcLength/8; i++) { 02569 *mDest = _m_psrldi(*mSrc1, N); 02570 mSrc1++; 02571 mDest++; 02572 } 02573 _m_empty(); /* clean MMX state */ 02574 #endif 02575 return (0); 02576 #else 02577 return (-1); 02578 #endif 02579 } 02580 02591 int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 02592 { 02593 unsigned int i, istart; 02594 unsigned char *cursrc1, *curdest; 02595 unsigned int *icursrc1, *icurdest; 02596 unsigned int result; 02597 02598 /* Validate input parameters */ 02599 if ((Src1 == NULL) || (Dest == NULL)) 02600 return(-1); 02601 if (length == 0) 02602 return(0); 02603 02604 if (N > 32) { 02605 return (-1); 02606 } 02607 02608 /* Special case: N==0 */ 02609 if (N == 0) { 02610 memcpy(Src1, Dest, length); 02611 return (0); 02612 } 02613 02614 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02615 02616 SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N); 02617 02618 /* Check for unaligned bytes */ 02619 if ((length & 7) > 0) { 02620 /* Setup to process unaligned bytes */ 02621 istart = length & 0xfffffff8; 02622 cursrc1 = &Src1[istart]; 02623 curdest = &Dest[istart]; 02624 } else { 02625 /* No unaligned bytes - we are done */ 02626 return (0); 02627 } 02628 } else { 02629 /* Setup to process whole image */ 02630 istart = 0; 02631 cursrc1 = Src1; 02632 curdest = Dest; 02633 } 02634 02635 /* C routine to process image */ 02636 icursrc1=(unsigned int *)cursrc1; 02637 icurdest=(unsigned int *)curdest; 02638 for (i = istart; i < length; i += 4) { 02639 if ((i+4)<length) { 02640 result = ((unsigned int)*icursrc1 >> N); 02641 *icurdest = result; 
02642 } 02643 /* Advance pointers */ 02644 icursrc1++; 02645 icurdest++; 02646 } 02647 02648 return (0); 02649 } 02650 02661 static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 02662 { 02663 #ifdef USE_MMX 02664 #if !defined(GCC__) 02665 __asm 02666 { 02667 pusha 02668 /* ** Duplicate C in 4 words of MM1 ** */ 02669 mov al, C /* load C into AL */ 02670 xor ah, ah /* zero AH */ 02671 mov bx, ax /* copy AX into BX */ 02672 shl eax, 16 /* shift 2 bytes of EAX left */ 02673 mov ax, bx /* copy BX into AX */ 02674 movd mm1, eax /* copy EAX into MM1 */ 02675 movd mm2, eax /* copy EAX into MM2 */ 02676 punpckldq mm1, mm2 /* fill higher words of MM1 with C */ 02677 pxor mm0, mm0 /* zero MM0 register */ 02678 mov eax, Src1 /* load Src1 address into eax */ 02679 mov edi, Dest /* load Dest address into edi */ 02680 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02681 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02682 cmp al, 128 /* if (C <= 128) execute more efficient code */ 02683 jg L10251 02684 align 16 /* 16 byte alignment of the loop entry */ 02685 L10250: 02686 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 02687 movq mm4, mm3 /* copy MM3 into MM4 */ 02688 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 02689 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 02690 pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */ 02691 pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */ 02692 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 02693 movq [edi], mm3 /* store result in Dest */ 02694 add eax, 8 /* increase Src1 register pointer by 8 */ 02695 add edi, 8 /* increase Dest register pointer by 8 */ 02696 dec ecx /* decrease loop counter */ 02697 jnz L10250 /* check loop termination, proceed if required */ 02698 jmp L10252 02699 align 16 /* 16 byte alignment of the loop entry */ 02700 L10251: 02701 movq mm3, [eax] /* load 8 
bytes from Src1 into MM3 */ 02702 movq mm4, mm3 /* copy MM3 into MM4 */ 02703 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 02704 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 02705 pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */ 02706 pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */ 02707 /* ** Take abs value of the results (signed words) ** */ 02708 movq mm5, mm3 /* copy mm3 into mm5 */ 02709 movq mm6, mm4 /* copy mm4 into mm6 */ 02710 psraw mm5, 15 /* fill mm5 words with word sign bit */ 02711 psraw mm6, 15 /* fill mm6 words with word sign bit */ 02712 pxor mm3, mm5 /* take 1's compliment of only neg words */ 02713 pxor mm4, mm6 /* take 1's compliment of only neg words */ 02714 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 02715 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */ 02716 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 02717 movq [edi], mm3 /* store result in Dest */ 02718 add eax, 8 /* increase Src1 register pointer by 8 */ 02719 add edi, 8 /* increase Dest register pointer by 8 */ 02720 dec ecx /* decrease loop counter */ 02721 jnz L10251 /* check loop termination, proceed if required */ 02722 L10252: 02723 emms /* exit MMX state */ 02724 popa 02725 } 02726 #else 02727 /* i386 and x86_64 */ 02728 __m64 *mSrc1 = (__m64*)Src1; 02729 __m64 *mDest = (__m64*)Dest; 02730 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 02731 /* Duplicate C in 4 words of MM1 */ 02732 int i; 02733 i = C | C<<16; 02734 __m64 mm1 = _m_from_int(i); 02735 __m64 mm2 = _m_from_int(i); 02736 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */ 02737 // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48; 02738 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02739 if (C <= 128) { /* if (C <= 128) execute more efficient code */ 02740 for (i = 0; i < SrcLength/8; i++) { 02741 __m64 mm3, mm4; 02742 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 
into words */ 02743 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 02744 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */ 02745 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */ 02746 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 02747 mSrc1++; 02748 mDest++; 02749 } 02750 } else { 02751 for (i = 0; i < SrcLength/8; i++) { 02752 __m64 mm3, mm4, mm5, mm6; 02753 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 02754 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 02755 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */ 02756 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */ 02757 /* Take abs value of the results (signed words) */ 02758 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */ 02759 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */ 02760 mm3 = _m_pxor(mm3, mm5); /* take 1's compliment of only neg. words */ 02761 mm4 = _m_pxor(mm4, mm6); /* take 1's compliment of only neg. words */ 02762 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */ 02763 mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. 
words, W-(-1) or W-0 */ 02764 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 02765 mSrc1++; 02766 mDest++; 02767 } 02768 } 02769 _m_empty(); /* clean MMX state */ 02770 #endif 02771 return (0); 02772 #else 02773 return (-1); 02774 #endif 02775 } 02776 02787 int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02788 { 02789 unsigned int i, istart; 02790 int iC; 02791 unsigned char *cursrc1; 02792 unsigned char *curdest; 02793 int result; 02794 02795 /* Validate input parameters */ 02796 if ((Src1 == NULL) || (Dest == NULL)) 02797 return(-1); 02798 if (length == 0) 02799 return(0); 02800 02801 /* Special case: C==1 */ 02802 if (C == 1) { 02803 memcpy(Src1, Dest, length); 02804 return (0); 02805 } 02806 02807 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02808 02809 SDL_imageFilterMultByByteMMX(Src1, Dest, length, C); 02810 02811 /* Check for unaligned bytes */ 02812 if ((length & 7) > 0) { 02813 /* Setup to process unaligned bytes */ 02814 istart = length & 0xfffffff8; 02815 cursrc1 = &Src1[istart]; 02816 curdest = &Dest[istart]; 02817 } else { 02818 /* No unaligned bytes - we are done */ 02819 return (0); 02820 } 02821 } else { 02822 /* Setup to process whole image */ 02823 istart = 0; 02824 cursrc1 = Src1; 02825 curdest = Dest; 02826 } 02827 02828 /* C routine to process image */ 02829 iC = (int) C; 02830 for (i = istart; i < length; i++) { 02831 result = (int) *cursrc1 * iC; 02832 if (result > 255) 02833 result = 255; 02834 *curdest = (unsigned char) result; 02835 /* Advance pointers */ 02836 cursrc1++; 02837 curdest++; 02838 } 02839 02840 return (0); 02841 } 02842 02854 static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N, 02855 unsigned char C) 02856 { 02857 #ifdef USE_MMX 02858 #if !defined(GCC__) 02859 __asm 02860 { 02861 pusha 02862 /* ** Duplicate C in 4 words of MM1 ** */ 02863 mov 
al, C /* load C into AL */ 02864 xor ah, ah /* zero AH */ 02865 mov bx, ax /* copy AX into BX */ 02866 shl eax, 16 /* shift 2 bytes of EAX left */ 02867 mov ax, bx /* copy BX into AX */ 02868 movd mm1, eax /* copy EAX into MM1 */ 02869 movd mm2, eax /* copy EAX into MM2 */ 02870 punpckldq mm1, mm2 /* fill higher words of MM1 with C */ 02871 xor ecx, ecx /* zero ECX */ 02872 mov cl, N /* load N into CL */ 02873 movd mm7, ecx /* copy N into MM7 */ 02874 pxor mm0, mm0 /* zero MM0 register */ 02875 mov eax, Src1 /* load Src1 address into eax */ 02876 mov edi, Dest /* load Dest address into edi */ 02877 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02878 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02879 align 16 /* 16 byte alignment of the loop entry */ 02880 L1026: 02881 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 02882 movq mm4, mm3 /* copy MM3 into MM4 */ 02883 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 02884 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 02885 psrlw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */ 02886 psrlw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */ 02887 pmullw mm3, mm1 /* mul low bytes of SrcDest by MM1 */ 02888 pmullw mm4, mm1 /* mul high bytes of SrcDest by MM1 */ 02889 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 02890 movq [edi], mm3 /* store result in Dest */ 02891 add eax, 8 /* increase Src1 register pointer by 8 */ 02892 add edi, 8 /* increase Dest register pointer by 8 */ 02893 dec ecx /* decrease loop counter */ 02894 jnz L1026 /* check loop termination, proceed if required */ 02895 emms /* exit MMX state */ 02896 popa 02897 } 02898 #else 02899 /* i386 and x86_64 */ 02900 __m64 *mSrc1 = (__m64*)Src1; 02901 __m64 *mDest = (__m64*)Dest; 02902 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 02903 /* Duplicate C in 4 words of MM1 */ 02904 int i; 02905 i = (C<<16)|C; 02906 __m64 mm1 = _m_from_int(i); 02907 __m64 mm2 = 
_m_from_int(i); 02908 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */ 02909 for (i = 0; i < SrcLength/8; i++) { 02910 __m64 mm3, mm4, mm5, mm6; 02911 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 02912 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 02913 mm3 = _m_psrlwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the right */ 02914 mm4 = _m_psrlwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the right */ 02915 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */ 02916 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */ 02917 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 02918 mSrc1++; 02919 mDest++; 02920 } 02921 _m_empty(); /* clean MMX state */ 02922 #endif 02923 return (0); 02924 #else 02925 return (-1); 02926 #endif 02927 } 02928 02940 int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N, 02941 unsigned char C) 02942 { 02943 unsigned int i, istart; 02944 int iC; 02945 unsigned char *cursrc1; 02946 unsigned char *curdest; 02947 int result; 02948 02949 /* Validate input parameters */ 02950 if ((Src1 == NULL) || (Dest == NULL)) 02951 return(-1); 02952 if (length == 0) 02953 return(0); 02954 02955 /* Check shift */ 02956 if (N > 8) { 02957 return (-1); 02958 } 02959 02960 /* Special case: N==0 && C==1 */ 02961 if ((N == 0) && (C == 1)) { 02962 memcpy(Src1, Dest, length); 02963 return (0); 02964 } 02965 02966 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02967 02968 SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C); 02969 02970 /* Check for unaligned bytes */ 02971 if ((length & 7) > 0) { 02972 /* Setup to process unaligned bytes */ 02973 istart = length & 0xfffffff8; 02974 cursrc1 = &Src1[istart]; 02975 curdest = &Dest[istart]; 02976 } else { 02977 /* No unaligned bytes - we are done */ 02978 return (0); 02979 } 02980 } else { 02981 /* 
Setup to process whole image */ 02982 istart = 0; 02983 cursrc1 = Src1; 02984 curdest = Dest; 02985 } 02986 02987 /* C routine to process image */ 02988 iC = (int) C; 02989 for (i = istart; i < length; i++) { 02990 result = (int) (*cursrc1 >> N) * iC; 02991 if (result > 255) 02992 result = 255; 02993 *curdest = (unsigned char) result; 02994 /* Advance pointers */ 02995 cursrc1++; 02996 curdest++; 02997 } 02998 02999 return (0); 03000 } 03001 03013 static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N, 03014 unsigned char *Mask) 03015 { 03016 #ifdef USE_MMX 03017 #if !defined(GCC__) 03018 __asm 03019 { 03020 pusha 03021 mov edx, Mask /* load Mask address into edx */ 03022 movq mm0, [edx] /* load Mask into mm0 */ 03023 xor ecx, ecx /* zero ECX */ 03024 mov cl, N /* load loop counter (N) into CL */ 03025 movd mm3, ecx /* copy (N) into MM3 */ 03026 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 03027 L10270: /* ** Prepare proper bit-Mask in MM1 ** */ 03028 psllw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the left */ 03029 pand mm1, mm0 // apply Mask to 8 BYTES of MM1 */ 03030 /* byte 0x0f, 0xdb, 0xc8 */ 03031 dec cl /* decrease loop counter */ 03032 jnz L10270 /* check loop termination, proceed if required */ 03033 /* ** Shift all bytes of the image ** */ 03034 mov eax, Src1 /* load Src1 address into eax */ 03035 mov edi, Dest /* load SrcDest address into edi */ 03036 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03037 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03038 align 16 /* 16 byte alignment of the loop entry */ 03039 L10271: 03040 movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */ 03041 psllw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the left */ 03042 pand mm0, mm1 // apply proper bit-Mask to 8 BYTES of MM0 */ 03043 /* byte 0x0f, 0xdb, 0xc1 */ 03044 movq [edi], mm0 /* store result in Dest */ 03045 add eax, 8 /* increase Src1 register pointer by 8 */ 03046 add edi, 
8 /* increase Dest register pointer by 8 */ 03047 dec ecx /* decrease loop counter */ 03048 jnz L10271 /* check loop termination, proceed if required */ 03049 emms /* exit MMX state */ 03050 popa 03051 } 03052 #else 03053 /* i386 and x86_64 */ 03054 __m64 *mSrc1 = (__m64*)Src1; 03055 __m64 *mDest = (__m64*)Dest; 03056 __m64 *mMask = (__m64*)Mask; 03057 __m64 mm1; 03058 int i; 03059 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 03060 /* Prepare proper bit-Mask in MM1 */ 03061 for (i = 0; i < N; i++) { 03062 mm1 = _m_psllwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the left */ 03063 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */ 03064 } 03065 /* ** Shift all bytes of the image ** */ 03066 for (i = 0; i < SrcLength/8; i++) { 03067 __m64 mm0 = _m_psllwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the left */ 03068 *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */ 03069 mSrc1++; 03070 mDest++; 03071 } 03072 _m_empty(); /* clean MMX state */ 03073 #endif 03074 return (0); 03075 #else 03076 return (-1); 03077 #endif 03078 } 03079 03090 int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 03091 { 03092 static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE }; 03093 unsigned int i, istart; 03094 unsigned char *cursrc1, *curdest; 03095 int result; 03096 03097 /* Validate input parameters */ 03098 if ((Src1 == NULL) || (Dest == NULL)) 03099 return(-1); 03100 if (length == 0) 03101 return(0); 03102 03103 if (N > 8) { 03104 return (-1); 03105 } 03106 03107 /* Special case: N==0 */ 03108 if (N == 0) { 03109 memcpy(Src1, Dest, length); 03110 return (0); 03111 } 03112 03113 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 03114 03115 SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask); 03116 03117 /* Check for unaligned bytes */ 03118 if ((length & 7) > 0) { 03119 /* Setup to process unaligned bytes */ 03120 istart = length & 
0xfffffff8; 03121 cursrc1 = &Src1[istart]; 03122 curdest = &Dest[istart]; 03123 } else { 03124 /* No unaligned bytes - we are done */ 03125 return (0); 03126 } 03127 } else { 03128 /* Setup to process whole image */ 03129 istart = 0; 03130 cursrc1 = Src1; 03131 curdest = Dest; 03132 } 03133 03134 /* C routine to process image */ 03135 for (i = istart; i < length; i++) { 03136 result = ((int) *cursrc1 << N) & 0xff; 03137 *curdest = (unsigned char) result; 03138 /* Advance pointers */ 03139 cursrc1++; 03140 curdest++; 03141 } 03142 03143 return (0); 03144 } 03145 03156 static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N) 03157 { 03158 #ifdef USE_MMX 03159 #if !defined(GCC__) 03160 __asm 03161 { 03162 pusha 03163 mov eax, Src1 /* load Src1 address into eax */ 03164 mov edi, Dest /* load Dest address into edi */ 03165 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03166 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03167 align 16 /* 16 byte alignment of the loop entry */ 03168 L12023: 03169 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 03170 pslld mm0, N /* MM0=SrcDest+C (add 8 bytes with saturation) */ 03171 movq [edi], mm0 /* store result in SrcDest */ 03172 add eax, 8 /* increase Src1 register pointer by 8 */ 03173 add edi, 8 /* increase Dest register pointer by 8 */ 03174 dec ecx /* decrease loop counter */ 03175 jnz L12023 /* check loop termination, proceed if required */ 03176 emms /* exit MMX state */ 03177 popa 03178 } 03179 #else 03180 /* i386 and x86_64 */ 03181 __m64 *mSrc1 = (__m64*)Src1; 03182 __m64 *mDest = (__m64*)Dest; 03183 int i; 03184 for (i = 0; i < SrcLength/8; i++) { 03185 *mDest = _m_pslldi(*mSrc1, N); /* Src1+C (add 8 bytes with saturation) */ 03186 mSrc1++; 03187 mDest++; 03188 } 03189 _m_empty(); /* clean MMX state */ 03190 #endif 03191 return (0); 03192 #else 03193 return (-1); 03194 #endif 03195 } 03196 03207 int 
SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 03208 { 03209 unsigned int i, istart; 03210 unsigned char *cursrc1, *curdest; 03211 unsigned int *icursrc1, *icurdest; 03212 unsigned int result; 03213 03214 /* Validate input parameters */ 03215 if ((Src1 == NULL) || (Dest == NULL)) 03216 return(-1); 03217 if (length == 0) 03218 return(0); 03219 03220 if (N > 32) { 03221 return (-1); 03222 } 03223 03224 /* Special case: N==0 */ 03225 if (N == 0) { 03226 memcpy(Src1, Dest, length); 03227 return (0); 03228 } 03229 03230 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 03231 03232 SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N); 03233 03234 /* Check for unaligned bytes */ 03235 if ((length & 7) > 0) { 03236 /* Setup to process unaligned bytes */ 03237 istart = length & 0xfffffff8; 03238 cursrc1 = &Src1[istart]; 03239 curdest = &Dest[istart]; 03240 } else { 03241 /* No unaligned bytes - we are done */ 03242 return (0); 03243 } 03244 } else { 03245 /* Setup to process whole image */ 03246 istart = 0; 03247 cursrc1 = Src1; 03248 curdest = Dest; 03249 } 03250 03251 /* C routine to process image */ 03252 icursrc1=(unsigned int *)cursrc1; 03253 icurdest=(unsigned int *)curdest; 03254 for (i = istart; i < length; i += 4) { 03255 if ((i+4)<length) { 03256 result = ((unsigned int)*icursrc1 << N); 03257 *icurdest = result; 03258 } 03259 /* Advance pointers */ 03260 icursrc1++; 03261 icurdest++; 03262 } 03263 03264 return (0); 03265 } 03266 03277 static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N) 03278 { 03279 #ifdef USE_MMX 03280 #if !defined(GCC__) 03281 __asm 03282 { 03283 pusha 03284 xor eax, eax /* zero EAX */ 03285 mov al, N /* load N into AL */ 03286 movd mm7, eax /* copy N into MM7 */ 03287 pxor mm0, mm0 /* zero MM0 register */ 03288 mov eax, Src1 /* load Src1 address into eax */ 03289 mov edi, Dest /* load Dest address into edi */ 
03290 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03291 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03292 cmp al, 7 /* if (N <= 7) execute more efficient code */ 03293 jg L10281 03294 align 16 /* 16 byte alignment of the loop entry */ 03295 L10280: 03296 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03297 movq mm4, mm3 /* copy MM3 into MM4 */ 03298 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 03299 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 03300 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */ 03301 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */ 03302 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 03303 movq [edi], mm3 /* store result in Dest */ 03304 add eax, 8 /* increase Src1 register pointer by 8 */ 03305 add edi, 8 /* increase Dest register pointer by 8 */ 03306 dec ecx /* decrease loop counter */ 03307 jnz L10280 /* check loop termination, proceed if required */ 03308 jmp L10282 03309 align 16 /* 16 byte alignment of the loop entry */ 03310 L10281: 03311 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03312 movq mm4, mm3 /* copy MM3 into MM4 */ 03313 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 03314 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 03315 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */ 03316 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */ 03317 /* ** Take abs value of the signed words ** */ 03318 movq mm5, mm3 /* copy mm3 into mm5 */ 03319 movq mm6, mm4 /* copy mm4 into mm6 */ 03320 psraw mm5, 15 /* fill mm5 words with word sign bit */ 03321 psraw mm6, 15 /* fill mm6 words with word sign bit */ 03322 pxor mm3, mm5 /* take 1's compliment of only neg words */ 03323 pxor mm4, mm6 /* take 1's compliment of only neg words */ 03324 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 03325 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 
*/ 03326 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 03327 movq [edi], mm3 /* store result in Dest */ 03328 add eax, 8 /* increase Src1 register pointer by 8 */ 03329 add edi, 8 /* increase Dest register pointer by 8 */ 03330 dec ecx /* decrease loop counter */ 03331 jnz L10281 /* check loop termination, proceed if required */ 03332 L10282: 03333 emms /* exit MMX state */ 03334 popa 03335 } 03336 #else 03337 /* i386 and x86_64 */ 03338 __m64 *mSrc1 = (__m64*)Src1; 03339 __m64 *mDest = (__m64*)Dest; 03340 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 03341 int i; 03342 if (N <= 7) { /* if (N <= 7) execute more efficient code */ 03343 for (i = 0; i < SrcLength/8; i++) { 03344 __m64 mm3, mm4; 03345 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 03346 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 03347 mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */ 03348 mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */ 03349 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 03350 mSrc1++; 03351 mDest++; 03352 } 03353 } else { 03354 for (i = 0; i < SrcLength/8; i++) { 03355 __m64 mm3, mm4, mm5, mm6; 03356 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 03357 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 03358 mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */ 03359 mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */ 03360 /* Take abs value of the signed words */ 03361 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */ 03362 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */ 03363 mm3 = _m_pxor(mm3, mm5); /* take 1's compliment of only neg. words */ 03364 mm4 = _m_pxor(mm4, mm6); /* take 1's compliment of only neg. words */ 03365 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. 
/*
 * Filter using ShiftLeft: D = saturation255(S << N)
 *
 * Shifts every source byte left by N bits; results above 255 are clamped
 * to 255 before being stored in the destination.
 *
 * Src1   - pointer to the source bytes (must not be NULL)
 * Dest   - pointer to the destination bytes (must not be NULL)
 * length - number of bytes to process
 * N      - number of bit positions to shift; values above 8 are rejected
 *
 * Returns 0 on success (including length == 0), -1 on invalid arguments.
 */
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy of the source into the destination.
	   memmove (destination first!) also tolerates Src1 == Dest. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* The MMX helper handles length/8 groups of 8 bytes */
		SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process the remaining (length % 8) bytes in C */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		result = (int) *cursrc1 << N;
		/* Saturate instead of letting the high bits fall off */
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
/* ** Duplicate T in 8 bytes of MM3 ** */ 03465 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 03466 pcmpeqb mm2, mm2 /* generate all 1's in mm2 */ 03467 mov al, T /* load T into AL */ 03468 mov ah, al /* copy AL into AH */ 03469 mov bx, ax /* copy AX into BX */ 03470 shl eax, 16 /* shift 2 bytes of EAX left */ 03471 mov ax, bx /* copy BX into AX */ 03472 movd mm3, eax /* copy EAX into MM3 */ 03473 movd mm4, eax /* copy EAX into MM4 */ 03474 punpckldq mm3, mm4 /* fill higher bytes of MM3 with T */ 03475 psubusb mm2, mm3 /* store 0xFF - T in MM2 */ 03476 mov eax, Src1 /* load Src1 address into eax */ 03477 mov edi, Dest /* load Dest address into edi */ 03478 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03479 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03480 align 16 /* 16 byte alignment of the loop entry */ 03481 L1029: 03482 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 03483 paddusb mm0, mm2 /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */ 03484 pcmpeqb mm0, mm1 /* binarize 255:0, comparing to 255 */ 03485 movq [edi], mm0 /* store result in SrcDest */ 03486 add eax, 8 /* increase Src1 register pointer by 8 */ 03487 add edi, 8 /* increase Dest register pointer by 8 */ 03488 dec ecx /* decrease loop counter */ 03489 jnz L1029 /* check loop termination, proceed if required */ 03490 emms /* exit MMX state */ 03491 popa 03492 } 03493 #else 03494 /* i386 and x86_64 */ 03495 __m64 *mSrc1 = (__m64*)Src1; 03496 __m64 *mDest = (__m64*)Dest; 03497 /* Duplicate T in 8 bytes of MM3 */ 03498 __m64 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 03499 __m64 mm2 = _m_pcmpeqb(mm2, mm2); /* generate all 1's in mm1 */ 03500 int i; 03501 memset(&i, T, 4); 03502 __m64 mm3 = _m_from_int(i); 03503 __m64 mm4 = _m_from_int(i); 03504 mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with T */ 03505 mm2 = _m_psubusb(mm2, mm3); /* store 0xFF - T in MM2 */ 03506 //__m64 mm3 = _m_from_int64(lli); // x86_64 only 03507 for (i = 0; i 
/*
 * Filter using BinarizeUsingThreshold: D = (S >= T) ? 255 : 0
 *
 * Src1   - pointer to the source bytes (must not be NULL)
 * Dest   - pointer to the destination bytes (must not be NULL)
 * length - number of bytes to process
 * T      - threshold; bytes greater than or equal to T become 255, all others 0
 *
 * Returns 0 on success (including length == 0), -1 if a pointer is NULL.
 */
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
	unsigned int pos = 0;	/* index of the first byte still to be handled in C */

	/* Reject missing buffers; an empty buffer is trivially done */
	if (Src1 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Every byte is >= 0, so a zero threshold turns the whole output white */
	if (T == 0) {
		memset(Dest, 255, length);
		return 0;
	}

	if (SDL_imageFilterMMXdetect() && length > 7) {
		/* MMX helper processes length/8 groups of 8 bytes ... */
		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
		/* ... so only the trailing (length % 8) bytes remain, if any */
		pos = length & 0xfffffff8;
	}

	/* Plain C path: whole buffer without MMX, leftover tail with MMX */
	for (; pos < length; pos++) {
		if (Src1[pos] >= T) {
			Dest[pos] = 255;
		} else {
			Dest[pos] = 0;
		}
	}

	return 0;
}
/*
 * Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
 *
 * Src1   - pointer to the source bytes (must not be NULL)
 * Dest   - pointer to the destination bytes (must not be NULL)
 * length - number of bytes to process
 * Tmin   - lower bound; smaller source bytes are replaced by Tmin
 * Tmax   - upper bound; larger source bytes are replaced by Tmax
 *
 * Returns 0 on success (including length == 0), -1 if a pointer is NULL.
 */
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
			       unsigned char Tmax)
{
	unsigned int i, istart;
	unsigned char *cursrc1;
	unsigned char *curdest;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: Tmin==0 && Tmax==255 cannot clip anything, so the
	   filter degenerates to a copy of the source into the destination.
	   memmove (destination first!) also tolerates Src1 == Dest. */
	if ((Tmin == 0) && (Tmax == 255)) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* The MMX helper handles length/8 groups of 8 bytes */
		SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process the remaining (length % 8) bytes in C */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		if (*cursrc1 < Tmin) {
			*curdest = Tmin;
		} else if (*cursrc1 > Tmax) {
			*curdest = Tmax;
		} else {
			*curdest = *cursrc1;
		}
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
Duplicate Cmin in 4 words of MM1 ** */ 03784 mov ax, WORD PTR Cmin /* load Cmin into AX */ 03785 mov bx, ax /* copy AX into BX */ 03786 shl eax, 16 /* shift 2 bytes of EAX left */ 03787 mov ax, bx /* copy BX into AX */ 03788 movd mm1, eax /* copy EAX into MM1 */ 03789 movd mm2, eax /* copy EAX into MM2 */ 03790 punpckldq mm1, mm2 /* fill higher words of MM1 with Cmin */ 03791 /* ** Duplicate Nmin in 4 words of MM2 ** */ 03792 mov ax, WORD PTR Nmin /* load Nmin into AX */ 03793 mov bx, ax /* copy AX into BX */ 03794 shl eax, 16 /* shift 2 bytes of EAX left */ 03795 mov ax, bx /* copy BX into AX */ 03796 movd mm2, eax /* copy EAX into MM2 */ 03797 movd mm3, eax /* copy EAX into MM3 */ 03798 punpckldq mm2, mm3 /* fill higher words of MM2 with Nmin */ 03799 pxor mm7, mm7 /* zero MM7 register */ 03800 mov eax, Src1 /* load Src1 address into eax */ 03801 mov edi, Dest /* load Dest address into edi */ 03802 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03803 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03804 align 16 /* 16 byte alignment of the loop entry */ 03805 L1031: 03806 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03807 movq mm4, mm3 /* copy MM3 into MM4 */ 03808 punpcklbw mm3, mm7 /* unpack low bytes of SrcDest into words */ 03809 punpckhbw mm4, mm7 /* unpack high bytes of SrcDest into words */ 03810 psubusb mm3, mm1 /* S-Cmin, low bytes */ 03811 psubusb mm4, mm1 /* S-Cmin, high bytes */ 03812 pmullw mm3, mm0 /* MM0*(S-Cmin), low bytes */ 03813 pmullw mm4, mm0 /* MM0*(S-Cmin), high bytes */ 03814 paddusb mm3, mm2 /* MM0*(S-Cmin)+Nmin, low bytes */ 03815 paddusb mm4, mm2 /* MM0*(S-Cmin)+Nmin, high bytes */ 03816 /* ** Take abs value of the signed words ** */ 03817 movq mm5, mm3 /* copy mm3 into mm5 */ 03818 movq mm6, mm4 /* copy mm4 into mm6 */ 03819 psraw mm5, 15 /* fill mm5 words with word sign bit */ 03820 psraw mm6, 15 /* fill mm6 words with word sign bit */ 03821 pxor mm3, mm5 /* take 1's compliment of only neg words */ 03822 
pxor mm4, mm6 /* take 1's compliment of only neg words */ 03823 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 03824 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */ 03825 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 03826 movq [edi], mm3 /* store result in Dest */ 03827 add eax, 8 /* increase Src1 register pointer by 8 */ 03828 add edi, 8 /* increase Dest register pointer by 8 */ 03829 dec ecx /* decrease loop counter */ 03830 jnz L1031 /* check loop termination, proceed if required */ 03831 emms /* exit MMX state */ 03832 popa 03833 } 03834 #else 03835 /* i386 and x86_64 */ 03836 __m64 *mSrc1 = (__m64*)Src1; 03837 __m64 *mDest = (__m64*)Dest; 03838 __m64 mm0, mm1, mm2, mm3; 03839 03840 int i; 03841 /* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */ 03842 unsigned short a = Nmax - Nmin; 03843 unsigned short b = Cmax - Cmin; 03844 if (b == 0) { 03845 a = 255; 03846 } else { 03847 a /= b; 03848 } 03849 i = (a<<16)|a; 03850 mm0 = _m_from_int(i); 03851 mm1 = _m_from_int(i); 03852 mm0 = _m_punpckldq(mm0, mm1); /* fill higher words of MM0 with AX */ 03853 /* Duplicate Cmin in 4 words of MM1 */ 03854 i = (Cmin<<16)|(short)Cmin; 03855 mm1 = _m_from_int(i); 03856 mm2 = _m_from_int(i); 03857 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with Cmin */ 03858 /* Duplicate Nmin in 4 words of MM2 */ 03859 i = (Nmin<<16)|(short)Nmin; 03860 mm2 = _m_from_int(i); 03861 mm3 = _m_from_int(i); 03862 mm2 = _m_punpckldq(mm2, mm3); /* fill higher words of MM2 with Nmin */ 03863 __m64 mm7 = _m_from_int(0); /* zero mm0 register */ 03864 for (i = 0; i < SrcLength/8; i++) { 03865 __m64 mm3, mm4, mm5, mm6; 03866 mm3 = _m_punpcklbw(*mSrc1, mm7); /* unpack low bytes of Src1 into words */ 03867 mm4 = _m_punpckhbw(*mSrc1, mm7); /* unpack high bytes of Src1 into words */ 03868 mm3 = _m_psubusb(mm3, mm1); /* S-Cmin, low bytes */ 03869 mm4 = _m_psubusb(mm4, mm1); /* S-Cmin, high bytes */ 03870 mm3 = _m_pmullw(mm3, mm0); /* 
/*
 * Filter using NormalizeLinear:
 *   D = saturation255((Nmax - Nmin) / (Cmax - Cmin) * (S - Cmin) + Nmin)
 *
 * The scale factor (Nmax - Nmin) / (Cmax - Cmin) is computed once with
 * integer division. Results are clamped to the byte range 0..255.
 *
 * Src    - pointer to the source bytes (must not be NULL)
 * Dest   - pointer to the destination bytes (must not be NULL)
 * length - number of bytes to process
 * Cmin   - lower bound of the current value range
 * Cmax   - upper bound of the current value range
 * Nmin   - lower bound of the new value range
 * Nmax   - upper bound of the new value range
 *
 * Returns 0 on success (including length == 0), -1 if a pointer is NULL.
 */
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
				   int Nmax)
{
	unsigned int i, istart;
	unsigned char *cursrc;
	unsigned char *curdest;
	int dN, dC, factor;
	int result;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* The MMX helper handles length/8 groups of 8 bytes */
		SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process the remaining (length % 8) bytes in C */
			istart = length & 0xfffffff8;
			cursrc = &Src[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc = Src;
		curdest = Dest;
	}

	/* C routine to process image */
	dC = Cmax - Cmin;
	dN = Nmax - Nmin;
	/* An empty source range would divide by zero. Use 255 as the factor,
	   the same fallback the MMX helper applies, so that the bytes handled
	   here agree with the ones the helper already wrote and Dest is never
	   left partially processed. */
	factor = (dC == 0) ? 255 : (dN / dC);
	for (i = istart; i < length; i++) {
		result = factor * ((int) (*cursrc) - Cmin) + Nmin;
		/* Saturate at both ends; a negative value would otherwise wrap
		   around when narrowed to unsigned char. */
		if (result > 255)
			result = 255;
		else if (result < 0)
			result = 0;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc++;
		curdest++;
	}

	return (0);
}
use first and last column */ 04015 align 16 /* 16 byte alignment of the loop entry */ 04016 L10322: 04017 /* ---, */ 04018 movq mm1, [esi] /* load 8 bytes of the image first row */ 04019 add esi, eax /* move one row below */ 04020 movq mm2, [esi] /* load 8 bytes of the image second row */ 04021 add esi, eax /* move one row below */ 04022 movq mm3, [esi] /* load 8 bytes of the image third row */ 04023 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04024 punpcklbw mm2, mm0 /* unpack first 4 bytes into words */ 04025 punpcklbw mm3, mm0 /* unpack first 4 bytes into words */ 04026 pmullw mm1, mm5 /* multiply words first row image*Kernel */ 04027 pmullw mm2, mm6 /* multiply words second row image*Kernel */ 04028 pmullw mm3, mm7 /* multiply words third row image*Kernel */ 04029 paddsw mm1, mm2 /* add 4 words of the first and second rows */ 04030 paddsw mm1, mm3 /* add 4 words of the third row and result */ 04031 movq mm2, mm1 /* copy MM1 into MM2 */ 04032 psrlq mm1, 32 /* shift 2 left words to the right */ 04033 paddsw mm1, mm2 /* add 2 left and 2 right result words */ 04034 movq mm3, mm1 /* copy MM1 into MM3 */ 04035 psrlq mm1, 16 /* shift 1 left word to the right */ 04036 paddsw mm1, mm3 /* add 1 left and 1 right result words */ 04037 /* --, */ 04038 movd mm2, eax /* save EAX in MM2 */ 04039 movd mm3, edx /* save EDX in MM3 */ 04040 movd eax, mm1 /* copy MM1 into EAX */ 04041 psraw mm1, 15 /* spread sign bit of the result */ 04042 movd edx, mm1 /* fill EDX with a sign bit */ 04043 idiv bx /* IDIV - VERY EXPENSIVE */ 04044 movd mm1, eax /* move result of division into MM1 */ 04045 packuswb mm1, mm0 /* pack division result with saturation */ 04046 movd eax, mm1 /* copy saturated result into EAX */ 04047 mov [edi], al /* copy a byte result into Dest */ 04048 movd edx, mm3 /* restore saved EDX */ 04049 movd eax, mm2 /* restore saved EAX */ 04050 /* --, */ 04051 sub esi, eax /* move two rows up */ 04052 sub esi, eax /* */ 04053 inc esi /* move Src pointer to the 
next pixel */ 04054 inc edi /* move Dest pointer to the next pixel */ 04055 /* ---, */ 04056 dec ecx /* decrease loop counter COLUMNS */ 04057 jnz L10322 /* check loop termination, proceed if required */ 04058 add esi, 2 /* move to the next row in Src */ 04059 add edi, 2 /* move to the next row in Dest */ 04060 dec edx /* decrease loop counter ROWS */ 04061 jnz L10320 /* check loop termination, proceed if required */ 04062 /* ---, */ 04063 emms /* exit MMX state */ 04064 popa 04065 } 04066 #else 04067 asm volatile 04068 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04069 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04070 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04071 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04072 "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */ 04073 "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */ 04074 "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */ 04075 "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */ 04076 "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */ 04077 /* --- */ 04078 "mov %3, %%eax \n\t" /* load columns into EAX */ 04079 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */ 04080 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04081 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */ 04082 "inc %%edi \n\t" /* 1 byte offset from the left edge */ 04083 "mov %2, %%edx \n\t" /* initialize ROWS counter */ 04084 "sub $2, %%edx \n\t" /* do not use first and last row */ 04085 /* --- */ 04086 ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */ 04087 "sub $2, %%ecx \n\t" /* do not use first and last column */ 04088 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 04089 ".L10322: \n\t" 04090 /* --- */ 04091 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */ 04092 "add %%eax, %%esi \n\t" /* move one row below */ 04093 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */ 04094 "add %%eax, %%esi \n\t" /* move one row below */ 04095 
"movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */ 04096 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04097 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */ 04098 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */ 04099 "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */ 04100 "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */ 04101 "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */ 04102 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */ 04103 "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */ 04104 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04105 "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */ 04106 "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */ 04107 "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */ 04108 "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */ 04109 "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */ 04110 /* -- */ 04111 "movd %%eax, %%mm2 \n\t" /* save EAX in MM2 */ 04112 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04113 "movd %%mm1, %%eax \n\t" /* copy MM1 into EAX */ 04114 "psraw $15, %%mm1 \n\t" /* spread sign bit of the result */ 04115 "movd %%mm1, %%edx \n\t" /* fill EDX with a sign bit */ 04116 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04117 "movd %%eax, %%mm1 \n\t" /* move result of division into MM1 */ 04118 "packuswb %%mm0, %%mm1 \n\t" /* pack division result with saturation */ 04119 "movd %%mm1, %%eax \n\t" /* copy saturated result into EAX */ 04120 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 04121 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 04122 "movd %%mm2, %%eax \n\t" /* restore saved EAX */ 04123 /* -- */ 04124 "sub %%eax, %%esi \n\t" /* move two rows up */ 04125 "sub %%eax, %%esi \n\t" /* */ 04126 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 04127 
"inc %%edi \n\t" /* move Dest pointer to the next pixel */ 04128 /* --- */ 04129 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 04130 "jnz .L10322 \n\t" /* check loop termination, proceed if required */ 04131 "add $2, %%esi \n\t" /* move to the next row in Src */ 04132 "add $2, %%edi \n\t" /* move to the next row in Dest */ 04133 "dec %%edx \n\t" /* decrease loop counter ROWS */ 04134 "jnz .L10320 \n\t" /* check loop termination, proceed if required */ 04135 /* --- */ 04136 "emms \n\t" /* exit MMX state */ 04137 "popa \n\t":"=m" (Dest) /* %0 */ 04138 :"m"(Src), /* %1 */ 04139 "m"(rows), /* %2 */ 04140 "m"(columns), /* %3 */ 04141 "m"(Kernel), /* %4 */ 04142 "m"(Divisor) /* %5 */ 04143 ); 04144 #endif 04145 #endif 04146 return (0); 04147 } else { 04148 /* No non-MMX implementation yet */ 04149 return (-1); 04150 } 04151 } 04152 04167 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 04168 signed short *Kernel, unsigned char Divisor) 04169 { 04170 /* Validate input parameters */ 04171 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 04172 return(-1); 04173 04174 if ((columns < 5) || (rows < 5) || (Divisor == 0)) 04175 return (-1); 04176 04177 if ((SDL_imageFilterMMXdetect())) { 04178 //#ifdef USE_MMX 04179 #if defined(USE_MMX) && defined(i386) 04180 #if !defined(GCC__) 04181 __asm 04182 { 04183 pusha 04184 pxor mm0, mm0 /* zero MM0 */ 04185 xor ebx, ebx /* zero EBX */ 04186 mov bl, Divisor /* load Divisor into BL */ 04187 movd mm5, ebx /* copy Divisor into MM5 */ 04188 mov edx, Kernel /* load Kernel address into EDX */ 04189 mov esi, Src /* load Src address to ESI */ 04190 mov edi, Dest /* load Dest address to EDI */ 04191 add edi, 2 /* 2 column offset from the left edge */ 04192 mov eax, columns /* load columns into EAX */ 04193 shl eax, 1 /* EAX = columns * 2 */ 04194 add edi, eax /* 2 row offset from the top edge */ 04195 shr eax, 1 /* EAX = columns */ 04196 mov ebx, rows /* initialize ROWS 
counter */ 04197 sub ebx, 4 /* do not use first 2 and last 2 rows */ 04198 /* ---, */ 04199 L10330: 04200 mov ecx, eax /* initialize COLUMNS counter */ 04201 sub ecx, 4 /* do not use first 2 and last 2 columns */ 04202 align 16 /* 16 byte alignment of the loop entry */ 04203 L10332: 04204 pxor mm7, mm7 /* zero MM7 (accumulator) */ 04205 movd mm6, esi /* save ESI in MM6 */ 04206 /* --- 1 */ 04207 movq mm1, [esi] /* load 8 bytes of the Src */ 04208 movq mm2, mm1 /* copy MM1 into MM2 */ 04209 add esi, eax /* move Src pointer 1 row below */ 04210 movq mm3, [edx] /* load 4 words of Kernel */ 04211 add edx, 8 /* move pointer to other 4 words */ 04212 movq mm4, [edx] /* load 4 words of Kernel */ 04213 add edx, 8 /* move pointer to other 4 words */ 04214 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04215 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04216 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04217 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04218 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04219 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04220 /* --- 2 */ 04221 movq mm1, [esi] /* load 8 bytes of the Src */ 04222 movq mm2, mm1 /* copy MM1 into MM2 */ 04223 add esi, eax /* move Src pointer 1 row below */ 04224 movq mm3, [edx] /* load 4 words of Kernel */ 04225 add edx, 8 /* move pointer to other 4 words */ 04226 movq mm4, [edx] /* load 4 words of Kernel */ 04227 add edx, 8 /* move pointer to other 4 words */ 04228 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04229 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04230 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04231 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04232 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04233 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04234 /* --- 3 */ 04235 movq mm1, [esi] /* load 8 bytes of the Src */ 04236 movq mm2, mm1 /* copy MM1 into MM2 */ 04237 
add esi, eax /* move Src pointer 1 row below */ 04238 movq mm3, [edx] /* load 4 words of Kernel */ 04239 add edx, 8 /* move pointer to other 4 words */ 04240 movq mm4, [edx] /* load 4 words of Kernel */ 04241 add edx, 8 /* move pointer to other 4 words */ 04242 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04243 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04244 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04245 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04246 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04247 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04248 /* --- 4 */ 04249 movq mm1, [esi] /* load 8 bytes of the Src */ 04250 movq mm2, mm1 /* copy MM1 into MM2 */ 04251 add esi, eax /* move Src pointer 1 row below */ 04252 movq mm3, [edx] /* load 4 words of Kernel */ 04253 add edx, 8 /* move pointer to other 4 words */ 04254 movq mm4, [edx] /* load 4 words of Kernel */ 04255 add edx, 8 /* move pointer to other 4 words */ 04256 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04257 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04258 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04259 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04260 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04261 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04262 /* --- 5 */ 04263 movq mm1, [esi] /* load 8 bytes of the Src */ 04264 movq mm2, mm1 /* copy MM1 into MM2 */ 04265 movq mm3, [edx] /* load 4 words of Kernel */ 04266 add edx, 8 /* move pointer to other 4 words */ 04267 movq mm4, [edx] /* load 4 words of Kernel */ 04268 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04269 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04270 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04271 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04272 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04273 paddsw mm7, mm1 /* add 
MM1 to accumulator MM7 */ 04274 /* ---, */ 04275 movq mm3, mm7 /* copy MM7 into MM3 */ 04276 psrlq mm7, 32 /* shift 2 left words to the right */ 04277 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 04278 movq mm2, mm7 /* copy MM7 into MM2 */ 04279 psrlq mm7, 16 /* shift 1 left word to the right */ 04280 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 04281 /* ---, */ 04282 movd mm1, eax /* save EDX in MM1 */ 04283 movd mm2, ebx /* save EDX in MM2 */ 04284 movd mm3, edx /* save EDX in MM3 */ 04285 movd eax, mm7 /* load summation result into EAX */ 04286 psraw mm7, 15 /* spread sign bit of the result */ 04287 movd ebx, mm5 /* load Divisor into EBX */ 04288 movd edx, mm7 /* fill EDX with a sign bit */ 04289 idiv bx /* IDIV - VERY EXPENSIVE */ 04290 movd mm7, eax /* move result of division into MM7 */ 04291 packuswb mm7, mm0 /* pack division result with saturation */ 04292 movd eax, mm7 /* copy saturated result into EAX */ 04293 mov [edi], al /* copy a byte result into Dest */ 04294 movd edx, mm3 /* restore saved EDX */ 04295 movd ebx, mm2 /* restore saved EBX */ 04296 movd eax, mm1 /* restore saved EAX */ 04297 /* --, */ 04298 movd esi, mm6 /* move Src pointer to the top pixel */ 04299 sub edx, 72 /* EDX = Kernel address */ 04300 inc esi /* move Src pointer to the next pixel */ 04301 inc edi /* move Dest pointer to the next pixel */ 04302 /* ---, */ 04303 dec ecx /* decrease loop counter COLUMNS */ 04304 jnz L10332 /* check loop termination, proceed if required */ 04305 add esi, 4 /* move to the next row in Src */ 04306 add edi, 4 /* move to the next row in Dest */ 04307 dec ebx /* decrease loop counter ROWS */ 04308 jnz L10330 /* check loop termination, proceed if required */ 04309 /* ---, */ 04310 emms /* exit MMX state */ 04311 popa 04312 } 04313 #else 04314 asm volatile 04315 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04316 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04317 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04318 "movd %%ebx, 
%%mm5 \n\t" /* copy Divisor into MM5 */ 04319 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04320 "mov %1, %%esi \n\t" /* load Src address to ESI */ 04321 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04322 "add $2, %%edi \n\t" /* 2 column offset from the left edge */ 04323 "mov %3, %%eax \n\t" /* load columns into EAX */ 04324 "shl $1, %%eax \n\t" /* EAX = columns * 2 */ 04325 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */ 04326 "shr $1, %%eax \n\t" /* EAX = columns */ 04327 "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 04328 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */ 04329 /* --- */ 04330 ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 04331 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */ 04332 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 04333 ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 04334 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 04335 /* --- 1 */ 04336 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04337 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04338 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04339 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04340 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04341 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04342 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04343 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04344 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04345 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04346 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04347 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04348 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04349 /* --- 2 */ 04350 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04351 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04352 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04353 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04354 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04355 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04356 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04357 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04358 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04359 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04360 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04361 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04362 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04363 /* --- 3 */ 04364 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04365 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04366 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04367 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04368 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04369 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04370 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04371 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04372 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04373 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04374 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04375 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04376 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04377 /* --- 4 */ 04378 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04379 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04380 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04381 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04382 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04383 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04384 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04385 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04386 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04387 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04388 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04389 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04390 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04391 /* --- 5 */ 04392 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04393 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04394 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04395 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04396 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04397 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04398 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04399 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04400 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04401 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04402 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04403 /* --- */ 04404 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 04405 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 04406 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 04407 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 04408 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 04409 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 04410 /* --- */ 04411 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 04412 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 04413 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04414 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 04415 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 04416 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 04417 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 04418 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04419 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 04420 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 04421 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 04422 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 04423 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 04424 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 04425 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 04426 /* -- */ 04427 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 04428 "sub $72, %%edx \n\t" /* EDX = Kernel address */ 04429 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 04430 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 04431 /* --- */ 04432 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 04433 "jnz .L10332 \n\t" /* check loop termination, proceed if required */ 04434 "add $4, %%esi \n\t" /* move to the next row in Src */ 
04435 "add $4, %%edi \n\t" /* move to the next row in Dest */ 04436 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 04437 "jnz .L10330 \n\t" /* check loop termination, proceed if required */ 04438 /* --- */ 04439 "emms \n\t" /* exit MMX state */ 04440 "popa \n\t":"=m" (Dest) /* %0 */ 04441 :"m"(Src), /* %1 */ 04442 "m"(rows), /* %2 */ 04443 "m"(columns), /* %3 */ 04444 "m"(Kernel), /* %4 */ 04445 "m"(Divisor) /* %5 */ 04446 ); 04447 #endif 04448 #endif 04449 return (0); 04450 } else { 04451 /* No non-MMX implementation yet */ 04452 return (-1); 04453 } 04454 } 04455 04470 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 04471 signed short *Kernel, unsigned char Divisor) 04472 { 04473 /* Validate input parameters */ 04474 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 04475 return(-1); 04476 04477 if ((columns < 7) || (rows < 7) || (Divisor == 0)) 04478 return (-1); 04479 04480 if ((SDL_imageFilterMMXdetect())) { 04481 //#ifdef USE_MMX 04482 #if defined(USE_MMX) && defined(i386) 04483 #if !defined(GCC__) 04484 __asm 04485 { 04486 pusha 04487 pxor mm0, mm0 /* zero MM0 */ 04488 xor ebx, ebx /* zero EBX */ 04489 mov bl, Divisor /* load Divisor into BL */ 04490 movd mm5, ebx /* copy Divisor into MM5 */ 04491 mov edx, Kernel /* load Kernel address into EDX */ 04492 mov esi, Src /* load Src address to ESI */ 04493 mov edi, Dest /* load Dest address to EDI */ 04494 add edi, 3 /* 3 column offset from the left edge */ 04495 mov eax, columns /* load columns into EAX */ 04496 add edi, eax /* 3 row offset from the top edge */ 04497 add edi, eax 04498 add edi, eax 04499 mov ebx, rows /* initialize ROWS counter */ 04500 sub ebx, 6 /* do not use first 3 and last 3 rows */ 04501 /* ---, */ 04502 L10340: 04503 mov ecx, eax /* initialize COLUMNS counter */ 04504 sub ecx, 6 /* do not use first 3 and last 3 columns */ 04505 align 16 /* 16 byte alignment of the loop entry */ 04506 L10342: 04507 pxor mm7, mm7 /* zero 
MM7 (accumulator) */ 04508 movd mm6, esi /* save ESI in MM6 */ 04509 /* --- 1 */ 04510 movq mm1, [esi] /* load 8 bytes of the Src */ 04511 movq mm2, mm1 /* copy MM1 into MM2 */ 04512 add esi, eax /* move Src pointer 1 row below */ 04513 movq mm3, [edx] /* load 4 words of Kernel */ 04514 add edx, 8 /* move pointer to other 4 words */ 04515 movq mm4, [edx] /* load 4 words of Kernel */ 04516 add edx, 8 /* move pointer to other 4 words */ 04517 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04518 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04519 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04520 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04521 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04522 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04523 /* --- 2 */ 04524 movq mm1, [esi] /* load 8 bytes of the Src */ 04525 movq mm2, mm1 /* copy MM1 into MM2 */ 04526 add esi, eax /* move Src pointer 1 row below */ 04527 movq mm3, [edx] /* load 4 words of Kernel */ 04528 add edx, 8 /* move pointer to other 4 words */ 04529 movq mm4, [edx] /* load 4 words of Kernel */ 04530 add edx, 8 /* move pointer to other 4 words */ 04531 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04532 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04533 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04534 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04535 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04536 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04537 /* --- 3 */ 04538 movq mm1, [esi] /* load 8 bytes of the Src */ 04539 movq mm2, mm1 /* copy MM1 into MM2 */ 04540 add esi, eax /* move Src pointer 1 row below */ 04541 movq mm3, [edx] /* load 4 words of Kernel */ 04542 add edx, 8 /* move pointer to other 4 words */ 04543 movq mm4, [edx] /* load 4 words of Kernel */ 04544 add edx, 8 /* move pointer to other 4 words */ 04545 punpcklbw mm1, mm0 /* unpack first 4 bytes into 
words */ 04546 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04547 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04548 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04549 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04550 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04551 /* --- 4 */ 04552 movq mm1, [esi] /* load 8 bytes of the Src */ 04553 movq mm2, mm1 /* copy MM1 into MM2 */ 04554 add esi, eax /* move Src pointer 1 row below */ 04555 movq mm3, [edx] /* load 4 words of Kernel */ 04556 add edx, 8 /* move pointer to other 4 words */ 04557 movq mm4, [edx] /* load 4 words of Kernel */ 04558 add edx, 8 /* move pointer to other 4 words */ 04559 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04560 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04561 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04562 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04563 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04564 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04565 /* --- 5 */ 04566 movq mm1, [esi] /* load 8 bytes of the Src */ 04567 movq mm2, mm1 /* copy MM1 into MM2 */ 04568 add esi, eax /* move Src pointer 1 row below */ 04569 movq mm3, [edx] /* load 4 words of Kernel */ 04570 add edx, 8 /* move pointer to other 4 words */ 04571 movq mm4, [edx] /* load 4 words of Kernel */ 04572 add edx, 8 /* move pointer to other 4 words */ 04573 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04574 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04575 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04576 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04577 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04578 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04579 /* --- 6 */ 04580 movq mm1, [esi] /* load 8 bytes of the Src */ 04581 movq mm2, mm1 /* copy MM1 into MM2 */ 04582 add esi, eax /* move Src pointer 1 row below */ 04583 
movq mm3, [edx] /* load 4 words of Kernel */ 04584 add edx, 8 /* move pointer to other 4 words */ 04585 movq mm4, [edx] /* load 4 words of Kernel */ 04586 add edx, 8 /* move pointer to other 4 words */ 04587 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04588 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04589 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04590 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04591 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04592 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04593 /* --- 7 */ 04594 movq mm1, [esi] /* load 8 bytes of the Src */ 04595 movq mm2, mm1 /* copy MM1 into MM2 */ 04596 movq mm3, [edx] /* load 4 words of Kernel */ 04597 add edx, 8 /* move pointer to other 4 words */ 04598 movq mm4, [edx] /* load 4 words of Kernel */ 04599 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04600 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04601 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04602 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04603 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04604 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04605 /* ---, */ 04606 movq mm3, mm7 /* copy MM7 into MM3 */ 04607 psrlq mm7, 32 /* shift 2 left words to the right */ 04608 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 04609 movq mm2, mm7 /* copy MM7 into MM2 */ 04610 psrlq mm7, 16 /* shift 1 left word to the right */ 04611 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 04612 /* ---, */ 04613 movd mm1, eax /* save EDX in MM1 */ 04614 movd mm2, ebx /* save EDX in MM2 */ 04615 movd mm3, edx /* save EDX in MM3 */ 04616 movd eax, mm7 /* load summation result into EAX */ 04617 psraw mm7, 15 /* spread sign bit of the result */ 04618 movd ebx, mm5 /* load Divisor into EBX */ 04619 movd edx, mm7 /* fill EDX with a sign bit */ 04620 idiv bx /* IDIV - VERY EXPENSIVE */ 04621 movd mm7, eax /* move 
result of division into MM7 */ 04622 packuswb mm7, mm0 /* pack division result with saturation */ 04623 movd eax, mm7 /* copy saturated result into EAX */ 04624 mov [edi], al /* copy a byte result into Dest */ 04625 movd edx, mm3 /* restore saved EDX */ 04626 movd ebx, mm2 /* restore saved EBX */ 04627 movd eax, mm1 /* restore saved EAX */ 04628 /* --, */ 04629 movd esi, mm6 /* move Src pointer to the top pixel */ 04630 sub edx, 104 /* EDX = Kernel address */ 04631 inc esi /* move Src pointer to the next pixel */ 04632 inc edi /* move Dest pointer to the next pixel */ 04633 /* ---, */ 04634 dec ecx /* decrease loop counter COLUMNS */ 04635 jnz L10342 /* check loop termination, proceed if required */ 04636 add esi, 6 /* move to the next row in Src */ 04637 add edi, 6 /* move to the next row in Dest */ 04638 dec ebx /* decrease loop counter ROWS */ 04639 jnz L10340 /* check loop termination, proceed if required */ 04640 /* ---, */ 04641 emms /* exit MMX state */ 04642 popa 04643 } 04644 #else 04645 asm volatile 04646 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04647 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04648 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04649 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */ 04650 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04651 "mov %1, %%esi \n\t" /* load Src address to ESI */ 04652 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04653 "add $3, %%edi \n\t" /* 3 column offset from the left edge */ 04654 "mov %3, %%eax \n\t" /* load columns into EAX */ 04655 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */ 04656 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 04657 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */ 04658 /* --- */ 04659 ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 04660 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */ 04661 ".align 16 \n\t" /* 16 byte alignment of the 
loop entry */ 04662 ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 04663 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 04664 /* --- 1 */ 04665 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04666 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04667 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04668 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04669 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04670 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04671 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04672 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04673 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04674 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04675 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04676 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04677 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04678 /* --- 2 */ 04679 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04680 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04681 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04682 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04683 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04684 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04685 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04686 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04687 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04688 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04689 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04690 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04691 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04692 /* --- 3 */ 04693 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04694 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04695 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04696 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04697 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04698 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04699 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04700 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04701 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04702 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04703 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04704 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04705 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04706 /* --- 4 */ 04707 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04708 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04709 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04710 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04711 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04712 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04713 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04714 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04715 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04716 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04717 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04718 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04719 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04720 /* --- 5 */ 04721 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04722 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04723 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04724 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04725 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04726 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04727 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04728 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04729 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04730 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04731 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04732 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04733 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04734 /* --- 6 */ 04735 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04736 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04737 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04738 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04739 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04740 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04741 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04742 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04743 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04744 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04745 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04746 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04747 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04748 /* --- 7 */ 04749 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04750 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04751 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04752 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04753 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04754 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04755 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04756 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04757 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04758 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04759 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04760 /* --- */ 04761 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 04762 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 04763 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 04764 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 04765 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 04766 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 04767 /* --- */ 04768 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 04769 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 04770 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04771 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 04772 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 04773 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 04774 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 04775 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04776 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 04777 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 04778 "movd 
%%mm7, %%eax \n\t" /* copy saturated result into EAX */ 04779 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 04780 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 04781 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 04782 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 04783 /* -- */ 04784 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 04785 "sub $104, %%edx \n\t" /* EDX = Kernel address */ 04786 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 04787 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 04788 /* --- */ 04789 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 04790 "jnz .L10342 \n\t" /* check loop termination, proceed if required */ 04791 "add $6, %%esi \n\t" /* move to the next row in Src */ 04792 "add $6, %%edi \n\t" /* move to the next row in Dest */ 04793 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 04794 "jnz .L10340 \n\t" /* check loop termination, proceed if required */ 04795 /* --- */ 04796 "emms \n\t" /* exit MMX state */ 04797 "popa \n\t":"=m" (Dest) /* %0 */ 04798 :"m"(Src), /* %1 */ 04799 "m"(rows), /* %2 */ 04800 "m"(columns), /* %3 */ 04801 "m"(Kernel), /* %4 */ 04802 "m"(Divisor) /* %5 */ 04803 ); 04804 #endif 04805 #endif 04806 return (0); 04807 } else { 04808 /* No non-MMX implementation yet */ 04809 return (-1); 04810 } 04811 } 04812 04827 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, 04828 signed short *Kernel, unsigned char Divisor) 04829 { 04830 /* Validate input parameters */ 04831 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 04832 return(-1); 04833 04834 if ((columns < 9) || (rows < 9) || (Divisor == 0)) 04835 return (-1); 04836 04837 if ((SDL_imageFilterMMXdetect())) { 04838 //#ifdef USE_MMX 04839 #if defined(USE_MMX) && defined(i386) 04840 #if !defined(GCC__) 04841 __asm 04842 { 04843 pusha 04844 pxor mm0, mm0 /* zero MM0 */ 04845 xor ebx, ebx /* zero EBX */ 04846 mov bl, Divisor /* load Divisor 
into BL */ 04847 movd mm5, ebx /* copy Divisor into MM5 */ 04848 mov edx, Kernel /* load Kernel address into EDX */ 04849 mov esi, Src /* load Src address to ESI */ 04850 mov edi, Dest /* load Dest address to EDI */ 04851 add edi, 4 /* 4 column offset from the left edge */ 04852 mov eax, columns /* load columns into EAX */ 04853 add edi, eax /* 4 row offset from the top edge */ 04854 add edi, eax 04855 add edi, eax 04856 add edi, eax 04857 mov ebx, rows /* initialize ROWS counter */ 04858 sub ebx, 8 /* do not use first 4 and last 4 rows */ 04859 /* ---, */ 04860 L10350: 04861 mov ecx, eax /* initialize COLUMNS counter */ 04862 sub ecx, 8 /* do not use first 4 and last 4 columns */ 04863 align 16 /* 16 byte alignment of the loop entry */ 04864 L10352: 04865 pxor mm7, mm7 /* zero MM7 (accumulator) */ 04866 movd mm6, esi /* save ESI in MM6 */ 04867 /* --- 1 */ 04868 movq mm1, [esi] /* load 8 bytes of the Src */ 04869 movq mm2, mm1 /* copy MM1 into MM2 */ 04870 inc esi /* move pointer to the next 8 bytes of Src */ 04871 movq mm3, [edx] /* load 4 words of Kernel */ 04872 add edx, 8 /* move pointer to other 4 words */ 04873 movq mm4, [edx] /* load 4 words of Kernel */ 04874 add edx, 8 /* move pointer to other 4 words */ 04875 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04876 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04877 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04878 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04879 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04880 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04881 movq mm1, [esi] /* load 8 bytes of the Src */ 04882 dec esi 04883 add esi, eax /* move Src pointer 1 row below */ 04884 movq mm3, [edx] /* load 4 words of Kernel */ 04885 add edx, 8 /* move pointer to other 4 words */ 04886 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04887 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 04888 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04889 /* --- 2 */ 04890 movq mm1, [esi] /* load 8 bytes of the Src */ 04891 movq mm2, mm1 /* copy MM1 into MM2 */ 04892 inc esi /* move pointer to the next 8 bytes of Src */ 04893 movq mm3, [edx] /* load 4 words of Kernel */ 04894 add edx, 8 /* move pointer to other 4 words */ 04895 movq mm4, [edx] /* load 4 words of Kernel */ 04896 add edx, 8 /* move pointer to other 4 words */ 04897 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04898 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04899 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04900 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04901 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04902 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04903 movq mm1, [esi] /* load 8 bytes of the Src */ 04904 dec esi 04905 add esi, eax /* move Src pointer 1 row below */ 04906 movq mm3, [edx] /* load 4 words of Kernel */ 04907 add edx, 8 /* move pointer to other 4 words */ 04908 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04909 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04910 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04911 /* --- 3 */ 04912 movq mm1, [esi] /* load 8 bytes of the Src */ 04913 movq mm2, mm1 /* copy MM1 into MM2 */ 04914 inc esi /* move pointer to the next 8 bytes of Src */ 04915 movq mm3, [edx] /* load 4 words of Kernel */ 04916 add edx, 8 /* move pointer to other 4 words */ 04917 movq mm4, [edx] /* load 4 words of Kernel */ 04918 add edx, 8 /* move pointer to other 4 words */ 04919 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04920 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04921 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04922 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 04923 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04924 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04925 movq mm1, [esi] /* load 8 bytes of the Src */ 04926 dec esi 04927 add esi, eax /* move Src pointer 1 row below */ 04928 movq mm3, [edx] /* load 4 words of Kernel */ 04929 add edx, 8 /* move pointer to other 4 words */ 04930 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04931 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04932 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04933 /* --- 4 */ 04934 movq mm1, [esi] /* load 8 bytes of the Src */ 04935 movq mm2, mm1 /* copy MM1 into MM2 */ 04936 inc esi /* move pointer to the next 8 bytes of Src */ 04937 movq mm3, [edx] /* load 4 words of Kernel */ 04938 add edx, 8 /* move pointer to other 4 words */ 04939 movq mm4, [edx] /* load 4 words of Kernel */ 04940 add edx, 8 /* move pointer to other 4 words */ 04941 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04942 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04943 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04944 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04945 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04946 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04947 movq mm1, [esi] /* load 8 bytes of the Src */ 04948 dec esi 04949 add esi, eax /* move Src pointer 1 row below */ 04950 movq mm3, [edx] /* load 4 words of Kernel */ 04951 add edx, 8 /* move pointer to other 4 words */ 04952 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04953 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 04954 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04955 /* --- 5 */ 04956 movq mm1, [esi] /* load 8 bytes of the Src */ 04957 movq mm2, mm1 /* copy MM1 into MM2 */ 04958 inc esi /* move pointer to the next 8 bytes of Src */ 04959 movq mm3, [edx] /* load 4 words of Kernel */ 04960 add edx, 8 /* move pointer to other 4 words */ 04961 movq mm4, [edx] /* load 4 words of Kernel */ 04962 add edx, 8 /* move pointer to other 4 words */ 04963 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04964 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04965 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04966 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04967 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04968 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04969 movq mm1, [esi] /* load 8 bytes of the Src */ 04970 dec esi 04971 add esi, eax /* move Src pointer 1 row below */ 04972 movq mm3, [edx] /* load 4 words of Kernel */ 04973 add edx, 8 /* move pointer to other 4 words */ 04974 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04975 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04976 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04977 /* --- 6 */ 04978 movq mm1, [esi] /* load 8 bytes of the Src */ 04979 movq mm2, mm1 /* copy MM1 into MM2 */ 04980 inc esi /* move pointer to the next 8 bytes of Src */ 04981 movq mm3, [edx] /* load 4 words of Kernel */ 04982 add edx, 8 /* move pointer to other 4 words */ 04983 movq mm4, [edx] /* load 4 words of Kernel */ 04984 add edx, 8 /* move pointer to other 4 words */ 04985 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04986 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04987 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04988 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 04989 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04990 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04991 movq mm1, [esi] /* load 8 bytes of the Src */ 04992 dec esi 04993 add esi, eax /* move Src pointer 1 row below */ 04994 movq mm3, [edx] /* load 4 words of Kernel */ 04995 add edx, 8 /* move pointer to other 4 words */ 04996 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04997 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04998 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04999 /* --- 7 */ 05000 movq mm1, [esi] /* load 8 bytes of the Src */ 05001 movq mm2, mm1 /* copy MM1 into MM2 */ 05002 inc esi /* move pointer to the next 8 bytes of Src */ 05003 movq mm3, [edx] /* load 4 words of Kernel */ 05004 add edx, 8 /* move pointer to other 4 words */ 05005 movq mm4, [edx] /* load 4 words of Kernel */ 05006 add edx, 8 /* move pointer to other 4 words */ 05007 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05008 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05009 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05010 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05011 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05012 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05013 movq mm1, [esi] /* load 8 bytes of the Src */ 05014 dec esi 05015 add esi, eax /* move Src pointer 1 row below */ 05016 movq mm3, [edx] /* load 4 words of Kernel */ 05017 add edx, 8 /* move pointer to other 4 words */ 05018 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05019 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 05020 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05021 /* --- 8 */ 05022 movq mm1, [esi] /* load 8 bytes of the Src */ 05023 movq mm2, mm1 /* copy MM1 into MM2 */ 05024 inc esi /* move pointer to the next 8 bytes of Src */ 05025 movq mm3, [edx] /* load 4 words of Kernel */ 05026 add edx, 8 /* move pointer to other 4 words */ 05027 movq mm4, [edx] /* load 4 words of Kernel */ 05028 add edx, 8 /* move pointer to other 4 words */ 05029 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05030 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05031 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05032 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05033 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05034 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05035 movq mm1, [esi] /* load 8 bytes of the Src */ 05036 dec esi 05037 add esi, eax /* move Src pointer 1 row below */ 05038 movq mm3, [edx] /* load 4 words of Kernel */ 05039 add edx, 8 /* move pointer to other 4 words */ 05040 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05041 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05042 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05043 /* --- 9 */ 05044 movq mm1, [esi] /* load 8 bytes of the Src */ 05045 movq mm2, mm1 /* copy MM1 into MM2 */ 05046 inc esi /* move pointer to the next 8 bytes of Src */ 05047 movq mm3, [edx] /* load 4 words of Kernel */ 05048 add edx, 8 /* move pointer to other 4 words */ 05049 movq mm4, [edx] /* load 4 words of Kernel */ 05050 add edx, 8 /* move pointer to other 4 words */ 05051 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05052 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05053 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05054 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 05055 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05056 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05057 movq mm1, [esi] /* load 8 bytes of the Src */ 05058 movq mm3, [edx] /* load 4 words of Kernel */ 05059 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05060 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05061 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05062 /* ---, */ 05063 movq mm3, mm7 /* copy MM7 into MM3 */ 05064 psrlq mm7, 32 /* shift 2 left words to the right */ 05065 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 05066 movq mm2, mm7 /* copy MM7 into MM2 */ 05067 psrlq mm7, 16 /* shift 1 left word to the right */ 05068 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 05069 /* ---, */ 05070 movd mm1, eax /* save EDX in MM1 */ 05071 movd mm2, ebx /* save EDX in MM2 */ 05072 movd mm3, edx /* save EDX in MM3 */ 05073 movd eax, mm7 /* load summation result into EAX */ 05074 psraw mm7, 15 /* spread sign bit of the result */ 05075 movd ebx, mm5 /* load Divisor into EBX */ 05076 movd edx, mm7 /* fill EDX with a sign bit */ 05077 idiv bx /* IDIV - VERY EXPENSIVE */ 05078 movd mm7, eax /* move result of division into MM7 */ 05079 packuswb mm7, mm0 /* pack division result with saturation */ 05080 movd eax, mm7 /* copy saturated result into EAX */ 05081 mov [edi], al /* copy a byte result into Dest */ 05082 movd edx, mm3 /* restore saved EDX */ 05083 movd ebx, mm2 /* restore saved EBX */ 05084 movd eax, mm1 /* restore saved EAX */ 05085 /* --, */ 05086 movd esi, mm6 /* move Src pointer to the top pixel */ 05087 sub edx, 208 /* EDX = Kernel address */ 05088 inc esi /* move Src pointer to the next pixel */ 05089 inc edi /* move Dest pointer to the next pixel */ 05090 /* ---, */ 05091 dec ecx /* decrease loop counter COLUMNS */ 05092 jnz L10352 /* check loop termination, proceed if required */ 05093 add esi, 8 /* move to the next row in Src */ 05094 add edi, 8 /* move 
to the next row in Dest */ 05095 dec ebx /* decrease loop counter ROWS */ 05096 jnz L10350 /* check loop termination, proceed if required */ 05097 /* ---, */ 05098 emms /* exit MMX state */ 05099 popa 05100 } 05101 #else 05102 asm volatile 05103 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 05104 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 05105 "mov %5, %%bl \n\t" /* load Divisor into BL */ 05106 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */ 05107 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 05108 "mov %1, %%esi \n\t" /* load Src address to ESI */ 05109 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 05110 "add $4, %%edi \n\t" /* 4 column offset from the left edge */ 05111 "mov %3, %%eax \n\t" /* load columns into EAX */ 05112 "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */ 05113 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 05114 "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */ 05115 /* --- */ 05116 ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 05117 "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */ 05118 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 05119 ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 05120 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 05121 /* --- 1 */ 05122 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05123 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05124 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05125 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05126 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05127 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05128 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05129 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05130 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05131 "pmullw 
%%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05132 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05133 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05134 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05135 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05136 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05137 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05138 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05139 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05140 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05141 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05142 /* --- 2 */ 05143 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05144 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05145 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05146 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05147 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05148 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05149 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05150 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05151 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05152 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05153 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05154 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05155 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05156 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05157 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05158 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05159 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05160 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05161 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05162 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05163 /* --- 3 */ 05164 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05165 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05166 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05167 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05168 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05169 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05170 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05171 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05172 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05173 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05174 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05175 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05176 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05177 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05178 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05179 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05180 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05181 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05182 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05183 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05184 /* --- 4 */ 05185 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05186 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05187 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05188 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05189 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05190 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05191 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05192 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05193 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05194 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05195 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05196 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05197 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05198 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05199 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05200 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05201 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05202 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05203 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05204 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05205 /* --- 5 */ 05206 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05207 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05208 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05209 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05210 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05211 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05212 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05213 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05214 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05215 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05216 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05217 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05218 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05219 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05220 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05221 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05222 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05223 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05224 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05225 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05226 /* --- 6 */ 05227 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05228 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05229 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05230 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05231 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05232 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05233 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05234 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05235 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05236 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05237 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05238 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05239 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05240 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05241 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05242 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05243 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05244 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05245 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05246 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05247 /* --- 7 */ 05248 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05249 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05250 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05251 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05252 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05253 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05254 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05255 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05256 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05257 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05258 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05259 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05260 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05261 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05262 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05263 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05264 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05265 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05266 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05267 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05268 /* --- 8 */ 05269 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05270 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05271 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05272 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05273 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05274 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05275 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05276 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05277 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05278 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05279 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05280 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05281 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05282 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05283 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05284 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05285 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05286 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05287 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05288 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05289 /* --- 9 */ 05290 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05291 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05292 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05293 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05294 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05295 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05296 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05297 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05298 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05299 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05300 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05301 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05302 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05303 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05304 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05305 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05306 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05307 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05308 /* --- */ 05309 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 05310 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 05311 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 05312 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 05313 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 05314 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 05315 /* --- */ 05316 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 05317 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 05318 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 05319 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 05320 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 05321 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 05322 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 05323 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 05324 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 05325 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 05326 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 05327 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 05328 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 05329 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 05330 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 05331 /* -- */ 05332 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 05333 "sub $208, %%edx \n\t" /* EDX = Kernel address */ 05334 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05335 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05336 /* --- */ 05337 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05338 "jnz .L10352 \n\t" /* check loop termination, proceed if required */ 05339 "add $8, %%esi \n\t" /* move to the next row in Src */ 05340 "add $8, %%edi \n\t" /* move to the next row in Dest */ 05341 "dec %%ebx 
/*!
\brief Filter using ConvolveKernel3x3ShiftRight: 3x3 convolution of a byte image
       where every source pixel is shifted right by NRightShift before it is
       multiplied with the kernel.

Only an x86 MMX implementation exists. For each interior pixel the routine
- loads the 3 source rows under the kernel window, widens the bytes to 16-bit
  words and shifts each word right by NRightShift,
- multiplies them with the kernel words (PMULLW: low 16 bits of each product),
- adds all products with signed 16-bit saturation (PADDSW),
- clamps the sum to 0..255 (PACKUSWB) and stores one byte into Dest.

Only the (rows-2) x (columns-2) interior of Dest is written; the one pixel wide
border of Dest is left untouched.

\param Src         Source image, addressed as rows * columns bytes.
                   NOTE(review): every load is 8 bytes wide (MOVQ), so the MMX
                   path reads a few bytes beyond the 3 columns the kernel needs,
                   including past the last pixel of the last row - confirm that
                   callers provide that slack.
\param Dest        Destination image, same geometry as Src.
\param rows        Number of image rows, must be >= 3.
\param columns     Number of image columns, must be >= 3.
\param Kernel      Kernel stored as 3 rows of 4 signed 16-bit words
                   (|K0 K1 K2 0| |K3 K4 K5 0| |K6 K7 K8 0|, 12 words in total).
                   All four words of a row take part in the sum, so the 4th
                   word of every row has to be 0 for a pure 3x3 result.
\param NRightShift Right shift applied to every source pixel, 0..7.

\return 0 if the image was filtered; -1 on invalid parameters, if MMX is not
        available/enabled at runtime, or if the MMX code was not compiled in.
*/
int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 3) || (rows < 3) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
		/* NOTE(review): GCC only predefines `i386` in GNU dialects; strict ISO modes
		   define `__i386__` only, which selects the "not compiled in" branch below. */
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm4, ebx   	/* copy NRightShift into MM4 */
			mov edx, Kernel   	/* load Kernel address into EDX */
			movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
			add edx, 8   	/* second row              |K0 K1 K2 0| */
			movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
			add edx, 8   	/* third row               |K6 K7 K8 0| */
			movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
			/* --- pointers and counters --- */
			mov eax, columns   	/* load columns into EAX */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns (skip first row) */
			inc edi   	/* 1 byte offset from the left edge */
			mov edx, rows   	/* initialize ROWS counter */
			sub edx, 2   	/* do not use first and last row */
			/* --- row loop --- */
L10360:
			mov ecx, eax   	/* initialize COLUMNS counter */
			sub ecx, 2   	/* do not use first and last column */
			align 16   	/* 16 byte alignment of the loop entry */
L10362:
			/* --- one output pixel --- */
			movq mm1, [esi]   	/* load 8 bytes of the image first row */
			add esi, eax   	/* move one row below */
			movq mm2, [esi]   	/* load 8 bytes of the image second row */
			add esi, eax   	/* move one row below */
			movq mm3, [esi]   	/* load 8 bytes of the image third row */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
			punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm4   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm4   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm4   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
			pmullw mm2, mm6   	/* multiply words second row image*Kernel */
			pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
			paddsw mm1, mm2   	/* add 4 words of the first and second rows */
			paddsw mm1, mm3   	/* add 4 words of the third row and result */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			psrlq mm1, 32   	/* shift 2 left words to the right */
			paddsw mm1, mm2   	/* add 2 left and 2 right result words */
			movq mm3, mm1   	/* copy MM1 into MM3 */
			psrlq mm1, 16   	/* shift 1 left word to the right */
			paddsw mm1, mm3   	/* add 1 left and 1 right result words */
			packuswb mm1, mm0   	/* pack result with unsigned saturation */
			movd ebx, mm1   	/* copy saturated result into EBX */
			mov [edi], bl   	/* copy a byte result into Dest */
			/* --- advance to next pixel --- */
			sub esi, eax   	/* move two rows up */
			sub esi, eax
			inc esi   	/* move Src  pointer to the next pixel */
			inc edi   	/* move Dest pointer to the next pixel */
			/* --- loop control --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10362   	/* check loop termination, proceed if required */
			add esi, 2   	/* move to the next row in Src */
			add edi, 2   	/* move to the next row in Dest */
			dec edx   	/* decrease loop counter ROWS */
			jnz L10360   	/* check loop termination, proceed if required */
			/* --- done --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		/* NOTE(review): operands are "m" and are fetched after PUSHA has moved ESP;
		   this relies on the compiler not addressing them ESP-relative - confirm
		   the build flags (frame pointer) guarantee that. */
		asm volatile
			("pusha                     \n\t"
			 "pxor      %%mm0, %%mm0    \n\t"	/* zero MM0 */
			 "xor       %%ebx, %%ebx    \n\t"	/* zero EBX */
			 "mov       %5, %%bl        \n\t"	/* load NRightShift into BL */
			 "movd      %%ebx, %%mm4    \n\t"	/* copy NRightShift into MM4 */
			 "mov       %4, %%edx       \n\t"	/* load Kernel address into EDX */
			 "movq      (%%edx), %%mm5  \n\t"	/* MM5 = {0,K2,K1,K0} */
			 "add       $8, %%edx       \n\t"	/* second row              |K0 K1 K2 0| */
			 "movq      (%%edx), %%mm6  \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
			 "add       $8, %%edx       \n\t"	/* third row               |K6 K7 K8 0| */
			 "movq      (%%edx), %%mm7  \n\t"	/* MM7 = {0,K8,K7,K6} */
			 /* --- pointers and counters --- */
			 "mov       %3, %%eax       \n\t"	/* load columns into EAX */
			 "mov       %1, %%esi       \n\t"	/* ESI = Src row 0 address */
			 "mov       %0, %%edi       \n\t"	/* load Dest address to EDI */
			 "add       %%eax, %%edi    \n\t"	/* EDI = EDI + columns (skip first row) */
			 "inc       %%edi           \n\t"	/* 1 byte offset from the left edge */
			 "mov       %2, %%edx       \n\t"	/* initialize ROWS counter */
			 "sub       $2, %%edx       \n\t"	/* do not use first and last row */
			 /* --- row loop --- */
			 ".L10360:                  \n\t"
			 "mov       %%eax, %%ecx    \n\t"	/* initialize COLUMNS counter */
			 "sub       $2, %%ecx       \n\t"	/* do not use first and last column */
			 ".align 16                 \n\t"	/* 16 byte alignment of the loop entry */
			 ".L10362:                  \n\t"
			 /* --- one output pixel --- */
			 "movq      (%%esi), %%mm1  \n\t"	/* load 8 bytes of the image first row */
			 "add       %%eax, %%esi    \n\t"	/* move one row below */
			 "movq      (%%esi), %%mm2  \n\t"	/* load 8 bytes of the image second row */
			 "add       %%eax, %%esi    \n\t"	/* move one row below */
			 "movq      (%%esi), %%mm3  \n\t"	/* load 8 bytes of the image third row */
			 "punpcklbw %%mm0, %%mm1    \n\t"	/* unpack first 4 bytes into words */
			 "punpcklbw %%mm0, %%mm2    \n\t"	/* unpack first 4 bytes into words */
			 "punpcklbw %%mm0, %%mm3    \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm4, %%mm1    \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm4, %%mm2    \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm4, %%mm3    \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm5, %%mm1    \n\t"	/* multiply words first row  image*Kernel */
			 "pmullw    %%mm6, %%mm2    \n\t"	/* multiply words second row image*Kernel */
			 "pmullw    %%mm7, %%mm3    \n\t"	/* multiply words third row  image*Kernel */
			 "paddsw    %%mm2, %%mm1    \n\t"	/* add 4 words of the first and second rows */
			 "paddsw    %%mm3, %%mm1    \n\t"	/* add 4 words of the third row and result */
			 "movq      %%mm1, %%mm2    \n\t"	/* copy MM1 into MM2 */
			 "psrlq     $32, %%mm1      \n\t"	/* shift 2 left words to the right */
			 "paddsw    %%mm2, %%mm1    \n\t"	/* add 2 left and 2 right result words */
			 "movq      %%mm1, %%mm3    \n\t"	/* copy MM1 into MM3 */
			 "psrlq     $16, %%mm1      \n\t"	/* shift 1 left word to the right */
			 "paddsw    %%mm3, %%mm1    \n\t"	/* add 1 left and 1 right result words */
			 "packuswb  %%mm0, %%mm1    \n\t"	/* pack result with unsigned saturation */
			 "movd      %%mm1, %%ebx    \n\t"	/* copy saturated result into EBX */
			 "mov       %%bl, (%%edi)   \n\t"	/* copy a byte result into Dest */
			 /* --- advance to next pixel --- */
			 "sub       %%eax, %%esi    \n\t"	/* move two rows up */
			 "sub       %%eax, %%esi    \n\t"
			 "inc       %%esi           \n\t"	/* move Src  pointer to the next pixel */
			 "inc       %%edi           \n\t"	/* move Dest pointer to the next pixel */
			 /* --- loop control --- */
			 "dec       %%ecx           \n\t"	/* decrease loop counter COLUMNS */
			 "jnz       .L10362         \n\t"	/* check loop termination, proceed if required */
			 "add       $2, %%esi       \n\t"	/* move to the next row in Src */
			 "add       $2, %%edi       \n\t"	/* move to the next row in Dest */
			 "dec       %%edx           \n\t"	/* decrease loop counter ROWS */
			 "jnz       .L10360         \n\t"	/* check loop termination, proceed if required */
			 /* --- done --- */
			 "emms                      \n\t"	/* exit MMX state */
			 "popa                      \n\t"
			 :	/* no outputs: the Dest pointer itself is only read */
			 : "m"(Dest),		/* %0 */
			   "m"(Src),		/* %1 */
			   "m"(rows),		/* %2 */
			   "m"(columns),	/* %3 */
			   "m"(Kernel),		/* %4 */
			   "m"(NRightShift)	/* %5 */
			 : "memory"	/* the asm stores through Dest and loads through Src/Kernel */
			);
#endif
		return (0);
#else
		/* MMX was detected at runtime, but the MMX code is not part of this
		   build: nothing has been written to Dest, so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}
05593 movq mm2, mm1 /* copy MM1 into MM2 */ 05594 add esi, eax /* move Src pointer 1 row below */ 05595 movq mm3, [edx] /* load 4 words of Kernel */ 05596 add edx, 8 /* move pointer to other 4 words */ 05597 movq mm4, [edx] /* load 4 words of Kernel */ 05598 add edx, 8 /* move pointer to other 4 words */ 05599 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05600 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05601 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05602 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05603 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05604 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05605 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05606 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05607 /* --- 2 */ 05608 movq mm1, [esi] /* load 8 bytes of the Src */ 05609 movq mm2, mm1 /* copy MM1 into MM2 */ 05610 add esi, eax /* move Src pointer 1 row below */ 05611 movq mm3, [edx] /* load 4 words of Kernel */ 05612 add edx, 8 /* move pointer to other 4 words */ 05613 movq mm4, [edx] /* load 4 words of Kernel */ 05614 add edx, 8 /* move pointer to other 4 words */ 05615 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05616 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05617 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05618 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05619 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05620 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05621 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05622 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05623 /* --- 3 */ 05624 movq mm1, [esi] /* load 8 bytes of the Src */ 05625 movq mm2, mm1 /* copy MM1 into MM2 */ 05626 add esi, eax /* move Src pointer 1 row below */ 05627 movq mm3, [edx] /* load 4 words of Kernel */ 05628 add edx, 8 /* move pointer to other 4 words */ 05629 movq mm4, [edx] 
/* load 4 words of Kernel */ 05630 add edx, 8 /* move pointer to other 4 words */ 05631 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05632 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05633 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05634 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05635 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05636 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05637 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05638 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05639 /* --- 4 */ 05640 movq mm1, [esi] /* load 8 bytes of the Src */ 05641 movq mm2, mm1 /* copy MM1 into MM2 */ 05642 add esi, eax /* move Src pointer 1 row below */ 05643 movq mm3, [edx] /* load 4 words of Kernel */ 05644 add edx, 8 /* move pointer to other 4 words */ 05645 movq mm4, [edx] /* load 4 words of Kernel */ 05646 add edx, 8 /* move pointer to other 4 words */ 05647 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05648 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05649 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05650 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05651 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05652 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05653 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05654 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05655 /* --- 5 */ 05656 movq mm1, [esi] /* load 8 bytes of the Src */ 05657 movq mm2, mm1 /* copy MM1 into MM2 */ 05658 movq mm3, [edx] /* load 4 words of Kernel */ 05659 add edx, 8 /* move pointer to other 4 words */ 05660 movq mm4, [edx] /* load 4 words of Kernel */ 05661 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05662 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05663 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05664 psrlw mm2, mm5 /* shift right each pixel 
NshiftRight times */ 05665 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05666 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05667 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05668 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05669 /* ---, */ 05670 movq mm3, mm7 /* copy MM7 into MM3 */ 05671 psrlq mm7, 32 /* shift 2 left words to the right */ 05672 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 05673 movq mm2, mm7 /* copy MM7 into MM2 */ 05674 psrlq mm7, 16 /* shift 1 left word to the right */ 05675 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 05676 movd mm1, eax /* save EAX in MM1 */ 05677 packuswb mm7, mm0 /* pack division result with saturation */ 05678 movd eax, mm7 /* copy saturated result into EAX */ 05679 mov [edi], al /* copy a byte result into Dest */ 05680 movd eax, mm1 /* restore saved EAX */ 05681 /* --, */ 05682 movd esi, mm6 /* move Src pointer to the top pixel */ 05683 sub edx, 72 /* EDX = Kernel address */ 05684 inc esi /* move Src pointer to the next pixel */ 05685 inc edi /* move Dest pointer to the next pixel */ 05686 /* ---, */ 05687 dec ecx /* decrease loop counter COLUMNS */ 05688 jnz L10372 /* check loop termination, proceed if required */ 05689 add esi, 4 /* move to the next row in Src */ 05690 add edi, 4 /* move to the next row in Dest */ 05691 dec ebx /* decrease loop counter ROWS */ 05692 jnz L10370 /* check loop termination, proceed if required */ 05693 /* ---, */ 05694 emms /* exit MMX state */ 05695 popa 05696 } 05697 #else 05698 asm volatile 05699 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 05700 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 05701 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 05702 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */ 05703 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 05704 "mov %1, %%esi \n\t" /* load Src address to ESI */ 05705 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 05706 "add $2, %%edi \n\t" /* 2 
column offset from the left edge */ 05707 "mov %3, %%eax \n\t" /* load columns into EAX */ 05708 "shl $1, %%eax \n\t" /* EAX = columns * 2 */ 05709 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */ 05710 "shr $1, %%eax \n\t" /* EAX = columns */ 05711 "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 05712 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */ 05713 /* --- */ 05714 ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 05715 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */ 05716 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 05717 ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 05718 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 05719 /* --- 1 */ 05720 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05721 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05722 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05723 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05724 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05725 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05726 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05727 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05728 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05729 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05730 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05731 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05732 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05733 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05734 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05735 /* --- 2 */ 05736 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05737 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05738 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05739 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05740 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05741 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05742 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05743 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05744 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05745 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05746 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05747 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05748 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05749 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05750 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05751 /* --- 3 */ 05752 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05753 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05754 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05755 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05756 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05757 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05758 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05759 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05760 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05761 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05762 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05763 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05764 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05765 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05766 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05767 /* --- 4 */ 05768 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05769 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05770 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05771 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05772 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05773 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05774 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05775 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05776 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05777 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05778 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05779 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05780 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05781 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05782 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05783 /* --- 5 */ 05784 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05785 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05786 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05787 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05788 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05789 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05790 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05791 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05792 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05793 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05794 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05795 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05796 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05797 /* --- */ 05798 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 05799 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 05800 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 05801 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 05802 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 05803 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 05804 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 05805 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 05806 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 05807 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 05808 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 05809 /* -- */ 05810 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 05811 "sub $72, %%edx \n\t" /* EDX = Kernel address */ 05812 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05813 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05814 /* --- */ 05815 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05816 "jnz .L10372 \n\t" /* check loop termination, proceed if required */ 05817 "add $4, %%esi \n\t" /* move to the next row in Src */ 05818 "add $4, %%edi \n\t" /* move to the next row in Dest */ 05819 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 05820 "jnz .L10370 \n\t" /* check loop termination, proceed if required */ 05821 /* --- */ 05822 "emms \n\t" /* exit MMX state */ 05823 "popa \n\t":"=m" (Dest) /* %0 */ 05824 :"m"(Src), /* %1 */ 05825 "m"(rows), /* %2 */ 05826 "m"(columns), /* %3 */ 05827 "m"(Kernel), /* %4 */ 05828 "m"(NRightShift) /* %5 */ 05829 ); 05830 #endif 05831 #endif 05832 return (0); 05833 } else { 05834 /* No non-MMX implementation yet */ 05835 return (-1); 05836 } 05837 } 05838 05853 int 
SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, 05854 signed short *Kernel, unsigned char NRightShift) 05855 { 05856 /* Validate input parameters */ 05857 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 05858 return(-1); 05859 05860 if ((columns < 7) || (rows < 7) || (NRightShift > 7)) 05861 return (-1); 05862 05863 if ((SDL_imageFilterMMXdetect())) { 05864 //#ifdef USE_MMX 05865 #if defined(USE_MMX) && defined(i386) 05866 #if !defined(GCC__) 05867 __asm 05868 { 05869 pusha 05870 pxor mm0, mm0 /* zero MM0 */ 05871 xor ebx, ebx /* zero EBX */ 05872 mov bl, NRightShift /* load NRightShift into BL */ 05873 movd mm5, ebx /* copy NRightShift into MM5 */ 05874 mov edx, Kernel /* load Kernel address into EDX */ 05875 mov esi, Src /* load Src address to ESI */ 05876 mov edi, Dest /* load Dest address to EDI */ 05877 add edi, 3 /* 3 column offset from the left edge */ 05878 mov eax, columns /* load columns into EAX */ 05879 add edi, eax /* 3 row offset from the top edge */ 05880 add edi, eax 05881 add edi, eax 05882 mov ebx, rows /* initialize ROWS counter */ 05883 sub ebx, 6 /* do not use first 3 and last 3 rows */ 05884 /* ---, */ 05885 L10380: 05886 mov ecx, eax /* initialize COLUMNS counter */ 05887 sub ecx, 6 /* do not use first 3 and last 3 columns */ 05888 align 16 /* 16 byte alignment of the loop entry */ 05889 L10382: 05890 pxor mm7, mm7 /* zero MM7 (accumulator) */ 05891 movd mm6, esi /* save ESI in MM6 */ 05892 /* --- 1 */ 05893 movq mm1, [esi] /* load 8 bytes of the Src */ 05894 movq mm2, mm1 /* copy MM1 into MM2 */ 05895 add esi, eax /* move Src pointer 1 row below */ 05896 movq mm3, [edx] /* load 4 words of Kernel */ 05897 add edx, 8 /* move pointer to other 4 words */ 05898 movq mm4, [edx] /* load 4 words of Kernel */ 05899 add edx, 8 /* move pointer to other 4 words */ 05900 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05901 punpckhbw mm2, mm0 /* unpack second 4 bytes into words 
*/ 05902 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05903 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05904 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05905 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05906 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05907 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05908 /* --- 2 */ 05909 movq mm1, [esi] /* load 8 bytes of the Src */ 05910 movq mm2, mm1 /* copy MM1 into MM2 */ 05911 add esi, eax /* move Src pointer 1 row below */ 05912 movq mm3, [edx] /* load 4 words of Kernel */ 05913 add edx, 8 /* move pointer to other 4 words */ 05914 movq mm4, [edx] /* load 4 words of Kernel */ 05915 add edx, 8 /* move pointer to other 4 words */ 05916 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05917 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05918 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05919 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05920 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05921 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05922 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05923 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05924 /* --- 3 */ 05925 movq mm1, [esi] /* load 8 bytes of the Src */ 05926 movq mm2, mm1 /* copy MM1 into MM2 */ 05927 add esi, eax /* move Src pointer 1 row below */ 05928 movq mm3, [edx] /* load 4 words of Kernel */ 05929 add edx, 8 /* move pointer to other 4 words */ 05930 movq mm4, [edx] /* load 4 words of Kernel */ 05931 add edx, 8 /* move pointer to other 4 words */ 05932 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05933 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05934 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05935 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05936 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05937 pmullw mm2, mm4 
/* mult 4 high words of Src and Kernel */ 05938 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05939 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05940 /* --- 4 */ 05941 movq mm1, [esi] /* load 8 bytes of the Src */ 05942 movq mm2, mm1 /* copy MM1 into MM2 */ 05943 add esi, eax /* move Src pointer 1 row below */ 05944 movq mm3, [edx] /* load 4 words of Kernel */ 05945 add edx, 8 /* move pointer to other 4 words */ 05946 movq mm4, [edx] /* load 4 words of Kernel */ 05947 add edx, 8 /* move pointer to other 4 words */ 05948 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05949 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05950 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05951 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05952 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05953 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05954 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05955 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05956 /* --- 5 */ 05957 movq mm1, [esi] /* load 8 bytes of the Src */ 05958 movq mm2, mm1 /* copy MM1 into MM2 */ 05959 add esi, eax /* move Src pointer 1 row below */ 05960 movq mm3, [edx] /* load 4 words of Kernel */ 05961 add edx, 8 /* move pointer to other 4 words */ 05962 movq mm4, [edx] /* load 4 words of Kernel */ 05963 add edx, 8 /* move pointer to other 4 words */ 05964 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05965 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05966 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05967 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05968 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05969 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05970 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05971 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05972 /* --- 6 */ 05973 movq mm1, [esi] /* load 8 bytes of the 
Src */ 05974 movq mm2, mm1 /* copy MM1 into MM2 */ 05975 add esi, eax /* move Src pointer 1 row below */ 05976 movq mm3, [edx] /* load 4 words of Kernel */ 05977 add edx, 8 /* move pointer to other 4 words */ 05978 movq mm4, [edx] /* load 4 words of Kernel */ 05979 add edx, 8 /* move pointer to other 4 words */ 05980 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05981 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05982 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05983 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05984 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05985 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05986 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05987 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05988 /* --- 7 */ 05989 movq mm1, [esi] /* load 8 bytes of the Src */ 05990 movq mm2, mm1 /* copy MM1 into MM2 */ 05991 movq mm3, [edx] /* load 4 words of Kernel */ 05992 add edx, 8 /* move pointer to other 4 words */ 05993 movq mm4, [edx] /* load 4 words of Kernel */ 05994 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05995 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05996 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05997 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05998 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05999 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06000 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06001 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06002 /* ---, */ 06003 movq mm3, mm7 /* copy MM7 into MM3 */ 06004 psrlq mm7, 32 /* shift 2 left words to the right */ 06005 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 06006 movq mm2, mm7 /* copy MM7 into MM2 */ 06007 psrlq mm7, 16 /* shift 1 left word to the right */ 06008 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 06009 movd mm1, eax /* save EAX in MM1 */ 06010 
packuswb mm7, mm0 /* pack division result with saturation */ 06011 movd eax, mm7 /* copy saturated result into EAX */ 06012 mov [edi], al /* copy a byte result into Dest */ 06013 movd eax, mm1 /* restore saved EAX */ 06014 /* --, */ 06015 movd esi, mm6 /* move Src pointer to the top pixel */ 06016 sub edx, 104 /* EDX = Kernel address */ 06017 inc esi /* move Src pointer to the next pixel */ 06018 inc edi /* move Dest pointer to the next pixel */ 06019 /* ---, */ 06020 dec ecx /* decrease loop counter COLUMNS */ 06021 jnz L10382 /* check loop termination, proceed if required */ 06022 add esi, 6 /* move to the next row in Src */ 06023 add edi, 6 /* move to the next row in Dest */ 06024 dec ebx /* decrease loop counter ROWS */ 06025 jnz L10380 /* check loop termination, proceed if required */ 06026 /* ---, */ 06027 emms /* exit MMX state */ 06028 popa 06029 } 06030 #else 06031 asm volatile 06032 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 06033 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 06034 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 06035 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */ 06036 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 06037 "mov %1, %%esi \n\t" /* load Src address to ESI */ 06038 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 06039 "add $3, %%edi \n\t" /* 3 column offset from the left edge */ 06040 "mov %3, %%eax \n\t" /* load columns into EAX */ 06041 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */ 06042 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 06043 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */ 06044 /* --- */ 06045 ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 06046 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */ 06047 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 06048 ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 06049 "movd %%esi, %%mm6 
\n\t" /* save ESI in MM6 */ 06050 /* --- 1 */ 06051 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06052 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06053 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06054 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06055 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06056 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06057 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06058 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06059 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06060 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06061 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06062 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06063 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06064 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06065 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06066 /* --- 2 */ 06067 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06068 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06069 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06070 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06071 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06072 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06073 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06074 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06075 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06076 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06077 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06078 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06079 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06080 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06081 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06082 /* --- 3 */ 06083 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06084 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06085 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06086 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06087 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06088 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06089 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06090 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06091 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06092 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06093 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06094 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06095 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06096 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06097 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06098 /* --- 4 */ 06099 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06100 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06101 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06102 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06103 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06104 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06105 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06106 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06107 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06108 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06109 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06110 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06111 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06112 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06113 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06114 /* --- 5 */ 06115 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06116 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06117 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06118 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06119 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06120 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06121 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06122 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06123 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06124 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06125 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06126 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06127 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06128 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06129 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06130 /* --- 6 */ 06131 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06132 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06133 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06134 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06135 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06136 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06137 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06138 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06139 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06140 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06141 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06142 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06143 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06144 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06145 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06146 /* --- 7 */ 06147 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06148 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06149 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06150 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06151 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06152 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06153 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06154 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06155 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06156 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06157 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06158 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06159 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06160 /* --- */ 06161 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 06162 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 06163 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 06164 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 06165 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 06166 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 06167 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 06168 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 06169 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 06170 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 06171 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 06172 /* -- */ 06173 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 06174 "sub $104, %%edx \n\t" /* EDX = Kernel address */ 06175 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 06176 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 06177 /* --- */ 06178 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 06179 "jnz .L10382 \n\t" /* check loop termination, proceed if required */ 06180 "add $6, %%esi \n\t" /* move to the next row in Src */ 06181 "add $6, %%edi \n\t" /* move to the next row in Dest */ 06182 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 06183 "jnz .L10380 \n\t" /* check loop termination, proceed if required */ 06184 /* --- */ 06185 "emms \n\t" /* exit MMX state */ 06186 "popa \n\t":"=m" (Dest) /* %0 */ 06187 :"m"(Src), /* %1 */ 06188 "m"(rows), /* %2 */ 06189 "m"(columns), /* %3 */ 06190 "m"(Kernel), /* %4 */ 06191 "m"(NRightShift) /* %5 */ 06192 ); 06193 #endif 06194 #endif 06195 return (0); 06196 } else { 06197 /* No non-MMX implementation yet */ 06198 return (-1); 06199 } 06200 } 06201 06216 int 
SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
                                           signed short *Kernel, unsigned char NRightShift)
{
    /*
     * Filter using ConvolveKernel9x9ShiftRight:
     *   Dij = saturation0and255( sum( (Sxy >> NRightShift) * Kxy ) )
     *
     * Src         : source byte image; rows are stepped `columns` bytes apart.
     * Dest        : destination byte image with the same row stride. Only the
     *               interior is written: the first/last 4 rows and the
     *               first/last 4 columns are left untouched.
     * rows        : number of image rows, must be >= 9.
     * columns     : number of image columns, must be >= 9.
     * Kernel      : signed 16-bit coefficients. The MMX code consumes 12 words
     *               per kernel row (8 words against source bytes x+0..x+7, then
     *               4 words against the low 4 bytes loaded from x+1), i.e.
     *               108 words are read in total.
     *               NOTE(review): the third load per row is taken from x+1,
     *               not x+8 - confirm the kernel layout callers are expected
     *               to provide.
     * NRightShift : per-pixel right shift applied to the source before the
     *               multiplication, valid range 0..7.
     *
     * Returns 0 when the image was processed, -1 on invalid parameters or when
     * no implementation is available (MMX not detected at run time, or the
     * MMX routine was not compiled into this build).
     */

    /* Validate input parameters */
    if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
        return (-1);

    if ((columns < 9) || (rows < 9) || (NRightShift > 7))
        return (-1);

    if (!SDL_imageFilterMMXdetect()) {
        /* No non-MMX implementation yet */
        return (-1);
    }

#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
    __asm
    {
        pusha
        pxor mm0, mm0          /* zero MM0 (used as unpack source and pack partner) */
        xor ebx, ebx           /* zero EBX */
        mov bl, NRightShift    /* load NRightShift into BL */
        movd mm5, ebx          /* copy NRightShift into MM5 (shift count) */
        mov edx, Kernel        /* load Kernel address into EDX */
        mov esi, Src           /* load Src address to ESI */
        mov edi, Dest          /* load Dest address to EDI */
        add edi, 4             /* 4 column offset from the left edge */
        mov eax, columns       /* load columns into EAX */
        add edi, eax           /* 4 row offset from the top edge */
        add edi, eax
        add edi, eax
        add edi, eax
        mov ebx, rows          /* initialize ROWS counter */
        sub ebx, 8             /* do not use first 4 and last 4 rows */
        /* --- */
      L10390:
        mov ecx, eax           /* initialize COLUMNS counter */
        sub ecx, 8             /* do not use first 4 and last 4 columns */
        align 16               /* 16 byte alignment of the loop entry */
      L10392:
        pxor mm7, mm7          /* zero MM7 (accumulator) */
        movd mm6, esi          /* save ESI in MM6 */
        /* --- kernel row 1 --- */
        movq mm1, [esi]        /* load 8 bytes of the Src */
        movq mm2, mm1          /* copy MM1 into MM2 */
        inc esi                /* advance Src pointer by one byte */
        movq mm3, [edx]        /* load 4 words of Kernel */
        add edx, 8             /* move pointer to the next 4 words */
        movq mm4, [edx]        /* load 4 words of Kernel */
        add edx, 8             /* move pointer to the next 4 words */
        punpcklbw mm1, mm0     /* unpack low 4 bytes into words */
        punpckhbw mm2, mm0     /* unpack high 4 bytes into words */
        psrlw mm1, mm5         /* shift right each pixel NRightShift times */
        psrlw mm2, mm5         /* shift right each pixel NRightShift times */
        pmullw mm1, mm3        /* multiply 4 low words of Src and Kernel */
        pmullw mm2, mm4        /* multiply 4 high words of Src and Kernel */
        paddsw mm1, mm2        /* add the low and high products */
        paddsw mm7, mm1        /* add MM1 to accumulator MM7 */
        movq mm1, [esi]        /* load 8 bytes of the Src (one byte further right) */
        dec esi                /* undo the one byte advance */
        add esi, eax           /* move Src pointer 1 row below */
        movq mm3, [edx]        /* load 4 words of Kernel */
        add edx, 8             /* move pointer to the next 4 words */
        punpcklbw mm1, mm0     /* unpack low 4 bytes into words */
        psrlw mm1, mm5         /* shift right each pixel NRightShift times */
        pmullw mm1, mm3        /* multiply 4 words of Src and Kernel */
        paddsw mm7, mm1        /* add MM1 to accumulator MM7 */
        /* --- kernel row 2 (same sequence as row 1) --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 3 --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 4 --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 5 --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 6 --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 7 --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 8 --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        dec esi
        add esi, eax
        movq mm3, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- kernel row 9 (last row: no step to a further row, no final EDX advance) --- */
        movq mm1, [esi]
        movq mm2, mm1
        inc esi
        movq mm3, [edx]
        add edx, 8
        movq mm4, [edx]
        add edx, 8
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        psrlw mm1, mm5
        psrlw mm2, mm5
        pmullw mm1, mm3
        pmullw mm2, mm4
        paddsw mm1, mm2
        paddsw mm7, mm1
        movq mm1, [esi]
        movq mm3, [edx]
        punpcklbw mm1, mm0
        psrlw mm1, mm5
        pmullw mm1, mm3
        paddsw mm7, mm1
        /* --- horizontal sum of the 4 accumulator words --- */
        movq mm3, mm7          /* copy MM7 into MM3 */
        psrlq mm7, 32          /* shift 2 left words to the right */
        paddsw mm7, mm3        /* add 2 left and 2 right result words */
        movq mm2, mm7          /* copy MM7 into MM2 */
        psrlq mm7, 16          /* shift 1 left word to the right */
        paddsw mm7, mm2        /* add 1 left and 1 right result words */
        movd mm1, eax          /* save EAX (columns) in MM1 */
        packuswb mm7, mm0      /* pack the sum into bytes with unsigned saturation */
        movd eax, mm7          /* copy saturated result into EAX */
        mov [edi], al          /* copy a byte result into Dest */
        movd eax, mm1          /* restore saved EAX */
        /* --- */
        movd esi, mm6          /* move Src pointer back to the top pixel */
        sub edx, 208           /* EDX = Kernel address (8 rows * 24 bytes + 16 bytes) */
        inc esi                /* move Src pointer to the next pixel */
        inc edi                /* move Dest pointer to the next pixel */
        /* --- */
        dec ecx                /* decrease loop counter COLUMNS */
        jnz L10392             /* check loop termination, proceed if required */
        add esi, 8             /* move to the next row in Src */
        add edi, 8             /* move to the next row in Dest */
        dec ebx                /* decrease loop counter ROWS */
        jnz L10390             /* check loop termination, proceed if required */
        /* --- */
        emms                   /* exit MMX state */
        popa
    }
#else
    /*
     * NOTE(review): the operands are "m" constraints referenced after PUSHA has
     * moved ESP; if the compiler addresses them relative to ESP they would be
     * read from the wrong stack slot - confirm the build keeps a frame pointer.
     */
    asm volatile
        ("pusha \n\t"
         "pxor %%mm0, %%mm0 \n\t"       /* zero MM0 (used as unpack source and pack partner) */
         "xor %%ebx, %%ebx \n\t"        /* zero EBX */
         "mov %5, %%bl \n\t"            /* load NRightShift into BL */
         "movd %%ebx, %%mm5 \n\t"       /* copy NRightShift into MM5 (shift count) */
         "mov %4, %%edx \n\t"           /* load Kernel address into EDX */
         "mov %1, %%esi \n\t"           /* load Src address to ESI */
         "mov %0, %%edi \n\t"           /* load Dest address to EDI */
         "add $4, %%edi \n\t"           /* 4 column offset from the left edge */
         "mov %3, %%eax \n\t"           /* load columns into EAX */
         "add %%eax, %%edi \n\t"        /* 4 row offset from the top edge */
         "add %%eax, %%edi \n\t"
         "add %%eax, %%edi \n\t"
         "add %%eax, %%edi \n\t"
         "mov %2, %%ebx \n\t"           /* initialize ROWS counter */
         "sub $8, %%ebx \n\t"           /* do not use first 4 and last 4 rows */
         /* --- */
         ".L10390: \n\t"
         "mov %%eax, %%ecx \n\t"        /* initialize COLUMNS counter */
         "sub $8, %%ecx \n\t"           /* do not use first 4 and last 4 columns */
         ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
         ".L10392: \n\t"
         "pxor %%mm7, %%mm7 \n\t"       /* zero MM7 (accumulator) */
         "movd %%esi, %%mm6 \n\t"       /* save ESI in MM6 */
         /* --- kernel row 1 --- */
         "movq (%%esi), %%mm1 \n\t"     /* load 8 bytes of the Src */
         "movq %%mm1, %%mm2 \n\t"       /* copy MM1 into MM2 */
         "inc %%esi \n\t"               /* advance Src pointer by one byte */
         "movq (%%edx), %%mm3 \n\t"     /* load 4 words of Kernel */
         "add $8, %%edx \n\t"           /* move pointer to the next 4 words */
         "movq (%%edx), %%mm4 \n\t"     /* load 4 words of Kernel */
         "add $8, %%edx \n\t"           /* move pointer to the next 4 words */
         "punpcklbw %%mm0, %%mm1 \n\t"  /* unpack low 4 bytes into words */
         "punpckhbw %%mm0, %%mm2 \n\t"  /* unpack high 4 bytes into words */
         "psrlw %%mm5, %%mm1 \n\t"      /* shift right each pixel NRightShift times */
         "psrlw %%mm5, %%mm2 \n\t"      /* shift right each pixel NRightShift times */
         "pmullw %%mm3, %%mm1 \n\t"     /* multiply 4 low words of Src and Kernel */
         "pmullw %%mm4, %%mm2 \n\t"     /* multiply 4 high words of Src and Kernel */
         "paddsw %%mm2, %%mm1 \n\t"     /* add the low and high products */
         "paddsw %%mm1, %%mm7 \n\t"     /* add MM1 to accumulator MM7 */
         "movq (%%esi), %%mm1 \n\t"     /* load 8 bytes of the Src (one byte further right) */
         "dec %%esi \n\t"               /* undo the one byte advance */
         "add %%eax, %%esi \n\t"        /* move Src pointer 1 row below */
         "movq (%%edx), %%mm3 \n\t"     /* load 4 words of Kernel */
         "add $8, %%edx \n\t"           /* move pointer to the next 4 words */
         "punpcklbw %%mm0, %%mm1 \n\t"  /* unpack low 4 bytes into words */
         "psrlw %%mm5, %%mm1 \n\t"      /* shift right each pixel NRightShift times */
         "pmullw %%mm3, %%mm1 \n\t"     /* multiply 4 words of Src and Kernel */
         "paddsw %%mm1, %%mm7 \n\t"     /* add MM1 to accumulator MM7 */
         /* --- kernel row 2 (same sequence as row 1) --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 3 --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 4 --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 5 --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 6 --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 7 --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 8 --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "dec %%esi \n\t"
         "add %%eax, %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- kernel row 9 (last row: no step to a further row, no final EDX advance) --- */
         "movq (%%esi), %%mm1 \n\t"
         "movq %%mm1, %%mm2 \n\t"
         "inc %%esi \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "add $8, %%edx \n\t"
         "movq (%%edx), %%mm4 \n\t"
         "add $8, %%edx \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "punpckhbw %%mm0, %%mm2 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "psrlw %%mm5, %%mm2 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "pmullw %%mm4, %%mm2 \n\t"
         "paddsw %%mm2, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         "movq (%%esi), %%mm1 \n\t"
         "movq (%%edx), %%mm3 \n\t"
         "punpcklbw %%mm0, %%mm1 \n\t"
         "psrlw %%mm5, %%mm1 \n\t"
         "pmullw %%mm3, %%mm1 \n\t"
         "paddsw %%mm1, %%mm7 \n\t"
         /* --- horizontal sum of the 4 accumulator words --- */
         "movq %%mm7, %%mm3 \n\t"       /* copy MM7 into MM3 */
         "psrlq $32, %%mm7 \n\t"        /* shift 2 left words to the right */
         "paddsw %%mm3, %%mm7 \n\t"     /* add 2 left and 2 right result words */
         "movq %%mm7, %%mm2 \n\t"       /* copy MM7 into MM2 */
         "psrlq $16, %%mm7 \n\t"        /* shift 1 left word to the right */
         "paddsw %%mm2, %%mm7 \n\t"     /* add 1 left and 1 right result words */
         "movd %%eax, %%mm1 \n\t"       /* save EAX (columns) in MM1 */
         "packuswb %%mm0, %%mm7 \n\t"   /* pack the sum into bytes with unsigned saturation */
         "movd %%mm7, %%eax \n\t"       /* copy saturated result into EAX */
         "mov %%al, (%%edi) \n\t"       /* copy a byte result into Dest */
         "movd %%mm1, %%eax \n\t"       /* restore saved EAX */
         /* --- */
         "movd %%mm6, %%esi \n\t"       /* move Src pointer back to the top pixel */
         "sub $208, %%edx \n\t"         /* EDX = Kernel address (8 rows * 24 bytes + 16 bytes) */
         "inc %%esi \n\t"               /* move Src pointer to the next pixel */
         "inc %%edi \n\t"               /* move Dest pointer to the next pixel */
         /* --- */
         "dec %%ecx \n\t"               /* decrease loop counter COLUMNS */
         "jnz .L10392 \n\t"             /* check loop termination, proceed if required */
         "add $8, %%esi \n\t"           /* move to the next row in Src */
         "add $8, %%edi \n\t"           /* move to the next row in Dest */
         "dec %%ebx \n\t"               /* decrease loop counter ROWS */
         "jnz .L10390 \n\t"             /* check loop termination, proceed if required */
         /* --- */
         "emms \n\t"                    /* exit MMX state */
         "popa \n\t"
         : /* no outputs: the Dest pointer itself is only read by the template */
         : "m"(Dest),                   /* %0 */
           "m"(Src),                    /* %1 */
           "m"(rows),                   /* %2 */
           "m"(columns),                /* %3 */
           "m"(Kernel),                 /* %4 */
           "m"(NRightShift)             /* %5 */
         : "memory"                     /* the template stores through EDI into the Dest buffer */
        );
#endif
    return (0);
#else
    /*
     * MMX was reported at run time, but the MMX routine is not part of this
     * build (USE_MMX or i386 undefined). Nothing was written to Dest, so do
     * not report success.
     */
    return (-1);
#endif
}

/* ------------------------------------------------------------------------------------ */

/*!
\brief Internal portable (plain C) Sobel-X kernel shared by SDL_imageFilterSobelX and SDL_imageFilterSobelXShiftRight.

For every interior pixel (rows 1..rows-2, columns 1..columns-2) it stores

  Dest = min(255, | (T[c+1] + 2*M[c+1] + B[c+1]) - (T[c-1] + 2*M[c-1] + B[c-1]) |)

where T, M and B are the source rows above, at and below the pixel, and every
source byte is shifted right by NRightShift before it is weighted. This mirrors
the arithmetic of the MMX code below (unpack to words, per-pixel shift, 1-2-1
column sums, difference two columns apart, absolute value, unsigned saturation).

Unlike the MMX code it never reads or writes outside a row: the first/last row
and the first/last column of Dest are left untouched.

\param Src Pointer to the start of the source byte array (rows * columns bytes).
\param Dest Pointer to the start of the destination byte array (rows * columns bytes).
\param rows Number of rows; the caller guarantees rows >= 3.
\param columns Number of columns; the caller guarantees columns >= 8.
\param NRightShift Right shift applied to every source byte; the caller guarantees 0..7.
*/
static void SDL_imageFilterSobelXPortable(const unsigned char *Src, unsigned char *Dest,
					  int rows, int columns, unsigned int NRightShift)
{
	int row, col;

	for (row = 1; row < rows - 1; row++) {
		const unsigned char *top = Src + (size_t) (row - 1) * (size_t) columns;
		const unsigned char *mid = top + columns;
		const unsigned char *bottom = mid + columns;
		unsigned char *out = Dest + (size_t) row * (size_t) columns;

		for (col = 1; col < columns - 1; col++) {
			/* 1-2-1 weighted column sums left and right of the pixel (max 4 * 255, fits an int) */
			int left = (top[col - 1] >> NRightShift)
			    + 2 * (mid[col - 1] >> NRightShift)
			    + (bottom[col - 1] >> NRightShift);
			int right = (top[col + 1] >> NRightShift)
			    + 2 * (mid[col + 1] >> NRightShift)
			    + (bottom[col + 1] >> NRightShift);
			int gradient = right - left;

			if (gradient < 0) {
				gradient = -gradient;
			}
			/* Same unsigned saturation as PACKUSWB in the MMX path */
			out[col] = (unsigned char) ((gradient > 255) ? 255 : gradient);
		}
	}
}

/*!
\brief Filter using SobelX: Dest = saturation255(| horizontal Sobel gradient of Src |)

If the inline MMX assembly is compiled in (USE_MMX and i386 defined) and MMX is
detected at run time, the MMX code is used. In every other case the portable C
kernel is used, so the function no longer reports success without having
written Dest, and no longer fails just because MMX is unavailable.

Differences between the two paths (border pixels only): the MMX loop processes
columns/8 blocks of 8 pixels starting at Dest + columns + 1, so it leaves the
trailing columns % 8 pixels unprocessed, and when columns is a multiple of 8 its
last block stores one byte past the end of the row and loads source bytes up to
two bytes past the end of the row (past the end of Src for the last row). The
portable path writes all interior pixels and nothing else.

\param Src The source 2D byte array to filter. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >7.

\return Returns 0 for success or -1 for invalid parameters.
*/
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);

	if ((columns < 8) || (rows < 3))
		return (-1);

#if defined(USE_MMX) && defined(i386)
	if ((SDL_imageFilterMMXdetect())) {
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			mov eax, columns   	/* load columns into EAX */
			/* --- */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi   	/* 1 byte offset from the left edge */
			mov edx, rows   	/* initialize ROWS counter */
			sub edx, 2   	/* do not use first and last rows */
			/* --- */
L10400:
			mov ecx, eax   	/* initialize COLUMNS counter */
			shr ecx, 3   	/* ECX = columns/8 (MMX handles 8 bytes at a time) */
			mov ebx, esi   	/* save ESI in EBX */
			movd mm1, edi   	/* save EDI in MM1 */
			align 16   	/* 16 byte alignment of the loop entry */
L10402:
			/* --- top row */
			movq mm4, [esi]   	/* load 8 bytes from Src */
			movq mm5, mm4   	/* save MM4 in MM5 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
			movq mm6, [esi]   	/* load 8 bytes from Src */
			movq mm7, mm6   	/* save MM6 in MM7 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
			/* --- middle row (added twice) */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			/* --- bottom row */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			/* --- difference of column sums two columns apart */
			movq mm2, mm4   	/* copy MM4 into MM2 */
			psrlq mm4, 32   	/* move the 2 high words into the low half */
			psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
			movq mm3, mm6   	/* copy MM6 into MM3 */
			psrlq mm6, 32   	/* move the 2 high words into the low half */
			psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5   	/* copy MM5 into MM2 */
			psrlq mm5, 32   	/* move the 2 high words into the low half */
			psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* move the 2 high words into the low half */
			psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4   	/* copy MM4 into MM6 */
			movq mm7, mm5   	/* copy MM5 into MM7 */
			psraw mm6, 15   	/* fill MM6 words with word sign bit */
			psraw mm7, 15   	/* fill MM7 words with word sign bit */
			pxor mm4, mm6   	/* take 1's complement of only neg. words */
			pxor mm5, mm7   	/* take 1's complement of only neg. words */
			psubsw mm4, mm6   	/* add 1 to only neg. words, W-(-1) or W-0 */
			psubsw mm5, mm7   	/* add 1 to only neg. words, W-(-1) or W-0 */
			packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4   	/* store result in Dest */
			/* --- */
			sub esi, eax   	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8   	/* move Src pointer to the next 8 pixels */
			add edi, 8   	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10402   	/* check loop termination, proceed if required */
			mov esi, ebx   	/* restore most left current row Src address */
			movd edi, mm1   	/* restore most left current row Dest address */
			add esi, eax   	/* move to the next row in Src */
			add edi, eax   	/* move to the next row in Dest */
			dec edx   	/* decrease loop counter ROWS */
			jnz L10400   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		/*
		 * NOTE(review): instruction stream kept as-is. Known hazards to confirm
		 * and fix separately: PUSHA moves ESP while the "m" operands may be
		 * addressed relative to ESP; Dest is declared as an output although it
		 * is only read; there is no "memory"/"cc" clobber; the .L labels are
		 * not unique if this asm is ever emitted twice.
		 */
		asm volatile
			("pusha \n\t"
			 "pxor %%mm0, %%mm0 \n\t"	/* zero MM0 */
			 "mov %3, %%eax \n\t"	/* load columns into EAX */
			 /* --- */
			 "mov %1, %%esi \n\t"	/* ESI = Src row 0 address */
			 "mov %0, %%edi \n\t"	/* load Dest address to EDI */
			 "add %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			 "inc %%edi \n\t"	/* 1 byte offset from the left edge */
			 "mov %2, %%edx \n\t"	/* initialize ROWS counter */
			 "sub $2, %%edx \n\t"	/* do not use first and last rows */
			 /* --- */
			 ".L10400: \n\t"
			 "mov %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			 "shr $3, %%ecx \n\t"	/* ECX = columns/8 (MMX handles 8 bytes at a time) */
			 "mov %%esi, %%ebx \n\t"	/* save ESI in EBX */
			 "movd %%edi, %%mm1 \n\t"	/* save EDI in MM1 */
			 ".align 16 \n\t"	/* 16 byte alignment of the loop entry */
			 ".L10402: \n\t"
			 /* --- top row */
			 "movq (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			 "add $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			 "movq (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			 "sub $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			 /* --- middle row (added twice) */
			 "add %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "paddw %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 "paddw %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- bottom row */
			 "add %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- difference of column sums two columns apart */
			 "movq %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			 "psrlq $32, %%mm4 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			 "movq %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			 "psrlq $32, %%mm6 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			 "punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			 "movq %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			 "psrlq $32, %%mm5 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			 "movq %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			 "psrlq $32, %%mm7 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			 "punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			 /* Take abs values of MM4 and MM5 */
			 "movq %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			 "movq %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			 "psraw $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			 "psraw $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			 "pxor %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			 "pxor %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			 "psubsw %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "psubsw %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "packuswb %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			 "movq %%mm4, (%%edi) \n\t"	/* store result in Dest */
			 /* --- */
			 "sub %%eax, %%esi \n\t"	/* move to the current top row in Src */
			 "sub %%eax, %%esi \n\t"
			 "add $8, %%esi \n\t"	/* move Src pointer to the next 8 pixels */
			 "add $8, %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			 /* --- */
			 "dec %%ecx \n\t"	/* decrease loop counter COLUMNS */
			 "jnz .L10402 \n\t"	/* check loop termination, proceed if required */
			 "mov %%ebx, %%esi \n\t"	/* restore most left current row Src address */
			 "movd %%mm1, %%edi \n\t"	/* restore most left current row Dest address */
			 "add %%eax, %%esi \n\t"	/* move to the next row in Src */
			 "add %%eax, %%edi \n\t"	/* move to the next row in Dest */
			 "dec %%edx \n\t"	/* decrease loop counter ROWS */
			 "jnz .L10400 \n\t"	/* check loop termination, proceed if required */
			 /* --- */
			 "emms \n\t"	/* exit MMX state */
			 "popa \n\t":"=m" (Dest)	/* %0 */
			 :"m"(Src),		/* %1 */
			 "m"(rows),		/* %2 */
			 "m"(columns)		/* %3 */
			);
#endif
		return (0);
	}
#endif

	/* Portable path: asm not compiled in, or MMX not available/disabled at run time */
	SDL_imageFilterSobelXPortable(Src, Dest, rows, columns, 0);
	return (0);
}

/*!
\brief Filter using SobelXShiftRight: like SDL_imageFilterSobelX, but every source byte is shifted right by NRightShift before it enters the sums.

Path selection and the border differences between the MMX and the portable
path are the same as for SDL_imageFilterSobelX.

\param Src The source 2D byte array to filter. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >7.
\param NRightShift The number of right bit shifts to apply to each source byte. Must be <8.

\return Returns 0 for success or -1 for invalid parameters.
*/
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
				    unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
		return (-1);

#if defined(USE_MMX) && defined(i386)
	if ((SDL_imageFilterMMXdetect())) {
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			mov eax, columns   	/* load columns into EAX */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm1, ebx   	/* copy NRightShift into MM1 */
			/* --- */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi   	/* 1 byte offset from the left edge */
			/* the rows parameter itself is used as the ROWS counter */
			sub rows, 2   	/* do not use first and last rows */
			/* --- */
L10410:
			mov ecx, eax   	/* initialize COLUMNS counter */
			shr ecx, 3   	/* ECX = columns/8 (MMX handles 8 bytes at a time) */
			mov ebx, esi   	/* save ESI in EBX */
			mov edx, edi   	/* save EDI in EDX */
			align 16   	/* 16 byte alignment of the loop entry */
L10412:
			/* --- top row */
			movq mm4, [esi]   	/* load 8 bytes from Src */
			movq mm5, mm4   	/* save MM4 in MM5 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
			psrlw mm4, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm5, mm1   	/* shift right each pixel NRightShift times */
			movq mm6, [esi]   	/* load 8 bytes from Src */
			movq mm7, mm6   	/* save MM6 in MM7 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
			psrlw mm6, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm7, mm1   	/* shift right each pixel NRightShift times */
			/* --- middle row (added twice) */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			/* --- bottom row */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm4, mm2   	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm6, mm2   	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
			/* --- difference of column sums two columns apart */
			movq mm2, mm4   	/* copy MM4 into MM2 */
			psrlq mm4, 32   	/* move the 2 high words into the low half */
			psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
			movq mm3, mm6   	/* copy MM6 into MM3 */
			psrlq mm6, 32   	/* move the 2 high words into the low half */
			psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5   	/* copy MM5 into MM2 */
			psrlq mm5, 32   	/* move the 2 high words into the low half */
			psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* move the 2 high words into the low half */
			psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4   	/* copy MM4 into MM6 */
			movq mm7, mm5   	/* copy MM5 into MM7 */
			psraw mm6, 15   	/* fill MM6 words with word sign bit */
			psraw mm7, 15   	/* fill MM7 words with word sign bit */
			pxor mm4, mm6   	/* take 1's complement of only neg. words */
			pxor mm5, mm7   	/* take 1's complement of only neg. words */
			psubsw mm4, mm6   	/* add 1 to only neg. words, W-(-1) or W-0 */
			psubsw mm5, mm7   	/* add 1 to only neg. words, W-(-1) or W-0 */
			packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4   	/* store result in Dest */
			/* --- */
			sub esi, eax   	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8   	/* move Src pointer to the next 8 pixels */
			add edi, 8   	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10412   	/* check loop termination, proceed if required */
			mov esi, ebx   	/* restore most left current row Src address */
			mov edi, edx   	/* restore most left current row Dest address */
			add esi, eax   	/* move to the next row in Src */
			add edi, eax   	/* move to the next row in Dest */
			dec rows   	/* decrease loop counter ROWS */
			jnz L10410   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		/*
		 * NOTE(review): instruction stream kept as-is. Known hazards to confirm
		 * and fix separately: PUSHA moves ESP while the "m" operands may be
		 * addressed relative to ESP; rows (%2) is written by SUBL/DECL although
		 * it is declared as an input; Dest is declared as an output although it
		 * is only read; there is no "memory"/"cc" clobber; the .L labels are
		 * not unique if this asm is ever emitted twice.
		 */
		asm volatile
			("pusha \n\t"
			 "pxor %%mm0, %%mm0 \n\t"	/* zero MM0 */
			 "mov %3, %%eax \n\t"	/* load columns into EAX */
			 "xor %%ebx, %%ebx \n\t"	/* zero EBX */
			 "mov %4, %%bl \n\t"	/* load NRightShift into BL */
			 "movd %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
			 /* --- */
			 "mov %1, %%esi \n\t"	/* ESI = Src row 0 address */
			 "mov %0, %%edi \n\t"	/* load Dest address to EDI */
			 "add %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			 "inc %%edi \n\t"	/* 1 byte offset from the left edge */
			 /* the rows operand itself is used as the ROWS counter */
			 "subl $2, %2 \n\t"	/* do not use first and last rows */
			 /* --- */
			 ".L10410: \n\t"
			 "mov %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			 "shr $3, %%ecx \n\t"	/* ECX = columns/8 (MMX handles 8 bytes at a time) */
			 "mov %%esi, %%ebx \n\t"	/* save ESI in EBX */
			 "mov %%edi, %%edx \n\t"	/* save EDI in EDX */
			 ".align 16 \n\t"	/* 16 byte alignment of the loop entry */
			 ".L10412: \n\t"
			 /* --- top row */
			 "movq (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			 "add $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw %%mm1, %%mm4 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw %%mm1, %%mm5 \n\t"	/* shift right each pixel NRightShift times */
			 "movq (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			 "sub $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw %%mm1, %%mm6 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw %%mm1, %%mm7 \n\t"	/* shift right each pixel NRightShift times */
			 /* --- middle row (added twice) */
			 "add %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			 "paddw %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "paddw %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			 "paddw %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 "paddw %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- bottom row */
			 "add %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			 "paddw %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			 "paddw %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- difference of column sums two columns apart */
			 "movq %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			 "psrlq $32, %%mm4 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			 "movq %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			 "psrlq $32, %%mm6 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			 "punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			 "movq %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			 "psrlq $32, %%mm5 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			 "movq %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			 "psrlq $32, %%mm7 \n\t"	/* move the 2 high words into the low half */
			 "psubw %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			 "punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			 /* Take abs values of MM4 and MM5 */
			 "movq %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			 "movq %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			 "psraw $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			 "psraw $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			 "pxor %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			 "pxor %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			 "psubsw %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "psubsw %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "packuswb %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			 "movq %%mm4, (%%edi) \n\t"	/* store result in Dest */
			 /* --- */
			 "sub %%eax, %%esi \n\t"	/* move to the current top row in Src */
			 "sub %%eax, %%esi \n\t"
			 "add $8, %%esi \n\t"	/* move Src pointer to the next 8 pixels */
			 "add $8, %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			 /* --- */
			 "dec %%ecx \n\t"	/* decrease loop counter COLUMNS */
			 "jnz .L10412 \n\t"	/* check loop termination, proceed if required */
			 "mov %%ebx, %%esi \n\t"	/* restore most left current row Src address */
			 "mov %%edx, %%edi \n\t"	/* restore most left current row Dest address */
			 "add %%eax, %%esi \n\t"	/* move to the next row in Src */
			 "add %%eax, %%edi \n\t"	/* move to the next row in Dest */
			 "decl %2 \n\t"	/* decrease loop counter ROWS */
			 "jnz .L10410 \n\t"	/* check loop termination, proceed if required */
			 /* --- */
			 "emms \n\t"	/* exit MMX state */
			 "popa \n\t":"=m" (Dest)	/* %0 */
			 :"m"(Src),		/* %1 */
			 "m"(rows),		/* %2 */
			 "m"(columns),		/* %3 */
			 "m"(NRightShift)	/* %4 */
			);
#endif
		return (0);
	}
#endif

	/* Portable path: asm not compiled in, or MMX not available/disabled at run time */
	SDL_imageFilterSobelXPortable(Src, Dest, rows, columns, (unsigned int) NRightShift);
	return (0);
}

/*!
\brief Align stack to 32 byte boundary -- Functionality untested! --

On 32-bit x86 builds with USE_MMX this lowers ESP to a 32 byte boundary and
stores the previous ESP value at the new top of stack, where
SDL_imageFilterRestoreStack expects to find it. On every other build it does
nothing: the code moves 32-bit ESP/EBX values into the stack pointer and must
therefore not be assembled for 64-bit targets.

NOTE(review): the asm overwrites EBX and ESP without saving them or declaring
clobbers; whether it survives the compiler-generated function epilogue depends
on the compiler and its options -- confirm before relying on it.
*/
void SDL_imageFilterAlignStack(void)
{
#if defined(USE_MMX) && (defined(i386) || defined(__i386__) || defined(_M_IX86))
#if !defined(GCC__)
	__asm
	{				/* --- stack alignment --- */
		mov ebx, esp   	/* load ESP into EBX */
		sub ebx, 4   	/* reserve space on stack for old value of ESP */
		and ebx, -32   	/* align EBX along a 32 byte boundary */
		mov [ebx], esp   	/* save old value of ESP in stack, behind the boundary */
		mov esp, ebx   	/* align ESP along a 32 byte boundary */
	}
#else
	asm volatile
		(			/* --- stack alignment --- */
		 "mov %%esp, %%ebx \n\t"	/* load ESP into EBX */
		 "sub $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
		 "and $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary */
		 "mov %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the boundary */
		 "mov %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
		 ::);
#endif
#endif
}

/*!
\brief Restore previously aligned stack -- Functionality untested! --

On 32-bit x86 builds with USE_MMX this loads the dword at the top of the stack
(the value stored there by SDL_imageFilterAlignStack) into ESP. On every other
build it does nothing, matching SDL_imageFilterAlignStack.

NOTE(review): like SDL_imageFilterAlignStack, the asm overwrites EBX and ESP
without saving them or declaring clobbers -- confirm before relying on it.
*/
void SDL_imageFilterRestoreStack(void)
{
#if defined(USE_MMX) && (defined(i386) || defined(__i386__) || defined(_M_IX86))
#if !defined(GCC__)
	__asm
	{				/* --- restoring old stack --- */
		mov ebx, [esp]   	/* load old value of ESP */
		mov esp, ebx   	/* restore old value of ESP */
	}
#else
	asm volatile
		(			/* --- restoring old stack --- */
		 "mov (%%esp), %%ebx \n\t"	/* load old value of ESP */
		 "mov %%ebx, %%esp \n\t"	/* restore old value of ESP */
		 ::);
#endif
#endif
}