SDL2_gfx
1.0.1
Graphics primitives and surface functions for SDL2
00001 /* 00002 00003 SDL2_imageFilter.c: byte-image "filter" routines 00004 00005 Copyright (C) 2001-2012 Andreas Schiffler 00006 Copyright (C) 2013 Sylvain Beucler 00007 00008 This software is provided 'as-is', without any express or implied 00009 warranty. In no event will the authors be held liable for any damages 00010 arising from the use of this software. 00011 00012 Permission is granted to anyone to use this software for any purpose, 00013 including commercial applications, and to alter it and redistribute it 00014 freely, subject to the following restrictions: 00015 00016 1. The origin of this software must not be misrepresented; you must not 00017 claim that you wrote the original software. If you use this software 00018 in a product, an acknowledgment in the product documentation would be 00019 appreciated but is not required. 00020 00021 2. Altered source versions must be plainly marked as such, and must not be 00022 misrepresented as being the original software. 00023 00024 3. This notice may not be removed or altered from any source 00025 distribution. 00026 00027 Andreas Schiffler -- aschiffler at ferzkopp dot net 00028 00029 */ 00030 00031 /* 00032 00033 Note: Uses inline x86 MMX or ASM optimizations if available and enabled. 00034 00035 Note: Most of the MMX code is based on published routines 00036 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 00037 him for his work. 00038 00039 */ 00040 00041 #include <stdio.h> 00042 #include <stdlib.h> 00043 #include <string.h> 00044 00045 #include "SDL.h" 00046 00047 /* Use GCC intrinsics if available: they support both i386 and x86_64, 00048 provide ASM-grade performances, and lift the PUSHA/POPA issues. 
*/ 00049 #ifdef __GNUC__ 00050 # ifdef USE_MMX 00051 # include <mmintrin.h> 00052 # endif 00053 # include <SDL_cpuinfo.h> 00054 #endif 00055 00056 #include "SDL2_imageFilter.h" 00057 00061 #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24)) 00062 00063 /* ------ Static variables ----- */ 00064 00068 static int SDL_imageFilterUseMMX = 1; 00069 00070 /* Detect GCC */ 00071 #if defined(__GNUC__) 00072 #define GCC__ 00073 #endif 00074 00080 int SDL_imageFilterMMXdetect(void) 00081 { 00082 /* Check override flag */ 00083 if (SDL_imageFilterUseMMX == 0) { 00084 return (0); 00085 } 00086 00087 return SDL_HasMMX(); 00088 } 00089 00093 void SDL_imageFilterMMXoff() 00094 { 00095 SDL_imageFilterUseMMX = 0; 00096 } 00097 00101 void SDL_imageFilterMMXon() 00102 { 00103 SDL_imageFilterUseMMX = 1; 00104 } 00105 00106 /* ------------------------------------------------------------------------------------ */ 00107 00118 static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00119 { 00120 #ifdef USE_MMX 00121 #if !defined(GCC__) 00122 __asm 00123 { 00124 pusha 00125 mov eax, Src1 /* load Src1 address into eax */ 00126 mov ebx, Src2 /* load Src2 address into ebx */ 00127 mov edi, Dest /* load Dest address into edi */ 00128 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00129 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00130 align 16 /* 16 byte alignment of the loop entry */ 00131 L1010: 00132 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00133 paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */ 00134 movq [edi], mm1 /* store result in Dest */ 00135 add eax, 8 /* increase Src1, Src2 and Dest */ 00136 add ebx, 8 /* register pointers by 8 */ 00137 add edi, 8 00138 dec ecx /* decrease loop counter */ 00139 jnz L1010 /* check loop termination, proceed if required */ 00140 emms /* exit MMX state */ 00141 popa 00142 } 00143 
#else 00144 /* i386 and x86_64 */ 00145 __m64 *mSrc1 = (__m64*)Src1; 00146 __m64 *mSrc2 = (__m64*)Src2; 00147 __m64 *mDest = (__m64*)Dest; 00148 int i; 00149 for (i = 0; i < SrcLength/8; i++) { 00150 *mDest = _m_paddusb(*mSrc1, *mSrc2); /* Src1+Src2 (add 8 bytes with saturation) */ 00151 mSrc1++; 00152 mSrc2++; 00153 mDest++; 00154 } 00155 _m_empty(); /* clean MMX state */ 00156 #endif 00157 return (0); 00158 #else 00159 return (-1); 00160 #endif 00161 } 00162 00173 int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00174 { 00175 unsigned int i, istart; 00176 unsigned char *cursrc1, *cursrc2, *curdst; 00177 int result; 00178 00179 /* Validate input parameters */ 00180 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00181 return(-1); 00182 if (length == 0) 00183 return(0); 00184 00185 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00186 00187 /* Use MMX assembly routine */ 00188 SDL_imageFilterAddMMX(Src1, Src2, Dest, length); 00189 00190 /* Check for unaligned bytes */ 00191 if ((length & 7) > 0) { 00192 /* Setup to process unaligned bytes */ 00193 istart = length & 0xfffffff8; 00194 cursrc1 = &Src1[istart]; 00195 cursrc2 = &Src2[istart]; 00196 curdst = &Dest[istart]; 00197 } else { 00198 /* No unaligned bytes - we are done */ 00199 return (0); 00200 } 00201 } else { 00202 /* Setup to process whole image */ 00203 istart = 0; 00204 cursrc1 = Src1; 00205 cursrc2 = Src2; 00206 curdst = Dest; 00207 } 00208 00209 /* C routine to process image */ 00210 for (i = istart; i < length; i++) { 00211 result = (int) *cursrc1 + (int) *cursrc2; 00212 if (result > 255) 00213 result = 255; 00214 *curdst = (unsigned char) result; 00215 /* Advance pointers */ 00216 cursrc1++; 00217 cursrc2++; 00218 curdst++; 00219 } 00220 00221 return (0); 00222 } 00223 00235 static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength, 00236 unsigned char *Mask) 00237 { 
00238 #ifdef USE_MMX 00239 #if !defined(GCC__) 00240 __asm 00241 { 00242 pusha 00243 mov edx, Mask /* load Mask address into edx */ 00244 movq mm0, [edx] /* load Mask into mm0 */ 00245 mov eax, Src1 /* load Src1 address into eax */ 00246 mov ebx, Src2 /* load Src2 address into ebx */ 00247 mov edi, Dest /* load Dest address into edi */ 00248 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00249 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00250 align 16 /* 16 byte alignment of the loop entry */ 00251 L21011: 00252 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00253 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */ 00254 /* --- Byte shift via Word shift --- */ 00255 psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */ 00256 psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */ 00257 pand mm1, mm0 // apply Mask to 8 BYTES of mm1 */ 00258 /* byte 0x0f, 0xdb, 0xc8 */ 00259 pand mm2, mm0 // apply Mask to 8 BYTES of mm2 */ 00260 /* byte 0x0f, 0xdb, 0xd0 */ 00261 paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */ 00262 movq [edi], mm1 /* store result in Dest */ 00263 add eax, 8 /* increase Src1, Src2 and Dest */ 00264 add ebx, 8 /* register pointers by 8 */ 00265 add edi, 8 00266 dec ecx /* decrease loop counter */ 00267 jnz L21011 /* check loop termination, proceed if required */ 00268 emms /* exit MMX state */ 00269 popa 00270 } 00271 #else 00272 /* i386 and x86_64 */ 00273 __m64 *mSrc1 = (__m64*)Src1; 00274 __m64 *mSrc2 = (__m64*)Src2; 00275 __m64 *mDest = (__m64*)Dest; 00276 __m64 *mMask = (__m64*)Mask; 00277 int i; 00278 for (i = 0; i < SrcLength/8; i++) { 00279 __m64 mm1 = *mSrc1, 00280 mm2 = *mSrc2; 00281 mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of mm1 1 bit to the right */ 00282 mm2 = _m_psrlwi(mm2, 1); /* shift 4 WORDS of mm2 1 bit to the right */ 00283 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of mm1 */ 00284 mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of mm2 */ 00285 *mDest = 
_m_paddusb(mm1, mm2); /* mm1+mm2 (add 8 bytes with saturation) */ 00286 mSrc1++; 00287 mSrc2++; 00288 mDest++; 00289 } 00290 _m_empty(); /* clean MMX state */ 00291 #endif 00292 return (0); 00293 #else 00294 return (-1); 00295 #endif 00296 } 00297 00308 int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00309 { 00310 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }; 00311 unsigned int i, istart; 00312 unsigned char *cursrc1, *cursrc2, *curdst; 00313 int result; 00314 00315 /* Validate input parameters */ 00316 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00317 return(-1); 00318 if (length == 0) 00319 return(0); 00320 00321 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00322 /* MMX routine */ 00323 SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask); 00324 00325 /* Check for unaligned bytes */ 00326 if ((length & 7) > 0) { 00327 /* Setup to process unaligned bytes */ 00328 istart = length & 0xfffffff8; 00329 cursrc1 = &Src1[istart]; 00330 cursrc2 = &Src2[istart]; 00331 curdst = &Dest[istart]; 00332 } else { 00333 /* No unaligned bytes - we are done */ 00334 return (0); 00335 } 00336 } else { 00337 /* Setup to process whole image */ 00338 istart = 0; 00339 cursrc1 = Src1; 00340 cursrc2 = Src2; 00341 curdst = Dest; 00342 } 00343 00344 /* C routine to process image */ 00345 for (i = istart; i < length; i++) { 00346 result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2; 00347 *curdst = (unsigned char) result; 00348 /* Advance pointers */ 00349 cursrc1++; 00350 cursrc2++; 00351 curdst++; 00352 } 00353 00354 return (0); 00355 } 00356 00367 static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00368 { 00369 #ifdef USE_MMX 00370 #if !defined(GCC__) 00371 __asm 00372 { 00373 pusha 00374 mov eax, Src1 /* load Src1 address into eax */ 00375 mov ebx, Src2 /* load Src2 address into ebx */ 00376 mov edi, Dest /* 
load Dest address into edi */ 00377 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00378 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00379 align 16 /* 16 byte alignment of the loop entry */ 00380 L1012: 00381 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00382 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00383 movq [edi], mm1 /* store result in Dest */ 00384 add eax, 8 /* increase Src1, Src2 and Dest */ 00385 add ebx, 8 /* register pointers by 8 */ 00386 add edi, 8 00387 dec ecx /* decrease loop counter */ 00388 jnz L1012 /* check loop termination, proceed if required */ 00389 emms /* exit MMX state */ 00390 popa 00391 } 00392 #else 00393 /* i386 and x86_64 */ 00394 __m64 *mSrc1 = (__m64*)Src1; 00395 __m64 *mSrc2 = (__m64*)Src2; 00396 __m64 *mDest = (__m64*)Dest; 00397 int i; 00398 for (i = 0; i < SrcLength/8; i++) { 00399 *mDest = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */ 00400 mSrc1++; 00401 mSrc2++; 00402 mDest++; 00403 } 00404 _m_empty(); /* clean MMX state */ 00405 #endif 00406 return (0); 00407 #else 00408 return (-1); 00409 #endif 00410 } 00411 00422 int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00423 { 00424 unsigned int i, istart; 00425 unsigned char *cursrc1, *cursrc2, *curdst; 00426 int result; 00427 00428 /* Validate input parameters */ 00429 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00430 return(-1); 00431 if (length == 0) 00432 return(0); 00433 00434 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00435 /* MMX routine */ 00436 SDL_imageFilterSubMMX(Src1, Src2, Dest, length); 00437 00438 /* Check for unaligned bytes */ 00439 if ((length & 7) > 0) { 00440 /* Setup to process unaligned bytes */ 00441 istart = length & 0xfffffff8; 00442 cursrc1 = &Src1[istart]; 00443 cursrc2 = &Src2[istart]; 00444 curdst = &Dest[istart]; 00445 } else { 00446 /* No unaligned bytes - we are done */ 00447 
return (0); 00448 } 00449 } else { 00450 /* Setup to process whole image */ 00451 istart = 0; 00452 cursrc1 = Src1; 00453 cursrc2 = Src2; 00454 curdst = Dest; 00455 } 00456 00457 /* C routine to process image */ 00458 for (i = istart; i < length; i++) { 00459 result = (int) *cursrc1 - (int) *cursrc2; 00460 if (result < 0) 00461 result = 0; 00462 *curdst = (unsigned char) result; 00463 /* Advance pointers */ 00464 cursrc1++; 00465 cursrc2++; 00466 curdst++; 00467 } 00468 00469 return (0); 00470 } 00471 00482 static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00483 { 00484 #ifdef USE_MMX 00485 #if !defined(GCC__) 00486 __asm 00487 { 00488 pusha 00489 mov eax, Src1 /* load Src1 address into eax */ 00490 mov ebx, Src2 /* load Src2 address into ebx */ 00491 mov edi, Dest /* load Dest address into edi */ 00492 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00493 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00494 align 16 /* 16 byte alignment of the loop entry */ 00495 L1013: 00496 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00497 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */ 00498 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ 00499 psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */ 00500 por mm1, mm2 /* combine both mm2 and mm1 results */ 00501 movq [edi], mm1 /* store result in Dest */ 00502 add eax, 8 /* increase Src1, Src2 and Dest */ 00503 add ebx, 8 /* register pointers by 8 */ 00504 add edi, 8 00505 dec ecx /* decrease loop counter */ 00506 jnz L1013 /* check loop termination, proceed if required */ 00507 emms /* exit MMX state */ 00508 popa 00509 } 00510 #else 00511 /* i386 and x86_64 */ 00512 __m64 *mSrc1 = (__m64*)Src1; 00513 __m64 *mSrc2 = (__m64*)Src2; 00514 __m64 *mDest = (__m64*)Dest; 00515 int i; 00516 for (i = 0; i < SrcLength/8; i++) { 00517 __m64 mm1 = _m_psubusb(*mSrc2, *mSrc1); /* Src1-Src2 (sub 8 
bytes with saturation) */ 00518 __m64 mm2 = _m_psubusb(*mSrc1, *mSrc2); /* Src2-Src1 (sub 8 bytes with saturation) */ 00519 *mDest = _m_por(mm1, mm2); /* combine both mm2 and mm1 results */ 00520 mSrc1++; 00521 mSrc2++; 00522 mDest++; 00523 } 00524 _m_empty(); /* clean MMX state */ 00525 #endif 00526 return (0); 00527 #else 00528 return (-1); 00529 #endif 00530 } 00531 00542 int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00543 { 00544 unsigned int i, istart; 00545 unsigned char *cursrc1, *cursrc2, *curdst; 00546 int result; 00547 00548 /* Validate input parameters */ 00549 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00550 return(-1); 00551 if (length == 0) 00552 return(0); 00553 00554 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00555 /* MMX routine */ 00556 SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length); 00557 00558 /* Check for unaligned bytes */ 00559 if ((length & 7) > 0) { 00560 /* Setup to process unaligned bytes */ 00561 istart = length & 0xfffffff8; 00562 cursrc1 = &Src1[istart]; 00563 cursrc2 = &Src2[istart]; 00564 curdst = &Dest[istart]; 00565 } else { 00566 /* No unaligned bytes - we are done */ 00567 return (0); 00568 } 00569 } else { 00570 /* Setup to process whole image */ 00571 istart = 0; 00572 cursrc1 = Src1; 00573 cursrc2 = Src2; 00574 curdst = Dest; 00575 } 00576 00577 /* C routine to process image */ 00578 for (i = istart; i < length; i++) { 00579 result = abs((int) *cursrc1 - (int) *cursrc2); 00580 *curdst = (unsigned char) result; 00581 /* Advance pointers */ 00582 cursrc1++; 00583 cursrc2++; 00584 curdst++; 00585 } 00586 00587 return (0); 00588 } 00589 00600 static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00601 { 00602 #ifdef USE_MMX 00603 #if !defined(GCC__) 00604 __asm 00605 { 00606 pusha 00607 mov eax, Src1 /* load Src1 address into eax */ 00608 mov ebx, Src2 /* load Src2 address 
into ebx */ 00609 mov edi, Dest /* load Dest address into edi */ 00610 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00611 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00612 pxor mm0, mm0 /* zero mm0 register */ 00613 align 16 /* 16 byte alignment of the loop entry */ 00614 L1014: 00615 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00616 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 00617 movq mm2, mm1 /* copy mm1 into mm2 */ 00618 movq mm4, mm3 /* copy mm3 into mm4 */ 00619 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 00620 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 00621 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 00622 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 00623 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 00624 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 00625 /* Take abs value of the results (signed words) */ 00626 movq mm5, mm1 /* copy mm1 into mm5 */ 00627 movq mm6, mm2 /* copy mm2 into mm6 */ 00628 psraw mm5, 15 /* fill mm5 words with word sign bit */ 00629 psraw mm6, 15 /* fill mm6 words with word sign bit */ 00630 pxor mm1, mm5 /* take 1's compliment of only neg. words */ 00631 pxor mm2, mm6 /* take 1's compliment of only neg. words */ 00632 psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */ 00633 psubsw mm2, mm6 /* add 1 to only neg. 
words, W-(-1) or W-0 */ 00634 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 00635 movq [edi], mm1 /* store result in Dest */ 00636 add eax, 8 /* increase Src1, Src2 and Dest */ 00637 add ebx, 8 /* register pointers by 8 */ 00638 add edi, 8 00639 dec ecx /* decrease loop counter */ 00640 jnz L1014 /* check loop termination, proceed if required */ 00641 emms /* exit MMX state */ 00642 popa 00643 } 00644 #else 00645 /* i386 ASM with constraints: */ 00646 /* asm volatile ( */ 00647 /* "shr $3, %%ecx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */ 00648 /* "pxor %%mm0, %%mm0 \n\t" /\* zero mm0 register *\/ */ 00649 /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */ 00650 /* "1: movq (%%eax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */ 00651 /* "movq (%%ebx), %%mm3 \n\t" /\* load 8 bytes from Src2 into mm3 *\/ */ 00652 /* "movq %%mm1, %%mm2 \n\t" /\* copy mm1 into mm2 *\/ */ 00653 /* "movq %%mm3, %%mm4 \n\t" /\* copy mm3 into mm4 *\/ */ 00654 /* "punpcklbw %%mm0, %%mm1 \n\t" /\* unpack low bytes of Src1 into words *\/ */ 00655 /* "punpckhbw %%mm0, %%mm2 \n\t" /\* unpack high bytes of Src1 into words *\/ */ 00656 /* "punpcklbw %%mm0, %%mm3 \n\t" /\* unpack low bytes of Src2 into words *\/ */ 00657 /* "punpckhbw %%mm0, %%mm4 \n\t" /\* unpack high bytes of Src2 into words *\/ */ 00658 /* "pmullw %%mm3, %%mm1 \n\t" /\* mul low bytes of Src1 and Src2 *\/ */ 00659 /* "pmullw %%mm4, %%mm2 \n\t" /\* mul high bytes of Src1 and Src2 *\/ */ 00660 /* /\* Take abs value of the results (signed words) *\/ */ 00661 /* "movq %%mm1, %%mm5 \n\t" /\* copy mm1 into mm5 *\/ */ 00662 /* "movq %%mm2, %%mm6 \n\t" /\* copy mm2 into mm6 *\/ */ 00663 /* "psraw $15, %%mm5 \n\t" /\* fill mm5 words with word sign bit *\/ */ 00664 /* "psraw $15, %%mm6 \n\t" /\* fill mm6 words with word sign bit *\/ */ 00665 /* "pxor %%mm5, %%mm1 \n\t" /\* take 1's compliment of only neg. 
words *\/ */ 00666 /* "pxor %%mm6, %%mm2 \n\t" /\* take 1's compliment of only neg. words *\/ */ 00667 /* "psubsw %%mm5, %%mm1 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */ 00668 /* "psubsw %%mm6, %%mm2 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */ 00669 /* "packuswb %%mm2, %%mm1 \n\t" /\* pack words back into bytes with saturation *\/ */ 00670 /* "movq %%mm1, (%%edi) \n\t" /\* store result in Dest *\/ */ 00671 /* "add $8, %%eax \n\t" /\* increase Src1, Src2 and Dest *\/ */ 00672 /* "add $8, %%ebx \n\t" /\* register pointers by 8 *\/ */ 00673 /* "add $8, %%edi \n\t" */ 00674 /* "dec %%ecx \n\t" /\* decrease loop counter *\/ */ 00675 /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */ 00676 /* "emms \n\t" /\* exit MMX state *\/ */ 00677 /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */ 00678 /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */ 00679 /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */ 00680 /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */ 00681 /* : */ 00682 /* : "memory", /\* *Dest is modified *\/ */ 00683 /* "mm0","mm1","mm2","mm3","mm4","mm5","mm6" /\* registers modified *\/ */ 00684 /* ); */ 00685 00686 /* i386 and x86_64 */ 00687 __m64 *mSrc1 = (__m64*)Src1; 00688 __m64 *mSrc2 = (__m64*)Src2; 00689 __m64 *mDest = (__m64*)Dest; 00690 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 00691 int i; 00692 for (i = 0; i < SrcLength/8; i++) { 00693 __m64 mm1, mm2, mm3, mm4, mm5, mm6; 00694 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 00695 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 00696 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */ 00697 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */ 00698 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */ 00699 mm2 = _m_pmullw(mm2, 
mm4); /* mul high bytes of Src1 and Src2 */ 00700 mm5 = _m_psrawi(mm1, 15); /* fill mm5 words with word sign bit */ 00701 mm6 = _m_psrawi(mm2, 15); /* fill mm6 words with word sign bit */ 00702 mm1 = _m_pxor(mm1, mm5); /* take 1's compliment of only neg. words */ 00703 mm2 = _m_pxor(mm2, mm6); /* take 1's compliment of only neg. words */ 00704 mm1 = _m_psubsw(mm1, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */ 00705 mm2 = _m_psubsw(mm2, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */ 00706 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */ 00707 mSrc1++; 00708 mSrc2++; 00709 mDest++; 00710 } 00711 _m_empty(); /* clean MMX state */ 00712 #endif 00713 return (0); 00714 #else 00715 return (-1); 00716 #endif 00717 } 00718 00729 int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00730 { 00731 unsigned int i, istart; 00732 unsigned char *cursrc1, *cursrc2, *curdst; 00733 int result; 00734 00735 /* Validate input parameters */ 00736 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00737 return(-1); 00738 if (length == 0) 00739 return(0); 00740 00741 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 00742 /* MMX routine */ 00743 SDL_imageFilterMultMMX(Src1, Src2, Dest, length); 00744 00745 /* Check for unaligned bytes */ 00746 if ((length & 7) > 0) { 00747 /* Setup to process unaligned bytes */ 00748 istart = length & 0xfffffff8; 00749 cursrc1 = &Src1[istart]; 00750 cursrc2 = &Src2[istart]; 00751 curdst = &Dest[istart]; 00752 } else { 00753 /* No unaligned bytes - we are done */ 00754 return (0); 00755 } 00756 } else { 00757 /* Setup to process whole image */ 00758 istart = 0; 00759 cursrc1 = Src1; 00760 cursrc2 = Src2; 00761 curdst = Dest; 00762 } 00763 00764 /* C routine to process image */ 00765 for (i = istart; i < length; i++) { 00766 00767 /* NOTE: this is probably wrong - dunno what the MMX code does */ 00768 00769 result = (int) *cursrc1 * (int) *cursrc2; 00770 
if (result > 255) 00771 result = 255; 00772 *curdst = (unsigned char) result; 00773 /* Advance pointers */ 00774 cursrc1++; 00775 cursrc2++; 00776 curdst++; 00777 } 00778 00779 return (0); 00780 } 00781 00792 int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00793 { 00794 #ifdef USE_MMX 00795 #if !defined(GCC__) 00796 __asm 00797 { 00798 pusha 00799 mov edx, Src1 /* load Src1 address into edx */ 00800 mov esi, Src2 /* load Src2 address into esi */ 00801 mov edi, Dest /* load Dest address into edi */ 00802 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00803 align 16 /* 16 byte alignment of the loop entry */ 00804 L10141: 00805 mov al, [edx] /* load a byte from Src1 */ 00806 mul [esi] /* mul with a byte from Src2 */ 00807 mov [edi], al /* move a byte result to Dest */ 00808 inc edx /* increment Src1, Src2, Dest */ 00809 inc esi /* pointer registers by one */ 00810 inc edi 00811 dec ecx /* decrease loop counter */ 00812 jnz L10141 /* check loop termination, proceed if required */ 00813 popa 00814 } 00815 #else 00816 /* Note: ~5% gain on i386, less efficient than C on x86_64 */ 00817 /* Also depends on whether this function is static (?!) 
*/ 00818 asm volatile ( 00819 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 00820 # if defined(i386) 00821 "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */ 00822 "mulb (%%esi) \n\t" /* mul with a byte from Src2 */ 00823 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ 00824 "inc %%edx \n\t" /* increment Src1, Src2, Dest */ 00825 "inc %%esi \n\t" /* pointer registers by one */ 00826 "inc %%edi \n\t" 00827 "dec %%ecx \n\t" /* decrease loop counter */ 00828 # elif defined(__x86_64__) 00829 "1:mov (%%rdx), %%al \n\t" /* load a byte from Src1 */ 00830 "mulb (%%rsi) \n\t" /* mul with a byte from Src2 */ 00831 "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */ 00832 "inc %%rdx \n\t" /* increment Src1, Src2, Dest */ 00833 "inc %%rsi \n\t" /* pointer registers by one */ 00834 "inc %%rdi \n\t" 00835 "dec %%rcx \n\t" /* decrease loop counter */ 00836 # endif 00837 "jnz 1b \n\t" /* check loop termination, proceed if required */ 00838 : "+d" (Src1), /* load Src1 address into edx */ 00839 "+S" (Src2), /* load Src2 address into esi */ 00840 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */ 00841 "+D" (Dest) /* load Dest address into edi */ 00842 : 00843 : "memory", "rax" 00844 ); 00845 #endif 00846 return (0); 00847 #else 00848 return (-1); 00849 #endif 00850 } 00851 00862 int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 00863 { 00864 unsigned int i, istart; 00865 unsigned char *cursrc1, *cursrc2, *curdst; 00866 00867 /* Validate input parameters */ 00868 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 00869 return(-1); 00870 if (length == 0) 00871 return(0); 00872 00873 if (SDL_imageFilterMMXdetect()) { 00874 if (length > 0) { 00875 /* ASM routine */ 00876 SDL_imageFilterMultNorASM(Src1, Src2, Dest, length); 00877 00878 /* Check for unaligned bytes */ 00879 if ((length & 7) > 0) { 00880 /* Setup to process unaligned bytes */ 00881 istart = length & 0xfffffff8; 00882 
cursrc1 = &Src1[istart]; 00883 cursrc2 = &Src2[istart]; 00884 curdst = &Dest[istart]; 00885 } else { 00886 /* No unaligned bytes - we are done */ 00887 return (0); 00888 } 00889 } else { 00890 /* No bytes - we are done */ 00891 return (0); 00892 } 00893 } else { 00894 /* Setup to process whole image */ 00895 istart = 0; 00896 cursrc1 = Src1; 00897 cursrc2 = Src2; 00898 curdst = Dest; 00899 } 00900 00901 /* C routine to process image */ 00902 for (i = istart; i < length; i++) { 00903 *curdst = (int)*cursrc1 * (int)*cursrc2; // (int) for efficiency 00904 /* Advance pointers */ 00905 cursrc1++; 00906 cursrc2++; 00907 curdst++; 00908 } 00909 00910 return (0); 00911 } 00912 00923 static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 00924 { 00925 #ifdef USE_MMX 00926 #if !defined(GCC__) 00927 __asm 00928 { 00929 pusha 00930 mov eax, Src1 /* load Src1 address into eax */ 00931 mov ebx, Src2 /* load Src2 address into ebx */ 00932 mov edi, Dest /* load Dest address into edi */ 00933 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 00934 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 00935 pxor mm0, mm0 /* zero mm0 register */ 00936 align 16 /* 16 byte alignment of the loop entry */ 00937 L1015: 00938 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 00939 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 00940 movq mm2, mm1 /* copy mm1 into mm2 */ 00941 movq mm4, mm3 /* copy mm3 into mm4 */ 00942 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 00943 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 00944 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */ 00945 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 00946 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */ 00947 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */ 00948 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 00949 pmullw mm2, mm4 /* mul high 
bytes of Src1 and Src2 */ 00950 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 00951 movq [edi], mm1 /* store result in Dest */ 00952 add eax, 8 /* increase Src1, Src2 and Dest */ 00953 add ebx, 8 /* register pointers by 8 */ 00954 add edi, 8 00955 dec ecx /* decrease loop counter */ 00956 jnz L1015 /* check loop termination, proceed if required */ 00957 emms /* exit MMX state */ 00958 popa 00959 } 00960 #else 00961 /* i386 and x86_64 */ 00962 __m64 *mSrc1 = (__m64*)Src1; 00963 __m64 *mSrc2 = (__m64*)Src2; 00964 __m64 *mDest = (__m64*)Dest; 00965 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 00966 int i; 00967 for (i = 0; i < SrcLength/8; i++) { 00968 __m64 mm1, mm2, mm3, mm4, mm5, mm6; 00969 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 00970 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 00971 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */ 00972 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */ 00973 mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */ 00974 mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */ 00975 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */ 00976 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */ 00977 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */ 00978 mSrc1++; 00979 mSrc2++; 00980 mDest++; 00981 } 00982 _m_empty(); /* clean MMX state */ 00983 #endif 00984 return (0); 00985 #else 00986 return (-1); 00987 #endif 00988 } 00989 01000 int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01001 { 01002 unsigned int i, istart; 01003 unsigned char *cursrc1, *cursrc2, *curdst; 01004 int result; 01005 01006 /* Validate input parameters */ 01007 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01008 return(-1); 01009 if (length == 0) 01010 return(0); 
01011 01012 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01013 /* MMX routine */ 01014 SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length); 01015 01016 /* Check for unaligned bytes */ 01017 if ((length & 7) > 0) { 01018 /* Setup to process unaligned bytes */ 01019 istart = length & 0xfffffff8; 01020 cursrc1 = &Src1[istart]; 01021 cursrc2 = &Src2[istart]; 01022 curdst = &Dest[istart]; 01023 } else { 01024 /* No unaligned bytes - we are done */ 01025 return (0); 01026 } 01027 } else { 01028 /* Setup to process whole image */ 01029 istart = 0; 01030 cursrc1 = Src1; 01031 cursrc2 = Src2; 01032 curdst = Dest; 01033 } 01034 01035 /* C routine to process image */ 01036 for (i = istart; i < length; i++) { 01037 result = ((int) *cursrc1 / 2) * (int) *cursrc2; 01038 if (result > 255) 01039 result = 255; 01040 *curdst = (unsigned char) result; 01041 /* Advance pointers */ 01042 cursrc1++; 01043 cursrc2++; 01044 curdst++; 01045 } 01046 01047 return (0); 01048 } 01049 01060 static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01061 { 01062 #ifdef USE_MMX 01063 #if !defined(GCC__) 01064 __asm 01065 { 01066 pusha 01067 mov eax, Src1 /* load Src1 address into eax */ 01068 mov ebx, Src2 /* load Src2 address into ebx */ 01069 mov edi, Dest /* load Dest address into edi */ 01070 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01071 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01072 pxor mm0, mm0 /* zero mm0 register */ 01073 align 16 /* 16 byte alignment of the loop entry */ 01074 L1016: 01075 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01076 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */ 01077 movq mm2, mm1 /* copy mm1 into mm2 */ 01078 movq mm4, mm3 /* copy mm3 into mm4 */ 01079 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */ 01080 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */ 01081 punpcklbw mm3, mm0 /* unpack low bytes of Src2 
into words */ 01082 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */ 01083 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */ 01084 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */ 01085 psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */ 01086 psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */ 01087 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */ 01088 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */ 01089 packuswb mm1, mm2 /* pack words back into bytes with saturation */ 01090 movq [edi], mm1 /* store result in Dest */ 01091 add eax, 8 /* increase Src1, Src2 and Dest */ 01092 add ebx, 8 /* register pointers by 8 */ 01093 add edi, 8 01094 dec ecx /* decrease loop counter */ 01095 jnz L1016 /* check loop termination, proceed if required */ 01096 emms /* exit MMX state */ 01097 popa 01098 } 01099 #else 01100 /* i386 and x86_64 */ 01101 __m64 *mSrc1 = (__m64*)Src1; 01102 __m64 *mSrc2 = (__m64*)Src2; 01103 __m64 *mDest = (__m64*)Dest; 01104 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 01105 int i; 01106 for (i = 0; i < SrcLength/8; i++) { 01107 __m64 mm1, mm2, mm3, mm4, mm5, mm6; 01108 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 01109 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 01110 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */ 01111 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */ 01112 mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */ 01113 mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */ 01114 mm3 = _m_psrlwi(mm3, 1); /* divide mm3 words by 2, Src2 low bytes */ 01115 mm4 = _m_psrlwi(mm4, 1); /* divide mm4 words by 2, Src2 high bytes */ 01116 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */ 01117 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */ 01118 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with 
saturation */ 01119 mSrc1++; 01120 mSrc2++; 01121 mDest++; 01122 } 01123 _m_empty(); /* clean MMX state */ 01124 #endif 01125 return (0); 01126 #else 01127 return (-1); 01128 #endif 01129 } 01130 01141 int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01142 { 01143 unsigned int i, istart; 01144 unsigned char *cursrc1, *cursrc2, *curdst; 01145 int result; 01146 01147 /* Validate input parameters */ 01148 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01149 return(-1); 01150 if (length == 0) 01151 return(0); 01152 01153 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01154 /* MMX routine */ 01155 SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length); 01156 01157 /* Check for unaligned bytes */ 01158 if ((length & 7) > 0) { 01159 /* Setup to process unaligned bytes */ 01160 istart = length & 0xfffffff8; 01161 cursrc1 = &Src1[istart]; 01162 cursrc2 = &Src2[istart]; 01163 curdst = &Dest[istart]; 01164 } else { 01165 /* No unaligned bytes - we are done */ 01166 return (0); 01167 } 01168 } else { 01169 /* Setup to process whole image */ 01170 istart = 0; 01171 cursrc1 = Src1; 01172 cursrc2 = Src2; 01173 curdst = Dest; 01174 } 01175 01176 /* C routine to process image */ 01177 for (i = istart; i < length; i++) { 01178 result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2); 01179 if (result > 255) 01180 result = 255; 01181 *curdst = (unsigned char) result; 01182 /* Advance pointers */ 01183 cursrc1++; 01184 cursrc2++; 01185 curdst++; 01186 } 01187 01188 return (0); 01189 } 01190 01201 static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01202 { 01203 #ifdef USE_MMX 01204 #if !defined(GCC__) 01205 __asm 01206 { 01207 pusha 01208 mov eax, Src1 /* load Src1 address into eax */ 01209 mov ebx, Src2 /* load Src2 address into ebx */ 01210 mov edi, Dest /* load Dest address into edi */ 01211 mov ecx, SrcLength /* load loop counter (SIZE) 
into ecx */ 01212 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01213 align 16 /* 16 byte alignment of the loop entry */ 01214 L1017: 01215 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01216 pand mm1, [ebx] /* mm1=Src1&Src2 */ 01217 movq [edi], mm1 /* store result in Dest */ 01218 add eax, 8 /* increase Src1, Src2 and Dest */ 01219 add ebx, 8 /* register pointers by 8 */ 01220 add edi, 8 01221 dec ecx /* decrease loop counter */ 01222 jnz L1017 /* check loop termination, proceed if required */ 01223 emms /* exit MMX state */ 01224 popa 01225 } 01226 #else 01227 /* x86_64 ASM with constraints: */ 01228 /* asm volatile ( */ 01229 /* "shr $3, %%rcx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */ 01230 /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */ 01231 /* "1: movq (%%rax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */ 01232 /* "pand (%%rbx), %%mm1 \n\t" /\* mm1=Src1&Src2 *\/ */ 01233 /* "movq %%mm1, (%%rdi) \n\t" /\* store result in Dest *\/ */ 01234 /* "add $8, %%rax \n\t" /\* increase Src1, Src2 and Dest *\/ */ 01235 /* "add $8, %%rbx \n\t" /\* register pointers by 8 *\/ */ 01236 /* "add $8, %%rdi \n\t" */ 01237 /* "dec %%rcx \n\t" /\* decrease loop counter *\/ */ 01238 /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */ 01239 /* "emms \n\t" /\* exit MMX state *\/ */ 01240 /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */ 01241 /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */ 01242 /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */ 01243 /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */ 01244 /* : */ 01245 /* : "memory", /\* *Dest is modified *\/ */ 01246 /* "mm1" /\* register mm1 modified *\/ */ 01247 /* ); */ 01248 01249 /* i386 and x86_64 */ 01250 __m64 *mSrc1 = (__m64*)Src1; 01251 __m64 *mSrc2 = (__m64*)Src2; 01252 __m64 *mDest = (__m64*)Dest; 01253 int i; 01254 for (i 
= 0; i < SrcLength/8; i++) { 01255 *mDest = _m_pand(*mSrc1, *mSrc2); /* Src1&Src2 */ 01256 mSrc1++; 01257 mSrc2++; 01258 mDest++; 01259 } 01260 _m_empty(); /* clean MMX state */ 01261 #endif 01262 return (0); 01263 #else 01264 return (-1); 01265 #endif 01266 } 01267 01278 int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01279 { 01280 unsigned int i, istart; 01281 unsigned char *cursrc1, *cursrc2, *curdst; 01282 01283 /* Validate input parameters */ 01284 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01285 return(-1); 01286 if (length == 0) 01287 return(0); 01288 01289 if ((SDL_imageFilterMMXdetect()>0) && (length>7)) { 01290 /* if (length > 7) { */ 01291 /* Call MMX routine */ 01292 01293 SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length); 01294 01295 /* Check for unaligned bytes */ 01296 if ((length & 7) > 0) { 01297 01298 /* Setup to process unaligned bytes */ 01299 istart = length & 0xfffffff8; 01300 cursrc1 = &Src1[istart]; 01301 cursrc2 = &Src2[istart]; 01302 curdst = &Dest[istart]; 01303 } else { 01304 /* No unaligned bytes - we are done */ 01305 return (0); 01306 } 01307 } else { 01308 /* Setup to process whole image */ 01309 istart = 0; 01310 cursrc1 = Src1; 01311 cursrc2 = Src2; 01312 curdst = Dest; 01313 } 01314 01315 /* C routine to process image */ 01316 for (i = istart; i < length; i++) { 01317 *curdst = (*cursrc1) & (*cursrc2); 01318 /* Advance pointers */ 01319 cursrc1++; 01320 cursrc2++; 01321 curdst++; 01322 } 01323 01324 return (0); 01325 } 01326 01337 static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01338 { 01339 #ifdef USE_MMX 01340 #if !defined(GCC__) 01341 __asm 01342 { 01343 pusha 01344 mov eax, Src1 /* load Src1 address into eax */ 01345 mov ebx, Src2 /* load Src2 address into ebx */ 01346 mov edi, Dest /* load Dest address into edi */ 01347 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 
01348 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01349 align 16 /* 16 byte alignment of the loop entry */ 01350 L91017: 01351 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */ 01352 por mm1, [ebx] /* mm1=Src1|Src2 */ 01353 movq [edi], mm1 /* store result in Dest */ 01354 add eax, 8 /* increase Src1, Src2 and Dest */ 01355 add ebx, 8 /* register pointers by 8 */ 01356 add edi, 8 01357 dec ecx /* decrease loop counter */ 01358 jnz L91017 /* check loop termination, proceed if required */ 01359 emms /* exit MMX state */ 01360 popa 01361 } 01362 #else 01363 /* i386 and x86_64 */ 01364 __m64 *mSrc1 = (__m64*)Src1; 01365 __m64 *mSrc2 = (__m64*)Src2; 01366 __m64 *mDest = (__m64*)Dest; 01367 int i; 01368 for (i = 0; i < SrcLength/8; i++) { 01369 *mDest = _m_por(*mSrc1, *mSrc2); /* Src1|Src2 */ 01370 mSrc1++; 01371 mSrc2++; 01372 mDest++; 01373 } 01374 _m_empty(); /* clean MMX state */ 01375 #endif 01376 return (0); 01377 #else 01378 return (-1); 01379 #endif 01380 } 01381 01392 int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01393 { 01394 unsigned int i, istart; 01395 unsigned char *cursrc1, *cursrc2, *curdst; 01396 01397 /* Validate input parameters */ 01398 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01399 return(-1); 01400 if (length == 0) 01401 return(0); 01402 01403 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01404 01405 /* MMX routine */ 01406 SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length); 01407 01408 /* Check for unaligned bytes */ 01409 if ((length & 7) > 0) { 01410 /* Setup to process unaligned bytes */ 01411 istart = length & 0xfffffff8; 01412 cursrc1 = &Src1[istart]; 01413 cursrc2 = &Src2[istart]; 01414 curdst = &Dest[istart]; 01415 } else { 01416 /* No unaligned bytes - we are done */ 01417 return (0); 01418 } 01419 } else { 01420 /* Setup to process whole image */ 01421 istart = 0; 01422 cursrc1 = Src1; 01423 cursrc2 = Src2; 01424 curdst = Dest; 01425 } 01426 
01427 /* C routine to process image */ 01428 for (i = istart; i < length; i++) { 01429 *curdst = *cursrc1 | *cursrc2; 01430 /* Advance pointers */ 01431 cursrc1++; 01432 cursrc2++; 01433 curdst++; 01434 } 01435 return (0); 01436 } 01437 01448 static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength) 01449 { 01450 #ifdef USE_MMX 01451 #if !defined(GCC__) 01452 __asm 01453 { 01454 pusha 01455 mov edx, Src1 /* load Src1 address into edx */ 01456 mov esi, Src2 /* load Src2 address into esi */ 01457 mov edi, Dest /* load Dest address into edi */ 01458 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01459 align 16 /* 16 byte alignment of the loop entry */ 01460 L10191: 01461 mov bl, [esi] /* load a byte from Src2 */ 01462 cmp bl, 0 /* check if it zero */ 01463 jnz L10192 01464 mov [edi], 255 /* division by zero = 255 !!! */ 01465 jmp L10193 01466 L10192: 01467 xor ah, ah /* prepare AX, zero AH register */ 01468 mov al, [edx] /* load a byte from Src1 into AL */ 01469 div bl /* divide AL by BL */ 01470 mov [edi], al /* move a byte result to Dest */ 01471 L10193: 01472 inc edx /* increment Src1, Src2, Dest */ 01473 inc esi /* pointer registers by one */ 01474 inc edi 01475 dec ecx /* decrease loop counter */ 01476 jnz L10191 /* check loop termination, proceed if required */ 01477 popa 01478 } 01479 #else 01480 /* Note: ~15% gain on i386, less efficient than C on x86_64 */ 01481 /* Also depends on whether the function is static (?!) */ 01482 /* Also depends on whether we work on malloc() or static char[] */ 01483 asm volatile ( 01484 # if defined(i386) 01485 "pushl %%ebx \n\t" /* %ebx may be the PIC register. */ 01486 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01487 "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */ 01488 "cmp $0, %%bl \n\t" /* check if it zero */ 01489 "jnz 2f \n\t" 01490 "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! 
*/ 01491 "jmp 3f \n\t" 01492 "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ 01493 "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */ 01494 "div %%bl \n\t" /* divide AL by BL */ 01495 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ 01496 "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */ 01497 "inc %%esi \n\t" /* pointer registers by one */ 01498 "inc %%edi \n\t" 01499 "dec %%ecx \n\t" /* decrease loop counter */ 01500 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01501 "popl %%ebx \n\t" /* restore %ebx */ 01502 : "+d" (Src1), /* load Src1 address into edx */ 01503 "+S" (Src2), /* load Src2 address into esi */ 01504 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */ 01505 "+D" (Dest) /* load Dest address into edi */ 01506 : 01507 : "memory", "rax" 01508 # elif defined(__x86_64__) 01509 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 01510 "1: mov (%%rsi), %%bl \n\t" /* load a byte from Src2 */ 01511 "cmp $0, %%bl \n\t" /* check if it zero */ 01512 "jnz 2f \n\t" 01513 "movb $255, (%%rdi) \n\t" /* division by zero = 255 !!! 
*/ 01514 "jmp 3f \n\t" 01515 "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ 01516 "mov (%%rdx), %%al \n\t" /* load a byte from Src1 into AL */ 01517 "div %%bl \n\t" /* divide AL by BL */ 01518 "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */ 01519 "3: inc %%rdx \n\t" /* increment Src1, Src2, Dest */ 01520 "inc %%rsi \n\t" /* pointer registers by one */ 01521 "inc %%rdi \n\t" 01522 "dec %%rcx \n\t" /* decrease loop counter */ 01523 "jnz 1b \n\t" /* check loop termination, proceed if required */ 01524 : "+d" (Src1), /* load Src1 address into edx */ 01525 "+S" (Src2), /* load Src2 address into esi */ 01526 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */ 01527 "+D" (Dest) /* load Dest address into edi */ 01528 : 01529 : "memory", "rax", "rbx" 01530 # endif 01531 ); 01532 #endif 01533 return (0); 01534 #else 01535 return (-1); 01536 #endif 01537 } 01538 01549 int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length) 01550 { 01551 unsigned int i, istart; 01552 unsigned char *cursrc1, *cursrc2, *curdst; 01553 01554 /* Validate input parameters */ 01555 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL)) 01556 return(-1); 01557 if (length == 0) 01558 return(0); 01559 01560 if (SDL_imageFilterMMXdetect()) { 01561 if (length > 0) { 01562 /* Call ASM routine */ 01563 SDL_imageFilterDivASM(Src1, Src2, Dest, length); 01564 01565 /* Never unaligned bytes - we are done */ 01566 return (0); 01567 } else { 01568 return (-1); 01569 } 01570 } 01571 01572 /* Setup to process whole image */ 01573 istart = 0; 01574 cursrc1 = Src1; 01575 cursrc2 = Src2; 01576 curdst = Dest; 01577 01578 /* C routine to process image */ 01579 /* for (i = istart; i < length; i++) { */ 01580 /* if (*cursrc2 == 0) { */ 01581 /* *curdst = 255; */ 01582 /* } else { */ 01583 /* result = (int) *cursrc1 / (int) *cursrc2; */ 01584 /* *curdst = (unsigned char) result; */ 01585 /* } */ 01586 /* /\* Advance pointers *\/ */ 01587 /* 
cursrc1++; */ 01588 /* cursrc2++; */ 01589 /* curdst++; */ 01590 /* } */ 01591 for (i = istart; i < length; i++) { 01592 if (*cursrc2 == 0) { 01593 *curdst = 255; 01594 } else { 01595 *curdst = (int)*cursrc1 / (int)*cursrc2; // (int) for efficiency 01596 } 01597 /* Advance pointers */ 01598 cursrc1++; 01599 cursrc2++; 01600 curdst++; 01601 } 01602 01603 return (0); 01604 } 01605 01606 /* ------------------------------------------------------------------------------------ */ 01607 01617 static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength) 01618 { 01619 #ifdef USE_MMX 01620 #if !defined(GCC__) 01621 __asm 01622 { 01623 pusha 01624 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 01625 mov eax, Src1 /* load Src1 address into eax */ 01626 mov edi, Dest /* load Dest address into edi */ 01627 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01628 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01629 align 16 /* 16 byte alignment of the loop entry */ 01630 L91117: 01631 movq mm0, [eax] /* load 8 bytes from Src1 into mm1 */ 01632 pxor mm0, mm1 /* negate mm0 by xoring with mm1 */ 01633 movq [edi], mm0 /* store result in Dest */ 01634 add eax, 8 /* increase Src1, Src2 and Dest */ 01635 add edi, 8 01636 dec ecx /* decrease loop counter */ 01637 jnz L91117 /* check loop termination, proceed if required */ 01638 emms /* exit MMX state */ 01639 popa 01640 } 01641 #else 01642 /* i386 and x86_64 */ 01643 __m64 *mSrc1 = (__m64*)Src1; 01644 __m64 *mDest = (__m64*)Dest; 01645 __m64 mm1; 01646 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 01647 int i; 01648 for (i = 0; i < SrcLength/8; i++) { 01649 *mDest = _m_pxor(*mSrc1, mm1); /* negate mm0 by xoring with mm1 */ 01650 mSrc1++; 01651 mDest++; 01652 } 01653 _m_empty(); /* clean MMX state */ 01654 01655 #endif 01656 return (0); 01657 #else 01658 return (-1); 01659 #endif 01660 } 01661 01671 int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned 
char *Dest, unsigned int length) 01672 { 01673 unsigned int i, istart; 01674 unsigned char *cursrc1, *curdst; 01675 01676 /* Validate input parameters */ 01677 if ((Src1 == NULL) || (Dest == NULL)) 01678 return(-1); 01679 if (length == 0) 01680 return(0); 01681 01682 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01683 /* MMX routine */ 01684 SDL_imageFilterBitNegationMMX(Src1, Dest, length); 01685 01686 /* Check for unaligned bytes */ 01687 if ((length & 7) > 0) { 01688 /* Setup to process unaligned bytes */ 01689 istart = length & 0xfffffff8; 01690 cursrc1 = &Src1[istart]; 01691 curdst = &Dest[istart]; 01692 } else { 01693 /* No unaligned bytes - we are done */ 01694 return (0); 01695 } 01696 } else { 01697 /* Setup to process whole image */ 01698 istart = 0; 01699 cursrc1 = Src1; 01700 curdst = Dest; 01701 } 01702 01703 /* C routine to process image */ 01704 for (i = istart; i < length; i++) { 01705 *curdst = ~(*cursrc1); 01706 /* Advance pointers */ 01707 cursrc1++; 01708 curdst++; 01709 } 01710 01711 return (0); 01712 } 01713 01724 static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 01725 { 01726 #ifdef USE_MMX 01727 #if !defined(GCC__) 01728 __asm 01729 { 01730 pusha 01731 /* ** Duplicate C in 8 bytes of MM1 ** */ 01732 mov al, C /* load C into AL */ 01733 mov ah, al /* copy AL into AH */ 01734 mov bx, ax /* copy AX into BX */ 01735 shl eax, 16 /* shift 2 bytes of EAX left */ 01736 mov ax, bx /* copy BX into AX */ 01737 movd mm1, eax /* copy EAX into MM1 */ 01738 movd mm2, eax /* copy EAX into MM2 */ 01739 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 01740 mov eax, Src1 /* load Src1 address into eax */ 01741 mov edi, Dest /* load Dest address into edi */ 01742 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01743 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01744 align 16 /* 16 byte alignment of the loop entry */ 01745 L1021: 01746 movq mm0, [eax] /* 
load 8 bytes from Src1 into MM0 */ 01747 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01748 movq [edi], mm0 /* store result in Dest */ 01749 add eax, 8 /* increase Dest register pointer by 8 */ 01750 add edi, 8 /* increase Dest register pointer by 8 */ 01751 dec ecx /* decrease loop counter */ 01752 jnz L1021 /* check loop termination, proceed if required */ 01753 emms /* exit MMX state */ 01754 popa 01755 } 01756 #else 01757 /* i386 and x86_64 */ 01758 __m64 *mSrc1 = (__m64*)Src1; 01759 __m64 *mDest = (__m64*)Dest; 01760 /* Duplicate C in 8 bytes of MM1 */ 01761 int i; 01762 memset(&i, C, 4); 01763 __m64 mm1 = _m_from_int(i); 01764 __m64 mm2 = _m_from_int(i); 01765 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 01766 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 01767 for (i = 0; i < SrcLength/8; i++) { 01768 *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */ 01769 mSrc1++; 01770 mDest++; 01771 } 01772 _m_empty(); /* clean MMX state */ 01773 #endif 01774 return (0); 01775 #else 01776 return (-1); 01777 #endif 01778 } 01779 01791 int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 01792 { 01793 unsigned int i, istart; 01794 int iC; 01795 unsigned char *cursrc1, *curdest; 01796 int result; 01797 01798 /* Validate input parameters */ 01799 if ((Src1 == NULL) || (Dest == NULL)) 01800 return(-1); 01801 if (length == 0) 01802 return(0); 01803 01804 /* Special case: C==0 */ 01805 if (C == 0) { 01806 memcpy(Src1, Dest, length); 01807 return (0); 01808 } 01809 01810 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01811 01812 /* MMX routine */ 01813 SDL_imageFilterAddByteMMX(Src1, Dest, length, C); 01814 01815 /* Check for unaligned bytes */ 01816 if ((length & 7) > 0) { 01817 /* Setup to process unaligned bytes */ 01818 istart = length & 0xfffffff8; 01819 cursrc1 = &Src1[istart]; 01820 curdest = &Dest[istart]; 01821 } else { 01822 /* No 
unaligned bytes - we are done */ 01823 return (0); 01824 } 01825 } else { 01826 /* Setup to process whole image */ 01827 istart = 0; 01828 cursrc1 = Src1; 01829 curdest = Dest; 01830 } 01831 01832 /* C routine to process image */ 01833 iC = (int) C; 01834 for (i = istart; i < length; i++) { 01835 result = (int) *cursrc1 + iC; 01836 if (result > 255) 01837 result = 255; 01838 *curdest = (unsigned char) result; 01839 /* Advance pointers */ 01840 cursrc1++; 01841 curdest++; 01842 } 01843 return (0); 01844 } 01845 01857 static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D) 01858 { 01859 #ifdef USE_MMX 01860 #if !defined(GCC__) 01861 __asm 01862 { 01863 pusha 01864 /* ** Duplicate (int)C in 8 bytes of MM1 ** */ 01865 mov eax, C /* load C into EAX */ 01866 movd mm1, eax /* copy EAX into MM1 */ 01867 mov eax, D /* load D into EAX */ 01868 movd mm2, eax /* copy EAX into MM2 */ 01869 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 01870 mov eax, Src1 /* load Src1 address into eax */ 01871 mov edi, Dest /* load Dest address into edi */ 01872 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 01873 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 01874 align 16 /* 16 byte alignment of the loop entry */ 01875 L11023: 01876 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 01877 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */ 01878 movq [edi], mm0 /* store result in SrcDest */ 01879 add eax, 8 /* increase Src1 register pointer by 8 */ 01880 add edi, 8 /* increase Dest register pointer by 8 */ 01881 dec ecx /* decrease loop counter */ 01882 jnz L11023 /* check loop termination, proceed if required */ 01883 emms /* exit MMX state */ 01884 popa 01885 } 01886 #else 01887 /* i386 and x86_64 */ 01888 __m64 *mSrc1 = (__m64*)Src1; 01889 __m64 *mDest = (__m64*)Dest; 01890 /* Duplicate (int)C in 8 bytes of MM1 */ 01891 __m64 mm1 = _m_from_int(C); 01892 __m64 
mm2 = _m_from_int(C); 01893 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 01894 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 01895 int i; 01896 for (i = 0; i < SrcLength/8; i++) { 01897 *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */ 01898 mSrc1++; 01899 mDest++; 01900 } 01901 _m_empty(); /* clean MMX state */ 01902 #endif 01903 return (0); 01904 #else 01905 return (-1); 01906 #endif 01907 } 01908 01919 int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C) 01920 { 01921 unsigned int i, j, istart, D; 01922 int iC[4]; 01923 unsigned char *cursrc1; 01924 unsigned char *curdest; 01925 int result; 01926 01927 /* Validate input parameters */ 01928 if ((Src1 == NULL) || (Dest == NULL)) 01929 return(-1); 01930 if (length == 0) 01931 return(0); 01932 01933 /* Special case: C==0 */ 01934 if (C == 0) { 01935 memcpy(Src1, Dest, length); 01936 return (0); 01937 } 01938 01939 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 01940 01941 /* MMX routine */ 01942 D=SWAP_32(C); 01943 SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D); 01944 01945 /* Check for unaligned bytes */ 01946 if ((length & 7) > 0) { 01947 /* Setup to process unaligned bytes */ 01948 istart = length & 0xfffffff8; 01949 cursrc1 = &Src1[istart]; 01950 curdest = &Dest[istart]; 01951 } else { 01952 /* No unaligned bytes - we are done */ 01953 return (0); 01954 } 01955 } else { 01956 /* Setup to process whole image */ 01957 istart = 0; 01958 cursrc1 = Src1; 01959 curdest = Dest; 01960 } 01961 01962 /* C routine to process bytes */ 01963 iC[3] = (int) ((C >> 24) & 0xff); 01964 iC[2] = (int) ((C >> 16) & 0xff); 01965 iC[1] = (int) ((C >> 8) & 0xff); 01966 iC[0] = (int) ((C >> 0) & 0xff); 01967 for (i = istart; i < length; i += 4) { 01968 for (j = 0; j < 4; j++) { 01969 if ((i+j)<length) { 01970 result = (int) *cursrc1 + iC[j]; 01971 if (result > 255) result = 255; 01972 *curdest = (unsigned char) result; 
01973 /* Advance pointers */ 01974 cursrc1++; 01975 curdest++; 01976 } 01977 } 01978 } 01979 return (0); 01980 } 01981 01993 static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C, 01994 unsigned char *Mask) 01995 { 01996 #ifdef USE_MMX 01997 #if !defined(GCC__) 01998 __asm 01999 { 02000 pusha 02001 /* ** Duplicate C in 8 bytes of MM1 ** */ 02002 mov al, C /* load C into AL */ 02003 mov ah, al /* copy AL into AH */ 02004 mov bx, ax /* copy AX into BX */ 02005 shl eax, 16 /* shift 2 bytes of EAX left */ 02006 mov ax, bx /* copy BX into AX */ 02007 movd mm1, eax /* copy EAX into MM1 */ 02008 movd mm2, eax /* copy EAX into MM2 */ 02009 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02010 mov edx, Mask /* load Mask address into edx */ 02011 movq mm0, [edx] /* load Mask into mm0 */ 02012 mov eax, Src1 /* load Src1 address into eax */ 02013 mov edi, Dest /* load Dest address into edi */ 02014 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02015 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02016 align 16 /* 16 byte alignment of the loop entry */ 02017 L1022: 02018 movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */ 02019 psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */ 02020 pand mm2, mm0 // apply Mask to 8 BYTES of MM2 */ 02021 paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */ 02022 movq [edi], mm2 /* store result in Dest */ 02023 add eax, 8 /* increase Src1 register pointer by 8 */ 02024 add edi, 8 /* increase Dest register pointer by 8 */ 02025 dec ecx /* decrease loop counter */ 02026 jnz L1022 /* check loop termination, proceed if required */ 02027 emms /* exit MMX state */ 02028 popa 02029 } 02030 #else 02031 /* i386 and x86_64 */ 02032 __m64 *mSrc1 = (__m64*)Src1; 02033 __m64 *mDest = (__m64*)Dest; 02034 __m64 *mMask = (__m64*)Mask; 02035 /* Duplicate C in 8 bytes of MM1 */ 02036 int i; 02037 memset(&i, C, 4); 02038 __m64 mm1 = 
_m_from_int(i); 02039 __m64 mm2 = _m_from_int(i); 02040 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 02041 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02042 for (i = 0; i < SrcLength/8; i++) { 02043 __m64 mm2 = _m_psrlwi(*mSrc1, 1); /* shift 4 WORDS of MM2 1 bit to the right */ 02044 mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of MM2 */ 02045 /* byte 0x0f, 0xdb, 0xd0 */ 02046 *mDest = _m_paddusb(mm1, mm2); /* Src1+C (add 8 bytes with saturation) */ 02047 mSrc1++; 02048 mDest++; 02049 } 02050 _m_empty(); /* clean MMX state */ 02051 #endif 02052 return (0); 02053 #else 02054 return (-1); 02055 #endif 02056 } 02057 02068 int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02069 { 02070 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F }; 02071 unsigned int i, istart; 02072 int iC; 02073 unsigned char *cursrc1; 02074 unsigned char *curdest; 02075 int result; 02076 02077 /* Validate input parameters */ 02078 if ((Src1 == NULL) || (Dest == NULL)) 02079 return(-1); 02080 if (length == 0) 02081 return(0); 02082 02083 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02084 02085 /* MMX routine */ 02086 SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask); 02087 02088 /* Check for unaligned bytes */ 02089 if ((length & 7) > 0) { 02090 /* Setup to process unaligned bytes */ 02091 istart = length & 0xfffffff8; 02092 cursrc1 = &Src1[istart]; 02093 curdest = &Dest[istart]; 02094 } else { 02095 /* No unaligned bytes - we are done */ 02096 return (0); 02097 } 02098 } else { 02099 /* Setup to process whole image */ 02100 istart = 0; 02101 cursrc1 = Src1; 02102 curdest = Dest; 02103 } 02104 02105 /* C routine to process image */ 02106 iC = (int) C; 02107 for (i = istart; i < length; i++) { 02108 result = (int) (*cursrc1 / 2) + iC; 02109 if (result > 255) 02110 result = 255; 02111 *curdest = (unsigned char) result; 02112 /* Advance pointers */ 
02113 cursrc1++; 02114 curdest++; 02115 } 02116 02117 return (0); 02118 } 02119 02130 int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 02131 { 02132 #ifdef USE_MMX 02133 #if !defined(GCC__) 02134 __asm 02135 { 02136 pusha 02137 /* ** Duplicate C in 8 bytes of MM1 ** */ 02138 mov al, C /* load C into AL */ 02139 mov ah, al /* copy AL into AH */ 02140 mov bx, ax /* copy AX into BX */ 02141 shl eax, 16 /* shift 2 bytes of EAX left */ 02142 mov ax, bx /* copy BX into AX */ 02143 movd mm1, eax /* copy EAX into MM1 */ 02144 movd mm2, eax /* copy EAX into MM2 */ 02145 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02146 mov eax, Src1 /* load Src1 address into eax */ 02147 mov edi, Dest /* load Dest address into edi */ 02148 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02149 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02150 align 16 /* 16 byte alignment of the loop entry */ 02151 L1023: 02152 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02153 psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */ 02154 movq [edi], mm0 /* store result in SrcDest */ 02155 add eax, 8 /* increase Src1 register pointer by 8 */ 02156 add edi, 8 /* increase Dest register pointer by 8 */ 02157 dec ecx /* decrease loop counter */ 02158 jnz L1023 /* check loop termination, proceed if required */ 02159 emms /* exit MMX state */ 02160 popa 02161 } 02162 #else 02163 /* i386 and x86_64 */ 02164 __m64 *mSrc1 = (__m64*)Src1; 02165 __m64 *mDest = (__m64*)Dest; 02166 /* Duplicate C in 8 bytes of MM1 */ 02167 int i; 02168 memset(&i, C, 4); 02169 __m64 mm1 = _m_from_int(i); 02170 __m64 mm2 = _m_from_int(i); 02171 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 02172 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02173 for (i = 0; i < SrcLength/8; i++) { 02174 *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */ 02175 mSrc1++; 02176 mDest++; 
02177 } 02178 _m_empty(); /* clean MMX state */ 02179 #endif 02180 return (0); 02181 #else 02182 return (-1); 02183 #endif 02184 } 02185 02196 int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02197 { 02198 unsigned int i, istart; 02199 int iC; 02200 unsigned char *cursrc1; 02201 unsigned char *curdest; 02202 int result; 02203 02204 /* Validate input parameters */ 02205 if ((Src1 == NULL) || (Dest == NULL)) 02206 return(-1); 02207 if (length == 0) 02208 return(0); 02209 02210 /* Special case: C==0 */ 02211 if (C == 0) { 02212 memcpy(Src1, Dest, length); 02213 return (0); 02214 } 02215 02216 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02217 02218 /* MMX routine */ 02219 SDL_imageFilterSubByteMMX(Src1, Dest, length, C); 02220 02221 /* Check for unaligned bytes */ 02222 if ((length & 7) > 0) { 02223 /* Setup to process unaligned bytes */ 02224 istart = length & 0xfffffff8; 02225 cursrc1 = &Src1[istart]; 02226 curdest = &Dest[istart]; 02227 } else { 02228 /* No unaligned bytes - we are done */ 02229 return (0); 02230 } 02231 } else { 02232 /* Setup to process whole image */ 02233 istart = 0; 02234 cursrc1 = Src1; 02235 curdest = Dest; 02236 } 02237 02238 /* C routine to process image */ 02239 iC = (int) C; 02240 for (i = istart; i < length; i++) { 02241 result = (int) *cursrc1 - iC; 02242 if (result < 0) 02243 result = 0; 02244 *curdest = (unsigned char) result; 02245 /* Advance pointers */ 02246 cursrc1++; 02247 curdest++; 02248 } 02249 return (0); 02250 } 02251 02263 static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D) 02264 { 02265 #ifdef USE_MMX 02266 #if !defined(GCC__) 02267 __asm 02268 { 02269 pusha 02270 /* ** Duplicate (int)C in 8 bytes of MM1 ** */ 02271 mov eax, C /* load C into EAX */ 02272 movd mm1, eax /* copy EAX into MM1 */ 02273 mov eax, D /* load D into EAX */ 02274 movd mm2, eax /* copy EAX into MM2 */ 
02275 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */ 02276 mov eax, Src1 /* load Src1 address into eax */ 02277 mov edi, Dest /* load Dest address into edi */ 02278 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02279 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02280 align 16 /* 16 byte alignment of the loop entry */ 02281 L11024: 02282 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02283 psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */ 02284 movq [edi], mm0 /* store result in SrcDest */ 02285 add eax, 8 /* increase Src1 register pointer by 8 */ 02286 add edi, 8 /* increase Dest register pointer by 8 */ 02287 dec ecx /* decrease loop counter */ 02288 jnz L11024 /* check loop termination, proceed if required */ 02289 emms /* exit MMX state */ 02290 popa 02291 } 02292 #else 02293 /* i386 and x86_64 */ 02294 __m64 *mSrc1 = (__m64*)Src1; 02295 __m64 *mDest = (__m64*)Dest; 02296 /* Duplicate (int)C in 8 bytes of MM1 */ 02297 __m64 mm1 = _m_from_int(C); 02298 __m64 mm2 = _m_from_int(C); 02299 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */ 02300 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02301 int i; 02302 for (i = 0; i < SrcLength/8; i++) { 02303 *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */ 02304 mSrc1++; 02305 mDest++; 02306 } 02307 _m_empty(); /* clean MMX state */ 02308 #endif 02309 return (0); 02310 #else 02311 return (-1); 02312 #endif 02313 } 02314 02325 int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C) 02326 { 02327 unsigned int i, j, istart, D; 02328 int iC[4]; 02329 unsigned char *cursrc1; 02330 unsigned char *curdest; 02331 int result; 02332 02333 /* Validate input parameters */ 02334 if ((Src1 == NULL) || (Dest == NULL)) 02335 return(-1); 02336 if (length == 0) 02337 return(0); 02338 02339 /* Special case: C==0 */ 02340 if (C == 0) { 02341 memcpy(Src1, Dest, length); 02342 return (0); 
02343 } 02344 02345 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02346 02347 /* MMX routine */ 02348 D=SWAP_32(C); 02349 SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D); 02350 02351 /* Check for unaligned bytes */ 02352 if ((length & 7) > 0) { 02353 /* Setup to process unaligned bytes */ 02354 istart = length & 0xfffffff8; 02355 cursrc1 = &Src1[istart]; 02356 curdest = &Dest[istart]; 02357 } else { 02358 /* No unaligned bytes - we are done */ 02359 return (0); 02360 } 02361 } else { 02362 /* Setup to process whole image */ 02363 istart = 0; 02364 cursrc1 = Src1; 02365 curdest = Dest; 02366 } 02367 02368 /* C routine to process image */ 02369 iC[3] = (int) ((C >> 24) & 0xff); 02370 iC[2] = (int) ((C >> 16) & 0xff); 02371 iC[1] = (int) ((C >> 8) & 0xff); 02372 iC[0] = (int) ((C >> 0) & 0xff); 02373 for (i = istart; i < length; i += 4) { 02374 for (j = 0; j < 4; j++) { 02375 if ((i+j)<length) { 02376 result = (int) *cursrc1 - iC[j]; 02377 if (result < 0) result = 0; 02378 *curdest = (unsigned char) result; 02379 /* Advance pointers */ 02380 cursrc1++; 02381 curdest++; 02382 } 02383 } 02384 } 02385 return (0); 02386 } 02387 02399 static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N, 02400 unsigned char *Mask) 02401 { 02402 #ifdef USE_MMX 02403 #if !defined(GCC__) 02404 __asm 02405 { 02406 pusha 02407 mov edx, Mask /* load Mask address into edx */ 02408 movq mm0, [edx] /* load Mask into mm0 */ 02409 xor ecx, ecx /* zero ECX */ 02410 mov cl, N /* load loop counter (N) into CL */ 02411 movd mm3, ecx /* copy (N) into MM3 */ 02412 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 02413 L10240: /* ** Prepare proper bit-Mask in MM1 ** */ 02414 psrlw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the right */ 02415 pand mm1, mm0 // apply Mask to 8 BYTES of MM1 */ 02416 /* byte 0x0f, 0xdb, 0xc8 */ 02417 dec cl /* decrease loop counter */ 02418 jnz L10240 /* check loop termination, proceed if required */ 02419 
/* ** Shift all bytes of the image ** */ 02420 mov eax, Src1 /* load Src1 address into eax */ 02421 mov edi, Dest /* load Dest address into edi */ 02422 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02423 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02424 align 16 /* 16 byte alignment of the loop entry */ 02425 L10241: 02426 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02427 psrlw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the right */ 02428 pand mm0, mm1 // apply proper bit-Mask to 8 BYTES of MM0 */ 02429 /* byte 0x0f, 0xdb, 0xc1 */ 02430 movq [edi], mm0 /* store result in SrcDest */ 02431 add eax, 8 /* increase Src1 register pointer by 8 */ 02432 add edi, 8 /* increase Dest register pointer by 8 */ 02433 dec ecx /* decrease loop counter */ 02434 jnz L10241 /* check loop termination, proceed if required */ 02435 emms /* exit MMX state */ 02436 popa 02437 } 02438 #else 02439 /* i386 and x86_64 */ 02440 __m64 *mSrc1 = (__m64*)Src1; 02441 __m64 *mDest = (__m64*)Dest; 02442 __m64 *mMask = (__m64*)Mask; 02443 __m64 mm1; 02444 int i; 02445 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 02446 /* Prepare proper bit-Mask in MM1 */ 02447 for (i = 0; i < N; i++) { 02448 mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the right */ 02449 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */ 02450 } 02451 /* Shift all bytes of the image */ 02452 for (i = 0; i < SrcLength/8; i++) { 02453 __m64 mm0 = _m_psrlwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the right */ 02454 *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */ 02455 mSrc1++; 02456 mDest++; 02457 } 02458 _m_empty(); /* clean MMX state */ 02459 #endif 02460 return (0); 02461 #else 02462 return (-1); 02463 #endif 02464 } 02465 02476 int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 02477 { 02478 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 
0x7F, 0x7F }; 02479 unsigned int i, istart; 02480 unsigned char *cursrc1; 02481 unsigned char *curdest; 02482 02483 /* Validate input parameters */ 02484 if ((Src1 == NULL) || (Dest == NULL)) 02485 return(-1); 02486 if (length == 0) 02487 return(0); 02488 02489 /* Check shift */ 02490 if (N > 8) { 02491 return (-1); 02492 } 02493 02494 /* Special case: N==0 */ 02495 if (N == 0) { 02496 memcpy(Src1, Dest, length); 02497 return (0); 02498 } 02499 02500 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02501 02502 /* MMX routine */ 02503 SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask); 02504 02505 /* Check for unaligned bytes */ 02506 if ((length & 7) > 0) { 02507 /* Setup to process unaligned bytes */ 02508 istart = length & 0xfffffff8; 02509 cursrc1 = &Src1[istart]; 02510 curdest = &Dest[istart]; 02511 } else { 02512 /* No unaligned bytes - we are done */ 02513 return (0); 02514 } 02515 } else { 02516 /* Setup to process whole image */ 02517 istart = 0; 02518 cursrc1 = Src1; 02519 curdest = Dest; 02520 } 02521 02522 /* C routine to process image */ 02523 for (i = istart; i < length; i++) { 02524 *curdest = (unsigned char) *cursrc1 >> N; 02525 /* Advance pointers */ 02526 cursrc1++; 02527 curdest++; 02528 } 02529 02530 return (0); 02531 } 02532 02543 static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N) 02544 { 02545 #ifdef USE_MMX 02546 #if !defined(GCC__) 02547 __asm 02548 { 02549 pusha 02550 mov eax, Src1 /* load Src1 address into eax */ 02551 mov edi, Dest /* load Dest address into edi */ 02552 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02553 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02554 align 16 /* 16 byte alignment of the loop entry */ 02555 L13023: 02556 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 02557 psrld mm0, N 02558 movq [edi], mm0 /* store result in SrcDest */ 02559 add eax, 8 /* increase Src1 register pointer by 8 */ 02560 
add edi, 8 /* increase Dest register pointer by 8 */ 02561 dec ecx /* decrease loop counter */ 02562 jnz L13023 /* check loop termination, proceed if required */ 02563 emms /* exit MMX state */ 02564 popa 02565 } 02566 #else 02567 /* i386 and x86_64 */ 02568 __m64 *mSrc1 = (__m64*)Src1; 02569 __m64 *mDest = (__m64*)Dest; 02570 int i; 02571 for (i = 0; i < SrcLength/8; i++) { 02572 *mDest = _m_psrldi(*mSrc1, N); 02573 mSrc1++; 02574 mDest++; 02575 } 02576 _m_empty(); /* clean MMX state */ 02577 #endif 02578 return (0); 02579 #else 02580 return (-1); 02581 #endif 02582 } 02583 02594 int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 02595 { 02596 unsigned int i, istart; 02597 unsigned char *cursrc1, *curdest; 02598 unsigned int *icursrc1, *icurdest; 02599 unsigned int result; 02600 02601 /* Validate input parameters */ 02602 if ((Src1 == NULL) || (Dest == NULL)) 02603 return(-1); 02604 if (length == 0) 02605 return(0); 02606 02607 if (N > 32) { 02608 return (-1); 02609 } 02610 02611 /* Special case: N==0 */ 02612 if (N == 0) { 02613 memcpy(Src1, Dest, length); 02614 return (0); 02615 } 02616 02617 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02618 02619 SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N); 02620 02621 /* Check for unaligned bytes */ 02622 if ((length & 7) > 0) { 02623 /* Setup to process unaligned bytes */ 02624 istart = length & 0xfffffff8; 02625 cursrc1 = &Src1[istart]; 02626 curdest = &Dest[istart]; 02627 } else { 02628 /* No unaligned bytes - we are done */ 02629 return (0); 02630 } 02631 } else { 02632 /* Setup to process whole image */ 02633 istart = 0; 02634 cursrc1 = Src1; 02635 curdest = Dest; 02636 } 02637 02638 /* C routine to process image */ 02639 icursrc1=(unsigned int *)cursrc1; 02640 icurdest=(unsigned int *)curdest; 02641 for (i = istart; i < length; i += 4) { 02642 if ((i+4)<length) { 02643 result = ((unsigned int)*icursrc1 >> N); 02644 *icurdest = result; 
02645 } 02646 /* Advance pointers */ 02647 icursrc1++; 02648 icurdest++; 02649 } 02650 02651 return (0); 02652 } 02653 02664 static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C) 02665 { 02666 #ifdef USE_MMX 02667 #if !defined(GCC__) 02668 __asm 02669 { 02670 pusha 02671 /* ** Duplicate C in 4 words of MM1 ** */ 02672 mov al, C /* load C into AL */ 02673 xor ah, ah /* zero AH */ 02674 mov bx, ax /* copy AX into BX */ 02675 shl eax, 16 /* shift 2 bytes of EAX left */ 02676 mov ax, bx /* copy BX into AX */ 02677 movd mm1, eax /* copy EAX into MM1 */ 02678 movd mm2, eax /* copy EAX into MM2 */ 02679 punpckldq mm1, mm2 /* fill higher words of MM1 with C */ 02680 pxor mm0, mm0 /* zero MM0 register */ 02681 mov eax, Src1 /* load Src1 address into eax */ 02682 mov edi, Dest /* load Dest address into edi */ 02683 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02684 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02685 cmp al, 128 /* if (C <= 128) execute more efficient code */ 02686 jg L10251 02687 align 16 /* 16 byte alignment of the loop entry */ 02688 L10250: 02689 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 02690 movq mm4, mm3 /* copy MM3 into MM4 */ 02691 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 02692 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 02693 pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */ 02694 pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */ 02695 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 02696 movq [edi], mm3 /* store result in Dest */ 02697 add eax, 8 /* increase Src1 register pointer by 8 */ 02698 add edi, 8 /* increase Dest register pointer by 8 */ 02699 dec ecx /* decrease loop counter */ 02700 jnz L10250 /* check loop termination, proceed if required */ 02701 jmp L10252 02702 align 16 /* 16 byte alignment of the loop entry */ 02703 L10251: 02704 movq mm3, [eax] /* load 8 
bytes from Src1 into MM3 */ 02705 movq mm4, mm3 /* copy MM3 into MM4 */ 02706 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 02707 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 02708 pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */ 02709 pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */ 02710 /* ** Take abs value of the results (signed words) ** */ 02711 movq mm5, mm3 /* copy mm3 into mm5 */ 02712 movq mm6, mm4 /* copy mm4 into mm6 */ 02713 psraw mm5, 15 /* fill mm5 words with word sign bit */ 02714 psraw mm6, 15 /* fill mm6 words with word sign bit */ 02715 pxor mm3, mm5 /* take 1's compliment of only neg words */ 02716 pxor mm4, mm6 /* take 1's compliment of only neg words */ 02717 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 02718 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */ 02719 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 02720 movq [edi], mm3 /* store result in Dest */ 02721 add eax, 8 /* increase Src1 register pointer by 8 */ 02722 add edi, 8 /* increase Dest register pointer by 8 */ 02723 dec ecx /* decrease loop counter */ 02724 jnz L10251 /* check loop termination, proceed if required */ 02725 L10252: 02726 emms /* exit MMX state */ 02727 popa 02728 } 02729 #else 02730 /* i386 and x86_64 */ 02731 __m64 *mSrc1 = (__m64*)Src1; 02732 __m64 *mDest = (__m64*)Dest; 02733 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 02734 /* Duplicate C in 4 words of MM1 */ 02735 int i; 02736 i = C | C<<16; 02737 __m64 mm1 = _m_from_int(i); 02738 __m64 mm2 = _m_from_int(i); 02739 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */ 02740 // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48; 02741 //__m64 mm1 = _m_from_int64(lli); // x86_64 only 02742 if (C <= 128) { /* if (C <= 128) execute more efficient code */ 02743 for (i = 0; i < SrcLength/8; i++) { 02744 __m64 mm3, mm4; 02745 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 
into words */ 02746 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 02747 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */ 02748 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */ 02749 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 02750 mSrc1++; 02751 mDest++; 02752 } 02753 } else { 02754 for (i = 0; i < SrcLength/8; i++) { 02755 __m64 mm3, mm4, mm5, mm6; 02756 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 02757 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 02758 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */ 02759 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */ 02760 /* Take abs value of the results (signed words) */ 02761 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */ 02762 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */ 02763 mm3 = _m_pxor(mm3, mm5); /* take 1's compliment of only neg. words */ 02764 mm4 = _m_pxor(mm4, mm6); /* take 1's compliment of only neg. words */ 02765 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */ 02766 mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. 
words, W-(-1) or W-0 */ 02767 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 02768 mSrc1++; 02769 mDest++; 02770 } 02771 } 02772 _m_empty(); /* clean MMX state */ 02773 #endif 02774 return (0); 02775 #else 02776 return (-1); 02777 #endif 02778 } 02779 02790 int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C) 02791 { 02792 unsigned int i, istart; 02793 int iC; 02794 unsigned char *cursrc1; 02795 unsigned char *curdest; 02796 int result; 02797 02798 /* Validate input parameters */ 02799 if ((Src1 == NULL) || (Dest == NULL)) 02800 return(-1); 02801 if (length == 0) 02802 return(0); 02803 02804 /* Special case: C==1 */ 02805 if (C == 1) { 02806 memcpy(Src1, Dest, length); 02807 return (0); 02808 } 02809 02810 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02811 02812 SDL_imageFilterMultByByteMMX(Src1, Dest, length, C); 02813 02814 /* Check for unaligned bytes */ 02815 if ((length & 7) > 0) { 02816 /* Setup to process unaligned bytes */ 02817 istart = length & 0xfffffff8; 02818 cursrc1 = &Src1[istart]; 02819 curdest = &Dest[istart]; 02820 } else { 02821 /* No unaligned bytes - we are done */ 02822 return (0); 02823 } 02824 } else { 02825 /* Setup to process whole image */ 02826 istart = 0; 02827 cursrc1 = Src1; 02828 curdest = Dest; 02829 } 02830 02831 /* C routine to process image */ 02832 iC = (int) C; 02833 for (i = istart; i < length; i++) { 02834 result = (int) *cursrc1 * iC; 02835 if (result > 255) 02836 result = 255; 02837 *curdest = (unsigned char) result; 02838 /* Advance pointers */ 02839 cursrc1++; 02840 curdest++; 02841 } 02842 02843 return (0); 02844 } 02845 02857 static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N, 02858 unsigned char C) 02859 { 02860 #ifdef USE_MMX 02861 #if !defined(GCC__) 02862 __asm 02863 { 02864 pusha 02865 /* ** Duplicate C in 4 words of MM1 ** */ 02866 mov 
al, C /* load C into AL */ 02867 xor ah, ah /* zero AH */ 02868 mov bx, ax /* copy AX into BX */ 02869 shl eax, 16 /* shift 2 bytes of EAX left */ 02870 mov ax, bx /* copy BX into AX */ 02871 movd mm1, eax /* copy EAX into MM1 */ 02872 movd mm2, eax /* copy EAX into MM2 */ 02873 punpckldq mm1, mm2 /* fill higher words of MM1 with C */ 02874 xor ecx, ecx /* zero ECX */ 02875 mov cl, N /* load N into CL */ 02876 movd mm7, ecx /* copy N into MM7 */ 02877 pxor mm0, mm0 /* zero MM0 register */ 02878 mov eax, Src1 /* load Src1 address into eax */ 02879 mov edi, Dest /* load Dest address into edi */ 02880 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 02881 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 02882 align 16 /* 16 byte alignment of the loop entry */ 02883 L1026: 02884 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 02885 movq mm4, mm3 /* copy MM3 into MM4 */ 02886 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 02887 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 02888 psrlw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */ 02889 psrlw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */ 02890 pmullw mm3, mm1 /* mul low bytes of SrcDest by MM1 */ 02891 pmullw mm4, mm1 /* mul high bytes of SrcDest by MM1 */ 02892 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 02893 movq [edi], mm3 /* store result in Dest */ 02894 add eax, 8 /* increase Src1 register pointer by 8 */ 02895 add edi, 8 /* increase Dest register pointer by 8 */ 02896 dec ecx /* decrease loop counter */ 02897 jnz L1026 /* check loop termination, proceed if required */ 02898 emms /* exit MMX state */ 02899 popa 02900 } 02901 #else 02902 /* i386 and x86_64 */ 02903 __m64 *mSrc1 = (__m64*)Src1; 02904 __m64 *mDest = (__m64*)Dest; 02905 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 02906 /* Duplicate C in 4 words of MM1 */ 02907 int i; 02908 i = (C<<16)|C; 02909 __m64 mm1 = _m_from_int(i); 02910 __m64 mm2 = 
_m_from_int(i); 02911 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */ 02912 for (i = 0; i < SrcLength/8; i++) { 02913 __m64 mm3, mm4, mm5, mm6; 02914 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 02915 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 02916 mm3 = _m_psrlwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the right */ 02917 mm4 = _m_psrlwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the right */ 02918 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */ 02919 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */ 02920 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 02921 mSrc1++; 02922 mDest++; 02923 } 02924 _m_empty(); /* clean MMX state */ 02925 #endif 02926 return (0); 02927 #else 02928 return (-1); 02929 #endif 02930 } 02931 02943 int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N, 02944 unsigned char C) 02945 { 02946 unsigned int i, istart; 02947 int iC; 02948 unsigned char *cursrc1; 02949 unsigned char *curdest; 02950 int result; 02951 02952 /* Validate input parameters */ 02953 if ((Src1 == NULL) || (Dest == NULL)) 02954 return(-1); 02955 if (length == 0) 02956 return(0); 02957 02958 /* Check shift */ 02959 if (N > 8) { 02960 return (-1); 02961 } 02962 02963 /* Special case: N==0 && C==1 */ 02964 if ((N == 0) && (C == 1)) { 02965 memcpy(Src1, Dest, length); 02966 return (0); 02967 } 02968 02969 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 02970 02971 SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C); 02972 02973 /* Check for unaligned bytes */ 02974 if ((length & 7) > 0) { 02975 /* Setup to process unaligned bytes */ 02976 istart = length & 0xfffffff8; 02977 cursrc1 = &Src1[istart]; 02978 curdest = &Dest[istart]; 02979 } else { 02980 /* No unaligned bytes - we are done */ 02981 return (0); 02982 } 02983 } else { 02984 /* 
Setup to process whole image */ 02985 istart = 0; 02986 cursrc1 = Src1; 02987 curdest = Dest; 02988 } 02989 02990 /* C routine to process image */ 02991 iC = (int) C; 02992 for (i = istart; i < length; i++) { 02993 result = (int) (*cursrc1 >> N) * iC; 02994 if (result > 255) 02995 result = 255; 02996 *curdest = (unsigned char) result; 02997 /* Advance pointers */ 02998 cursrc1++; 02999 curdest++; 03000 } 03001 03002 return (0); 03003 } 03004 03016 static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N, 03017 unsigned char *Mask) 03018 { 03019 #ifdef USE_MMX 03020 #if !defined(GCC__) 03021 __asm 03022 { 03023 pusha 03024 mov edx, Mask /* load Mask address into edx */ 03025 movq mm0, [edx] /* load Mask into mm0 */ 03026 xor ecx, ecx /* zero ECX */ 03027 mov cl, N /* load loop counter (N) into CL */ 03028 movd mm3, ecx /* copy (N) into MM3 */ 03029 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */ 03030 L10270: /* ** Prepare proper bit-Mask in MM1 ** */ 03031 psllw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the left */ 03032 pand mm1, mm0 // apply Mask to 8 BYTES of MM1 */ 03033 /* byte 0x0f, 0xdb, 0xc8 */ 03034 dec cl /* decrease loop counter */ 03035 jnz L10270 /* check loop termination, proceed if required */ 03036 /* ** Shift all bytes of the image ** */ 03037 mov eax, Src1 /* load Src1 address into eax */ 03038 mov edi, Dest /* load SrcDest address into edi */ 03039 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03040 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03041 align 16 /* 16 byte alignment of the loop entry */ 03042 L10271: 03043 movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */ 03044 psllw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the left */ 03045 pand mm0, mm1 // apply proper bit-Mask to 8 BYTES of MM0 */ 03046 /* byte 0x0f, 0xdb, 0xc1 */ 03047 movq [edi], mm0 /* store result in Dest */ 03048 add eax, 8 /* increase Src1 register pointer by 8 */ 03049 add edi, 
8 /* increase Dest register pointer by 8 */ 03050 dec ecx /* decrease loop counter */ 03051 jnz L10271 /* check loop termination, proceed if required */ 03052 emms /* exit MMX state */ 03053 popa 03054 } 03055 #else 03056 /* i386 and x86_64 */ 03057 __m64 *mSrc1 = (__m64*)Src1; 03058 __m64 *mDest = (__m64*)Dest; 03059 __m64 *mMask = (__m64*)Mask; 03060 __m64 mm1; 03061 int i; 03062 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 03063 /* Prepare proper bit-Mask in MM1 */ 03064 for (i = 0; i < N; i++) { 03065 mm1 = _m_psllwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the left */ 03066 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */ 03067 } 03068 /* ** Shift all bytes of the image ** */ 03069 for (i = 0; i < SrcLength/8; i++) { 03070 __m64 mm0 = _m_psllwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the left */ 03071 *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */ 03072 mSrc1++; 03073 mDest++; 03074 } 03075 _m_empty(); /* clean MMX state */ 03076 #endif 03077 return (0); 03078 #else 03079 return (-1); 03080 #endif 03081 } 03082 03093 int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 03094 { 03095 static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE }; 03096 unsigned int i, istart; 03097 unsigned char *cursrc1, *curdest; 03098 int result; 03099 03100 /* Validate input parameters */ 03101 if ((Src1 == NULL) || (Dest == NULL)) 03102 return(-1); 03103 if (length == 0) 03104 return(0); 03105 03106 if (N > 8) { 03107 return (-1); 03108 } 03109 03110 /* Special case: N==0 */ 03111 if (N == 0) { 03112 memcpy(Src1, Dest, length); 03113 return (0); 03114 } 03115 03116 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 03117 03118 SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask); 03119 03120 /* Check for unaligned bytes */ 03121 if ((length & 7) > 0) { 03122 /* Setup to process unaligned bytes */ 03123 istart = length & 
0xfffffff8; 03124 cursrc1 = &Src1[istart]; 03125 curdest = &Dest[istart]; 03126 } else { 03127 /* No unaligned bytes - we are done */ 03128 return (0); 03129 } 03130 } else { 03131 /* Setup to process whole image */ 03132 istart = 0; 03133 cursrc1 = Src1; 03134 curdest = Dest; 03135 } 03136 03137 /* C routine to process image */ 03138 for (i = istart; i < length; i++) { 03139 result = ((int) *cursrc1 << N) & 0xff; 03140 *curdest = (unsigned char) result; 03141 /* Advance pointers */ 03142 cursrc1++; 03143 curdest++; 03144 } 03145 03146 return (0); 03147 } 03148 03159 static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N) 03160 { 03161 #ifdef USE_MMX 03162 #if !defined(GCC__) 03163 __asm 03164 { 03165 pusha 03166 mov eax, Src1 /* load Src1 address into eax */ 03167 mov edi, Dest /* load Dest address into edi */ 03168 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03169 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03170 align 16 /* 16 byte alignment of the loop entry */ 03171 L12023: 03172 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */ 03173 pslld mm0, N /* MM0=SrcDest+C (add 8 bytes with saturation) */ 03174 movq [edi], mm0 /* store result in SrcDest */ 03175 add eax, 8 /* increase Src1 register pointer by 8 */ 03176 add edi, 8 /* increase Dest register pointer by 8 */ 03177 dec ecx /* decrease loop counter */ 03178 jnz L12023 /* check loop termination, proceed if required */ 03179 emms /* exit MMX state */ 03180 popa 03181 } 03182 #else 03183 /* i386 and x86_64 */ 03184 __m64 *mSrc1 = (__m64*)Src1; 03185 __m64 *mDest = (__m64*)Dest; 03186 int i; 03187 for (i = 0; i < SrcLength/8; i++) { 03188 *mDest = _m_pslldi(*mSrc1, N); /* Src1+C (add 8 bytes with saturation) */ 03189 mSrc1++; 03190 mDest++; 03191 } 03192 _m_empty(); /* clean MMX state */ 03193 #endif 03194 return (0); 03195 #else 03196 return (-1); 03197 #endif 03198 } 03199 03210 int 
SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 03211 { 03212 unsigned int i, istart; 03213 unsigned char *cursrc1, *curdest; 03214 unsigned int *icursrc1, *icurdest; 03215 unsigned int result; 03216 03217 /* Validate input parameters */ 03218 if ((Src1 == NULL) || (Dest == NULL)) 03219 return(-1); 03220 if (length == 0) 03221 return(0); 03222 03223 if (N > 32) { 03224 return (-1); 03225 } 03226 03227 /* Special case: N==0 */ 03228 if (N == 0) { 03229 memcpy(Src1, Dest, length); 03230 return (0); 03231 } 03232 03233 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 03234 03235 SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N); 03236 03237 /* Check for unaligned bytes */ 03238 if ((length & 7) > 0) { 03239 /* Setup to process unaligned bytes */ 03240 istart = length & 0xfffffff8; 03241 cursrc1 = &Src1[istart]; 03242 curdest = &Dest[istart]; 03243 } else { 03244 /* No unaligned bytes - we are done */ 03245 return (0); 03246 } 03247 } else { 03248 /* Setup to process whole image */ 03249 istart = 0; 03250 cursrc1 = Src1; 03251 curdest = Dest; 03252 } 03253 03254 /* C routine to process image */ 03255 icursrc1=(unsigned int *)cursrc1; 03256 icurdest=(unsigned int *)curdest; 03257 for (i = istart; i < length; i += 4) { 03258 if ((i+4)<length) { 03259 result = ((unsigned int)*icursrc1 << N); 03260 *icurdest = result; 03261 } 03262 /* Advance pointers */ 03263 icursrc1++; 03264 icurdest++; 03265 } 03266 03267 return (0); 03268 } 03269 03280 static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N) 03281 { 03282 #ifdef USE_MMX 03283 #if !defined(GCC__) 03284 __asm 03285 { 03286 pusha 03287 xor eax, eax /* zero EAX */ 03288 mov al, N /* load N into AL */ 03289 movd mm7, eax /* copy N into MM7 */ 03290 pxor mm0, mm0 /* zero MM0 register */ 03291 mov eax, Src1 /* load Src1 address into eax */ 03292 mov edi, Dest /* load Dest address into edi */ 
03293 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */ 03294 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */ 03295 cmp al, 7 /* if (N <= 7) execute more efficient code */ 03296 jg L10281 03297 align 16 /* 16 byte alignment of the loop entry */ 03298 L10280: 03299 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03300 movq mm4, mm3 /* copy MM3 into MM4 */ 03301 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 03302 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 03303 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */ 03304 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */ 03305 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 03306 movq [edi], mm3 /* store result in Dest */ 03307 add eax, 8 /* increase Src1 register pointer by 8 */ 03308 add edi, 8 /* increase Dest register pointer by 8 */ 03309 dec ecx /* decrease loop counter */ 03310 jnz L10280 /* check loop termination, proceed if required */ 03311 jmp L10282 03312 align 16 /* 16 byte alignment of the loop entry */ 03313 L10281: 03314 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */ 03315 movq mm4, mm3 /* copy MM3 into MM4 */ 03316 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */ 03317 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */ 03318 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */ 03319 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */ 03320 /* ** Take abs value of the signed words ** */ 03321 movq mm5, mm3 /* copy mm3 into mm5 */ 03322 movq mm6, mm4 /* copy mm4 into mm6 */ 03323 psraw mm5, 15 /* fill mm5 words with word sign bit */ 03324 psraw mm6, 15 /* fill mm6 words with word sign bit */ 03325 pxor mm3, mm5 /* take 1's compliment of only neg words */ 03326 pxor mm4, mm6 /* take 1's compliment of only neg words */ 03327 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */ 03328 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 
*/ 03329 packuswb mm3, mm4 /* pack words back into bytes with saturation */ 03330 movq [edi], mm3 /* store result in Dest */ 03331 add eax, 8 /* increase Src1 register pointer by 8 */ 03332 add edi, 8 /* increase Dest register pointer by 8 */ 03333 dec ecx /* decrease loop counter */ 03334 jnz L10281 /* check loop termination, proceed if required */ 03335 L10282: 03336 emms /* exit MMX state */ 03337 popa 03338 } 03339 #else 03340 /* i386 and x86_64 */ 03341 __m64 *mSrc1 = (__m64*)Src1; 03342 __m64 *mDest = (__m64*)Dest; 03343 __m64 mm0 = _m_from_int(0); /* zero mm0 register */ 03344 int i; 03345 if (N <= 7) { /* if (N <= 7) execute more efficient code */ 03346 for (i = 0; i < SrcLength/8; i++) { 03347 __m64 mm3, mm4; 03348 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 03349 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 03350 mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */ 03351 mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */ 03352 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 03353 mSrc1++; 03354 mDest++; 03355 } 03356 } else { 03357 for (i = 0; i < SrcLength/8; i++) { 03358 __m64 mm3, mm4, mm5, mm6; 03359 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */ 03360 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */ 03361 mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */ 03362 mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */ 03363 /* Take abs value of the signed words */ 03364 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */ 03365 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */ 03366 mm3 = _m_pxor(mm3, mm5); /* take 1's compliment of only neg. words */ 03367 mm4 = _m_pxor(mm4, mm6); /* take 1's compliment of only neg. words */ 03368 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. 
words, W-(-1) or W-0 */ 03369 mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */ 03370 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */ 03371 mSrc1++; 03372 mDest++; 03373 } 03374 } 03375 _m_empty(); /* clean MMX state */ 03376 #endif 03377 return (0); 03378 #else 03379 return (-1); 03380 #endif 03381 } 03382 03393 int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N) 03394 { 03395 unsigned int i, istart; 03396 unsigned char *cursrc1, *curdest; 03397 int result; 03398 03399 /* Validate input parameters */ 03400 if ((Src1 == NULL) || (Dest == NULL)) 03401 return(-1); 03402 if (length == 0) 03403 return(0); 03404 03405 if (N > 8) { 03406 return (-1); 03407 } 03408 03409 /* Special case: N==0 */ 03410 if (N == 0) { 03411 memcpy(Src1, Dest, length); 03412 return (0); 03413 } 03414 03415 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 03416 03417 SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N); 03418 03419 /* Check for unaligned bytes */ 03420 if ((length & 7) > 0) { 03421 /* Setup to process unaligned bytes */ 03422 istart = length & 0xfffffff8; 03423 cursrc1 = &Src1[istart]; 03424 curdest = &Dest[istart]; 03425 } else { 03426 /* No unaligned bytes - we are done */ 03427 return (0); 03428 } 03429 } else { 03430 /* Setup to process whole image */ 03431 istart = 0; 03432 cursrc1 = Src1; 03433 curdest = Dest; 03434 } 03435 03436 /* C routine to process image */ 03437 for (i = istart; i < length; i++) { 03438 result = (int) *cursrc1 << N; 03439 if (result > 255) 03440 result = 255; 03441 *curdest = (unsigned char) result; 03442 /* Advance pointers */ 03443 cursrc1++; 03444 curdest++; 03445 } 03446 03447 return (0); 03448 } 03449 03460 static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T) 03461 { 03462 #ifdef USE_MMX 03463 #if !defined(GCC__) 03464 __asm 03465 { 03466 pusha 03467 
/* ** Duplicate T in 8 bytes of MM3 ** */
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		pcmpeqb mm2, mm2	/* generate all 1's in mm2 */
		mov al, T	/* load T into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm3, eax	/* copy EAX into MM3 */
		movd mm4, eax	/* copy EAX into MM4 */
		punpckldq mm3, mm4	/* fill higher bytes of MM3 with T */
		psubusb mm2, mm3	/* store 0xFF - T in MM2 */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1029:
		movq mm0, [eax]	/* load 8 bytes from SrcDest into MM0 */
		paddusb mm0, mm2	/* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
		pcmpeqb mm0, mm1	/* binarize 255:0, comparing to 255 */
		movq [edi], mm0	/* store result in SrcDest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L1029	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* Duplicate T in 8 bytes of MM3 */
	/* NOTE(review): mm1/mm2 appear in their own initializers; pcmpeqb(x,x)
	   always yields all 1's regardless of input, so this works in practice,
	   but the uninitialized read is technically undefined - confirm. */
	__m64 mm1 = _m_pcmpeqb(mm1, mm1);	/* generate all 1's in mm1 */
	__m64 mm2 = _m_pcmpeqb(mm2, mm2);	/* generate all 1's in mm2 */
	int i;
	memset(&i, T, 4);	/* replicate T into all 4 bytes of i */
	__m64 mm3 = _m_from_int(i);
	__m64 mm4 = _m_from_int(i);
	mm3 = _m_punpckldq(mm3, mm4);	/* fill higher bytes of MM3 with T */
	mm2 = _m_psubusb(mm2, mm3);	/* store 0xFF - T in MM2 */
	//__m64 mm3 = _m_from_int64(lli); // x86_64 only
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0 = _m_paddusb(*mSrc1, mm2);	/* Src1+(0xFF-T) (add 8 bytes with saturation) */
		*mDest = _m_pcmpeqb(mm0, mm1);	/* binarize 255:0, comparing to 255 */
		mSrc1++;
		mDest++;
	}
	_m_empty();	/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255 : 0.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param T The threshold byte value.

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
	unsigned int i, istart;
	unsigned char *cursrc1;
	unsigned char *curdest;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: T==0 - every byte satisfies S >= 0, so all output is 255 */
	if (T == 0) {
		memset(Dest, 255, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		*curdest = (unsigned char)(((unsigned char)*cursrc1 >= T) ? 255 : 0);
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S).

Implemented as three saturating byte operations:
add (0xFF-Tmax), subtract (0xFF-Tmax+Tmin), add Tmin.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param Tmin Lower clip boundary.
\param Tmax Upper clip boundary.

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
	unsigned char Tmax)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		/* ** Duplicate Tmax in 8 bytes of MM3 ** */
		mov al, Tmax	/* load Tmax into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm3, eax	/* copy EAX into MM3 */
		movd mm4, eax	/* copy EAX into MM4 */
		punpckldq mm3, mm4	/* fill higher bytes of MM3 with Tmax */
		psubusb mm1, mm3	/* store 0xFF - Tmax in MM1 */
		/* ** Duplicate Tmin in 8 bytes of MM5 ** */
		mov al, Tmin	/* load Tmin into AL */
		mov ah, al	/* copy AL into AH */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm5, eax	/* copy EAX into MM5 */
		movd mm4, eax	/* copy EAX into MM4 */
		punpckldq mm5, mm4	/* fill higher bytes of MM5 with Tmin */
		movq mm7, mm5	/* copy MM5 into MM7 */
		paddusb mm7, mm1	/* store 0xFF - Tmax + Tmin in MM7 */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1030:
		movq mm0, [eax]	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm1	/* MM0=SrcDest+(0xFF-Tmax) */
		psubusb mm0, mm7	/* MM0=MM0-(0xFF-Tmax+Tmin) */
		paddusb mm0, mm5	/* MM0=MM0+Tmin */
		movq [edi], mm0	/* store result in Dest */
		add eax, 8	/* increase
Src1 register pointer by 8 */ 03637 add edi, 8 /* increase Dest register pointer by 8 */ 03638 dec ecx /* decrease loop counter */ 03639 jnz L1030 /* check loop termination, proceed if required */ 03640 emms /* exit MMX state */ 03641 popa 03642 } 03643 #else 03644 /* i386 and x86_64 */ 03645 __m64 *mSrc1 = (__m64*)Src1; 03646 __m64 *mDest = (__m64*)Dest; 03647 __m64 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */ 03648 int i; 03649 /* Duplicate Tmax in 8 bytes of MM3 */ 03650 __m64 mm3, mm4; 03651 memset(&i, Tmax, 4); 03652 mm3 = _m_from_int(i); 03653 mm4 = _m_from_int(i); 03654 mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with Tmax */ 03655 mm1 = _m_psubusb(mm1, mm3); /* store 0xFF - Tmax in MM1 */ 03656 //__m64 mm3 = _m_from_int64(lli); // x86_64 only 03657 /* Duplicate Tmax in 8 bytes of MM3 */ 03658 __m64 mm5, mm7; 03659 memset(&i, Tmin, 4); 03660 mm5 = _m_from_int(i); 03661 mm4 = _m_from_int(i); 03662 mm5 = _m_punpckldq(mm5, mm4); /* fill higher bytes of MM5 with Tmin */ 03663 mm7 = _m_paddusb(mm5, mm1); /* store 0xFF - Tmax + Tmin in MM7 */ 03664 for (i = 0; i < SrcLength/8; i++) { 03665 __m64 mm0; 03666 mm0 = _m_paddusb(*mSrc1, mm1); /* MM0=Src1+(0xFF-Tmax) */ 03667 mm0 = _m_psubusb(mm0, mm7); /* MM0=MM0-(0xFF-Tmax+Tmin) */ 03668 *mDest = _m_paddusb(mm0, mm5); /* MM0+Tmin */ 03669 mSrc1++; 03670 mDest++; 03671 } 03672 _m_empty(); /* clean MMX state */ 03673 #endif 03674 return (0); 03675 #else 03676 return (-1); 03677 #endif 03678 } 03679 03691 int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin, 03692 unsigned char Tmax) 03693 { 03694 unsigned int i, istart; 03695 unsigned char *cursrc1; 03696 unsigned char *curdest; 03697 03698 /* Validate input parameters */ 03699 if ((Src1 == NULL) || (Dest == NULL)) 03700 return(-1); 03701 if (length == 0) 03702 return(0); 03703 03704 /* Special case: Tmin==0 && Tmax = 255 */ 03705 if ((Tmin == 0) && (Tmax == 25)) { 03706 memcpy(Src1, 
Dest, length); 03707 return (0); 03708 } 03709 03710 if ((SDL_imageFilterMMXdetect()) && (length > 7)) { 03711 03712 SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax); 03713 03714 /* Check for unaligned bytes */ 03715 if ((length & 7) > 0) { 03716 /* Setup to process unaligned bytes */ 03717 istart = length & 0xfffffff8; 03718 cursrc1 = &Src1[istart]; 03719 curdest = &Dest[istart]; 03720 } else { 03721 /* No unaligned bytes - we are done */ 03722 return (0); 03723 } 03724 } else { 03725 /* Setup to process whole image */ 03726 istart = 0; 03727 cursrc1 = Src1; 03728 curdest = Dest; 03729 } 03730 03731 /* C routine to process image */ 03732 for (i = istart; i < length; i++) { 03733 if (*cursrc1 < Tmin) { 03734 *curdest = Tmin; 03735 } else if (*cursrc1 > Tmax) { 03736 *curdest = Tmax; 03737 } else { 03738 *curdest = *cursrc1; 03739 } 03740 /* Advance pointers */ 03741 cursrc1++; 03742 curdest++; 03743 } 03744 03745 return (0); 03746 } 03747 03761 static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax, 03762 int Nmin, int Nmax) 03763 { 03764 #ifdef USE_MMX 03765 #if !defined(GCC__) 03766 __asm 03767 { 03768 pusha 03769 mov ax, WORD PTR Nmax /* load Nmax in AX */ 03770 mov bx, WORD PTR Cmax /* load Cmax in BX */ 03771 sub ax, WORD PTR Nmin /* AX = Nmax - Nmin */ 03772 sub bx, WORD PTR Cmin /* BX = Cmax - Cmin */ 03773 jz L10311 /* check division by zero */ 03774 xor dx, dx /* prepare for division, zero DX */ 03775 div bx /* AX = AX/BX */ 03776 jmp L10312 03777 L10311: 03778 mov ax, 255 /* if div by zero, assume result max byte value */ 03779 L10312: /* ** Duplicate AX in 4 words of MM0 ** */ 03780 mov bx, ax /* copy AX into BX */ 03781 shl eax, 16 /* shift 2 bytes of EAX left */ 03782 mov ax, bx /* copy BX into AX */ 03783 movd mm0, eax /* copy EAX into MM0 */ 03784 movd mm1, eax /* copy EAX into MM1 */ 03785 punpckldq mm0, mm1 /* fill higher words of MM0 with AX */ 03786 /* ** 
Duplicate Cmin in 4 words of MM1 ** */
		mov ax, WORD PTR Cmin	/* load Cmin into AX */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm1, eax	/* copy EAX into MM1 */
		movd mm2, eax	/* copy EAX into MM2 */
		punpckldq mm1, mm2	/* fill higher words of MM1 with Cmin */
		/* ** Duplicate Nmin in 4 words of MM2 ** */
		mov ax, WORD PTR Nmin	/* load Nmin into AX */
		mov bx, ax	/* copy AX into BX */
		shl eax, 16	/* shift 2 bytes of EAX left */
		mov ax, bx	/* copy BX into AX */
		movd mm2, eax	/* copy EAX into MM2 */
		movd mm3, eax	/* copy EAX into MM3 */
		punpckldq mm2, mm3	/* fill higher words of MM2 with Nmin */
		pxor mm7, mm7	/* zero MM7 register */
		mov eax, Src1	/* load Src1 address into eax */
		mov edi, Dest	/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
		align 16	/* 16 byte alignment of the loop entry */
L1031:
		movq mm3, [eax]	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3	/* copy MM3 into MM4 */
		punpcklbw mm3, mm7	/* unpack low bytes of SrcDest into words */
		punpckhbw mm4, mm7	/* unpack high bytes of SrcDest into words */
		psubusb mm3, mm1	/* S-Cmin, low bytes */
		psubusb mm4, mm1	/* S-Cmin, high bytes */
		pmullw mm3, mm0	/* MM0*(S-Cmin), low bytes */
		pmullw mm4, mm0	/* MM0*(S-Cmin), high bytes */
		paddusb mm3, mm2	/* MM0*(S-Cmin)+Nmin, low bytes */
		paddusb mm4, mm2	/* MM0*(S-Cmin)+Nmin, high bytes */
		/* ** Take abs value of the signed words ** */
		movq mm5, mm3	/* copy mm3 into mm5 */
		movq mm6, mm4	/* copy mm4 into mm6 */
		psraw mm5, 15	/* fill mm5 words with word sign bit */
		psraw mm6, 15	/* fill mm6 words with word sign bit */
		pxor mm3, mm5	/* take 1's compliment of only neg words */
		pxor mm4, mm6	/* take 1's compliment of only neg words */
		psubsw mm3, mm5	/* add 1 to only neg words, W-(-1) or W-0 */
		psubsw mm4, mm6	/* add 1 to only neg words, W-(-1) or W-0 */
		packuswb mm3, mm4	/* pack words back into bytes with saturation */
		movq [edi], mm3	/* store result in Dest */
		add eax, 8	/* increase Src1 register pointer by 8 */
		add edi, 8	/* increase Dest register pointer by 8 */
		dec ecx	/* decrease loop counter */
		jnz L1031	/* check loop termination, proceed if required */
		emms	/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0, mm1, mm2, mm3;

	int i;
	/* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
	/* NOTE(review): the scale factor is truncated to an unsigned short and to
	   an integer quotient, mirroring the 16-bit div in the asm path - confirm
	   this matches the intended precision. */
	unsigned short a = Nmax - Nmin;
	unsigned short b = Cmax - Cmin;
	if (b == 0) {
		a = 255;	/* if div by zero, assume result max byte value */
	} else {
		a /= b;
	}
	i = (a<<16)|a;
	mm0 = _m_from_int(i);
	mm1 = _m_from_int(i);
	mm0 = _m_punpckldq(mm0, mm1);	/* fill higher words of MM0 with the factor */
	/* Duplicate Cmin in 4 words of MM1 */
	i = (Cmin<<16)|(short)Cmin;
	mm1 = _m_from_int(i);
	mm2 = _m_from_int(i);
	mm1 = _m_punpckldq(mm1, mm2);	/* fill higher words of MM1 with Cmin */
	/* Duplicate Nmin in 4 words of MM2 */
	i = (Nmin<<16)|(short)Nmin;
	mm2 = _m_from_int(i);
	mm3 = _m_from_int(i);
	mm2 = _m_punpckldq(mm2, mm3);	/* fill higher words of MM2 with Nmin */
	__m64 mm7 = _m_from_int(0);	/* zero mm7 register */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm3, mm4, mm5, mm6;
		mm3 = _m_punpcklbw(*mSrc1, mm7);	/* unpack low bytes of Src1 into words */
		mm4 = _m_punpckhbw(*mSrc1, mm7);	/* unpack high bytes of Src1 into words */
		mm3 = _m_psubusb(mm3, mm1);	/* S-Cmin, low bytes */
		mm4 = _m_psubusb(mm4, mm1);	/* S-Cmin, high bytes */
		mm3 = _m_pmullw(mm3, mm0);	/* MM0*(S-Cmin), low bytes */
		mm4 = _m_pmullw(mm4, mm0);	/* MM0*(S-Cmin), high bytes */
		mm3 = _m_paddusb(mm3, mm2);	/* MM0*(S-Cmin)+Nmin, low bytes */
		mm4 = _m_paddusb(mm4, mm2);	/* MM0*(S-Cmin)+Nmin, high bytes */
		/* Take abs value of the signed words */
		mm5 = _m_psrawi(mm3, 15);	/* fill mm5 words with word sign bit */
		mm6 = _m_psrawi(mm4, 15);	/* fill mm6 words with word sign bit */
		mm3 = _m_pxor(mm3, mm5);	/* take 1's compliment of only neg. words */
		mm4 = _m_pxor(mm4, mm6);	/* take 1's compliment of only neg. words */
		mm3 = _m_psubsw(mm3, mm5);	/* add 1 to only neg. words, W-(-1) or W-0 */
		mm4 = _m_psubsw(mm4, mm6);	/* add 1 to only neg. words, W-(-1) or W-0 */
		*mDest = _m_packuswb(mm3, mm4);	/* pack words back into bytes with saturation */
		mSrc1++;
		mDest++;
	}
	_m_empty();	/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using NormalizeLinear: D = saturation255((Nmax-Nmin)/(Cmax-Cmin)*(S-Cmin)+Nmin).

\param Src Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Cmin Normalization constant (current minimum).
\param Cmax Normalization constant (current maximum).
\param Nmin Normalization constant (new minimum).
\param Nmax Normalization constant (new maximum).

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
	int Nmax)
{
	unsigned int i, istart;
	unsigned char *cursrc;
	unsigned char *curdest;
	int dN, dC, factor;
	int result;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc = &Src[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc = Src;
		curdest = Dest;
	}
	/* C routine to process image */
	dC = Cmax - Cmin;
	if (dC == 0)
		return (0);	/* degenerate range: nothing to scale */
	dN = Nmax - Nmin;
	factor = dN / dC;	/* integer scale factor, matches MMX path precision */
	for (i = istart; i < length; i++) {
		result = factor * ((int) (*cursrc) - Cmin) + Nmin;
		/* NOTE(review): only the upper bound is clamped; a negative result
		   wraps on the unsigned char store - confirm intended. */
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc++;
		curdest++;
	}

	return (0);
}

/* ------------------------------------------------------------------------------------ */

/*!
\brief Filter using ConvolveKernel3x3Divide: Dest = (Src * Kernel) / Divisor.

The 2D convolution is applied within a 1-pixel border; edge pixels are not
written. MMX-only: requires USE_MMX and i386 at compile time.
NOTE(review): when the preprocessor guards are not satisfied, the
MMX-detected branch falls through and returns 0 without writing Dest.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >2.
\param Kernel The 2D convolution kernel of size 3x3 (packed as 4 shorts per row).
\param Divisor The divisor of the convolution sum. Must be >0.

\return Returns 1(!!) for success or 0(!!) for error.
*/
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
	signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 3) || (rows < 3) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0	/* zero MM0 */
			xor ebx, ebx	/* zero EBX */
			mov bl, Divisor	/* load Divisor into BL */
			mov edx, Kernel	/* load Kernel address into EDX */
			movq mm5, [edx]	/* MM5 = {0,K2,K1,K0} */
			add edx, 8	/* second row |K0 K1 K2 0| */
			movq mm6, [edx]	/* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
			add edx, 8	/* third row |K6 K7 K8 0| */
			movq mm7, [edx]	/* MM7 = {0,K8,K7,K6} */
			/* ---, */
			mov eax, columns	/* load columns into EAX */
			mov esi, Src	/* ESI = Src row 0 address */
			mov edi, Dest	/* load Dest address to EDI */
			add edi, eax	/* EDI = EDI + columns */
			inc edi	/* 1 byte offset from the left edge */
			mov edx, rows	/* initialize ROWS counter */
			sub edx, 2	/* do not use first and last row */
			/* ---, */
L10320:
			mov ecx, eax	/* initialize COLUMS counter */
			sub ecx, 2	/* do not use first and last column */
			align 16	/* 16 byte alignment of the loop entry */
L10322:
			/* ---, */
			movq mm1, [esi]	/* load 8 bytes of the image first row */
			add esi, eax	/* move one row below */
			movq mm2, [esi]	/* load 8 bytes of the image second row */
			add esi, eax	/* move one row below */
			movq mm3, [esi]	/* load 8 bytes of the image third row */
			punpcklbw mm1, mm0	/* unpack first 4 bytes into words */
			punpcklbw mm2, mm0	/* unpack first 4 bytes into words */
			punpcklbw mm3, mm0	/* unpack first 4 bytes into words */
			pmullw mm1, mm5	/* multiply words first row image*Kernel */
			pmullw mm2, mm6	/* multiply words second row image*Kernel */
			pmullw mm3, mm7	/* multiply words third row image*Kernel */
			paddsw mm1, mm2	/* add 4 words of the first and second rows */
			paddsw mm1, mm3	/* add 4 words of the third row and result */
			movq mm2, mm1	/* copy MM1 into MM2 */
			psrlq mm1, 32	/* shift 2 left words to the right */
			paddsw mm1, mm2	/* add 2 left and 2 right result words */
			movq mm3, mm1	/* copy MM1 into MM3 */
			psrlq mm1, 16	/* shift 1 left word to the right */
			paddsw mm1, mm3	/* add 1 left and 1 right result words */
			/* --, */
			movd mm2, eax	/* save EAX in MM2 */
			movd mm3, edx	/* save EDX in MM3 */
			movd eax, mm1	/* copy MM1 into EAX */
			psraw mm1, 15	/* spread sign bit of the result */
			movd edx, mm1	/* fill EDX with a sign bit */
			idiv bx	/* IDIV - VERY EXPENSIVE */
			movd mm1, eax	/* move result of division into MM1 */
			packuswb mm1, mm0	/* pack division result with saturation */
			movd eax, mm1	/* copy saturated result into EAX */
			mov [edi], al	/* copy a byte result into Dest */
			movd edx, mm3	/* restore saved EDX */
			movd eax, mm2	/* restore saved EAX */
			/* --, */
			sub esi, eax	/* move two rows up */
			sub esi, eax	/* */
			inc esi	/* move Src pointer to the next pixel */
			inc edi	/* move Dest pointer to the next pixel */
			/* ---, */
			dec ecx	/* decrease loop counter COLUMNS */
			jnz L10322	/* check loop termination, proceed if required */
			add esi, 2	/* move to the next row in Src */
			add edi, 2	/* move to the next row in Dest */
			dec edx	/* decrease loop counter ROWS */
			jnz L10320	/* check loop termination, proceed if required */
			/* ---, */
			emms	/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
			/* --- */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
			"sub          $2, %%edx \n\t"	/* do not use first and last row */
			/* --- */
			".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMS counter */
			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10322:               \n\t"
			/* --- */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
			"add       %%eax, %%esi \n\t"	/* move one row below */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
			"add       %%eax, %%esi \n\t"	/* move one row below */
			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row image*Kernel */
			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row image*Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
			/* -- */
			"movd      %%eax, %%mm2 \n\t"	/* save EAX in MM2 */
			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
			"movd      %%mm1, %%eax \n\t"	/* copy MM1 into EAX */
			"psraw       $15, %%mm1 \n\t"	/* spread sign bit of the result */
			"movd      %%mm1, %%edx \n\t"	/* fill EDX with a sign bit */
			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
			"movd      %%eax, %%mm1 \n\t"	/* move result of division into MM1 */
			"packuswb  %%mm0, %%mm1 \n\t"	/* pack division result with saturation */
			"movd      %%mm1, %%eax \n\t"	/* copy saturated result into EAX */
			"mov       %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
			"movd      %%mm2, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"sub       %%eax, %%esi \n\t"	/* move two rows up */
			"sub       %%eax, %%esi \n\t"	/* */
			"inc              %%esi \n\t"	/* move Src pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10322 \n\t"	/* check loop termination, proceed if required */
			"add          $2, %%esi \n\t"	/* move to the next row in Src */
			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10320 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(Divisor)		/* %5 */
			);
#endif
#endif
		return (0);
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel5x5Divide: Dest = (Src * Kernel) / Divisor.

The 2D convolution is applied within a 2-pixel border; edge pixels are not
written. MMX-only: requires USE_MMX and i386 at compile time.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >4.
\param columns Number of columns in source/destination array. Must be >4.
\param Kernel The 2D convolution kernel of size 5x5 (packed as 8 shorts per row).
\param Divisor The divisor of the convolution sum. Must be >0.

\return Returns 1(!!) for success or 0(!!) for error.
*/
int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
	signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 5) || (rows < 5) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0	/* zero MM0 */
			xor ebx, ebx	/* zero EBX */
			mov bl, Divisor	/* load Divisor into BL */
			movd mm5, ebx	/* copy Divisor into MM5 */
			mov edx, Kernel	/* load Kernel address into EDX */
			mov esi, Src	/* load Src address to ESI */
			mov edi, Dest	/* load Dest address to EDI */
			add edi, 2	/* 2 column offset from the left edge */
			mov eax, columns	/* load columns into EAX */
			shl eax, 1	/* EAX = columns * 2 */
			add edi, eax	/* 2 row offset from the top edge */
			shr eax, 1	/* EAX = columns */
			mov ebx, rows	/* initialize ROWS
counter */ 04200 sub ebx, 4 /* do not use first 2 and last 2 rows */ 04201 /* ---, */ 04202 L10330: 04203 mov ecx, eax /* initialize COLUMNS counter */ 04204 sub ecx, 4 /* do not use first 2 and last 2 columns */ 04205 align 16 /* 16 byte alignment of the loop entry */ 04206 L10332: 04207 pxor mm7, mm7 /* zero MM7 (accumulator) */ 04208 movd mm6, esi /* save ESI in MM6 */ 04209 /* --- 1 */ 04210 movq mm1, [esi] /* load 8 bytes of the Src */ 04211 movq mm2, mm1 /* copy MM1 into MM2 */ 04212 add esi, eax /* move Src pointer 1 row below */ 04213 movq mm3, [edx] /* load 4 words of Kernel */ 04214 add edx, 8 /* move pointer to other 4 words */ 04215 movq mm4, [edx] /* load 4 words of Kernel */ 04216 add edx, 8 /* move pointer to other 4 words */ 04217 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04218 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04219 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04220 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04221 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04222 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04223 /* --- 2 */ 04224 movq mm1, [esi] /* load 8 bytes of the Src */ 04225 movq mm2, mm1 /* copy MM1 into MM2 */ 04226 add esi, eax /* move Src pointer 1 row below */ 04227 movq mm3, [edx] /* load 4 words of Kernel */ 04228 add edx, 8 /* move pointer to other 4 words */ 04229 movq mm4, [edx] /* load 4 words of Kernel */ 04230 add edx, 8 /* move pointer to other 4 words */ 04231 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04232 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04233 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04234 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04235 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04236 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04237 /* --- 3 */ 04238 movq mm1, [esi] /* load 8 bytes of the Src */ 04239 movq mm2, mm1 /* copy MM1 into MM2 */ 04240 
add esi, eax /* move Src pointer 1 row below */ 04241 movq mm3, [edx] /* load 4 words of Kernel */ 04242 add edx, 8 /* move pointer to other 4 words */ 04243 movq mm4, [edx] /* load 4 words of Kernel */ 04244 add edx, 8 /* move pointer to other 4 words */ 04245 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04246 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04247 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04248 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04249 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04250 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04251 /* --- 4 */ 04252 movq mm1, [esi] /* load 8 bytes of the Src */ 04253 movq mm2, mm1 /* copy MM1 into MM2 */ 04254 add esi, eax /* move Src pointer 1 row below */ 04255 movq mm3, [edx] /* load 4 words of Kernel */ 04256 add edx, 8 /* move pointer to other 4 words */ 04257 movq mm4, [edx] /* load 4 words of Kernel */ 04258 add edx, 8 /* move pointer to other 4 words */ 04259 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04260 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04261 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04262 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04263 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04264 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04265 /* --- 5 */ 04266 movq mm1, [esi] /* load 8 bytes of the Src */ 04267 movq mm2, mm1 /* copy MM1 into MM2 */ 04268 movq mm3, [edx] /* load 4 words of Kernel */ 04269 add edx, 8 /* move pointer to other 4 words */ 04270 movq mm4, [edx] /* load 4 words of Kernel */ 04271 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04272 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04273 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 04274 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 04275 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04276 paddsw mm7, mm1 /* add 
MM1 to accumulator MM7 */ 04277 /* ---, */ 04278 movq mm3, mm7 /* copy MM7 into MM3 */ 04279 psrlq mm7, 32 /* shift 2 left words to the right */ 04280 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 04281 movq mm2, mm7 /* copy MM7 into MM2 */ 04282 psrlq mm7, 16 /* shift 1 left word to the right */ 04283 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 04284 /* ---, */ 04285 movd mm1, eax /* save EDX in MM1 */ 04286 movd mm2, ebx /* save EDX in MM2 */ 04287 movd mm3, edx /* save EDX in MM3 */ 04288 movd eax, mm7 /* load summation result into EAX */ 04289 psraw mm7, 15 /* spread sign bit of the result */ 04290 movd ebx, mm5 /* load Divisor into EBX */ 04291 movd edx, mm7 /* fill EDX with a sign bit */ 04292 idiv bx /* IDIV - VERY EXPENSIVE */ 04293 movd mm7, eax /* move result of division into MM7 */ 04294 packuswb mm7, mm0 /* pack division result with saturation */ 04295 movd eax, mm7 /* copy saturated result into EAX */ 04296 mov [edi], al /* copy a byte result into Dest */ 04297 movd edx, mm3 /* restore saved EDX */ 04298 movd ebx, mm2 /* restore saved EBX */ 04299 movd eax, mm1 /* restore saved EAX */ 04300 /* --, */ 04301 movd esi, mm6 /* move Src pointer to the top pixel */ 04302 sub edx, 72 /* EDX = Kernel address */ 04303 inc esi /* move Src pointer to the next pixel */ 04304 inc edi /* move Dest pointer to the next pixel */ 04305 /* ---, */ 04306 dec ecx /* decrease loop counter COLUMNS */ 04307 jnz L10332 /* check loop termination, proceed if required */ 04308 add esi, 4 /* move to the next row in Src */ 04309 add edi, 4 /* move to the next row in Dest */ 04310 dec ebx /* decrease loop counter ROWS */ 04311 jnz L10330 /* check loop termination, proceed if required */ 04312 /* ---, */ 04313 emms /* exit MMX state */ 04314 popa 04315 } 04316 #else 04317 asm volatile 04318 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 04319 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 04320 "mov %5, %%bl \n\t" /* load Divisor into BL */ 04321 "movd %%ebx, 
%%mm5 \n\t" /* copy Divisor into MM5 */ 04322 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 04323 "mov %1, %%esi \n\t" /* load Src address to ESI */ 04324 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 04325 "add $2, %%edi \n\t" /* 2 column offset from the left edge */ 04326 "mov %3, %%eax \n\t" /* load columns into EAX */ 04327 "shl $1, %%eax \n\t" /* EAX = columns * 2 */ 04328 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */ 04329 "shr $1, %%eax \n\t" /* EAX = columns */ 04330 "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 04331 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */ 04332 /* --- */ 04333 ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 04334 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */ 04335 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 04336 ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 04337 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 04338 /* --- 1 */ 04339 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04340 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04341 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04342 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04343 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04344 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04345 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04346 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04347 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04348 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04349 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04350 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04351 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04352 /* --- 2 */ 04353 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04354 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04355 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04356 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04357 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04358 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04359 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04360 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04361 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04362 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04363 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04364 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04365 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04366 /* --- 3 */ 04367 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04368 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04369 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04370 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04371 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04372 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04373 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04374 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04375 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04376 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04377 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04378 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04379 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04380 /* --- 4 */ 04381 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04382 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04383 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 04384 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04385 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04386 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04387 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04388 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04389 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04390 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04391 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 04392 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04393 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04394 /* --- 5 */ 04395 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 04396 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 04397 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 04398 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 04399 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 04400 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 04401 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 04402 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 04403 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 04404 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 04405 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 04406 /* --- */ 04407 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 04408 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 04409 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 04410 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 04411 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 04412 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 04413 /* --- */ 04414 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 04415 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 04416 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 04417 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 04418 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 04419 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 04420 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 04421 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 04422 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 04423 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 04424 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 04425 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 04426 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 04427 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 04428 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 04429 /* -- */ 04430 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 04431 "sub $72, %%edx \n\t" /* EDX = Kernel address */ 04432 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 04433 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 04434 /* --- */ 04435 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 04436 "jnz .L10332 \n\t" /* check loop termination, proceed if required */ 04437 "add $4, %%esi \n\t" /* move to the next row in Src */ 
04438 "add $4, %%edi \n\t" /* move to the next row in Dest */
04439 "dec %%ebx \n\t" /* decrease loop counter ROWS */
04440 "jnz .L10330 \n\t" /* check loop termination, proceed if required */
04441 /* --- */
04442 "emms \n\t" /* exit MMX state */
04443 "popa \n\t":"=m" (Dest) /* %0 */
04444 :"m"(Src), /* %1 */
04445 "m"(rows), /* %2 */
04446 "m"(columns), /* %3 */
04447 "m"(Kernel), /* %4 */
04448 "m"(Divisor) /* %5 */
04449 );
04450 #endif
04451 #endif
04452 return (0);
04453 } else {
04454 /* No non-MMX implementation yet */
04455 return (-1);
04456 }
04457 }
04458
/*!
\brief Filter using ConvolveKernel7x7Divide: Dest = (Src * Kernel) / Divisor.

Convolves the byte image Src with the 7x7 signed-short Kernel, dividing each
accumulated sum by Divisor and saturating the result to [0..255] before it is
stored in Dest. The outermost 3-pixel border of Dest is not written (column
offset 3 and three row offsets below, loop counters reduced by 6). MMX-only:
returns -1 when MMX is unavailable, any pointer is NULL, columns/rows < 7, or
Divisor == 0; returns 0 on success.
*/
04473 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04474 signed short *Kernel, unsigned char Divisor)
04475 {
04476 /* Validate input parameters */
04477 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04478 return(-1);
04479
04480 if ((columns < 7) || (rows < 7) || (Divisor == 0))
04481 return (-1);
04482
04483 if ((SDL_imageFilterMMXdetect())) {
/* NOTE(review): when USE_MMX or i386 is not defined, both asm variants compile
   out, yet this branch still falls through to return (0) as if the convolution
   had run - confirm this "silent success" is intended. */
04484 //#ifdef USE_MMX
04485 #if defined(USE_MMX) && defined(i386)
04486 #if !defined(GCC__)
04487 __asm
04488 {
04489 pusha
04490 pxor mm0, mm0 /* zero MM0 */
04491 xor ebx, ebx /* zero EBX */
04492 mov bl, Divisor /* load Divisor into BL */
04493 movd mm5, ebx /* copy Divisor into MM5 */
04494 mov edx, Kernel /* load Kernel address into EDX */
04495 mov esi, Src /* load Src address to ESI */
04496 mov edi, Dest /* load Dest address to EDI */
04497 add edi, 3 /* 3 column offset from the left edge */
04498 mov eax, columns /* load columns into EAX */
04499 add edi, eax /* 3 row offset from the top edge */
04500 add edi, eax
04501 add edi, eax
04502 mov ebx, rows /* initialize ROWS counter */
04503 sub ebx, 6 /* do not use first 3 and last 3 rows */
04504 /* ---, */
04505 L10340:
04506 mov ecx, eax /* initialize COLUMNS counter */
04507 sub ecx, 6 /* do not use first 3 and last 3 columns */
04508 align 16 /* 16 byte alignment of the loop entry */
04509 L10342:
04510 pxor mm7, mm7 /* zero MM7 (accumulator) */
04511 movd mm6, esi /* save ESI in MM6 */
04512 /* --- 1 */
04513 movq mm1, [esi] /* load 8 bytes of the Src */
04514 movq mm2, mm1 /* copy MM1 into MM2 */
04515 add esi, eax /* move Src pointer 1 row below */
04516 movq mm3, [edx] /* load 4 words of Kernel */
04517 add edx, 8 /* move pointer to other 4 words */
04518 movq mm4, [edx] /* load 4 words of Kernel */
04519 add edx, 8 /* move pointer to other 4 words */
04520 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04521 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04522 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04523 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04524 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04525 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04526 /* --- 2 */
04527 movq mm1, [esi] /* load 8 bytes of the Src */
04528 movq mm2, mm1 /* copy MM1 into MM2 */
04529 add esi, eax /* move Src pointer 1 row below */
04530 movq mm3, [edx] /* load 4 words of Kernel */
04531 add edx, 8 /* move pointer to other 4 words */
04532 movq mm4, [edx] /* load 4 words of Kernel */
04533 add edx, 8 /* move pointer to other 4 words */
04534 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04535 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04536 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04537 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04538 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04539 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04540 /* --- 3 */
04541 movq mm1, [esi] /* load 8 bytes of the Src */
04542 movq mm2, mm1 /* copy MM1 into MM2 */
04543 add esi, eax /* move Src pointer 1 row below */
04544 movq mm3, [edx] /* load 4 words of Kernel */
04545 add edx, 8 /* move pointer to other 4 words */
04546 movq mm4, [edx] /* load 4 words of Kernel */
04547 add edx, 8 /* move pointer to other 4 words */
04548 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04549 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04550 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04551 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04552 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04553 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04554 /* --- 4 */
04555 movq mm1, [esi] /* load 8 bytes of the Src */
04556 movq mm2, mm1 /* copy MM1 into MM2 */
04557 add esi, eax /* move Src pointer 1 row below */
04558 movq mm3, [edx] /* load 4 words of Kernel */
04559 add edx, 8 /* move pointer to other 4 words */
04560 movq mm4, [edx] /* load 4 words of Kernel */
04561 add edx, 8 /* move pointer to other 4 words */
04562 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04563 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04564 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04565 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04566 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04567 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04568 /* --- 5 */
04569 movq mm1, [esi] /* load 8 bytes of the Src */
04570 movq mm2, mm1 /* copy MM1 into MM2 */
04571 add esi, eax /* move Src pointer 1 row below */
04572 movq mm3, [edx] /* load 4 words of Kernel */
04573 add edx, 8 /* move pointer to other 4 words */
04574 movq mm4, [edx] /* load 4 words of Kernel */
04575 add edx, 8 /* move pointer to other 4 words */
04576 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04577 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04578 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04579 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04580 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04581 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04582 /* --- 6 */
04583 movq mm1, [esi] /* load 8 bytes of the Src */
04584 movq mm2, mm1 /* copy MM1 into MM2 */
04585 add esi, eax /* move Src pointer 1 row below */
04586 movq mm3, [edx] /* load 4 words of Kernel */
04587 add edx, 8 /* move pointer to other 4 words */
04588 movq mm4, [edx] /* load 4 words of Kernel */
04589 add edx, 8 /* move pointer to other 4 words */
04590 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04591 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04592 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04593 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04594 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04595 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04596 /* --- 7 (last row: Src pointer not advanced, Kernel pointer left past end) */
04597 movq mm1, [esi] /* load 8 bytes of the Src */
04598 movq mm2, mm1 /* copy MM1 into MM2 */
04599 movq mm3, [edx] /* load 4 words of Kernel */
04600 add edx, 8 /* move pointer to other 4 words */
04601 movq mm4, [edx] /* load 4 words of Kernel */
04602 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
04603 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
04604 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
04605 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
04606 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
04607 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
04608 /* ---, horizontal reduction of the 4 partial sums in MM7 */
04609 movq mm3, mm7 /* copy MM7 into MM3 */
04610 psrlq mm7, 32 /* shift 2 left words to the right */
04611 paddsw mm7, mm3 /* add 2 left and 2 right result words */
04612 movq mm2, mm7 /* copy MM7 into MM2 */
04613 psrlq mm7, 16 /* shift 1 left word to the right */
04614 paddsw mm7, mm2 /* add 1 left and 1 right result words */
04615 /* ---, divide the sum by Divisor (spill EAX/EBX/EDX to MM regs around idiv) */
04616 movd mm1, eax /* save EAX in MM1 */
04617 movd mm2, ebx /* save EBX in MM2 */
04618 movd mm3, edx /* save EDX in MM3 */
04619 movd eax, mm7 /* load summation result into EAX */
04620 psraw mm7, 15 /* spread sign bit of the result */
04621 movd ebx, mm5 /* load Divisor into EBX */
04622 movd edx, mm7 /* fill EDX with a sign bit */
04623 idiv bx /* IDIV - VERY EXPENSIVE */
04624 movd mm7, eax /* move result of division into MM7 */
04625 packuswb mm7, mm0 /* pack division result with saturation */
04626 movd eax, mm7 /* copy saturated result into EAX */
04627 mov [edi], al /* copy a byte result into Dest */
04628 movd edx, mm3 /* restore saved EDX */
04629 movd ebx, mm2 /* restore saved EBX */
04630 movd eax, mm1 /* restore saved EAX */
04631 /* --, */
04632 movd esi, mm6 /* move Src pointer to the top pixel */
04633 sub edx, 104 /* EDX = Kernel address (rewind 6*16 + 8 bytes consumed above) */
04634 inc esi /* move Src pointer to the next pixel */
04635 inc edi /* move Dest pointer to the next pixel */
04636 /* ---, */
04637 dec ecx /* decrease loop counter COLUMNS */
04638 jnz L10342 /* check loop termination, proceed if required */
04639 add esi, 6 /* move to the next row in Src */
04640 add edi, 6 /* move to the next row in Dest */
04641 dec ebx /* decrease loop counter ROWS */
04642 jnz L10340 /* check loop termination, proceed if required */
04643 /* ---, */
04644 emms /* exit MMX state */
04645 popa
04646 }
04647 #else
04648 asm volatile
04649 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
04650 "xor %%ebx, %%ebx \n\t" /* zero EBX */
04651 "mov %5, %%bl \n\t" /* load Divisor into BL */
04652 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
04653 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
04654 "mov %1, %%esi \n\t" /* load Src address to ESI */
04655 "mov %0, %%edi \n\t" /* load Dest address to EDI */
04656 "add $3, %%edi \n\t" /* 3 column offset from the left edge */
04657 "mov %3, %%eax \n\t" /* load columns into EAX */
04658 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
04659 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
04660 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
04661 /* --- */
04662 ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
04663 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
04664 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
04665 ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
04666 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
04667 /* --- 1 */
04668 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04669 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04670 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
04671 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04672 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04673 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04674 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04675 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04676 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04677 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04678 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04679 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04680 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04681 /* --- 2 */
04682 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04683 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04684 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
04685 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04686 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04687 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04688 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04689 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04690 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04691 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04692 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04693 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04694 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04695 /* --- 3 */
04696 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04697 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04698 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
04699 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04700 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04701 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04702 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04703 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04704 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04705 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04706 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04707 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04708 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04709 /* --- 4 */
04710 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04711 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04712 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
04713 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04714 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04715 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04716 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04717 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04718 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04719 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04720 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04721 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04722 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04723 /* --- 5 */
04724 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04725 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04726 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
04727 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04728 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04729 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04730 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04731 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04732 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04733 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04734 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04735 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04736 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04737 /* --- 6 */
04738 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04739 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04740 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
04741 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04742 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04743 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04744 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04745 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04746 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04747 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04748 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04749 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04750 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04751 /* --- 7 (last row: Src pointer not advanced, Kernel pointer left past end) */
04752 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
04753 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
04754 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
04755 "add $8, %%edx \n\t" /* move pointer to other 4 words */
04756 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
04757 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
04758 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
04759 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
04760 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
04761 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
04762 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
04763 /* --- horizontal reduction of the 4 partial sums in MM7 */
04764 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
04765 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
04766 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
04767 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
04768 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
04769 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
04770 /* --- divide the sum by Divisor (spill EAX/EBX/EDX to MM regs around idiv) */
04771 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
04772 "movd %%ebx, %%mm2 \n\t" /* save EBX in MM2 */
04773 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
04774 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
04775 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
04776 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
04777 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
04778 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
04779 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
04780 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
04781 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
04782 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
04783 "movd %%mm3, %%edx \n\t" /* restore saved EDX */
04784 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
04785 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
04786 /* -- */
04787 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
04788 "sub $104, %%edx \n\t" /* EDX = Kernel address (rewind 6*16 + 8 bytes consumed above) */
04789 "inc %%esi \n\t" /* move Src pointer to the next pixel */
04790 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
04791 /* --- */
04792 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
04793 "jnz .L10342 \n\t" /* check loop termination, proceed if required */
04794 "add $6, %%esi \n\t" /* move to the next row in Src */
04795 "add $6, %%edi \n\t" /* move to the next row in Dest */
04796 "dec %%ebx \n\t" /* decrease loop counter ROWS */
04797 "jnz .L10330 \n\t" /* check loop termination, proceed if required */
04798 /* --- */
04799 "emms \n\t" /* exit MMX state */
04800 "popa \n\t":"=m" (Dest) /* %0 */
04801 :"m"(Src), /* %1 */
04802 "m"(rows), /* %2 */
04803 "m"(columns), /* %3 */
04804 "m"(Kernel), /* %4 */
04805 "m"(Divisor) /* %5 */
04806 );
04807 #endif
04808 #endif
04809 return (0);
04810 } else {
04811 /* No non-MMX implementation yet */
04812 return (-1);
04813 }
04814 }
04815
04830 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04831 signed short *Kernel, unsigned char Divisor)
04832 {
04833 /* Validate input parameters */
04834 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04835 return(-1);
04836
04837 if ((columns < 9) || (rows < 9) || (Divisor == 0))
04838 return (-1);
04839
04840 if ((SDL_imageFilterMMXdetect())) {
04841 //#ifdef USE_MMX
04842 #if defined(USE_MMX) && defined(i386)
04843 #if !defined(GCC__)
04844 __asm
04845 {
04846 pusha
04847 pxor mm0, mm0 /* zero MM0 */
04848 xor ebx, ebx /* zero EBX */
04849 mov bl, Divisor /* load Divisor
into BL */ 04850 movd mm5, ebx /* copy Divisor into MM5 */ 04851 mov edx, Kernel /* load Kernel address into EDX */ 04852 mov esi, Src /* load Src address to ESI */ 04853 mov edi, Dest /* load Dest address to EDI */ 04854 add edi, 4 /* 4 column offset from the left edge */ 04855 mov eax, columns /* load columns into EAX */ 04856 add edi, eax /* 4 row offset from the top edge */ 04857 add edi, eax 04858 add edi, eax 04859 add edi, eax 04860 mov ebx, rows /* initialize ROWS counter */ 04861 sub ebx, 8 /* do not use first 4 and last 4 rows */ 04862 /* ---, */ 04863 L10350: 04864 mov ecx, eax /* initialize COLUMNS counter */ 04865 sub ecx, 8 /* do not use first 4 and last 4 columns */ 04866 align 16 /* 16 byte alignment of the loop entry */ 04867 L10352: 04868 pxor mm7, mm7 /* zero MM7 (accumulator) */ 04869 movd mm6, esi /* save ESI in MM6 */ 04870 /* --- 1 */ 04871 movq mm1, [esi] /* load 8 bytes of the Src */ 04872 movq mm2, mm1 /* copy MM1 into MM2 */ 04873 inc esi /* move pointer to the next 8 bytes of Src */ 04874 movq mm3, [edx] /* load 4 words of Kernel */ 04875 add edx, 8 /* move pointer to other 4 words */ 04876 movq mm4, [edx] /* load 4 words of Kernel */ 04877 add edx, 8 /* move pointer to other 4 words */ 04878 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04879 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04880 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04881 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04882 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04883 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04884 movq mm1, [esi] /* load 8 bytes of the Src */ 04885 dec esi 04886 add esi, eax /* move Src pointer 1 row below */ 04887 movq mm3, [edx] /* load 4 words of Kernel */ 04888 add edx, 8 /* move pointer to other 4 words */ 04889 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04890 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 04891 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04892 /* --- 2 */ 04893 movq mm1, [esi] /* load 8 bytes of the Src */ 04894 movq mm2, mm1 /* copy MM1 into MM2 */ 04895 inc esi /* move pointer to the next 8 bytes of Src */ 04896 movq mm3, [edx] /* load 4 words of Kernel */ 04897 add edx, 8 /* move pointer to other 4 words */ 04898 movq mm4, [edx] /* load 4 words of Kernel */ 04899 add edx, 8 /* move pointer to other 4 words */ 04900 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04901 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04902 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04903 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04904 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04905 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04906 movq mm1, [esi] /* load 8 bytes of the Src */ 04907 dec esi 04908 add esi, eax /* move Src pointer 1 row below */ 04909 movq mm3, [edx] /* load 4 words of Kernel */ 04910 add edx, 8 /* move pointer to other 4 words */ 04911 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04912 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04913 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04914 /* --- 3 */ 04915 movq mm1, [esi] /* load 8 bytes of the Src */ 04916 movq mm2, mm1 /* copy MM1 into MM2 */ 04917 inc esi /* move pointer to the next 8 bytes of Src */ 04918 movq mm3, [edx] /* load 4 words of Kernel */ 04919 add edx, 8 /* move pointer to other 4 words */ 04920 movq mm4, [edx] /* load 4 words of Kernel */ 04921 add edx, 8 /* move pointer to other 4 words */ 04922 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04923 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04924 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04925 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 04926 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04927 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04928 movq mm1, [esi] /* load 8 bytes of the Src */ 04929 dec esi 04930 add esi, eax /* move Src pointer 1 row below */ 04931 movq mm3, [edx] /* load 4 words of Kernel */ 04932 add edx, 8 /* move pointer to other 4 words */ 04933 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04934 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04935 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04936 /* --- 4 */ 04937 movq mm1, [esi] /* load 8 bytes of the Src */ 04938 movq mm2, mm1 /* copy MM1 into MM2 */ 04939 inc esi /* move pointer to the next 8 bytes of Src */ 04940 movq mm3, [edx] /* load 4 words of Kernel */ 04941 add edx, 8 /* move pointer to other 4 words */ 04942 movq mm4, [edx] /* load 4 words of Kernel */ 04943 add edx, 8 /* move pointer to other 4 words */ 04944 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04945 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04946 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04947 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04948 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04949 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04950 movq mm1, [esi] /* load 8 bytes of the Src */ 04951 dec esi 04952 add esi, eax /* move Src pointer 1 row below */ 04953 movq mm3, [edx] /* load 4 words of Kernel */ 04954 add edx, 8 /* move pointer to other 4 words */ 04955 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04956 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 04957 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04958 /* --- 5 */ 04959 movq mm1, [esi] /* load 8 bytes of the Src */ 04960 movq mm2, mm1 /* copy MM1 into MM2 */ 04961 inc esi /* move pointer to the next 8 bytes of Src */ 04962 movq mm3, [edx] /* load 4 words of Kernel */ 04963 add edx, 8 /* move pointer to other 4 words */ 04964 movq mm4, [edx] /* load 4 words of Kernel */ 04965 add edx, 8 /* move pointer to other 4 words */ 04966 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04967 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04968 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04969 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 04970 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04971 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04972 movq mm1, [esi] /* load 8 bytes of the Src */ 04973 dec esi 04974 add esi, eax /* move Src pointer 1 row below */ 04975 movq mm3, [edx] /* load 4 words of Kernel */ 04976 add edx, 8 /* move pointer to other 4 words */ 04977 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04978 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04979 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04980 /* --- 6 */ 04981 movq mm1, [esi] /* load 8 bytes of the Src */ 04982 movq mm2, mm1 /* copy MM1 into MM2 */ 04983 inc esi /* move pointer to the next 8 bytes of Src */ 04984 movq mm3, [edx] /* load 4 words of Kernel */ 04985 add edx, 8 /* move pointer to other 4 words */ 04986 movq mm4, [edx] /* load 4 words of Kernel */ 04987 add edx, 8 /* move pointer to other 4 words */ 04988 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 04989 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 04990 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 04991 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 04992 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 04993 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 04994 movq mm1, [esi] /* load 8 bytes of the Src */ 04995 dec esi 04996 add esi, eax /* move Src pointer 1 row below */ 04997 movq mm3, [edx] /* load 4 words of Kernel */ 04998 add edx, 8 /* move pointer to other 4 words */ 04999 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05000 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05001 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05002 /* --- 7 */ 05003 movq mm1, [esi] /* load 8 bytes of the Src */ 05004 movq mm2, mm1 /* copy MM1 into MM2 */ 05005 inc esi /* move pointer to the next 8 bytes of Src */ 05006 movq mm3, [edx] /* load 4 words of Kernel */ 05007 add edx, 8 /* move pointer to other 4 words */ 05008 movq mm4, [edx] /* load 4 words of Kernel */ 05009 add edx, 8 /* move pointer to other 4 words */ 05010 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05011 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05012 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05013 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05014 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05015 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05016 movq mm1, [esi] /* load 8 bytes of the Src */ 05017 dec esi 05018 add esi, eax /* move Src pointer 1 row below */ 05019 movq mm3, [edx] /* load 4 words of Kernel */ 05020 add edx, 8 /* move pointer to other 4 words */ 05021 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05022 pmullw mm1, mm3 /* mult. 
4 low words of Src and Kernel */ 05023 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05024 /* --- 8 */ 05025 movq mm1, [esi] /* load 8 bytes of the Src */ 05026 movq mm2, mm1 /* copy MM1 into MM2 */ 05027 inc esi /* move pointer to the next 8 bytes of Src */ 05028 movq mm3, [edx] /* load 4 words of Kernel */ 05029 add edx, 8 /* move pointer to other 4 words */ 05030 movq mm4, [edx] /* load 4 words of Kernel */ 05031 add edx, 8 /* move pointer to other 4 words */ 05032 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05033 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05034 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05035 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */ 05036 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05037 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05038 movq mm1, [esi] /* load 8 bytes of the Src */ 05039 dec esi 05040 add esi, eax /* move Src pointer 1 row below */ 05041 movq mm3, [edx] /* load 4 words of Kernel */ 05042 add edx, 8 /* move pointer to other 4 words */ 05043 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05044 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05045 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05046 /* --- 9 */ 05047 movq mm1, [esi] /* load 8 bytes of the Src */ 05048 movq mm2, mm1 /* copy MM1 into MM2 */ 05049 inc esi /* move pointer to the next 8 bytes of Src */ 05050 movq mm3, [edx] /* load 4 words of Kernel */ 05051 add edx, 8 /* move pointer to other 4 words */ 05052 movq mm4, [edx] /* load 4 words of Kernel */ 05053 add edx, 8 /* move pointer to other 4 words */ 05054 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05055 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05056 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05057 pmullw mm2, mm4 /* mult. 
4 high words of Src and Kernel */ 05058 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05059 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05060 movq mm1, [esi] /* load 8 bytes of the Src */ 05061 movq mm3, [edx] /* load 4 words of Kernel */ 05062 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05063 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */ 05064 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05065 /* ---, */ 05066 movq mm3, mm7 /* copy MM7 into MM3 */ 05067 psrlq mm7, 32 /* shift 2 left words to the right */ 05068 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 05069 movq mm2, mm7 /* copy MM7 into MM2 */ 05070 psrlq mm7, 16 /* shift 1 left word to the right */ 05071 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 05072 /* ---, */ 05073 movd mm1, eax /* save EDX in MM1 */ 05074 movd mm2, ebx /* save EDX in MM2 */ 05075 movd mm3, edx /* save EDX in MM3 */ 05076 movd eax, mm7 /* load summation result into EAX */ 05077 psraw mm7, 15 /* spread sign bit of the result */ 05078 movd ebx, mm5 /* load Divisor into EBX */ 05079 movd edx, mm7 /* fill EDX with a sign bit */ 05080 idiv bx /* IDIV - VERY EXPENSIVE */ 05081 movd mm7, eax /* move result of division into MM7 */ 05082 packuswb mm7, mm0 /* pack division result with saturation */ 05083 movd eax, mm7 /* copy saturated result into EAX */ 05084 mov [edi], al /* copy a byte result into Dest */ 05085 movd edx, mm3 /* restore saved EDX */ 05086 movd ebx, mm2 /* restore saved EBX */ 05087 movd eax, mm1 /* restore saved EAX */ 05088 /* --, */ 05089 movd esi, mm6 /* move Src pointer to the top pixel */ 05090 sub edx, 208 /* EDX = Kernel address */ 05091 inc esi /* move Src pointer to the next pixel */ 05092 inc edi /* move Dest pointer to the next pixel */ 05093 /* ---, */ 05094 dec ecx /* decrease loop counter COLUMNS */ 05095 jnz L10352 /* check loop termination, proceed if required */ 05096 add esi, 8 /* move to the next row in Src */ 05097 add edi, 8 /* move 
to the next row in Dest */ 05098 dec ebx /* decrease loop counter ROWS */ 05099 jnz L10350 /* check loop termination, proceed if required */ 05100 /* ---, */ 05101 emms /* exit MMX state */ 05102 popa 05103 } 05104 #else 05105 asm volatile 05106 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 05107 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 05108 "mov %5, %%bl \n\t" /* load Divisor into BL */ 05109 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */ 05110 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 05111 "mov %1, %%esi \n\t" /* load Src address to ESI */ 05112 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 05113 "add $4, %%edi \n\t" /* 4 column offset from the left edge */ 05114 "mov %3, %%eax \n\t" /* load columns into EAX */ 05115 "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */ 05116 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 05117 "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */ 05118 /* --- */ 05119 ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 05120 "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */ 05121 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 05122 ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 05123 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 05124 /* --- 1 */ 05125 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05126 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05127 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05128 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05129 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05130 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05131 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05132 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05133 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05134 "pmullw 
%%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05135 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05136 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05137 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05138 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05139 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05140 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05141 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05142 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05143 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05144 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05145 /* --- 2 */ 05146 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05147 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05148 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05149 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05150 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05151 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05152 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05153 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05154 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05155 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05156 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05157 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05158 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05159 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05160 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05161 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05162 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05163 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05164 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05165 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05166 /* --- 3 */ 05167 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05168 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05169 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05170 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05171 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05172 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05173 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05174 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05175 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05176 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05177 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05178 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05179 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05180 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05181 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05182 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05183 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05184 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05185 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05186 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05187 /* --- 4 */ 05188 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05189 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05190 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05191 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05192 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05193 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05194 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05195 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05196 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05197 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05198 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05199 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05200 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05201 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05202 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05203 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05204 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05205 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05206 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05207 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05208 /* --- 5 */ 05209 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05210 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05211 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05212 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05213 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05214 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05215 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05216 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05217 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05218 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05219 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05220 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05221 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05222 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05223 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05224 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05225 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05226 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05227 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05228 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05229 /* --- 6 */ 05230 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05231 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05232 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05233 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05234 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05235 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05236 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05237 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05238 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05239 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05240 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05241 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05242 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05243 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05244 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05245 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05246 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05247 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05248 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05249 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05250 /* --- 7 */ 05251 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05252 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05253 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05254 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05255 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05256 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05257 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05258 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05259 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05260 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05261 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05262 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05263 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05264 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05265 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05266 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05267 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05268 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05269 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05270 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05271 /* --- 8 */ 05272 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05273 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05274 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05275 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05276 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05277 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05278 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05279 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05280 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05281 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05282 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05283 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05284 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05285 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05286 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05287 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05288 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05289 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05290 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05291 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05292 /* --- 9 */ 05293 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05294 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05295 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */ 05296 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05297 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05298 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05299 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05300 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05301 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05302 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05303 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05304 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05305 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05306 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05307 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05308 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05309 "pmullw %%mm3, %%mm1 \n\t" /* mult. 
4 low words of Src and Kernel */ 05310 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05311 /* --- */ 05312 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 05313 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 05314 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 05315 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 05316 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 05317 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 05318 /* --- */ 05319 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */ 05320 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */ 05321 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */ 05322 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */ 05323 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */ 05324 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */ 05325 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */ 05326 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */ 05327 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */ 05328 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 05329 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 05330 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 05331 "movd %%mm3, %%edx \n\t" /* restore saved EDX */ 05332 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */ 05333 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 05334 /* -- */ 05335 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 05336 "sub $208, %%edx \n\t" /* EDX = Kernel address */ 05337 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05338 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05339 /* --- */ 05340 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05341 "jnz .L10352 \n\t" /* check loop termination, proceed if required */ 05342 "add $8, %%esi \n\t" /* move to the next row in Src */ 05343 "add $8, %%edi \n\t" /* move to the next row in Dest */ 05344 "dec %%ebx 
\n\t" /* decrease loop counter ROWS */ 05345 "jnz .L10350 \n\t" /* check loop termination, proceed if required */ 05346 /* --- */ 05347 "emms \n\t" /* exit MMX state */ 05348 "popa \n\t":"=m" (Dest) /* %0 */ 05349 :"m"(Src), /* %1 */ 05350 "m"(rows), /* %2 */ 05351 "m"(columns), /* %3 */ 05352 "m"(Kernel), /* %4 */ 05353 "m"(Divisor) /* %5 */ 05354 ); 05355 #endif 05356 #endif 05357 return (0); 05358 } else { 05359 /* No non-MMX implementation yet */ 05360 return (-1); 05361 } 05362 } 05363
/**
 * \brief Convolve a byte image with a 3x3 kernel, pre-shifting each source pixel right (MMX path only).
 *
 * For every interior pixel (first/last row and column are left untouched; output is
 * written starting at Dest + columns + 1, i.e. row 1, column 1), computes a 3x3
 * convolution: each of the three source rows is loaded 8 bytes at a time, the low
 * 4 bytes are zero-extended to 16-bit words, shifted right NRightShift bits (psrlw),
 * multiplied word-wise by one 4-word kernel row (pmullw), and the partial products
 * are summed with signed 16-bit saturation (paddsw); the final word is packed to an
 * unsigned byte with saturation (packuswb) and one result byte is stored per
 * inner-loop iteration.
 *
 * Kernel layout: three rows of 4 signed shorts each — |K0 K1 K2 0|, |K3 K4 K5 0|,
 * |K6 K7 K8 0| (see the movq/comment pairs below); the 4th word of each row must
 * be zero so it does not contribute to the products.
 *
 * \param Src         Source byte image (rows x columns), must not be NULL.
 * \param Dest        Destination byte image, same dimensions, must not be NULL.
 * \param rows        Number of rows; must be >= 3.
 * \param columns     Number of columns; must be >= 3.
 * \param Kernel      Pointer to 12 signed shorts laid out as described above; must not be NULL.
 * \param NRightShift Per-pixel right-shift applied before multiplication; must be <= 7.
 *
 * \returns 0 on success, -1 on invalid parameters or when MMX is not detected.
 */
05378 int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, 05379 signed short *Kernel, unsigned char NRightShift) 05380 { 05381 /* Validate input parameters */ 05382 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 05383 return(-1); 05384 05385 if ((columns < 3) || (rows < 3) || (NRightShift > 7)) 05386 return (-1); 05387 05388 if ((SDL_imageFilterMMXdetect())) { 05389 //#ifdef USE_MMX 05390 #if defined(USE_MMX) && defined(i386) 05391 #if !defined(GCC__) 05392 __asm 05393 { 05394 pusha 05395 pxor mm0, mm0 /* zero MM0 */ 05396 xor ebx, ebx /* zero EBX */ 05397 mov bl, NRightShift /* load NRightShift into BL */ 05398 movd mm4, ebx /* copy NRightShift into MM4 */ 05399 mov edx, Kernel /* load Kernel address into EDX */ 05400 movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */ 05401 add edx, 8 /* second row |K0 K1 K2 0| */ 05402 movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */ 05403 add edx, 8 /* third row |K6 K7 K8 0| */ 05404 movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */ 05405 /* ---, */ 05406 mov eax, columns /* load columns into EAX */ 05407 mov esi, Src /* ESI = Src row 0 address */ 05408 mov edi, Dest /* load Dest address to EDI */ 05409 add edi, eax /* EDI = EDI + columns */ 05410 inc edi /* 1 byte offset from the left edge */ 05411 mov edx, rows /* initialize ROWS counter */ 05412 sub edx, 2 /* do not use first and last row */ 05413 /* ---, */ 05414 L10360: 05415 mov ecx, eax /* initialize COLUMS counter */ 05416 sub ecx, 2 /* do not use first and last column */ 05417 align 16 /* 16 byte alignment of the loop entry */ 05418 L10362: 05419 /* ---, */ 05420 movq mm1, [esi] /* load 8 bytes of the image first row */ 05421 add esi, eax /* move one row below */ 05422 movq mm2, [esi] /* load 8 bytes of the image second row */ 05423 add esi, eax /* move one row below */ 05424 movq mm3, [esi] /* load 8 bytes of the image third row */ 05425 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05426 punpcklbw mm2, mm0 /* unpack first 4 bytes into words */ 05427 punpcklbw mm3, mm0 /* unpack first 4 bytes into words */ 05428 psrlw mm1, mm4 /* shift right each pixel NshiftRight times */ 05429 psrlw mm2, mm4 /* shift right each pixel NshiftRight times */ 05430 psrlw mm3, mm4 /* shift right each pixel NshiftRight times */ 05431 pmullw mm1, mm5 /* multiply words first row image*Kernel */ 05432 pmullw mm2, mm6 /* multiply words second row image*Kernel */ 05433 pmullw mm3, mm7 /* multiply words third row image*Kernel */ 05434 paddsw mm1, mm2 /* add 4 words of the first and second rows */ 05435 paddsw mm1, mm3 /* add 4 words of the third row and result */ 05436 movq mm2, mm1 /* copy MM1 into MM2 */ 05437 psrlq mm1, 32 /* shift 2 left words to the right */ 05438 paddsw mm1, mm2 /* add 2 left and 2 right result words */ 05439 movq mm3, mm1 /* copy MM1 into MM3 */ 05440 psrlq mm1, 16 /* shift 1 left word to the right */ 05441 paddsw mm1, mm3 /* add 1 left and 1 right result words */ 05442 packuswb mm1, mm0 /* pack shift result with saturation */ 05443 movd ebx, mm1 /* copy saturated result into EBX */ 05444 mov [edi], bl /* copy a byte result into Dest */ 05445 /* --, */ 05446 sub esi, eax /* move two rows up */ 05447 sub esi, eax 05448 inc esi /* move Src pointer to the next pixel */ 05449 inc edi /* move Dest pointer to the next pixel */ 05450 /* ---, */ 05451 dec ecx /* decrease loop counter COLUMNS */ 05452 jnz L10362 /* check loop termination, proceed if required */ 05453 add esi, 2 /* move to the next row in Src */ 05454 add edi, 2 /* move to the next row in Dest */ 05455 dec edx /* decrease loop counter ROWS */ 05456 jnz L10360 /* check loop termination, proceed if required */ 05457 /* ---, */ 05458 emms /* exit MMX state */ 05459 popa 05460 } 05461 #else 05462 asm volatile 05463 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 05464 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 05465 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 05466 "movd %%ebx, %%mm4 \n\t" /* copy NRightShift into MM4 */ 05467 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 05468 "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */ 05469 "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */ 05470 "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */ 05471 "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */ 05472 "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */ 05473 /* --- */ 05474 "mov %3, %%eax \n\t" /* load columns into EAX */ 05475 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */ 05476 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 05477 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */ 05478 "inc %%edi \n\t" /* 1 byte offset from the left edge */ 05479 "mov %2, %%edx \n\t" /* initialize ROWS counter */ 05480 "sub $2, %%edx \n\t" /* do not use first and last row */ 05481 /* --- */ 05482 ".L10360: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */ 05483 "sub $2, %%ecx \n\t" /* do not use first and last column */ 05484 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 05485 ".L10362: \n\t" 05486 /* --- */ 05487 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */ 05488 "add %%eax, %%esi \n\t" /* move one row below */ 05489 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */ 05490 "add %%eax, %%esi \n\t" /* move one row below */ 05491 "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */ 05492 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05493 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */ 05494 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */ 05495 "psrlw %%mm4, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05496 "psrlw %%mm4, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05497 "psrlw %%mm4, %%mm3 \n\t" /* shift right each pixel NshiftRight times */ 05498 "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */ 05499 "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */ 05500 "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */ 05501 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */ 05502 "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */ 05503 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05504 "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */ 05505 "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */ 05506 "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */ 05507 "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */ 05508 "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */ 05509 "packuswb %%mm0, %%mm1 \n\t" /* pack shift result with saturation */ 05510 "movd %%mm1, %%ebx \n\t" /* copy saturated result into EBX */ 05511 "mov %%bl, (%%edi) \n\t" /* copy a byte result into Dest */ 05512 /* -- */ 05513 "sub %%eax, %%esi \n\t" /* move two rows up */ 05514 "sub %%eax, %%esi \n\t" "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05515 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05516 /* --- */ 05517 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05518 "jnz .L10362 \n\t" /* check loop termination, proceed if required */ 05519 "add $2, %%esi \n\t" /* move to the next row in Src */ 05520 "add $2, %%edi \n\t" /* move to the next row in Dest */ 05521 "dec %%edx \n\t" /* decrease loop counter ROWS */ 05522 "jnz .L10360 \n\t" /* check loop termination, proceed if required */ 05523 /* --- */ 05524 "emms \n\t" /* exit MMX state */ 05525 "popa \n\t":"=m" (Dest) /* %0 */ 05526 :"m"(Src), /* %1 */ 05527 "m"(rows), /* %2 */ 05528 "m"(columns), /* %3 */ 05529 "m"(Kernel), /* %4 */ 05530 "m"(NRightShift) /* %5 */ 05531 ); 05532 #endif 05533 #endif
/* NOTE(review): when the MMX asm above is compiled out (non-i386 or !USE_MMX)
   but SDL_imageFilterMMXdetect() still reports MMX, this falls through and
   returns 0 (success) without ever writing to Dest -- confirm whether a
   portable C fallback is intended here. */
05534 return (0); 05535 } else { 05536 /* No non-MMX implementation yet */ 05537 return (-1); 05538 } 05539 } 05540 05555 int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, 05556 signed short *Kernel, unsigned char NRightShift) 05557 { 05558 /* Validate input parameters */ 05559 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 05560 return(-1); 05561 05562 if ((columns < 5) || (rows < 5) || (NRightShift > 7)) 05563 return (-1); 05564 05565 if ((SDL_imageFilterMMXdetect())) { 05566 //#ifdef USE_MMX 05567 #if defined(USE_MMX) && defined(i386) 05568 #if !defined(GCC__) 05569 __asm 05570 { 05571 pusha 05572 pxor mm0, mm0 /* zero MM0 */ 05573 xor ebx, ebx /* zero EBX */ 05574 mov bl, NRightShift /* load NRightShift into BL */ 05575 movd mm5, ebx /* copy NRightShift into MM5 */ 05576 mov edx, Kernel /* load Kernel address into EDX */ 05577 mov esi, Src /* load Src address to ESI */ 05578 mov edi, Dest /* load Dest address to EDI */ 05579 add edi, 2 /* 2
05596 movq mm2, mm1 /* copy MM1 into MM2 */ 05597 add esi, eax /* move Src pointer 1 row below */ 05598 movq mm3, [edx] /* load 4 words of Kernel */ 05599 add edx, 8 /* move pointer to other 4 words */ 05600 movq mm4, [edx] /* load 4 words of Kernel */ 05601 add edx, 8 /* move pointer to other 4 words */ 05602 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05603 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05604 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05605 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05606 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05607 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05608 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05609 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05610 /* --- 2 */ 05611 movq mm1, [esi] /* load 8 bytes of the Src */ 05612 movq mm2, mm1 /* copy MM1 into MM2 */ 05613 add esi, eax /* move Src pointer 1 row below */ 05614 movq mm3, [edx] /* load 4 words of Kernel */ 05615 add edx, 8 /* move pointer to other 4 words */ 05616 movq mm4, [edx] /* load 4 words of Kernel */ 05617 add edx, 8 /* move pointer to other 4 words */ 05618 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05619 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05620 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05621 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05622 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05623 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05624 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05625 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05626 /* --- 3 */ 05627 movq mm1, [esi] /* load 8 bytes of the Src */ 05628 movq mm2, mm1 /* copy MM1 into MM2 */ 05629 add esi, eax /* move Src pointer 1 row below */ 05630 movq mm3, [edx] /* load 4 words of Kernel */ 05631 add edx, 8 /* move pointer to other 4 words */ 05632 movq mm4, [edx] 
/* load 4 words of Kernel */ 05633 add edx, 8 /* move pointer to other 4 words */ 05634 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05635 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05636 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05637 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05638 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05639 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05640 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05641 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05642 /* --- 4 */ 05643 movq mm1, [esi] /* load 8 bytes of the Src */ 05644 movq mm2, mm1 /* copy MM1 into MM2 */ 05645 add esi, eax /* move Src pointer 1 row below */ 05646 movq mm3, [edx] /* load 4 words of Kernel */ 05647 add edx, 8 /* move pointer to other 4 words */ 05648 movq mm4, [edx] /* load 4 words of Kernel */ 05649 add edx, 8 /* move pointer to other 4 words */ 05650 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05651 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05652 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05653 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05654 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05655 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05656 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05657 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05658 /* --- 5 */ 05659 movq mm1, [esi] /* load 8 bytes of the Src */ 05660 movq mm2, mm1 /* copy MM1 into MM2 */ 05661 movq mm3, [edx] /* load 4 words of Kernel */ 05662 add edx, 8 /* move pointer to other 4 words */ 05663 movq mm4, [edx] /* load 4 words of Kernel */ 05664 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05665 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05666 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05667 psrlw mm2, mm5 /* shift right each pixel 
NshiftRight times */ 05668 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05669 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05670 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05671 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05672 /* ---, */ 05673 movq mm3, mm7 /* copy MM7 into MM3 */ 05674 psrlq mm7, 32 /* shift 2 left words to the right */ 05675 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 05676 movq mm2, mm7 /* copy MM7 into MM2 */ 05677 psrlq mm7, 16 /* shift 1 left word to the right */ 05678 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 05679 movd mm1, eax /* save EAX in MM1 */ 05680 packuswb mm7, mm0 /* pack division result with saturation */ 05681 movd eax, mm7 /* copy saturated result into EAX */ 05682 mov [edi], al /* copy a byte result into Dest */ 05683 movd eax, mm1 /* restore saved EAX */ 05684 /* --, */ 05685 movd esi, mm6 /* move Src pointer to the top pixel */ 05686 sub edx, 72 /* EDX = Kernel address */ 05687 inc esi /* move Src pointer to the next pixel */ 05688 inc edi /* move Dest pointer to the next pixel */ 05689 /* ---, */ 05690 dec ecx /* decrease loop counter COLUMNS */ 05691 jnz L10372 /* check loop termination, proceed if required */ 05692 add esi, 4 /* move to the next row in Src */ 05693 add edi, 4 /* move to the next row in Dest */ 05694 dec ebx /* decrease loop counter ROWS */ 05695 jnz L10370 /* check loop termination, proceed if required */ 05696 /* ---, */ 05697 emms /* exit MMX state */ 05698 popa 05699 } 05700 #else 05701 asm volatile 05702 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 05703 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 05704 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 05705 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */ 05706 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 05707 "mov %1, %%esi \n\t" /* load Src address to ESI */ 05708 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 05709 "add $2, %%edi \n\t" /* 2 
column offset from the left edge */ 05710 "mov %3, %%eax \n\t" /* load columns into EAX */ 05711 "shl $1, %%eax \n\t" /* EAX = columns * 2 */ 05712 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */ 05713 "shr $1, %%eax \n\t" /* EAX = columns */ 05714 "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 05715 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */ 05716 /* --- */ 05717 ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 05718 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */ 05719 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 05720 ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 05721 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */ 05722 /* --- 1 */ 05723 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05724 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05725 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05726 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05727 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05728 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05729 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05730 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05731 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05732 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05733 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05734 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05735 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05736 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05737 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05738 /* --- 2 */ 05739 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05740 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05741 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05742 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05743 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05744 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05745 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05746 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05747 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05748 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05749 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05750 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05751 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05752 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05753 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05754 /* --- 3 */ 05755 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05756 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05757 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05758 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05759 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05760 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05761 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05762 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05763 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05764 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05765 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05766 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05767 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05768 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05769 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05770 /* --- 4 */ 05771 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05772 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05773 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 05774 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05775 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05776 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05777 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05778 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05779 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05780 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05781 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05782 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05783 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 05784 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05785 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05786 /* --- 5 */ 05787 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 05788 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 05789 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 05790 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 05791 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 05792 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 05793 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 05794 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 05795 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 05796 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 05797 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 05798 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 05799 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 05800 /* --- */ 05801 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 05802 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 05803 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 05804 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 05805 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 05806 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 05807 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 05808 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 05809 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 05810 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 05811 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 05812 /* -- */ 05813 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 05814 "sub $72, %%edx \n\t" /* EDX = Kernel address */ 05815 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 05816 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 05817 /* --- */ 05818 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 05819 "jnz .L10372 \n\t" /* check loop termination, proceed if required */ 05820 "add $4, %%esi \n\t" /* move to the next row in Src */ 05821 "add $4, %%edi \n\t" /* move to the next row in Dest */ 05822 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 05823 "jnz .L10370 \n\t" /* check loop termination, proceed if required */ 05824 /* --- */ 05825 "emms \n\t" /* exit MMX state */ 05826 "popa \n\t":"=m" (Dest) /* %0 */ 05827 :"m"(Src), /* %1 */ 05828 "m"(rows), /* %2 */ 05829 "m"(columns), /* %3 */ 05830 "m"(Kernel), /* %4 */ 05831 "m"(NRightShift) /* %5 */ 05832 ); 05833 #endif 05834 #endif 05835 return (0); 05836 } else { 05837 /* No non-MMX implementation yet */ 05838 return (-1); 05839 } 05840 } 05841 05856 int 
SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, 05857 signed short *Kernel, unsigned char NRightShift) 05858 { 05859 /* Validate input parameters */ 05860 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL)) 05861 return(-1); 05862 05863 if ((columns < 7) || (rows < 7) || (NRightShift > 7)) 05864 return (-1); 05865 05866 if ((SDL_imageFilterMMXdetect())) { 05867 //#ifdef USE_MMX 05868 #if defined(USE_MMX) && defined(i386) 05869 #if !defined(GCC__) 05870 __asm 05871 { 05872 pusha 05873 pxor mm0, mm0 /* zero MM0 */ 05874 xor ebx, ebx /* zero EBX */ 05875 mov bl, NRightShift /* load NRightShift into BL */ 05876 movd mm5, ebx /* copy NRightShift into MM5 */ 05877 mov edx, Kernel /* load Kernel address into EDX */ 05878 mov esi, Src /* load Src address to ESI */ 05879 mov edi, Dest /* load Dest address to EDI */ 05880 add edi, 3 /* 3 column offset from the left edge */ 05881 mov eax, columns /* load columns into EAX */ 05882 add edi, eax /* 3 row offset from the top edge */ 05883 add edi, eax 05884 add edi, eax 05885 mov ebx, rows /* initialize ROWS counter */ 05886 sub ebx, 6 /* do not use first 3 and last 3 rows */ 05887 /* ---, */ 05888 L10380: 05889 mov ecx, eax /* initialize COLUMNS counter */ 05890 sub ecx, 6 /* do not use first 3 and last 3 columns */ 05891 align 16 /* 16 byte alignment of the loop entry */ 05892 L10382: 05893 pxor mm7, mm7 /* zero MM7 (accumulator) */ 05894 movd mm6, esi /* save ESI in MM6 */ 05895 /* --- 1 */ 05896 movq mm1, [esi] /* load 8 bytes of the Src */ 05897 movq mm2, mm1 /* copy MM1 into MM2 */ 05898 add esi, eax /* move Src pointer 1 row below */ 05899 movq mm3, [edx] /* load 4 words of Kernel */ 05900 add edx, 8 /* move pointer to other 4 words */ 05901 movq mm4, [edx] /* load 4 words of Kernel */ 05902 add edx, 8 /* move pointer to other 4 words */ 05903 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05904 punpckhbw mm2, mm0 /* unpack second 4 bytes into words 
*/ 05905 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05906 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05907 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05908 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05909 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05910 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05911 /* --- 2 */ 05912 movq mm1, [esi] /* load 8 bytes of the Src */ 05913 movq mm2, mm1 /* copy MM1 into MM2 */ 05914 add esi, eax /* move Src pointer 1 row below */ 05915 movq mm3, [edx] /* load 4 words of Kernel */ 05916 add edx, 8 /* move pointer to other 4 words */ 05917 movq mm4, [edx] /* load 4 words of Kernel */ 05918 add edx, 8 /* move pointer to other 4 words */ 05919 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05920 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05921 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05922 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05923 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05924 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05925 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05926 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05927 /* --- 3 */ 05928 movq mm1, [esi] /* load 8 bytes of the Src */ 05929 movq mm2, mm1 /* copy MM1 into MM2 */ 05930 add esi, eax /* move Src pointer 1 row below */ 05931 movq mm3, [edx] /* load 4 words of Kernel */ 05932 add edx, 8 /* move pointer to other 4 words */ 05933 movq mm4, [edx] /* load 4 words of Kernel */ 05934 add edx, 8 /* move pointer to other 4 words */ 05935 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05936 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05937 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05938 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05939 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05940 pmullw mm2, mm4 
/* mult 4 high words of Src and Kernel */ 05941 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05942 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05943 /* --- 4 */ 05944 movq mm1, [esi] /* load 8 bytes of the Src */ 05945 movq mm2, mm1 /* copy MM1 into MM2 */ 05946 add esi, eax /* move Src pointer 1 row below */ 05947 movq mm3, [edx] /* load 4 words of Kernel */ 05948 add edx, 8 /* move pointer to other 4 words */ 05949 movq mm4, [edx] /* load 4 words of Kernel */ 05950 add edx, 8 /* move pointer to other 4 words */ 05951 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05952 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05953 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05954 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05955 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05956 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05957 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05958 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05959 /* --- 5 */ 05960 movq mm1, [esi] /* load 8 bytes of the Src */ 05961 movq mm2, mm1 /* copy MM1 into MM2 */ 05962 add esi, eax /* move Src pointer 1 row below */ 05963 movq mm3, [edx] /* load 4 words of Kernel */ 05964 add edx, 8 /* move pointer to other 4 words */ 05965 movq mm4, [edx] /* load 4 words of Kernel */ 05966 add edx, 8 /* move pointer to other 4 words */ 05967 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05968 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05969 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05970 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05971 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05972 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05973 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05974 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05975 /* --- 6 */ 05976 movq mm1, [esi] /* load 8 bytes of the 
Src */ 05977 movq mm2, mm1 /* copy MM1 into MM2 */ 05978 add esi, eax /* move Src pointer 1 row below */ 05979 movq mm3, [edx] /* load 4 words of Kernel */ 05980 add edx, 8 /* move pointer to other 4 words */ 05981 movq mm4, [edx] /* load 4 words of Kernel */ 05982 add edx, 8 /* move pointer to other 4 words */ 05983 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05984 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05985 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 05986 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 05987 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 05988 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 05989 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 05990 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 05991 /* --- 7 */ 05992 movq mm1, [esi] /* load 8 bytes of the Src */ 05993 movq mm2, mm1 /* copy MM1 into MM2 */ 05994 movq mm3, [edx] /* load 4 words of Kernel */ 05995 add edx, 8 /* move pointer to other 4 words */ 05996 movq mm4, [edx] /* load 4 words of Kernel */ 05997 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */ 05998 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */ 05999 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */ 06000 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */ 06001 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */ 06002 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */ 06003 paddsw mm1, mm2 /* add 4 words of the high and low bytes */ 06004 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */ 06005 /* ---, */ 06006 movq mm3, mm7 /* copy MM7 into MM3 */ 06007 psrlq mm7, 32 /* shift 2 left words to the right */ 06008 paddsw mm7, mm3 /* add 2 left and 2 right result words */ 06009 movq mm2, mm7 /* copy MM7 into MM2 */ 06010 psrlq mm7, 16 /* shift 1 left word to the right */ 06011 paddsw mm7, mm2 /* add 1 left and 1 right result words */ 06012 movd mm1, eax /* save EAX in MM1 */ 06013 
packuswb mm7, mm0 /* pack division result with saturation */ 06014 movd eax, mm7 /* copy saturated result into EAX */ 06015 mov [edi], al /* copy a byte result into Dest */ 06016 movd eax, mm1 /* restore saved EAX */ 06017 /* --, */ 06018 movd esi, mm6 /* move Src pointer to the top pixel */ 06019 sub edx, 104 /* EDX = Kernel address */ 06020 inc esi /* move Src pointer to the next pixel */ 06021 inc edi /* move Dest pointer to the next pixel */ 06022 /* ---, */ 06023 dec ecx /* decrease loop counter COLUMNS */ 06024 jnz L10382 /* check loop termination, proceed if required */ 06025 add esi, 6 /* move to the next row in Src */ 06026 add edi, 6 /* move to the next row in Dest */ 06027 dec ebx /* decrease loop counter ROWS */ 06028 jnz L10380 /* check loop termination, proceed if required */ 06029 /* ---, */ 06030 emms /* exit MMX state */ 06031 popa 06032 } 06033 #else 06034 asm volatile 06035 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */ 06036 "xor %%ebx, %%ebx \n\t" /* zero EBX */ 06037 "mov %5, %%bl \n\t" /* load NRightShift into BL */ 06038 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */ 06039 "mov %4, %%edx \n\t" /* load Kernel address into EDX */ 06040 "mov %1, %%esi \n\t" /* load Src address to ESI */ 06041 "mov %0, %%edi \n\t" /* load Dest address to EDI */ 06042 "add $3, %%edi \n\t" /* 3 column offset from the left edge */ 06043 "mov %3, %%eax \n\t" /* load columns into EAX */ 06044 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */ 06045 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */ 06046 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */ 06047 /* --- */ 06048 ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */ 06049 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */ 06050 ".align 16 \n\t" /* 16 byte alignment of the loop entry */ 06051 ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */ 06052 "movd %%esi, %%mm6 
\n\t" /* save ESI in MM6 */ 06053 /* --- 1 */ 06054 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06055 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06056 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06057 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06058 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06059 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06060 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06061 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06062 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06063 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06064 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06065 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06066 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06067 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06068 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06069 /* --- 2 */ 06070 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06071 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06072 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06073 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06074 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06075 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06076 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06077 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06078 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06079 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06080 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06081 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06082 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06083 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06084 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06085 /* --- 3 */ 06086 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06087 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06088 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06089 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06090 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06091 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06092 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06093 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06094 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06095 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06096 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06097 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06098 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06099 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06100 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06101 /* --- 4 */ 06102 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06103 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06104 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06105 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06106 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06107 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06108 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06109 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06110 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06111 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06112 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06113 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06114 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06115 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06116 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06117 /* --- 5 */ 06118 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06119 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06120 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06121 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06122 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06123 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06124 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06125 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06126 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06127 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06128 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06129 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06130 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06131 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06132 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06133 /* --- 6 */ 06134 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06135 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06136 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */ 06137 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06138 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06139 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06140 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06141 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06142 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06143 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06144 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06145 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06146 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */ 06147 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06148 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06149 /* --- 7 */ 06150 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */ 06151 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */ 06152 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */ 06153 "add $8, %%edx \n\t" /* move pointer to other 4 words */ 06154 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */ 06155 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */ 06156 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */ 06157 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */ 06158 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */ 06159 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */ 06160 "pmullw %%mm4, %%mm2 \n\t" /* mult. 
4 high words of Src and Kernel */ 06161 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */ 06162 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */ 06163 /* --- */ 06164 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */ 06165 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */ 06166 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */ 06167 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */ 06168 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */ 06169 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */ 06170 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */ 06171 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */ 06172 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */ 06173 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */ 06174 "movd %%mm1, %%eax \n\t" /* restore saved EAX */ 06175 /* -- */ 06176 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */ 06177 "sub $104, %%edx \n\t" /* EDX = Kernel address */ 06178 "inc %%esi \n\t" /* move Src pointer to the next pixel */ 06179 "inc %%edi \n\t" /* move Dest pointer to the next pixel */ 06180 /* --- */ 06181 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */ 06182 "jnz .L10382 \n\t" /* check loop termination, proceed if required */ 06183 "add $6, %%esi \n\t" /* move to the next row in Src */ 06184 "add $6, %%edi \n\t" /* move to the next row in Dest */ 06185 "dec %%ebx \n\t" /* decrease loop counter ROWS */ 06186 "jnz .L10380 \n\t" /* check loop termination, proceed if required */ 06187 /* --- */ 06188 "emms \n\t" /* exit MMX state */ 06189 "popa \n\t":"=m" (Dest) /* %0 */ 06190 :"m"(Src), /* %1 */ 06191 "m"(rows), /* %2 */ 06192 "m"(columns), /* %3 */ 06193 "m"(Kernel), /* %4 */ 06194 "m"(NRightShift) /* %5 */ 06195 ); 06196 #endif 06197 #endif 06198 return (0); 06199 } else { 06200 /* No non-MMX implementation yet */ 06201 return (-1); 06202 } 06203 } 06204 06219 int 
SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
										   signed short *Kernel, unsigned char NRightShift)
{
	/* Implementation notes (the doxygen API comment precedes this definition):

	   For every output pixel the surrounding 9x9 neighbourhood of Src is unpacked
	   to 16-bit words, shifted right NRightShift bits, multiplied by the matching
	   Kernel coefficients and accumulated with signed saturation (paddsw); the
	   final sum is packed back to one byte with unsigned saturation (packuswb).
	   A 4-pixel border on every edge of Dest is left untouched (the first/last
	   4 rows and columns are skipped).

	   NOTE(review): the EDX advances show each kernel row consumes 24 bytes
	   (12 signed shorts: 8 for the first eight columns, 4 for the ninth) and the
	   pointer is rewound by 208 bytes per output pixel (8 rows * 24 + 16 for the
	   final row) -- so the Kernel buffer is presumably 9 rows padded to 12 shorts
	   each; confirm against the documented Kernel layout in SDL2_imageFilter.h.

	   NOTE(review): when the library is built without USE_MMX && i386, both asm
	   variants are preprocessed away and this function returns 0 without writing
	   anything to Dest (the explicit -1 "no non-MMX implementation" branch is
	   only reached when MMX detection fails at runtime). */

	/* Validate input parameters: all three pointers are required */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	/* Image must be at least as large as the kernel; shift count must fit a pixel */
	if ((columns < 9) || (rows < 9) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		/* MSVC inline-assembly variant (32-bit x86 only) */
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 (used as the zero half for unpacking) */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm5, ebx   	/* copy NRightShift into MM5 (psrlw count) */
			mov edx, Kernel   	/* load Kernel address into EDX */
			mov esi, Src   	/* load Src address to ESI */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, 4   	/* 4 column offset from the left edge */
			mov eax, columns   	/* load columns into EAX (row stride) */
			add edi, eax   	/* 4 row offset from the top edge */
			add edi, eax
			add edi, eax
			add edi, eax
			mov ebx, rows   	/* initialize ROWS counter */
			sub ebx, 8   	/* do not use first 4 and last 4 rows */
			/* --- outer loop: one iteration per output row */
L10390:
			mov ecx, eax   	/* initialize COLUMNS counter */
			sub ecx, 8   	/* do not use first 4 and last 4 columns */
			align 16   	/* 16 byte alignment of the loop entry */
L10392:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
			movd mm6, esi   	/* save ESI (top-left of the 9x9 window) in MM6 */
			/* --- kernel row 1: 8 pixels via mm1/mm2, then 9th column block */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src (offset +1: 9th column) */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 2 (same pattern as row 1) */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 3 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 4 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 5 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 6 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 8 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec esi
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- kernel row 9 (last row: no row advance, EDX not bumped after final load) */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			inc esi   	/* move pointer to the next 8 bytes of Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			psrlw mm2, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm5   	/* shift right each pixel NRightShift times */
			pmullw mm1, mm3   	/* mult. 4 low words of Src and Kernel */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- horizontal reduction of the 4 word sums in MM7 to one word */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* shift 2 left words to the right */
			paddsw mm7, mm3   	/* add 2 left and 2 right result words */
			movq mm2, mm7   	/* copy MM7 into MM2 */
			psrlq mm7, 16   	/* shift 1 left word to the right */
			paddsw mm7, mm2   	/* add 1 left and 1 right result words */
			movd mm1, eax   	/* save EAX in MM1 */
			packuswb mm7, mm0   	/* pack result with unsigned saturation */
			movd eax, mm7   	/* copy saturated result into EAX */
			mov [edi], al   	/* copy a byte result into Dest */
			movd eax, mm1   	/* restore saved EAX */
			/* --- advance to next output pixel */
			movd esi, mm6   	/* move Src pointer back to the top pixel */
			sub edx, 208   	/* rewind EDX to the Kernel base address */
			inc esi   	/* move Src pointer to the next pixel */
			inc edi   	/* move Dest pointer to the next pixel */
			/* --- loop bookkeeping */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10392   	/* check loop termination, proceed if required */
			add esi, 8   	/* move to the next row in Src (skip 4+4 border cols) */
			add edi, 8   	/* move to the next row in Dest (skip 4+4 border cols) */
			dec ebx   	/* decrease loop counter ROWS */
			jnz L10390   	/* check loop termination, proceed if required */
			/* --- done */
			emms   	/* exit MMX state */
			popa
		}
#else
		/* GCC extended-asm variant (32-bit x86 only); operand map:
		   %0=Dest, %1=Src, %2=rows, %3=columns, %4=Kernel, %5=NRightShift */
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 (zero half for unpacking) */
			 "xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			 "mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			 "movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 (psrlw count) */
			 "mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			 "mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			 "mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			 "add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
			 "mov          %3, %%eax \n\t"	/* load columns into EAX (row stride) */
			 "add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
			 "add       %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t"	/* initialize ROWS counter */
			 "sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
			 /* --- outer loop: one iteration per output row */
			 ".L10390:               \n\t" "mov %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			 "sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
			 ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			 ".L10392:               \n\t" "pxor %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			 "movd      %%esi, %%mm6 \n\t"	/* save ESI (top-left of 9x9 window) in MM6 */
			 /* --- kernel row 1: 8 pixels via mm1/mm2, then 9th column block */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src (offset +1: 9th column) */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 2 (same pattern as row 1) */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 3 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 4 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 5 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 6 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 8 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "dec              %%esi \n\t" "add %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- kernel row 9 (last row: no row advance, EDX not bumped after final load) */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			 "inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			 "add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			 "punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			 "pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			 "paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 "movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			 "movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			 "punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			 "psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
			 "pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low words of Src and Kernel */
			 "paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			 /* --- horizontal reduction of the 4 word sums in MM7 to one word */
			 "movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			 "psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			 "paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			 "movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			 "psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			 "paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			 "movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			 "packuswb  %%mm0, %%mm7 \n\t"	/* pack result with unsigned saturation */
			 "movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			 "mov       %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			 "movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			 /* --- advance to next output pixel */
			 "movd      %%mm6, %%esi \n\t"	/* move Src pointer back to the top pixel */
			 "sub        $208, %%edx \n\t"	/* rewind EDX to the Kernel base address */
			 "inc              %%esi \n\t"	/* move Src pointer to the next pixel */
			 "inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			 /* --- loop bookkeeping */
			 "dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			 "jnz            .L10392 \n\t"	/* check loop termination, proceed if required */
			 "add          $8, %%esi \n\t"	/* move to the next row in Src (skip 4+4 border cols) */
			 "add          $8, %%edi \n\t"	/* move to the next row in Dest (skip 4+4 border cols) */
			 "dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			 "jnz            .L10390 \n\t"	/* check loop termination, proceed if required */
			 /* --- done */
			 "emms                   \n\t"	/* exit MMX state */
			 "popa                   \n\t":"=m" (Dest)	/* %0 */
			 :"m"(Src),		/* %1 */
			 "m"(rows),		/* %2 */
			 "m"(columns),		/* %3 */
			 "m"(Kernel),		/* %4 */
			 "m"(NRightShift)	/* %5 */
			);
#endif
#endif
		return (0);
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/* ------------------------------------------------------------------------------------ */
/*!
\brief Filter using SobelX: Dest = ABS(Src convolved with a 3x3 horizontal Sobel kernel).

Edge-detects a byte image. The implementation skips the first and last row and
writes each output row at a 1-byte offset from the left edge (matching the MMX
code's addressing); border bytes of Dest are left untouched. Processing is done
8 pixels at a time, so 'columns' should be a multiple of 8 for full coverage.

Only an MMX implementation exists. It is compiled only for 32-bit x86 GCC-style
builds with USE_MMX defined. On every other configuration this function now
fails explicitly with -1: the original code fell through an empty #if block and
returned 0 (success) without writing a single byte to Dest, which silently
handed callers uninitialized output on x86_64 and MSVC builds.

\param Src The source 2D byte array (rows x columns bytes).
\param Dest The destination 2D byte array; must be at least as large as Src.
\param rows Number of rows in the image; must be >= 3.
\param columns Number of bytes per row; must be >= 8.

\return Returns 0 for success or -1 for error (invalid parameters, or no
        implementation available for this platform).
*/
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);

	if ((columns < 8) || (rows < 3))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0	/* zero MM0 */
			mov eax, columns	/* load columns into EAX */
			/* --- */
			mov esi, Src	/* ESI = Src row 0 address */
			mov edi, Dest	/* load Dest address to EDI */
			add edi, eax	/* EDI = EDI + columns */
			inc edi		/* 1 byte offset from the left edge */
			mov edx, rows	/* initialize ROWS counter */
			sub edx, 2	/* do not use first and last rows */
			/* --- */
L10400:
			mov ecx, eax	/* initialize COLUMNS counter */
			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
			mov ebx, esi	/* save ESI in EBX */
			movd mm1, edi	/* save EDI in MM1 */
			align 16	/* 16 byte alignment of the loop entry */
L10402:
			/* --- */
			movq mm4, [esi]	/* load 8 bytes from Src */
			movq mm5, mm4	/* save MM4 in MM5 */
			add esi, 2	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm5, mm0	/* unpack 4 high bytes into words */
			movq mm6, [esi]	/* load 8 bytes from Src */
			movq mm7, mm6	/* save MM6 in MM7 */
			sub esi, 2	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm7, mm0	/* unpack 4 high bytes into words */
			add esi, eax	/* move to the next row of Src */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			add esi, 2	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			paddw mm4, mm2	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3	/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2	/* added twice: center row carries weight 2 */
			paddw mm5, mm3	/* added twice: center row carries weight 2 */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			sub esi, 2	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			paddw mm6, mm2	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3	/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2	/* added twice: center row carries weight 2 */
			paddw mm7, mm3	/* added twice: center row carries weight 2 */
			add esi, eax	/* move to the next row of Src */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			add esi, 2	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			paddw mm4, mm2	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			sub esi, 2	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			paddw mm6, mm2	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3	/* add 4 high words to accumulator MM7 */
			/* --- */
			movq mm2, mm4	/* copy MM4 into MM2 */
			psrlq mm4, 32	/* shift 2 left words to the right */
			psubw mm4, mm2	/* MM4 = MM4 - MM2 */
			movq mm3, mm6	/* copy MM6 into MM3 */
			psrlq mm6, 32	/* shift 2 left words to the right */
			psubw mm6, mm3	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5	/* copy MM5 into MM2 */
			psrlq mm5, 32	/* shift 2 left words to the right */
			psubw mm5, mm2	/* MM5 = MM5 - MM2 */
			movq mm3, mm7	/* copy MM7 into MM3 */
			psrlq mm7, 32	/* shift 2 left words to the right */
			psubw mm7, mm3	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4	/* copy MM4 into MM6 */
			movq mm7, mm5	/* copy MM5 into MM7 */
			psraw mm6, 15	/* fill MM6 words with word sign bit */
			psraw mm7, 15	/* fill MM7 words with word sign bit */
			pxor mm4, mm6	/* take 1's complement of only neg words */
			pxor mm5, mm7	/* take 1's complement of only neg words */
			psubsw mm4, mm6	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm5, mm7	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm4, mm5	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4	/* store result in Dest */
			/* --- */
			sub esi, eax	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8	/* move Src pointer to the next 8 pixels */
			add edi, 8	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx		/* decrease loop counter COLUMNS */
			jnz L10402	/* check loop termination, proceed if required */
			mov esi, ebx	/* restore most left current row Src address */
			movd edi, mm1	/* restore most left current row Dest address */
			add esi, eax	/* move to the next row in Src */
			add edi, eax	/* move to the next row in Dest */
			dec edx		/* decrease loop counter ROWS */
			jnz L10400	/* check loop termination, proceed if required */
			/* --- */
			emms		/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			 "mov          %3, %%eax \n\t"	/* load columns into EAX */
			 /* --- */
			 "mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			 "mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			 "add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			 "inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			 "mov          %2, %%edx \n\t"	/* initialize ROWS counter */
			 "sub          $2, %%edx \n\t"	/* do not use first and last rows */
			 /* --- */
			 ".L10400:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			 "shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
			 "mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
			 "movd      %%edi, %%mm1 \n\t"	/* save EDI in MM1 */
			 ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			 ".L10402:               \n\t"
			 /* --- */
			 "movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			 "add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			 "movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			 "sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			 "add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw     %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "paddw     %%mm2, %%mm4 \n\t"	/* added twice: center row carries weight 2 */
			 "paddw     %%mm3, %%mm5 \n\t"	/* added twice: center row carries weight 2 */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw     %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 "paddw     %%mm2, %%mm6 \n\t"	/* added twice: center row carries weight 2 */
			 "paddw     %%mm3, %%mm7 \n\t"	/* added twice: center row carries weight 2 */
			 "add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw     %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "paddw     %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- */
			 "movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			 "psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			 "movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			 "psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			 "punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			 "movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			 "psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			 "movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			 "psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			 "punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			 /* Take abs values of MM4 and MM5 */
			 "movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			 "movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			 "psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			 "psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			 "pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			 "pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			 "psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			 "movq      %%mm4, (%%edi) \n\t"	/* store result in Dest */
			 /* --- */
			 "sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
			 "sub       %%eax, %%esi \n\t" "add $8, %%esi \n\t"	/* move Src pointer to the next 8 pixels */
			 "add          $8, %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			 /* --- */
			 "dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			 "jnz            .L10402 \n\t"	/* check loop termination, proceed if required */
			 "mov       %%ebx, %%esi \n\t"	/* restore most left current row Src address */
			 "movd      %%mm1, %%edi \n\t"	/* restore most left current row Dest address */
			 "add       %%eax, %%esi \n\t"	/* move to the next row in Src */
			 "add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
			 "dec              %%edx \n\t"	/* decrease loop counter ROWS */
			 "jnz            .L10400 \n\t"	/* check loop termination, proceed if required */
			 /* --- */
			 "emms                   \n\t"	/* exit MMX state */
			 "popa                   \n\t":"=m" (Dest)	/* %0 */
			 :"m"(Src),		/* %1 */
			 "m"(rows),		/* %2 */
			 "m"(columns)		/* %3 */
			);
#endif
		return (0);
#else
		/* MMX detected, but the MMX code path was not compiled for this
		   target (needs USE_MMX and a 32-bit x86 target defining 'i386').
		   Previously this fell through and returned 0 with Dest untouched. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using SobelXShiftRight: Dest = ABS(Src * SobelX) >> NRightShift.

Identical to SDL_imageFilterSobelX except that every source pixel is shifted
right by NRightShift bits before entering the convolution, trading precision
for headroom. Border handling matches SobelX: first/last rows and a 1-byte
left offset are skipped.

Only an MMX implementation exists (32-bit x86 with USE_MMX). On other
configurations the function returns -1 instead of (as before) silently
returning success with Dest unwritten.

Fix applied to the GCC asm: 'rows' was declared as a pure input operand but
the template decremented it in place ("subl $2, %2" / "decl %2"), which is
undefined behavior under GCC's extended-asm rules. It is now an in-out
operand ("+m") and the operand references were renumbered accordingly.
(Harmless to callers: 'rows' is a by-value parameter.)

\param Src The source 2D byte array (rows x columns bytes).
\param Dest The destination 2D byte array; must be at least as large as Src.
\param rows Number of rows in the image; must be >= 3.
\param columns Number of bytes per row; must be >= 8.
\param NRightShift Number of right-shift bits applied per pixel; must be < 7.

\return Returns 0 for success or -1 for error (invalid parameters, or no
        implementation available on this platform).
*/
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
									unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0	/* zero MM0 */
			mov eax, columns	/* load columns into EAX */
			xor ebx, ebx	/* zero EBX */
			mov bl, NRightShift	/* load NRightShift into BL */
			movd mm1, ebx	/* copy NRightShift into MM1 */
			/* --- */
			mov esi, Src	/* ESI = Src row 0 address */
			mov edi, Dest	/* load Dest address to EDI */
			add edi, eax	/* EDI = EDI + columns */
			inc edi		/* 1 byte offset from the left edge */
			/* initialize ROWS counter (decrements the local copy of 'rows') */
			sub rows, 2	/* do not use first and last rows */
			/* --- */
L10410:
			mov ecx, eax	/* initialize COLUMNS counter */
			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
			mov ebx, esi	/* save ESI in EBX */
			mov edx, edi	/* save EDI in EDX */
			align 16	/* 16 byte alignment of the loop entry */
L10412:
			/* --- */
			movq mm4, [esi]	/* load 8 bytes from Src */
			movq mm5, mm4	/* save MM4 in MM5 */
			add esi, 2	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm5, mm0	/* unpack 4 high bytes into words */
			psrlw mm4, mm1	/* shift right each pixel NshiftRight times */
			psrlw mm5, mm1	/* shift right each pixel NshiftRight times */
			movq mm6, [esi]	/* load 8 bytes from Src */
			movq mm7, mm6	/* save MM6 in MM7 */
			sub esi, 2	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm7, mm0	/* unpack 4 high bytes into words */
			psrlw mm6, mm1	/* shift right each pixel NshiftRight times */
			psrlw mm7, mm1	/* shift right each pixel NshiftRight times */
			add esi, eax	/* move to the next row of Src */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			add esi, 2	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1	/* shift right each pixel NshiftRight times */
			psrlw mm3, mm1	/* shift right each pixel NshiftRight times */
			paddw mm4, mm2	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3	/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2	/* added twice: center row carries weight 2 */
			paddw mm5, mm3	/* added twice: center row carries weight 2 */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			sub esi, 2	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1	/* shift right each pixel NshiftRight times */
			psrlw mm3, mm1	/* shift right each pixel NshiftRight times */
			paddw mm6, mm2	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3	/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2	/* added twice: center row carries weight 2 */
			paddw mm7, mm3	/* added twice: center row carries weight 2 */
			add esi, eax	/* move to the next row of Src */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			add esi, 2	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1	/* shift right each pixel NshiftRight times */
			psrlw mm3, mm1	/* shift right each pixel NshiftRight times */
			paddw mm4, mm2	/* add 4 low words to accumulator MM4 */
			paddw mm5, mm3	/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]	/* load 8 bytes from Src */
			movq mm3, mm2	/* save MM2 in MM3 */
			sub esi, 2	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1	/* shift right each pixel NshiftRight times */
			psrlw mm3, mm1	/* shift right each pixel NshiftRight times */
			paddw mm6, mm2	/* add 4 low words to accumulator MM6 */
			paddw mm7, mm3	/* add 4 high words to accumulator MM7 */
			/* --- */
			movq mm2, mm4	/* copy MM4 into MM2 */
			psrlq mm4, 32	/* shift 2 left words to the right */
			psubw mm4, mm2	/* MM4 = MM4 - MM2 */
			movq mm3, mm6	/* copy MM6 into MM3 */
			psrlq mm6, 32	/* shift 2 left words to the right */
			psubw mm6, mm3	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5	/* copy MM5 into MM2 */
			psrlq mm5, 32	/* shift 2 left words to the right */
			psubw mm5, mm2	/* MM5 = MM5 - MM2 */
			movq mm3, mm7	/* copy MM7 into MM3 */
			psrlq mm7, 32	/* shift 2 left words to the right */
			psubw mm7, mm3	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4	/* copy MM4 into MM6 */
			movq mm7, mm5	/* copy MM5 into MM7 */
			psraw mm6, 15	/* fill MM6 words with word sign bit */
			psraw mm7, 15	/* fill MM7 words with word sign bit */
			pxor mm4, mm6	/* take 1's complement of only neg words */
			pxor mm5, mm7	/* take 1's complement of only neg words */
			psubsw mm4, mm6	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm5, mm7	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm4, mm5	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4	/* store result in Dest */
			/* --- */
			sub esi, eax	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8	/* move Src pointer to the next 8 pixels */
			add edi, 8	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx		/* decrease loop counter COLUMNS */
			jnz L10412	/* check loop termination, proceed if required */
			mov esi, ebx	/* restore most left current row Src address */
			mov edi, edx	/* restore most left current row Dest address */
			add esi, eax	/* move to the next row in Src */
			add edi, eax	/* move to the next row in Dest */
			dec rows	/* decrease loop counter ROWS */
			jnz L10410	/* check loop termination, proceed if required */
			/* --- */
			emms		/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			 "mov          %3, %%eax \n\t"	/* load columns into EAX */
			 "xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			 "mov          %4, %%bl  \n\t"	/* load NRightShift into BL */
			 "movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
			 /* --- */
			 "mov          %2, %%esi \n\t"	/* ESI = Src row 0 address */
			 "mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			 "add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			 "inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			 /* initialize ROWS counter ('rows' is an in-out operand; see below) */
			 "subl         $2, %1    \n\t"	/* do not use first and last rows */
			 /* --- */
			 ".L10410:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			 "shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
			 "mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
			 "mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
			 ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			 ".L10412:               \n\t"
			 /* --- */
			 "movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			 "add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NshiftRight times */
			 "psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NshiftRight times */
			 "movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			 "sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NshiftRight times */
			 "psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NshiftRight times */
			 "add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			 "psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
			 "paddw     %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "paddw     %%mm2, %%mm4 \n\t"	/* added twice: center row carries weight 2 */
			 "paddw     %%mm3, %%mm5 \n\t"	/* added twice: center row carries weight 2 */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			 "psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
			 "paddw     %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 "paddw     %%mm2, %%mm6 \n\t"	/* added twice: center row carries weight 2 */
			 "paddw     %%mm3, %%mm7 \n\t"	/* added twice: center row carries weight 2 */
			 "add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			 "psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
			 "paddw     %%mm2, %%mm4 \n\t"	/* add 4 low words to accumulator MM4 */
			 "paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			 "movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			 "movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			 "sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			 "punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low bytes into words */
			 "punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			 "psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			 "psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
			 "paddw     %%mm2, %%mm6 \n\t"	/* add 4 low words to accumulator MM6 */
			 "paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			 /* --- */
			 "movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			 "psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			 "movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			 "psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			 "punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			 "movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			 "psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			 "movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			 "psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			 "psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			 "punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			 /* Take abs values of MM4 and MM5 */
			 "movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			 "movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			 "psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			 "psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			 "pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			 "pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			 "psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			 "packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			 "movq      %%mm4, (%%edi) \n\t"	/* store result in Dest */
			 /* --- */
			 "sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
			 "sub       %%eax, %%esi \n\t" "add $8, %%esi \n\t"	/* move Src pointer to the next 8 pixels */
			 "add          $8, %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			 /* --- */
			 "dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			 "jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
			 "mov       %%ebx, %%esi \n\t"	/* restore most left current row Src address */
			 "mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
			 "add       %%eax, %%esi \n\t"	/* move to the next row in Src */
			 "add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
			 "decl             %1    \n\t"	/* decrease loop counter ROWS */
			 "jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
			 /* --- */
			 "emms                   \n\t"	/* exit MMX state */
			 "popa                   \n\t":"=m" (Dest),	/* %0 */
			 "+m"(rows)		/* %1: modified in place by the template */
			 :"m"(Src),		/* %2 */
			 "m"(columns),		/* %3 */
			 "m"(NRightShift)	/* %4 */
			);
#endif
		return (0);
#else
		/* MMX detected, but the MMX code path was not compiled for this
		   target (needs USE_MMX and a 32-bit x86 target defining 'i386').
		   Previously this fell through and returned 0 with Dest untouched. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Align the stack pointer to a 32 byte boundary.

Saves the current ESP just behind the aligned boundary and then aligns ESP.
Must be paired with a matching SDL_imageFilterRestoreStack() call in the same
stack frame before any function return. Compiled to a no-op unless USE_MMX
is defined.

NOTE(review): the GCC variant moves ESP across an asm-statement boundary and
clobbers EBX without declaring it — this is outside GCC's inline-asm contract
and presumably only works with frame-pointer-based, 32-bit code and no
compiler-generated stack accesses between the align/restore pair — TODO confirm.
*/
void SDL_imageFilterAlignStack(void)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{				/* --- stack alignment --- */
		mov ebx, esp	/* load ESP into EBX */
		sub ebx, 4	/* reserve space on stack for old value of ESP */
		and ebx, -32	/* align EBX along a 32 byte boundary */
		mov [ebx], esp	/* save old value of ESP in stack, behind the bndry */
		mov esp, ebx	/* align ESP along a 32 byte boundary */
	}
#else
	asm volatile
		(			/* --- stack alignment --- */
		 "mov       %%esp, %%ebx \n\t"	/* load ESP into EBX */
		 "sub          $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
		 "and        $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary */
		 "mov       %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the bndry */
		 "mov       %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
		 ::);
#endif
#endif
}

/*!
\brief Restore the stack pointer saved by SDL_imageFilterAlignStack().

Reloads the ESP value that SDL_imageFilterAlignStack() stored at the aligned
boundary. Must only be called after a matching align call in the same frame.
Compiled to a no-op unless USE_MMX is defined.
*/
void SDL_imageFilterRestoreStack(void)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{				/* --- restoring old stack --- */
		mov ebx, [esp]	/* load old value of ESP */
		mov esp, ebx	/* restore old value of ESP */
	}
#else
	asm volatile
		(			/* --- restoring old stack --- */
		 "mov     (%%esp), %%ebx \n\t"	/* load old value of ESP */
		 "mov       %%ebx, %%esp \n\t"	/* restore old value of ESP */
		 ::);
#endif
#endif
}