SDL2_gfx: I:/Sources/sdl2gfx/SDL2_imageFilter.c Source File

Go to the documentation of this file.
00001 /*
00002 
00003 SDL2_imageFilter.c: byte-image "filter" routines
00004 
00005 Copyright (C) 2001-2012  Andreas Schiffler
00006 Copyright (C) 2013  Sylvain Beucler
00007 
00008 This software is provided 'as-is', without any express or implied
00009 warranty. In no event will the authors be held liable for any damages
00010 arising from the use of this software.
00011 
00012 Permission is granted to anyone to use this software for any purpose,
00013 including commercial applications, and to alter it and redistribute it
00014 freely, subject to the following restrictions:
00015 
00016    1. The origin of this software must not be misrepresented; you must not
00017    claim that you wrote the original software. If you use this software
00018    in a product, an acknowledgment in the product documentation would be
00019    appreciated but is not required.
00020 
00021    2. Altered source versions must be plainly marked as such, and must not be
00022    misrepresented as being the original software.
00023 
00024    3. This notice may not be removed or altered from any source
00025    distribution.
00026 
00027 Andreas Schiffler -- aschiffler at ferzkopp dot net
00028 
00029 */
00030 
00031 /*
00032 
00033 Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
00034 
00035 Note: Most of the MMX code is based on published routines 
00036 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
00037 him for his work.
00038 
00039 */
00040 
00041 #include <stdio.h>
00042 #include <stdlib.h>
00043 #include <string.h>
00044 
00045 #include "SDL.h"
00046 
00047 /* Use GCC intrinsics if available: they support both i386 and x86_64,
00048    provide ASM-grade performance, and avoid the PUSHA/POPA issues. */
00049 #ifdef __GNUC__
00050 #  ifdef USE_MMX
00051 #    include <mmintrin.h>
00052 #  endif
00053 #  include <SDL_cpuinfo.h>
00054 #endif
00055 
00056 #include "SDL2_imageFilter.h"
00057 
00061 #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))
00062 
00063 /* ------ Static variables ----- */
00064 
00068 static int SDL_imageFilterUseMMX = 1;
00069 
00070 /* Detect GCC */
00071 #if defined(__GNUC__)
00072 #define GCC__
00073 #endif
00074 
00080 int SDL_imageFilterMMXdetect(void)
00081 {
00082         /* Check override flag */
00083         if (SDL_imageFilterUseMMX == 0) {
00084                 return (0);
00085         }
00086 
00087     return SDL_HasMMX();
00088 }
00089 
00093 void SDL_imageFilterMMXoff()
00094 {
00095         SDL_imageFilterUseMMX = 0;
00096 }
00097 
00101 void SDL_imageFilterMMXon()
00102 {
00103         SDL_imageFilterUseMMX = 1;
00104 }
00105 
00106 /* ------------------------------------------------------------------------------------ */
00107 
/*
 * Internal MMX routine for saturated addition: D = saturation255(S1 + S2).
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * SrcLength   Buffer length in bytes. Only the SrcLength/8 complete 8-byte
 *             groups are processed; remaining bytes are left to the caller.
 *             The __asm variant tests its counter only after the first
 *             iteration, so it must be called with SrcLength >= 8.
 *
 * Returns 0 when built with USE_MMX, or -1 (nothing processed) otherwise.
 */
static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics version: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* add 8 bytes at once with unsigned saturation */
		out[k] = _m_paddusb(in1[k], in2[k]);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count / 8 = number of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1010:
		movq mm1, [eax]		/* mm1 = 8 bytes of Src1 */
		paddusb mm1, [ebx]	/* mm1 = Src1 + Src2, saturated per byte */
		movq [edi], mm1		/* write 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next 8-byte group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L1010		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}
00162 
/*
 * Filter using Add: D = saturation255(S1 + S2).
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * length      Number of bytes to process in each buffer.
 *
 * Returns 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine only covers complete 8-byte groups, and it returns
	 * non-zero without touching Dest when MMX support was not compiled in.
	 * Skip the leading bytes only if it really processed them; otherwise
	 * the C loop below handles the whole buffer.
	 */
	if (SDL_imageFilterMMXdetect() && (length > 7) &&
	    (SDL_imageFilterAddMMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & ~7U;
	}

	/* C routine: remaining bytes, or the whole buffer */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00223 
/*
 * Internal MMX routine for the mean: D = S1/2 + S2/2.
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * SrcLength   Buffer length in bytes. Only the SrcLength/8 complete 8-byte
 *             groups are processed; remaining bytes are left to the caller.
 *             The __asm variant tests its counter only after the first
 *             iteration, so it must be called with SrcLength >= 8.
 * Mask        8 mask bytes applied after the word-wise right shift, to clear
 *             the bit shifted in from the neighbouring byte.
 *
 * Returns 0 when built with USE_MMX, or -1 (nothing processed) otherwise.
 */
static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
						   unsigned char *Mask)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics version: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	const __m64 *msk = (const __m64 *) Mask;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* byte shift emulated by a word shift followed by the mask */
		__m64 half1 = _m_pand(_m_psrlwi(in1[k], 1), msk[0]);
		__m64 half2 = _m_pand(_m_psrlwi(in2[k], 1), msk[0]);
		out[k] = _m_paddusb(half1, half2);	/* sum of both halves, saturated per byte */
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov edx, Mask		/* edx = Mask address */
		movq mm0, [edx]		/* mm0 = the 8 mask bytes */
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count / 8 = number of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L21011:
		movq mm1, [eax]		/* mm1 = 8 bytes of Src1 */
		movq mm2, [ebx]		/* mm2 = 8 bytes of Src2 */
		/* --- byte shift emulated by a word shift plus mask --- */
		psrlw mm1, 1		/* shift the 4 words of mm1 right by one bit */
		psrlw mm2, 1		/* shift the 4 words of mm2 right by one bit */
		pand mm1, mm0		/* mask the 8 bytes of mm1 (opcode bytes 0x0f, 0xdb, 0xc8) */
		pand mm2, mm0		/* mask the 8 bytes of mm2 (opcode bytes 0x0f, 0xdb, 0xd0) */
		paddusb mm1, mm2	/* mm1 = mm1 + mm2, saturated per byte */
		movq [edi], mm1		/* write 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next 8-byte group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L21011		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}
00297 
/*
 * Filter using Mean: D = S1/2 + S2/2.
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * length      Number of bytes to process in each buffer.
 *
 * Returns 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	/* Clears the top bit of every byte after the MMX word-wise shift */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine only covers complete 8-byte groups, and it returns
	 * non-zero without touching Dest when MMX support was not compiled in.
	 * Skip the leading bytes only if it really processed them; otherwise
	 * the C loop below handles the whole buffer.
	 */
	if (SDL_imageFilterMMXdetect() && (length > 7) &&
	    (SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask) == 0)) {
		istart = length & ~7U;
	}

	/* C routine: remaining bytes, or the whole buffer */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] / 2 + (int) Src2[i] / 2;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00356 
/*
 * Internal MMX routine for saturated subtraction: D = saturation0(S1 - S2).
 *
 * Src1        Source byte buffer (minuend).
 * Src2        Source byte buffer (subtrahend).
 * Dest        Destination byte buffer.
 * SrcLength   Buffer length in bytes. Only the SrcLength/8 complete 8-byte
 *             groups are processed; remaining bytes are left to the caller.
 *             The __asm variant tests its counter only after the first
 *             iteration, so it must be called with SrcLength >= 8.
 *
 * Returns 0 when built with USE_MMX, or -1 (nothing processed) otherwise.
 */
static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics version: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* subtract 8 bytes at once with unsigned saturation */
		out[k] = _m_psubusb(in1[k], in2[k]);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count / 8 = number of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1012:
		movq mm1, [eax]		/* mm1 = 8 bytes of Src1 */
		psubusb mm1, [ebx]	/* mm1 = Src1 - Src2, saturated per byte */
		movq [edi], mm1		/* write 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next 8-byte group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L1012		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}
00411 
/*
 * Filter using Sub: D = saturation0(S1 - S2).
 *
 * Src1        Source byte buffer (minuend).
 * Src2        Source byte buffer (subtrahend).
 * Dest        Destination byte buffer.
 * length      Number of bytes to process in each buffer.
 *
 * Returns 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine only covers complete 8-byte groups, and it returns
	 * non-zero without touching Dest when MMX support was not compiled in.
	 * Skip the leading bytes only if it really processed them; otherwise
	 * the C loop below handles the whole buffer.
	 */
	if (SDL_imageFilterMMXdetect() && (length > 7) &&
	    (SDL_imageFilterSubMMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & ~7U;
	}

	/* C routine: remaining bytes, or the whole buffer */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - (int) Src2[i];
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00471 
/*
 * Internal MMX routine for the absolute difference: D = | S1 - S2 |.
 *
 * Both saturated differences S1-S2 and S2-S1 are formed; for every byte one
 * of them is zero, so OR-ing them yields the absolute difference.
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * SrcLength   Buffer length in bytes. Only the SrcLength/8 complete 8-byte
 *             groups are processed; remaining bytes are left to the caller.
 *             The __asm variant tests its counter only after the first
 *             iteration, so it must be called with SrcLength >= 8.
 *
 * Returns 0 when built with USE_MMX, or -1 (nothing processed) otherwise.
 */
static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics version: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int k;

	for (k = 0; k < groups; k++) {
		__m64 d12 = _m_psubusb(in1[k], in2[k]);	/* Src1 - Src2, saturated per byte */
		__m64 d21 = _m_psubusb(in2[k], in1[k]);	/* Src2 - Src1, saturated per byte */
		out[k] = _m_por(d12, d21);		/* merge the two one-sided differences */
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count / 8 = number of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1013:
		movq mm1, [eax]		/* mm1 = 8 bytes of Src1 */
		movq mm2, [ebx]		/* mm2 = 8 bytes of Src2 */
		psubusb mm1, [ebx]	/* mm1 = Src1 - Src2, saturated per byte */
		psubusb mm2, [eax]	/* mm2 = Src2 - Src1, saturated per byte */
		por mm1, mm2		/* merge the two one-sided differences */
		movq [edi], mm1		/* write 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next 8-byte group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L1013		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}
00531 
/*
 * Filter using AbsDiff: D = | S1 - S2 |.
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * length      Number of bytes to process in each buffer.
 *
 * Returns 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine only covers complete 8-byte groups, and it returns
	 * non-zero without touching Dest when MMX support was not compiled in.
	 * Skip the leading bytes only if it really processed them; otherwise
	 * the C loop below handles the whole buffer.
	 */
	if (SDL_imageFilterMMXdetect() && (length > 7) &&
	    (SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & ~7U;
	}

	/* C routine: remaining bytes, or the whole buffer */
	for (i = istart; i < length; i++) {
		result = abs((int) Src1[i] - (int) Src2[i]);
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00589 
/*
 * Internal MMX routine for saturated multiplication: D = saturation255(S1 * S2).
 *
 * Bytes are widened to 16-bit words, multiplied (low 16 bits kept), the
 * absolute value of each word taken as a signed quantity is formed, and the
 * words are packed back to bytes with unsigned saturation.
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * SrcLength   Buffer length in bytes. Only the SrcLength/8 complete 8-byte
 *             groups are processed; remaining bytes are left to the caller.
 *             The __asm variant tests its counter only after the first
 *             iteration, so it must be called with SrcLength >= 8.
 *
 * Returns 0 when built with USE_MMX, or -1 (nothing processed) otherwise.
 */
static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics version: works on i386 and x86_64, same steps as the __asm variant */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const __m64 zero = _m_from_int(0);	/* all-zero register used for widening */
	const unsigned int groups = SrcLength / 8;
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* widen low/high 4 bytes of each source to words and multiply */
		__m64 lo = _m_pmullw(_m_punpcklbw(in1[k], zero), _m_punpcklbw(in2[k], zero));
		__m64 hi = _m_pmullw(_m_punpckhbw(in1[k], zero), _m_punpckhbw(in2[k], zero));
		/* per-word sign mask: all ones for negative words, zero otherwise */
		__m64 lo_sign = _m_psrawi(lo, 15);
		__m64 hi_sign = _m_psrawi(hi, 15);
		/* absolute value: one's complement of negative words, then subtract -1 */
		lo = _m_psubsw(_m_pxor(lo, lo_sign), lo_sign);
		hi = _m_psubsw(_m_pxor(hi, hi_sign), hi_sign);
		/* pack words back into bytes with unsigned saturation */
		out[k] = _m_packuswb(lo, hi);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count / 8 = number of 8-byte groups */
		pxor mm0, mm0		/* mm0 = 0, used for widening */
		align 16		/* align the loop entry on 16 bytes */
L1014:
		movq mm1, [eax]		/* mm1 = 8 bytes of Src1 */
		movq mm3, [ebx]		/* mm3 = 8 bytes of Src2 */
		movq mm2, mm1		/* mm2 = copy of Src1 bytes */
		movq mm4, mm3		/* mm4 = copy of Src2 bytes */
		punpcklbw mm1, mm0	/* low  4 bytes of Src1 -> words */
		punpckhbw mm2, mm0	/* high 4 bytes of Src1 -> words */
		punpcklbw mm3, mm0	/* low  4 bytes of Src2 -> words */
		punpckhbw mm4, mm0	/* high 4 bytes of Src2 -> words */
		pmullw mm1, mm3		/* products of the low  bytes */
		pmullw mm2, mm4		/* products of the high bytes */
		/* absolute value of the products, read as signed words */
		movq mm5, mm1		/* mm5 = copy of low products */
		movq mm6, mm2		/* mm6 = copy of high products */
		psraw mm5, 15		/* mm5 = per-word sign mask */
		psraw mm6, 15		/* mm6 = per-word sign mask */
		pxor mm1, mm5		/* one's complement of negative words only */
		pxor mm2, mm6		/* one's complement of negative words only */
		psubsw mm1, mm5		/* add 1 to negative words: W-(-1) or W-0 */
		psubsw mm2, mm6		/* add 1 to negative words: W-(-1) or W-0 */
		packuswb mm1, mm2	/* pack words into bytes with saturation */
		movq [edi], mm1		/* write 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next 8-byte group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L1014		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}
00718 
/*
 * Filter using Mult: D = saturation255(S1 * S2).
 *
 * Src1, Src2  Source byte buffers.
 * Dest        Destination byte buffer.
 * length      Number of bytes to process in each buffer.
 *
 * Returns 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine only covers complete 8-byte groups, and it returns
	 * non-zero without touching Dest when MMX support was not compiled in.
	 * Skip the leading bytes only if it really processed them; otherwise
	 * the C loop below handles the whole buffer.
	 */
	if (SDL_imageFilterMMXdetect() && (length > 7) &&
	    (SDL_imageFilterMultMMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & ~7U;
	}

	/*
	 * C routine: remaining bytes, or the whole buffer.
	 * The MMX routine takes the absolute value of the 16-bit product read
	 * as a signed word and packs it with unsigned saturation; for 8-bit
	 * operands that is 255 for every product above 255, i.e. the same
	 * clamped product computed here.
	 */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] * (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00781 
/*!
\brief Internal ASM routine: byte-wise multiply without saturation.

For every index the low 8 bits of Src1[i] * Src2[i] are written to Dest[i];
the high byte of the 16-bit product is discarded.

\param Src1 Pointer to the start of the first source byte array.
\param Src2 Pointer to the start of the second source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength Number of bytes to process.

\return 0 when the routine is compiled in (USE_MMX defined), -1 otherwise
        (in which case Dest is left untouched).
*/
int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops below decrement before testing, so a zero count would wrap
	   around and run far past the end of the buffers. */
	if (SrcLength == 0)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Src1		/* load Src1 address into edx */
		mov esi, Src2		/* load Src2 address into esi */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		align 16		/* 16 byte alignment of the loop entry */
L10141:
		mov al, [edx]		/* load a byte from Src1 */
		mul [esi]		/* mul with a byte from Src2 */
		mov [edi], al		/* move a byte result to Dest */
		inc edx			/* increment Src1, Src2, Dest */
		inc esi			/* pointer registers by one */
		inc edi
		dec ecx			/* decrease loop counter */
		jnz L10141		/* check loop termination, proceed if required */
		popa
	}
#else
	/* Note: ~5% gain on i386, less efficient than C on x86_64 */
	/* Also depends on whether this function is static (?!) */
	/* __asm__/__volatile__ are used because the plain 'asm' keyword is not
	   available in strict ISO modes (-std=c99, -std=c11). */
	__asm__ __volatile__ (
		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
		/* 'i386' is only predefined in GNU modes; '__i386__' is always
		   predefined on 32-bit x86, so accept either. */
#  if defined(i386) || defined(__i386__)
		"1:mov  (%%edx), %%al \n\t"	/* load a byte from Src1 */
		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
		"mov %%al, (%%edi)  \n\t"	/* move a byte result to Dest */
		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
		"inc %%esi \n\t"		/* pointer registers by one */
		"inc %%edi \n\t"
		"dec %%ecx      \n\t"	/* decrease loop counter */
#  elif defined(__x86_64__)
		"1:mov  (%%rdx), %%al \n\t"	/* load a byte from Src1 */
		"mulb (%%rsi)       \n\t"	/* mul with a byte from Src2 */
		"mov %%al, (%%rdi)  \n\t"	/* move a byte result to Dest */
		"inc %%rdx \n\t"		/* increment Src1, Src2, Dest */
		"inc %%rsi \n\t"		/* pointer registers by one */
		"inc %%rdi \n\t"
		/* SrcLength is a 32-bit operand: only %ecx is defined, the upper
		   half of %rcx is unspecified, so count in %ecx. */
		"dec %%ecx      \n\t"	/* decrease loop counter */
#  endif
		"jnz 1b         \n\t"	/* check loop termination, proceed if required */
		: "+d" (Src1),		/* Src1 address in edx/rdx, advanced by the loop */
		  "+S" (Src2),		/* Src2 address in esi/rsi, advanced by the loop */
		  "+c" (SrcLength),	/* loop counter (SIZE) in ecx, consumed by the loop */
		  "+D" (Dest)		/* Dest address in edi/rdi, advanced by the loop */
		:
		: "memory", "cc", "rax"	/* *Dest written, flags and AX (mulb result) modified */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
00851 
/*!
\brief Filter using MultNor: D = S1 * S2 (non-MMX), keeping the low 8 bits of each product.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (SDL_imageFilterMMXdetect()) {
		/* The ASM routine works byte by byte and covers the whole buffer,
		   so on success nothing is left to do. Re-running the C loop on the
		   tail would multiply those bytes a second time when Dest aliases
		   one of the sources. */
		if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0)
			return (0);
		/* A non-zero result means the routine is not compiled in and did
		   not touch Dest: fall through to the C implementation. */
	}

	/* C routine to process the whole image */
	for (i = 0; i < length; i++) {
		/* Truncation to 8 bits is intentional (no saturation). */
		Dest[i] = (unsigned char) ((int) Src1[i] * (int) Src2[i]);
	}

	return (0);
}
00912 
/*!
\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)

Processes SrcLength/8 blocks of 8 bytes; any remaining bytes are not touched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return 0 when the routine is compiled in (USE_MMX defined), -1 otherwise
        (in which case Dest is left untouched).
*/
static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* load Src1 address into eax */
		mov ebx, Src2		/* load Src2 address into ebx */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		pxor mm0, mm0		/* zero mm0 register */
		align 16		/* 16 byte alignment of the loop entry */
L1015:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]		/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1		/* copy mm1 into mm2 */
		movq mm4, mm3		/* copy mm3 into mm4 */
		punpcklbw mm1, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm2, mm0	/* unpack high bytes of Src1 into words */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src2 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src2 into words */
		psrlw mm1, 1		/* divide mm1 words by 2, Src1 low bytes */
		psrlw mm2, 1		/* divide mm2 words by 2, Src1 high bytes */
		pmullw mm1, mm3		/* mul low  bytes of Src1 and Src2 */
		pmullw mm2, mm4		/* mul high bytes of Src1 and Src2 */
		packuswb mm1, mm2	/* pack words back into bytes with saturation */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* increase Src1, Src2 and Dest */
		add ebx, 8		/* register pointers by 8 */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1015		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0);			/* all-zero register used for unpacking */
	const unsigned int blocks = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		__m64 mm1, mm2, mm3, mm4;
		mm1 = _m_punpcklbw(mSrc1[i], mm0);	/* unpack low  bytes of Src1 into words */
		mm2 = _m_punpckhbw(mSrc1[i], mm0);	/* unpack high bytes of Src1 into words */
		mm3 = _m_punpcklbw(mSrc2[i], mm0);	/* unpack low  bytes of Src2 into words */
		mm4 = _m_punpckhbw(mSrc2[i], mm0);	/* unpack high bytes of Src2 into words */
		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2 */
		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
		mDest[i] = _m_packuswb(mm1, mm2);	/* pack words back into bytes with saturation */
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00989 
/*!
\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be handled by the C loop */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole blocks of 8 bytes. It returns
		   non-zero without touching Dest when it is not compiled in; only
		   skip the bytes it covered when it reports success. */
		if (SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining tail after MMX, or the whole image otherwise */
	for (i = istart; i < length; i++) {
		result = ((int) Src1[i] / 2) * (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
01049 
/*!
\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Processes SrcLength/8 blocks of 8 bytes; any remaining bytes are not touched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return 0 when the routine is compiled in (USE_MMX defined), -1 otherwise
        (in which case Dest is left untouched).
*/
static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* load Src1 address into eax */
		mov ebx, Src2		/* load Src2 address into ebx */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		pxor mm0, mm0		/* zero mm0 register */
		align 16		/* 16 byte alignment of the loop entry */
L1016:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]		/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1		/* copy mm1 into mm2 */
		movq mm4, mm3		/* copy mm3 into mm4 */
		punpcklbw mm1, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm2, mm0	/* unpack high bytes of Src1 into words */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src2 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src2 into words */
		psrlw mm1, 1		/* divide mm1 words by 2, Src1 low bytes */
		psrlw mm2, 1		/* divide mm2 words by 2, Src1 high bytes */
		psrlw mm3, 1		/* divide mm3 words by 2, Src2 low bytes */
		psrlw mm4, 1		/* divide mm4 words by 2, Src2 high bytes */
		pmullw mm1, mm3		/* mul low  bytes of Src1 and Src2 */
		pmullw mm2, mm4		/* mul high bytes of Src1 and Src2 */
		packuswb mm1, mm2	/* pack words back into bytes with saturation */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* increase Src1, Src2 and Dest */
		add ebx, 8		/* register pointers by 8 */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1016		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0);			/* all-zero register used for unpacking */
	const unsigned int blocks = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		__m64 mm1, mm2, mm3, mm4;
		mm1 = _m_punpcklbw(mSrc1[i], mm0);	/* unpack low  bytes of Src1 into words */
		mm2 = _m_punpckhbw(mSrc1[i], mm0);	/* unpack high bytes of Src1 into words */
		mm3 = _m_punpcklbw(mSrc2[i], mm0);	/* unpack low  bytes of Src2 into words */
		mm4 = _m_punpckhbw(mSrc2[i], mm0);	/* unpack high bytes of Src2 into words */
		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
		mm3 = _m_psrlwi(mm3, 1);		/* divide mm3 words by 2, Src2 low bytes */
		mm4 = _m_psrlwi(mm4, 1);		/* divide mm4 words by 2, Src2 high bytes */
		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2 */
		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
		mDest[i] = _m_packuswb(mm1, mm2);	/* pack words back into bytes with saturation */
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01130 
/*!
\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be handled by the C loop */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole blocks of 8 bytes. It returns
		   non-zero without touching Dest when it is not compiled in; only
		   skip the bytes it covered when it reports success. */
		if (SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining tail after MMX, or the whole image otherwise */
	for (i = istart; i < length; i++) {
		result = ((int) Src1[i] / 2) * ((int) Src2[i] / 2);
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
01190 
/*!
\brief Internal MMX Filter using BitAnd: D = S1 & S2

Processes SrcLength/8 blocks of 8 bytes; any remaining bytes are not touched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return 0 when the routine is compiled in (USE_MMX defined), -1 otherwise.
*/
static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* eax <- Src1 address */
		mov ebx, Src2		/* ebx <- Src2 address */
		mov edi, Dest		/* edi <- Dest address */
		mov ecx, SrcLength	/* ecx <- byte count */
		shr ecx, 3		/* byte count -> number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L1017:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		pand mm1, [ebx]		/* AND them with 8 bytes of Src2 */
		movq [edi], mm1		/* write the 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next block */
		add edi, 8
		dec ecx			/* one block done */
		jnz L1017		/* repeat until the block counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics version, usable on both i386 and x86_64 */
	__m64 *in1 = (__m64*)Src1;
	__m64 *in2 = (__m64*)Src2;
	__m64 *out = (__m64*)Dest;
	const unsigned int blocks = SrcLength / 8;
	unsigned int k;
	for (k = 0; k < blocks; k++) {
		out[k] = _m_pand(in1[k], in2[k]);	/* 8 bytes of Src1 & Src2 */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01267 
/*!
\brief Filter using BitAnd: D = S1 & S2

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be handled by the C loop */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect() > 0) && (length > 7)) {
		/* The MMX routine handles whole blocks of 8 bytes. It returns
		   non-zero without touching Dest when it is not compiled in; only
		   skip the bytes it covered when it reports success. */
		if (SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining tail after MMX, or the whole image otherwise */
	for (i = istart; i < length; i++) {
		Dest[i] = Src1[i] & Src2[i];
	}

	return (0);
}
01326 
/*!
\brief Internal MMX Filter using BitOr: D = S1 | S2

Processes SrcLength/8 blocks of 8 bytes; any remaining bytes are not touched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return 0 when the routine is compiled in (USE_MMX defined), -1 otherwise.
*/
static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* eax <- Src1 address */
		mov ebx, Src2		/* ebx <- Src2 address */
		mov edi, Dest		/* edi <- Dest address */
		mov ecx, SrcLength	/* ecx <- byte count */
		shr ecx, 3		/* byte count -> number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L91017:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		por mm1, [ebx]		/* OR them with 8 bytes of Src2 */
		movq [edi], mm1		/* write the 8 result bytes to Dest */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next block */
		add edi, 8
		dec ecx			/* one block done */
		jnz L91017		/* repeat until the block counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics version, usable on both i386 and x86_64 */
	__m64 *in1 = (__m64*)Src1;
	__m64 *in2 = (__m64*)Src2;
	__m64 *out = (__m64*)Dest;
	const unsigned int blocks = SrcLength / 8;
	unsigned int k;
	for (k = 0; k < blocks; k++) {
		out[k] = _m_por(in1[k], in2[k]);	/* 8 bytes of Src1 | Src2 */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01381 
/*!
\brief Filter using BitOr: D = S1 | S2

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be handled by the C loop */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole blocks of 8 bytes. It returns
		   non-zero without touching Dest when it is not compiled in; only
		   skip the bytes it covered when it reports success. */
		if (SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining tail after MMX, or the whole image otherwise */
	for (i = istart; i < length; i++) {
		Dest[i] = Src1[i] | Src2[i];
	}

	return (0);
}
01437 
/*!
\brief Internal ASM routine: byte-wise divide, D = S1 / S2.

A zero divisor byte yields 255 in the destination.

\param Src1 Pointer to the start of the first source byte array (S1, dividend).
\param Src2 Pointer to the start of the second source byte array (S2, divisor).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength Number of bytes to process.

\return 0 when the routine is compiled in (USE_MMX defined), -1 otherwise
        (in which case Dest is left untouched).
*/
static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops below decrement before testing, so a zero count would wrap
	   around and run far past the end of the buffers. */
	if (SrcLength == 0)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Src1		/* load Src1 address into edx */
		mov esi, Src2		/* load Src2 address into esi */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		align 16		/* 16 byte alignment of the loop entry */
L10191:
		mov bl, [esi]		/* load a byte from Src2 */
		cmp bl, 0		/* check if it zero */
		jnz L10192
		mov [edi], 255		/* division by zero = 255 !!! */
		jmp  L10193
L10192:
		xor ah, ah		/* prepare AX, zero AH register */
		mov al, [edx]		/* load a byte from Src1 into AL */
		div   bl		/* divide AL by BL */
		mov [edi], al		/* move a byte result to Dest */
L10193:
		inc edx			/* increment Src1, Src2, Dest */
		inc esi			/* pointer registers by one */
		inc edi
		dec ecx			/* decrease loop counter */
		jnz L10191		/* check loop termination, proceed if required */
		popa
	}
#else
	/* Note: ~15% gain on i386, less efficient than C on x86_64 */
	/* Also depends on whether the function is static (?!) */
	/* Also depends on whether we work on malloc() or static char[] */
	/* __asm__/__volatile__ are used because the plain 'asm' keyword is not
	   available in strict ISO modes (-std=c99, -std=c11). */
	__asm__ __volatile__ (
		/* 'i386' is only predefined in GNU modes; '__i386__' is always
		   predefined on 32-bit x86, so accept either. */
#  if defined(i386) || defined(__i386__)
		"pushl %%ebx \n\t"		/* %ebx may be the PIC register.  */
		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
		"1: mov (%%esi), %%bl  \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl    \n\t"	/* check if it zero */
		"jnz 2f                \n\t"
		"movb  $255, (%%edi)   \n\t"	/* division by zero = 255 !!! */
		"jmp 3f                \n\t"
		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%edx), %%al   \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl            \n\t"	/* divide AL by BL */
		"mov   %%al, (%%edi)   \n\t"	/* move a byte result to Dest */
		"3: inc %%edx          \n\t"	/* increment Src1, Src2, Dest */
		"inc %%esi \n\t"		/* pointer registers by one */
		"inc %%edi \n\t"
		"dec %%ecx \n\t"		/* decrease loop counter */
		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
		"popl %%ebx \n\t"		/* restore %ebx */
		: "+d" (Src1),		/* load Src1 address into edx */
		  "+S" (Src2),		/* load Src2 address into esi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into edi */
		:
		: "memory", "cc", "rax"	/* *Dest written, flags and AX modified */
#  elif defined(__x86_64__)
		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
		"1: mov (%%rsi), %%bl  \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl    \n\t"	/* check if it zero */
		"jnz 2f                \n\t"
		"movb  $255, (%%rdi)   \n\t"	/* division by zero = 255 !!! */
		"jmp 3f                \n\t"
		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%rdx), %%al   \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl            \n\t"	/* divide AL by BL */
		"mov   %%al, (%%rdi)   \n\t"	/* move a byte result to Dest */
		"3: inc %%rdx          \n\t"	/* increment Src1, Src2, Dest */
		"inc %%rsi \n\t"		/* pointer registers by one */
		"inc %%rdi \n\t"
		/* SrcLength is a 32-bit operand: only %ecx is defined, the upper
		   half of %rcx is unspecified, so count in %ecx. */
		"dec %%ecx \n\t"		/* decrease loop counter */
		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
		: "+d" (Src1),		/* load Src1 address into rdx */
		  "+S" (Src2),		/* load Src2 address into rsi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into rdi */
		:
		: "memory", "cc", "rax", "rbx"	/* *Dest written, flags, AX and BL modified */
#  endif
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
01538 
/*!
\brief Filter using Div: D = S1 / S2, computed independently for every byte.

A divisor byte of zero stores 255 in the destination instead of dividing.

\param Src1 Pointer to the start of the first source byte array (dividends).
\param Src2 Pointer to the start of the second source byte array (divisors).
\param Dest Pointer to the start of the destination byte array.
\param length The number of bytes to process in each array.

\return Returns 0 for success (including length == 0) or -1 if any pointer is NULL.
*/
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Prefer the assembler routine. It works one byte at a time, so a
	 * successful call never leaves unprocessed bytes behind. Its result is
	 * checked because the routine returns -1 without writing anything when
	 * its assembler body is compiled out; in that case the portable loop
	 * below does the work instead of reporting a false success.
	 */
	if (SDL_imageFilterMMXdetect() &&
	    (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0)) {
		return (0);
	}

	/* Portable C routine over the whole buffer */
	for (i = 0; i < length; i++) {
		if (Src2[i] == 0) {
			Dest[i] = 255;	/* division by zero = 255 */
		} else {
			Dest[i] = (unsigned char) (Src1[i] / Src2[i]);
		}
	}

	return (0);
}
01605 
01606 /* ------------------------------------------------------------------------------------ */
01607 
/*!
\brief Internal MMX implementation of filter using BitNegation: D = ~S

Only complete groups of 8 bytes are processed (SrcLength / 8 of them); any
remaining SrcLength % 8 bytes are left for the caller. The MSVC loop tests its
counter only after the first iteration, so SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength The number of bytes in the source array.

\return Returns 0 for success, or -1 (without touching Dest) when built without USE_MMX.
*/
static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		mov eax, Src1		/* load Src1 address into eax */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L91117:
		movq mm0, [eax]		/* load 8 bytes from Src1 into mm0 */
		pxor mm0, mm1		/* negate mm0 by xoring with mm1 */
		movq [edi], mm0		/* store result in Dest */
		add eax, 8		/* advance Src1 and Dest by 8 bytes */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L91117		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	unsigned int i;
	/*
	 * Build the all-ones constant from a defined value. Comparing an
	 * uninitialized variable with itself still reads an indeterminate
	 * value as far as the C language is concerned.
	 */
	__m64 mm1 = _mm_setzero_si64();
	mm1 = _m_pcmpeqb(mm1, mm1);		/* every byte equals itself -> all 1's */
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_pxor(*mSrc1, mm1);	/* negate by xoring with all 1's */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */

#endif
	return (0);
#else
	return (-1);
#endif
}
01661 
/*!
\brief Filter using BitNegation: D = ~S

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param length The number of bytes to process.

\return Returns 0 for success (including length == 0) or -1 if a pointer is NULL.
*/
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/*
		 * The MMX routine handles whole groups of 8 bytes. It returns -1
		 * without writing anything when built without USE_MMX, so the
		 * processed prefix is only skipped when it reports success.
		 */
		if (SDL_imageFilterBitNegationMMX(Src1, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or just the bytes left over after MMX */
	for (i = istart; i < length; i++) {
		Dest[i] = (unsigned char) ~Src1[i];
	}

	return (0);
}
01713 
/*!
\brief Internal MMX implementation of filter using AddByte: D = saturation255(S + C)

Works on SrcLength / 8 complete groups of 8 bytes; the remaining bytes are
not touched. The MSVC loop checks its counter after the first iteration, so
callers are expected to pass SrcLength >= 8.

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength The number of bytes in the source array.
\param C Constant value to add to every byte.

\return Returns 0 for success, or -1 when built without USE_MMX.
*/
static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Replicate C into all 8 bytes of MM1 ** */
		mov al, C		/* AL = C */
		mov ah, al		/* AX = C:C */
		mov bx, ax		/* keep the 16-bit pattern in BX */
		shl eax, 16		/* move the pattern into the upper half of EAX */
		mov ax, bx		/* EAX = C:C:C:C */
		movd mm1, eax		/* low dword of MM1 = pattern */
		movd mm2, eax		/* low dword of MM2 = pattern */
		punpckldq mm1, mm2	/* MM1 = pattern in both dwords */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx, 3		/* ECX = number of 8-byte groups */
		align 16		/* 16 byte alignment of the loop entry */
L1021:
		movq mm0, [eax]		/* fetch 8 source bytes */
		paddusb mm0, mm1	/* add C to each byte with unsigned saturation */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one group done */
		jnz L1021		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int g;
	int packed;

	/* Replicate C into the 4 bytes of an int, then into both halves of a 64-bit lane */
	memset(&packed, C, 4);
	const __m64 addend = _m_punpckldq(_m_from_int(packed), _m_from_int(packed));

	for (g = 0; g < groups; g++) {
		out[g] = _m_paddusb(in[g], addend);	/* Src1+C with unsigned saturation */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01779 
/*!
\brief Filter using AddByte: D = saturation255(S + C)

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param length The number of bytes to process.
\param C Constant value to add to every byte.

\return Returns 0 for success (including length == 0) or -1 if a pointer is NULL.
*/
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0 leaves every byte unchanged, so the result is a
	 * plain copy of the source into the destination. memmove tolerates
	 * overlapping buffers; an in-place call needs no copy at all.
	 */
	if (C == 0) {
		if (Src1 != Dest) {
			memmove(Dest, Src1, length);
		}
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/*
		 * The MMX routine handles whole groups of 8 bytes. It returns -1
		 * without writing anything when built without USE_MMX, so the
		 * processed prefix is only skipped when it reports success.
		 */
		if (SDL_imageFilterAddByteMMX(Src1, Dest, length, C) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or just the bytes left over after MMX */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + (int) C;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
01845 
/*!
\brief Internal MMX implementation of filter using AddUint: per-byte saturating add of a 32-bit constant pattern.

Works on SrcLength / 8 complete groups of 8 bytes; the remaining bytes are
not touched. The MSVC loop checks its counter after the first iteration, so
callers are expected to pass SrcLength >= 8.

NOTE(review): the MSVC path packs C into the low dword and D into the high
dword of the addend, while the intrinsic path packs C into both dwords and
never reads D. Confirm which layout is intended.

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength The number of bytes in the source array.
\param C Constant to add; its 4 bytes are applied to consecutive source bytes.
\param D Second constant, used only by the MSVC path (high dword of the addend).

\return Returns 0 for success, or -1 when built without USE_MMX.
*/
static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Build the 8-byte addend in MM1 ** */
		mov eax, C		/* EAX = C */
		movd mm1, eax		/* low dword of MM1 = C */
		mov eax, D		/* EAX = D */
		movd mm2, eax		/* low dword of MM2 = D */
		punpckldq mm1, mm2	/* MM1 = C (low dword), D (high dword) */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx, 3		/* ECX = number of 8-byte groups */
		align 16		/* 16 byte alignment of the loop entry */
L11023:
		movq mm0, [eax]		/* fetch 8 source bytes */
		paddusb mm0, mm1	/* add with unsigned saturation, byte by byte */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one group done */
		jnz L11023		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int g;

	/* Put (int)C into both dwords of the 64-bit addend */
	const __m64 lane = _m_from_int(C);
	const __m64 addend = _m_punpckldq(lane, lane);

	for (g = 0; g < groups; g++) {
		out[g] = _m_paddusb(in[g], addend);	/* Src1+C with unsigned saturation */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01908 
/*!
\brief Filter using AddUint: saturating add of a 4-byte constant pattern.

In the C routine, destination byte i receives
saturation255(Src1[i] + ((C >> (8 * (i % 4))) & 0xff)).

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param length The number of bytes to process.
\param C Constant whose 4 bytes are added to consecutive source bytes.

\return Returns 0 for success (including length == 0) or -1 if a pointer is NULL.
*/
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i, D;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0 leaves every byte unchanged, so the result is a
	 * plain copy of the source into the destination. memmove tolerates
	 * overlapping buffers; an in-place call needs no copy at all.
	 */
	if (C == 0) {
		if (Src1 != Dest) {
			memmove(Dest, Src1, length);
		}
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/*
		 * The MMX routine handles whole groups of 8 bytes. It returns -1
		 * without writing anything when built without USE_MMX, so the
		 * processed prefix is only skipped when it reports success.
		 */
		D = SWAP_32(C);
		if (SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* Split C into its bytes, least significant first */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[0] = (int) ((C >>  0) & 0xff);

	/*
	 * C routine: whole image, or just the bytes left over after MMX.
	 * istart is either 0 or a multiple of 8, so indexing the pattern with
	 * i % 4 keeps it in phase with the start of the buffer.
	 */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC[i % 4];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
01981 
/*!
\brief Internal MMX implementation of filter using AddByteToHalf: D = saturation255(S/2 + C)

Each group of 8 bytes is shifted right by one bit as four 16-bit words, then
ANDed with Mask before C is added. Works on SrcLength / 8 complete groups;
the remaining bytes are not touched. The MSVC loop checks its counter after
the first iteration, so callers are expected to pass SrcLength >= 8.

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength The number of bytes in the source array.
\param C Constant value to add to every halved byte.
\param Mask Pointer to 8 mask bytes applied after the shift.

\return Returns 0 for success, or -1 when built without USE_MMX.
*/
static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
					   unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Replicate C into all 8 bytes of MM1 ** */
		mov al, C		/* AL = C */
		mov ah, al		/* AX = C:C */
		mov bx, ax		/* keep the 16-bit pattern in BX */
		shl eax, 16		/* move the pattern into the upper half of EAX */
		mov ax, bx		/* EAX = C:C:C:C */
		movd mm1, eax		/* low dword of MM1 = pattern */
		movd mm2, eax		/* low dword of MM2 = pattern */
		punpckldq mm1, mm2	/* MM1 = pattern in both dwords */
		mov edx, Mask		/* EDX = mask pointer */
		movq mm0, [edx]		/* MM0 = 8 mask bytes */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx, 3		/* ECX = number of 8-byte groups */
		align 16		/* 16 byte alignment of the loop entry */
L1022:
		movq mm2, [eax]		/* fetch 8 source bytes */
		psrlw mm2, 1		/* shift the 4 words right by one bit */
		pand mm2, mm0		/* apply Mask to the 8 bytes */
		paddusb mm2, mm1	/* add C to each byte with unsigned saturation */
		movq [edi], mm2		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one group done */
		jnz L1022		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	const __m64 *maskp = (const __m64 *) Mask;
	const unsigned int groups = SrcLength / 8;
	unsigned int g;
	int packed;

	/* Replicate C into the 4 bytes of an int, then into both halves of a 64-bit lane */
	memset(&packed, C, 4);
	const __m64 addend = _m_punpckldq(_m_from_int(packed), _m_from_int(packed));

	for (g = 0; g < groups; g++) {
		__m64 half = _m_psrlwi(in[g], 1);	/* shift the 4 words right by one bit */
		half = _m_pand(half, *maskp);		/* apply Mask to the 8 bytes */
		out[g] = _m_paddusb(addend, half);	/* Src1/2+C with unsigned saturation */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
02057 
/*!
\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param length The number of bytes to process.
\param C Constant value to add to every halved byte.

\return Returns 0 for success (including length == 0) or -1 if a pointer is NULL.
*/
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	/* Clears the top bit of every byte after the MMX word-wise right shift */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/*
		 * The MMX routine handles whole groups of 8 bytes. It returns -1
		 * without writing anything when built without USE_MMX, so the
		 * processed prefix is only skipped when it reports success.
		 */
		if (SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or just the bytes left over after MMX */
	for (i = istart; i < length; i++) {
		result = (int) (Src1[i] / 2) + (int) C;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
02119 
/*!
\brief Internal MMX implementation of filter using SubByte: D = saturation0(S - C)

Works on SrcLength / 8 complete groups of 8 bytes; the remaining bytes are
not touched. The MSVC loop checks its counter after the first iteration, so
callers are expected to pass SrcLength >= 8.

NOTE(review): unlike the other *MMX helpers in this file, this one is not
declared static. Confirm whether external linkage is intended.

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength The number of bytes in the source array.
\param C Constant value to subtract from every byte.

\return Returns 0 for success, or -1 when built without USE_MMX.
*/
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Replicate C into all 8 bytes of MM1 ** */
		mov al, C		/* AL = C */
		mov ah, al		/* AX = C:C */
		mov bx, ax		/* keep the 16-bit pattern in BX */
		shl eax, 16		/* move the pattern into the upper half of EAX */
		mov ax, bx		/* EAX = C:C:C:C */
		movd mm1, eax		/* low dword of MM1 = pattern */
		movd mm2, eax		/* low dword of MM2 = pattern */
		punpckldq mm1, mm2	/* MM1 = pattern in both dwords */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx, 3		/* ECX = number of 8-byte groups */
		align 16		/* 16 byte alignment of the loop entry */
L1023:
		movq mm0, [eax]		/* fetch 8 source bytes */
		psubusb mm0, mm1	/* subtract C from each byte with unsigned saturation */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one group done */
		jnz L1023		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int g;
	int packed;

	/* Replicate C into the 4 bytes of an int, then into both halves of a 64-bit lane */
	memset(&packed, C, 4);
	const __m64 subtrahend = _m_punpckldq(_m_from_int(packed), _m_from_int(packed));

	for (g = 0; g < groups; g++) {
		out[g] = _m_psubusb(in[g], subtrahend);	/* Src1-C with unsigned saturation */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
02185 
/*!
\brief Filter using SubByte: D = saturation0(S - C)

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param length The number of bytes to process.
\param C Constant value to subtract from every byte.

\return Returns 0 for success (including length == 0) or -1 if a pointer is NULL.
*/
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0 leaves every byte unchanged, so the result is a
	 * plain copy of the source into the destination. memmove tolerates
	 * overlapping buffers; an in-place call needs no copy at all.
	 */
	if (C == 0) {
		if (Src1 != Dest) {
			memmove(Dest, Src1, length);
		}
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/*
		 * The MMX routine handles whole groups of 8 bytes. It returns -1
		 * without writing anything when built without USE_MMX, so the
		 * processed prefix is only skipped when it reports success.
		 */
		if (SDL_imageFilterSubByteMMX(Src1, Dest, length, C) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or just the bytes left over after MMX */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - (int) C;
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
02251 
/*!
\brief Internal MMX implementation of filter using SubUint: per-byte saturating subtract of a 32-bit constant pattern.

Works on SrcLength / 8 complete groups of 8 bytes; the remaining bytes are
not touched. The MSVC loop checks its counter after the first iteration, so
callers are expected to pass SrcLength >= 8.

NOTE(review): the MSVC path packs C into the low dword and D into the high
dword of the subtrahend, while the intrinsic path packs C into both dwords
and never reads D. Confirm which layout is intended.

\param Src1 Pointer to the start of the source byte array.
\param Dest Pointer to the start of the destination byte array.
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract; its 4 bytes are applied to consecutive source bytes.
\param D Second constant, used only by the MSVC path (high dword of the subtrahend).

\return Returns 0 for success, or -1 when built without USE_MMX.
*/
static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Build the 8-byte subtrahend in MM1 ** */
		mov eax, C		/* EAX = C */
		movd mm1, eax		/* low dword of MM1 = C */
		mov eax, D		/* EAX = D */
		movd mm2, eax		/* low dword of MM2 = D */
		punpckldq mm1, mm2	/* MM1 = C (low dword), D (high dword) */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx, 3		/* ECX = number of 8-byte groups */
		align 16		/* 16 byte alignment of the loop entry */
L11024:
		movq mm0, [eax]		/* fetch 8 source bytes */
		psubusb mm0, mm1	/* subtract with unsigned saturation, byte by byte */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one group done */
		jnz L11024		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;
	unsigned int g;

	/* Put (int)C into both dwords of the 64-bit subtrahend */
	const __m64 lane = _m_from_int(C);
	const __m64 subtrahend = _m_punpckldq(lane, lane);

	for (g = 0; g < groups; g++) {
		out[g] = _m_psubusb(in[g], subtrahend);	/* Src1-C with unsigned saturation */
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
02314 
/*!
\brief Filter using SubUint: D[i] = saturation0(S[i] - Cs[i % 4]), Cs = the four bytes of C.

In the C fallback, byte i of the source has byte (i % 4) of C subtracted from
it (least significant byte of C first); results below 0 are clamped to 0.
When MMX is available and length > 7, the leading whole groups of 8 bytes are
handed to SDL_imageFilterSubUintMMX and only the remaining bytes use the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant whose four bytes are subtracted cyclically.

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
        unsigned int i, istart, D;
        int iC[4];
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Special case: C==0 leaves every byte unchanged, so copy source to
           destination. memmove keeps this well-defined when Src1 == Dest. */
        if (C == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                /* MMX routine processes the leading whole groups of 8 bytes */
                D = SWAP_32(C);
                SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);

                /* Only the trailing (length % 8) bytes remain for the C loop */
                istart = length & 0xfffffff8;
        } else {
                /* Process the whole image in C */
                istart = 0;
        }

        /* Split C into its four bytes, least significant first */
        iC[3] = (int) ((C >> 24) & 0xff);
        iC[2] = (int) ((C >> 16) & 0xff);
        iC[1] = (int) ((C >>  8) & 0xff);
        iC[0] = (int) ((C >>  0) & 0xff);

        /* istart is always a multiple of 4, so (i & 3) selects the same byte
           of C that a 4-byte-group walk starting at istart would select. */
        for (i = istart; i < length; i++) {
                result = (int) Src1[i] - iC[i & 3];
                if (result < 0)
                        result = 0;
                Dest[i] = (unsigned char) result;
        }
        return (0);
}
02387 
/*
 * Internal MMX kernel: shifts every byte of Src1 right by N bits and stores
 * the result in Dest, 8 bytes per iteration.
 *
 * The shift is done on 16-bit words, so bits from the high byte of each word
 * leak into the low byte; a bit-mask built from Mask (shifted/ANDed N times)
 * is applied afterwards to clear those bits.
 *
 * Src1/Dest  source and destination byte arrays, accessed in 8-byte units.
 * SrcLength  number of bytes; only SrcLength/8 whole groups are processed,
 *            the trailing SrcLength%8 bytes are left to the caller.
 * N          shift count in bits.
 * Mask       pointer to 8 mask bytes (read as one 64-bit value).
 *
 * Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
 *
 * The assembly loops test their counter only after the first pass, so that
 * path needs N >= 1 and SrcLength >= 8; the visible caller
 * SDL_imageFilterShiftRight guarantees both.
 */
02399 static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
02400                                                                  unsigned char *Mask)
02401 {
02402 #ifdef USE_MMX
      /* NOTE(review): GCC__ is not defined in this part of the file; presumably it is set for GCC builds elsewhere - confirm. */
02403 #if !defined(GCC__)
              /* Inline-assembly path using 32-bit registers and pusha/popa. */
02404         __asm
02405         {
02406                 pusha
02407                         mov edx, Mask           /* load Mask address into edx */
02408                         movq mm0, [edx]         /* load Mask into mm0 */
02409                 xor ecx, ecx    /* zero ECX */
02410                         mov cl,  N      /* load loop counter (N) into CL */
02411                         movd mm3,  ecx  /* copy (N) into MM3  */
02412                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
02413 L10240:                         /* ** Prepare proper bit-Mask in MM1 ** */
02414                 psrlw mm1,  1   /* shift 4 WORDS of MM1 1 bit to the right */
02415                         pand mm1, mm0   // apply Mask to 8 BYTES of MM1 */
02416                         /*  byte     0x0f, 0xdb, 0xc8 */
02417                         dec               cl            /* decrease loop counter */
02418                         jnz            L10240           /* check loop termination, proceed if required */
02419                         /* ** Shift all bytes of the image ** */
02420                         mov eax, Src1           /* load Src1 address into eax */
02421                         mov edi, Dest           /* load Dest address into edi */
02422                         mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
02423                         shr ecx,  3     /* counter/8 (MMX loads 8 bytes at a time) */
02424                         align 16                        /* 16 byte alignment of the loop entry */
02425 L10241:
02426                 movq mm0, [eax]         /* load 8 bytes from SrcDest into MM0 */
02427                 psrlw mm0, mm3          /* shift 4 WORDS of MM0 (N) bits to the right */
02428                         pand mm0, mm1    // apply proper bit-Mask to 8 BYTES of MM0 */
02429                         /* byte     0x0f, 0xdb, 0xc1 */
02430                         movq [edi], mm0         /* store result in SrcDest */
02431                         add eax, 8      /* increase Src1 register pointer by 8 */
02432                         add edi, 8      /* increase Dest register pointer by 8 */
02433                         dec              ecx            /* decrease loop counter */
02434                         jnz            L10241           /* check loop termination, proceed if required */
02435                         emms                            /* exit MMX state */
02436                         popa
02437         }
02438 #else
02439         /* i386 and x86_64 */
02440         __m64 *mSrc1 = (__m64*)Src1;
02441         __m64 *mDest = (__m64*)Dest;
02442         __m64 *mMask = (__m64*)Mask;
02443         __m64 mm1;
02444         int i;
              /* NOTE(review): mm1 is read before being assigned; comparing a value with itself is used only to obtain all-ones. */
02445         mm1 = _m_pcmpeqb(mm1, mm1);                     /* generate all 1's in mm1 */
02446         /* Prepare proper bit-Mask in MM1 */
02447         for (i = 0; i < N; i++) {
02448                 mm1 = _m_psrlwi(mm1, 1);                /* shift 4 WORDS of MM1 1 bit to the right */
02449                 mm1 = _m_pand(mm1, *mMask);             /* apply Mask to 8 BYTES of MM1 */
02450         }
02451         /* Shift all bytes of the image */
02452         for (i = 0; i < SrcLength/8; i++) {
02453                 __m64 mm0 = _m_psrlwi(*mSrc1, N);       /* shift 4 WORDS of MM0 (N) bits to the right */
02454                 *mDest = _m_pand(mm0, mm1);             /* apply proper bit-Mask to 8 BYTES of MM0 */
02455                 mSrc1++;
02456                 mDest++;
02457         }
02458         _m_empty();                                     /* clean MMX state */
02459 #endif
02460         return (0);
02461 #else
              /* Built without USE_MMX: report failure without touching the buffers. */
02462         return (-1);
02463 #endif
02464 }
02465 
/*!
\brief Filter using ShiftRight: D[i] = S[i] >> N, applied to every byte.

When MMX is available and length > 7, the leading whole groups of 8 bytes are
handed to SDL_imageFilterShiftRightMMX and only the remaining bytes use the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (0 to 8).

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        /* After each 1-bit word shift the MMX routine ANDs with this mask,
           clearing the top bit of every byte. */
        static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
        unsigned int i, istart;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Check shift */
        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 leaves every byte unchanged, so copy source to
           destination. memmove keeps this well-defined when Src1 == Dest. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                /* MMX routine processes the leading whole groups of 8 bytes */
                SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);

                /* Only the trailing (length % 8) bytes remain for the C loop */
                istart = length & 0xfffffff8;
        } else {
                /* Process the whole image in C */
                istart = 0;
        }

        /* C routine to process image */
        for (i = istart; i < length; i++) {
                Dest[i] = (unsigned char) (Src1[i] >> N);
        }

        return (0);
}
02532 
/*
 * Internal MMX kernel: shifts the source right by N bits in 32-bit units
 * (two doublewords per 8-byte group) and stores the result in Dest.
 *
 * Src1/Dest  source and destination byte arrays, accessed in 8-byte units.
 * SrcLength  number of bytes; only SrcLength/8 whole groups are processed,
 *            the trailing SrcLength%8 bytes are left to the caller.
 * N          shift count in bits.
 *
 * Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
 *
 * The assembly loop tests its counter only after the first pass, so that path
 * needs SrcLength >= 8; the visible caller only invokes it for length > 7.
 */
02543 static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
02544 {
02545 #ifdef USE_MMX
02546 #if !defined(GCC__)
              /* Inline-assembly path using 32-bit registers and pusha/popa. */
02547         __asm
02548         {
02549                 pusha
02550                         mov eax, Src1           /* load Src1 address into eax */
02551                         mov edi, Dest           /* load Dest address into edi */
02552                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
02553                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
02554                         align 16                        /* 16 byte alignment of the loop entry */
02555 L13023:
02556                 movq mm0, [eax]         /* load 8 bytes from SrcDest into MM0 */
                      /* shift the 2 DWORDS of MM0 (N) bits to the right */
02557                 psrld mm0, N
02558                         movq [edi], mm0         /* store result in SrcDest */
02559                         add eax, 8      /* increase Src1 register pointer by 8 */
02560                         add edi, 8      /* increase Dest register pointer by 8 */
02561                         dec              ecx            /* decrease loop counter */
02562                         jnz             L13023          /* check loop termination, proceed if required */
02563                         emms                            /* exit MMX state */
02564                         popa
02565         }
02566 #else
02567         /* i386 and x86_64 */
02568         __m64 *mSrc1 = (__m64*)Src1;
02569         __m64 *mDest = (__m64*)Dest;
02570         int i;
              /* One iteration per whole 8-byte group: shift both 32-bit halves right by N. */
02571         for (i = 0; i < SrcLength/8; i++) {
02572                 *mDest = _m_psrldi(*mSrc1, N);
02573                 mSrc1++;
02574                 mDest++;
02575         }
02576         _m_empty();                                     /* clean MMX state */
02577 #endif
02578         return (0);
02579 #else
              /* Built without USE_MMX: report failure without touching the buffers. */
02580         return (-1);
02581 #endif
02582 }
02583 
/*!
\brief Filter using ShiftRightUint: D = (uint)S >> N, applied to 32-bit words.

The buffers are treated as arrays of native-endian 32-bit unsigned integers.
When MMX is available and length > 7, the leading whole groups of 8 bytes are
handed to SDL_imageFilterShiftRightUintMMX; the C loop handles what remains.
Bytes beyond the last complete 4-byte word are not written to Dest.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (0 to 32).

\return Returns 0 for success or -1 for error (NULL pointer or N > 32).
*/
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        unsigned int i, istart;
        unsigned int word;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        if (N > 32) {
                return (-1);
        }

        /* Special case: N==0 leaves the data unchanged, so copy source to
           destination. memmove keeps this well-defined when Src1 == Dest. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                /* MMX routine processes the leading whole groups of 8 bytes */
                SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);

                /* Only the trailing (length % 8) bytes remain for the C loop */
                istart = length & 0xfffffff8;
        } else {
                /* Process the whole image in C */
                istart = 0;
        }

        /* C routine: walk the complete 4-byte words that remain.
           istart <= length always holds, so (length - i) cannot wrap, and the
           test includes the final word that ends exactly at 'length'. */
        for (i = istart; (length - i) >= 4; i += 4) {
                /* Load/store through memcpy so unaligned buffers are safe
                   (assumes a 32-bit unsigned int, as the 4-byte stride does). */
                word = 0;
                memcpy(&word, &Src1[i], 4);
                /* Shifting a 32-bit value by 32 is undefined in C; every bit
                   is shifted out, so the result is simply 0. */
                word = (N < 32) ? (word >> N) : 0;
                memcpy(&Dest[i], &word, 4);
        }

        return (0);
}
02653 
/*
 * Internal MMX kernel: multiplies every byte of Src1 by the constant C and
 * stores the result in Dest, packing back to bytes with unsigned saturation.
 *
 * Each 8-byte group is unpacked into two sets of four 16-bit words, multiplied
 * by C (replicated into four words), and packed again. Two loops exist: a
 * shorter one selected for C <= 128, and one that first takes the absolute
 * value of the (signed) 16-bit products before packing.
 *
 * Src1/Dest  source and destination byte arrays, accessed in 8-byte units.
 * SrcLength  number of bytes; only SrcLength/8 whole groups are processed,
 *            the trailing SrcLength%8 bytes are left to the caller.
 * C          multiplier.
 *
 * Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
 *
 * The assembly loops test their counter only after the first pass, so that
 * path needs SrcLength >= 8; the visible caller only invokes it for length > 7.
 */
02664 static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
02665 {
02666 #ifdef USE_MMX
02667 #if !defined(GCC__)
              /* Inline-assembly path using 32-bit registers and pusha/popa. */
02668         __asm
02669         {
02670                 pusha
02671                         /* ** Duplicate C in 4 words of MM1 ** */
02672                         mov al, C       /* load C into AL */
02673                         xor ah, ah      /* zero AH */
02674                         mov bx, ax      /* copy AX into BX */
02675                         shl eax, 16     /* shift 2 bytes of EAX left */
02676                         mov ax, bx      /* copy BX into AX */
02677                         movd mm1, eax           /* copy EAX into MM1 */
02678                         movd mm2, eax           /* copy EAX into MM2 */
02679                         punpckldq mm1, mm2      /* fill higher words of MM1 with C */
02680                         pxor mm0, mm0           /* zero MM0 register */
02681                         mov eax, Src1           /* load Src1 address into eax */
02682                         mov edi, Dest           /* load Dest address into edi */
02683                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
02684                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
                              /* NOTE(review): EAX was reloaded with Src1 above, so AL here is the low byte of the
                                 source address rather than C - the loop selection looks unintended; confirm. */
02685                         cmp al, 128     /* if (C <= 128) execute more efficient code */
02686                         jg             L10251
02687                         align 16                        /* 16 byte alignment of the loop entry */
02688 L10250:
02689                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
02690                 movq mm4, mm3           /* copy MM3 into MM4  */
02691                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
02692                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
02693                         pmullw mm3, mm1         /* mul low  bytes of SrcDest and MM1 */
02694                         pmullw mm4, mm1         /* mul high bytes of SrcDest and MM1 */
02695                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
02696                         movq [edi], mm3         /* store result in Dest */
02697                         add eax, 8      /* increase Src1 register pointer by 8 */
02698                         add edi, 8      /* increase Dest register pointer by 8 */
02699                         dec              ecx            /* decrease loop counter */
02700                         jnz            L10250           /* check loop termination, proceed if required */
02701                         jmp            L10252
02702                         align 16                        /* 16 byte alignment of the loop entry */
02703 L10251:
02704                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
02705                 movq mm4, mm3           /* copy MM3 into MM4  */
02706                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
02707                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
02708                         pmullw mm3, mm1         /* mul low  bytes of SrcDest and MM1 */
02709                         pmullw mm4, mm1         /* mul high bytes of SrcDest and MM1 */
02710                         /* ** Take abs value of the results (signed words) ** */
02711                         movq mm5, mm3           /* copy mm3 into mm5 */
02712                         movq mm6, mm4           /* copy mm4 into mm6 */
02713                         psraw mm5, 15           /* fill mm5 words with word sign bit */
02714                         psraw mm6, 15           /* fill mm6 words with word sign bit */
02715                         pxor mm3, mm5           /* take 1's compliment of only neg words */
02716                         pxor mm4, mm6           /* take 1's compliment of only neg words */
02717                         psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
02718                         psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
02719                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
02720                         movq [edi], mm3         /* store result in Dest */
02721                         add eax, 8      /* increase Src1 register pointer by 8 */
02722                         add edi, 8      /* increase Dest register pointer by 8 */
02723                         dec              ecx            /* decrease loop counter */
02724                         jnz            L10251           /* check loop termination, proceed if required */
02725 L10252:
02726                 emms                            /* exit MMX state */
02727                         popa
02728         }
02729 #else
02730         /* i386 and x86_64 */
02731         __m64 *mSrc1 = (__m64*)Src1;
02732         __m64 *mDest = (__m64*)Dest;
02733         __m64 mm0 = _m_from_int(0);                             /* zero mm0 register */
02734         /* Duplicate C in 4 words of MM1 */
02735         int i;
02736         i = C | C<<16;
02737         __m64 mm1 = _m_from_int(i);
02738         __m64 mm2 = _m_from_int(i);
02739         mm1 = _m_punpckldq(mm1, mm2);                           /* fill higher words of MM1 with C */
02740         // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48;
02741         //__m64 mm1 = _m_from_int64(lli); // x86_64 only
              /* 255*128 still fits a positive signed 16-bit word, so no sign correction is needed in this branch. */
02742         if (C <= 128) {                                         /* if (C <= 128) execute more efficient code */
02743                 for (i = 0; i < SrcLength/8; i++) {
02744                         __m64 mm3, mm4;
02745                         mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
02746                         mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
02747                         mm3 = _m_pmullw(mm3, mm1);              /* mul low  bytes of Src1 and MM1 */
02748                         mm4 = _m_pmullw(mm4, mm1);              /* mul high bytes of Src1 and MM1 */
02749                         *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
02750                         mSrc1++;
02751                         mDest++;
02752                 }
02753         } else {
02754                 for (i = 0; i < SrcLength/8; i++) {
02755                         __m64 mm3, mm4, mm5, mm6;
02756                         mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
02757                         mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
02758                         mm3 = _m_pmullw(mm3, mm1);              /* mul low  bytes of Src1 and MM1 */
02759                         mm4 = _m_pmullw(mm4, mm1);              /* mul high bytes of Src1 and MM1 */
02760                         /* Take abs value of the results (signed words) */
02761                         mm5 = _m_psrawi(mm3, 15);               /* fill mm5 words with word sign bit */
02762                         mm6 = _m_psrawi(mm4, 15);               /* fill mm6 words with word sign bit */
02763                         mm3 = _m_pxor(mm3, mm5);                /* take 1's compliment of only neg. words */
02764                         mm4 = _m_pxor(mm4, mm6);                /* take 1's compliment of only neg. words */
02765                         mm3 = _m_psubsw(mm3, mm5);              /* add 1 to only neg. words, W-(-1) or W-0 */
02766                         mm4 = _m_psubsw(mm4, mm6);              /* add 1 to only neg. words, W-(-1) or W-0 */
02767                         *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
02768                         mSrc1++;
02769                         mDest++;
02770                 }
02771         }
02772         _m_empty();                                             /* clean MMX state */
02773 #endif
02774         return (0);
02775 #else
              /* Built without USE_MMX: report failure without touching the buffers. */
02776         return (-1);
02777 #endif
02778 }
02779 
/*!
\brief Filter using MultByByte: D[i] = saturation255(S[i] * C).

When MMX is available and length > 7, the leading whole groups of 8 bytes are
handed to SDL_imageFilterMultByByteMMX and only the remaining bytes use the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
        unsigned int i, istart;
        int iC;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Special case: C==1 leaves every byte unchanged, so copy source to
           destination. memmove keeps this well-defined when Src1 == Dest. */
        if (C == 1) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                /* MMX routine processes the leading whole groups of 8 bytes */
                SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);

                /* Only the trailing (length % 8) bytes remain for the C loop */
                istart = length & 0xfffffff8;
        } else {
                /* Process the whole image in C */
                istart = 0;
        }

        /* C routine to process image */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) Src1[i] * iC;
                if (result > 255)
                        result = 255;
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
02845 
/*
 * Internal MMX kernel: shifts every byte of Src1 right by N bits, multiplies
 * the result by C, and stores it in Dest packed back to bytes with unsigned
 * saturation.
 *
 * Each 8-byte group is unpacked into two sets of four 16-bit words before the
 * shift, so no bits cross byte boundaries and no mask is needed.
 *
 * Src1/Dest  source and destination byte arrays, accessed in 8-byte units.
 * SrcLength  number of bytes; only SrcLength/8 whole groups are processed,
 *            the trailing SrcLength%8 bytes are left to the caller.
 * N          shift count in bits.
 * C          multiplier.
 *
 * Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
 *
 * The assembly loop tests its counter only after the first pass, so that path
 * needs SrcLength >= 8; the visible caller only invokes it for length > 7.
 *
 * NOTE(review): unlike SDL_imageFilterMultByByteMMX there is no sign
 * correction before packing; products that do not fit a positive signed
 * 16-bit word (possible for small N and large C) may not saturate to 255 -
 * confirm against the C fallback.
 */
02857 static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
02858                                                                                           unsigned char C)
02859 {
02860 #ifdef USE_MMX
02861 #if !defined(GCC__)
              /* Inline-assembly path using 32-bit registers and pusha/popa. */
02862         __asm
02863         {
02864                 pusha
02865                         /* ** Duplicate C in 4 words of MM1 ** */
02866                         mov al, C       /* load C into AL */
02867                         xor ah, ah      /* zero AH */
02868                         mov bx, ax      /* copy AX into BX */
02869                         shl eax, 16     /* shift 2 bytes of EAX left */
02870                         mov ax, bx      /* copy BX into AX */
02871                         movd mm1, eax           /* copy EAX into MM1 */
02872                         movd mm2, eax           /* copy EAX into MM2 */
02873                         punpckldq mm1, mm2      /* fill higher words of MM1 with C */
02874                         xor ecx, ecx    /* zero ECX */
02875                         mov cl, N       /* load N into CL */
02876                         movd mm7, ecx           /* copy N into MM7 */
02877                         pxor mm0, mm0           /* zero MM0 register */
02878                         mov eax, Src1           /* load Src1 address into eax */
02879                         mov edi, Dest           /* load Dest address into edi */
02880                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
02881                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
02882                         align 16                        /* 16 byte alignment of the loop entry */
02883 L1026:
02884                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
02885                 movq mm4, mm3           /* copy MM3 into MM4  */
02886                         punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
02887                         punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
02888                         psrlw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the right */
02889                         psrlw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the right */
02890                         pmullw mm3, mm1         /* mul low  bytes of SrcDest by MM1 */
02891                         pmullw mm4, mm1         /* mul high bytes of SrcDest by MM1 */
02892                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
02893                         movq [edi], mm3         /* store result in Dest */
02894                         add eax, 8      /* increase Src1 register pointer by 8 */
02895                         add edi, 8      /* increase Dest register pointer by 8 */
02896                         dec              ecx            /* decrease loop counter */
02897                         jnz             L1026           /* check loop termination, proceed if required */
02898                         emms                            /* exit MMX state */
02899                         popa
02900         }
02901 #else
02902         /* i386 and x86_64 */
02903         __m64 *mSrc1 = (__m64*)Src1;
02904         __m64 *mDest = (__m64*)Dest;
02905         __m64 mm0 = _m_from_int(0);                     /* zero mm0 register */
02906         /* Duplicate C in 4 words of MM1 */
02907         int i;
02908         i = (C<<16)|C;
02909         __m64 mm1 = _m_from_int(i);
02910         __m64 mm2 = _m_from_int(i);
02911         mm1 = _m_punpckldq(mm1, mm2);                   /* fill higher words of MM1 with C */
02912         for (i = 0; i < SrcLength/8; i++) {
                      /* mm5 and mm6 are declared but never used in this loop. */
02913                 __m64 mm3, mm4, mm5, mm6;
02914                 mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
02915                 mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
02916                 mm3 = _m_psrlwi(mm3, N);                /* shift 4 WORDS of MM3 (N) bits to the right */
02917                 mm4 = _m_psrlwi(mm4, N);                /* shift 4 WORDS of MM4 (N) bits to the right */
02918                 mm3 = _m_pmullw(mm3, mm1);              /* mul low  bytes of Src1 and MM1 */
02919                 mm4 = _m_pmullw(mm4, mm1);              /* mul high bytes of Src1 and MM1 */
02920                 *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
02921                 mSrc1++;
02922                 mDest++;
02923         }
02924         _m_empty();                                     /* clean MMX state */
02925 #endif
02926         return (0);
02927 #else
              /* Built without USE_MMX: report failure without touching the buffers. */
02928         return (-1);
02929 #endif
02930 }
02931 
/*!
\brief Filter using ShiftRightAndMultByByte: D[i] = saturation255((S[i] >> N) * C).

When MMX is available and length > 7, the leading whole groups of 8 bytes are
handed to SDL_imageFilterShiftRightAndMultByByteMMX and only the remaining
bytes use the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (0 to 8).
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
                                           unsigned char C)
{
        unsigned int i, istart;
        int iC;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Check shift */
        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 && C==1 leaves every byte unchanged, so copy source
           to destination. memmove keeps this well-defined when Src1 == Dest. */
        if ((N == 0) && (C == 1)) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                /* MMX routine processes the leading whole groups of 8 bytes */
                SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);

                /* Only the trailing (length % 8) bytes remain for the C loop */
                istart = length & 0xfffffff8;
        } else {
                /* Process the whole image in C */
                istart = 0;
        }

        /* C routine to process image */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) (Src1[i] >> N) * iC;
                if (result > 255)
                        result = 255;
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
03004 
/*
 * Internal MMX kernel: shifts every byte of Src1 left by N bits and stores
 * the result in Dest, 8 bytes per iteration.
 *
 * The shift is done on 16-bit words, so bits from the low byte of each word
 * leak into the high byte; a bit-mask built from Mask (shifted/ANDed N times)
 * is applied afterwards to clear those bits.
 *
 * Src1/Dest  source and destination byte arrays, accessed in 8-byte units.
 * SrcLength  number of bytes; only SrcLength/8 whole groups are processed,
 *            the trailing SrcLength%8 bytes are left to the caller.
 * N          shift count in bits.
 * Mask       pointer to 8 mask bytes (read as one 64-bit value), supplied by
 *            the caller.
 *
 * Returns 0 when built with USE_MMX, -1 otherwise (nothing is processed).
 *
 * The assembly loops test their counter only after the first pass, so that
 * path needs N >= 1 and SrcLength >= 8 - presumably guaranteed by the caller,
 * which is not fully visible here; confirm.
 */
03016 static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
03017                                                                         unsigned char *Mask)
03018 {
03019 #ifdef USE_MMX
03020 #if !defined(GCC__)
              /* Inline-assembly path using 32-bit registers and pusha/popa. */
03021         __asm
03022         {
03023                 pusha
03024                         mov edx, Mask           /* load Mask address into edx */
03025                         movq mm0, [edx]         /* load Mask into mm0 */
03026                 xor ecx, ecx    /* zero ECX */
03027                         mov cl, N       /* load loop counter (N) into CL */
03028                         movd mm3, ecx           /* copy (N) into MM3  */
03029                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
03030 L10270:                         /* ** Prepare proper bit-Mask in MM1 ** */
03031                 psllw mm1, 1    /* shift 4 WORDS of MM1 1 bit to the left */
03032                         pand mm1, mm0        // apply Mask to 8 BYTES of MM1 */
03033                         /*  byte     0x0f, 0xdb, 0xc8 */
03034                         dec cl                          /* decrease loop counter */
03035                         jnz            L10270           /* check loop termination, proceed if required */
03036                         /* ** Shift all bytes of the image ** */
03037                         mov eax, Src1           /* load Src1 address into eax */
03038                         mov edi, Dest           /* load SrcDest address into edi */
03039                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03040                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03041                         align 16                        /* 16 byte alignment of the loop entry */
03042 L10271:
03043                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
03044                 psllw mm0, mm3          /* shift 4 WORDS of MM0 (N) bits to the left */
03045                         pand mm0, mm1    // apply proper bit-Mask to 8 BYTES of MM0 */
03046                         /* byte     0x0f, 0xdb, 0xc1 */
03047                         movq [edi], mm0         /* store result in Dest */
03048                         add eax, 8      /* increase Src1 register pointer by 8 */
03049                         add edi, 8      /* increase Dest register pointer by 8 */
03050                         dec              ecx            /* decrease loop counter */
03051                         jnz            L10271           /* check loop termination, proceed if required */
03052                         emms                            /* exit MMX state */
03053                         popa
03054         }
03055 #else
03056         /* i386 and x86_64 */
03057         __m64 *mSrc1 = (__m64*)Src1;
03058         __m64 *mDest = (__m64*)Dest;
03059         __m64 *mMask = (__m64*)Mask;
03060         __m64 mm1;
03061         int i;
              /* NOTE(review): mm1 is read before being assigned; comparing a value with itself is used only to obtain all-ones. */
03062         mm1 = _m_pcmpeqb(mm1, mm1);                     /* generate all 1's in mm1 */
03063         /* Prepare proper bit-Mask in MM1 */
03064         for (i = 0; i < N; i++) {
03065                 mm1 = _m_psllwi(mm1, 1);                /* shift 4 WORDS of MM1 1 bit to the left */
03066                 mm1 = _m_pand(mm1, *mMask);             /* apply Mask to 8 BYTES of MM1 */
03067         }
03068         /* ** Shift all bytes of the image ** */
03069         for (i = 0; i < SrcLength/8; i++) {
03070                 __m64 mm0 = _m_psllwi(*mSrc1, N);       /* shift 4 WORDS of MM0 (N) bits to the left */
03071                 *mDest = _m_pand(mm0, mm1);             /* apply proper bit-Mask to 8 BYTES of MM0 */
03072                 mSrc1++;
03073                 mDest++;
03074         }
03075         _m_empty();                                     /* clean MMX state */
03076 #endif
03077         return (0);
03078 #else
              /* Built without USE_MMX: report failure without touching the buffers. */
03079         return (-1);
03080 #endif
03081 }
03082 
/*!
\brief Filter using ShiftLeftByte: D = (S << N), every byte shifted on its own.

Bits shifted out of a byte are discarded: the result is truncated to 8 bits,
it is not saturated. When MMX is detected and length > 7, the data is handed
to SDL_imageFilterShiftLeftByteMMX() and only the trailing (length & 7) bytes
are processed by the C loop below.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	/* Per-byte mask handed to the MMX routine (bit 0 of every byte cleared) */
	static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination.
	   memmove is used so that an in-place call (Src1 == Dest) is well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		/* Keep only the low 8 bits of the shifted value */
		result = ((int) *cursrc1 << N) & 0xff;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
03148 
/*!
\brief Internal MMX Filter using ShiftLeftUint: D = (S << N), on 32-bit words.

Processes SrcLength/8 groups of 8 bytes; each group is treated as two 32-bit
words that are shifted left by N bits. Trailing bytes (SrcLength & 7) are not
touched. The inline-assembly loop has no check for a zero group count, so the
caller must pass SrcLength >= 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 for success or -1 when built without USE_MMX.
*/
static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* PSLLD takes its count from an imm8 or a 64-bit mm/m64 operand, so the
		   8-bit variable N is first zero-extended into an MMX register. */
		xor edx, edx		/* zero EDX */
		mov dl, N		/* load N into DL */
		movd mm1, edx		/* copy N into MM1 (upper bits are zero) */
		mov eax, Src1		/* load Src1 address into eax */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L12023:
		movq mm0, [eax]		/* load 8 bytes from Src1 into MM0 */
		pslld mm0, mm1		/* shift 2 DWORDS of MM0 (N) bits to the left */
		movq [edi], mm0		/* store result in Dest */
		add eax, 8		/* increase Src1 register pointer by 8 */
		add edi, 8		/* increase Dest register pointer by 8 */
		dec ecx			/* decrease loop counter */
		jnz L12023		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	unsigned int i;
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_pslldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the left */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
03199 
/*!
\brief Filter using ShiftLeftUint: D = ((unsigned int)S << N), on whole words.

The byte arrays are interpreted as a sequence of unsigned int words in native
byte order and every complete word is shifted left by N bits. Bytes that do
not make up a complete word at the end of the array are left unmodified in
Dest. When MMX is detected and length > 7, the data is handed to
SDL_imageFilterShiftLeftUintMMX() and only the trailing (length & 7) bytes
are considered by the C loop below.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error (NULL pointer or N > 32).
*/
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	unsigned int word;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 32) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination.
	   memmove is used so that an in-place call (Src1 == Dest) is well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image: one complete word per iteration.
	   "length - i" cannot wrap because i never exceeds length, and the test
	   also accepts the last word when it ends exactly at the array end. */
	for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
		/* memcpy keeps the load/store valid for any alignment of the byte arrays */
		memcpy(&word, cursrc1, sizeof(word));
		/* A C shift by the full word width is undefined; PSLLD yields 0 there */
		word = (N < 32) ? (word << N) : 0;
		memcpy(curdest, &word, sizeof(word));
		/* Advance pointers */
		cursrc1 += sizeof(word);
		curdest += sizeof(word);
	}

	return (0);
}
03269 
/*!
\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)

Processes SrcLength/8 groups of 8 bytes. Every byte is widened to a 16-bit
word, shifted left by N bits and packed back to a byte with unsigned
saturation. For N > 7 the shifted word can have its sign bit set, and the
pack instruction interprets its input as signed words, so the absolute value
is taken first. Trailing bytes (SrcLength & 7) are not touched. The
inline-assembly loops have no check for a zero group count, so the caller
must pass SrcLength >= 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 for success or -1 when built without USE_MMX.
*/
static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* N is kept in EDX: EAX is reused for the Src1 address below, so it
		   cannot hold N for the later comparison. */
		xor edx, edx		/* zero EDX */
		mov dl, N		/* load N into DL */
		movd mm7, edx		/* copy N into MM7 */
		pxor mm0, mm0		/* zero MM0 register */
		mov eax, Src1		/* load Src1 address into eax */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		cmp dl, 7		/* if (N <= 7) execute more efficient code */
		ja L10281		/* unsigned compare: N is an unsigned char */
		align 16		/* 16 byte alignment of the loop entry */
L10280:
		movq mm3, [eax]		/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3		/* copy MM3 into MM4  */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src1 into words */
		psllw mm3, mm7		/* shift 4 WORDS of MM3 (N) bits to the left */
		psllw mm4, mm7		/* shift 4 WORDS of MM4 (N) bits to the left */
		packuswb mm3, mm4	/* pack words back into bytes with saturation */
		movq [edi], mm3		/* store result in Dest */
		add eax, 8		/* increase Src1 register pointer by 8 */
		add edi, 8		/* increase Dest register pointer by 8 */
		dec ecx			/* decrease loop counter */
		jnz L10280		/* check loop termination, proceed if required */
		jmp L10282
		align 16		/* 16 byte alignment of the loop entry */
L10281:
		movq mm3, [eax]		/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3		/* copy MM3 into MM4  */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src1 into words */
		psllw mm3, mm7		/* shift 4 WORDS of MM3 (N) bits to the left */
		psllw mm4, mm7		/* shift 4 WORDS of MM4 (N) bits to the left */
		/* ** Take abs value of the signed words ** */
		movq mm5, mm3		/* copy mm3 into mm5 */
		movq mm6, mm4		/* copy mm4 into mm6 */
		psraw mm5, 15		/* fill mm5 words with word sign bit */
		psraw mm6, 15		/* fill mm6 words with word sign bit */
		pxor mm3, mm5		/* take 1's complement of only neg. words */
		pxor mm4, mm6		/* take 1's complement of only neg. words */
		psubsw mm3, mm5		/* add 1 to only neg. words, W-(-1) or W-0 */
		psubsw mm4, mm6		/* add 1 to only neg. words, W-(-1) or W-0 */
		packuswb mm3, mm4	/* pack words back into bytes with saturation */
		movq [edi], mm3		/* store result in Dest */
		add eax, 8		/* increase Src1 register pointer by 8 */
		add edi, 8		/* increase Dest register pointer by 8 */
		dec ecx			/* decrease loop counter */
		jnz L10281		/* check loop termination, proceed if required */
L10282:
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
	unsigned int i;
	if (N <= 7) {						/* if (N <= 7) execute more efficient code */
		for (i = 0; i < SrcLength/8; i++) {
			__m64 mm3, mm4;
			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
			mSrc1++;
			mDest++;
		}
	} else {
		for (i = 0; i < SrcLength/8; i++) {
			__m64 mm3, mm4, mm5, mm6;
			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
			/* Take abs value of the signed words */
			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
			mSrc1++;
			mDest++;
		}
	}
	_m_empty();						/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
03382 
/*!
\brief Filter ShiftLeft: D = saturation255(S << N)

Every byte is shifted left by N bits; results larger than 255 are clamped to
255. When MMX is detected and length > 7, the data is handed to
SDL_imageFilterShiftLeftMMX() and only the trailing (length & 7) bytes are
processed by the C loop below.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination.
	   memmove is used so that an in-place call (Src1 == Dest) is well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		result = (int) *cursrc1 << N;
		/* Saturate instead of wrapping */
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
03449 
/*!
\brief Internal MMX Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0

Processes SrcLength/8 groups of 8 bytes. (0xFF - T) is added to every byte
with unsigned saturation; the sum reaches 0xFF exactly when the byte is >= T,
so comparing the sum with 0xFF produces 255 or 0. Trailing bytes
(SrcLength & 7) are not touched. The inline-assembly loop has no check for a
zero group count, so the caller must pass SrcLength >= 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success or -1 when built without USE_MMX.
*/
static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* ** Duplicate T in 8 bytes of MM3 ** */
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		pcmpeqb mm2, mm2	/* generate all 1's in mm2 */
		mov al, T		/* load T into AL */
		mov ah, al		/* copy AL into AH */
		mov bx, ax		/* copy AX into BX */
		shl eax, 16		/* shift 2 bytes of EAX left */
		mov ax, bx		/* copy BX into AX */
		movd mm3, eax		/* copy EAX into MM3 */
		movd mm4, eax		/* copy EAX into MM4 */
		punpckldq mm3, mm4	/* fill higher bytes of MM3 with T */
		psubusb mm2, mm3	/* store 0xFF - T in MM2 */
		mov eax, Src1		/* load Src1 address into eax */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L1029:
		movq mm0, [eax]		/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm2	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
		pcmpeqb mm0, mm1	/* binarize 255:0, comparing to 255 */
		movq [edi], mm0		/* store result in Dest */
		add eax, 8		/* increase Src1 register pointer by 8 */
		add edi, 8		/* increase Dest register pointer by 8 */
		dec ecx			/* decrease loop counter */
		jnz L1029		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* Comparing an initialised value with itself yields all 1's without
	   reading an indeterminate variable. */
	__m64 zero = _m_from_int(0);
	__m64 mm1 = _m_pcmpeqb(zero, zero);		/* generate all 1's in mm1 */
	__m64 mm2 = mm1;				/* all 1's in mm2 as well */
	__m64 mm3, mm4;
	int packedT;
	unsigned int i;
	/* Duplicate T in 8 bytes of MM3 */
	memset(&packedT, T, sizeof(packedT));		/* T in every byte of an int */
	mm3 = _m_from_int(packedT);
	mm4 = _m_from_int(packedT);
	mm3 = _m_punpckldq(mm3, mm4);			/* fill higher bytes of MM3 with T */
	mm2 = _m_psubusb(mm2, mm3);			/* store 0xFF - T in MM2 */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0 = _m_paddusb(*mSrc1, mm2);	/* Src1+(0xFF-T) (add 8 bytes with saturation) */
		*mDest = _m_pcmpeqb(mm0, mm1);		/* binarize 255:0, comparing to 255 */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
03523 
/*!
\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0

When MMX is detected and length > 7, the data is handed to
SDL_imageFilterBinarizeUsingThresholdMMX() and only the trailing
(length & 7) bytes are processed by the C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
	unsigned int pos = 0;	/* first byte still to be handled by the C loop */

	/* Reject missing buffers; an empty array is trivially done */
	if (Src1 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Every byte is >= 0, so a zero threshold turns the whole output to 255 */
	if (T == 0) {
		memset(Dest, 255, length);
		return 0;
	}

	/* Let the MMX routine take the 8-byte groups when it is available */
	if (SDL_imageFilterMMXdetect() && length > 7) {
		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
		/* Resume after the last complete group; equals length when nothing is left */
		pos = length & 0xfffffff8;
	}

	/* Plain C for whatever remains (the whole array without MMX) */
	while (pos < length) {
		if (Src1[pos] >= T) {
			Dest[pos] = 255;
		} else {
			Dest[pos] = 0;
		}
		pos++;
	}

	return 0;
}
03583 
/*!
\brief Internal MMX Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Processes SrcLength/8 groups of 8 bytes using three saturating byte
operations: adding (0xFF - Tmax) caps values at the top, subtracting
(0xFF - Tmax + Tmin) floors them at zero, and adding Tmin moves the result
back into [Tmin, Tmax]. Trailing bytes (SrcLength & 7) are not touched. The
inline-assembly loop has no check for a zero group count, so the caller must
pass SrcLength >= 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success or -1 when built without USE_MMX.
*/
static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
								  unsigned char Tmax)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		/* ** Duplicate Tmax in 8 bytes of MM3 ** */
		mov al, Tmax		/* load Tmax into AL */
		mov ah, al		/* copy AL into AH */
		mov bx, ax		/* copy AX into BX */
		shl eax, 16		/* shift 2 bytes of EAX left */
		mov ax, bx		/* copy BX into AX */
		movd mm3, eax		/* copy EAX into MM3 */
		movd mm4, eax		/* copy EAX into MM4 */
		punpckldq mm3, mm4	/* fill higher bytes of MM3 with Tmax */
		psubusb mm1, mm3	/* store 0xFF - Tmax in MM1 */
		/* ** Duplicate Tmin in 8 bytes of MM5 ** */
		mov al, Tmin		/* load Tmin into AL */
		mov ah, al		/* copy AL into AH */
		mov bx, ax		/* copy AX into BX */
		shl eax, 16		/* shift 2 bytes of EAX left */
		mov ax, bx		/* copy BX into AX */
		movd mm5, eax		/* copy EAX into MM5 */
		movd mm4, eax		/* copy EAX into MM4 */
		punpckldq mm5, mm4	/* fill higher bytes of MM5 with Tmin */
		movq mm7, mm5		/* copy MM5 into MM7 */
		paddusb mm7, mm1	/* store 0xFF - Tmax + Tmin in MM7 */
		mov eax, Src1		/* load Src1 address into eax */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L1030:
		movq mm0, [eax]		/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm1	/* MM0=Src1+(0xFF-Tmax) */
		psubusb mm0, mm7	/* MM0=MM0-(0xFF-Tmax+Tmin) */
		paddusb mm0, mm5	/* MM0=MM0+Tmin */
		movq [edi], mm0		/* store result in Dest */
		add eax, 8		/* increase Src1 register pointer by 8 */
		add edi, 8		/* increase Dest register pointer by 8 */
		dec ecx			/* decrease loop counter */
		jnz L1030		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* Comparing an initialised value with itself yields all 1's without
	   reading an indeterminate variable. */
	__m64 zero = _m_from_int(0);
	__m64 mm1 = _m_pcmpeqb(zero, zero);	/* generate all 1's in mm1 */
	__m64 mm3, mm4, mm5, mm7;
	int packed;
	unsigned int i;
	/* Duplicate Tmax in 8 bytes of MM3 */
	memset(&packed, Tmax, sizeof(packed));	/* Tmax in every byte of an int */
	mm3 = _m_from_int(packed);
	mm4 = _m_from_int(packed);
	mm3 = _m_punpckldq(mm3, mm4);		/* fill higher bytes of MM3 with Tmax */
	mm1 = _m_psubusb(mm1, mm3);		/* store 0xFF - Tmax in MM1 */
	/* Duplicate Tmin in 8 bytes of MM5 */
	memset(&packed, Tmin, sizeof(packed));	/* Tmin in every byte of an int */
	mm5 = _m_from_int(packed);
	mm4 = _m_from_int(packed);
	mm5 = _m_punpckldq(mm5, mm4);		/* fill higher bytes of MM5 with Tmin */
	mm7 = _m_paddusb(mm5, mm1);		/* store 0xFF - Tmax + Tmin in MM7 */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0;
		mm0 = _m_paddusb(*mSrc1, mm1);	/* MM0=Src1+(0xFF-Tmax) */
		mm0 = _m_psubusb(mm0, mm7);	/* MM0=MM0-(0xFF-Tmax+Tmin) */
		*mDest = _m_paddusb(mm0, mm5);	/* MM0+Tmin */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
03679 
/*!
\brief Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Bytes below Tmin become Tmin, bytes above Tmax become Tmax, all others are
copied unchanged. When MMX is detected and length > 7, the data is handed to
SDL_imageFilterClipToRangeMMX() and only the trailing (length & 7) bytes are
processed by the C loop below.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
							   unsigned char Tmax)
{
	unsigned int i, istart;
	unsigned char *cursrc1;
	unsigned char *curdest;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: Tmin==0 && Tmax==255 covers every byte value, so nothing
	   is clipped and the source is simply copied to the destination.
	   memmove is used so that an in-place call (Src1 == Dest) is well defined. */
	if ((Tmin == 0) && (Tmax == 255)) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		if (*cursrc1 < Tmin) {
			*curdest = Tmin;
		} else if (*cursrc1 > Tmax) {
			*curdest = Tmax;
		} else {
			*curdest = *cursrc1;
		}
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
03747 
/**
 * Internal MMX kernel used by SDL_imageFilterNormalizeLinear.
 *
 * Following the inline comments, each byte is transformed as
 *   D = factor * (S - Cmin) + Nmin,  factor = (Nmax - Nmin) / (Cmax - Cmin)
 * and the words are packed back into bytes with unsigned saturation.
 *
 * The factor is computed once, by integer division on the low 16 bits of the
 * two differences; when Cmax - Cmin is zero the factor is set to 255 instead
 * of dividing. Only whole groups of 8 bytes are processed (SrcLength / 8
 * iterations); the remaining SrcLength % 8 bytes are not touched here.
 *
 * Two implementations are selected at compile time: an MSVC-style __asm block
 * when GCC__ is not defined, otherwise a version built on <mmintrin.h>
 * intrinsics.
 *
 * \param Src1      Pointer to the source bytes (S).
 * \param Dest      Pointer to the destination bytes (D).
 * \param SrcLength Number of bytes in the source array. The inline-asm loop
 *                  tests its counter only after the first pass, so it must be
 *                  at least 8 on that path.
 * \param Cmin      Lower bound of the source range.
 * \param Cmax      Upper bound of the source range.
 * \param Nmin      Lower bound of the normalized range.
 * \param Nmax      Upper bound of the normalized range.
 *
 * \return 0 when compiled with USE_MMX, -1 (nothing processed) otherwise.
 */
03761 static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
03762                                                                           int Nmin, int Nmax)
03763 {
03764 #ifdef USE_MMX
03765 #if !defined(GCC__)
/* Inline-assembly variant: all general registers are saved/restored with pusha/popa. */
03766         __asm
03767         {
03768                 pusha
03769                         mov ax, WORD PTR Nmax           /* load Nmax in AX */
03770                         mov bx, WORD PTR Cmax           /* load Cmax in BX */
03771                         sub ax, WORD PTR Nmin           /* AX = Nmax - Nmin */
03772                         sub bx, WORD PTR Cmin           /* BX = Cmax - Cmin */
03773                         jz             L10311           /* check division by zero */
03774                         xor dx, dx      /* prepare for division, zero DX */
03775                         div               bx            /* AX = AX/BX */
03776                         jmp            L10312
03777 L10311:
03778                 mov ax, 255     /* if div by zero, assume result max byte value */
03779 L10312:                         /* ** Duplicate AX in 4 words of MM0 ** */
03780                 mov bx, ax      /* copy AX into BX */
03781                         shl eax, 16     /* shift 2 bytes of EAX left */
03782                         mov ax, bx      /* copy BX into AX */
03783                         movd mm0, eax           /* copy EAX into MM0 */
03784                         movd mm1, eax           /* copy EAX into MM1 */
03785                         punpckldq mm0, mm1      /* fill higher words of MM0 with AX */
03786                         /* ** Duplicate Cmin in 4 words of MM1 ** */
03787                         mov ax, WORD PTR Cmin           /* load Cmin into AX */
03788                         mov bx, ax      /* copy AX into BX */
03789                         shl eax, 16     /* shift 2 bytes of EAX left */
03790                         mov ax, bx      /* copy BX into AX */
03791                         movd mm1, eax           /* copy EAX into MM1 */
03792                         movd mm2, eax           /* copy EAX into MM2 */
03793                         punpckldq mm1, mm2      /* fill higher words of MM1 with Cmin */
03794                         /* ** Duplicate Nmin in 4 words of MM2 ** */
03795                         mov ax, WORD PTR Nmin           /* load Nmin into AX */
03796                         mov bx, ax      /* copy AX into BX */
03797                         shl eax, 16     /* shift 2 bytes of EAX left */
03798                         mov ax, bx      /* copy BX into AX */
03799                         movd mm2, eax           /* copy EAX into MM2 */
03800                         movd mm3, eax           /* copy EAX into MM3 */
03801                         punpckldq mm2, mm3      /* fill higher words of MM2 with Nmin */
03802                         pxor mm7, mm7           /* zero MM7 register */
03803                         mov eax, Src1           /* load Src1 address into eax */
03804                         mov edi, Dest           /* load Dest address into edi */
03805                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03806                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03807                         align 16                        /* 16 byte alignment of the loop entry */
/* The loop body runs before the counter is tested (dec/jnz at the bottom), so ECX must be non-zero here. */
03808 L1031:
03809                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
03810                 movq mm4, mm3           /* copy MM3 into MM4  */
03811                         punpcklbw mm3, mm7      /* unpack low  bytes of SrcDest into words */
03812                         punpckhbw mm4, mm7      /* unpack high bytes of SrcDest into words */
03813                         psubusb mm3, mm1        /* S-Cmin, low  bytes */
03814                         psubusb mm4, mm1        /* S-Cmin, high bytes */
03815                         pmullw mm3, mm0         /* MM0*(S-Cmin), low  bytes */
03816                         pmullw mm4, mm0         /* MM0*(S-Cmin), high bytes */
03817                         paddusb mm3, mm2        /* MM0*(S-Cmin)+Nmin, low  bytes */
03818                         paddusb mm4, mm2        /* MM0*(S-Cmin)+Nmin, high bytes */
03819                         /* ** Take abs value of the signed words ** */
03820                         movq mm5, mm3           /* copy mm3 into mm5 */
03821                         movq mm6, mm4           /* copy mm4 into mm6 */
03822                         psraw mm5, 15           /* fill mm5 words with word sign bit */
03823                         psraw mm6, 15           /* fill mm6 words with word sign bit */
03824                         pxor mm3, mm5           /* take 1's complement of only neg words */
03825                         pxor mm4, mm6           /* take 1's complement of only neg words */
03826                         psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
03827                         psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
03828                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
03829                         movq [edi], mm3         /* store result in Dest */
03830                         add eax, 8      /* increase Src1 register pointer by 8 */
03831                         add edi, 8      /* increase Dest register pointer by 8 */
03832                         dec              ecx            /* decrease loop counter */
03833                         jnz             L1031           /* check loop termination, proceed if required */
03834                         emms                            /* exit MMX state */
03835                         popa
03836         }
03837 #else
03838         /* i386 and x86_64 */
/* Intrinsics variant: mirrors the assembly above step by step, 8 bytes per iteration. */
03839         __m64 *mSrc1 = (__m64*)Src1;
03840         __m64 *mDest = (__m64*)Dest;
03841         __m64 mm0, mm1, mm2, mm3;
03842 
03843         int i;
03844         /* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
03845         unsigned short a = Nmax - Nmin;
03846         unsigned short b = Cmax - Cmin;
03847         if (b == 0) {
/* Same convention as the assembly path: a zero source range yields factor 255. */
03848             a = 255;
03849         } else {
03850             a /= b;
03851         }
03852         i = (a<<16)|a;
03853         mm0 = _m_from_int(i);
03854         mm1 = _m_from_int(i);
03855         mm0 = _m_punpckldq(mm0, mm1);                   /* fill higher words of MM0 with AX */
03856         /* Duplicate Cmin in 4 words of MM1 */
03857         i = (Cmin<<16)|(short)Cmin;
03858         mm1 = _m_from_int(i);
03859         mm2 = _m_from_int(i);
03860         mm1 = _m_punpckldq(mm1, mm2);                   /* fill higher words of MM1 with Cmin */
03861         /* Duplicate Nmin in 4 words of MM2 */
03862         i = (Nmin<<16)|(short)Nmin;
03863         mm2 = _m_from_int(i);
03864         mm3 = _m_from_int(i);
03865         mm2 = _m_punpckldq(mm2, mm3);                   /* fill higher words of MM2 with Nmin */
03866         __m64 mm7 = _m_from_int(0);                     /* zero MM7 register */
03867         for (i = 0; i < SrcLength/8; i++) {
/* These loop-local registers shadow the outer mm3 declared above. */
03868                 __m64 mm3, mm4, mm5, mm6;
03869                 mm3 = _m_punpcklbw(*mSrc1, mm7);        /* unpack low  bytes of Src1 into words */
03870                 mm4 = _m_punpckhbw(*mSrc1, mm7);        /* unpack high bytes of Src1 into words */
03871                 mm3 = _m_psubusb(mm3, mm1);             /* S-Cmin, low  bytes */
03872                 mm4 = _m_psubusb(mm4, mm1);             /* S-Cmin, high bytes */
03873                 mm3 = _m_pmullw(mm3, mm0);              /* MM0*(S-Cmin), low  bytes */
03874                 mm4 = _m_pmullw(mm4, mm0);              /* MM0*(S-Cmin), high bytes */
03875                 mm3 = _m_paddusb(mm3, mm2);             /* MM0*(S-Cmin)+Nmin, low  bytes */
03876                 mm4 = _m_paddusb(mm4, mm2);             /* MM0*(S-Cmin)+Nmin, high bytes */
03877                 /* Take abs value of the signed words */
03878                 mm5 = _m_psrawi(mm3, 15);               /* fill mm5 words with word sign bit */
03879                 mm6 = _m_psrawi(mm4, 15);               /* fill mm6 words with word sign bit */
03880                 mm3 = _m_pxor(mm3, mm5);                /* take 1's complement of only neg. words */
03881                 mm4 = _m_pxor(mm4, mm6);                /* take 1's complement of only neg. words */
03882                 mm3 = _m_psubsw(mm3, mm5);              /* add 1 to only neg. words, W-(-1) or W-0 */
03883                 mm4 = _m_psubsw(mm4, mm6);              /* add 1 to only neg. words, W-(-1) or W-0 */
03884                 *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
03885                 mSrc1++;
03886                 mDest++;
03887         }
03888         _m_empty();                                     /* clean MMX state */
03889 #endif
03890         return (0);
03891 #else
/* Built without USE_MMX: nothing was processed, report failure to the caller. */
03892         return (-1);
03893 #endif
03894 }
03895 
/*!
\brief Filter using NormalizeLinear: D = saturation0and255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)

The scale factor (Nmax - Nmin)/(Cmax - Cmin) is an integer division. When MMX
is detected at run time and the array holds at least 8 bytes, the MMX helper
processes the leading multiple-of-8 bytes and the C loop only finishes the
remainder; if the helper reports that it did nothing (MMX compiled out), the
C loop processes the whole array instead.

\param Src Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Cmin Normalization constant (lower bound of the source range).
\param Cmax Normalization constant (upper bound of the source range).
\param Nmin Normalization constant (lower bound of the target range).
\param Nmax Normalization constant (upper bound of the target range).

\return Returns 0 for success or -1 for error (NULL Src or Dest).
*/
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
                                                                   int Nmax)
{
        unsigned int i, istart;
        unsigned char *cursrc;
        unsigned char *curdest;
        int dN, dC, factor;
        int result;

        /* Validate input parameters */
        if ((Src == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* By default the C routine processes the whole image */
        istart = 0;

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                /*
                 * Only skip the leading bytes when the MMX helper actually
                 * processed them; it returns -1 when MMX support was compiled out.
                 */
                if (SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax) == 0) {
                        /* The helper handled every complete group of 8 bytes */
                        istart = length & 0xfffffff8;
                        if (istart == length) {
                                /* No unaligned bytes - we are done */
                                return (0);
                        }
                }
        }
        cursrc = &Src[istart];
        curdest = &Dest[istart];

        /* C routine to process image */
        dC = Cmax - Cmin;
        /*
         * NOTE(review): with an empty source range the C path leaves the
         * remaining bytes untouched, whereas the MMX helper uses a factor of
         * 255 in that case - confirm which behavior is intended.
         */
        if (dC == 0)
                return (0);
        dN = Nmax - Nmin;
        factor = dN / dC;
        for (i = istart; i < length; i++) {
                result = factor * ((int) (*cursrc) - Cmin) + Nmin;
                /* Saturate on both ends so negative results do not wrap around */
                if (result > 255) {
                        result = 255;
                } else if (result < 0) {
                        result = 0;
                }
                *curdest = (unsigned char) result;
                /* Advance pointers */
                cursrc++;
                curdest++;
        }

        return (0);
}
03963 
03964 /* ------------------------------------------------------------------------------------ */
03965 
/*!
\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( sum(Kernel * Src neighbourhood) / Divisor )

Only an i386 MMX implementation exists. It walks the interior of the image
(rows-2 by columns-2 pixels, first output at Dest + columns + 1), multiplies a
3x3 neighbourhood by the kernel, sums the products with signed 16-bit
saturation, divides by Divisor with a 16-bit IDIV and stores the result with
unsigned byte saturation. The border pixels of Dest are not written.

The kernel is read as three consecutive 8-byte rows, i.e. four signed words
per row; the in-code diagrams show the fourth word of each row as 0.

NOTE(review): every step loads 8 bytes from each of the three source rows even
though only the low 4 are used, so the loads for the last pixels extend a few
bytes beyond rows*columns - confirm that Src buffers are padded accordingly.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >2.
\param Kernel The convolution kernel (see layout note above).
\param Divisor The divisor of the convolution sum. Must be >0.

\return Returns 0 when the filter was applied. Returns -1 for invalid
parameters, when MMX is not available at run time, or when no MMX
implementation was compiled in (USE_MMX or i386 not defined).
*/
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
                                                                                   signed short *Kernel, unsigned char Divisor)
{
        /* Validate input parameters */
        if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
                return(-1);

        if ((columns < 3) || (rows < 3) || (Divisor == 0))
                return (-1);

        if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
                __asm
                {
                        pusha
                                pxor mm0, mm0           /* zero MM0 */
                                xor ebx, ebx    /* zero EBX */
                                mov bl, Divisor         /* load Divisor into BL */
                                mov edx, Kernel         /* load Kernel address into EDX */
                                movq mm5, [edx]         /* MM5 = {0,K2,K1,K0} */
                        add edx, 8      /* second row              |K0 K1 K2 0| */
                                movq mm6, [edx]         /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
                        add edx, 8      /* third row               |K6 K7 K8 0| */
                                movq mm7, [edx]         /* MM7 = {0,K8,K7,K6} */
                        /* ---, */
                        mov eax, columns        /* load columns into EAX */
                                mov esi, Src    /* ESI = Src row 0 address */
                                mov edi, Dest           /* load Dest address to EDI */
                                add edi, eax    /* EDI = EDI + columns */
                                inc              edi            /* 1 byte offset from the left edge */
                                mov edx, rows           /* initialize ROWS counter */
                                sub edx, 2      /* do not use first and last row */
                                /* ---, */
L10320:
                        mov ecx, eax    /* initialize COLUMS counter */
                                sub ecx, 2      /* do not use first and last column */
                                align 16                        /* 16 byte alignment of the loop entry */
L10322:
                        /* ---, */
                        movq mm1, [esi]         /* load 8 bytes of the image first row */
                        add esi, eax    /* move one row below */
                                movq mm2, [esi]         /* load 8 bytes of the image second row */
                        add esi, eax    /* move one row below */
                                movq mm3, [esi]         /* load 8 bytes of the image third row */
                        punpcklbw mm1, mm0      /* unpack first 4 bytes into words */
                                punpcklbw mm2, mm0      /* unpack first 4 bytes into words */
                                punpcklbw mm3, mm0      /* unpack first 4 bytes into words */
                                pmullw mm1, mm5         /* multiply words first row  image*Kernel */
                                pmullw mm2, mm6         /* multiply words second row image*Kernel */
                                pmullw mm3, mm7         /* multiply words third row  image*Kernel */
                                paddsw mm1, mm2         /* add 4 words of the first and second rows */
                                paddsw mm1, mm3         /* add 4 words of the third row and result */
                                movq mm2, mm1           /* copy MM1 into MM2 */
                                psrlq mm1, 32           /* shift 2 left words to the right */
                                paddsw mm1, mm2         /* add 2 left and 2 right result words */
                                movq mm3, mm1           /* copy MM1 into MM3 */
                                psrlq mm1, 16           /* shift 1 left word to the right */
                                paddsw mm1, mm3         /* add 1 left and 1 right result words */
                                /* --, */
                                movd mm2, eax           /* save EAX in MM2 */
                                movd mm3, edx           /* save EDX in MM3 */
                                movd eax, mm1           /* copy MM1 into EAX */
                                psraw mm1, 15           /* spread sign bit of the result */
                                movd edx, mm1           /* fill EDX with a sign bit */
                                idiv bx         /* IDIV - VERY EXPENSIVE */
                                movd mm1, eax           /* move result of division into MM1 */
                                packuswb mm1, mm0       /* pack division result with saturation */
                                movd eax, mm1           /* copy saturated result into EAX */
                                mov [edi], al           /* copy a byte result into Dest */
                                movd edx, mm3           /* restore saved EDX */
                                movd eax, mm2           /* restore saved EAX */
                                /* --, */
                                sub esi, eax    /* move two rows up */
                                sub esi, eax    /* */
                                inc              esi            /* move Src  pointer to the next pixel */
                                inc              edi            /* move Dest pointer to the next pixel */
                                /* ---, */
                                dec              ecx            /* decrease loop counter COLUMNS */
                                jnz            L10322           /* check loop termination, proceed if required */
                                add esi, 2      /* move to the next row in Src */
                                add edi, 2      /* move to the next row in Dest */
                                dec              edx            /* decrease loop counter ROWS */
                                jnz            L10320           /* check loop termination, proceed if required */
                                /* ---, */
                                emms                            /* exit MMX state */
                                popa
                }
#else
                asm volatile
                        ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
                        "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
                        "mov           %5, %%bl \n\t"   /* load Divisor into BL */
                        "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
                        "movq    (%%edx), %%mm5 \n\t"   /* MM5 = {0,K2,K1,K0} */
                        "add          $8, %%edx \n\t"   /* second row              |K0 K1 K2 0| */
                        "movq    (%%edx), %%mm6 \n\t"   /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
                        "add          $8, %%edx \n\t"   /* third row               |K6 K7 K8 0| */
                        "movq    (%%edx), %%mm7 \n\t"   /* MM7 = {0,K8,K7,K6} */
                        /* --- */
                        "mov          %3, %%eax \n\t"   /* load columns into EAX */
                        "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
                        "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
                        "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
                        "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
                        "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
                        "sub          $2, %%edx \n\t"   /* do not use first and last row */
                        /* --- */
                        ".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMS counter */
                        "sub          $2, %%ecx \n\t"   /* do not use first and last column */
                        ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
                        ".L10322:               \n\t"
                        /* --- */
                        "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the image first row */
                        "add       %%eax, %%esi \n\t"   /* move one row below */
                        "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes of the image second row */
                        "add       %%eax, %%esi \n\t"   /* move one row below */
                        "movq    (%%esi), %%mm3 \n\t"   /* load 8 bytes of the image third row */
                        "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first 4 bytes into words */
                        "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack first 4 bytes into words */
                        "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack first 4 bytes into words */
                        "pmullw    %%mm5, %%mm1 \n\t"   /* multiply words first row  image*Kernel */
                        "pmullw    %%mm6, %%mm2 \n\t"   /* multiply words second row image*Kernel */
                        "pmullw    %%mm7, %%mm3 \n\t"   /* multiply words third row  image*Kernel */
                        "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the first and second rows */
                        "paddsw    %%mm3, %%mm1 \n\t"   /* add 4 words of the third row and result */
                        "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
                        "psrlq       $32, %%mm1 \n\t"   /* shift 2 left words to the right */
                        "paddsw    %%mm2, %%mm1 \n\t"   /* add 2 left and 2 right result words */
                        "movq      %%mm1, %%mm3 \n\t"   /* copy MM1 into MM3 */
                        "psrlq       $16, %%mm1 \n\t"   /* shift 1 left word to the right */
                        "paddsw    %%mm3, %%mm1 \n\t"   /* add 1 left and 1 right result words */
                        /* -- */
                        "movd      %%eax, %%mm2 \n\t"   /* save EAX in MM2 */
                        "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
                        "movd      %%mm1, %%eax \n\t"   /* copy MM1 into EAX */
                        "psraw       $15, %%mm1 \n\t"   /* spread sign bit of the result */
                        "movd      %%mm1, %%edx \n\t"   /* fill EDX with a sign bit */
                        "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
                        "movd      %%eax, %%mm1 \n\t"   /* move result of division into MM1 */
                        "packuswb  %%mm0, %%mm1 \n\t"   /* pack division result with saturation */
                        "movd      %%mm1, %%eax \n\t"   /* copy saturated result into EAX */
                        "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
                        "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
                        "movd      %%mm2, %%eax \n\t"   /* restore saved EAX */
                        /* -- */
                        "sub       %%eax, %%esi \n\t"   /* move two rows up */
                        "sub       %%eax, %%esi \n\t"   /* */
                        "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
                        "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
                        /* --- */
                        "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
                        "jnz            .L10322 \n\t"   /* check loop termination, proceed if required */
                        "add          $2, %%esi \n\t"   /* move to the next row in Src */
                        "add          $2, %%edi \n\t"   /* move to the next row in Dest */
                        "dec              %%edx \n\t"   /* decrease loop counter ROWS */
                        "jnz            .L10320 \n\t"   /* check loop termination, proceed if required */
                        /* --- */
                        "emms                   \n\t"   /* exit MMX state */
                        "popa                   \n\t":"=m" (Dest)       /* %0 */
                        :"m"(Src),              /* %1 */
                        "m"(rows),              /* %2 */
                        "m"(columns),           /* %3 */
                        "m"(Kernel),            /* %4 */
                        "m"(Divisor)            /* %5 */
                        );
#endif
                return (0);
#else
                /*
                 * MMX is present at run time but no MMX implementation was
                 * compiled in: Dest has not been written, so do not report success.
                 */
                return (-1);
#endif
        } else {
                /* No non-MMX implementation yet */
                return (-1);
        }
}
04155 
04170 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04171                                                                                    signed short *Kernel, unsigned char Divisor)
04172 {
04173         /* Validate input parameters */
04174         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04175                 return(-1);
04176 
04177         if ((columns < 5) || (rows < 5) || (Divisor == 0))
04178                 return (-1);
04179 
04180         if ((SDL_imageFilterMMXdetect())) {
04181 //#ifdef USE_MMX
04182 #if defined(USE_MMX) && defined(i386)
04183 #if !defined(GCC__)
04184                 __asm
04185                 {
04186                         pusha
04187                                 pxor mm0, mm0           /* zero MM0 */
04188                                 xor ebx, ebx    /* zero EBX */
04189                                 mov bl, Divisor         /* load Divisor into BL */
04190                                 movd mm5, ebx           /* copy Divisor into MM5 */
04191                                 mov edx, Kernel         /* load Kernel address into EDX */
04192                                 mov esi, Src    /* load Src  address to ESI */
04193                                 mov edi, Dest           /* load Dest address to EDI */
04194                                 add edi, 2      /* 2 column offset from the left edge */
04195                                 mov eax, columns        /* load columns into EAX */
04196                                 shl eax, 1      /* EAX = columns * 2 */
04197                                 add edi, eax    /* 2 row offset from the top edge */
04198                                 shr eax, 1      /* EAX = columns */
04199                                 mov ebx, rows           /* initialize ROWS counter */
04200                                 sub ebx, 4      /* do not use first 2 and last 2 rows */
04201                                 /* ---, */
04202 L10330:
04203                         mov ecx, eax    /* initialize COLUMNS counter */
04204                                 sub ecx, 4      /* do not use first 2 and last 2 columns */
04205                                 align 16                        /* 16 byte alignment of the loop entry */
04206 L10332:
04207                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04208                                 movd mm6, esi           /* save ESI in MM6 */
04209                                 /* --- 1 */
04210                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04211                         movq mm2, mm1           /* copy MM1 into MM2 */
04212                                 add esi, eax    /* move Src pointer 1 row below */
04213                                 movq mm3, [edx]         /* load 4 words of Kernel */
04214                         add edx, 8      /* move pointer to other 4 words */
04215                                 movq mm4, [edx]         /* load 4 words of Kernel */
04216                         add edx, 8      /* move pointer to other 4 words */
04217                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04218                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04219                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04220                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04221                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04222                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04223                                 /* --- 2 */
04224                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04225                         movq mm2, mm1           /* copy MM1 into MM2 */
04226                                 add esi, eax    /* move Src pointer 1 row below */
04227                                 movq mm3, [edx]         /* load 4 words of Kernel */
04228                         add edx, 8      /* move pointer to other 4 words */
04229                                 movq mm4, [edx]         /* load 4 words of Kernel */
04230                         add edx, 8      /* move pointer to other 4 words */
04231                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04232                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04233                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04234                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04235                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04236                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04237                                 /* --- 3 */
04238                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04239                         movq mm2, mm1           /* copy MM1 into MM2 */
04240                                 add esi, eax    /* move Src pointer 1 row below */
04241                                 movq mm3, [edx]         /* load 4 words of Kernel */
04242                         add edx, 8      /* move pointer to other 4 words */
04243                                 movq mm4, [edx]         /* load 4 words of Kernel */
04244                         add edx, 8      /* move pointer to other 4 words */
04245                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04246                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04247                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04248                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04249                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04250                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04251                                 /* --- 4 */
04252                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04253                         movq mm2, mm1           /* copy MM1 into MM2 */
04254                                 add esi, eax    /* move Src pointer 1 row below */
04255                                 movq mm3, [edx]         /* load 4 words of Kernel */
04256                         add edx, 8      /* move pointer to other 4 words */
04257                                 movq mm4, [edx]         /* load 4 words of Kernel */
04258                         add edx, 8      /* move pointer to other 4 words */
04259                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04260                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04261                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04262                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04263                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04264                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04265                                 /* --- 5 */
04266                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04267                         movq mm2, mm1           /* copy MM1 into MM2 */
04268                                 movq mm3, [edx]         /* load 4 words of Kernel */
04269                         add edx, 8      /* move pointer to other 4 words */
04270                                 movq mm4, [edx]         /* load 4 words of Kernel */
04271                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04272                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04273                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04274                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04275                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04276                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04277                                 /* ---, */
04278                                 movq mm3, mm7           /* copy MM7 into MM3 */
04279                                 psrlq mm7, 32           /* shift 2 left words to the right */
04280                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
04281                                 movq mm2, mm7           /* copy MM7 into MM2 */
04282                                 psrlq mm7, 16           /* shift 1 left word to the right */
04283                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
04284                                 /* ---, */
04285                                 movd mm1, eax           /* save EAX in MM1 */
04286                                 movd mm2, ebx           /* save EBX in MM2 */
04287                                 movd mm3, edx           /* save EDX in MM3 */
04288                                 movd eax, mm7           /* load summation result into EAX */
04289                                 psraw mm7, 15           /* spread sign bit of the result */
04290                                 movd ebx, mm5           /* load Divisor into EBX */
04291                                 movd edx, mm7           /* fill EDX with a sign bit */
04292                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04293                                 movd mm7, eax           /* move result of division into MM7 */
04294                                 packuswb mm7, mm0       /* pack division result with saturation */
04295                                 movd eax, mm7           /* copy saturated result into EAX */
04296                                 mov [edi], al           /* copy a byte result into Dest */
04297                                 movd edx, mm3           /* restore saved EDX */
04298                                 movd ebx, mm2           /* restore saved EBX */
04299                                 movd eax, mm1           /* restore saved EAX */
04300                                 /* --, */
04301                                 movd esi, mm6           /* move Src pointer to the top pixel */
04302                                 sub edx, 72     /* EDX = Kernel address */
04303                                 inc              esi            /* move Src  pointer to the next pixel */
04304                                 inc              edi            /* move Dest pointer to the next pixel */
04305                                 /* ---, */
04306                                 dec              ecx            /* decrease loop counter COLUMNS */
04307                                 jnz            L10332           /* check loop termination, proceed if required */
04308                                 add esi, 4      /* move to the next row in Src */
04309                                 add edi, 4      /* move to the next row in Dest */
04310                                 dec              ebx            /* decrease loop counter ROWS */
04311                                 jnz            L10330           /* check loop termination, proceed if required */
04312                                 /* ---, */
04313                                 emms                            /* exit MMX state */
04314                                 popa
04315                 }
04316 #else
04317                 asm volatile
04318                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04319                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04320                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04321                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
04322                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04323                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
04324                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04325                         "add          $2, %%edi \n\t"   /* 2 column offset from the left edge */
04326                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04327                         "shl          $1, %%eax \n\t"   /* EAX = columns * 2 */
04328                         "add       %%eax, %%edi \n\t"   /* 2 row offset from the top edge */
04329                         "shr          $1, %%eax \n\t"   /* EAX = columns */
04330                         "mov          %2, %%ebx \n\t"   /* initialize ROWS counter */
04331                         "sub          $4, %%ebx \n\t"   /* do not use first 2 and last 2 rows */
04332                         /* --- */
04333                         ".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04334                         "sub          $4, %%ecx \n\t"   /* do not use first 2 and last 2 columns */
04335                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04336                         ".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
04337                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
04338                         /* --- 1 */
04339                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04340                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04341                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04342                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04343                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04344                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04345                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04346                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04347                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04348                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04349                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04350                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04351                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04352                         /* --- 2 */
04353                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04354                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04355                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04356                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04357                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04358                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04359                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04360                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04361                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04362                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04363                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04364                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04365                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04366                         /* --- 3 */
04367                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04368                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04369                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04370                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04371                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04372                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04373                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04374                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04375                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04376                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04377                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04378                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04379                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04380                         /* --- 4 */
04381                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04382                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04383                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04384                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04385                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04386                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04387                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04388                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04389                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04390                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04391                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04392                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04393                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04394                         /* --- 5 */
04395                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04396                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04397                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04398                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04399                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04400                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04401                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04402                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04403                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04404                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04405                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04406                         /* --- */
04407                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
04408                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
04409                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
04410                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
04411                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
04412                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
04413                         /* --- */
04414                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
04415                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
04416                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04417                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
04418                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
04419                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
04420                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
04421                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04422                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
04423                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
04424                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
04425                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
04426                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
04427                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
04428                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
04429                         /* -- */
04430                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
04431                         "sub         $72, %%edx \n\t"   /* EDX = Kernel address */
04432                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
04433                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
04434                         /* --- */
04435                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
04436                         "jnz            .L10332 \n\t"   /* check loop termination, proceed if required */
04437                         "add          $4, %%esi \n\t"   /* move to the next row in Src */
04438                         "add          $4, %%edi \n\t"   /* move to the next row in Dest */
04439                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
04440                         "jnz            .L10330 \n\t"   /* check loop termination, proceed if required */
04441                         /* --- */
04442                         "emms                   \n\t"   /* exit MMX state */
04443                         "popa                   \n\t":"=m" (Dest)       /* %0 */
04444                         :"m"(Src),              /* %1 */
04445                         "m"(rows),              /* %2 */
04446                         "m"(columns),           /* %3 */
04447                         "m"(Kernel),            /* %4 */
04448                         "m"(Divisor)            /* %5 */
04449                         );
04450 #endif
04451 #endif
04452                 return (0);
04453         } else {
04454                 /* No non-MMX implementation yet */
04455                 return (-1);
04456         }
04457 }
04458 
04473 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04474                                                                                    signed short *Kernel, unsigned char Divisor)
04475 {
04476         /* Validate input parameters */
04477         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04478                 return(-1);
04479 
04480         if ((columns < 7) || (rows < 7) || (Divisor == 0))
04481                 return (-1);
04482 
04483         if ((SDL_imageFilterMMXdetect())) {
04484 //#ifdef USE_MMX
04485 #if defined(USE_MMX) && defined(i386)
04486 #if !defined(GCC__)
04487                 __asm
04488                 {
04489                         pusha
04490                                 pxor mm0, mm0           /* zero MM0 */
04491                                 xor ebx, ebx    /* zero EBX */
04492                                 mov bl, Divisor         /* load Divisor into BL */
04493                                 movd mm5, ebx           /* copy Divisor into MM5 */
04494                                 mov edx, Kernel         /* load Kernel address into EDX */
04495                                 mov esi, Src    /* load Src  address to ESI */
04496                                 mov edi, Dest           /* load Dest address to EDI */
04497                                 add edi, 3      /* 3 column offset from the left edge */
04498                                 mov eax, columns        /* load columns into EAX */
04499                                 add edi, eax    /* 3 row offset from the top edge */
04500                                 add edi, eax
04501                                 add edi, eax
04502                                 mov ebx, rows           /* initialize ROWS counter */
04503                                 sub ebx, 6      /* do not use first 3 and last 3 rows */
04504                                 /* ---, */
04505 L10340:
04506                         mov ecx, eax    /* initialize COLUMNS counter */
04507                                 sub ecx, 6      /* do not use first 3 and last 3 columns */
04508                                 align 16                        /* 16 byte alignment of the loop entry */
04509 L10342:
04510                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04511                                 movd mm6, esi           /* save ESI in MM6 */
04512                                 /* --- 1 */
04513                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04514                         movq mm2, mm1           /* copy MM1 into MM2 */
04515                                 add esi, eax    /* move Src pointer 1 row below */
04516                                 movq mm3, [edx]         /* load 4 words of Kernel */
04517                         add edx, 8      /* move pointer to other 4 words */
04518                                 movq mm4, [edx]         /* load 4 words of Kernel */
04519                         add edx, 8      /* move pointer to other 4 words */
04520                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04521                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04522                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04523                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04524                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04525                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04526                                 /* --- 2 */
04527                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04528                         movq mm2, mm1           /* copy MM1 into MM2 */
04529                                 add esi, eax    /* move Src pointer 1 row below */
04530                                 movq mm3, [edx]         /* load 4 words of Kernel */
04531                         add edx, 8      /* move pointer to other 4 words */
04532                                 movq mm4, [edx]         /* load 4 words of Kernel */
04533                         add edx, 8      /* move pointer to other 4 words */
04534                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04535                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04536                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04537                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04538                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04539                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04540                                 /* --- 3 */
04541                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04542                         movq mm2, mm1           /* copy MM1 into MM2 */
04543                                 add esi, eax    /* move Src pointer 1 row below */
04544                                 movq mm3, [edx]         /* load 4 words of Kernel */
04545                         add edx, 8      /* move pointer to other 4 words */
04546                                 movq mm4, [edx]         /* load 4 words of Kernel */
04547                         add edx, 8      /* move pointer to other 4 words */
04548                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04549                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04550                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04551                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04552                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04553                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04554                                 /* --- 4 */
04555                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04556                         movq mm2, mm1           /* copy MM1 into MM2 */
04557                                 add esi, eax    /* move Src pointer 1 row below */
04558                                 movq mm3, [edx]         /* load 4 words of Kernel */
04559                         add edx, 8      /* move pointer to other 4 words */
04560                                 movq mm4, [edx]         /* load 4 words of Kernel */
04561                         add edx, 8      /* move pointer to other 4 words */
04562                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04563                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04564                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04565                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04566                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04567                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04568                                 /* --- 5 */
04569                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04570                         movq mm2, mm1           /* copy MM1 into MM2 */
04571                                 add esi, eax    /* move Src pointer 1 row below */
04572                                 movq mm3, [edx]         /* load 4 words of Kernel */
04573                         add edx, 8      /* move pointer to other 4 words */
04574                                 movq mm4, [edx]         /* load 4 words of Kernel */
04575                         add edx, 8      /* move pointer to other 4 words */
04576                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04577                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04578                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04579                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04580                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04581                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04582                                 /* --- 6 */
04583                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04584                         movq mm2, mm1           /* copy MM1 into MM2 */
04585                                 add esi, eax    /* move Src pointer 1 row below */
04586                                 movq mm3, [edx]         /* load 4 words of Kernel */
04587                         add edx, 8      /* move pointer to other 4 words */
04588                                 movq mm4, [edx]         /* load 4 words of Kernel */
04589                         add edx, 8      /* move pointer to other 4 words */
04590                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04591                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04592                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04593                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04594                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04595                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04596                                 /* --- 7 */
04597                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04598                         movq mm2, mm1           /* copy MM1 into MM2 */
04599                                 movq mm3, [edx]         /* load 4 words of Kernel */
04600                         add edx, 8      /* move pointer to other 4 words */
04601                                 movq mm4, [edx]         /* load 4 words of Kernel */
04602                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04603                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04604                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04605                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04606                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04607                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04608                                 /* ---, */
04609                                 movq mm3, mm7           /* copy MM7 into MM3 */
04610                                 psrlq mm7, 32           /* shift 2 left words to the right */
04611                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
04612                                 movq mm2, mm7           /* copy MM7 into MM2 */
04613                                 psrlq mm7, 16           /* shift 1 left word to the right */
04614                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
04615                                 /* ---, */
04616                                 movd mm1, eax           /* save EAX in MM1 */
04617                                 movd mm2, ebx           /* save EBX in MM2 */
04618                                 movd mm3, edx           /* save EDX in MM3 */
04619                                 movd eax, mm7           /* load summation result into EAX */
04620                                 psraw mm7, 15           /* spread sign bit of the result */
04621                                 movd ebx, mm5           /* load Divisor into EBX */
04622                                 movd edx, mm7           /* fill EDX with a sign bit */
04623                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04624                                 movd mm7, eax           /* move result of division into MM7 */
04625                                 packuswb mm7, mm0       /* pack division result with saturation */
04626                                 movd eax, mm7           /* copy saturated result into EAX */
04627                                 mov [edi], al           /* copy a byte result into Dest */
04628                                 movd edx, mm3           /* restore saved EDX */
04629                                 movd ebx, mm2           /* restore saved EBX */
04630                                 movd eax, mm1           /* restore saved EAX */
04631                                 /* --, */
04632                                 movd esi, mm6           /* move Src pointer to the top pixel */
04633                                 sub edx, 104    /* EDX = Kernel address */
04634                                 inc              esi            /* move Src  pointer to the next pixel */
04635                                 inc              edi            /* move Dest pointer to the next pixel */
04636                                 /* ---, */
04637                                 dec              ecx            /* decrease loop counter COLUMNS */
04638                                 jnz            L10342           /* check loop termination, proceed if required */
04639                                 add esi, 6      /* move to the next row in Src */
04640                                 add edi, 6      /* move to the next row in Dest */
04641                                 dec              ebx            /* decrease loop counter ROWS */
04642                                 jnz            L10340           /* check loop termination, proceed if required */
04643                                 /* ---, */
04644                                 emms                            /* exit MMX state */
04645                                 popa
04646                 }
04647 #else
04648                 asm volatile
04649                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04650                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04651                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04652                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
04653                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04654                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
04655                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04656                         "add          $3, %%edi \n\t"   /* 3 column offset from the left edge */
04657                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04658                         "add       %%eax, %%edi \n\t"   /* 3 row offset from the top edge */
04659                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"       /* initialize ROWS counter */
04660                         "sub          $6, %%ebx \n\t"   /* do not use first 3 and last 3 rows */
04661                         /* --- */
04662                         ".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04663                         "sub          $6, %%ecx \n\t"   /* do not use first 3 and last 3 columns */
04664                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04665                         ".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
04666                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
04667                         /* --- 1 */
04668                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04669                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04670                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04671                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04672                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04673                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04674                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04675                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04676                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04677                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04678                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04679                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04680                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04681                         /* --- 2 */
04682                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04683                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04684                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04685                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04686                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04687                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04688                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04689                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04690                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04691                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04692                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04693                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04694                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04695                         /* --- 3 */
04696                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04697                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04698                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04699                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04700                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04701                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04702                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04703                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04704                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04705                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04706                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04707                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04708                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04709                         /* --- 4 */
04710                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04711                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04712                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04713                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04714                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04715                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04716                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04717                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04718                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04719                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04720                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04721                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04722                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04723                         /* --- 5 */
04724                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04725                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04726                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04727                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04728                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04729                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04730                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04731                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04732                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04733                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04734                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04735                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04736                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04737                         /* --- 6 */
04738                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04739                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04740                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04741                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04742                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04743                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04744                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04745                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04746                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04747                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04748                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04749                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04750                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04751                         /* --- 7 */
04752                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04753                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04754                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04755                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04756                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04757                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04758                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04759                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04760                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04761                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04762                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04763                         /* --- */
04764                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
04765                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
04766                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
04767                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
04768                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
04769                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
04770                         /* --- */
04771                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
04772                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
04773                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04774                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
04775                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
04776                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
04777                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
04778                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04779                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
04780                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
04781                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
04782                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
04783                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
04784                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
04785                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
04786                         /* -- */
04787                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
04788                         "sub        $104, %%edx \n\t"   /* EDX = Kernel address */
04789                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
04790                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
04791                         /* --- */
04792                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
04793                         "jnz            .L10342 \n\t"   /* check loop termination, proceed if required */
04794                         "add          $6, %%esi \n\t"   /* move to the next row in Src */
04795                         "add          $6, %%edi \n\t"   /* move to the next row in Dest */
04796                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
04797                         "jnz            .L10340 \n\t"   /* check loop termination, proceed if required */
04798                         /* --- */
04799                         "emms                   \n\t"   /* exit MMX state */
04800                         "popa                   \n\t":"=m" (Dest)       /* %0 */
04801                         :"m"(Src),              /* %1 */
04802                         "m"(rows),              /* %2 */
04803                         "m"(columns),           /* %3 */
04804                         "m"(Kernel),            /* %4 */
04805                         "m"(Divisor)            /* %5 */
04806                         );
04807 #endif
04808 #endif
04809                 return (0);
04810         } else {
04811                 /* No non-MMX implementation yet */
04812                 return (-1);
04813         }
04814 }
04815 
04830 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04831                                                                                    signed short *Kernel, unsigned char Divisor)
04832 {
04833         /* Validate input parameters */
04834         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04835                 return(-1);
04836 
04837         if ((columns < 9) || (rows < 9) || (Divisor == 0))
04838                 return (-1);
04839 
04840         if ((SDL_imageFilterMMXdetect())) {
04841 //#ifdef USE_MMX
04842 #if defined(USE_MMX) && defined(i386)
04843 #if !defined(GCC__)
04844                 __asm
04845                 {
04846                         pusha
04847                                 pxor mm0, mm0           /* zero MM0 */
04848                                 xor ebx, ebx    /* zero EBX */
04849                                 mov bl, Divisor         /* load Divisor into BL */
04850                                 movd mm5, ebx           /* copy Divisor into MM5 */
04851                                 mov edx, Kernel         /* load Kernel address into EDX */
04852                                 mov esi, Src    /* load Src  address to ESI */
04853                                 mov edi, Dest           /* load Dest address to EDI */
04854                                 add edi, 4      /* 4 column offset from the left edge */
04855                                 mov eax, columns        /* load columns into EAX */
04856                                 add edi, eax    /* 4 row offset from the top edge */
04857                                 add edi, eax
04858                                 add edi, eax
04859                                 add edi, eax
04860                                 mov ebx, rows           /* initialize ROWS counter */
04861                                 sub ebx, 8      /* do not use first 4 and last 4 rows */
04862                                 /* ---, */
04863 L10350:
04864                         mov ecx, eax    /* initialize COLUMNS counter */
04865                                 sub ecx, 8      /* do not use first 4 and last 4 columns */
04866                                 align 16                        /* 16 byte alignment of the loop entry */
04867 L10352:
04868                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04869                                 movd mm6, esi           /* save ESI in MM6 */
04870                                 /* --- 1 */
04871                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04872                         movq mm2, mm1           /* copy MM1 into MM2 */
04873                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04874                                 movq mm3, [edx]         /* load 4 words of Kernel */
04875                         add edx, 8      /* move pointer to other 4 words */
04876                                 movq mm4, [edx]         /* load 4 words of Kernel */
04877                         add edx, 8      /* move pointer to other 4 words */
04878                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04879                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04880                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04881                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04882                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04883                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04884                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04885                         dec              esi
04886                                 add esi, eax    /* move Src pointer 1 row below */
04887                                 movq mm3, [edx]         /* load 4 words of Kernel */
04888                         add edx, 8      /* move pointer to other 4 words */
04889                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04890                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04891                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04892                                 /* --- 2 */
04893                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04894                         movq mm2, mm1           /* copy MM1 into MM2 */
04895                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04896                                 movq mm3, [edx]         /* load 4 words of Kernel */
04897                         add edx, 8      /* move pointer to other 4 words */
04898                                 movq mm4, [edx]         /* load 4 words of Kernel */
04899                         add edx, 8      /* move pointer to other 4 words */
04900                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04901                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04902                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04903                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04904                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04905                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04906                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04907                         dec              esi
04908                                 add esi, eax    /* move Src pointer 1 row below */
04909                                 movq mm3, [edx]         /* load 4 words of Kernel */
04910                         add edx, 8      /* move pointer to other 4 words */
04911                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04912                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04913                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04914                                 /* --- 3 */
04915                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04916                         movq mm2, mm1           /* copy MM1 into MM2 */
04917                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04918                                 movq mm3, [edx]         /* load 4 words of Kernel */
04919                         add edx, 8      /* move pointer to other 4 words */
04920                                 movq mm4, [edx]         /* load 4 words of Kernel */
04921                         add edx, 8      /* move pointer to other 4 words */
04922                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04923                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04924                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04925                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04926                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04927                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04928                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04929                         dec              esi
04930                                 add esi, eax    /* move Src pointer 1 row below */
04931                                 movq mm3, [edx]         /* load 4 words of Kernel */
04932                         add edx, 8      /* move pointer to other 4 words */
04933                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04934                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04935                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04936                                 /* --- 4 */
04937                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04938                         movq mm2, mm1           /* copy MM1 into MM2 */
04939                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04940                                 movq mm3, [edx]         /* load 4 words of Kernel */
04941                         add edx, 8      /* move pointer to other 4 words */
04942                                 movq mm4, [edx]         /* load 4 words of Kernel */
04943                         add edx, 8      /* move pointer to other 4 words */
04944                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04945                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04946                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04947                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04948                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04949                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04950                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04951                         dec              esi
04952                                 add esi, eax    /* move Src pointer 1 row below */
04953                                 movq mm3, [edx]         /* load 4 words of Kernel */
04954                         add edx, 8      /* move pointer to other 4 words */
04955                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04956                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04957                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04958                                 /* --- 5 */
04959                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04960                         movq mm2, mm1           /* copy MM1 into MM2 */
04961                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04962                                 movq mm3, [edx]         /* load 4 words of Kernel */
04963                         add edx, 8      /* move pointer to other 4 words */
04964                                 movq mm4, [edx]         /* load 4 words of Kernel */
04965                         add edx, 8      /* move pointer to other 4 words */
04966                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04967                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04968                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04969                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04970                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04971                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04972                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04973                         dec              esi
04974                                 add esi, eax    /* move Src pointer 1 row below */
04975                                 movq mm3, [edx]         /* load 4 words of Kernel */
04976                         add edx, 8      /* move pointer to other 4 words */
04977                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04978                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04979                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04980                                 /* --- 6 */
04981                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04982                         movq mm2, mm1           /* copy MM1 into MM2 */
04983                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04984                                 movq mm3, [edx]         /* load 4 words of Kernel */
04985                         add edx, 8      /* move pointer to other 4 words */
04986                                 movq mm4, [edx]         /* load 4 words of Kernel */
04987                         add edx, 8      /* move pointer to other 4 words */
04988                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04989                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04990                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04991                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04992                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04993                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04994                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04995                         dec              esi
04996                                 add esi, eax    /* move Src pointer 1 row below */
04997                                 movq mm3, [edx]         /* load 4 words of Kernel */
04998                         add edx, 8      /* move pointer to other 4 words */
04999                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05000                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05001                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05002                                 /* --- 7 */
05003                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05004                         movq mm2, mm1           /* copy MM1 into MM2 */
05005                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05006                                 movq mm3, [edx]         /* load 4 words of Kernel */
05007                         add edx, 8      /* move pointer to other 4 words */
05008                                 movq mm4, [edx]         /* load 4 words of Kernel */
05009                         add edx, 8      /* move pointer to other 4 words */
05010                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05011                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05012                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05013                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05014                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05015                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05016                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05017                         dec              esi
05018                                 add esi, eax    /* move Src pointer 1 row below */
05019                                 movq mm3, [edx]         /* load 4 words of Kernel */
05020                         add edx, 8      /* move pointer to other 4 words */
05021                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05022                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05023                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05024                                 /* --- 8 */
05025                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05026                         movq mm2, mm1           /* copy MM1 into MM2 */
05027                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05028                                 movq mm3, [edx]         /* load 4 words of Kernel */
05029                         add edx, 8      /* move pointer to other 4 words */
05030                                 movq mm4, [edx]         /* load 4 words of Kernel */
05031                         add edx, 8      /* move pointer to other 4 words */
05032                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05033                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05034                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05035                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05036                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05037                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05038                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05039                         dec              esi
05040                                 add esi, eax    /* move Src pointer 1 row below */
05041                                 movq mm3, [edx]         /* load 4 words of Kernel */
05042                         add edx, 8      /* move pointer to other 4 words */
05043                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05044                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05045                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05046                                 /* --- 9 */
05047                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05048                         movq mm2, mm1           /* copy MM1 into MM2 */
05049                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05050                                 movq mm3, [edx]         /* load 4 words of Kernel */
05051                         add edx, 8      /* move pointer to other 4 words */
05052                                 movq mm4, [edx]         /* load 4 words of Kernel */
05053                         add edx, 8      /* move pointer to other 4 words */
05054                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05055                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05056                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05057                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05058                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05059                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05060                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05061                         movq mm3, [edx]         /* load 4 words of Kernel */
05062                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05063                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05064                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05065                                 /* ---, */
05066                                 movq mm3, mm7           /* copy MM7 into MM3 */
05067                                 psrlq mm7, 32           /* shift 2 left words to the right */
05068                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
05069                                 movq mm2, mm7           /* copy MM7 into MM2 */
05070                                 psrlq mm7, 16           /* shift 1 left word to the right */
05071                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
05072                                 /* ---, */
05073                                 movd mm1, eax           /* save EAX in MM1 */
05074                                 movd mm2, ebx           /* save EBX in MM2 */
05075                                 movd mm3, edx           /* save EDX in MM3 */
05076                                 movd eax, mm7           /* load summation result into EAX */
05077                                 psraw mm7, 15           /* spread sign bit of the result */
05078                                 movd ebx, mm5           /* load Divisor into EBX */
05079                                 movd edx, mm7           /* fill EDX with a sign bit */
05080                                 idiv bx         /* IDIV - VERY EXPENSIVE */
05081                                 movd mm7, eax           /* move result of division into MM7 */
05082                                 packuswb mm7, mm0       /* pack division result with saturation */
05083                                 movd eax, mm7           /* copy saturated result into EAX */
05084                                 mov [edi], al           /* copy a byte result into Dest */
05085                                 movd edx, mm3           /* restore saved EDX */
05086                                 movd ebx, mm2           /* restore saved EBX */
05087                                 movd eax, mm1           /* restore saved EAX */
05088                                 /* --, */
05089                                 movd esi, mm6           /* move Src pointer to the top pixel */
05090                                 sub edx, 208    /* EDX = Kernel address */
05091                                 inc              esi            /* move Src  pointer to the next pixel */
05092                                 inc              edi            /* move Dest pointer to the next pixel */
05093                                 /* ---, */
05094                                 dec              ecx            /* decrease loop counter COLUMNS */
05095                                 jnz            L10352           /* check loop termination, proceed if required */
05096                                 add esi, 8      /* move to the next row in Src */
05097                                 add edi, 8      /* move to the next row in Dest */
05098                                 dec              ebx            /* decrease loop counter ROWS */
05099                                 jnz            L10350           /* check loop termination, proceed if required */
05100                                 /* ---, */
05101                                 emms                            /* exit MMX state */
05102                                 popa
05103                 }
05104 #else
05105                 asm volatile
05106                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
05107                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
05108                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
05109                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
05110                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
05111                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
05112                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
05113                         "add          $4, %%edi \n\t"   /* 4 column offset from the left edge */
05114                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
05115                         "add       %%eax, %%edi \n\t"   /* 4 row offset from the top edge */
05116                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t" /* initialize ROWS counter */
05117                         "sub          $8, %%ebx \n\t"   /* do not use first 4 and last 4 rows */
05118                         /* --- */
05119                         ".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
05120                         "sub          $8, %%ecx \n\t"   /* do not use first 4 and last 4 columns */
05121                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
05122                         ".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
05123                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
05124                         /* --- 1 */
05125                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05126                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05127                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05128                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05129                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05130                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05131                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05132                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05133                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05134                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05135                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05136                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05137                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05138                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05139                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05140                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05141                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05142                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05143                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05144                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05145                         /* --- 2 */
05146                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05147                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05148                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05149                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05150                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05151                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05152                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05153                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05154                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05155                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05156                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05157                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05158                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05159                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05160                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05161                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05162                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05163                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05164                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05165                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05166                         /* --- 3 */
05167                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05168                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05169                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05170                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05171                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05172                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05173                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05174                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05175                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05176                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05177                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05178                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05179                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05180                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05181                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05182                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05183                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05184                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05185                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05186                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05187                         /* --- 4 */
05188                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05189                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05190                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05191                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05192                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05193                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05194                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05195                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05196                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05197                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05198                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05199                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05200                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05201                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05202                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05203                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05204                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05205                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05206                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05207                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05208                         /* --- 5 */
05209                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05210                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05211                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05212                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05213                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05214                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05215                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05216                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05217                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05218                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05219                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05220                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05221                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05222                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05223                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05224                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05225                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05226                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05227                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05228                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05229                         /* --- 6 */
05230                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05231                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05232                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05233                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05234                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05235                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05236                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05237                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05238                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05239                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05240                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05241                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05242                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05243                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05244                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05245                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05246                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05247                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05248                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05249                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05250                         /* --- 7 */
05251                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05252                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05253                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05254                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05255                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05256                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05257                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05258                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05259                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05260                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05261                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05262                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05263                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05264                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05265                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05266                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05267                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05268                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05269                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05270                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05271                         /* --- 8 */
05272                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05273                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05274                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05275                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05276                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05277                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05278                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05279                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05280                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05281                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05282                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05283                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05284                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05285                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05286                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05287                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05288                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05289                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05290                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05291                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05292                         /* --- 9 */
05293                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05294                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05295                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05296                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05297                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05298                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05299                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05300                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05301                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05302                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05303                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05304                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05305                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05306                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05307                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05308                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05309                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05310                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05311                         /* --- */
05312                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
05313                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
05314                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
05315                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
05316                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
05317                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
05318                         /* --- */
05319                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
05320                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
05321                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
05322                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
05323                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
05324                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
05325                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
05326                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
05327                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
05328                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
05329                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
05330                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
05331                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
05332                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
05333                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
05334                         /* -- */
05335                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
05336                         "sub        $208, %%edx \n\t"   /* EDX = Kernel address */
05337                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
05338                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05339                         /* --- */
05340                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05341                         "jnz            .L10352 \n\t"   /* check loop termination, proceed if required */
05342                         "add          $8, %%esi \n\t"   /* move to the next row in Src */
05343                         "add          $8, %%edi \n\t"   /* move to the next row in Dest */
05344                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
05345                         "jnz            .L10350 \n\t"   /* check loop termination, proceed if required */
05346                         /* --- */
05347                         "emms                   \n\t"   /* exit MMX state */
05348                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05349                         :"m"(Src),              /* %1 */
05350                         "m"(rows),              /* %2 */
05351                         "m"(columns),           /* %3 */
05352                         "m"(Kernel),            /* %4 */
05353                         "m"(Divisor)            /* %5 */
05354                         );
05355 #endif
05356 #endif
05357                 return (0);
05358         } else {
05359                 /* No non-MMX implementation yet */
05360                 return (-1);
05361         }
05362 }
05363 
05378 int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
05379                                                                                            signed short *Kernel, unsigned char NRightShift)
05380 {
05381         /* Validate input parameters */
05382         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
05383                 return(-1);
05384 
05385         if ((columns < 3) || (rows < 3) || (NRightShift > 7))
05386                 return (-1);
05387 
05388         if ((SDL_imageFilterMMXdetect())) {
05389 //#ifdef USE_MMX
05390 #if defined(USE_MMX) && defined(i386)
05391 #if !defined(GCC__)
05392                 __asm
05393                 {
05394                         pusha
05395                                 pxor mm0, mm0           /* zero MM0 */
05396                                 xor ebx, ebx    /* zero EBX */
05397                                 mov bl, NRightShift     /* load NRightShift into BL */
05398                                 movd mm4, ebx           /* copy NRightShift into MM4 */
05399                                 mov edx, Kernel         /* load Kernel address into EDX */
05400                                 movq mm5, [edx]         /* MM5 = {0,K2,K1,K0} */
05401                         add edx, 8      /* second row              |K0 K1 K2 0| */
05402                                 movq mm6, [edx]         /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
05403                         add edx, 8      /* third row               |K6 K7 K8 0| */
05404                                 movq mm7, [edx]         /* MM7 = {0,K8,K7,K6} */
05405                         /* ---, */
05406                         mov eax, columns        /* load columns into EAX */
05407                                 mov esi, Src    /* ESI = Src row 0 address */
05408                                 mov edi, Dest           /* load Dest address to EDI */
05409                                 add edi, eax    /* EDI = EDI + columns */
05410                                 inc              edi            /* 1 byte offset from the left edge */
05411                                 mov edx, rows           /* initialize ROWS counter */
05412                                 sub edx, 2      /* do not use first and last row */
05413                                 /* ---, */
05414 L10360:
05415                         mov ecx, eax    /* initialize COLUMNS counter */
05416                                 sub ecx, 2      /* do not use first and last column */
05417                                 align 16                        /* 16 byte alignment of the loop entry */
05418 L10362:
05419                         /* ---, */
05420                         movq mm1, [esi]         /* load 8 bytes of the image first row */
05421                         add esi, eax    /* move one row below */
05422                                 movq mm2, [esi]         /* load 8 bytes of the image second row */
05423                         add esi, eax    /* move one row below */
05424                                 movq mm3, [esi]         /* load 8 bytes of the image third row */
05425                         punpcklbw mm1, mm0      /* unpack first 4 bytes into words */
05426                                 punpcklbw mm2, mm0      /* unpack first 4 bytes into words */
05427                                 punpcklbw mm3, mm0      /* unpack first 4 bytes into words */
05428                                 psrlw mm1, mm4          /* shift right each pixel NshiftRight times */
05429                                 psrlw mm2, mm4          /* shift right each pixel NshiftRight times */
05430                                 psrlw mm3, mm4          /* shift right each pixel NshiftRight times */
05431                                 pmullw mm1, mm5         /* multiply words first row  image*Kernel */
05432                                 pmullw mm2, mm6         /* multiply words second row image*Kernel */
05433                                 pmullw mm3, mm7         /* multiply words third row  image*Kernel */
05434                                 paddsw mm1, mm2         /* add 4 words of the first and second rows */
05435                                 paddsw mm1, mm3         /* add 4 words of the third row and result */
05436                                 movq mm2, mm1           /* copy MM1 into MM2 */
05437                                 psrlq mm1, 32           /* shift 2 left words to the right */
05438                                 paddsw mm1, mm2         /* add 2 left and 2 right result words */
05439                                 movq mm3, mm1           /* copy MM1 into MM3 */
05440                                 psrlq mm1, 16           /* shift 1 left word to the right */
05441                                 paddsw mm1, mm3         /* add 1 left and 1 right result words */
05442                                 packuswb mm1, mm0       /* pack shift result with saturation */
05443                                 movd ebx, mm1           /* copy saturated result into EBX */
05444                                 mov [edi], bl           /* copy a byte result into Dest */
05445                                 /* --, */
05446                                 sub esi, eax    /* move two rows up */
05447                                 sub esi, eax
05448                                 inc              esi            /* move Src  pointer to the next pixel */
05449                                 inc              edi            /* move Dest pointer to the next pixel */
05450                                 /* ---, */
05451                                 dec              ecx            /* decrease loop counter COLUMNS */
05452                                 jnz            L10362           /* check loop termination, proceed if required */
05453                                 add esi, 2      /* move to the next row in Src */
05454                                 add edi, 2      /* move to the next row in Dest */
05455                                 dec              edx            /* decrease loop counter ROWS */
05456                                 jnz            L10360           /* check loop termination, proceed if required */
05457                                 /* ---, */
05458                                 emms                            /* exit MMX state */
05459                                 popa
05460                 }
05461 #else
05462                 asm volatile
05463                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
05464                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
05465                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
05466                         "movd      %%ebx, %%mm4 \n\t"   /* copy NRightShift into MM4 */
05467                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
05468                         "movq    (%%edx), %%mm5 \n\t"   /* MM5 = {0,K2,K1,K0} */
05469                         "add          $8, %%edx \n\t"   /* second row              |K0 K1 K2 0| */
05470                         "movq    (%%edx), %%mm6 \n\t"   /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
05471                         "add          $8, %%edx \n\t"   /* third row               |K6 K7 K8 0| */
05472                         "movq    (%%edx), %%mm7 \n\t"   /* MM7 = {0,K8,K7,K6} */
05473                         /* --- */
05474                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
05475                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
05476                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
05477                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
05478                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
05479                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
05480                         "sub          $2, %%edx \n\t"   /* do not use first and last row */
05481                         /* --- */
05482                         ".L10360:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMS counter */
05483                         "sub          $2, %%ecx \n\t"   /* do not use first and last column */
05484                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
05485                         ".L10362:               \n\t"
05486                         /* --- */
05487                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the image first row */
05488                         "add       %%eax, %%esi \n\t"   /* move one row below */
05489                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes of the image second row */
05490                         "add       %%eax, %%esi \n\t"   /* move one row below */
05491                         "movq    (%%esi), %%mm3 \n\t"   /* load 8 bytes of the image third row */
05492                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first 4 bytes into words */
05493                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack first 4 bytes into words */
05494                         "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack first 4 bytes into words */
05495                         "psrlw     %%mm4, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
05496                         "psrlw     %%mm4, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
05497                         "psrlw     %%mm4, %%mm3 \n\t"   /* shift right each pixel NshiftRight times */
05498                         "pmullw    %%mm5, %%mm1 \n\t"   /* multiply words first row  image*Kernel */
05499                         "pmullw    %%mm6, %%mm2 \n\t"   /* multiply words second row image*Kernel */
05500                         "pmullw    %%mm7, %%mm3 \n\t"   /* multiply words third row  image*Kernel */
05501                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the first and second rows */
05502                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 4 words of the third row and result */
05503                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05504                         "psrlq       $32, %%mm1 \n\t"   /* shift 2 left words to the right */
05505                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 2 left and 2 right result words */
05506                         "movq      %%mm1, %%mm3 \n\t"   /* copy MM1 into MM3 */
05507                         "psrlq       $16, %%mm1 \n\t"   /* shift 1 left word to the right */
05508                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 1 left and 1 right result words */
05509                         "packuswb  %%mm0, %%mm1 \n\t"   /* pack shift result with saturation */
05510                         "movd      %%mm1, %%ebx \n\t"   /* copy saturated result into EBX */
05511                         "mov      %%bl, (%%edi) \n\t"   /* copy a byte result into Dest */
05512                         /* -- */
05513                         "sub       %%eax, %%esi \n\t"   /* move two rows up */
05514                         "sub       %%eax, %%esi \n\t" "inc              %%esi \n\t"     /* move Src  pointer to the next pixel */
05515                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05516                         /* --- */
05517                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05518                         "jnz            .L10362 \n\t"   /* check loop termination, proceed if required */
05519                         "add          $2, %%esi \n\t"   /* move to the next row in Src */
05520                         "add          $2, %%edi \n\t"   /* move to the next row in Dest */
05521                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
05522                         "jnz            .L10360 \n\t"   /* check loop termination, proceed if required */
05523                         /* --- */
05524                         "emms                   \n\t"   /* exit MMX state */
05525                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05526                         :"m"(Src),              /* %1 */
05527                         "m"(rows),              /* %2 */
05528                         "m"(columns),           /* %3 */
05529                         "m"(Kernel),            /* %4 */
05530                         "m"(NRightShift)        /* %5 */
05531                         );
05532 #endif
05533 #endif
05534                 return (0);
05535         } else {
05536                 /* No non-MMX implementation yet */
05537                 return (-1);
05538         }
05539 }
05540 
/*!
\brief 5x5 convolution of a byte image with a per-pixel right shift (MMX only).

For every pixel that has a full 5x5 neighbourhood, the routine loads 8 bytes
from each of the 5 source rows of the window, shifts every pixel right by
NRightShift bits, multiplies it with the matching 16-bit kernel word (low 16
bits of the product are kept), accumulates everything with signed 16-bit
saturation and finally stores the sum saturated to 0..255 in Dest.

The two outermost rows and columns of Dest are never written.

\param Src Source byte image, rows * columns bytes, read row by row.
\param Dest Destination byte image with the same layout as Src.
\param rows Number of image rows. Must be >= 5.
\param columns Number of image columns (also used as the row stride). Must be >= 5.
\param Kernel Kernel coefficients. The code reads 5 rows of 8 signed 16-bit
       words each (80 bytes in total); words 5..7 of every row are multiplied
       with the three source bytes right of the 5x5 window, so presumably
       callers are expected to keep them zero -- TODO confirm against callers.
\param NRightShift Right shift applied to every source pixel before the
       multiplication. Must be in the range 0..7.

NOTE(review): every source row is loaded 8 bytes wide, so the right-most
windows of the last rows read up to 3 bytes past Src + rows * columns.

\return 0 if the MMX routine ran; -1 if a pointer is NULL, a size or shift is
        out of range, MMX is not detected at runtime, or the MMX code was not
        compiled in (USE_MMX undefined or not an i386 build).
*/
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
		return (-1);

	if (!SDL_imageFilterMMXdetect()) {
		/* No non-MMX implementation yet */
		return (-1);
	}

#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
	__asm
	{
		pusha
		pxor mm0, mm0           /* zero MM0 */
		xor ebx, ebx            /* zero EBX */
		mov bl, NRightShift     /* load NRightShift into BL */
		movd mm5, ebx           /* copy NRightShift into MM5 */
		mov edx, Kernel         /* load Kernel address into EDX */
		mov esi, Src            /* load Src  address to ESI */
		mov edi, Dest           /* load Dest address to EDI */
		add edi, 2              /* 2 column offset from the left edge */
		mov eax, columns        /* load columns into EAX */
		shl eax, 1              /* EAX = columns * 2 */
		add edi, eax            /* 2 row offset from the top edge */
		shr eax, 1              /* EAX = columns */
		mov ebx, rows           /* initialize ROWS counter */
		sub ebx, 4              /* do not use first 2 and last 2 rows */
		/* --- */
L10370:
		mov ecx, eax            /* initialize COLUMNS counter */
		sub ecx, 4              /* do not use first 2 and last 2 columns */
		align 16                /* 16 byte alignment of the loop entry */
L10372:
		pxor mm7, mm7           /* zero MM7 (accumulator) */
		movd mm6, esi           /* save ESI in MM6 */
		/* --- 1 */
		movq mm1, [esi]         /* load 8 bytes of the Src */
		movq mm2, mm1           /* copy MM1 into MM2 */
		add esi, eax            /* move Src pointer 1 row below */
		movq mm3, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		movq mm4, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
		punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
		psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
		psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
		pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2         /* add 4 words of the high and low bytes */
		paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
		/* --- 2 */
		movq mm1, [esi]         /* load 8 bytes of the Src */
		movq mm2, mm1           /* copy MM1 into MM2 */
		add esi, eax            /* move Src pointer 1 row below */
		movq mm3, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		movq mm4, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
		punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
		psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
		psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
		pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2         /* add 4 words of the high and low bytes */
		paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
		/* --- 3 */
		movq mm1, [esi]         /* load 8 bytes of the Src */
		movq mm2, mm1           /* copy MM1 into MM2 */
		add esi, eax            /* move Src pointer 1 row below */
		movq mm3, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		movq mm4, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
		punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
		psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
		psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
		pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2         /* add 4 words of the high and low bytes */
		paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
		/* --- 4 */
		movq mm1, [esi]         /* load 8 bytes of the Src */
		movq mm2, mm1           /* copy MM1 into MM2 */
		add esi, eax            /* move Src pointer 1 row below */
		movq mm3, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		movq mm4, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
		punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
		psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
		psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
		pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2         /* add 4 words of the high and low bytes */
		paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
		/* --- 5 (last row: Src pointer and second kernel half are not advanced) */
		movq mm1, [esi]         /* load 8 bytes of the Src */
		movq mm2, mm1           /* copy MM1 into MM2 */
		movq mm3, [edx]         /* load 4 words of Kernel */
		add edx, 8              /* move pointer to other 4 words */
		movq mm4, [edx]         /* load 4 words of Kernel */
		punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
		punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
		psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
		psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
		pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2         /* add 4 words of the high and low bytes */
		paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
		/* --- horizontal sum of the 4 accumulator words into word 0 */
		movq mm3, mm7           /* copy MM7 into MM3 */
		psrlq mm7, 32           /* shift 2 left words to the right */
		paddsw mm7, mm3         /* add 2 left and 2 right result words */
		movq mm2, mm7           /* copy MM7 into MM2 */
		psrlq mm7, 16           /* shift 1 left word to the right */
		paddsw mm7, mm2         /* add 1 left and 1 right result words */
		movd mm1, eax           /* save EAX in MM1 */
		packuswb mm7, mm0       /* pack division result with saturation */
		movd eax, mm7           /* copy saturated result into EAX */
		mov [edi], al           /* copy a byte result into Dest */
		movd eax, mm1           /* restore saved EAX */
		/* --- */
		movd esi, mm6           /* move Src pointer to the top pixel */
		sub edx, 72             /* EDX = Kernel address (undo the 9 increments of 8 bytes) */
		inc esi                 /* move Src  pointer to the next pixel */
		inc edi                 /* move Dest pointer to the next pixel */
		/* --- */
		dec ecx                 /* decrease loop counter COLUMNS */
		jnz L10372              /* check loop termination, proceed if required */
		add esi, 4              /* move to the next row in Src */
		add edi, 4              /* move to the next row in Dest */
		dec ebx                 /* decrease loop counter ROWS */
		jnz L10370              /* check loop termination, proceed if required */
		/* --- */
		emms                    /* exit MMX state */
		popa
	}
#else
	asm volatile
		("pusha              \n\t"
		"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
		"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
		"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
		"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
		"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
		"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
		"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
		"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
		"mov          %3, %%eax \n\t"	/* load columns into EAX */
		"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
		"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
		"shr          $1, %%eax \n\t"	/* EAX = columns */
		"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
		"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
		/* --- */
		".L10370:               \n\t"
		"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
		"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		".L10372:               \n\t"
		"pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
		"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
		/* --- 1 */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- 2 */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- 3 */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- 4 */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- 5 (last row: Src pointer and second kernel half are not advanced) */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- horizontal sum of the 4 accumulator words into word 0 */
		"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
		"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
		"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
		"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
		"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
		"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
		"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
		"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
		"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
		"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
		"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
		/* -- */
		"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
		"sub         $72, %%edx \n\t"	/* EDX = Kernel address (undo the 9 increments of 8 bytes) */
		"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
		"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
		/* --- */
		"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
		"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
		"add          $4, %%esi \n\t"	/* move to the next row in Src */
		"add          $4, %%edi \n\t"	/* move to the next row in Dest */
		"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
		"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
		/* --- */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:	/* no outputs: the Dest pointer itself is only read */
		:"m"(Dest),		/* %0 */
		"m"(Src),		/* %1 */
		"m"(rows),		/* %2 */
		"m"(columns),		/* %3 */
		"m"(Kernel),		/* %4 */
		"m"(NRightShift)	/* %5 */
		:"memory"	/* the template reads Src/Kernel and writes the Dest pixels */
		);
#endif
	return (0);
#else
	/* MMX was detected at runtime but the MMX routine is not part of this
	   build, so nothing was written to Dest: report failure, not success. */
	return (-1);
#endif
}
05841 
05842 /*!
05843 \brief Filter using ConvolveKernel7x7ShiftRight: 7x7 convolution of a byte image (MMX only).
05844 \note Every source byte is shifted right by NRightShift, multiplied by its kernel word (low 16 bits
05845 kept) and summed with signed 16-bit saturation; the sum is stored as a byte saturated to 0..255.
05846 The code loads 8 source bytes and 8 kernel words per row, so Kernel is read as 7 rows of 8 words.
05847 \param Src Source byte image (rows x columns, one byte per pixel). Must not be NULL.
05848 \param Dest Destination byte image; only the interior is written, the 3-pixel border is not. Must not be NULL.
05849 \param rows Number of rows. Must be >= 7.
05850 \param columns Number of columns, also used as the row stride in bytes. Must be >= 7.
05851 \param Kernel Convolution kernel words, read with a stride of 8 words per kernel row. Must not be NULL.
05852 \param NRightShift Right shift applied to each source pixel before weighting. Must be <= 7.
05853 \return 0 if the MMX routine ran; -1 on invalid parameters, if MMX is not detected, or if the
05854 MMX code was not compiled in (there is no non-MMX implementation).
05855 */
05856 int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
05857                                                                                            signed short *Kernel, unsigned char NRightShift)
05858 {
05859         /* Validate input parameters */
05860         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
05861                 return(-1);
05862 
05863         if ((columns < 7) || (rows < 7) || (NRightShift > 7))
05864                 return (-1);
05865 
05866         if ((SDL_imageFilterMMXdetect())) {
05867 //#ifdef USE_MMX
05868 #if defined(USE_MMX) && defined(i386)
05869 #if !defined(GCC__)
05870                 __asm
05871                 {
05872                         pusha
05873                                 pxor mm0, mm0           /* zero MM0 */
05874                                 xor ebx, ebx    /* zero EBX */
05875                                 mov bl, NRightShift     /* load NRightShift into BL */
05876                                 movd mm5, ebx           /* copy NRightShift into MM5 */
05877                                 mov edx, Kernel         /* load Kernel address into EDX */
05878                                 mov esi, Src    /* load Src  address to ESI */
05879                                 mov edi, Dest           /* load Dest address to EDI */
05880                                 add edi, 3      /* 3 column offset from the left edge */
05881                                 mov eax, columns        /* load columns into EAX */
05882                                 add edi, eax    /* 3 row offset from the top edge */
05883                                 add edi, eax
05884                                 add edi, eax
05885                                 mov ebx, rows           /* initialize ROWS counter */
05886                                 sub ebx, 6      /* do not use first 3 and last 3 rows */
05887                                 /* ---, */
05888 L10380:
05889                         mov ecx, eax    /* initialize COLUMNS counter */
05890                                 sub ecx, 6      /* do not use first 3 and last 3 columns */
05891                                 align 16                        /* 16 byte alignment of the loop entry */
05892 L10382:
05893                         pxor mm7, mm7           /* zero MM7 (accumulator) */
05894                                 movd mm6, esi           /* save ESI in MM6 */
05895                                 /* --- kernel row 1 */
05896                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05897                         movq mm2, mm1           /* copy MM1 into MM2 */
05898                                 add esi, eax    /* move Src pointer 1 row below */
05899                                 movq mm3, [edx]         /* load 4 words of Kernel */
05900                         add edx, 8      /* move pointer to other 4 words */
05901                                 movq mm4, [edx]         /* load 4 words of Kernel */
05902                         add edx, 8      /* move pointer to other 4 words */
05903                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05904                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05905                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
05906                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
05907                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
05908                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
05909                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05910                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05911                                 /* --- kernel row 2 */
05912                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05913                         movq mm2, mm1           /* copy MM1 into MM2 */
05914                                 add esi, eax    /* move Src pointer 1 row below */
05915                                 movq mm3, [edx]         /* load 4 words of Kernel */
05916                         add edx, 8      /* move pointer to other 4 words */
05917                                 movq mm4, [edx]         /* load 4 words of Kernel */
05918                         add edx, 8      /* move pointer to other 4 words */
05919                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05920                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05921                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
05922                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
05923                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
05924                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
05925                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05926                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05927                                 /* --- kernel row 3 */
05928                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05929                         movq mm2, mm1           /* copy MM1 into MM2 */
05930                                 add esi, eax    /* move Src pointer 1 row below */
05931                                 movq mm3, [edx]         /* load 4 words of Kernel */
05932                         add edx, 8      /* move pointer to other 4 words */
05933                                 movq mm4, [edx]         /* load 4 words of Kernel */
05934                         add edx, 8      /* move pointer to other 4 words */
05935                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05936                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05937                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
05938                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
05939                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
05940                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
05941                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05942                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05943                                 /* --- kernel row 4 */
05944                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05945                         movq mm2, mm1           /* copy MM1 into MM2 */
05946                                 add esi, eax    /* move Src pointer 1 row below */
05947                                 movq mm3, [edx]         /* load 4 words of Kernel */
05948                         add edx, 8      /* move pointer to other 4 words */
05949                                 movq mm4, [edx]         /* load 4 words of Kernel */
05950                         add edx, 8      /* move pointer to other 4 words */
05951                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05952                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05953                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
05954                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
05955                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
05956                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
05957                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05958                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05959                                 /* --- kernel row 5 */
05960                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05961                         movq mm2, mm1           /* copy MM1 into MM2 */
05962                                 add esi, eax    /* move Src pointer 1 row below */
05963                                 movq mm3, [edx]         /* load 4 words of Kernel */
05964                         add edx, 8      /* move pointer to other 4 words */
05965                                 movq mm4, [edx]         /* load 4 words of Kernel */
05966                         add edx, 8      /* move pointer to other 4 words */
05967                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05968                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05969                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
05970                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
05971                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
05972                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
05973                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05974                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05975                                 /* --- kernel row 6 */
05976                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05977                         movq mm2, mm1           /* copy MM1 into MM2 */
05978                                 add esi, eax    /* move Src pointer 1 row below */
05979                                 movq mm3, [edx]         /* load 4 words of Kernel */
05980                         add edx, 8      /* move pointer to other 4 words */
05981                                 movq mm4, [edx]         /* load 4 words of Kernel */
05982                         add edx, 8      /* move pointer to other 4 words */
05983                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05984                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05985                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
05986                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
05987                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
05988                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
05989                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05990                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05991                                 /* --- kernel row 7 (last: ESI is not advanced, EDX only once) */
05992                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05993                         movq mm2, mm1           /* copy MM1 into MM2 */
05994                                 movq mm3, [edx]         /* load 4 words of Kernel */
05995                         add edx, 8      /* move pointer to other 4 words */
05996                                 movq mm4, [edx]         /* load 4 words of Kernel */
05997                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05998                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05999                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06000                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06001                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06002                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06003                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06004                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06005                                 /* --- horizontal sum of the 4 accumulator words */
06006                                 movq mm3, mm7           /* copy MM7 into MM3 */
06007                                 psrlq mm7, 32           /* shift 2 left words to the right */
06008                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
06009                                 movq mm2, mm7           /* copy MM7 into MM2 */
06010                                 psrlq mm7, 16           /* shift 1 left word to the right */
06011                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
06012                                 movd mm1, eax           /* save EAX in MM1 */
06013                                 packuswb mm7, mm0       /* pack sum into bytes with unsigned saturation (no division here) */
06014                                 movd eax, mm7           /* copy saturated result into EAX */
06015                                 mov [edi], al           /* copy a byte result into Dest */
06016                                 movd eax, mm1           /* restore saved EAX */
06017                                 /* --, */
06018                                 movd esi, mm6           /* move Src pointer to the top pixel */
06019                                 sub edx, 104    /* rewind EDX to the Kernel address (6*16 + 8 bytes were added) */
06020                                 inc              esi            /* move Src  pointer to the next pixel */
06021                                 inc              edi            /* move Dest pointer to the next pixel */
06022                                 /* ---, */
06023                                 dec              ecx            /* decrease loop counter COLUMNS */
06024                                 jnz            L10382           /* check loop termination, proceed if required */
06025                                 add esi, 6      /* skip the 6 border columns: move to the next row in Src */
06026                                 add edi, 6      /* skip the 6 border columns: move to the next row in Dest */
06027                                 dec              ebx            /* decrease loop counter ROWS */
06028                                 jnz            L10380           /* check loop termination, proceed if required */
06029                                 /* ---, */
06030                                 emms                            /* exit MMX state */
06031                                 popa
06032                 }
06033 #else
06034                 asm volatile
06035                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06036                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
06037                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
06038                         "movd      %%ebx, %%mm5 \n\t"   /* copy NRightShift into MM5 */
06039                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
06040                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
06041                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06042                         "add          $3, %%edi \n\t"   /* 3 column offset from the left edge */
06043                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06044                         "add       %%eax, %%edi \n\t"   /* 3 row offset from the top edge */
06045                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"       /* initialize ROWS counter */
06046                         "sub          $6, %%ebx \n\t"   /* do not use first 3 and last 3 rows */
06047                         /* --- */
06048                         ".L10380:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
06049                         "sub          $6, %%ecx \n\t"   /* do not use first 3 and last 3 columns */
06050                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06051                         ".L10382:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
06052                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
06053                         /* --- kernel row 1 */
06054                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06055                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06056                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06057                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06058                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06059                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06060                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06061                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06062                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06063                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06064                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06065                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06066                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06067                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06068                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06069                         /* --- kernel row 2 */
06070                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06071                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06072                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06073                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06074                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06075                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06076                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06077                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06078                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06079                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06080                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06081                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06082                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06083                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06084                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06085                         /* --- kernel row 3 */
06086                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06087                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06088                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06089                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06090                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06091                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06092                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06093                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06094                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06095                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06096                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06097                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06098                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06099                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06100                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06101                         /* --- kernel row 4 */
06102                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06103                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06104                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06105                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06106                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06107                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06108                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06109                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06110                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06111                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06112                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06113                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06114                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06115                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06116                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06117                         /* --- kernel row 5 */
06118                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06119                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06120                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06121                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06122                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06123                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06124                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06125                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06126                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06127                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06128                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06129                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06130                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06131                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06132                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06133                         /* --- kernel row 6 */
06134                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06135                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06136                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
06137                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06138                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06139                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06140                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06141                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06142                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06143                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06144                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06145                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06146                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06147                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06148                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06149                         /* --- kernel row 7 (last: ESI is not advanced, EDX only once) */
06150                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06151                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06152                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06153                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06154                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06155                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06156                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06157                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06158                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06159                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06160                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06161                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06162                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06163                         /* --- horizontal sum of the 4 accumulator words */
06164                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06165                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06166                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
06167                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
06168                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
06169                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
06170                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
06171                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack sum into bytes with unsigned saturation (no division here) */
06172                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
06173                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
06174                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
06175                         /* -- */
06176                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
06177                         "sub        $104, %%edx \n\t"   /* rewind EDX to the Kernel address (6*16 + 8 bytes were added) */
06178                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
06179                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
06180                         /* --- */
06181                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
06182                         "jnz            .L10382 \n\t"   /* check loop termination, proceed if required */
06183                         "add          $6, %%esi \n\t"   /* skip the 6 border columns: move to the next row in Src */
06184                         "add          $6, %%edi \n\t"   /* skip the 6 border columns: move to the next row in Dest */
06185                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
06186                         "jnz            .L10380 \n\t"   /* check loop termination, proceed if required */
06187                         /* --- */
06188                         "emms                   \n\t"   /* exit MMX state */
06189                         "popa                   \n\t":"=m" (Dest)       /* %0 - NOTE(review): only read by the asm although declared as an output */
06190                         :"m"(Src),              /* %1 */
06191                         "m"(rows),              /* %2 */
06192                         "m"(columns),           /* %3 */
06193                         "m"(Kernel),            /* %4 */
06194                         "m"(NRightShift)        /* %5 */
06195                         );
06196 #endif
06197                 return (0);     /* only reached when the MMX routine above was compiled in and has run */
06198 #endif
06199         }
06200 
06201         /* MMX not detected, or MMX code not compiled in: no non-MMX implementation yet */
06202         return (-1);
06203 }
06204 
06219 int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
06220                                                                                            signed short *Kernel, unsigned char NRightShift)
06221 {
06222         /* Validate input parameters */
06223         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
06224                 return(-1);
06225 
06226         if ((columns < 9) || (rows < 9) || (NRightShift > 7))
06227                 return (-1);
06228 
06229         if ((SDL_imageFilterMMXdetect())) {
06230 //#ifdef USE_MMX
06231 #if defined(USE_MMX) && defined(i386)
06232 #if !defined(GCC__)
06233                 __asm
06234                 {
06235                         pusha
06236                                 pxor mm0, mm0           /* zero MM0 */
06237                                 xor ebx, ebx    /* zero EBX */
06238                                 mov bl, NRightShift     /* load NRightShift into BL */
06239                                 movd mm5, ebx           /* copy NRightShift into MM5 */
06240                                 mov edx, Kernel         /* load Kernel address into EDX */
06241                                 mov esi, Src    /* load Src  address to ESI */
06242                                 mov edi, Dest           /* load Dest address to EDI */
06243                                 add edi, 4      /* 4 column offset from the left edge */
06244                                 mov eax, columns        /* load columns into EAX */
06245                                 add edi, eax    /* 4 row offset from the top edge */
06246                                 add edi, eax
06247                                 add edi, eax
06248                                 add edi, eax
06249                                 mov ebx, rows           /* initialize ROWS counter */
06250                                 sub ebx, 8      /* do not use first 4 and last 4 rows */
06251                                 /* ---, */
06252 L10390:
06253                         mov ecx, eax    /* initialize COLUMNS counter */
06254                                 sub ecx, 8      /* do not use first 4 and last 4 columns */
06255                                 align 16                        /* 16 byte alignment of the loop entry */
06256 L10392:
06257                         pxor mm7, mm7           /* zero MM7 (accumulator) */
06258                                 movd mm6, esi           /* save ESI in MM6 */
06259                                 /* --- 1 */
06260                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06261                         movq mm2, mm1           /* copy MM1 into MM2 */
06262                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06263                                 movq mm3, [edx]         /* load 4 words of Kernel */
06264                         add edx, 8      /* move pointer to other 4 words */
06265                                 movq mm4, [edx]         /* load 4 words of Kernel */
06266                         add edx, 8      /* move pointer to other 4 words */
06267                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06268                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06269                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06270                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06271                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06272                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06273                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06274                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06275                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06276                         dec              esi
06277                                 add esi, eax    /* move Src pointer 1 row below */
06278                                 movq mm3, [edx]         /* load 4 words of Kernel */
06279                         add edx, 8      /* move pointer to other 4 words */
06280                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06281                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06282                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06283                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06284                                 /* --- 2 */
06285                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06286                         movq mm2, mm1           /* copy MM1 into MM2 */
06287                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06288                                 movq mm3, [edx]         /* load 4 words of Kernel */
06289                         add edx, 8      /* move pointer to other 4 words */
06290                                 movq mm4, [edx]         /* load 4 words of Kernel */
06291                         add edx, 8      /* move pointer to other 4 words */
06292                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06293                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06294                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06295                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06296                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06297                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06298                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06299                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06300                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06301                         dec              esi
06302                                 add esi, eax    /* move Src pointer 1 row below */
06303                                 movq mm3, [edx]         /* load 4 words of Kernel */
06304                         add edx, 8      /* move pointer to other 4 words */
06305                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06306                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06307                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06308                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06309                                 /* --- 3 */
06310                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06311                         movq mm2, mm1           /* copy MM1 into MM2 */
06312                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06313                                 movq mm3, [edx]         /* load 4 words of Kernel */
06314                         add edx, 8      /* move pointer to other 4 words */
06315                                 movq mm4, [edx]         /* load 4 words of Kernel */
06316                         add edx, 8      /* move pointer to other 4 words */
06317                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06318                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06319                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06320                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06321                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06322                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06323                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06324                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06325                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06326                         dec              esi
06327                                 add esi, eax    /* move Src pointer 1 row below */
06328                                 movq mm3, [edx]         /* load 4 words of Kernel */
06329                         add edx, 8      /* move pointer to other 4 words */
06330                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06331                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06332                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06333                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06334                                 /* --- 4 */
06335                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06336                         movq mm2, mm1           /* copy MM1 into MM2 */
06337                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06338                                 movq mm3, [edx]         /* load 4 words of Kernel */
06339                         add edx, 8      /* move pointer to other 4 words */
06340                                 movq mm4, [edx]         /* load 4 words of Kernel */
06341                         add edx, 8      /* move pointer to other 4 words */
06342                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06343                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06344                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06345                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06346                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06347                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06348                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06349                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06350                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06351                         dec              esi
06352                                 add esi, eax    /* move Src pointer 1 row below */
06353                                 movq mm3, [edx]         /* load 4 words of Kernel */
06354                         add edx, 8      /* move pointer to other 4 words */
06355                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06356                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06357                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06358                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06359                                 /* --- 5 */
06360                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06361                         movq mm2, mm1           /* copy MM1 into MM2 */
06362                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06363                                 movq mm3, [edx]         /* load 4 words of Kernel */
06364                         add edx, 8      /* move pointer to other 4 words */
06365                                 movq mm4, [edx]         /* load 4 words of Kernel */
06366                         add edx, 8      /* move pointer to other 4 words */
06367                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06368                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06369                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06370                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06371                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06372                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06373                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06374                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06375                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06376                         dec              esi
06377                                 add esi, eax    /* move Src pointer 1 row below */
06378                                 movq mm3, [edx]         /* load 4 words of Kernel */
06379                         add edx, 8      /* move pointer to other 4 words */
06380                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06381                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06382                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06383                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06384                                 /* --- 6 */
06385                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06386                         movq mm2, mm1           /* copy MM1 into MM2 */
06387                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06388                                 movq mm3, [edx]         /* load 4 words of Kernel */
06389                         add edx, 8      /* move pointer to other 4 words */
06390                                 movq mm4, [edx]         /* load 4 words of Kernel */
06391                         add edx, 8      /* move pointer to other 4 words */
06392                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06393                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06394                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06395                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06396                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06397                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06398                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06399                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06400                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06401                         dec              esi
06402                                 add esi, eax    /* move Src pointer 1 row below */
06403                                 movq mm3, [edx]         /* load 4 words of Kernel */
06404                         add edx, 8      /* move pointer to other 4 words */
06405                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06406                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06407                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06408                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06409                                 /* --- 7 */
06410                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06411                         movq mm2, mm1           /* copy MM1 into MM2 */
06412                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06413                                 movq mm3, [edx]         /* load 4 words of Kernel */
06414                         add edx, 8      /* move pointer to other 4 words */
06415                                 movq mm4, [edx]         /* load 4 words of Kernel */
06416                         add edx, 8      /* move pointer to other 4 words */
06417                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06418                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06419                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06420                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06421                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06422                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06423                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06424                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06425                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06426                         dec              esi
06427                                 add esi, eax    /* move Src pointer 1 row below */
06428                                 movq mm3, [edx]         /* load 4 words of Kernel */
06429                         add edx, 8      /* move pointer to other 4 words */
06430                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06431                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06432                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06433                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06434                                 /* --- 8 */
06435                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06436                         movq mm2, mm1           /* copy MM1 into MM2 */
06437                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06438                                 movq mm3, [edx]         /* load 4 words of Kernel */
06439                         add edx, 8      /* move pointer to other 4 words */
06440                                 movq mm4, [edx]         /* load 4 words of Kernel */
06441                         add edx, 8      /* move pointer to other 4 words */
06442                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06443                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06444                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06445                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06446                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06447                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06448                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06449                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06450                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06451                         dec              esi
06452                                 add esi, eax    /* move Src pointer 1 row below */
06453                                 movq mm3, [edx]         /* load 4 words of Kernel */
06454                         add edx, 8      /* move pointer to other 4 words */
06455                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06456                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06457                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06458                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06459                                 /* --- 9 */
06460                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06461                         movq mm2, mm1           /* copy MM1 into MM2 */
06462                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06463                                 movq mm3, [edx]         /* load 4 words of Kernel */
06464                         add edx, 8      /* move pointer to other 4 words */
06465                                 movq mm4, [edx]         /* load 4 words of Kernel */
06466                         add edx, 8      /* move pointer to other 4 words */
06467                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06468                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06469                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06470                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06471                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06472                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06473                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06474                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06475                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06476                         movq mm3, [edx]         /* load 4 words of Kernel */
06477                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06478                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06479                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06480                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06481                                 /* ---, */
06482                                 movq mm3, mm7           /* copy MM7 into MM3 */
06483                                 psrlq mm7, 32           /* shift 2 left words to the right */
06484                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
06485                                 movq mm2, mm7           /* copy MM7 into MM2 */
06486                                 psrlq mm7, 16           /* shift 1 left word to the right */
06487                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
06488                                 movd mm1, eax           /* save EAX in MM1 */
06489                                 packuswb mm7, mm0       /* pack division result with saturation */
06490                                 movd eax, mm7           /* copy saturated result into EAX */
06491                                 mov [edi], al           /* copy a byte result into Dest */
06492                                 movd eax, mm1           /* restore saved EAX */
06493                                 /* --, */
06494                                 movd esi, mm6           /* move Src pointer to the top pixel */
06495                                 sub edx, 208    /* EDX = Kernel address */
06496                                 inc              esi            /* move Src  pointer to the next pixel */
06497                                 inc              edi            /* move Dest pointer to the next pixel */
06498                                 /* ---, */
06499                                 dec              ecx            /* decrease loop counter COLUMNS */
06500                                 jnz            L10392           /* check loop termination, proceed if required */
06501                                 add esi, 8      /* move to the next row in Src */
06502                                 add edi, 8      /* move to the next row in Dest */
06503                                 dec              ebx            /* decrease loop counter ROWS */
06504                                 jnz            L10390           /* check loop termination, proceed if required */
06505                                 /* ---, */
06506                                 emms                            /* exit MMX state */
06507                                 popa
06508                 }
06509 #else
06510                 asm volatile
06511                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06512                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
06513                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
06514                         "movd      %%ebx, %%mm5 \n\t"   /* copy NRightShift into MM5 */
06515                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
06516                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
06517                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06518                         "add          $4, %%edi \n\t"   /* 4 column offset from the left edge */
06519                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06520                         "add       %%eax, %%edi \n\t"   /* 4 row offset from the top edge */
06521                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t" /* initialize ROWS counter */
06522                         "sub          $8, %%ebx \n\t"   /* do not use first 4 and last 4 rows */
06523                         /* --- */
06524                         ".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
06525                         "sub          $8, %%ecx \n\t"   /* do not use first 4 and last 4 columns */
06526                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06527                         ".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
06528                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
06529                         /* --- 1 */
06530                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06531                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06532                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06533                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06534                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06535                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06536                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06537                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06538                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06539                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06540                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06541                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06542                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06543                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06544                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06545                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06546                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06547                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06548                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06549                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06550                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06551                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06552                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06553                         /* --- 2 */
06554                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06555                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06556                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06557                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06558                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06559                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06560                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06561                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06562                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06563                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06564                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06565                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06566                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06567                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06568                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06569                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06570                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06571                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06572                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06573                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06574                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06575                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06576                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06577                         /* --- 3 */
06578                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06579                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06580                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06581                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06582                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06583                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06584                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06585                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06586                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06587                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06588                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06589                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06590                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06591                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06592                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06593                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06594                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06595                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06596                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06597                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06598                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06599                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06600                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06601                         /* --- 4 */
06602                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06603                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06604                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06605                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06606                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06607                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06608                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06609                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06610                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06611                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06612                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06613                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06614                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06615                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06616                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06617                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06618                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06619                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06620                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06621                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06622                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06623                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06624                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06625                         /* --- 5 */
06626                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06627                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06628                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06629                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06630                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06631                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06632                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06633                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06634                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06635                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06636                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06637                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06638                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06639                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06640                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06641                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06642                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06643                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06644                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06645                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06646                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06647                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06648                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06649                         /* --- 6 */
06650                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06651                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06652                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06653                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06654                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06655                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06656                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06657                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06658                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06659                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06660                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06661                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06662                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06663                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06664                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06665                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06666                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06667                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06668                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06669                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06670                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06671                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06672                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06673                         /* --- 7 */
06674                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06675                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06676                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06677                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06678                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06679                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06680                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06681                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06682                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06683                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06684                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06685                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06686                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06687                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06688                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06689                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06690                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06691                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06692                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06693                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06694                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06695                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06696                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06697                         /* --- 8 */
06698                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06699                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06700                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06701                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06702                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06703                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06704                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06705                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06706                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06707                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06708                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06709                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06710                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06711                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06712                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06713                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06714                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06715                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06716                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06717                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06718                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06719                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06720                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06721                         /* --- 9 */
06722                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06723                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06724                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06725                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06726                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06727                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06728                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06729                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06730                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06731                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06732                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06733                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06734                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06735                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06736                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06737                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06738                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06739                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06740                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06741                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06742                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06743                         /* --- */
06744                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06745                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06746                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
06747                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
06748                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
06749                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
06750                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
06751                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
06752                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
06753                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
06754                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
06755                         /* -- */
06756                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
06757                         "sub        $208, %%edx \n\t"   /* EDX = Kernel address */
06758                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
06759                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
06760                         /* --- */
06761                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
06762                         "jnz            .L10392 \n\t"   /* check loop termination, proceed if required */
06763                         "add          $8, %%esi \n\t"   /* move to the next row in Src */
06764                         "add          $8, %%edi \n\t"   /* move to the next row in Dest */
06765                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
06766                         "jnz            .L10390 \n\t"   /* check loop termination, proceed if required */
06767                         /* --- */
06768                         "emms                   \n\t"   /* exit MMX state */
06769                         "popa                   \n\t":"=m" (Dest)       /* %0 */
06770                         :"m"(Src),              /* %1 */
06771                         "m"(rows),              /* %2 */
06772                         "m"(columns),           /* %3 */
06773                         "m"(Kernel),            /* %4 */
06774                         "m"(NRightShift)        /* %5 */
06775                         );
06776 #endif
06777 #endif
06778                 return (0);
06779         } else {
06780                 /* No non-MMX implementation yet */
06781                 return (-1);
06782         }
06783 }
06784 
06785 /* ------------------------------------------------------------------------------------ */
06786 
06799 int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
06800 {
06801         /* Validate input parameters */
06802         if ((Src == NULL) || (Dest == NULL))
06803                 return(-1);
06804 
06805         if ((columns < 8) || (rows < 3))
06806                 return (-1);
06807 
06808         if ((SDL_imageFilterMMXdetect())) {
06809 //#ifdef USE_MMX
06810 #if defined(USE_MMX) && defined(i386)
06811 #if !defined(GCC__)
06812                 __asm
06813                 {
06814                         pusha
06815                                 pxor mm0, mm0           /* zero MM0 */
06816                                 mov eax, columns        /* load columns into EAX */
06817                                 /* ---, */
06818                                 mov esi, Src    /* ESI = Src row 0 address */
06819                                 mov edi, Dest           /* load Dest address to EDI */
06820                                 add edi, eax    /* EDI = EDI + columns */
06821                                 inc              edi            /* 1 byte offset from the left edge */
06822                                 mov edx, rows           /* initialize ROWS counter */
06823                                 sub edx, 2      /* do not use first and last rows */
06824                                 /* ---, */
06825 L10400:
06826                         mov ecx, eax    /* initialize COLUMS counter */
06827                                 shr ecx, 3      /* EBX/8 (MMX loads 8 bytes at a time) */
06828                                 mov ebx, esi    /* save ESI in EBX */
06829                                 movd mm1, edi           /* save EDI in MM1 */
06830                                 align 16                        /* 16 byte alignment of the loop entry */
06831 L10402:
06832                         /* ---, */
06833                         movq mm4, [esi]         /* load 8 bytes from Src */
06834                         movq mm5, mm4           /* save MM4 in MM5 */
06835                                 add esi, 2      /* move ESI pointer 2 bytes right */
06836                                 punpcklbw mm4, mm0      /* unpack 4 low  bytes into words */
06837                                 punpckhbw mm5, mm0      /* unpack 4 high bytes into words */
06838                                 movq mm6, [esi]         /* load 8 bytes from Src */
06839                         movq mm7, mm6           /* save MM6 in MM7 */
06840                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
06841                                 punpcklbw mm6, mm0      /* unpack 4 low  bytes into words */
06842                                 punpckhbw mm7, mm0      /* unpack 4 high bytes into words */
06843                                 add esi, eax    /* move to the next row of Src */
06844                                 movq mm2, [esi]         /* load 8 bytes from Src */
06845                         movq mm3, mm2           /* save MM2 in MM3 */
06846                                 add esi, 2      /* move ESI pointer 2 bytes right */
06847                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06848                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06849                                 paddw mm4, mm2          /* add 4 low  bytes to accumolator MM4 */
06850                                 paddw mm5, mm3          /* add 4 high bytes to accumolator MM5 */
06851                                 paddw mm4, mm2          /* add 4 low  bytes to accumolator MM4 */
06852                                 paddw mm5, mm3          /* add 4 high bytes to accumolator MM5 */
06853                                 movq mm2, [esi]         /* load 8 bytes from Src */
06854                         movq mm3, mm2           /* save MM2 in MM3 */
06855                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
06856                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06857                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06858                                 paddw mm6, mm2          /* add 4 low  bytes to accumolator MM6 */
06859                                 paddw mm7, mm3          /* add 4 high bytes to accumolator MM7 */
06860                                 paddw mm6, mm2          /* add 4 low  bytes to accumolator MM6 */
06861                                 paddw mm7, mm3          /* add 4 high bytes to accumolator MM7 */
06862                                 add esi, eax    /* move to the next row of Src */
06863                                 movq mm2, [esi]         /* load 8 bytes from Src */
06864                         movq mm3, mm2           /* save MM2 in MM3 */
06865                                 add esi, 2      /* move ESI pointer 2 bytes right */
06866                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06867                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06868                                 paddw mm4, mm2          /* add 4 low  bytes to accumolator MM4 */
06869                                 paddw mm5, mm3          /* add 4 high bytes to accumolator MM5 */
06870                                 movq mm2, [esi]         /* load 8 bytes from Src */
06871                         movq mm3, mm2           /* save MM2 in MM3 */
06872                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
06873                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06874                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06875                                 paddw mm6, mm2          /* add 4 low  bytes to accumolator MM6 */
06876                                 paddw mm7, mm3          /* add 4 high bytes to accumolator MM7 */
06877                                 /* ---, */
06878                                 movq mm2, mm4           /* copy MM4 into MM2 */
06879                                 psrlq mm4, 32           /* shift 2 left words to the right */
06880                                 psubw mm4, mm2          /* MM4 = MM4 - MM2 */
06881                                 movq mm3, mm6           /* copy MM6 into MM3 */
06882                                 psrlq mm6, 32           /* shift 2 left words to the right */
06883                                 psubw mm6, mm3          /* MM6 = MM6 - MM3 */
06884                                 punpckldq mm4, mm6      /* combine 2 words of MM6 and 2 words of MM4 */
06885                                 movq mm2, mm5           /* copy MM6 into MM2 */
06886                                 psrlq mm5, 32           /* shift 2 left words to the right */
06887                                 psubw mm5, mm2          /* MM5 = MM5 - MM2 */
06888                                 movq mm3, mm7           /* copy MM7 into MM3 */
06889                                 psrlq mm7, 32           /* shift 2 left words to the right */
06890                                 psubw mm7, mm3          /* MM7 = MM7 - MM3 */
06891                                 punpckldq mm5, mm7      /* combine 2 words of MM7 and 2 words of MM5 */
06892                                 /* Take abs values of MM4 and MM5 */
06893                                 movq mm6, mm4           /* copy MM4 into MM6 */
06894                                 movq mm7, mm5           /* copy MM5 into MM7 */
06895                                 psraw mm6, 15           /* fill MM6 words with word sign bit */
06896                                 psraw mm7, 15           /* fill MM7 words with word sign bit */
06897                                 pxor mm4, mm6           /* take 1's compliment of only neg words */
06898                                 pxor mm5, mm7           /* take 1's compliment of only neg words */
06899                                 psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
06900                                 psubsw mm5, mm7         /* add 1 to only neg words, W-(-1) or W-0 */
06901                                 packuswb mm4, mm5       /* combine and pack/saturate MM5 and MM4 */
06902                                 movq [edi], mm4         /* store result in Dest */
06903                                 /* ---, */
06904                                 sub esi, eax    /* move to the current top row in Src */
06905                                 sub esi, eax
06906                                 add esi, 8      /* move Src  pointer to the next 8 pixels */
06907                                 add edi, 8      /* move Dest pointer to the next 8 pixels */
06908                                 /* ---, */
06909                                 dec              ecx            /* decrease loop counter COLUMNS */
06910                                 jnz            L10402           /* check loop termination, proceed if required */
06911                                 mov esi, ebx    /* restore most left current row Src  address */
06912                                 movd edi, mm1           /* restore most left current row Dest address */
06913                                 add esi, eax    /* move to the next row in Src */
06914                                 add edi, eax    /* move to the next row in Dest */
06915                                 dec              edx            /* decrease loop counter ROWS */
06916                                 jnz            L10400           /* check loop termination, proceed if required */
06917                                 /* ---, */
06918                                 emms                            /* exit MMX state */
06919                                 popa
06920                 }
06921 #else
06922                 asm volatile
06923                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06924                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06925                         /* --- */
06926                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
06927                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06928                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
06929                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
06930                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
06931                         "sub          $2, %%edx \n\t"   /* do not use first and last rows */
06932                         /* --- */
06933                         ".L10400:                \n\t" "mov       %%eax, %%ecx \n\t"    /* initialize COLUMS counter */
06934                         "shr          $3, %%ecx \n\t"   /* EBX/8 (MMX loads 8 bytes at a time) */
06935                         "mov       %%esi, %%ebx \n\t"   /* save ESI in EBX */
06936                         "movd      %%edi, %%mm1 \n\t"   /* save EDI in MM1 */
06937                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06938                         ".L10402:               \n\t"
06939                         /* --- */
06940                         "movq    (%%esi), %%mm4 \n\t"   /* load 8 bytes from Src */
06941                         "movq      %%mm4, %%mm5 \n\t"   /* save MM4 in MM5 */
06942                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
06943                         "punpcklbw %%mm0, %%mm4 \n\t"   /* unpack 4 low  bytes into words */
06944                         "punpckhbw %%mm0, %%mm5 \n\t"   /* unpack 4 high bytes into words */
06945                         "movq    (%%esi), %%mm6 \n\t"   /* load 8 bytes from Src */
06946                         "movq      %%mm6, %%mm7 \n\t"   /* save MM6 in MM7 */
06947                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
06948                         "punpcklbw %%mm0, %%mm6 \n\t"   /* unpack 4 low  bytes into words */
06949                         "punpckhbw %%mm0, %%mm7 \n\t"   /* unpack 4 high bytes into words */
06950                         "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
06951                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06952                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06953                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
06954                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06955                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06956                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
06957                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
06958                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
06959                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
06960                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06961                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06962                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
06963                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06964                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06965                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
06966                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
06967                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
06968                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
06969                         "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
06970                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06971                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06972                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
06973                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06974                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06975                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
06976                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
06977                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06978                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06979                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
06980                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06981                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06982                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
06983                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
06984                         /* --- */
06985                         "movq      %%mm4, %%mm2 \n\t"   /* copy MM4 into MM2 */
06986                         "psrlq       $32, %%mm4 \n\t"   /* shift 2 left words to the right */
06987                         "psubw     %%mm2, %%mm4 \n\t"   /* MM4 = MM4 - MM2 */
06988                         "movq      %%mm6, %%mm3 \n\t"   /* copy MM6 into MM3 */
06989                         "psrlq       $32, %%mm6 \n\t"   /* shift 2 left words to the right */
06990                         "psubw     %%mm3, %%mm6 \n\t"   /* MM6 = MM6 - MM3 */
06991                         "punpckldq %%mm6, %%mm4 \n\t"   /* combine 2 words of MM6 and 2 words of MM4 */
06992                         "movq      %%mm5, %%mm2 \n\t"   /* copy MM6 into MM2 */
06993                         "psrlq       $32, %%mm5 \n\t"   /* shift 2 left words to the right */
06994                         "psubw     %%mm2, %%mm5 \n\t"   /* MM5 = MM5 - MM2 */
06995                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06996                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06997                         "psubw     %%mm3, %%mm7 \n\t"   /* MM7 = MM7 - MM3 */
06998                         "punpckldq %%mm7, %%mm5 \n\t"   /* combine 2 words of MM7 and 2 words of MM5 */
06999                         /* Take abs values of MM4 and MM5 */
07000                         "movq      %%mm4, %%mm6 \n\t"   /* copy MM4 into MM6 */
07001                         "movq      %%mm5, %%mm7 \n\t"   /* copy MM5 into MM7 */
07002                         "psraw       $15, %%mm6 \n\t"   /* fill MM6 words with word sign bit */
07003                         "psraw       $15, %%mm7 \n\t"   /* fill MM7 words with word sign bit */
07004                         "pxor      %%mm6, %%mm4 \n\t"   /* take 1's compliment of only neg. words */
07005                         "pxor      %%mm7, %%mm5 \n\t"   /* take 1's compliment of only neg. words */
07006                         "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
07007                         "psubsw    %%mm7, %%mm5 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
07008                         "packuswb  %%mm5, %%mm4 \n\t"   /* combine and pack/saturate MM5 and MM4 */
07009                         "movq    %%mm4, (%%edi) \n\t"   /* store result in Dest */
07010                         /* --- */
07011                         "sub       %%eax, %%esi \n\t"   /* move to the current top row in Src */
07012                         "sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"     /* move Src  pointer to the next 8 pixels */
07013                         "add $8,          %%edi \n\t"   /* move Dest pointer to the next 8 pixels */
07014                         /* --- */
07015                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
07016                         "jnz            .L10402 \n\t"   /* check loop termination, proceed if required */
07017                         "mov       %%ebx, %%esi \n\t"   /* restore most left current row Src  address */
07018                         "movd      %%mm1, %%edi \n\t"   /* restore most left current row Dest address */
07019                         "add       %%eax, %%esi \n\t"   /* move to the next row in Src */
07020                         "add       %%eax, %%edi \n\t"   /* move to the next row in Dest */
07021                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
07022                         "jnz            .L10400 \n\t"   /* check loop termination, proceed if required */
07023                         /* --- */
07024                         "emms                   \n\t"   /* exit MMX state */
07025                         "popa                   \n\t":"=m" (Dest)       /* %0 */
07026                         :"m"(Src),              /* %1 */
07027                         "m"(rows),              /* %2 */
07028                         "m"(columns)            /* %3 */
07029                         );
07030 #endif
07031 #endif
07032                 return (0);
07033         } else {
07034                 /* No non-MMX implementation yet */
07035                 return (-1);
07036         }
07037 }
07038 
/*!
\brief Filter using SobelXShiftRight: Dest = saturate255(abs(horizontal Sobel gradient of (Src >> NRightShift))).

Every source byte is shifted right by NRightShift before it is accumulated.
For each processed pixel position x the kernel builds the vertical sums

    c(x) = top(x) + 2 * middle(x) + bottom(x)

over three consecutive source rows and stores abs(c(x + 2) - c(x)), packed to a
byte with unsigned saturation. Output starts at Dest + columns + 1 (second row,
one byte in from the left edge). rows - 2 rows are processed, and each row is
handled in (columns / 8) groups of 8 bytes.

The kernel only exists as 32-bit x86 MMX inline assembly (compiled when both
USE_MMX and i386 are defined). There is no portable C implementation.

NOTE(review): each group also loads 8 bytes starting 2 bytes to the right of the
group, so the last group of a row reads 2 bytes beyond that row; for the bottom
row of the last processed triple this looks like a read of 2 bytes past
rows * columns -- confirm that callers pad Src accordingly.

\param Src Pointer to the source image bytes (row pitch == columns).
\param Dest Pointer to the destination image bytes (row pitch == columns).
\param rows Number of rows; must be at least 3.
\param columns Number of columns; must be at least 8.
\param NRightShift Right shift applied to every source byte; must be in 0..7.

\return 0 if the MMX kernel ran; -1 on invalid parameters, if MMX is not
        available at run time, or if the MMX kernel was not compiled into this build.
*/
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
									unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0		/* zero MM0 (used as the high half when unpacking bytes to words) */
			mov eax, columns	/* load columns into EAX */
			xor ebx, ebx		/* zero EBX */
			mov bl, NRightShift	/* load NRightShift into BL */
			movd mm1, ebx		/* copy NRightShift into MM1 (shift count for psrlw) */
			/* --- */
			mov esi, Src		/* ESI = Src row 0 address */
			mov edi, Dest		/* load Dest address to EDI */
			add edi, eax		/* EDI = EDI + columns (skip the first row) */
			inc edi			/* 1 byte offset from the left edge */
			/* initialize ROWS counter (the parameter itself is used as the counter) */
			sub rows, 2		/* do not use first and last rows */
			/* --- */
L10410:
			mov ecx, eax		/* initialize COLUMNS counter */
			shr ecx, 3		/* ECX = columns / 8 (MMX loads 8 bytes at a time) */
			mov ebx, esi		/* save ESI in EBX */
			mov edx, edi		/* save EDI in EDX */
			align 16		/* 16 byte alignment of the loop entry */
L10412:
			/* --- top row: MM4/MM5 = pixels 0..7, MM6/MM7 = pixels 2..9 --- */
			movq mm4, [esi]		/* load 8 bytes from Src */
			movq mm5, mm4		/* save MM4 in MM5 */
			add esi, 2		/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm5, mm0	/* unpack 4 high bytes into words */
			psrlw mm4, mm1		/* shift right each pixel NRightShift times */
			psrlw mm5, mm1		/* shift right each pixel NRightShift times */
			movq mm6, [esi]		/* load 8 bytes from Src */
			movq mm7, mm6		/* save MM6 in MM7 */
			sub esi, 2		/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm7, mm0	/* unpack 4 high bytes into words */
			psrlw mm6, mm1		/* shift right each pixel NRightShift times */
			psrlw mm7, mm1		/* shift right each pixel NRightShift times */
			/* --- middle row: added twice (weight 2) --- */
			add esi, eax		/* move to the next row of Src */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			add esi, 2		/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm4, mm2		/* add 4 low  words to accumulator MM4 */
			paddw mm5, mm3		/* add 4 high words to accumulator MM5 */
			paddw mm4, mm2		/* add 4 low  words to accumulator MM4 (second time) */
			paddw mm5, mm3		/* add 4 high words to accumulator MM5 (second time) */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			sub esi, 2		/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm6, mm2		/* add 4 low  words to accumulator MM6 */
			paddw mm7, mm3		/* add 4 high words to accumulator MM7 */
			paddw mm6, mm2		/* add 4 low  words to accumulator MM6 (second time) */
			paddw mm7, mm3		/* add 4 high words to accumulator MM7 (second time) */
			/* --- bottom row: added once --- */
			add esi, eax		/* move to the next row of Src */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			add esi, 2		/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm4, mm2		/* add 4 low  words to accumulator MM4 */
			paddw mm5, mm3		/* add 4 high words to accumulator MM5 */
			movq mm2, [esi]		/* load 8 bytes from Src */
			movq mm3, mm2		/* save MM2 in MM3 */
			sub esi, 2		/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0	/* unpack 4 high bytes into words */
			psrlw mm2, mm1		/* shift right each pixel NRightShift times */
			psrlw mm3, mm1		/* shift right each pixel NRightShift times */
			paddw mm6, mm2		/* add 4 low  words to accumulator MM6 */
			paddw mm7, mm3		/* add 4 high words to accumulator MM7 */
			/* --- horizontal difference c(x+2) - c(x) for 8 positions --- */
			movq mm2, mm4		/* copy MM4 into MM2 */
			psrlq mm4, 32		/* move the 2 upper words into the lower half */
			psubw mm4, mm2		/* MM4 = MM4 - MM2 */
			movq mm3, mm6		/* copy MM6 into MM3 */
			psrlq mm6, 32		/* move the 2 upper words into the lower half */
			psubw mm6, mm3		/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5		/* copy MM5 into MM2 */
			psrlq mm5, 32		/* move the 2 upper words into the lower half */
			psubw mm5, mm2		/* MM5 = MM5 - MM2 */
			movq mm3, mm7		/* copy MM7 into MM3 */
			psrlq mm7, 32		/* move the 2 upper words into the lower half */
			psubw mm7, mm3		/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4		/* copy MM4 into MM6 */
			movq mm7, mm5		/* copy MM5 into MM7 */
			psraw mm6, 15		/* fill MM6 words with word sign bit */
			psraw mm7, 15		/* fill MM7 words with word sign bit */
			pxor mm4, mm6		/* take 1's complement of only neg. words */
			pxor mm5, mm7		/* take 1's complement of only neg. words */
			psubsw mm4, mm6		/* add 1 to only neg. words, W-(-1) or W-0 */
			psubsw mm5, mm7		/* add 1 to only neg. words, W-(-1) or W-0 */
			packuswb mm4, mm5	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4		/* store result in Dest */
			/* --- */
			sub esi, eax		/* move to the current top row in Src */
			sub esi, eax
			add esi, 8		/* move Src  pointer to the next 8 pixels */
			add edi, 8		/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx			/* decrease loop counter COLUMNS */
			jnz L10412		/* check loop termination, proceed if required */
			mov esi, ebx		/* restore most left current row Src  address */
			mov edi, edx		/* restore most left current row Dest address */
			add esi, eax		/* move to the next row in Src */
			add edi, eax		/* move to the next row in Dest */
			dec rows		/* decrease loop counter ROWS */
			jnz L10410		/* check loop termination, proceed if required */
			/* --- */
			emms			/* exit MMX state */
			popa
		}
#else
		/*
		 * NOTE(review): operands and clobbers are kept exactly as they were.
		 * Things to confirm before touching them: Dest is declared as an
		 * output ("=m") although it is only read; rows is declared as an
		 * input although "subl"/"decl" modify it; there is no "memory"/"cc"
		 * clobber; and the "m" operands are referenced after "pusha" has
		 * moved ESP, which presumably requires frame-pointer-relative
		 * addressing of the parameters.
		 */
		asm volatile
			("pusha              \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 (used as the high half when unpacking bytes to words) */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %4, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 (shift count for psrlw) */
			/* --- */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns (skip the first row) */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			/* initialize ROWS counter (the parameter itself is used as the counter) */
			"subl            $2, %2 \n\t"	/* do not use first and last rows */
			/* --- */
			".L10410:                \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"shr          $3, %%ecx \n\t"	/* ECX = columns / 8 (MMX loads 8 bytes at a time) */
			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
			"mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10412:               \n\t"
			/* --- top row: MM4/MM5 = pixels 0..7, MM6/MM7 = pixels 2..9 --- */
			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NRightShift times */
			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NRightShift times */
			/* --- middle row: added twice (weight 2) --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 (second time) */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 (second time) */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 (second time) */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 (second time) */
			/* --- bottom row: added once --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
			/* --- horizontal difference c(x+2) - c(x) for 8 positions --- */
			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			"psrlq       $32, %%mm4 \n\t"	/* move the 2 upper words into the lower half */
			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			"psrlq       $32, %%mm6 \n\t"	/* move the 2 upper words into the lower half */
			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			"psrlq       $32, %%mm5 \n\t"	/* move the 2 upper words into the lower half */
			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* move the 2 upper words into the lower half */
			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
			/* --- */
			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
			"sub       %%eax, %%esi \n\t"
			"add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
			"mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
			"decl                %2 \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t"
			:"=m" (Dest)		/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(NRightShift)	/* %4 */
			);
#endif
		/* The MMX kernel ran and filled Dest */
		return (0);
#else
		/*
		 * MMX is available at run time, but the assembler kernel above was
		 * not compiled into this build (USE_MMX or i386 not defined), so
		 * nothing has been written to Dest. Report failure instead of
		 * claiming success.
		 */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}
07322 
/*!
\brief Align the stack pointer (ESP) down to a 32 byte boundary.

The previous ESP value is stored at the new top of the stack, which is where
SDL_imageFilterRestoreStack() reads it back from.

Only compiled for 32-bit x86 builds with USE_MMX defined; on every other target
this function does nothing. The architecture check matters because the code
manipulates the 32-bit registers ESP/EBX directly: assembled for x86_64, writing
ESP would zero-extend into RSP and truncate the stack pointer.

EBX is overwritten and is neither saved nor declared as clobbered.

NOTE(review): the stack pointer is changed inside a regular C function; whether
the change survives the compiler-generated epilogue/return depends on the code
the compiler emits for this function -- verify on the target toolchain.
*/
void SDL_imageFilterAlignStack(void)
{
#if defined(USE_MMX) && (defined(i386) || defined(__i386__) || defined(_M_IX86))
#if !defined(GCC__)
	__asm
	{				/* --- stack alignment --- */
		mov ebx, esp		/* load ESP into EBX */
		sub ebx, 4		/* reserve space on stack for old value of ESP */
		and ebx, -32		/* align EBX along a 32 byte boundary */
		mov [ebx], esp		/* save old value of ESP in stack, behind the boundary */
		mov esp, ebx		/* align ESP along a 32 byte boundary */
	}
#else
	asm volatile
		(				/* --- stack alignment --- */
		"mov       %%esp, %%ebx \n\t"	/* load ESP into EBX */
		"sub          $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
		"and        $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary */
		"mov     %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the boundary */
		"mov       %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
		::);
#endif
#endif
}
07350 
/*!
\brief Restore the stack pointer (ESP) saved by SDL_imageFilterAlignStack().

Loads the 32-bit value found at the current top of the stack ([ESP]) and makes
it the new ESP. SDL_imageFilterAlignStack() stores the pre-alignment ESP at
exactly that location.

Only compiled for 32-bit x86 builds with USE_MMX defined; on every other target
this function does nothing. The architecture check matters because the code
manipulates the 32-bit registers ESP/EBX directly: assembled for x86_64, writing
ESP would zero-extend into RSP and truncate the stack pointer.

EBX is overwritten and is neither saved nor declared as clobbered.

NOTE(review): this is only meaningful if [ESP] still holds the value written by
SDL_imageFilterAlignStack() when the instructions execute; anything the compiler
pushes in this function's prologue would be read instead -- verify on the
target toolchain.
*/
void SDL_imageFilterRestoreStack(void)
{
#if defined(USE_MMX) && (defined(i386) || defined(__i386__) || defined(_M_IX86))
#if !defined(GCC__)
	__asm
	{				/* --- restoring old stack --- */
		mov ebx, [esp]		/* load old value of ESP */
		mov esp, ebx		/* restore old value of ESP */
	}
#else
	asm volatile
		(				/* --- restoring old stack --- */
		"mov     (%%esp), %%ebx \n\t"	/* load old value of ESP */
		"mov       %%ebx, %%esp \n\t"	/* restore old value of ESP */
		::);
#endif
#endif
}