SDL_gfx: I:/Sources/sdlgfx/SDL_imageFilter.c Source File

Go to the documentation of this file.
00001 /*
00002 
00003 SDL_imageFilter.c: byte-image "filter" routines
00004 
00005 Copyright (C) 2001-2012  Andreas Schiffler
00006 Copyright (C) 2013  Sylvain Beucler
00007 
00008 This software is provided 'as-is', without any express or implied
00009 warranty. In no event will the authors be held liable for any damages
00010 arising from the use of this software.
00011 
00012 Permission is granted to anyone to use this software for any purpose,
00013 including commercial applications, and to alter it and redistribute it
00014 freely, subject to the following restrictions:
00015 
00016    1. The origin of this software must not be misrepresented; you must not
00017    claim that you wrote the original software. If you use this software
00018    in a product, an acknowledgment in the product documentation would be
00019    appreciated but is not required.
00020 
00021    2. Altered source versions must be plainly marked as such, and must not be
00022    misrepresented as being the original software.
00023 
00024    3. This notice may not be removed or altered from any source
00025    distribution.
00026 
00027 Andreas Schiffler -- aschiffler at ferzkopp dot net
00028 
00029 */
00030 
00031 /*
00032 
00033 Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
00034 
00035 Note: Most of the MMX code is based on published routines 
00036 by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
00037 him for his work.
00038 
00039 */
00040 
00041 #include <stdio.h>
00042 #include <stdlib.h>
00043 #include <string.h>
00044 
00045 /* Use GCC intrinsics if available: they support both i386 and x86_64,
00046    provide ASM-grade performances, and lift the PUSHA/POPA issues. */
00047 #ifdef __GNUC__
00048 #  ifdef USE_MMX
00049 #    include <mmintrin.h>
00050 #  endif
00051 #endif
00052 #include <SDL_cpuinfo.h>
00053 #include "SDL_imageFilter.h"
00054 
/*
 * Reverse the byte order of a 32-bit value.
 * Note: the argument is evaluated four times, so it must be free of side
 * effects; intended for unsigned 32-bit operands.
 */
#define SWAP_32(x) ( ((x) << 24)               \
                   | (((x) & 0x0000ff00) << 8) \
                   | (((x) & 0x00ff0000) >> 8) \
                   | ((x) >> 24) )
00059 
/* ------ Static variables ----- */

/*
 * MMX override flag: 1 (the default) lets SDL_imageFilterMMXdetect() query
 * the CPU, 0 makes it report "no MMX" so the filters use their plain C loops.
 * Written by SDL_imageFilterMMXon() / SDL_imageFilterMMXoff().
 */
static int SDL_imageFilterUseMMX = 1;
00066 
/*
 * Detect GCC: GCC__ selects the intrinsics-based code in the MMX helpers;
 * when it is not defined they use MSVC-style __asm blocks instead.
 */
#if defined(__GNUC__)
#define GCC__
#endif
00071 
00077 int SDL_imageFilterMMXdetect(void)
00078 {
00079         /* Check override flag */
00080         if (SDL_imageFilterUseMMX == 0) {
00081                 return (0);
00082         }
00083 
00084         return SDL_HasMMX();
00085 }
00086 
00090 void SDL_imageFilterMMXoff()
00091 {
00092         SDL_imageFilterUseMMX = 0;
00093 }
00094 
00098 void SDL_imageFilterMMXon()
00099 {
00100         SDL_imageFilterUseMMX = 1;
00101 }
00102 
00103 /* ------------------------------------------------------------------------------------ */
00104 
/**
 * Internal MMX implementation of filter Add: Dest = saturation255(Src1 + Src2).
 *
 * Only the SrcLength/8 complete 8-byte groups are processed; the remaining
 * (SrcLength % 8) bytes are left untouched for the caller to handle.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param SrcLength Number of bytes in the arrays.
 *
 * \return 0 on success, -1 when built without USE_MMX (nothing is written).
 */
static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop tests its counter only after the first pass; with no
	   complete 8-byte group ecx would wrap around and overrun the buffers. */
	if (SrcLength < 8) {
		return (0);
	}
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = loop counter (SIZE) */
		shr ecx, 3		/* counter/8 (MMX handles 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L1010:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		paddusb mm1, [ebx]	/* mm1 = Src1 + Src2 (8 bytes, unsigned saturation) */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* advance Src1, Src2 and Dest */
		add ebx, 8		/* pointers by 8 bytes */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1010		/* loop while groups remain */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	unsigned int blocks = SrcLength / 8;	/* number of complete 8-byte groups */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		mDest[i] = _m_paddusb(mSrc1[i], mSrc2[i]);	/* Src1+Src2 (8 bytes, unsigned saturation) */
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00159 
/**
 * Filter using Add: Dest = saturation255(Src1 + Src2).
 *
 * Uses the MMX helper for the complete 8-byte groups when MMX is detected
 * and the buffer holds at least 8 bytes; everything the helper did not
 * process is handled by the C loop.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param length Number of bytes in the arrays.
 *
 * \return 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to process */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Skip the 8-byte groups only if the MMX routine really ran; it
		   returns -1 without writing anything when MMX is not compiled in. */
		if (SDL_imageFilterAddMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & ~7u;
		}
	}

	/* C routine: whole image, or just the trailing unaligned bytes */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00220 
/**
 * Internal MMX implementation of filter Mean: Dest = Src1/2 + Src2/2.
 *
 * Each operand is halved with a 16-bit word shift and then ANDed with Mask;
 * the word shift moves the low bit of every odd byte into the top bit of
 * its neighbour, which the mask (0x7F per byte in the caller) clears again.
 * Only the SrcLength/8 complete 8-byte groups are processed.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param SrcLength Number of bytes in the arrays.
 * \param Mask Pointer to the 8 mask bytes applied after the shift.
 *
 * \return 0 on success, -1 when built without USE_MMX (nothing is written).
 */
static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
						   unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop tests its counter only after the first pass; with no
	   complete 8-byte group ecx would wrap around and overrun the buffers. */
	if (SrcLength < 8) {
		return (0);
	}
	__asm
	{
		pusha
		mov edx, Mask		/* edx = Mask address */
		movq mm0, [edx]		/* mm0 = Mask */
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = loop counter (SIZE) */
		shr ecx, 3		/* counter/8 (MMX handles 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L21011:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm2, [ebx]		/* load 8 bytes from Src2 into mm2 */
		/* --- Byte shift via Word shift --- */
		psrlw mm1, 1		/* shift 4 WORDS of mm1 1 bit to the right */
		psrlw mm2, 1		/* shift 4 WORDS of mm2 1 bit to the right */
		pand mm1, mm0		/* apply Mask to 8 BYTES of mm1 (bytes 0x0f, 0xdb, 0xc8) */
		pand mm2, mm0		/* apply Mask to 8 BYTES of mm2 (bytes 0x0f, 0xdb, 0xd0) */
		paddusb mm1, mm2	/* mm1 = mm1 + mm2 (8 bytes, unsigned saturation) */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* advance Src1, Src2 and Dest */
		add ebx, 8		/* pointers by 8 bytes */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L21011		/* loop while groups remain */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 *mMask = (__m64*)Mask;
	unsigned int blocks = SrcLength / 8;	/* number of complete 8-byte groups */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		__m64 half1 = _m_psrlwi(mSrc1[i], 1);	/* shift 4 WORDS of Src1 1 bit to the right */
		__m64 half2 = _m_psrlwi(mSrc2[i], 1);	/* shift 4 WORDS of Src2 1 bit to the right */
		half1 = _m_pand(half1, *mMask);		/* apply Mask to 8 BYTES */
		half2 = _m_pand(half2, *mMask);		/* apply Mask to 8 BYTES */
		mDest[i] = _m_paddusb(half1, half2);	/* add 8 bytes with saturation */
	}
	_m_empty();				/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00294 
/**
 * Filter using Mean: Dest = Src1/2 + Src2/2.
 *
 * Each operand is halved (truncating) before the addition, so the result
 * always fits in a byte.  Uses the MMX helper for the complete 8-byte groups
 * when MMX is detected and the buffer holds at least 8 bytes; everything the
 * helper did not process is handled by the C loop.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param length Number of bytes in the arrays.
 *
 * \return 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	/* Clears the top bit of every byte after the MMX word shift */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to process */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Skip the 8-byte groups only if the MMX routine really ran; it
		   returns -1 without writing anything when MMX is not compiled in. */
		if (SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask) == 0) {
			istart = length & ~7u;
		}
	}

	/* C routine: whole image, or just the trailing unaligned bytes */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] / 2 + (int) Src2[i] / 2;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00353 
/**
 * Internal MMX implementation of filter Sub: Dest = saturation0(Src1 - Src2).
 *
 * Only the SrcLength/8 complete 8-byte groups are processed; the remaining
 * (SrcLength % 8) bytes are left untouched for the caller to handle.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param SrcLength Number of bytes in the arrays.
 *
 * \return 0 on success, -1 when built without USE_MMX (nothing is written).
 */
static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop tests its counter only after the first pass; with no
	   complete 8-byte group ecx would wrap around and overrun the buffers. */
	if (SrcLength < 8) {
		return (0);
	}
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = loop counter (SIZE) */
		shr ecx, 3		/* counter/8 (MMX handles 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L1012:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		psubusb mm1, [ebx]	/* mm1 = Src1 - Src2 (8 bytes, unsigned saturation) */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* advance Src1, Src2 and Dest */
		add ebx, 8		/* pointers by 8 bytes */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1012		/* loop while groups remain */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	unsigned int blocks = SrcLength / 8;	/* number of complete 8-byte groups */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		mDest[i] = _m_psubusb(mSrc1[i], mSrc2[i]);	/* Src1-Src2 (8 bytes, unsigned saturation) */
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00408 
/**
 * Filter using Sub: Dest = saturation0(Src1 - Src2).
 *
 * Uses the MMX helper for the complete 8-byte groups when MMX is detected
 * and the buffer holds at least 8 bytes; everything the helper did not
 * process is handled by the C loop.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param length Number of bytes in the arrays.
 *
 * \return 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to process */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Skip the 8-byte groups only if the MMX routine really ran; it
		   returns -1 without writing anything when MMX is not compiled in. */
		if (SDL_imageFilterSubMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & ~7u;
		}
	}

	/* C routine: whole image, or just the trailing unaligned bytes */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - (int) Src2[i];
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00468 
/**
 * Internal MMX implementation of filter AbsDiff: Dest = | Src1 - Src2 |.
 *
 * Both saturated differences (Src1-Src2 and Src2-Src1) are computed; for
 * every byte at least one of them is 0, so OR-ing them gives the absolute
 * difference.  Only the SrcLength/8 complete 8-byte groups are processed.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param SrcLength Number of bytes in the arrays.
 *
 * \return 0 on success, -1 when built without USE_MMX (nothing is written).
 */
static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop tests its counter only after the first pass; with no
	   complete 8-byte group ecx would wrap around and overrun the buffers. */
	if (SrcLength < 8) {
		return (0);
	}
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = loop counter (SIZE) */
		shr ecx, 3		/* counter/8 (MMX handles 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L1013:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm2, [ebx]		/* load 8 bytes from Src2 into mm2 */
		psubusb mm1, [ebx]	/* mm1 = Src1 - Src2 (8 bytes, unsigned saturation) */
		psubusb mm2, [eax]	/* mm2 = Src2 - Src1 (8 bytes, unsigned saturation) */
		por mm1, mm2		/* combine both mm2 and mm1 results */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* advance Src1, Src2 and Dest */
		add ebx, 8		/* pointers by 8 bytes */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1013		/* loop while groups remain */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	unsigned int blocks = SrcLength / 8;	/* number of complete 8-byte groups */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		__m64 diff21 = _m_psubusb(mSrc2[i], mSrc1[i]);	/* Src2-Src1 (8 bytes, unsigned saturation) */
		__m64 diff12 = _m_psubusb(mSrc1[i], mSrc2[i]);	/* Src1-Src2 (8 bytes, unsigned saturation) */
		mDest[i] = _m_por(diff21, diff12);		/* one of the two is 0 per byte: OR = |Src1-Src2| */
	}
	_m_empty();						/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00528 
/**
 * Filter using AbsDiff: Dest = | Src1 - Src2 |.
 *
 * Uses the MMX helper for the complete 8-byte groups when MMX is detected
 * and the buffer holds at least 8 bytes; everything the helper did not
 * process is handled by the C loop.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param length Number of bytes in the arrays.
 *
 * \return 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to process */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Skip the 8-byte groups only if the MMX routine really ran; it
		   returns -1 without writing anything when MMX is not compiled in. */
		if (SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & ~7u;
		}
	}

	/* C routine: whole image, or just the trailing unaligned bytes */
	for (i = istart; i < length; i++) {
		result = abs((int) Src1[i] - (int) Src2[i]);
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00586 
/**
 * Internal MMX implementation of filter Mult: Dest = saturation255(Src1 * Src2).
 *
 * Bytes are widened to 16-bit words and multiplied (low 16 bits kept).
 * Products of 32768 and above look negative as signed words, so the absolute
 * value is taken before packing back to bytes with unsigned saturation;
 * every product above 255 therefore ends up as 255.
 * Only the SrcLength/8 complete 8-byte groups are processed.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param SrcLength Number of bytes in the arrays.
 *
 * \return 0 on success, -1 when built without USE_MMX (nothing is written).
 */
static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop tests its counter only after the first pass; with no
	   complete 8-byte group ecx would wrap around and overrun the buffers. */
	if (SrcLength < 8) {
		return (0);
	}
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = loop counter (SIZE) */
		shr ecx, 3		/* counter/8 (MMX handles 8 bytes at a time) */
		pxor mm0, mm0		/* zero mm0 register */
		align 16		/* 16 byte alignment of the loop entry */
L1014:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]		/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1		/* copy mm1 into mm2 */
		movq mm4, mm3		/* copy mm3 into mm4 */
		punpcklbw mm1, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm2, mm0	/* unpack high bytes of Src1 into words */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src2 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src2 into words */
		pmullw mm1, mm3		/* mul low  bytes of Src1 and Src2 */
		pmullw mm2, mm4		/* mul high bytes of Src1 and Src2 */
		/* Take abs value of the results (signed words) */
		movq mm5, mm1		/* copy mm1 into mm5 */
		movq mm6, mm2		/* copy mm2 into mm6 */
		psraw mm5, 15		/* fill mm5 words with word sign bit */
		psraw mm6, 15		/* fill mm6 words with word sign bit */
		pxor mm1, mm5		/* take 1's complement of only neg. words */
		pxor mm2, mm6		/* take 1's complement of only neg. words */
		psubsw mm1, mm5		/* add 1 to only neg. words, W-(-1) or W-0 */
		psubsw mm2, mm6		/* add 1 to only neg. words, W-(-1) or W-0 */
		packuswb mm1, mm2	/* pack words back into bytes with saturation */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* advance Src1, Src2 and Dest */
		add ebx, 8		/* pointers by 8 bytes */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1014		/* loop while groups remain */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 zero = _m_from_int(0);		/* all-zero register used for unpacking */
	unsigned int blocks = SrcLength / 8;	/* number of complete 8-byte groups */
	unsigned int i;
	for (i = 0; i < blocks; i++) {
		__m64 lo1, hi1, lo2, hi2, signLo, signHi;
		lo1 = _m_punpcklbw(mSrc1[i], zero);	/* unpack low  bytes of Src1 into words */
		hi1 = _m_punpckhbw(mSrc1[i], zero);	/* unpack high bytes of Src1 into words */
		lo2 = _m_punpcklbw(mSrc2[i], zero);	/* unpack low  bytes of Src2 into words */
		hi2 = _m_punpckhbw(mSrc2[i], zero);	/* unpack high bytes of Src2 into words */
		lo1 = _m_pmullw(lo1, lo2);		/* mul low  bytes of Src1 and Src2 */
		hi1 = _m_pmullw(hi1, hi2);		/* mul high bytes of Src1 and Src2 */
		/* Take abs value of the results (signed words) */
		signLo = _m_psrawi(lo1, 15);		/* fill words with word sign bit */
		signHi = _m_psrawi(hi1, 15);		/* fill words with word sign bit */
		lo1 = _m_pxor(lo1, signLo);		/* take 1's complement of only neg. words */
		hi1 = _m_pxor(hi1, signHi);		/* take 1's complement of only neg. words */
		lo1 = _m_psubsw(lo1, signLo);		/* add 1 to only neg. words, W-(-1) or W-0 */
		hi1 = _m_psubsw(hi1, signHi);		/* add 1 to only neg. words, W-(-1) or W-0 */
		mDest[i] = _m_packuswb(lo1, hi1);	/* pack words back into bytes with saturation */
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00715 
/**
 * Filter using Mult: Dest = saturation255(Src1 * Src2).
 *
 * Uses the MMX helper for the complete 8-byte groups when MMX is detected
 * and the buffer holds at least 8 bytes; everything the helper did not
 * process is handled by the C loop.
 *
 * \param Src1 Pointer to the start of the first source byte array.
 * \param Src2 Pointer to the start of the second source byte array.
 * \param Dest Pointer to the start of the destination byte array.
 * \param length Number of bytes in the arrays.
 *
 * \return 0 on success (including length == 0), -1 if any pointer is NULL.
 */
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to process */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Skip the 8-byte groups only if the MMX routine really ran; it
		   returns -1 without writing anything when MMX is not compiled in. */
		if (SDL_imageFilterMultMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & ~7u;
		}
	}

	/* C routine: whole image, or just the trailing unaligned bytes.
	   Clamping the product at 255 matches the MMX routine, whose
	   abs + unsigned-saturating pack maps every product above 255 to 255. */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] * (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
00778 
/*!
\brief Internal ASM implementation of filter using MultNor: D = S1 * S2

Works one byte at a time over the whole range. Only the low 8 bits of each
product are stored (no saturation).

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes to process.

\return Returns 0 for success, or -1 when compiled without USE_MMX (in which
case nothing is read or written).
*/
int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops below are "dec counter / jnz" shaped: entering them with a
	   zero counter would wrap around and run 2^32 iterations. */
	if (SrcLength == 0)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Src1		/* load Src1 address into edx */
		mov esi, Src2		/* load Src2 address into esi */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		align 16		/* 16 byte alignment of the loop entry */
L10141:
		mov al, [edx]		/* load a byte from Src1 */
		mul byte ptr [esi]	/* mul with a byte from Src2 (explicit byte operand) */
		mov [edi], al		/* move a byte result to Dest */
		inc edx			/* increment Src1, Src2, Dest */
		inc esi			/* pointer registers by one */
		inc edi
		dec ecx			/* decrease loop counter */
		jnz L10141		/* check loop termination, proceed if required */
		popa
	}
#else
	/* Note: ~5% gain on i386, less efficient than C on x86_64 */
	/* Also depends on whether this function is static (?!) */
	asm volatile (
		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
		/* __i386__ is tested as well: plain `i386` is not predefined in strict ISO mode */
#  if defined(__i386__) || defined(i386)
		"1:mov  (%%edx), %%al \n\t"	/* load a byte from Src1 */
		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
		"mov %%al, (%%edi)  \n\t"	/* move a byte result to Dest */
		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
		"inc %%esi \n\t"		/* pointer registers by one */
		"inc %%edi \n\t"
		"dec %%ecx      \n\t"	/* decrease loop counter */
#  elif defined(__x86_64__)
		"1:mov  (%%rdx), %%al \n\t"	/* load a byte from Src1 */
		"mulb (%%rsi)       \n\t"	/* mul with a byte from Src2 */
		"mov %%al, (%%rdi)  \n\t"	/* move a byte result to Dest */
		"inc %%rdx \n\t"		/* increment Src1, Src2, Dest */
		"inc %%rsi \n\t"		/* pointer registers by one */
		"inc %%rdi \n\t"
		/* The counter is a 32-bit unsigned int: only ecx is defined,
		   the upper half of rcx must not take part in the test. */
		"dec %%ecx      \n\t"	/* decrease loop counter */
#  endif
		"jnz 1b         \n\t"	/* check loop termination, proceed if required */
		: "+d" (Src1),		/* load Src1 address into edx */
		  "+S" (Src2),		/* load Src2 address into esi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into edi */
		:
		: "memory", "rax"
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
00848 
/*!
\brief Filter using MultNor: D = S1 * S2

Only the low 8 bits of each product are stored (no saturation).

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (SDL_imageFilterMMXdetect()) {
		/* The ASM routine works byte-by-byte over the whole range, so on
		   success no tail is left. Re-running the last (length & 7) bytes
		   in C would multiply them twice whenever Dest aliases a source. */
		if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0)
			return (0);
		/* Non-zero: the ASM routine is not compiled in and wrote nothing;
		   fall through and process the whole image in C. */
	}

	/* C routine to process image */
	for (i = 0; i < length; i++) {
		Dest[i] = (unsigned char) ((int) Src1[i] * (int) Src2[i]);
	}

	return (0);
}
00909 
/*!
\brief Internal MMX implementation of filter using MultDivby2: D = saturation255(S1/2 * S2)

Processes SrcLength/8 blocks of 8 bytes; the trailing SrcLength%8 bytes are
left for the caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success, or -1 when compiled without USE_MMX (in which
case nothing is read or written).
*/
static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop is "dec ecx / jnz": with fewer than 8 bytes the block
	   counter is 0 and would wrap around to 2^32 iterations. */
	if (SrcLength < 8)
		return (0);
	__asm
	{
		pusha
		mov eax, Src1		/* load Src1 address into eax */
		mov ebx, Src2		/* load Src2 address into ebx */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		pxor mm0, mm0		/* zero mm0 register */
		align 16		/* 16 byte alignment of the loop entry */
L1015:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]		/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1		/* copy mm1 into mm2 */
		movq mm4, mm3		/* copy mm3 into mm4  */
		punpcklbw mm1, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm2, mm0	/* unpack high bytes of Src1 into words */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src2 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src2 into words */
		psrlw mm1, 1		/* divide mm1 words by 2, Src1 low bytes */
		psrlw mm2, 1		/* divide mm2 words by 2, Src1 high bytes */
		pmullw mm1, mm3		/* mul low  bytes of Src1 and Src2  */
		pmullw mm2, mm4		/* mul high bytes of Src1 and Src2 */
		packuswb mm1, mm2	/* pack words back into bytes with saturation */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* increase Src1, Src2 and Dest  */
		add ebx, 8		/* register pointers by 8 */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1015		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
	unsigned int i;		/* unsigned to match SrcLength */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm1, mm2, mm3, mm4;
		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
		mSrc1++;
		mSrc2++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
00986 
/*!
\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole 8-byte blocks. Skip them in C only
		   if it really ran: it returns non-zero (and writes nothing) when
		   it is not compiled in, and then C must do the whole image. */
		if (SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining unaligned bytes, or the whole image */
	for (i = istart; i < length; i++) {
		result = ((int) Src1[i] / 2) * (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
01046 
/*!
\brief Internal MMX implementation of filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Processes SrcLength/8 blocks of 8 bytes; the trailing SrcLength%8 bytes are
left for the caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success, or -1 when compiled without USE_MMX (in which
case nothing is read or written).
*/
static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop is "dec ecx / jnz": with fewer than 8 bytes the block
	   counter is 0 and would wrap around to 2^32 iterations. */
	if (SrcLength < 8)
		return (0);
	__asm
	{
		pusha
		mov eax, Src1		/* load Src1 address into eax */
		mov ebx, Src2		/* load Src2 address into ebx */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		pxor mm0, mm0		/* zero mm0 register */
		align 16		/* 16 byte alignment of the loop entry */
L1016:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]		/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1		/* copy mm1 into mm2 */
		movq mm4, mm3		/* copy mm3 into mm4  */
		punpcklbw mm1, mm0	/* unpack low  bytes of Src1 into words */
		punpckhbw mm2, mm0	/* unpack high bytes of Src1 into words */
		punpcklbw mm3, mm0	/* unpack low  bytes of Src2 into words */
		punpckhbw mm4, mm0	/* unpack high bytes of Src2 into words */
		psrlw mm1, 1		/* divide mm1 words by 2, Src1 low bytes */
		psrlw mm2, 1		/* divide mm2 words by 2, Src1 high bytes */
		psrlw mm3, 1		/* divide mm3 words by 2, Src2 low bytes */
		psrlw mm4, 1		/* divide mm4 words by 2, Src2 high bytes */
		pmullw mm1, mm3		/* mul low  bytes of Src1 and Src2  */
		pmullw mm2, mm4		/* mul high bytes of Src1 and Src2 */
		packuswb mm1, mm2	/* pack words back into bytes with saturation */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* increase Src1, Src2 and Dest  */
		add ebx, 8		/* register pointers by 8 */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1016		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
	unsigned int i;		/* unsigned to match SrcLength */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm1, mm2, mm3, mm4;
		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
		mm3 = _m_psrlwi(mm3, 1);		/* divide mm3 words by 2, Src2 low bytes */
		mm4 = _m_psrlwi(mm4, 1);		/* divide mm4 words by 2, Src2 high bytes */
		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
		mSrc1++;
		mSrc2++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01127 
/*!
\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to handle */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole 8-byte blocks. Skip them in C only
		   if it really ran: it returns non-zero (and writes nothing) when
		   it is not compiled in, and then C must do the whole image. */
		if (SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining unaligned bytes, or the whole image */
	for (i = istart; i < length; i++) {
		result = ((int) Src1[i] / 2) * ((int) Src2[i] / 2);
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
01187 
/*!
\brief Internal MMX implementation of filter using BitAnd: D = S1 & S2

Processes SrcLength/8 blocks of 8 bytes; the trailing SrcLength%8 bytes are
left for the caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success, or -1 when compiled without USE_MMX (in which
case nothing is read or written).
*/
static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop is "dec ecx / jnz": with fewer than 8 bytes the block
	   counter is 0 and would wrap around to 2^32 iterations. */
	if (SrcLength < 8)
		return (0);
	__asm
	{
		pusha
		mov eax, Src1		/* load Src1 address into eax */
		mov ebx, Src2		/* load Src2 address into ebx */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L1017:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		pand mm1, [ebx]		/* mm1=Src1&Src2 */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* increase Src1, Src2 and Dest  */
		add ebx, 8		/* register pointers by 8 */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L1017		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	unsigned int i;		/* unsigned to match SrcLength */
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_pand(*mSrc1, *mSrc2);	/* Src1&Src2 */
		mSrc1++;
		mSrc2++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01264 
/*!
\brief Filter using BitAnd: D = S1 & S2

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to handle */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
		/* The MMX routine handles whole 8-byte blocks. Skip them in C only
		   if it really ran: it returns non-zero (and writes nothing) when
		   it is not compiled in, and then C must do the whole image. */
		if (SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining unaligned bytes, or the whole image */
	for (i = istart; i < length; i++) {
		Dest[i] = Src1[i] & Src2[i];
	}

	return (0);
}
01323 
/*!
\brief Internal MMX implementation of filter using BitOr: D = S1 | S2

Processes SrcLength/8 blocks of 8 bytes; the trailing SrcLength%8 bytes are
left for the caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success, or -1 when compiled without USE_MMX (in which
case nothing is read or written).
*/
static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The asm loop is "dec ecx / jnz": with fewer than 8 bytes the block
	   counter is 0 and would wrap around to 2^32 iterations. */
	if (SrcLength < 8)
		return (0);
	__asm
	{
		pusha
		mov eax, Src1		/* load Src1 address into eax */
		mov ebx, Src2		/* load Src2 address into ebx */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx, 3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L91017:
		movq mm1, [eax]		/* load 8 bytes from Src1 into mm1 */
		por mm1, [ebx]		/* mm1=Src1|Src2 */
		movq [edi], mm1		/* store result in Dest */
		add eax, 8		/* increase Src1, Src2 and Dest  */
		add ebx, 8		/* register pointers by 8 */
		add edi, 8
		dec ecx			/* decrease loop counter */
		jnz L91017		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	unsigned int i;		/* unsigned to match SrcLength */
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_por(*mSrc1, *mSrc2);	/* Src1|Src2 */
		mSrc1++;
		mSrc2++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01378 
/*!
\brief Filter using BitOr: D = S1 | S2

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop has to handle */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* The MMX routine handles whole 8-byte blocks. Skip them in C only
		   if it really ran: it returns non-zero (and writes nothing) when
		   it is not compiled in, and then C must do the whole image. */
		if (SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: remaining unaligned bytes, or the whole image */
	for (i = istart; i < length; i++) {
		Dest[i] = Src1[i] | Src2[i];
	}

	return (0);
}
01434 
/*!
\brief Internal ASM implementation of filter using Div: D = S1 / S2

Works one byte at a time over the whole range. A zero divisor byte yields
255 in the destination.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes to process.

\return Returns 0 for success, or -1 when compiled without USE_MMX (in which
case nothing is read or written).
*/
static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* The loops below are "dec counter / jnz" shaped: entering them with a
	   zero counter would wrap around and run 2^32 iterations. */
	if (SrcLength == 0)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
		mov edx, Src1		/* load Src1 address into edx */
		mov esi, Src2		/* load Src2 address into esi */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		align 16		/* 16 byte alignment of the loop entry */
L10191:
		mov bl, [esi]		/* load a byte from Src2 */
		cmp bl, 0		/* check if it zero */
		jnz L10192
		mov byte ptr [edi], 255	/* division by zero = 255 !!! (explicit byte store) */
		jmp  L10193
L10192:
		xor ah, ah		/* prepare AX, zero AH register */
		mov al, [edx]		/* load a byte from Src1 into AL */
		div   bl		/* divide AL by BL */
		mov [edi], al		/* move a byte result to Dest */
L10193:
		inc edx			/* increment Src1, Src2, Dest */
		inc esi			/* pointer registers by one */
		inc edi
		dec ecx			/* decrease loop counter */
		jnz L10191		/* check loop termination, proceed if required */
		popa
	}
#else
	/* Note: ~15% gain on i386, less efficient than C on x86_64 */
	/* Also depends on whether the function is static (?!) */
	/* Also depends on whether we work on malloc() or static char[] */
	asm volatile (
		/* __i386__ is tested as well: plain `i386` is not predefined in strict ISO mode */
#  if defined(__i386__) || defined(i386)
		"pushl %%ebx \n\t"		/* %ebx may be the PIC register.  */
		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
		"1: mov (%%esi), %%bl  \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl    \n\t"	/* check if it zero */
		"jnz 2f                \n\t"
		"movb  $255, (%%edi)   \n\t"	/* division by zero = 255 !!! */
		"jmp 3f                \n\t"
		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%edx), %%al   \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl            \n\t"	/* divide AL by BL */
		"mov   %%al, (%%edi)   \n\t"	/* move a byte result to Dest */
		"3: inc %%edx          \n\t"	/* increment Src1, Src2, Dest */
		"inc %%esi \n\t"		/* pointer registers by one */
		"inc %%edi \n\t"
		"dec %%ecx \n\t"		/* decrease loop counter */
		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
		"popl %%ebx \n\t"		/* restore %ebx */
		: "+d" (Src1),		/* load Src1 address into edx */
		  "+S" (Src2),		/* load Src2 address into esi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into edi */
		:
		: "memory", "rax"
#  elif defined(__x86_64__)
		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
		"1: mov (%%rsi), %%bl  \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl    \n\t"	/* check if it zero */
		"jnz 2f                \n\t"
		"movb  $255, (%%rdi)   \n\t"	/* division by zero = 255 !!! */
		"jmp 3f                \n\t"
		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%rdx), %%al   \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl            \n\t"	/* divide AL by BL */
		"mov   %%al, (%%rdi)   \n\t"	/* move a byte result to Dest */
		"3: inc %%rdx          \n\t"	/* increment Src1, Src2, Dest */
		"inc %%rsi \n\t"		/* pointer registers by one */
		"inc %%rdi \n\t"
		/* The counter is a 32-bit unsigned int: only ecx is defined,
		   the upper half of rcx must not take part in the test. */
		"dec %%ecx \n\t"		/* decrease loop counter */
		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
		: "+d" (Src1),		/* load Src1 address into rdx */
		  "+S" (Src2),		/* load Src2 address into rsi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into rdi */
		:
		: "memory", "rax", "rbx"
#  endif
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
01535 
/*!
\brief Filter using Div: D = S1 / S2

Every destination byte receives the integer quotient of the matching source
bytes. A zero divisor never traps: the result for that byte is forced to 255.

\param Src1 Pointer to the start of the first source byte array (dividend, S1).
\param Src2 Pointer to the start of the second source byte array (divisor, S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes to process.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Prefer the assembler routine. It walks the arrays one byte at a time,
	 * so on success there is no unaligned tail left over. Its status must be
	 * checked: the routine returns -1 without writing anything when the
	 * optimised code is compiled out, and in that case the portable loop
	 * below has to do the work instead of reporting a bogus success.
	 */
	if (SDL_imageFilterMMXdetect()) {
		if (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0)
			return (0);
	}

	/* Portable C routine: process the whole image */
	for (i = 0; i < length; i++) {
		if (Src2[i] == 0) {
			Dest[i] = 255;	/* division by zero = 255 */
		} else {
			Dest[i] = (unsigned char) (Src1[i] / Src2[i]);
		}
	}

	return (0);
}
01602 
01603 /* ------------------------------------------------------------------------------------ */
01604 
/*!
\brief Internal MMX implementation of filter using BitNegation: D = ~S

Only whole 8-byte blocks (SrcLength/8 of them) are processed; a tail of 1..7
bytes is left for the caller. The inline-assembler variant tests its loop
counter after the first block, so SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is written).
*/
static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		pcmpeqb mm1, mm1	/* generate all 1's in mm1 */
		mov eax, Src1		/* load Src1 address into eax */
		mov edi, Dest		/* load Dest address into edi */
		mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
		shr ecx,  3		/* counter/8 (MMX loads 8 bytes at a time) */
		align 16		/* 16 byte alignment of the loop entry */
L91117:
		movq mm0, [eax]		/* load 8 bytes from Src1 into mm0 */
		pxor mm0, mm1		/* negate mm0 by xoring with mm1 */
		movq [edi], mm0		/* store result in Dest */
		add eax, 8		/* increase Src1 and Dest */
		add edi,  8		/* register pointers by 8 */
		dec ecx			/* decrease loop counter */
		jnz L91117		/* check loop termination, proceed if required */
		emms			/* exit MMX state */
		popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/*
	 * Start from a defined value: comparing an uninitialised register with
	 * itself yields all 1's in practice, but reading an indeterminate
	 * automatic variable is undefined behaviour in C.
	 */
	__m64 mm1 = _mm_setzero_si64();
	unsigned int i;
	mm1 = _m_pcmpeqb(mm1, mm1);		/* x == x in every byte lane: all 1's */
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_pxor(*mSrc1, mm1);	/* negate 8 bytes by xoring with all 1's */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */

#endif
	return (0);
#else
	return (-1);
#endif
}
01658 
/*!
\brief Filter using BitNegation: D = ~S

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine handles whole 8-byte blocks only. Its status is
	 * honoured: it returns -1 without writing anything when MMX support is
	 * compiled out, and then the C loop must cover the entire buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterBitNegationMMX(Src1, Dest, length) == 0) {
			/* Only the unaligned tail (0..7 bytes) remains */
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the tail left over by the MMX routine */
	for (i = istart; i < length; i++) {
		Dest[i] = (unsigned char) ~Src1[i];
	}

	return (0);
}
01710 
/*!
\brief Internal MMX implementation of filter using AddByte: D = saturation255(S + C)

Works on SrcLength/8 whole 8-byte blocks; the caller deals with any tail.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 for success or -1 when built without USE_MMX (nothing is written).
*/
static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Replicate C into all 8 bytes of MM1 */
		mov al, C		/* AL = C */
		mov ah, al		/* AX = C:C */
		mov bx, ax		/* keep a copy of AX in BX */
		shl eax, 16		/* move C:C into the upper half of EAX */
		mov ax, bx		/* EAX = C:C:C:C */
		movd mm1, eax		/* low dword of MM1 = EAX */
		movd mm2, eax		/* low dword of MM2 = EAX */
		punpckldq mm1, mm2	/* high dword of MM1 = low dword of MM2 */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx,  3		/* ECX = number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L1021:
		movq mm0, [eax]		/* fetch 8 source bytes */
		paddusb mm0,  mm1	/* add C to each byte, saturating at 255 */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one block done */
		jnz L1021		/* repeat until the counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* GCC intrinsics: i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	unsigned int blocks = SrcLength / 8;
	int packed;
	__m64 addend;

	/* Replicate C into all 8 byte lanes */
	memset(&packed, C, 4);
	addend = _m_from_int(packed);
	addend = _m_punpckldq(addend, addend);

	while (blocks-- > 0) {
		*out = _m_paddusb(*in, addend);	/* S + C, saturating per byte */
		in++;
		out++;
	}
	_m_empty();				/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01776 
/*!
\brief Filter using AddByte: D = saturation255(S + C)

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int iC;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0 leaves every byte unchanged, so copy S to D.
	 * The destination is the FIRST argument; memmove keeps the call
	 * well-defined when the filter is applied in place (Src1 == Dest).
	 */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/*
	 * The MMX routine handles whole 8-byte blocks only. Its status is
	 * honoured: it returns -1 without writing anything when MMX support is
	 * compiled out, and then the C loop must cover the entire buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterAddByteMMX(Src1, Dest, length, C) == 0) {
			/* Only the unaligned tail (0..7 bytes) remains */
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the tail left over by the MMX routine */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
01842 
/*!
\brief Internal MMX implementation of filter using AddUint: saturating add of a 4-byte pattern

Works on SrcLength/8 whole 8-byte blocks; the caller deals with any tail.

NOTE(review): the inline-assembler path builds the 8-byte addend from C (low
dword) and D (high dword), whereas the intrinsics path uses C for both dwords
and never reads D. Confirm which layout is intended.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param D Second constant, used only by the inline-assembler path (callers pass the byte-swapped C).

\return Returns 0 for success or -1 when built without USE_MMX (nothing is written).
*/
static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Build the 8-byte addend in MM1 from C and D */
		mov eax, C		/* EAX = C */
		movd mm1, eax		/* low dword of MM1 = C */
		mov eax, D		/* EAX = D */
		movd mm2, eax		/* low dword of MM2 = D */
		punpckldq mm1, mm2	/* high dword of MM1 = low dword of MM2 */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx, SrcLength	/* ECX = byte count */
		shr ecx,  3		/* ECX = number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L11023:
		movq mm0, [eax]		/* fetch 8 source bytes */
		paddusb mm0,  mm1	/* add the pattern, saturating at 255 */
		movq [edi],  mm0	/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one block done */
		jnz L11023		/* repeat until the counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* GCC intrinsics: i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	unsigned int blocks = SrcLength / 8;
	__m64 addend;

	/* Place C in both dwords of the addend */
	addend = _m_from_int(C);
	addend = _m_punpckldq(addend, addend);

	while (blocks-- > 0) {
		*out = _m_paddusb(*in, addend);	/* S + pattern, saturating per byte */
		in++;
		out++;
	}
	_m_empty();				/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
01905 
/*!
\brief Filter using AddUint: D[i] = saturation255(S[i] + byte (i % 4) of C)

In the C routine the four bytes of C are applied cyclically, least-significant
byte first: byte 0 of C is added to S[0], S[4], ...; byte 1 to S[1], S[5], ...

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant holding the four byte values to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	unsigned int D;
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0 leaves every byte unchanged, so copy S to D.
	 * The destination is the FIRST argument; memmove keeps the call
	 * well-defined when the filter is applied in place (Src1 == Dest).
	 */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/*
	 * The MMX routine handles whole 8-byte blocks only. Its status is
	 * honoured: it returns -1 without writing anything when MMX support is
	 * compiled out, and then the C loop must cover the entire buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		D = SWAP_32(C);
		if (SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D) == 0) {
			/* Only the unaligned tail (0..7 bytes) remains */
			istart = length & 0xfffffff8;
		}
	}

	/* Split C into its four bytes, least-significant first */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[0] = (int) ((C >>  0) & 0xff);

	/*
	 * C routine: whole image, or the tail left over by the MMX routine.
	 * istart is always a multiple of 8, so (i & 3) selects the same byte
	 * of C as counting groups of four from istart would.
	 */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC[i & 3];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
01978 
/*!
\brief Internal MMX implementation of filter using AddByteToHalf: D = saturation255(S/2 + C)

Works on SrcLength/8 whole 8-byte blocks; the caller deals with any tail.
Halving is done with a 16-bit shift; Mask is then ANDed onto every 8-byte
block to clear the bits that crossed a byte boundary.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param Mask Pointer to 8 mask bytes applied after the shift (the caller in this file passes eight 0x7F bytes).

\return Returns 0 for success or -1 when built without USE_MMX (nothing is written).
*/
static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
									unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Replicate C into all 8 bytes of MM1 */
		mov al, C		/* AL = C */
		mov ah, al		/* AX = C:C */
		mov bx, ax		/* keep a copy of AX in BX */
		shl eax, 16		/* move C:C into the upper half of EAX */
		mov ax, bx		/* EAX = C:C:C:C */
		movd mm1, eax		/* low dword of MM1 = EAX */
		movd mm2, eax		/* low dword of MM2 = EAX */
		punpckldq mm1, mm2	/* high dword of MM1 = low dword of MM2 */
		mov edx, Mask		/* EDX = mask pointer */
		movq mm0, [edx]		/* MM0 = 8 mask bytes */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx,  SrcLength	/* ECX = byte count */
		shr ecx,  3		/* ECX = number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L1022:
		movq mm2, [eax]		/* fetch 8 source bytes */
		psrlw mm2, 1		/* shift the 4 words right by one bit */
		pand mm2, mm0		/* apply the mask to all 8 bytes */
		paddusb mm2,  mm1	/* add C to each byte, saturating at 255 */
		movq [edi], mm2		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one block done */
		jnz L1022		/* repeat until the counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* GCC intrinsics: i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	const __m64 *mask = (const __m64 *) Mask;
	unsigned int blocks = SrcLength / 8;
	int packed;
	__m64 addend;

	/* Replicate C into all 8 byte lanes */
	memset(&packed, C, 4);
	addend = _m_from_int(packed);
	addend = _m_punpckldq(addend, addend);

	while (blocks-- > 0) {
		__m64 half = _m_psrlwi(*in, 1);		/* shift the 4 words right by one bit */
		half = _m_pand(half, *mask);		/* apply the mask to all 8 bytes */
		*out = _m_paddusb(addend, half);	/* S/2 + C, saturating per byte */
		in++;
		out++;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
02054 
/*!
\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	/* Clears the top bit of every byte after the MMX routine's 16-bit shift */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int iC;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The MMX routine handles whole 8-byte blocks only. Its status is
	 * honoured: it returns -1 without writing anything when MMX support is
	 * compiled out, and then the C loop must cover the entire buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask) == 0) {
			/* Only the unaligned tail (0..7 bytes) remains */
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the tail left over by the MMX routine */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) (Src1[i] / 2) + iC;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
02116 
/*!
\brief Internal MMX implementation of filter using SubByte: D = saturation0(S - C)

Works on SrcLength/8 whole 8-byte blocks; the caller deals with any tail.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 when built without USE_MMX (nothing is written).
*/
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Replicate C into all 8 bytes of MM1 */
		mov al, C		/* AL = C */
		mov ah, al		/* AX = C:C */
		mov bx, ax		/* keep a copy of AX in BX */
		shl eax, 16		/* move C:C into the upper half of EAX */
		mov ax, bx		/* EAX = C:C:C:C */
		movd mm1, eax		/* low dword of MM1 = EAX */
		movd mm2, eax		/* low dword of MM2 = EAX */
		punpckldq mm1, mm2	/* high dword of MM1 = low dword of MM2 */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx,  SrcLength	/* ECX = byte count */
		shr ecx,  3		/* ECX = number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L1023:
		movq mm0, [eax]		/* fetch 8 source bytes */
		psubusb mm0,  mm1	/* subtract C from each byte, saturating at 0 */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one block done */
		jnz L1023		/* repeat until the counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* GCC intrinsics: i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	unsigned int blocks = SrcLength / 8;
	int packed;
	__m64 subtrahend;

	/* Replicate C into all 8 byte lanes */
	memset(&packed, C, 4);
	subtrahend = _m_from_int(packed);
	subtrahend = _m_punpckldq(subtrahend, subtrahend);

	while (blocks-- > 0) {
		*out = _m_psubusb(*in, subtrahend);	/* S - C, saturating per byte */
		in++;
		out++;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
02182 
/*!
\brief Filter using SubByte: D = saturation0(S - C)

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte the C loop still has to handle */
	int iC;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0 leaves every byte unchanged, so copy S to D.
	 * The destination is the FIRST argument; memmove keeps the call
	 * well-defined when the filter is applied in place (Src1 == Dest).
	 */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/*
	 * The MMX routine handles whole 8-byte blocks only. Its status is
	 * honoured: it returns -1 without writing anything when MMX support is
	 * compiled out, and then the C loop must cover the entire buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterSubByteMMX(Src1, Dest, length, C) == 0) {
			/* Only the unaligned tail (0..7 bytes) remains */
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the tail left over by the MMX routine */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - iC;
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}
02248 
/*!
\brief Internal MMX implementation of filter using SubUint: saturating subtract of a 4-byte pattern

Works on SrcLength/8 whole 8-byte blocks; the caller deals with any tail.

NOTE(review): the inline-assembler path builds the 8-byte subtrahend from C
(low dword) and D (high dword), whereas the intrinsics path uses C for both
dwords and never reads D. Confirm which layout is intended.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).
\param D Second constant, used only by the inline-assembler path (callers pass the byte-swapped C).

\return Returns 0 for success or -1 when built without USE_MMX (nothing is written).
*/
static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Build the 8-byte subtrahend in MM1 from C and D */
		mov eax, C		/* EAX = C */
		movd mm1, eax		/* low dword of MM1 = C */
		mov eax, D		/* EAX = D */
		movd mm2, eax		/* low dword of MM2 = D */
		punpckldq mm1, mm2	/* high dword of MM1 = low dword of MM2 */
		mov eax, Src1		/* EAX = source pointer */
		mov edi, Dest		/* EDI = destination pointer */
		mov ecx,  SrcLength	/* ECX = byte count */
		shr ecx,  3		/* ECX = number of 8-byte blocks */
		align 16		/* align the loop entry on 16 bytes */
L11024:
		movq mm0, [eax]		/* fetch 8 source bytes */
		psubusb mm0, mm1	/* subtract the pattern, saturating at 0 */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* advance source pointer */
		add edi, 8		/* advance destination pointer */
		dec ecx			/* one block done */
		jnz L11024		/* repeat until the counter reaches zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* GCC intrinsics: i386 and x86_64 */
	const __m64 *in = (const __m64 *) Src1;
	__m64 *out = (__m64 *) Dest;
	unsigned int blocks = SrcLength / 8;
	__m64 subtrahend;

	/* Place C in both dwords of the subtrahend */
	subtrahend = _m_from_int(C);
	subtrahend = _m_punpckldq(subtrahend, subtrahend);

	while (blocks-- > 0) {
		*out = _m_psubusb(*in, subtrahend);	/* S - pattern, saturating per byte */
		in++;
		out++;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}
02311 
/**
 * Subtract a 4-byte constant pattern from a byte buffer, clamping at 0.
 *
 * In the C path, bytes are handled in groups of four: byte j (0..3) of each
 * group has bits (8*j)..(8*j+7) of C subtracted from it, and a negative
 * result is stored as 0.  When SDL_imageFilterMMXdetect() reports support
 * and length > 7, the leading (length & ~7) bytes are handed to
 * SDL_imageFilterSubUintMMX() and only the remaining bytes are processed by
 * the C loop.
 *
 * \param Src1   Source byte buffer; must not be NULL.
 * \param Dest   Destination byte buffer; must not be NULL.
 * \param length Number of bytes to process.
 * \param C      Constant whose four bytes are subtracted cyclically.
 *
 * \return 0 on success (also for length == 0), -1 if Src1 or Dest is NULL.
 */
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
        unsigned int i, j, istart, D;
        int iC[4];
        unsigned char *cursrc1;
        unsigned char *curdest;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Special case: C==0 leaves every byte unchanged, so just copy the
           source into the destination.  memmove keeps this well defined when
           the two buffers overlap or are the same buffer. */
        if (C == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                /* MMX routine */
                D=SWAP_32(C);
                SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        iC[3] = (int) ((C >> 24) & 0xff);
        iC[2] = (int) ((C >> 16) & 0xff);
        iC[1] = (int) ((C >>  8) & 0xff);
        iC[0] = (int) ((C >>  0) & 0xff);
        for (i = istart; i < length; i += 4) {
                for (j = 0; j < 4; j++) {
                        /* The last group may be shorter than four bytes */
                        if ((i+j)<length) {
                                result = (int) *cursrc1 - iC[j];
                                if (result < 0) result = 0;
                                *curdest = (unsigned char) result;
                                /* Advance pointers */
                                cursrc1++;
                                curdest++;
                        }
                }
        }
        return (0);
}
02384 
/**
 * MMX helper for SDL_imageFilterShiftRight: shift bytes right by N bits.
 *
 * The data is shifted as 16-bit words, so bits of the upper byte of each word
 * leak into the lower byte; they are removed again by AND-ing with a bit mask.
 * That mask starts as all ones and is, N times, shifted right by one bit as
 * words and AND-ed with *Mask.
 *
 * Only SrcLength/8 groups of 8 bytes are processed; remaining bytes are left
 * to the caller.  The inline-assembly loops test their counter after the
 * first iteration, so that path expects N >= 1 and SrcLength >= 8.
 *
 * \param Src1      Source byte buffer.
 * \param Dest      Destination byte buffer.
 * \param SrcLength Number of bytes in the buffers.
 * \param N         Number of bits to shift.
 * \param Mask      Pointer to 8 mask bytes applied after every 1-bit mask shift.
 *
 * \return 0 when built with USE_MMX, -1 otherwise.
 */
static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
                                        unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
        __asm
        {
                pusha
                mov edx, Mask           /* load Mask address into edx */
                movq mm0, [edx]         /* load Mask into mm0 */
                xor ecx, ecx            /* zero ECX */
                mov cl,  N              /* load loop counter (N) into CL */
                movd mm3,  ecx          /* copy (N) into MM3  */
                pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
L10240:                                 /* ** Prepare proper bit-Mask in MM1 ** */
                psrlw mm1,  1           /* shift 4 WORDS of MM1 1 bit to the right */
                pand mm1, mm0           /* apply Mask to 8 BYTES of MM1 */
                /*  byte     0x0f, 0xdb, 0xc8 */
                dec cl                  /* decrease loop counter */
                jnz L10240              /* check loop termination, proceed if required */
                /* ** Shift all bytes of the image ** */
                mov eax, Src1           /* load Src1 address into eax */
                mov edi, Dest           /* load Dest address into edi */
                mov ecx,  SrcLength     /* load loop counter (SIZE) into ecx */
                shr ecx,  3             /* counter/8 (MMX loads 8 bytes at a time) */
                align 16                /* 16 byte alignment of the loop entry */
L10241:
                movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
                psrlw mm0, mm3          /* shift 4 WORDS of MM0 (N) bits to the right */
                pand mm0, mm1           /* apply proper bit-Mask to 8 BYTES of MM0 */
                /* byte     0x0f, 0xdb, 0xc1 */
                movq [edi], mm0         /* store result in Dest */
                add eax, 8              /* increase Src1 register pointer by 8 */
                add edi, 8              /* increase Dest register pointer by 8 */
                dec ecx                 /* decrease loop counter */
                jnz L10241              /* check loop termination, proceed if required */
                emms                    /* exit MMX state */
                popa
        }
#else
        /* i386 and x86_64 */
        __m64 *mSrc1 = (__m64*)Src1;
        __m64 *mDest = (__m64*)Dest;
        __m64 *mMask = (__m64*)Mask;
        /* Give mm1 a defined value before comparing it with itself; reading an
           uninitialized variable is undefined behaviour in C. */
        __m64 mm1 = _m_from_int(0);
        unsigned int i;
        mm1 = _m_pcmpeqb(mm1, mm1);                     /* generate all 1's in mm1 */
        /* Prepare proper bit-Mask in MM1 */
        for (i = 0; i < N; i++) {
                mm1 = _m_psrlwi(mm1, 1);                /* shift 4 WORDS of MM1 1 bit to the right */
                mm1 = _m_pand(mm1, *mMask);             /* apply Mask to 8 BYTES of MM1 */
        }
        /* Shift all bytes of the image */
        for (i = 0; i < SrcLength/8; i++) {
                __m64 mm0 = _m_psrlwi(*mSrc1, N);       /* shift 4 WORDS of MM0 (N) bits to the right */
                *mDest = _m_pand(mm0, mm1);             /* apply proper bit-Mask to 8 BYTES of MM0 */
                mSrc1++;
                mDest++;
        }
        _m_empty();                                     /* clean MMX state */
#endif
        return (0);
#else
        return (-1);
#endif
}
02462 
/**
 * Shift every byte of a buffer right by N bits: Dest[i] = Src1[i] >> N.
 *
 * When SDL_imageFilterMMXdetect() reports support and length > 7, the
 * leading (length & ~7) bytes are handed to SDL_imageFilterShiftRightMMX()
 * and only the remaining bytes are processed by the C loop.
 *
 * \param Src1   Source byte buffer; must not be NULL.
 * \param Dest   Destination byte buffer; must not be NULL.
 * \param length Number of bytes to process.
 * \param N      Number of bits to shift (0..8).
 *
 * \return 0 on success (also for length == 0), -1 if a pointer is NULL or N > 8.
 */
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
        unsigned int i, istart;
        unsigned char *cursrc1;
        unsigned char *curdest;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Check shift */
        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 leaves every byte unchanged, so just copy the
           source into the destination.  memmove keeps this well defined when
           the two buffers overlap or are the same buffer. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                /* MMX routine */
                SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        for (i = istart; i < length; i++) {
                *curdest = (unsigned char) *cursrc1 >> N;
                /* Advance pointers */
                cursrc1++;
                curdest++;
        }

        return (0);
}
02529 
/**
 * MMX helper for SDL_imageFilterShiftRightUint: shift each 32-bit group of
 * the buffer right by N bits.
 *
 * Only SrcLength/8 groups of 8 bytes (two 32-bit values each) are processed;
 * remaining bytes are left to the caller.  The inline-assembly loop tests its
 * counter after the first iteration, so that path expects SrcLength >= 8.
 *
 * \param Src1      Source byte buffer.
 * \param Dest      Destination byte buffer.
 * \param SrcLength Number of bytes in the buffers.
 * \param N         Number of bits to shift.
 *
 * \return 0 when built with USE_MMX, -1 otherwise.
 */
static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
        __asm
        {
                pusha
                /* Load the shift count through ECX into an MMX register, as the
                   other shift routines in this file do, instead of using the
                   byte-sized C variable N directly as the psrld count operand. */
                xor ecx, ecx            /* zero ECX */
                mov cl, N               /* load shift count (N) into CL */
                movd mm1, ecx           /* copy (N) into MM1 */
                mov eax, Src1           /* load Src1 address into eax */
                mov edi, Dest           /* load Dest address into edi */
                mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
                shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
                align 16                /* 16 byte alignment of the loop entry */
L13023:
                movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
                psrld mm0, mm1          /* shift 2 DWORDS of MM0 (N) bits to the right */
                movq [edi], mm0         /* store result in Dest */
                add eax, 8              /* increase Src1 register pointer by 8 */
                add edi, 8              /* increase Dest register pointer by 8 */
                dec ecx                 /* decrease loop counter */
                jnz L13023              /* check loop termination, proceed if required */
                emms                    /* exit MMX state */
                popa
        }
#else
        /* i386 and x86_64 */
        __m64 *mSrc1 = (__m64*)Src1;
        __m64 *mDest = (__m64*)Dest;
        unsigned int i;
        for (i = 0; i < SrcLength/8; i++) {
                *mDest = _m_psrldi(*mSrc1, N);          /* shift 2 DWORDS (N) bits to the right */
                mSrc1++;
                mDest++;
        }
        _m_empty();                                     /* clean MMX state */
#endif
        return (0);
#else
        return (-1);
#endif
}
02580 
/**
 * Shift each 32-bit group of a buffer right by N bits.
 *
 * The buffer is treated as a sequence of native unsigned int values
 * (NOTE(review): the 4-byte stride assumes a 32-bit unsigned int, as the
 * original code did).  When SDL_imageFilterMMXdetect() reports support and
 * length > 7, the leading (length & ~7) bytes are handed to
 * SDL_imageFilterShiftRightUintMMX(); the C loop then processes every
 * remaining complete 4-byte group.  Trailing bytes that do not form a
 * complete group are not written.
 *
 * \param Src1   Source byte buffer; must not be NULL.
 * \param Dest   Destination byte buffer; must not be NULL.
 * \param length Number of bytes to process.
 * \param N      Number of bits to shift (0..32).
 *
 * \return 0 on success (also for length == 0), -1 if a pointer is NULL or N > 32.
 */
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        unsigned int i, istart;
        unsigned char *cursrc1, *curdest;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        if (N > 32) {
                return (-1);
        }

        /* Special case: N==0 leaves every byte unchanged, so just copy the
           source into the destination.  memmove keeps this well defined when
           the two buffers overlap or are the same buffer. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image: one complete 4-byte group per pass.
           "length - i >= 4" includes the final complete group and cannot
           overflow the way "i + 4" could. */
        for (i = istart; (length - i) >= 4; i += 4) {
                unsigned int word = 0;

                /* Go through memcpy so that buffers which are not suitably
                   aligned for unsigned int are still accessed safely. */
                memcpy(&word, cursrc1, 4);
                /* Shifting a 32-bit value by 32 is undefined in C; the result
                   of shifting everything out is 0. */
                word = (N < 32) ? (word >> N) : 0;
                memcpy(curdest, &word, 4);

                /* Advance pointers */
                cursrc1 += 4;
                curdest += 4;
        }

        return (0);
}
02650 
/**
 * MMX helper for SDL_imageFilterMultByByte: multiply every byte by C and
 * saturate the result to 255.
 *
 * Bytes are widened to 16-bit words, multiplied with pmullw and packed back
 * with unsigned saturation.  For C > 128 a product can exceed 32767 and so
 * look negative as a signed word; that path first takes the absolute value
 * of each word so that packuswb saturates it to 255 rather than to 0.
 *
 * Only SrcLength/8 groups of 8 bytes are processed; remaining bytes are left
 * to the caller.  The inline-assembly loops test their counter after the
 * first iteration, so that path expects SrcLength >= 8.
 *
 * \param Src1      Source byte buffer.
 * \param Dest      Destination byte buffer.
 * \param SrcLength Number of bytes in the buffers.
 * \param C         Constant multiplier.
 *
 * \return 0 when built with USE_MMX, -1 otherwise.
 */
static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
        __asm
        {
                pusha
                /* ** Duplicate C in 4 words of MM1 ** */
                mov al, C               /* load C into AL */
                xor ah, ah              /* zero AH */
                mov bx, ax              /* copy AX into BX */
                shl eax, 16             /* shift 2 bytes of EAX left */
                mov ax, bx              /* copy BX into AX */
                movd mm1, eax           /* copy EAX into MM1 */
                movd mm2, eax           /* copy EAX into MM2 */
                punpckldq mm1, mm2      /* fill higher words of MM1 with C */
                pxor mm0, mm0           /* zero MM0 register */
                mov eax, Src1           /* load Src1 address into eax */
                mov edi, Dest           /* load Dest address into edi */
                mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
                shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
                /* EAX now holds the Src1 address, so AL is no longer C; BL
                   still holds C from the "mov bx, ax" above.  Compare as
                   unsigned, matching "if (C <= 128)" in the intrinsics path. */
                cmp bl, 128             /* if (C <= 128) execute more efficient code */
                ja L10251
                align 16                /* 16 byte alignment of the loop entry */
L10250:
                movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
                movq mm4, mm3           /* copy MM3 into MM4  */
                punpcklbw mm3, mm0      /* unpack low  bytes of Src1 into words */
                punpckhbw mm4, mm0      /* unpack high bytes of Src1 into words */
                pmullw mm3, mm1         /* mul low  bytes of Src1 and MM1 */
                pmullw mm4, mm1         /* mul high bytes of Src1 and MM1 */
                packuswb mm3, mm4       /* pack words back into bytes with saturation */
                movq [edi], mm3         /* store result in Dest */
                add eax, 8              /* increase Src1 register pointer by 8 */
                add edi, 8              /* increase Dest register pointer by 8 */
                dec ecx                 /* decrease loop counter */
                jnz L10250              /* check loop termination, proceed if required */
                jmp L10252
                align 16                /* 16 byte alignment of the loop entry */
L10251:
                movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
                movq mm4, mm3           /* copy MM3 into MM4  */
                punpcklbw mm3, mm0      /* unpack low  bytes of Src1 into words */
                punpckhbw mm4, mm0      /* unpack high bytes of Src1 into words */
                pmullw mm3, mm1         /* mul low  bytes of Src1 and MM1 */
                pmullw mm4, mm1         /* mul high bytes of Src1 and MM1 */
                /* ** Take abs value of the results (signed words) ** */
                movq mm5, mm3           /* copy mm3 into mm5 */
                movq mm6, mm4           /* copy mm4 into mm6 */
                psraw mm5, 15           /* fill mm5 words with word sign bit */
                psraw mm6, 15           /* fill mm6 words with word sign bit */
                pxor mm3, mm5           /* take 1's complement of only neg words */
                pxor mm4, mm6           /* take 1's complement of only neg words */
                psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
                psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
                packuswb mm3, mm4       /* pack words back into bytes with saturation */
                movq [edi], mm3         /* store result in Dest */
                add eax, 8              /* increase Src1 register pointer by 8 */
                add edi, 8              /* increase Dest register pointer by 8 */
                dec ecx                 /* decrease loop counter */
                jnz L10251              /* check loop termination, proceed if required */
L10252:
                emms                    /* exit MMX state */
                popa
        }
#else
        /* i386 and x86_64 */
        __m64 *mSrc1 = (__m64*)Src1;
        __m64 *mDest = (__m64*)Dest;
        __m64 mm0 = _m_from_int(0);                             /* zero mm0 register */
        /* Duplicate C in 4 words of MM1 */
        int i;
        i = C | C<<16;
        __m64 mm1 = _m_from_int(i);
        __m64 mm2 = _m_from_int(i);
        mm1 = _m_punpckldq(mm1, mm2);                           /* fill higher words of MM1 with C */
        // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48;
        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
        if (C <= 128) {                                         /* if (C <= 128) execute more efficient code */
                for (i = 0; i < SrcLength/8; i++) {
                        __m64 mm3, mm4;
                        mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
                        mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
                        mm3 = _m_pmullw(mm3, mm1);              /* mul low  bytes of Src1 and MM1 */
                        mm4 = _m_pmullw(mm4, mm1);              /* mul high bytes of Src1 and MM1 */
                        *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
                        mSrc1++;
                        mDest++;
                }
        } else {
                for (i = 0; i < SrcLength/8; i++) {
                        __m64 mm3, mm4, mm5, mm6;
                        mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
                        mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
                        mm3 = _m_pmullw(mm3, mm1);              /* mul low  bytes of Src1 and MM1 */
                        mm4 = _m_pmullw(mm4, mm1);              /* mul high bytes of Src1 and MM1 */
                        /* Take abs value of the results (signed words) */
                        mm5 = _m_psrawi(mm3, 15);               /* fill mm5 words with word sign bit */
                        mm6 = _m_psrawi(mm4, 15);               /* fill mm6 words with word sign bit */
                        mm3 = _m_pxor(mm3, mm5);                /* take 1's complement of only neg. words */
                        mm4 = _m_pxor(mm4, mm6);                /* take 1's complement of only neg. words */
                        mm3 = _m_psubsw(mm3, mm5);              /* add 1 to only neg. words, W-(-1) or W-0 */
                        mm4 = _m_psubsw(mm4, mm6);              /* add 1 to only neg. words, W-(-1) or W-0 */
                        *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
                        mSrc1++;
                        mDest++;
                }
        }
        _m_empty();                                             /* clean MMX state */
#endif
        return (0);
#else
        return (-1);
#endif
}
02776 
/**
 * Multiply every byte of a buffer by a constant, saturating at 255:
 * Dest[i] = min(255, Src1[i] * C).
 *
 * When SDL_imageFilterMMXdetect() reports support and length > 7, the
 * leading (length & ~7) bytes are handed to SDL_imageFilterMultByByteMMX()
 * and only the remaining bytes are processed by the C loop.
 *
 * \param Src1   Source byte buffer; must not be NULL.
 * \param Dest   Destination byte buffer; must not be NULL.
 * \param length Number of bytes to process.
 * \param C      Constant multiplier.
 *
 * \return 0 on success (also for length == 0), -1 if Src1 or Dest is NULL.
 */
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
        unsigned int i, istart;
        int iC;
        unsigned char *cursrc1;
        unsigned char *curdest;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Special case: C==1 leaves every byte unchanged, so just copy the
           source into the destination.  memmove keeps this well defined when
           the two buffers overlap or are the same buffer. */
        if (C == 1) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) *cursrc1 * iC;
                if (result > 255)
                        result = 255;
                *curdest = (unsigned char) result;
                /* Advance pointers */
                cursrc1++;
                curdest++;
        }

        return (0);
}
02842 
/**
 * MMX helper for SDL_imageFilterShiftRightAndMultByByte: shift every byte
 * right by N bits, multiply by C and saturate the result to 255.
 *
 * Bytes are widened to 16-bit words, shifted, multiplied with pmullw and
 * packed back with unsigned saturation.
 *
 * Only SrcLength/8 groups of 8 bytes are processed; remaining bytes are left
 * to the caller.  The inline-assembly loop tests its counter after the first
 * iteration, so that path expects SrcLength >= 8.
 *
 * \param Src1      Source byte buffer.
 * \param Dest      Destination byte buffer.
 * \param SrcLength Number of bytes in the buffers.
 * \param N         Number of bits to shift.
 * \param C         Constant multiplier.
 *
 * \return 0 when built with USE_MMX, -1 otherwise.
 */
static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
                                                     unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
        __asm
        {
                pusha
                        /* ** Duplicate C in 4 words of MM1 ** */
                        mov al, C       /* load C into AL */
                        xor ah, ah      /* zero AH */
                        mov bx, ax      /* copy AX into BX */
                        shl eax, 16     /* shift 2 bytes of EAX left */
                        mov ax, bx      /* copy BX into AX */
                        movd mm1, eax           /* copy EAX into MM1 */
                        movd mm2, eax           /* copy EAX into MM2 */
                        punpckldq mm1, mm2      /* fill higher words of MM1 with C */
                        xor ecx, ecx    /* zero ECX */
                        mov cl, N       /* load N into CL */
                        movd mm7, ecx           /* copy N into MM7 */
                        pxor mm0, mm0           /* zero MM0 register */
                        mov eax, Src1           /* load Src1 address into eax */
                        mov edi, Dest           /* load Dest address into edi */
                        mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
                        shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
                        align 16                        /* 16 byte alignment of the loop entry */
L1026:
                movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
                movq mm4, mm3           /* copy MM3 into MM4  */
                        punpcklbw mm3, mm0      /* unpack low  bytes of SrcDest into words */
                        punpckhbw mm4, mm0      /* unpack high bytes of SrcDest into words */
                        psrlw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the right */
                        psrlw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the right */
                        pmullw mm3, mm1         /* mul low  bytes of SrcDest by MM1 */
                        pmullw mm4, mm1         /* mul high bytes of SrcDest by MM1 */
                        packuswb mm3, mm4       /* pack words back into bytes with saturation */
                        movq [edi], mm3         /* store result in Dest */
                        add eax, 8      /* increase Src1 register pointer by 8 */
                        add edi, 8      /* increase Dest register pointer by 8 */
                        dec              ecx            /* decrease loop counter */
                        jnz             L1026           /* check loop termination, proceed if required */
                        emms                            /* exit MMX state */
                        popa
        }
#else
        /* i386 and x86_64 */
        __m64 *mSrc1 = (__m64*)Src1;
        __m64 *mDest = (__m64*)Dest;
        __m64 mm0 = _m_from_int(0);                     /* zero mm0 register */
        /* Duplicate C in 4 words of MM1 */
        int cc = (C<<16)|C;                             /* C in both 16-bit halves */
        __m64 mm1 = _m_from_int(cc);
        __m64 mm2 = _m_from_int(cc);
        unsigned int i;                                 /* unsigned, like the SrcLength/8 bound */
        mm1 = _m_punpckldq(mm1, mm2);                   /* fill higher words of MM1 with C */
        for (i = 0; i < SrcLength/8; i++) {
                __m64 mm3, mm4;
                mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
                mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
                mm3 = _m_psrlwi(mm3, N);                /* shift 4 WORDS of MM3 (N) bits to the right */
                mm4 = _m_psrlwi(mm4, N);                /* shift 4 WORDS of MM4 (N) bits to the right */
                mm3 = _m_pmullw(mm3, mm1);              /* mul low  bytes of Src1 and MM1 */
                mm4 = _m_pmullw(mm4, mm1);              /* mul high bytes of Src1 and MM1 */
                *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
                mSrc1++;
                mDest++;
        }
        _m_empty();                                     /* clean MMX state */
#endif
        return (0);
#else
        return (-1);
#endif
}
02928 
/**
 * Shift every byte right by N bits, multiply by a constant and saturate at
 * 255: Dest[i] = min(255, (Src1[i] >> N) * C).
 *
 * When SDL_imageFilterMMXdetect() reports support and length > 7, the
 * leading (length & ~7) bytes are handed to
 * SDL_imageFilterShiftRightAndMultByByteMMX() and only the remaining bytes
 * are processed by the C loop.
 *
 * \param Src1   Source byte buffer; must not be NULL.
 * \param Dest   Destination byte buffer; must not be NULL.
 * \param length Number of bytes to process.
 * \param N      Number of bits to shift (0..8).
 * \param C      Constant multiplier.
 *
 * \return 0 on success (also for length == 0), -1 if a pointer is NULL or N > 8.
 */
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
                                           unsigned char C)
{
        unsigned int i, istart;
        int iC;
        unsigned char *cursrc1;
        unsigned char *curdest;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /* Check shift */
        if (N > 8) {
                return (-1);
        }

        /* Special case: N==0 && C==1 leaves every byte unchanged, so just
           copy the source into the destination.  memmove keeps this well
           defined when the two buffers overlap or are the same buffer. */
        if ((N == 0) && (C == 1)) {
                memmove(Dest, Src1, length);
                return (0);
        }

        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

                SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);

                /* Check for unaligned bytes */
                if ((length & 7) > 0) {
                        /* Setup to process unaligned bytes */
                        istart = length & 0xfffffff8;
                        cursrc1 = &Src1[istart];
                        curdest = &Dest[istart];
                } else {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        } else {
                /* Setup to process whole image */
                istart = 0;
                cursrc1 = Src1;
                curdest = Dest;
        }

        /* C routine to process image */
        iC = (int) C;
        for (i = istart; i < length; i++) {
                result = (int) (*cursrc1 >> N) * iC;
                if (result > 255)
                        result = 255;
                *curdest = (unsigned char) result;
                /* Advance pointers */
                cursrc1++;
                curdest++;
        }

        return (0);
}
03001 
/**
 * MMX helper for SDL_imageFilterShiftLeftByte: shift bytes left by N bits.
 *
 * The data is shifted as 16-bit words, so bits of the lower byte of each word
 * leak into the upper byte; they are removed again by AND-ing with a bit mask.
 * That mask starts as all ones and is, N times, shifted left by one bit as
 * words and AND-ed with *Mask.
 *
 * Only SrcLength/8 groups of 8 bytes are processed; remaining bytes are left
 * to the caller.  The inline-assembly loops test their counter after the
 * first iteration, so that path expects N >= 1 and SrcLength >= 8.
 *
 * \param Src1      Source byte buffer.
 * \param Dest      Destination byte buffer.
 * \param SrcLength Number of bytes in the buffers.
 * \param N         Number of bits to shift.
 * \param Mask      Pointer to 8 mask bytes applied after every 1-bit mask shift.
 *
 * \return 0 when built with USE_MMX, -1 otherwise.
 */
static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
                                           unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
        __asm
        {
                pusha
                mov edx, Mask           /* load Mask address into edx */
                movq mm0, [edx]         /* load Mask into mm0 */
                xor ecx, ecx            /* zero ECX */
                mov cl, N               /* load loop counter (N) into CL */
                movd mm3, ecx           /* copy (N) into MM3  */
                pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
L10270:                                 /* ** Prepare proper bit-Mask in MM1 ** */
                psllw mm1, 1            /* shift 4 WORDS of MM1 1 bit to the left */
                pand mm1, mm0           /* apply Mask to 8 BYTES of MM1 */
                /*  byte     0x0f, 0xdb, 0xc8 */
                dec cl                  /* decrease loop counter */
                jnz L10270              /* check loop termination, proceed if required */
                /* ** Shift all bytes of the image ** */
                mov eax, Src1           /* load Src1 address into eax */
                mov edi, Dest           /* load Dest address into edi */
                mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
                shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
                align 16                /* 16 byte alignment of the loop entry */
L10271:
                movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
                psllw mm0, mm3          /* shift 4 WORDS of MM0 (N) bits to the left */
                pand mm0, mm1           /* apply proper bit-Mask to 8 BYTES of MM0 */
                /* byte     0x0f, 0xdb, 0xc1 */
                movq [edi], mm0         /* store result in Dest */
                add eax, 8              /* increase Src1 register pointer by 8 */
                add edi, 8              /* increase Dest register pointer by 8 */
                dec ecx                 /* decrease loop counter */
                jnz L10271              /* check loop termination, proceed if required */
                emms                    /* exit MMX state */
                popa
        }
#else
        /* i386 and x86_64 */
        __m64 *mSrc1 = (__m64*)Src1;
        __m64 *mDest = (__m64*)Dest;
        __m64 *mMask = (__m64*)Mask;
        /* Give mm1 a defined value before comparing it with itself; reading an
           uninitialized variable is undefined behaviour in C. */
        __m64 mm1 = _m_from_int(0);
        unsigned int i;
        mm1 = _m_pcmpeqb(mm1, mm1);                     /* generate all 1's in mm1 */
        /* Prepare proper bit-Mask in MM1 */
        for (i = 0; i < N; i++) {
                mm1 = _m_psllwi(mm1, 1);                /* shift 4 WORDS of MM1 1 bit to the left */
                mm1 = _m_pand(mm1, *mMask);             /* apply Mask to 8 BYTES of MM1 */
        }
        /* Shift all bytes of the image */
        for (i = 0; i < SrcLength/8; i++) {
                __m64 mm0 = _m_psllwi(*mSrc1, N);       /* shift 4 WORDS of MM0 (N) bits to the left */
                *mDest = _m_pand(mm0, mm1);             /* apply proper bit-Mask to 8 BYTES of MM0 */
                mSrc1++;
                mDest++;
        }
        _m_empty();                                     /* clean MMX state */
#endif
        return (0);
#else
        return (-1);
#endif
}
03079 
/*!
\brief Filter using ShiftLeftByte: D = (S << N)

Every byte is shifted on its own; bits shifted out of a byte are discarded
(no saturation). Src1 and Dest may refer to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        /* Per-byte mask that clears the bit a word-wide shift drags in from the neighbouring byte */
        static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
        unsigned int i, istart;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);
        if (N > 8)
                return (-1);

        /* Special case: N==0 is a plain copy from source to destination.
           memmove(dest, src, n) is used so that overlapping/in-place buffers are safe. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        /* Let the MMX routine handle all complete 8-byte groups; if it reports
           failure (e.g. MMX support not compiled in) fall back to C for everything. */
        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                if (SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask) == 0) {
                        istart = length & ~7u;
                }
        }

        /* C routine: processes the whole image, or only the bytes MMX left over */
        for (i = istart; i < length; i++) {
                Dest[i] = (unsigned char) (((int) Src1[i] << N) & 0xff);
        }

        return (0);
}
03145 
/*!
\brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)

Processes SrcLength/8 groups of 8 bytes; each group is treated as two 32-bit
doublewords that are shifted left by N bits. Bytes beyond the last complete
8-byte group are not touched by this routine.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 when compiled with USE_MMX, -1 otherwise.
*/
03156 static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
03157 {
03158 #ifdef USE_MMX
03159 #if !defined(GCC__)
03160         __asm
03161         {
03162                 pusha
03163                         mov eax, Src1           /* load Src1 address into eax */
03164                         mov edi, Dest           /* load Dest address into edi */
03165                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03166                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03167                         align 16                        /* 16 byte alignment of the loop entry */
                        /* The loop below decrements before testing, so it expects counter/8 to be
                           non-zero on entry; the visible caller only invokes this with length > 7. */
03168 L12023:
03169                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
03170                 pslld mm0, N    /* shift 2 DWORDS of MM0 (N) bits to the left */
03171                         movq [edi], mm0         /* store result in Dest */
03172                         add eax, 8      /* increase Src1 register pointer by 8 */
03173                         add edi, 8      /* increase Dest register pointer by 8 */
03174                         dec              ecx            /* decrease loop counter */
03175                         jnz             L12023          /* check loop termination, proceed if required */
03176                         emms                            /* exit MMX state */
03177                         popa
03178         }
03179 #else
03180         /* i386 and x86_64 */
03181         __m64 *mSrc1 = (__m64*)Src1;
03182         __m64 *mDest = (__m64*)Dest;
03183         int i;
03184         for (i = 0; i < SrcLength/8; i++) {
03185                 *mDest = _m_pslldi(*mSrc1, N);  /* shift 2 DWORDS of Src1 (N) bits to the left */
03186                 mSrc1++;
03187                 mDest++;
03188         }
03189         _m_empty();                             /* clean MMX state */
03190 #endif
03191         return (0);
03192 #else
03193         return (-1);
03194 #endif
03195 }
03196 
/*!
\brief Filter using ShiftLeftUint: D = ((uint)S << N)

The buffers are processed as a sequence of native-endian unsigned int words.
Only complete words are shifted; trailing bytes that do not fill a whole word
are left untouched in Dest. Src1 and Dest may refer to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        unsigned int i, istart;
        unsigned int word;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);
        if (N > 32)
                return (-1);

        /* Special case: N==0 is a plain copy from source to destination.
           memmove(dest, src, n) is used so that overlapping/in-place buffers are safe. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        /* Let the MMX routine handle all complete 8-byte groups; if it reports
           failure (e.g. MMX support not compiled in) fall back to C for everything. */
        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                if (SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N) == 0) {
                        istart = length & ~7u;
                }
        }

        /* C routine: shift every complete word that remains.
           "length - i" cannot underflow because i never exceeds length, and the
           comparison includes the final word that ends exactly at the buffer end. */
        for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
                /* memcpy avoids unaligned / type-punned access to the byte buffers */
                memcpy(&word, &Src1[i], sizeof(word));
                /* Shifting a 32-bit value by 32 is undefined in C; the result is all zero bits */
                word = (N < 32) ? (word << N) : 0;
                memcpy(&Dest[i], &word, sizeof(word));
        }

        return (0);
}
03266 
/*!
\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)

Processes SrcLength/8 groups of 8 bytes. Each byte is widened to a 16-bit word,
shifted left by N bits and packed back to a byte with unsigned saturation.
Bytes beyond the last complete 8-byte group are not touched by this routine.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 when compiled with USE_MMX, -1 otherwise.
*/
03277 static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
03278 {
03279 #ifdef USE_MMX
03280 #if !defined(GCC__)
03281         __asm
03282         {
03283                 pusha
03284                         xor eax, eax    /* zero EAX */
03285                         mov al, N       /* load N into AL */
03286                         movd mm7, eax           /* copy N into MM7 */
03287                         pxor mm0, mm0           /* zero MM0 register */
03288                         mov eax, Src1           /* load Src1 address into eax */
03289                         mov edi, Dest           /* load Dest address into edi */
03290                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03291                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
                        /* NOTE(review): EAX was reloaded with the Src1 address above, so AL here is
                           the low byte of that pointer rather than N - confirm and fix. The
                           intrinsics branch below tests N directly. */
03292                         cmp al, 7       /* if (N <= 7) execute more efficient code */
03293                         jg             L10281
03294                         align 16                        /* 16 byte alignment of the loop entry */
03295 L10280:
03296                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
03297                 movq mm4, mm3           /* copy MM3 into MM4  */
03298                         punpcklbw mm3, mm0      /* unpack low  bytes of Src1 into words */
03299                         punpckhbw mm4, mm0      /* unpack high bytes of Src1 into words */
03300                         psllw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the left */
03301                         psllw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the left */
03302                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
03303                         movq [edi], mm3         /* store result in Dest */
03304                         add eax, 8      /* increase Src1 register pointer by 8 */
03305                         add edi, 8      /* increase Dest register pointer by 8 */
03306                         dec              ecx            /* decrease loop counter */
03307                         jnz            L10280           /* check loop termination, proceed if required */
03308                         jmp            L10282
03309                         align 16                        /* 16 byte alignment of the loop entry */
03310 L10281:
03311                 movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
03312                 movq mm4, mm3           /* copy MM3 into MM4  */
03313                         punpcklbw mm3, mm0      /* unpack low  bytes of Src1 into words */
03314                         punpckhbw mm4, mm0      /* unpack high bytes of Src1 into words */
03315                         psllw mm3, mm7          /* shift 4 WORDS of MM3 (N) bits to the left */
03316                         psllw mm4, mm7          /* shift 4 WORDS of MM4 (N) bits to the left */
03317                         /* ** Take abs value of the signed words ** */
                        /* (the pack below treats the words as signed, so a word with its top bit
                           set must be made positive to saturate to 255 instead of 0) */
03318                         movq mm5, mm3           /* copy mm3 into mm5 */
03319                         movq mm6, mm4           /* copy mm4 into mm6 */
03320                         psraw mm5, 15           /* fill mm5 words with word sign bit */
03321                         psraw mm6, 15           /* fill mm6 words with word sign bit */
03322                         pxor mm3, mm5           /* take 1's complement of only neg words */
03323                         pxor mm4, mm6           /* take 1's complement of only neg words */
03324                         psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
03325                         psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
03326                         packuswb mm3, mm4       /* pack words back into bytes with saturation */
03327                         movq [edi], mm3         /* store result in Dest */
03328                         add eax, 8      /* increase Src1 register pointer by 8 */
03329                         add edi, 8      /* increase Dest register pointer by 8 */
03330                         dec              ecx            /* decrease loop counter */
03331                         jnz            L10281           /* check loop termination, proceed if required */
03332 L10282:
03333                 emms                            /* exit MMX state */
03334                         popa
03335         }
03336 #else
03337         /* i386 and x86_64 */
03338         __m64 *mSrc1 = (__m64*)Src1;
03339         __m64 *mDest = (__m64*)Dest;
03340         __m64 mm0 = _m_from_int(0);                             /* zero mm0 register */
03341         int i;
03342         if (N <= 7) {                                           /* if (N <= 7) execute more efficient code */
03343                 for (i = 0; i < SrcLength/8; i++) {
03344                         __m64 mm3, mm4;
03345                         mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
03346                         mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
03347                         mm3 = _m_psllwi(mm3, N);                /* shift 4 WORDS of MM3 (N) bits to the left */
03348                         mm4 = _m_psllwi(mm4, N);                /* shift 4 WORDS of MM4 (N) bits to the left */
03349                         *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
03350                         mSrc1++;
03351                         mDest++;
03352                 }
03353         } else {
03354                 for (i = 0; i < SrcLength/8; i++) {
03355                         __m64 mm3, mm4, mm5, mm6;
03356                         mm3 = _m_punpcklbw(*mSrc1, mm0);        /* unpack low  bytes of Src1 into words */
03357                         mm4 = _m_punpckhbw(*mSrc1, mm0);        /* unpack high bytes of Src1 into words */
03358                         mm3 = _m_psllwi(mm3, N);                /* shift 4 WORDS of MM3 (N) bits to the left */
03359                         mm4 = _m_psllwi(mm4, N);                /* shift 4 WORDS of MM4 (N) bits to the left */
03360                         /* Take abs value of the signed words */
03361                         mm5 = _m_psrawi(mm3, 15);               /* fill mm5 words with word sign bit */
03362                         mm6 = _m_psrawi(mm4, 15);               /* fill mm6 words with word sign bit */
03363                         mm3 = _m_pxor(mm3, mm5);                /* take 1's complement of only neg. words */
03364                         mm4 = _m_pxor(mm4, mm6);                /* take 1's complement of only neg. words */
03365                         mm3 = _m_psubsw(mm3, mm5);              /* add 1 to only neg. words, W-(-1) or W-0 */
03366                         mm4 = _m_psubsw(mm4, mm6);              /* add 1 to only neg. words, W-(-1) or W-0 */
03367                         *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
03368                         mSrc1++;
03369                         mDest++;
03370                 }
03371         }
03372         _m_empty();                                             /* clean MMX state */
03373 #endif
03374         return (0);
03375 #else
03376         return (-1);
03377 #endif
03378 }
03379 
/*!
\brief Filter ShiftLeft: D = saturation255(S << N)

Every byte is shifted on its own and any result above 255 is clamped to 255.
Src1 and Dest may refer to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
        unsigned int i, istart;
        int result;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);
        if (N > 8)
                return (-1);

        /* Special case: N==0 is a plain copy from source to destination.
           memmove(dest, src, n) is used so that overlapping/in-place buffers are safe. */
        if (N == 0) {
                memmove(Dest, Src1, length);
                return (0);
        }

        /* Let the MMX routine handle all complete 8-byte groups; if it reports
           failure (e.g. MMX support not compiled in) fall back to C for everything. */
        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                if (SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N) == 0) {
                        istart = length & ~7u;
                }
        }

        /* C routine: processes the whole image, or only the bytes MMX left over */
        for (i = istart; i < length; i++) {
                result = (int) Src1[i] << N;
                if (result > 255)
                        result = 255;
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
03446 
/*!
\brief Internal MMX Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0

Processes SrcLength/8 groups of 8 bytes. Adding (0xFF - T) with unsigned
saturation makes a byte reach 255 exactly when it is >= T; comparing the sum
with 255 then yields 0xFF or 0x00 per byte. Bytes beyond the last complete
8-byte group are not touched by this routine.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 when compiled with USE_MMX, -1 otherwise.
*/
03457 static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
03458 {
03459 #ifdef USE_MMX
03460 #if !defined(GCC__)
03461         __asm
03462         {
03463                 pusha
03464                         /* ** Duplicate T in 8 bytes of MM3 ** */
03465                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
03466                         pcmpeqb mm2, mm2        /* generate all 1's in mm2 */
03467                         mov al, T       /* load T into AL */
03468                         mov ah, al      /* copy AL into AH */
03469                         mov bx, ax      /* copy AX into BX */
03470                         shl eax, 16     /* shift 2 bytes of EAX left */
03471                         mov ax, bx      /* copy BX into AX */
03472                         movd mm3, eax           /* copy EAX into MM3 */
03473                         movd mm4, eax           /* copy EAX into MM4 */
03474                         punpckldq mm3, mm4      /* fill higher bytes of MM3 with T */
03475                         psubusb mm2, mm3        /* store 0xFF - T in MM2 */
03476                         mov eax, Src1           /* load Src1 address into eax */
03477                         mov edi, Dest           /* load Dest address into edi */
03478                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03479                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03480                         align 16                        /* 16 byte alignment of the loop entry */
03481 L1029:
03482                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
03483                 paddusb mm0, mm2        /* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
03484                         pcmpeqb mm0, mm1        /* binarize 255:0, comparing to 255 */
03485                         movq [edi], mm0         /* store result in Dest */
03486                         add eax, 8      /* increase Src1 register pointer by 8 */
03487                         add edi, 8      /* increase Dest register pointer by 8 */
03488                         dec              ecx            /* decrease loop counter */
03489                         jnz             L1029           /* check loop termination, proceed if required */
03490                         emms                            /* exit MMX state */
03491                         popa
03492         }
03493 #else
03494         /* i386 and x86_64 */
03495         __m64 *mSrc1 = (__m64*)Src1;
03496         __m64 *mDest = (__m64*)Dest;
03497         /* Duplicate T in 8 bytes of MM3 */
        /* NOTE(review): mm1 and mm2 are initialised from their own (indeterminate) values; comparing
           a value with itself gives all 1's, but the read of an uninitialised variable should be confirmed
           as acceptable for the supported compilers. */
03498         __m64 mm1 = _m_pcmpeqb(mm1, mm1);                       /* generate all 1's in mm1 */
03499         __m64 mm2 = _m_pcmpeqb(mm2, mm2);                       /* generate all 1's in mm2 */
03500         int i;
03501         memset(&i, T, 4);       /* replicate T into 4 bytes of i */
03502         __m64 mm3 = _m_from_int(i);
03503         __m64 mm4 = _m_from_int(i);
03504         mm3 = _m_punpckldq(mm3, mm4);                   /* fill higher bytes of MM3 with T */
03505         mm2 = _m_psubusb(mm2, mm3);                     /* store 0xFF - T in MM2 */
03506         //__m64 mm3 = _m_from_int64(lli); // x86_64 only
03507         for (i = 0; i < SrcLength/8; i++) {
03508                 __m64 mm0 = _m_paddusb(*mSrc1, mm2);    /* Src1+(0xFF-T) (add 8 bytes with saturation) */
03509                 *mDest = _m_pcmpeqb(mm0, mm1);          /* binarize 255:0, comparing to 255 */
03510                 mSrc1++;
03511                 mDest++;
03512         }
03513         _m_empty();                                     /* clean MMX state */
03514 #endif
03515         return (0);
03516 #else
03517         return (-1);
03518 #endif
03519 }
03520 
/*!
\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
        unsigned int pos = 0;

        /* Reject missing buffers; an empty image is trivially done */
        if (!Src1 || !Dest) {
                return -1;
        }
        if (!length) {
                return 0;
        }

        /* With T==0 every byte satisfies S >= T, so the whole output is 255 */
        if (!T) {
                memset(Dest, 255, length);
                return 0;
        }

        /* MMX takes the complete 8-byte groups; C then resumes at the first
           leftover byte (which is "length" itself when nothing is left over) */
        if (SDL_imageFilterMMXdetect() && length >= 8) {
                SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
                pos = length & ~7u;
        }

        /* Plain C for whatever has not been processed yet */
        while (pos < length) {
                if (Src1[pos] >= T) {
                        Dest[pos] = 255;
                } else {
                        Dest[pos] = 0;
                }
                pos++;
        }

        return 0;
}
03580 
/*!
\brief Internal MMX Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Processes SrcLength/8 groups of 8 bytes using three saturating byte operations
per group: add (0xFF-Tmax), subtract (0xFF-Tmax+Tmin), add Tmin. Bytes beyond
the last complete 8-byte group are not touched by this routine.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 when compiled with USE_MMX, -1 otherwise.
*/
03592 static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
03593                                                                   unsigned char Tmax)
03594 {
03595 #ifdef USE_MMX
03596 #if !defined(GCC__)
03597         __asm
03598         {
03599                 pusha
03600                         pcmpeqb mm1, mm1        /* generate all 1's in mm1 */
03601                         /* ** Duplicate Tmax in 8 bytes of MM3 ** */
03602                         mov al, Tmax    /* load Tmax into AL */
03603                         mov ah, al      /* copy AL into AH */
03604                         mov bx, ax      /* copy AX into BX */
03605                         shl eax, 16     /* shift 2 bytes of EAX left */
03606                         mov ax, bx      /* copy BX into AX */
03607                         movd mm3, eax           /* copy EAX into MM3 */
03608                         movd mm4, eax           /* copy EAX into MM4 */
03609                         punpckldq mm3, mm4      /* fill higher bytes of MM3 with Tmax */
03610                         psubusb mm1, mm3        /* store 0xFF - Tmax in MM1 */
03611                         /* ** Duplicate Tmin in 8 bytes of MM5 ** */
03612                         mov al, Tmin    /* load Tmin into AL */
03613                         mov ah, al      /* copy AL into AH */
03614                         mov bx, ax      /* copy AX into BX */
03615                         shl eax, 16     /* shift 2 bytes of EAX left */
03616                         mov ax, bx      /* copy BX into AX */
03617                         movd mm5, eax           /* copy EAX into MM5 */
03618                         movd mm4, eax           /* copy EAX into MM4 */
03619                         punpckldq mm5, mm4      /* fill higher bytes of MM5 with Tmin */
03620                         movq mm7, mm5           /* copy MM5 into MM7 */
03621                         paddusb mm7, mm1        /* store 0xFF - Tmax + Tmin in MM7 */
03622                         mov eax, Src1           /* load Src1 address into eax */
03623                         mov edi, Dest           /* load Dest address into edi */
03624                         mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
03625                         shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
03626                         align 16                        /* 16 byte alignment of the loop entry */
03627 L1030:
03628                 movq mm0, [eax]         /* load 8 bytes from Src1 into MM0 */
03629                 paddusb mm0, mm1        /* MM0=Src1+(0xFF-Tmax) */
03630                         psubusb mm0, mm7        /* MM0=MM0-(0xFF-Tmax+Tmin) */
03631                         paddusb mm0, mm5        /* MM0=MM0+Tmin */
03632                         movq [edi], mm0         /* store result in Dest */
03633                         add eax, 8      /* increase Src1 register pointer by 8 */
03634                         add edi, 8      /* increase Dest register pointer by 8 */
03635                         dec              ecx            /* decrease loop counter */
03636                         jnz             L1030           /* check loop termination, proceed if required */
03637                         emms                            /* exit MMX state */
03638                         popa
03639         }
03640 #else
03641         /* i386 and x86_64 */
03642         __m64 *mSrc1 = (__m64*)Src1;
03643         __m64 *mDest = (__m64*)Dest;
        /* NOTE(review): mm1 is initialised from its own (indeterminate) value; comparing a value with
           itself gives all 1's, but the read of an uninitialised variable should be confirmed as
           acceptable for the supported compilers. */
03644         __m64 mm1 = _m_pcmpeqb(mm1, mm1);       /* generate all 1's in mm1 */
03645         int i;
03646         /* Duplicate Tmax in 8 bytes of MM3 */
03647         __m64 mm3, mm4;
03648         memset(&i, Tmax, 4);    /* replicate Tmax into 4 bytes of i */
03649         mm3 = _m_from_int(i);
03650         mm4 = _m_from_int(i);
03651         mm3 = _m_punpckldq(mm3, mm4);           /* fill higher bytes of MM3 with Tmax */
03652         mm1 = _m_psubusb(mm1, mm3);             /* store 0xFF - Tmax in MM1 */
03653         //__m64 mm3 = _m_from_int64(lli); // x86_64 only
03654         /* Duplicate Tmin in 8 bytes of MM5 */
03655         __m64 mm5, mm7;
03656         memset(&i, Tmin, 4);    /* replicate Tmin into 4 bytes of i */
03657         mm5 = _m_from_int(i);
03658         mm4 = _m_from_int(i);
03659         mm5 = _m_punpckldq(mm5, mm4);           /* fill higher bytes of MM5 with Tmin */
03660         mm7 = _m_paddusb(mm5, mm1);     /* store 0xFF - Tmax + Tmin in MM7 */
03661         for (i = 0; i < SrcLength/8; i++) {
03662                 __m64 mm0;
03663                 mm0 = _m_paddusb(*mSrc1, mm1);  /* MM0=Src1+(0xFF-Tmax) */
03664                 mm0 = _m_psubusb(mm0, mm7);     /* MM0=MM0-(0xFF-Tmax+Tmin) */
03665                 *mDest = _m_paddusb(mm0, mm5);  /* MM0+Tmin */
03666                 mSrc1++;
03667                 mDest++;
03668         }
03669         _m_empty();                             /* clean MMX state */
03670 #endif
03671         return (0);
03672 #else
03673         return (-1);
03674 #endif
03675 }
03676 
/*!
\brief Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Bytes below Tmin become Tmin, bytes above Tmax become Tmax, everything else is
copied unchanged. Src1 and Dest may refer to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
                                                           unsigned char Tmax)
{
        unsigned int i, istart;

        /* Validate input parameters */
        if ((Src1 == NULL) || (Dest == NULL))
                return (-1);
        if (length == 0)
                return (0);

        /* Special case: Tmin==0 && Tmax==255 covers the full byte range, so nothing
           is clipped and the operation is a plain copy from source to destination.
           memmove(dest, src, n) is used so that overlapping/in-place buffers are safe. */
        if ((Tmin == 0) && (Tmax == 255)) {
                memmove(Dest, Src1, length);
                return (0);
        }

        /* Let the MMX routine handle all complete 8-byte groups; if it reports
           failure (e.g. MMX support not compiled in) fall back to C for everything. */
        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
                if (SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax) == 0) {
                        istart = length & ~7u;
                }
        }

        /* C routine: processes the whole image, or only the bytes MMX left over */
        for (i = istart; i < length; i++) {
                unsigned char value = Src1[i];

                if (value < Tmin) {
                        value = Tmin;
                } else if (value > Tmax) {
                        value = Tmax;
                }
                Dest[i] = value;
        }

        return (0);
}
03744 
/*
 * Internal MMX implementation of NormalizeLinear:
 *
 *     D = saturation255((Nmax - Nmin) / (Cmax - Cmin) * (S - Cmin) + Nmin)
 *
 * The scale factor (Nmax - Nmin) / (Cmax - Cmin) is an unsigned 16-bit integer
 * quotient; when Cmax == Cmin the factor 255 is used instead of dividing by zero.
 *
 * Only the SrcLength/8 complete 8-byte groups are processed; any remaining
 * 1..7 bytes are left for the caller. The inline-assembly variant runs its
 * loop body before testing the counter, so the caller must pass SrcLength >= 8.
 *
 * Src1      - pointer to the source bytes (S)
 * Dest      - pointer to the destination bytes (D)
 * SrcLength - number of bytes in the source array
 * Cmin/Cmax - bounds of the input range
 * Nmin/Nmax - bounds of the output range
 *
 * Returns 0 when built with USE_MMX, or -1 when no MMX code was compiled in
 * (in which case Dest is not touched).
 */
static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
                                             int Nmin, int Nmax)
{
#ifdef USE_MMX
#if !defined(GCC__)
        __asm
        {
                pusha
                mov ax, WORD PTR Nmax           /* load Nmax in AX */
                mov bx, WORD PTR Cmax           /* load Cmax in BX */
                sub ax, WORD PTR Nmin           /* AX = Nmax - Nmin */
                sub bx, WORD PTR Cmin           /* BX = Cmax - Cmin */
                jz             L10311           /* check division by zero */
                xor dx, dx      /* prepare for division, zero DX */
                div               bx            /* AX = AX/BX */
                jmp            L10312
L10311:
                mov ax, 255     /* if div by zero, assume result max byte value */
L10312:                         /* ** Duplicate AX in 4 words of MM0 ** */
                mov bx, ax      /* copy AX into BX */
                shl eax, 16     /* shift 2 bytes of EAX left */
                mov ax, bx      /* copy BX into AX */
                movd mm0, eax           /* copy EAX into MM0 */
                movd mm1, eax           /* copy EAX into MM1 */
                punpckldq mm0, mm1      /* fill higher words of MM0 with AX */
                /* ** Duplicate Cmin in 4 words of MM1 ** */
                mov ax, WORD PTR Cmin           /* load Cmin into AX */
                mov bx, ax      /* copy AX into BX */
                shl eax, 16     /* shift 2 bytes of EAX left */
                mov ax, bx      /* copy BX into AX */
                movd mm1, eax           /* copy EAX into MM1 */
                movd mm2, eax           /* copy EAX into MM2 */
                punpckldq mm1, mm2      /* fill higher words of MM1 with Cmin */
                /* ** Duplicate Nmin in 4 words of MM2 ** */
                mov ax, WORD PTR Nmin           /* load Nmin into AX */
                mov bx, ax      /* copy AX into BX */
                shl eax, 16     /* shift 2 bytes of EAX left */
                mov ax, bx      /* copy BX into AX */
                movd mm2, eax           /* copy EAX into MM2 */
                movd mm3, eax           /* copy EAX into MM3 */
                punpckldq mm2, mm3      /* fill higher words of MM2 with Nmin */
                pxor mm7, mm7           /* zero MM7 register */
                mov eax, Src1           /* load Src1 address into eax */
                mov edi, Dest           /* load Dest address into edi */
                mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
                shr ecx, 3      /* counter/8 (MMX loads 8 bytes at a time) */
                align 16                        /* 16 byte alignment of the loop entry */
L1031:
                movq mm3, [eax]         /* load 8 bytes from Src1 into MM3 */
                movq mm4, mm3           /* copy MM3 into MM4  */
                punpcklbw mm3, mm7      /* unpack low  bytes of Src1 into words */
                punpckhbw mm4, mm7      /* unpack high bytes of Src1 into words */
                psubusb mm3, mm1        /* S-Cmin, low  bytes */
                psubusb mm4, mm1        /* S-Cmin, high bytes */
                pmullw mm3, mm0         /* MM0*(S-Cmin), low  bytes */
                pmullw mm4, mm0         /* MM0*(S-Cmin), high bytes */
                paddusb mm3, mm2        /* MM0*(S-Cmin)+Nmin, low  bytes */
                paddusb mm4, mm2        /* MM0*(S-Cmin)+Nmin, high bytes */
                /* ** Take abs value of the signed words ** */
                movq mm5, mm3           /* copy mm3 into mm5 */
                movq mm6, mm4           /* copy mm4 into mm6 */
                psraw mm5, 15           /* fill mm5 words with word sign bit */
                psraw mm6, 15           /* fill mm6 words with word sign bit */
                pxor mm3, mm5           /* take 1's complement of only neg words */
                pxor mm4, mm6           /* take 1's complement of only neg words */
                psubsw mm3, mm5         /* add 1 to only neg words, W-(-1) or W-0 */
                psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
                packuswb mm3, mm4       /* pack words back into bytes with saturation */
                movq [edi], mm3         /* store result in Dest */
                add eax, 8      /* increase Src1 register pointer by 8 */
                add edi, 8      /* increase Dest register pointer by 8 */
                dec              ecx            /* decrease loop counter */
                jnz             L1031           /* check loop termination, proceed if required */
                emms                            /* exit MMX state */
                popa
        }
#else
        /* i386 and x86_64 */
        __m64 *mSrc1 = (__m64*)Src1;
        __m64 *mDest = (__m64*)Dest;
        __m64 mm0, mm1, mm2, mm7;
        unsigned int word;              /* 16-bit value about to be replicated */
        unsigned int blocks, n;         /* number of 8-byte groups / loop counter */

        /* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
        unsigned short a = (unsigned short)(Nmax - Nmin);
        unsigned short b = (unsigned short)(Cmax - Cmin);
        if (b == 0) {
                a = 255;        /* if div by zero, assume result max byte value */
        } else {
                a /= b;
        }
        /*
         * Build each replicated dword in unsigned arithmetic from the low 16
         * bits only, exactly as the assembly path does: a sign-extending cast
         * would set the whole upper word whenever bit 15 of the value is set.
         */
        word = a;
        mm0 = _m_from_int((int)((word << 16) | word));
        mm0 = _m_punpckldq(mm0, mm0);                   /* fill higher words of MM0 with the factor */
        /* Duplicate Cmin in 4 words of MM1 */
        word = (unsigned int)Cmin & 0xffffu;
        mm1 = _m_from_int((int)((word << 16) | word));
        mm1 = _m_punpckldq(mm1, mm1);                   /* fill higher words of MM1 with Cmin */
        /* Duplicate Nmin in 4 words of MM2 */
        word = (unsigned int)Nmin & 0xffffu;
        mm2 = _m_from_int((int)((word << 16) | word));
        mm2 = _m_punpckldq(mm2, mm2);                   /* fill higher words of MM2 with Nmin */
        mm7 = _m_from_int(0);                           /* zero MM7 register */

        blocks = SrcLength / 8;                         /* MMX handles 8 bytes per iteration */
        for (n = 0; n < blocks; n++) {
                __m64 mm3, mm4, mm5, mm6;
                mm3 = _m_punpcklbw(*mSrc1, mm7);        /* unpack low  bytes of Src1 into words */
                mm4 = _m_punpckhbw(*mSrc1, mm7);        /* unpack high bytes of Src1 into words */
                mm3 = _m_psubusb(mm3, mm1);             /* S-Cmin, low  bytes */
                mm4 = _m_psubusb(mm4, mm1);             /* S-Cmin, high bytes */
                mm3 = _m_pmullw(mm3, mm0);              /* MM0*(S-Cmin), low  bytes */
                mm4 = _m_pmullw(mm4, mm0);              /* MM0*(S-Cmin), high bytes */
                mm3 = _m_paddusb(mm3, mm2);             /* MM0*(S-Cmin)+Nmin, low  bytes */
                mm4 = _m_paddusb(mm4, mm2);             /* MM0*(S-Cmin)+Nmin, high bytes */
                /* Take abs value of the signed words */
                mm5 = _m_psrawi(mm3, 15);               /* fill mm5 words with word sign bit */
                mm6 = _m_psrawi(mm4, 15);               /* fill mm6 words with word sign bit */
                mm3 = _m_pxor(mm3, mm5);                /* take 1's complement of only neg. words */
                mm4 = _m_pxor(mm4, mm6);                /* take 1's complement of only neg. words */
                mm3 = _m_psubsw(mm3, mm5);              /* add 1 to only neg. words, W-(-1) or W-0 */
                mm4 = _m_psubsw(mm4, mm6);              /* add 1 to only neg. words, W-(-1) or W-0 */
                *mDest = _m_packuswb(mm3, mm4);         /* pack words back into bytes with saturation */
                mSrc1++;
                mDest++;
        }
        _m_empty();                                     /* clean MMX state */
#endif
        return (0);
#else
        return (-1);
#endif
}
03892 
/*
 * Filter using NormalizeLinear:
 *
 *     D = saturation255((Nmax - Nmin) / (Cmax - Cmin) * (S - Cmin) + Nmin)
 *
 * The scale factor is the truncated integer quotient (Nmax - Nmin) / (Cmax - Cmin).
 *
 * Src       - pointer to the source bytes (S)
 * Dest      - pointer to the destination bytes (D)
 * length    - number of bytes to process
 * Cmin/Cmax - bounds of the input range
 * Nmin/Nmax - bounds of the output range
 *
 * Returns 0 for success or -1 when Src or Dest is NULL.
 */
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
                                   int Nmax)
{
        unsigned int i, istart;
        int dN, dC, factor;
        int result;

        /* Validate input parameters */
        if ((Src == NULL) || (Dest == NULL))
                return(-1);
        if (length == 0)
                return(0);

        /*
         * Let the MMX routine handle the complete 8-byte groups. Its return
         * value is checked: when MMX is detected at run time but the routine
         * was compiled without MMX support it returns -1 and has written
         * nothing, so the C loop below must then cover the whole image.
         */
        istart = 0;
        if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
            (SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax) == 0)) {
                /* Only the unaligned tail (if any) is left for the C loop */
                istart = length & 0xfffffff8;
                if (istart == length) {
                        /* No unaligned bytes - we are done */
                        return (0);
                }
        }

        /* C routine to process image */
        dC = Cmax - Cmin;
        if (dC == 0) {
                /*
                 * Degenerate input range: this C path writes nothing.
                 * NOTE(review): the MMX routine substitutes a factor of 255
                 * here instead - confirm which behaviour is intended.
                 */
                return (0);
        }
        dN = Nmax - Nmin;
        factor = dN / dC;
        for (i = istart; i < length; i++) {
                result = factor * ((int) Src[i] - Cmin) + Nmin;
                /* Saturate to the byte range; a negative result must not wrap on the cast */
                if (result > 255) {
                        result = 255;
                } else if (result < 0) {
                        result = 0;
                }
                Dest[i] = (unsigned char) result;
        }

        return (0);
}
03960 
03961 /* ------------------------------------------------------------------------------------ */
03962 
/*
 * Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )
 *
 * Convolves the interior of a rows x columns byte image with a 3x3 kernel and
 * divides each sum by Divisor; results are saturated to 0..255. Only pixels
 * that are at least one row/column away from the border are written to Dest.
 *
 * Src     - pointer to the source image bytes (rows * columns)
 * Dest    - pointer to the destination image bytes (rows * columns)
 * rows    - number of image rows, must be >= 3
 * columns - number of image columns, must be >= 3
 * Kernel  - kernel coefficients; the MMX code loads three rows of four signed
 *           16-bit words each (12 words), the fourth word of every row being
 *           the padding column shown as 0 in the comments below
 * Divisor - value the weighted sum is divided by, must be > 0
 *
 * NOTE(review): the MMX code loads 8 source bytes per row for every output
 * pixel, so near the end of the image it appears to read a few bytes past the
 * rows * columns buffer - confirm callers provide readable padding.
 *
 * Returns 0 for success. Returns -1 for invalid parameters, when MMX is not
 * detected at run time, or when this build contains no MMX implementation of
 * the filter (there is no portable C implementation yet).
 */
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
                                           signed short *Kernel, unsigned char Divisor)
{
        /* Validate input parameters */
        if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
                return(-1);

        if ((columns < 3) || (rows < 3) || (Divisor == 0))
                return (-1);

        if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
                __asm
                {
                        pusha
                        pxor mm0, mm0           /* zero MM0 */
                        xor ebx, ebx    /* zero EBX */
                        mov bl, Divisor         /* load Divisor into BL */
                        mov edx, Kernel         /* load Kernel address into EDX */
                        movq mm5, [edx]         /* MM5 = {0,K2,K1,K0} */
                        add edx, 8      /* second row              |K0 K1 K2 0| */
                        movq mm6, [edx]         /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
                        add edx, 8      /* third row               |K6 K7 K8 0| */
                        movq mm7, [edx]         /* MM7 = {0,K8,K7,K6} */
                        /* ---, */
                        mov eax, columns        /* load columns into EAX */
                        mov esi, Src    /* ESI = Src row 0 address */
                        mov edi, Dest           /* load Dest address to EDI */
                        add edi, eax    /* EDI = EDI + columns */
                        inc              edi            /* 1 byte offset from the left edge */
                        mov edx, rows           /* initialize ROWS counter */
                        sub edx, 2      /* do not use first and last row */
                        /* ---, */
L10320:
                        mov ecx, eax    /* initialize COLUMNS counter */
                        sub ecx, 2      /* do not use first and last column */
                        align 16                        /* 16 byte alignment of the loop entry */
L10322:
                        /* ---, */
                        movq mm1, [esi]         /* load 8 bytes of the image first row */
                        add esi, eax    /* move one row below */
                        movq mm2, [esi]         /* load 8 bytes of the image second row */
                        add esi, eax    /* move one row below */
                        movq mm3, [esi]         /* load 8 bytes of the image third row */
                        punpcklbw mm1, mm0      /* unpack first 4 bytes into words */
                        punpcklbw mm2, mm0      /* unpack first 4 bytes into words */
                        punpcklbw mm3, mm0      /* unpack first 4 bytes into words */
                        pmullw mm1, mm5         /* multiply words first row  image*Kernel */
                        pmullw mm2, mm6         /* multiply words second row image*Kernel */
                        pmullw mm3, mm7         /* multiply words third row  image*Kernel */
                        paddsw mm1, mm2         /* add 4 words of the first and second rows */
                        paddsw mm1, mm3         /* add 4 words of the third row and result */
                        movq mm2, mm1           /* copy MM1 into MM2 */
                        psrlq mm1, 32           /* shift 2 left words to the right */
                        paddsw mm1, mm2         /* add 2 left and 2 right result words */
                        movq mm3, mm1           /* copy MM1 into MM3 */
                        psrlq mm1, 16           /* shift 1 left word to the right */
                        paddsw mm1, mm3         /* add 1 left and 1 right result words */
                        /* --, */
                        movd mm2, eax           /* save EAX in MM2 */
                        movd mm3, edx           /* save EDX in MM3 */
                        movd eax, mm1           /* copy MM1 into EAX */
                        psraw mm1, 15           /* spread sign bit of the result */
                        movd edx, mm1           /* fill EDX with a sign bit */
                        idiv bx         /* IDIV - VERY EXPENSIVE */
                        movd mm1, eax           /* move result of division into MM1 */
                        packuswb mm1, mm0       /* pack division result with saturation */
                        movd eax, mm1           /* copy saturated result into EAX */
                        mov [edi], al           /* copy a byte result into Dest */
                        movd edx, mm3           /* restore saved EDX */
                        movd eax, mm2           /* restore saved EAX */
                        /* --, */
                        sub esi, eax    /* move two rows up */
                        sub esi, eax    /* */
                        inc              esi            /* move Src  pointer to the next pixel */
                        inc              edi            /* move Dest pointer to the next pixel */
                        /* ---, */
                        dec              ecx            /* decrease loop counter COLUMNS */
                        jnz            L10322           /* check loop termination, proceed if required */
                        add esi, 2      /* move to the next row in Src */
                        add edi, 2      /* move to the next row in Dest */
                        dec              edx            /* decrease loop counter ROWS */
                        jnz            L10320           /* check loop termination, proceed if required */
                        /* ---, */
                        emms                            /* exit MMX state */
                        popa
                }
#else
                asm volatile
                        ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
                        "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
                        "mov           %5, %%bl \n\t"   /* load Divisor into BL */
                        "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
                        "movq    (%%edx), %%mm5 \n\t"   /* MM5 = {0,K2,K1,K0} */
                        "add          $8, %%edx \n\t"   /* second row              |K0 K1 K2 0| */
                        "movq    (%%edx), %%mm6 \n\t"   /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
                        "add          $8, %%edx \n\t"   /* third row               |K6 K7 K8 0| */
                        "movq    (%%edx), %%mm7 \n\t"   /* MM7 = {0,K8,K7,K6} */
                        /* --- */
                        "mov          %3, %%eax \n\t"   /* load columns into EAX */
                        "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
                        "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
                        "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
                        "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
                        "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
                        "sub          $2, %%edx \n\t"   /* do not use first and last row */
                        /* --- */
                        ".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
                        "sub          $2, %%ecx \n\t"   /* do not use first and last column */
                        ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
                        ".L10322:               \n\t"
                        /* --- */
                        "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the image first row */
                        "add       %%eax, %%esi \n\t"   /* move one row below */
                        "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes of the image second row */
                        "add       %%eax, %%esi \n\t"   /* move one row below */
                        "movq    (%%esi), %%mm3 \n\t"   /* load 8 bytes of the image third row */
                        "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first 4 bytes into words */
                        "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack first 4 bytes into words */
                        "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack first 4 bytes into words */
                        "pmullw    %%mm5, %%mm1 \n\t"   /* multiply words first row  image*Kernel */
                        "pmullw    %%mm6, %%mm2 \n\t"   /* multiply words second row image*Kernel */
                        "pmullw    %%mm7, %%mm3 \n\t"   /* multiply words third row  image*Kernel */
                        "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the first and second rows */
                        "paddsw    %%mm3, %%mm1 \n\t"   /* add 4 words of the third row and result */
                        "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
                        "psrlq       $32, %%mm1 \n\t"   /* shift 2 left words to the right */
                        "paddsw    %%mm2, %%mm1 \n\t"   /* add 2 left and 2 right result words */
                        "movq      %%mm1, %%mm3 \n\t"   /* copy MM1 into MM3 */
                        "psrlq       $16, %%mm1 \n\t"   /* shift 1 left word to the right */
                        "paddsw    %%mm3, %%mm1 \n\t"   /* add 1 left and 1 right result words */
                        /* -- */
                        "movd      %%eax, %%mm2 \n\t"   /* save EAX in MM2 */
                        "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
                        "movd      %%mm1, %%eax \n\t"   /* copy MM1 into EAX */
                        "psraw       $15, %%mm1 \n\t"   /* spread sign bit of the result */
                        "movd      %%mm1, %%edx \n\t"   /* fill EDX with a sign bit */
                        "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
                        "movd      %%eax, %%mm1 \n\t"   /* move result of division into MM1 */
                        "packuswb  %%mm0, %%mm1 \n\t"   /* pack division result with saturation */
                        "movd      %%mm1, %%eax \n\t"   /* copy saturated result into EAX */
                        "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
                        "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
                        "movd      %%mm2, %%eax \n\t"   /* restore saved EAX */
                        /* -- */
                        "sub       %%eax, %%esi \n\t"   /* move two rows up */
                        "sub       %%eax, %%esi \n\t"   /* */
                        "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
                        "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
                        /* --- */
                        "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
                        "jnz            .L10322 \n\t"   /* check loop termination, proceed if required */
                        "add          $2, %%esi \n\t"   /* move to the next row in Src */
                        "add          $2, %%edi \n\t"   /* move to the next row in Dest */
                        "dec              %%edx \n\t"   /* decrease loop counter ROWS */
                        "jnz            .L10320 \n\t"   /* check loop termination, proceed if required */
                        /* --- */
                        "emms                   \n\t"   /* exit MMX state */
                        "popa                   \n\t":"=m" (Dest)       /* %0 */
                        :"m"(Src),              /* %1 */
                        "m"(rows),              /* %2 */
                        "m"(columns),           /* %3 */
                        "m"(Kernel),            /* %4 */
                        "m"(Divisor)            /* %5 */
                        );
#endif
                return (0);
#else
                /*
                 * MMX was detected at run time, but this build carries no MMX
                 * implementation of the filter: report failure instead of
                 * claiming success while leaving Dest untouched.
                 */
                return (-1);
#endif
        } else {
                /* No non-MMX implementation yet */
                return (-1);
        }
}
04152 
04167 int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04168                                                                                    signed short *Kernel, unsigned char Divisor)
04169 {
04170         /* Validate input parameters */
04171         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04172                 return(-1);
04173 
04174         if ((columns < 5) || (rows < 5) || (Divisor == 0))
04175                 return (-1);
04176 
04177         if ((SDL_imageFilterMMXdetect())) {
04178 //#ifdef USE_MMX
04179 #if defined(USE_MMX) && defined(i386)
04180 #if !defined(GCC__)
04181                 __asm
04182                 {
04183                         pusha
04184                                 pxor mm0, mm0           /* zero MM0 */
04185                                 xor ebx, ebx    /* zero EBX */
04186                                 mov bl, Divisor         /* load Divisor into BL */
04187                                 movd mm5, ebx           /* copy Divisor into MM5 */
04188                                 mov edx, Kernel         /* load Kernel address into EDX */
04189                                 mov esi, Src    /* load Src  address to ESI */
04190                                 mov edi, Dest           /* load Dest address to EDI */
04191                                 add edi, 2      /* 2 column offset from the left edge */
04192                                 mov eax, columns        /* load columns into EAX */
04193                                 shl eax, 1      /* EAX = columns * 2 */
04194                                 add edi, eax    /* 2 row offset from the top edge */
04195                                 shr eax, 1      /* EAX = columns */
04196                                 mov ebx, rows           /* initialize ROWS counter */
04197                                 sub ebx, 4      /* do not use first 2 and last 2 rows */
04198                                 /* ---, */
04199 L10330:
04200                         mov ecx, eax    /* initialize COLUMNS counter */
04201                                 sub ecx, 4      /* do not use first 2 and last 2 columns */
04202                                 align 16                        /* 16 byte alignment of the loop entry */
04203 L10332:
04204                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04205                                 movd mm6, esi           /* save ESI in MM6 */
04206                                 /* --- 1 */
04207                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04208                         movq mm2, mm1           /* copy MM1 into MM2 */
04209                                 add esi, eax    /* move Src pointer 1 row below */
04210                                 movq mm3, [edx]         /* load 4 words of Kernel */
04211                         add edx, 8      /* move pointer to other 4 words */
04212                                 movq mm4, [edx]         /* load 4 words of Kernel */
04213                         add edx, 8      /* move pointer to other 4 words */
04214                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04215                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04216                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04217                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04218                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04219                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04220                                 /* --- 2 */
04221                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04222                         movq mm2, mm1           /* copy MM1 into MM2 */
04223                                 add esi, eax    /* move Src pointer 1 row below */
04224                                 movq mm3, [edx]         /* load 4 words of Kernel */
04225                         add edx, 8      /* move pointer to other 4 words */
04226                                 movq mm4, [edx]         /* load 4 words of Kernel */
04227                         add edx, 8      /* move pointer to other 4 words */
04228                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04229                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04230                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04231                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04232                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04233                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04234                                 /* --- 3 */
04235                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04236                         movq mm2, mm1           /* copy MM1 into MM2 */
04237                                 add esi, eax    /* move Src pointer 1 row below */
04238                                 movq mm3, [edx]         /* load 4 words of Kernel */
04239                         add edx, 8      /* move pointer to other 4 words */
04240                                 movq mm4, [edx]         /* load 4 words of Kernel */
04241                         add edx, 8      /* move pointer to other 4 words */
04242                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04243                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04244                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04245                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04246                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04247                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04248                                 /* --- 4 */
04249                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04250                         movq mm2, mm1           /* copy MM1 into MM2 */
04251                                 add esi, eax    /* move Src pointer 1 row below */
04252                                 movq mm3, [edx]         /* load 4 words of Kernel */
04253                         add edx, 8      /* move pointer to other 4 words */
04254                                 movq mm4, [edx]         /* load 4 words of Kernel */
04255                         add edx, 8      /* move pointer to other 4 words */
04256                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04257                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04258                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04259                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04260                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04261                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04262                                 /* --- 5 */
04263                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04264                         movq mm2, mm1           /* copy MM1 into MM2 */
04265                                 movq mm3, [edx]         /* load 4 words of Kernel */
04266                         add edx, 8      /* move pointer to other 4 words */
04267                                 movq mm4, [edx]         /* load 4 words of Kernel */
04268                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04269                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04270                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04271                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04272                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04273                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04274                                 /* ---, */
04275                                 movq mm3, mm7           /* copy MM7 into MM3 */
04276                                 psrlq mm7, 32           /* shift 2 left words to the right */
04277                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
04278                                 movq mm2, mm7           /* copy MM7 into MM2 */
04279                                 psrlq mm7, 16           /* shift 1 left word to the right */
04280                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
04281                                 /* ---, */
04282                                 movd mm1, eax           /* save EAX in MM1 */
04283                                 movd mm2, ebx           /* save EBX in MM2 */
04284                                 movd mm3, edx           /* save EDX in MM3 */
04285                                 movd eax, mm7           /* load summation result into EAX */
04286                                 psraw mm7, 15           /* spread sign bit of the result */
04287                                 movd ebx, mm5           /* load Divisor into EBX */
04288                                 movd edx, mm7           /* fill EDX with a sign bit */
04289                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04290                                 movd mm7, eax           /* move result of division into MM7 */
04291                                 packuswb mm7, mm0       /* pack division result with saturation */
04292                                 movd eax, mm7           /* copy saturated result into EAX */
04293                                 mov [edi], al           /* copy a byte result into Dest */
04294                                 movd edx, mm3           /* restore saved EDX */
04295                                 movd ebx, mm2           /* restore saved EBX */
04296                                 movd eax, mm1           /* restore saved EAX */
04297                                 /* --, */
04298                                 movd esi, mm6           /* move Src pointer to the top pixel */
04299                                 sub edx, 72     /* EDX = Kernel address */
04300                                 inc              esi            /* move Src  pointer to the next pixel */
04301                                 inc              edi            /* move Dest pointer to the next pixel */
04302                                 /* ---, */
04303                                 dec              ecx            /* decrease loop counter COLUMNS */
04304                                 jnz            L10332           /* check loop termination, proceed if required */
04305                                 add esi, 4      /* move to the next row in Src */
04306                                 add edi, 4      /* move to the next row in Dest */
04307                                 dec              ebx            /* decrease loop counter ROWS */
04308                                 jnz            L10330           /* check loop termination, proceed if required */
04309                                 /* ---, */
04310                                 emms                            /* exit MMX state */
04311                                 popa
04312                 }
04313 #else
04314                 asm volatile
04315                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04316                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04317                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04318                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
04319                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04320                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
04321                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04322                         "add          $2, %%edi \n\t"   /* 2 column offset from the left edge */
04323                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04324                         "shl          $1, %%eax \n\t"   /* EAX = columns * 2 */
04325                         "add       %%eax, %%edi \n\t"   /* 2 row offset from the top edge */
04326                         "shr          $1, %%eax \n\t"   /* EAX = columns */
04327                         "mov          %2, %%ebx \n\t"   /* initialize ROWS counter */
04328                         "sub          $4, %%ebx \n\t"   /* do not use first 2 and last 2 rows */
04329                         /* --- */
04330                         ".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04331                         "sub          $4, %%ecx \n\t"   /* do not use first 2 and last 2 columns */
04332                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04333                         ".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
04334                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
04335                         /* --- 1 */
04336                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04337                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04338                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04339                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04340                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04341                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04342                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04343                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04344                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04345                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04346                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04347                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04348                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04349                         /* --- 2 */
04350                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04351                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04352                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04353                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04354                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04355                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04356                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04357                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04358                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04359                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04360                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04361                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04362                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04363                         /* --- 3 */
04364                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04365                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04366                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04367                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04368                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04369                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04370                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04371                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04372                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04373                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04374                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04375                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04376                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04377                         /* --- 4 */
04378                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04379                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04380                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04381                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04382                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04383                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04384                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04385                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04386                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04387                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04388                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04389                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04390                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04391                         /* --- 5 */
04392                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04393                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04394                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04395                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04396                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04397                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04398                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04399                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04400                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04401                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04402                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04403                         /* --- */
04404                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
04405                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
04406                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
04407                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
04408                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
04409                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
04410                         /* --- */
04411                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
04412                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
04413                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04414                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
04415                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
04416                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
04417                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
04418                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04419                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
04420                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
04421                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
04422                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
04423                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
04424                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
04425                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
04426                         /* -- */
04427                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
04428                         "sub         $72, %%edx \n\t"   /* EDX = Kernel address */
04429                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
04430                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
04431                         /* --- */
04432                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
04433                         "jnz            .L10332 \n\t"   /* check loop termination, proceed if required */
04434                         "add          $4, %%esi \n\t"   /* move to the next row in Src */
04435                         "add          $4, %%edi \n\t"   /* move to the next row in Dest */
04436                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
04437                         "jnz            .L10330 \n\t"   /* check loop termination, proceed if required */
04438                         /* --- */
04439                         "emms                   \n\t"   /* exit MMX state */
04440                         "popa                   \n\t":"=m" (Dest)       /* %0 */
04441                         :"m"(Src),              /* %1 */
04442                         "m"(rows),              /* %2 */
04443                         "m"(columns),           /* %3 */
04444                         "m"(Kernel),            /* %4 */
04445                         "m"(Divisor)            /* %5 */
04446                         );
04447 #endif
04448 #endif
04449                 return (0);
04450         } else {
04451                 /* No non-MMX implementation yet */
04452                 return (-1);
04453         }
04454 }
04455 
04470 int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04471                                                                                    signed short *Kernel, unsigned char Divisor)
04472 {
04473         /* Validate input parameters */
04474         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04475                 return(-1);
04476 
04477         if ((columns < 7) || (rows < 7) || (Divisor == 0))
04478                 return (-1);
04479 
04480         if ((SDL_imageFilterMMXdetect())) {
04481 //#ifdef USE_MMX
04482 #if defined(USE_MMX) && defined(i386)
04483 #if !defined(GCC__)
04484                 __asm
04485                 {
04486                         pusha
04487                                 pxor mm0, mm0           /* zero MM0 */
04488                                 xor ebx, ebx    /* zero EBX */
04489                                 mov bl, Divisor         /* load Divisor into BL */
04490                                 movd mm5, ebx           /* copy Divisor into MM5 */
04491                                 mov edx, Kernel         /* load Kernel address into EDX */
04492                                 mov esi, Src    /* load Src  address to ESI */
04493                                 mov edi, Dest           /* load Dest address to EDI */
04494                                 add edi, 3      /* 3 column offset from the left edge */
04495                                 mov eax, columns        /* load columns into EAX */
04496                                 add edi, eax    /* 3 row offset from the top edge */
04497                                 add edi, eax
04498                                 add edi, eax
04499                                 mov ebx, rows           /* initialize ROWS counter */
04500                                 sub ebx, 6      /* do not use first 3 and last 3 rows */
04501                                 /* ---, */
04502 L10340:
04503                         mov ecx, eax    /* initialize COLUMNS counter */
04504                                 sub ecx, 6      /* do not use first 3 and last 3 columns */
04505                                 align 16                        /* 16 byte alignment of the loop entry */
04506 L10342:
04507                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04508                                 movd mm6, esi           /* save ESI in MM6 */
04509                                 /* --- 1 */
04510                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04511                         movq mm2, mm1           /* copy MM1 into MM2 */
04512                                 add esi, eax    /* move Src pointer 1 row below */
04513                                 movq mm3, [edx]         /* load 4 words of Kernel */
04514                         add edx, 8      /* move pointer to other 4 words */
04515                                 movq mm4, [edx]         /* load 4 words of Kernel */
04516                         add edx, 8      /* move pointer to other 4 words */
04517                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04518                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04519                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04520                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04521                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04522                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04523                                 /* --- 2 */
04524                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04525                         movq mm2, mm1           /* copy MM1 into MM2 */
04526                                 add esi, eax    /* move Src pointer 1 row below */
04527                                 movq mm3, [edx]         /* load 4 words of Kernel */
04528                         add edx, 8      /* move pointer to other 4 words */
04529                                 movq mm4, [edx]         /* load 4 words of Kernel */
04530                         add edx, 8      /* move pointer to other 4 words */
04531                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04532                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04533                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04534                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04535                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04536                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04537                                 /* --- 3 */
04538                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04539                         movq mm2, mm1           /* copy MM1 into MM2 */
04540                                 add esi, eax    /* move Src pointer 1 row below */
04541                                 movq mm3, [edx]         /* load 4 words of Kernel */
04542                         add edx, 8      /* move pointer to other 4 words */
04543                                 movq mm4, [edx]         /* load 4 words of Kernel */
04544                         add edx, 8      /* move pointer to other 4 words */
04545                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04546                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04547                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04548                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04549                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04550                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04551                                 /* --- 4 */
04552                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04553                         movq mm2, mm1           /* copy MM1 into MM2 */
04554                                 add esi, eax    /* move Src pointer 1 row below */
04555                                 movq mm3, [edx]         /* load 4 words of Kernel */
04556                         add edx, 8      /* move pointer to other 4 words */
04557                                 movq mm4, [edx]         /* load 4 words of Kernel */
04558                         add edx, 8      /* move pointer to other 4 words */
04559                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04560                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04561                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04562                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04563                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04564                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04565                                 /* --- 5 */
04566                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04567                         movq mm2, mm1           /* copy MM1 into MM2 */
04568                                 add esi, eax    /* move Src pointer 1 row below */
04569                                 movq mm3, [edx]         /* load 4 words of Kernel */
04570                         add edx, 8      /* move pointer to other 4 words */
04571                                 movq mm4, [edx]         /* load 4 words of Kernel */
04572                         add edx, 8      /* move pointer to other 4 words */
04573                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04574                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04575                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04576                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04577                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04578                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04579                                 /* --- 6 */
04580                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04581                         movq mm2, mm1           /* copy MM1 into MM2 */
04582                                 add esi, eax    /* move Src pointer 1 row below */
04583                                 movq mm3, [edx]         /* load 4 words of Kernel */
04584                         add edx, 8      /* move pointer to other 4 words */
04585                                 movq mm4, [edx]         /* load 4 words of Kernel */
04586                         add edx, 8      /* move pointer to other 4 words */
04587                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04588                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04589                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04590                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04591                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04592                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04593                                 /* --- 7 */
04594                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04595                         movq mm2, mm1           /* copy MM1 into MM2 */
04596                                 movq mm3, [edx]         /* load 4 words of Kernel */
04597                         add edx, 8      /* move pointer to other 4 words */
04598                                 movq mm4, [edx]         /* load 4 words of Kernel */
04599                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04600                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04601                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
04602                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
04603                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04604                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04605                                 /* ---, */
04606                                 movq mm3, mm7           /* copy MM7 into MM3 */
04607                                 psrlq mm7, 32           /* shift 2 left words to the right */
04608                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
04609                                 movq mm2, mm7           /* copy MM7 into MM2 */
04610                                 psrlq mm7, 16           /* shift 1 left word to the right */
04611                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
04612                                 /* ---, */
04613                                 movd mm1, eax           /* save EAX in MM1 */
04614                                 movd mm2, ebx           /* save EBX in MM2 */
04615                                 movd mm3, edx           /* save EDX in MM3 */
04616                                 movd eax, mm7           /* load summation result into EAX */
04617                                 psraw mm7, 15           /* spread sign bit of the result */
04618                                 movd ebx, mm5           /* load Divisor into EBX */
04619                                 movd edx, mm7           /* fill EDX with a sign bit */
04620                                 idiv bx         /* IDIV - VERY EXPENSIVE */
04621                                 movd mm7, eax           /* move result of division into MM7 */
04622                                 packuswb mm7, mm0       /* pack division result with saturation */
04623                                 movd eax, mm7           /* copy saturated result into EAX */
04624                                 mov [edi], al           /* copy a byte result into Dest */
04625                                 movd edx, mm3           /* restore saved EDX */
04626                                 movd ebx, mm2           /* restore saved EBX */
04627                                 movd eax, mm1           /* restore saved EAX */
04628                                 /* --, */
04629                                 movd esi, mm6           /* move Src pointer to the top pixel */
04630                                 sub edx, 104    /* EDX = Kernel address */
04631                                 inc              esi            /* move Src  pointer to the next pixel */
04632                                 inc              edi            /* move Dest pointer to the next pixel */
04633                                 /* ---, */
04634                                 dec              ecx            /* decrease loop counter COLUMNS */
04635                                 jnz            L10342           /* check loop termination, proceed if required */
04636                                 add esi, 6      /* move to the next row in Src */
04637                                 add edi, 6      /* move to the next row in Dest */
04638                                 dec              ebx            /* decrease loop counter ROWS */
04639                                 jnz            L10340           /* check loop termination, proceed if required */
04640                                 /* ---, */
04641                                 emms                            /* exit MMX state */
04642                                 popa
04643                 }
04644 #else
04645                 asm volatile
04646                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
04647                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
04648                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
04649                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
04650                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
04651                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
04652                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
04653                         "add          $3, %%edi \n\t"   /* 3 column offset from the left edge */
04654                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
04655                         "add       %%eax, %%edi \n\t"   /* 3 row offset from the top edge */
04656                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"       /* initialize ROWS counter */
04657                         "sub          $6, %%ebx \n\t"   /* do not use first 3 and last 3 rows */
04658                         /* --- */
04659                         ".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
04660                         "sub          $6, %%ecx \n\t"   /* do not use first 3 and last 3 columns */
04661                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
04662                         ".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
04663                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
04664                         /* --- 1 */
04665                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04666                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04667                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04668                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04669                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04670                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04671                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04672                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04673                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04674                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04675                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04676                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04677                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04678                         /* --- 2 */
04679                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04680                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04681                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04682                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04683                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04684                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04685                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04686                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04687                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04688                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04689                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04690                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04691                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04692                         /* --- 3 */
04693                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04694                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04695                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04696                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04697                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04698                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04699                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04700                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04701                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04702                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04703                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04704                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04705                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04706                         /* --- 4 */
04707                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04708                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04709                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04710                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04711                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04712                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04713                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04714                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04715                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04716                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04717                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04718                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04719                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04720                         /* --- 5 */
04721                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04722                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04723                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04724                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04725                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04726                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04727                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04728                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04729                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04730                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04731                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04732                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04733                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04734                         /* --- 6 */
04735                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04736                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04737                         "add       %%eax, %%esi \n\t"   /* move Src pointer 1 row below */
04738                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04739                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04740                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04741                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04742                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04743                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04744                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04745                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04746                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04747                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04748                         /* --- 7 */
04749                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
04750                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
04751                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
04752                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
04753                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
04754                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
04755                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
04756                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
04757                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
04758                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
04759                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
04760                         /* --- */
04761                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
04762                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
04763                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
04764                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
04765                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
04766                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
04767                         /* --- */
04768                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
04769                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
04770                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
04771                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
04772                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
04773                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
04774                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
04775                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
04776                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
04777                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
04778                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
04779                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
04780                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
04781                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
04782                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
04783                         /* -- */
04784                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
04785                         "sub        $104, %%edx \n\t"   /* EDX = Kernel address */
04786                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
04787                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
04788                         /* --- */
04789                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
04790                         "jnz            .L10342 \n\t"   /* check loop termination, proceed if required */
04791                         "add          $6, %%esi \n\t"   /* move to the next row in Src */
04792                         "add          $6, %%edi \n\t"   /* move to the next row in Dest */
04793                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
04794                         "jnz            .L10340 \n\t"   /* check loop termination, proceed if required */
04795                         /* --- */
04796                         "emms                   \n\t"   /* exit MMX state */
04797                         "popa                   \n\t":"=m" (Dest)       /* %0 */
04798                         :"m"(Src),              /* %1 */
04799                         "m"(rows),              /* %2 */
04800                         "m"(columns),           /* %3 */
04801                         "m"(Kernel),            /* %4 */
04802                         "m"(Divisor)            /* %5 */
04803                         );
04804 #endif
04805 #endif
04806                 return (0);
04807         } else {
04808                 /* No non-MMX implementation yet */
04809                 return (-1);
04810         }
04811 }
04812 
04827 int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
04828                                                                                    signed short *Kernel, unsigned char Divisor)
04829 {
04830         /* Validate input parameters */
04831         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
04832                 return(-1);
04833 
04834         if ((columns < 9) || (rows < 9) || (Divisor == 0))
04835                 return (-1);
04836 
04837         if ((SDL_imageFilterMMXdetect())) {
04838 //#ifdef USE_MMX
04839 #if defined(USE_MMX) && defined(i386)
04840 #if !defined(GCC__)
04841                 __asm
04842                 {
04843                         pusha
04844                                 pxor mm0, mm0           /* zero MM0 */
04845                                 xor ebx, ebx    /* zero EBX */
04846                                 mov bl, Divisor         /* load Divisor into BL */
04847                                 movd mm5, ebx           /* copy Divisor into MM5 */
04848                                 mov edx, Kernel         /* load Kernel address into EDX */
04849                                 mov esi, Src    /* load Src  address to ESI */
04850                                 mov edi, Dest           /* load Dest address to EDI */
04851                                 add edi, 4      /* 4 column offset from the left edge */
04852                                 mov eax, columns        /* load columns into EAX */
04853                                 add edi, eax    /* 4 row offset from the top edge */
04854                                 add edi, eax
04855                                 add edi, eax
04856                                 add edi, eax
04857                                 mov ebx, rows           /* initialize ROWS counter */
04858                                 sub ebx, 8      /* do not use first 4 and last 4 rows */
04859                                 /* ---, */
04860 L10350:
04861                         mov ecx, eax    /* initialize COLUMNS counter */
04862                                 sub ecx, 8      /* do not use first 4 and last 4 columns */
04863                                 align 16                        /* 16 byte alignment of the loop entry */
04864 L10352:
04865                         pxor mm7, mm7           /* zero MM7 (accumulator) */
04866                                 movd mm6, esi           /* save ESI in MM6 */
04867                                 /* --- 1 */
04868                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04869                         movq mm2, mm1           /* copy MM1 into MM2 */
04870                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04871                                 movq mm3, [edx]         /* load 4 words of Kernel */
04872                         add edx, 8      /* move pointer to other 4 words */
04873                                 movq mm4, [edx]         /* load 4 words of Kernel */
04874                         add edx, 8      /* move pointer to other 4 words */
04875                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04876                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04877                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04878                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04879                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04880                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04881                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04882                         dec              esi
04883                                 add esi, eax    /* move Src pointer 1 row below */
04884                                 movq mm3, [edx]         /* load 4 words of Kernel */
04885                         add edx, 8      /* move pointer to other 4 words */
04886                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04887                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04888                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04889                                 /* --- 2 */
04890                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04891                         movq mm2, mm1           /* copy MM1 into MM2 */
04892                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04893                                 movq mm3, [edx]         /* load 4 words of Kernel */
04894                         add edx, 8      /* move pointer to other 4 words */
04895                                 movq mm4, [edx]         /* load 4 words of Kernel */
04896                         add edx, 8      /* move pointer to other 4 words */
04897                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04898                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04899                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04900                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04901                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04902                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04903                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04904                         dec              esi
04905                                 add esi, eax    /* move Src pointer 1 row below */
04906                                 movq mm3, [edx]         /* load 4 words of Kernel */
04907                         add edx, 8      /* move pointer to other 4 words */
04908                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04909                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04910                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04911                                 /* --- 3 */
04912                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04913                         movq mm2, mm1           /* copy MM1 into MM2 */
04914                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04915                                 movq mm3, [edx]         /* load 4 words of Kernel */
04916                         add edx, 8      /* move pointer to other 4 words */
04917                                 movq mm4, [edx]         /* load 4 words of Kernel */
04918                         add edx, 8      /* move pointer to other 4 words */
04919                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04920                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04921                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04922                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04923                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04924                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04925                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04926                         dec              esi
04927                                 add esi, eax    /* move Src pointer 1 row below */
04928                                 movq mm3, [edx]         /* load 4 words of Kernel */
04929                         add edx, 8      /* move pointer to other 4 words */
04930                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04931                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04932                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04933                                 /* --- 4 */
04934                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04935                         movq mm2, mm1           /* copy MM1 into MM2 */
04936                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04937                                 movq mm3, [edx]         /* load 4 words of Kernel */
04938                         add edx, 8      /* move pointer to other 4 words */
04939                                 movq mm4, [edx]         /* load 4 words of Kernel */
04940                         add edx, 8      /* move pointer to other 4 words */
04941                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04942                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04943                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04944                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04945                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04946                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04947                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04948                         dec              esi
04949                                 add esi, eax    /* move Src pointer 1 row below */
04950                                 movq mm3, [edx]         /* load 4 words of Kernel */
04951                         add edx, 8      /* move pointer to other 4 words */
04952                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04953                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04954                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04955                                 /* --- 5 */
04956                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04957                         movq mm2, mm1           /* copy MM1 into MM2 */
04958                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04959                                 movq mm3, [edx]         /* load 4 words of Kernel */
04960                         add edx, 8      /* move pointer to other 4 words */
04961                                 movq mm4, [edx]         /* load 4 words of Kernel */
04962                         add edx, 8      /* move pointer to other 4 words */
04963                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04964                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04965                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04966                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04967                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04968                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04969                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04970                         dec              esi
04971                                 add esi, eax    /* move Src pointer 1 row below */
04972                                 movq mm3, [edx]         /* load 4 words of Kernel */
04973                         add edx, 8      /* move pointer to other 4 words */
04974                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04975                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04976                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04977                                 /* --- 6 */
04978                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04979                         movq mm2, mm1           /* copy MM1 into MM2 */
04980                                 inc              esi            /* move pointer to the next 8 bytes of Src */
04981                                 movq mm3, [edx]         /* load 4 words of Kernel */
04982                         add edx, 8      /* move pointer to other 4 words */
04983                                 movq mm4, [edx]         /* load 4 words of Kernel */
04984                         add edx, 8      /* move pointer to other 4 words */
04985                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04986                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
04987                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04988                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
04989                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
04990                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04991                                 movq mm1, [esi]         /* load 8 bytes of the Src */
04992                         dec              esi
04993                                 add esi, eax    /* move Src pointer 1 row below */
04994                                 movq mm3, [edx]         /* load 4 words of Kernel */
04995                         add edx, 8      /* move pointer to other 4 words */
04996                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
04997                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
04998                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
04999                                 /* --- 7 */
05000                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05001                         movq mm2, mm1           /* copy MM1 into MM2 */
05002                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05003                                 movq mm3, [edx]         /* load 4 words of Kernel */
05004                         add edx, 8      /* move pointer to other 4 words */
05005                                 movq mm4, [edx]         /* load 4 words of Kernel */
05006                         add edx, 8      /* move pointer to other 4 words */
05007                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05008                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05009                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05010                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05011                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05012                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05013                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05014                         dec              esi
05015                                 add esi, eax    /* move Src pointer 1 row below */
05016                                 movq mm3, [edx]         /* load 4 words of Kernel */
05017                         add edx, 8      /* move pointer to other 4 words */
05018                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05019                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05020                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05021                                 /* --- 8 */
05022                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05023                         movq mm2, mm1           /* copy MM1 into MM2 */
05024                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05025                                 movq mm3, [edx]         /* load 4 words of Kernel */
05026                         add edx, 8      /* move pointer to other 4 words */
05027                                 movq mm4, [edx]         /* load 4 words of Kernel */
05028                         add edx, 8      /* move pointer to other 4 words */
05029                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05030                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05031                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05032                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05033                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05034                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05035                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05036                         dec              esi
05037                                 add esi, eax    /* move Src pointer 1 row below */
05038                                 movq mm3, [edx]         /* load 4 words of Kernel */
05039                         add edx, 8      /* move pointer to other 4 words */
05040                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05041                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05042                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05043                                 /* --- 9 */
05044                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05045                         movq mm2, mm1           /* copy MM1 into MM2 */
05046                                 inc              esi            /* move pointer to the next 8 bytes of Src */
05047                                 movq mm3, [edx]         /* load 4 words of Kernel */
05048                         add edx, 8      /* move pointer to other 4 words */
05049                                 movq mm4, [edx]         /* load 4 words of Kernel */
05050                         add edx, 8      /* move pointer to other 4 words */
05051                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05052                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
05053                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05054                                 pmullw mm2, mm4         /* mult. 4 high words of Src and Kernel */
05055                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
05056                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05057                                 movq mm1, [esi]         /* load 8 bytes of the Src */
05058                         movq mm3, [edx]         /* load 4 words of Kernel */
05059                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
05060                                 pmullw mm1, mm3         /* mult. 4 low  words of Src and Kernel */
05061                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
05062                                 /* ---, */
05063                                 movq mm3, mm7           /* copy MM7 into MM3 */
05064                                 psrlq mm7, 32           /* shift 2 left words to the right */
05065                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
05066                                 movq mm2, mm7           /* copy MM7 into MM2 */
05067                                 psrlq mm7, 16           /* shift 1 left word to the right */
05068                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
05069                                 /* ---, */
05070                                 movd mm1, eax           /* save EAX in MM1 */
05071                                 movd mm2, ebx           /* save EBX in MM2 */
05072                                 movd mm3, edx           /* save EDX in MM3 */
05073                                 movd eax, mm7           /* load summation result into EAX */
05074                                 psraw mm7, 15           /* spread sign bit of the result */
05075                                 movd ebx, mm5           /* load Divisor into EBX */
05076                                 movd edx, mm7           /* fill EDX with a sign bit */
05077                                 idiv bx         /* IDIV - VERY EXPENSIVE */
05078                                 movd mm7, eax           /* move result of division into MM7 */
05079                                 packuswb mm7, mm0       /* pack division result with saturation */
05080                                 movd eax, mm7           /* copy saturated result into EAX */
05081                                 mov [edi], al           /* copy a byte result into Dest */
05082                                 movd edx, mm3           /* restore saved EDX */
05083                                 movd ebx, mm2           /* restore saved EBX */
05084                                 movd eax, mm1           /* restore saved EAX */
05085                                 /* --, */
05086                                 movd esi, mm6           /* move Src pointer to the top pixel */
05087                                 sub edx, 208    /* EDX = Kernel address */
05088                                 inc              esi            /* move Src  pointer to the next pixel */
05089                                 inc              edi            /* move Dest pointer to the next pixel */
05090                                 /* ---, */
05091                                 dec              ecx            /* decrease loop counter COLUMNS */
05092                                 jnz            L10352           /* check loop termination, proceed if required */
05093                                 add esi, 8      /* move to the next row in Src */
05094                                 add edi, 8      /* move to the next row in Dest */
05095                                 dec              ebx            /* decrease loop counter ROWS */
05096                                 jnz            L10350           /* check loop termination, proceed if required */
05097                                 /* ---, */
05098                                 emms                            /* exit MMX state */
05099                                 popa
05100                 }
05101 #else
05102                 asm volatile
05103                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
05104                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
05105                         "mov           %5, %%bl \n\t"   /* load Divisor into BL */
05106                         "movd      %%ebx, %%mm5 \n\t"   /* copy Divisor into MM5 */
05107                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
05108                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
05109                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
05110                         "add          $4, %%edi \n\t"   /* 4 column offset from the left edge */
05111                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
05112                         "add       %%eax, %%edi \n\t"   /* 4 row offset from the top edge */
05113                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t" /* initialize ROWS counter */
05114                         "sub          $8, %%ebx \n\t"   /* do not use first 4 and last 4 rows */
05115                         /* --- */
05116                         ".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
05117                         "sub          $8, %%ecx \n\t"   /* do not use first 4 and last 4 columns */
05118                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
05119                         ".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
05120                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
05121                         /* --- 1 */
05122                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05123                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05124                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05125                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05126                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05127                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05128                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05129                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05130                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05131                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05132                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05133                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05134                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05135                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05136                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05137                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05138                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05139                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05140                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05141                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05142                         /* --- 2 */
05143                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05144                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05145                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05146                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05147                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05148                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05149                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05150                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05151                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05152                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05153                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05154                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05155                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05156                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05157                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05158                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05159                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05160                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05161                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05162                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05163                         /* --- 3 */
05164                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05165                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05166                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05167                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05168                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05169                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05170                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05171                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05172                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05173                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05174                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05175                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05176                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05177                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05178                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05179                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05180                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05181                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05182                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05183                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05184                         /* --- 4 */
05185                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05186                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05187                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05188                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05189                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05190                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05191                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05192                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05193                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05194                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05195                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05196                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05197                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05198                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05199                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05200                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05201                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05202                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05203                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05204                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05205                         /* --- 5 */
05206                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05207                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05208                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05209                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05210                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05211                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05212                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05213                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05214                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05215                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05216                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05217                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05218                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05219                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05220                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05221                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05222                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05223                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05224                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05225                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05226                         /* --- 6 */
05227                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05228                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05229                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05230                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05231                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05232                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05233                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05234                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05235                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05236                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05237                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05238                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05239                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05240                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05241                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05242                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05243                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05244                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05245                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05246                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05247                         /* --- 7 */
05248                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05249                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05250                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05251                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05252                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05253                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05254                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05255                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05256                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05257                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05258                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05259                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05260                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05261                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05262                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05263                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05264                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05265                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05266                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05267                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05268                         /* --- 8 */
05269                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05270                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05271                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05272                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05273                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05274                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05275                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05276                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05277                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05278                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05279                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05280                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05281                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05282                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05283                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
05284                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05285                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05286                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05287                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05288                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05289                         /* --- 9 */
05290                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05291                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05292                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
05293                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05294                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05295                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
05296                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
05297                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05298                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
05299                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05300                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
05301                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
05302                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05303                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
05304                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
05305                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
05306                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
05307                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
05308                         /* --- */
05309                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
05310                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
05311                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
05312                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
05313                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
05314                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
05315                         /* --- */
05316                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
05317                         "movd      %%ebx, %%mm2 \n\t"   /* save EBX in MM2 */
05318                         "movd      %%edx, %%mm3 \n\t"   /* save EDX in MM3 */
05319                         "movd      %%mm7, %%eax \n\t"   /* load summation result into EAX */
05320                         "psraw       $15, %%mm7 \n\t"   /* spread sign bit of the result */
05321                         "movd      %%mm5, %%ebx \n\t"   /* load Divisor into EBX */
05322                         "movd      %%mm7, %%edx \n\t"   /* fill EDX with a sign bit */
05323                         "idivw             %%bx \n\t"   /* IDIV - VERY EXPENSIVE */
05324                         "movd      %%eax, %%mm7 \n\t"   /* move result of division into MM7 */
05325                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
05326                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
05327                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
05328                         "movd      %%mm3, %%edx \n\t"   /* restore saved EDX */
05329                         "movd      %%mm2, %%ebx \n\t"   /* restore saved EBX */
05330                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
05331                         /* -- */
05332                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
05333                         "sub        $208, %%edx \n\t"   /* EDX = Kernel address */
05334                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
05335                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05336                         /* --- */
05337                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05338                         "jnz            .L10352 \n\t"   /* check loop termination, proceed if required */
05339                         "add          $8, %%esi \n\t"   /* move to the next row in Src */
05340                         "add          $8, %%edi \n\t"   /* move to the next row in Dest */
05341                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
05342                         "jnz            .L10350 \n\t"   /* check loop termination, proceed if required */
05343                         /* --- */
05344                         "emms                   \n\t"   /* exit MMX state */
05345                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05346                         :"m"(Src),              /* %1 */
05347                         "m"(rows),              /* %2 */
05348                         "m"(columns),           /* %3 */
05349                         "m"(Kernel),            /* %4 */
05350                         "m"(Divisor)            /* %5 */
05351                         );
05352 #endif
05353 #endif
05354                 return (0);
05355         } else {
05356                 /* No non-MMX implementation yet */
05357                 return (-1);
05358         }
05359 }
05360 
05375 int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
05376                                                                                            signed short *Kernel, unsigned char NRightShift)
05377 {
05378         /* Validate input parameters */
05379         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
05380                 return(-1);
05381 
05382         if ((columns < 3) || (rows < 3) || (NRightShift > 7))
05383                 return (-1);
05384 
05385         if ((SDL_imageFilterMMXdetect())) {
05386 //#ifdef USE_MMX
05387 #if defined(USE_MMX) && defined(i386)
05388 #if !defined(GCC__)
05389                 __asm
05390                 {
05391                         pusha
05392                                 pxor mm0, mm0           /* zero MM0 */
05393                                 xor ebx, ebx    /* zero EBX */
05394                                 mov bl, NRightShift     /* load NRightShift into BL */
05395                                 movd mm4, ebx           /* copy NRightShift into MM4 */
05396                                 mov edx, Kernel         /* load Kernel address into EDX */
05397                                 movq mm5, [edx]         /* MM5 = {0,K2,K1,K0} */
05398                         add edx, 8      /* second row              |K0 K1 K2 0| */
05399                                 movq mm6, [edx]         /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
05400                         add edx, 8      /* third row               |K6 K7 K8 0| */
05401                                 movq mm7, [edx]         /* MM7 = {0,K8,K7,K6} */
05402                         /* ---, */
05403                         mov eax, columns        /* load columns into EAX */
05404                                 mov esi, Src    /* ESI = Src row 0 address */
05405                                 mov edi, Dest           /* load Dest address to EDI */
05406                                 add edi, eax    /* EDI = EDI + columns */
05407                                 inc              edi            /* 1 byte offset from the left edge */
05408                                 mov edx, rows           /* initialize ROWS counter */
05409                                 sub edx, 2      /* do not use first and last row */
05410                                 /* ---, */
05411 L10360:
05412                         mov ecx, eax    /* initialize COLUMNS counter */
05413                                 sub ecx, 2      /* do not use first and last column */
05414                                 align 16                        /* 16 byte alignment of the loop entry */
05415 L10362:
05416                         /* ---, */
05417                         movq mm1, [esi]         /* load 8 bytes of the image first row */
05418                         add esi, eax    /* move one row below */
05419                                 movq mm2, [esi]         /* load 8 bytes of the image second row */
05420                         add esi, eax    /* move one row below */
05421                                 movq mm3, [esi]         /* load 8 bytes of the image third row */
05422                         punpcklbw mm1, mm0      /* unpack first 4 bytes into words */
05423                                 punpcklbw mm2, mm0      /* unpack first 4 bytes into words */
05424                                 punpcklbw mm3, mm0      /* unpack first 4 bytes into words */
05425                                 psrlw mm1, mm4          /* shift right each pixel NshiftRight times */
05426                                 psrlw mm2, mm4          /* shift right each pixel NshiftRight times */
05427                                 psrlw mm3, mm4          /* shift right each pixel NshiftRight times */
05428                                 pmullw mm1, mm5         /* multiply words first row  image*Kernel */
05429                                 pmullw mm2, mm6         /* multiply words second row image*Kernel */
05430                                 pmullw mm3, mm7         /* multiply words third row  image*Kernel */
05431                                 paddsw mm1, mm2         /* add 4 words of the first and second rows */
05432                                 paddsw mm1, mm3         /* add 4 words of the third row and result */
05433                                 movq mm2, mm1           /* copy MM1 into MM2 */
05434                                 psrlq mm1, 32           /* shift 2 left words to the right */
05435                                 paddsw mm1, mm2         /* add 2 left and 2 right result words */
05436                                 movq mm3, mm1           /* copy MM1 into MM3 */
05437                                 psrlq mm1, 16           /* shift 1 left word to the right */
05438                                 paddsw mm1, mm3         /* add 1 left and 1 right result words */
05439                                 packuswb mm1, mm0       /* pack shift result with saturation */
05440                                 movd ebx, mm1           /* copy saturated result into EBX */
05441                                 mov [edi], bl           /* copy a byte result into Dest */
05442                                 /* --, */
05443                                 sub esi, eax    /* move two rows up */
05444                                 sub esi, eax
05445                                 inc              esi            /* move Src  pointer to the next pixel */
05446                                 inc              edi            /* move Dest pointer to the next pixel */
05447                                 /* ---, */
05448                                 dec              ecx            /* decrease loop counter COLUMNS */
05449                                 jnz            L10362           /* check loop termination, proceed if required */
05450                                 add esi, 2      /* move to the next row in Src */
05451                                 add edi, 2      /* move to the next row in Dest */
05452                                 dec              edx            /* decrease loop counter ROWS */
05453                                 jnz            L10360           /* check loop termination, proceed if required */
05454                                 /* ---, */
05455                                 emms                            /* exit MMX state */
05456                                 popa
05457                 }
05458 #else
05459                 asm volatile
05460                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
05461                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
05462                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
05463                         "movd      %%ebx, %%mm4 \n\t"   /* copy NRightShift into MM4 */
05464                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
05465                         "movq    (%%edx), %%mm5 \n\t"   /* MM5 = {0,K2,K1,K0} */
05466                         "add          $8, %%edx \n\t"   /* second row              |K0 K1 K2 0| */
05467                         "movq    (%%edx), %%mm6 \n\t"   /* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
05468                         "add          $8, %%edx \n\t"   /* third row               |K6 K7 K8 0| */
05469                         "movq    (%%edx), %%mm7 \n\t"   /* MM7 = {0,K8,K7,K6} */
05470                         /* --- */
05471                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
05472                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
05473                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
05474                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
05475                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
05476                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
05477                         "sub          $2, %%edx \n\t"   /* do not use first and last row */
05478                         /* --- */
05479                         ".L10360:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMS counter */
05480                         "sub          $2, %%ecx \n\t"   /* do not use first and last column */
05481                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
05482                         ".L10362:               \n\t"
05483                         /* --- */
05484                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the image first row */
05485                         "add       %%eax, %%esi \n\t"   /* move one row below */
05486                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes of the image second row */
05487                         "add       %%eax, %%esi \n\t"   /* move one row below */
05488                         "movq    (%%esi), %%mm3 \n\t"   /* load 8 bytes of the image third row */
05489                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first 4 bytes into words */
05490                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack first 4 bytes into words */
05491                         "punpcklbw %%mm0, %%mm3 \n\t"   /* unpack first 4 bytes into words */
05492                         "psrlw     %%mm4, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
05493                         "psrlw     %%mm4, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
05494                         "psrlw     %%mm4, %%mm3 \n\t"   /* shift right each pixel NshiftRight times */
05495                         "pmullw    %%mm5, %%mm1 \n\t"   /* multiply words first row  image*Kernel */
05496                         "pmullw    %%mm6, %%mm2 \n\t"   /* multiply words second row image*Kernel */
05497                         "pmullw    %%mm7, %%mm3 \n\t"   /* multiply words third row  image*Kernel */
05498                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the first and second rows */
05499                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 4 words of the third row and result */
05500                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
05501                         "psrlq       $32, %%mm1 \n\t"   /* shift 2 left words to the right */
05502                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 2 left and 2 right result words */
05503                         "movq      %%mm1, %%mm3 \n\t"   /* copy MM1 into MM3 */
05504                         "psrlq       $16, %%mm1 \n\t"   /* shift 1 left word to the right */
05505                         "paddsw    %%mm3, %%mm1 \n\t"   /* add 1 left and 1 right result words */
05506                         "packuswb  %%mm0, %%mm1 \n\t"   /* pack shift result with saturation */
05507                         "movd      %%mm1, %%ebx \n\t"   /* copy saturated result into EBX */
05508                         "mov      %%bl, (%%edi) \n\t"   /* copy a byte result into Dest */
05509                         /* -- */
05510                         "sub       %%eax, %%esi \n\t"   /* move two rows up */
05511                         "sub       %%eax, %%esi \n\t" "inc              %%esi \n\t"     /* move Src  pointer to the next pixel */
05512                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
05513                         /* --- */
05514                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
05515                         "jnz            .L10362 \n\t"   /* check loop termination, proceed if required */
05516                         "add          $2, %%esi \n\t"   /* move to the next row in Src */
05517                         "add          $2, %%edi \n\t"   /* move to the next row in Dest */
05518                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
05519                         "jnz            .L10360 \n\t"   /* check loop termination, proceed if required */
05520                         /* --- */
05521                         "emms                   \n\t"   /* exit MMX state */
05522                         "popa                   \n\t":"=m" (Dest)       /* %0 */
05523                         :"m"(Src),              /* %1 */
05524                         "m"(rows),              /* %2 */
05525                         "m"(columns),           /* %3 */
05526                         "m"(Kernel),            /* %4 */
05527                         "m"(NRightShift)        /* %5 */
05528                         );
05529 #endif
05530 #endif
05531                 return (0);
05532         } else {
05533                 /* No non-MMX implementation yet */
05534                 return (-1);
05535         }
05536 }
05537 
/*!
\brief Convolve a byte image with a 5x5 kernel, shifting every source pixel right first (MMX only).

For each pixel lying at least two rows and two columns away from the image border:

  Dest[r+2][c+2] = saturate0to255( sum(i=0..4) sum(j=0..7) (Src[r+i][c+j] >> NRightShift) * Kernel[i*8 + j] )

The right shift is applied to each source pixel before the multiplication, not to the
final sum. Products keep only their low 16 bits (pmullw) and all partial sums are formed
with signed 16-bit saturating additions (paddsw); the final value is packed to a byte
with unsigned saturation (packuswb).

\param Src Source image, addressed as rows of 'columns' bytes. Every kernel row is evaluated
           with one 8-byte load, so the last load covers Src[rows*columns - 5] up to and
           including Src[rows*columns + 2].
\param Dest Destination image, addressed as rows of 'columns' bytes. Only the interior is
            written; the two outermost rows and columns on each side are left untouched.
\param rows Number of image rows. Must be at least 5.
\param columns Number of image columns. Must be at least 5.
\param Kernel Kernel coefficients, read as 5 rows of 8 signed 16-bit words (80 bytes in total).
              Words 5..7 of each row are multiplied with the three pixels to the right of
              the 5x5 window; presumably they are meant to be zero padding -- verify against
              the callers.
\param NRightShift Number of bits each source pixel is shifted right. Range: 0-7.

\return Returns 0 for success. Returns -1 if a pointer is NULL, a size or the shift is out
        of range, SDL_imageFilterMMXdetect() reports no MMX, or the MMX routine was not
        compiled into this build (there is no non-MMX implementation).
*/
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
		/* GCC__ is expected to be defined earlier in this file for GNU-style compilers;
		   without it the MSVC-style inline assembler variant is used. */
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0		/* zero MM0 */
			xor ebx, ebx		/* zero EBX */
			mov bl, NRightShift	/* load NRightShift into BL */
			movd mm5, ebx		/* copy NRightShift into MM5 */
			mov edx, Kernel		/* load Kernel address into EDX */
			mov esi, Src		/* load Src  address to ESI */
			mov edi, Dest		/* load Dest address to EDI */
			add edi, 2		/* 2 column offset from the left edge */
			mov eax, columns	/* load columns into EAX */
			shl eax, 1		/* EAX = columns * 2 */
			add edi, eax		/* 2 row offset from the top edge */
			shr eax, 1		/* EAX = columns */
			mov ebx, rows		/* initialize ROWS counter */
			sub ebx, 4		/* do not use first 2 and last 2 rows */
			/* ---, */
L10370:
			mov ecx, eax		/* initialize COLUMNS counter */
			sub ecx, 4		/* do not use first 2 and last 2 columns */
			align 16		/* 16 byte alignment of the loop entry */
L10372:
			pxor mm7, mm7		/* zero MM7 (accumulator) */
			movd mm6, esi		/* save ESI in MM6 */
			/* --- 1 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NshiftRight times */
			psrlw mm2, mm5		/* shift right each pixel NshiftRight times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- 2 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NshiftRight times */
			psrlw mm2, mm5		/* shift right each pixel NshiftRight times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- 3 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NshiftRight times */
			psrlw mm2, mm5		/* shift right each pixel NshiftRight times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- 4 */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			add esi, eax		/* move Src pointer 1 row below */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NshiftRight times */
			psrlw mm2, mm5		/* shift right each pixel NshiftRight times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- 5 (last row: ESI is not advanced, EDX only once) */
			movq mm1, [esi]		/* load 8 bytes of the Src */
			movq mm2, mm1		/* copy MM1 into MM2 */
			movq mm3, [edx]		/* load 4 words of Kernel */
			add edx, 8		/* move pointer to other 4 words */
			movq mm4, [edx]		/* load 4 words of Kernel */
			punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
			psrlw mm1, mm5		/* shift right each pixel NshiftRight times */
			psrlw mm2, mm5		/* shift right each pixel NshiftRight times */
			pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2		/* add 4 words of the high and low bytes */
			paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
			/* --- horizontal sum of the 4 accumulator words into word 0 */
			movq mm3, mm7		/* copy MM7 into MM3 */
			psrlq mm7, 32		/* shift 2 left words to the right */
			paddsw mm7, mm3		/* add 2 left and 2 right result words */
			movq mm2, mm7		/* copy MM7 into MM2 */
			psrlq mm7, 16		/* shift 1 left word to the right */
			paddsw mm7, mm2		/* add 1 left and 1 right result words */
			movd mm1, eax		/* save EAX in MM1 */
			packuswb mm7, mm0	/* pack result words to bytes with unsigned saturation */
			movd eax, mm7		/* copy saturated result into EAX */
			mov [edi], al		/* copy a byte result into Dest */
			movd eax, mm1		/* restore saved EAX */
			/* --, */
			movd esi, mm6		/* move Src pointer to the top pixel */
			sub edx, 72		/* EDX = Kernel address (undo the 9 increments of 8) */
			inc esi			/* move Src  pointer to the next pixel */
			inc edi			/* move Dest pointer to the next pixel */
			/* ---, */
			dec ecx			/* decrease loop counter COLUMNS */
			jnz L10372		/* check loop termination, proceed if required */
			add esi, 4		/* move to the next row in Src */
			add edi, 4		/* move to the next row in Dest */
			dec ebx			/* decrease loop counter ROWS */
			jnz L10370		/* check loop termination, proceed if required */
			/* ---, */
			emms			/* exit MMX state */
			popa
		}
#else
		/* NOTE(review): pusha moves ESP, so any "m" operand the compiler addresses relative
		   to ESP would be read from the wrong location; this presumably relies on
		   frame-pointer-relative addressing -- confirm the build flags. */
		asm volatile
			("pusha                  \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
			"shr          $1, %%eax \n\t"	/* EAX = columns */
			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
			/* --- */
			".L10370:               \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10372:               \n\t"
			"pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 (last row: ESI is not advanced, EDX only once) */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- horizontal sum of the 4 accumulator words into word 0 */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack result words to bytes with unsigned saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub         $72, %%edx \n\t"	/* EDX = Kernel address (undo the 9 increments of 8) */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
			"add          $4, %%esi \n\t"	/* move to the next row in Src */
			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t"
			:			/* no outputs: the Dest pointer itself is only read */
			:"m"(Dest),		/* %0 */
			"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(NRightShift)	/* %5 */
			:"memory"		/* the destination buffer is written through EDI */
			);
#endif
		return (0);
#else
		/* MMX was detected at run time, but the MMX routine is not part of this build.
		   Nothing has been written to Dest, so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}
05838 
/*!
\brief Convolve a byte image with a 7x7 kernel, pre-shifting every source pixel right by NRightShift.

For every pixel that has a full 7x7 neighbourhood the routine loads seven
source rows, zero-extends each byte to a 16-bit word, shifts it right by
NRightShift bits, multiplies it by the matching kernel word (low 16 bits of
the product are kept) and accumulates with signed saturating adds. The four
accumulator lanes are then added together (again saturating) and the sum is
clamped to 0..255 before being stored as one byte.

Only the interior is written: (rows - 6) rows of (columns - 6) bytes, starting
at Dest + 3 * columns + 3. The 3-pixel border of Dest is never touched.

Memory layout actually read by the code:
- Kernel: 7 rows of 8 signed 16-bit words (56 words in total). Each kernel row
  is fetched as two 4-word loads 8 bytes apart and the row stride is 16 bytes.
  The eighth word of each row is multiplied with the source byte one column to
  the right of the 7-wide window; presumably it is expected to be 0 - confirm
  against callers.
- Src: 8 consecutive bytes are loaded for each of the 7 window rows. For the
  last window of the last row, the eighth byte sits at offset rows * columns,
  i.e. one byte past the end if Src is a tightly packed rows * columns buffer.

\param Src The source 2D byte array (columns bytes per row). Must not be NULL.
\param Dest The destination 2D byte array (columns bytes per row). Must not be NULL.
\param rows Number of rows in the source/destination array. Must be >= 7.
\param columns Number of columns in the source/destination array. Must be >= 7.
\param Kernel The convolution kernel, laid out as described above. Must not be NULL.
\param NRightShift Number of bits each source pixel is shifted right. Must be <= 7.

\return 0 if the MMX code ran and the interior of Dest was written. -1 if a
parameter is invalid, MMX is not reported by SDL_imageFilterMMXdetect(), or no
MMX assembly was compiled into this build (there is no non-MMX implementation).
*/
int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
					       signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return (-1);

	if ((columns < 7) || (rows < 7) || (NRightShift > 7))
		return (-1);

	if (!SDL_imageFilterMMXdetect()) {
		/* No non-MMX implementation yet */
		return (-1);
	}

#if defined(USE_MMX) && defined(i386)
	/*
	 * NOTE(review): GCC__ is not defined anywhere in the visible part of this
	 * file; it is presumably set elsewhere for GCC builds - confirm.
	 */
#if !defined(GCC__)
	__asm
	{
		pusha
		pxor mm0, mm0		/* zero MM0 */
		xor ebx, ebx		/* zero EBX */
		mov bl, NRightShift	/* load NRightShift into BL */
		movd mm5, ebx		/* copy NRightShift into MM5 */
		mov edx, Kernel		/* load Kernel address into EDX */
		mov esi, Src		/* load Src  address to ESI */
		mov edi, Dest		/* load Dest address to EDI */
		add edi, 3		/* 3 column offset from the left edge */
		mov eax, columns	/* load columns into EAX */
		add edi, eax		/* 3 row offset from the top edge */
		add edi, eax
		add edi, eax
		mov ebx, rows		/* initialize ROWS counter */
		sub ebx, 6		/* do not use first 3 and last 3 rows */
		/* --- */
L10380:
		mov ecx, eax		/* initialize COLUMNS counter */
		sub ecx, 6		/* do not use first 3 and last 3 columns */
		align 16		/* 16 byte alignment of the loop entry */
L10382:
		pxor mm7, mm7		/* zero MM7 (accumulator) */
		movd mm6, esi		/* save ESI in MM6 */
		/* --- row 1 */
		movq mm1, [esi]		/* load 8 bytes of the Src */
		movq mm2, mm1		/* copy MM1 into MM2 */
		add esi, eax		/* move Src pointer 1 row below */
		movq mm3, [edx]		/* load 4 words of Kernel */
		add edx, 8		/* move pointer to other 4 words */
		movq mm4, [edx]		/* load 4 words of Kernel */
		add edx, 8		/* move pointer to the next Kernel row */
		punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
		punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
		psrlw mm1, mm5		/* shift right each pixel NRightShift times */
		psrlw mm2, mm5		/* shift right each pixel NRightShift times */
		pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2		/* add 4 words of the high and low bytes */
		paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
		/* --- row 2 (same steps as row 1) */
		movq mm1, [esi]
		movq mm2, mm1
		add esi, eax
		movq mm3, [edx]
		add edx, 8
		movq mm4, [edx]
		add edx, 8
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0
		psrlw mm1, mm5
		psrlw mm2, mm5
		pmullw mm1, mm3
		pmullw mm2, mm4
		paddsw mm1, mm2
		paddsw mm7, mm1
		/* --- row 3 (same steps as row 1) */
		movq mm1, [esi]
		movq mm2, mm1
		add esi, eax
		movq mm3, [edx]
		add edx, 8
		movq mm4, [edx]
		add edx, 8
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0
		psrlw mm1, mm5
		psrlw mm2, mm5
		pmullw mm1, mm3
		pmullw mm2, mm4
		paddsw mm1, mm2
		paddsw mm7, mm1
		/* --- row 4 (same steps as row 1) */
		movq mm1, [esi]
		movq mm2, mm1
		add esi, eax
		movq mm3, [edx]
		add edx, 8
		movq mm4, [edx]
		add edx, 8
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0
		psrlw mm1, mm5
		psrlw mm2, mm5
		pmullw mm1, mm3
		pmullw mm2, mm4
		paddsw mm1, mm2
		paddsw mm7, mm1
		/* --- row 5 (same steps as row 1) */
		movq mm1, [esi]
		movq mm2, mm1
		add esi, eax
		movq mm3, [edx]
		add edx, 8
		movq mm4, [edx]
		add edx, 8
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0
		psrlw mm1, mm5
		psrlw mm2, mm5
		pmullw mm1, mm3
		pmullw mm2, mm4
		paddsw mm1, mm2
		paddsw mm7, mm1
		/* --- row 6 (same steps as row 1) */
		movq mm1, [esi]
		movq mm2, mm1
		add esi, eax
		movq mm3, [edx]
		add edx, 8
		movq mm4, [edx]
		add edx, 8
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0
		psrlw mm1, mm5
		psrlw mm2, mm5
		pmullw mm1, mm3
		pmullw mm2, mm4
		paddsw mm1, mm2
		paddsw mm7, mm1
		/* --- row 7 (last row: ESI is not advanced, EDX only by 8) */
		movq mm1, [esi]		/* load 8 bytes of the Src */
		movq mm2, mm1		/* copy MM1 into MM2 */
		movq mm3, [edx]		/* load 4 words of Kernel */
		add edx, 8		/* move pointer to other 4 words */
		movq mm4, [edx]		/* load 4 words of Kernel */
		punpcklbw mm1, mm0	/* unpack first  4 bytes into words */
		punpckhbw mm2, mm0	/* unpack second 4 bytes into words */
		psrlw mm1, mm5		/* shift right each pixel NRightShift times */
		psrlw mm2, mm5		/* shift right each pixel NRightShift times */
		pmullw mm1, mm3		/* mult 4 low  words of Src and Kernel */
		pmullw mm2, mm4		/* mult 4 high words of Src and Kernel */
		paddsw mm1, mm2		/* add 4 words of the high and low bytes */
		paddsw mm7, mm1		/* add MM1 to accumulator MM7 */
		/* --- horizontal sum of the 4 accumulator words */
		movq mm3, mm7		/* copy MM7 into MM3 */
		psrlq mm7, 32		/* shift 2 left words to the right */
		paddsw mm7, mm3		/* add 2 left and 2 right result words */
		movq mm2, mm7		/* copy MM7 into MM2 */
		psrlq mm7, 16		/* shift 1 left word to the right */
		paddsw mm7, mm2		/* add 1 left and 1 right result words */
		movd mm1, eax		/* save EAX (columns) in MM1 */
		packuswb mm7, mm0	/* pack the sum to bytes with unsigned saturation */
		movd eax, mm7		/* copy saturated result into EAX */
		mov [edi], al		/* copy a byte result into Dest */
		movd eax, mm1		/* restore saved EAX */
		/* --- */
		movd esi, mm6		/* move Src pointer to the top pixel */
		sub edx, 104		/* EDX = Kernel address (6 * 16 + 8 bytes were added) */
		inc esi			/* move Src  pointer to the next pixel */
		inc edi			/* move Dest pointer to the next pixel */
		/* --- */
		dec ecx			/* decrease loop counter COLUMNS */
		jnz L10382		/* check loop termination, proceed if required */
		add esi, 6		/* move to the next row in Src */
		add edi, 6		/* move to the next row in Dest */
		dec ebx			/* decrease loop counter ROWS */
		jnz L10380		/* check loop termination, proceed if required */
		/* --- */
		emms			/* exit MMX state */
		popa
	}
#else
	/*
	 * NOTE(review): the statement below uses "m" operands after PUSHA has
	 * moved ESP, declares Dest as an output although it is only read, and has
	 * no "memory" clobber although it stores through Dest. Left unchanged
	 * here; verify on an i386 GCC build before relying on it.
	 */
	asm volatile
		("pusha              \n\t"
		"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
		"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
		"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
		"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
		"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
		"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
		"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
		"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
		"mov          %3, %%eax \n\t"	/* load columns into EAX */
		"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
		"add       %%eax, %%edi \n\t"
		"add       %%eax, %%edi \n\t"
		"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
		"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
		/* --- */
		".L10380:               \n\t"
		"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
		"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		".L10382:               \n\t"
		"pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
		"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
		/* --- row 1 */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to the next Kernel row */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- row 2 (same steps as row 1) */
		"movq    (%%esi), %%mm1 \n\t"
		"movq      %%mm1, %%mm2 \n\t"
		"add       %%eax, %%esi \n\t"
		"movq    (%%edx), %%mm3 \n\t"
		"add          $8, %%edx \n\t"
		"movq    (%%edx), %%mm4 \n\t"
		"add          $8, %%edx \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"punpckhbw %%mm0, %%mm2 \n\t"
		"psrlw     %%mm5, %%mm1 \n\t"
		"psrlw     %%mm5, %%mm2 \n\t"
		"pmullw    %%mm3, %%mm1 \n\t"
		"pmullw    %%mm4, %%mm2 \n\t"
		"paddsw    %%mm2, %%mm1 \n\t"
		"paddsw    %%mm1, %%mm7 \n\t"
		/* --- row 3 (same steps as row 1) */
		"movq    (%%esi), %%mm1 \n\t"
		"movq      %%mm1, %%mm2 \n\t"
		"add       %%eax, %%esi \n\t"
		"movq    (%%edx), %%mm3 \n\t"
		"add          $8, %%edx \n\t"
		"movq    (%%edx), %%mm4 \n\t"
		"add          $8, %%edx \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"punpckhbw %%mm0, %%mm2 \n\t"
		"psrlw     %%mm5, %%mm1 \n\t"
		"psrlw     %%mm5, %%mm2 \n\t"
		"pmullw    %%mm3, %%mm1 \n\t"
		"pmullw    %%mm4, %%mm2 \n\t"
		"paddsw    %%mm2, %%mm1 \n\t"
		"paddsw    %%mm1, %%mm7 \n\t"
		/* --- row 4 (same steps as row 1) */
		"movq    (%%esi), %%mm1 \n\t"
		"movq      %%mm1, %%mm2 \n\t"
		"add       %%eax, %%esi \n\t"
		"movq    (%%edx), %%mm3 \n\t"
		"add          $8, %%edx \n\t"
		"movq    (%%edx), %%mm4 \n\t"
		"add          $8, %%edx \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"punpckhbw %%mm0, %%mm2 \n\t"
		"psrlw     %%mm5, %%mm1 \n\t"
		"psrlw     %%mm5, %%mm2 \n\t"
		"pmullw    %%mm3, %%mm1 \n\t"
		"pmullw    %%mm4, %%mm2 \n\t"
		"paddsw    %%mm2, %%mm1 \n\t"
		"paddsw    %%mm1, %%mm7 \n\t"
		/* --- row 5 (same steps as row 1) */
		"movq    (%%esi), %%mm1 \n\t"
		"movq      %%mm1, %%mm2 \n\t"
		"add       %%eax, %%esi \n\t"
		"movq    (%%edx), %%mm3 \n\t"
		"add          $8, %%edx \n\t"
		"movq    (%%edx), %%mm4 \n\t"
		"add          $8, %%edx \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"punpckhbw %%mm0, %%mm2 \n\t"
		"psrlw     %%mm5, %%mm1 \n\t"
		"psrlw     %%mm5, %%mm2 \n\t"
		"pmullw    %%mm3, %%mm1 \n\t"
		"pmullw    %%mm4, %%mm2 \n\t"
		"paddsw    %%mm2, %%mm1 \n\t"
		"paddsw    %%mm1, %%mm7 \n\t"
		/* --- row 6 (same steps as row 1) */
		"movq    (%%esi), %%mm1 \n\t"
		"movq      %%mm1, %%mm2 \n\t"
		"add       %%eax, %%esi \n\t"
		"movq    (%%edx), %%mm3 \n\t"
		"add          $8, %%edx \n\t"
		"movq    (%%edx), %%mm4 \n\t"
		"add          $8, %%edx \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"punpckhbw %%mm0, %%mm2 \n\t"
		"psrlw     %%mm5, %%mm1 \n\t"
		"psrlw     %%mm5, %%mm2 \n\t"
		"pmullw    %%mm3, %%mm1 \n\t"
		"pmullw    %%mm4, %%mm2 \n\t"
		"paddsw    %%mm2, %%mm1 \n\t"
		"paddsw    %%mm1, %%mm7 \n\t"
		/* --- row 7 (last row: ESI is not advanced, EDX only by 8) */
		"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
		"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
		"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
		"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
		"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
		"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NRightShift times */
		"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
		"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
		"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
		/* --- horizontal sum of the 4 accumulator words */
		"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
		"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
		"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
		"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
		"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
		"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
		"movd      %%eax, %%mm1 \n\t"	/* save EAX (columns) in MM1 */
		"packuswb  %%mm0, %%mm7 \n\t"	/* pack the sum to bytes with unsigned saturation */
		"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
		"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
		"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
		/* --- */
		"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
		"sub        $104, %%edx \n\t"	/* EDX = Kernel address (6 * 16 + 8 bytes were added) */
		"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
		"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
		/* --- */
		"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
		"jnz            .L10382 \n\t"	/* check loop termination, proceed if required */
		"add          $6, %%esi \n\t"	/* move to the next row in Src */
		"add          $6, %%edi \n\t"	/* move to the next row in Dest */
		"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
		"jnz            .L10380 \n\t"	/* check loop termination, proceed if required */
		/* --- */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:"=m" (Dest)		/* %0 */
		:"m"(Src),		/* %1 */
		"m"(rows),		/* %2 */
		"m"(columns),		/* %3 */
		"m"(Kernel),		/* %4 */
		"m"(NRightShift)	/* %5 */
		);
#endif
	return (0);
#else
	/*
	 * MMX is reported at run time, but this build contains no MMX code for
	 * this filter, so nothing was written to Dest. Report failure rather
	 * than claiming the filter was applied.
	 */
	return (-1);
#endif
}
06201 
06216 int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
06217                                                                                            signed short *Kernel, unsigned char NRightShift)
06218 {
06219         /* Validate input parameters */
06220         if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
06221                 return(-1);
06222 
06223         if ((columns < 9) || (rows < 9) || (NRightShift > 7))
06224                 return (-1);
06225 
06226         if ((SDL_imageFilterMMXdetect())) {
06227 //#ifdef USE_MMX
06228 #if defined(USE_MMX) && defined(i386)
06229 #if !defined(GCC__)
06230                 __asm
06231                 {
06232                         pusha
06233                                 pxor mm0, mm0           /* zero MM0 */
06234                                 xor ebx, ebx    /* zero EBX */
06235                                 mov bl, NRightShift     /* load NRightShift into BL */
06236                                 movd mm5, ebx           /* copy NRightShift into MM5 */
06237                                 mov edx, Kernel         /* load Kernel address into EDX */
06238                                 mov esi, Src    /* load Src  address to ESI */
06239                                 mov edi, Dest           /* load Dest address to EDI */
06240                                 add edi, 4      /* 4 column offset from the left edge */
06241                                 mov eax, columns        /* load columns into EAX */
06242                                 add edi, eax    /* 4 row offset from the top edge */
06243                                 add edi, eax
06244                                 add edi, eax
06245                                 add edi, eax
06246                                 mov ebx, rows           /* initialize ROWS counter */
06247                                 sub ebx, 8      /* do not use first 4 and last 4 rows */
06248                                 /* ---, */
06249 L10390:
06250                         mov ecx, eax    /* initialize COLUMNS counter */
06251                                 sub ecx, 8      /* do not use first 4 and last 4 columns */
06252                                 align 16                        /* 16 byte alignment of the loop entry */
06253 L10392:
06254                         pxor mm7, mm7           /* zero MM7 (accumulator) */
06255                                 movd mm6, esi           /* save ESI in MM6 */
06256                                 /* --- 1 */
06257                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06258                         movq mm2, mm1           /* copy MM1 into MM2 */
06259                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06260                                 movq mm3, [edx]         /* load 4 words of Kernel */
06261                         add edx, 8      /* move pointer to other 4 words */
06262                                 movq mm4, [edx]         /* load 4 words of Kernel */
06263                         add edx, 8      /* move pointer to other 4 words */
06264                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06265                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06266                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06267                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06268                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06269                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06270                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06271                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06272                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06273                         dec              esi
06274                                 add esi, eax    /* move Src pointer 1 row below */
06275                                 movq mm3, [edx]         /* load 4 words of Kernel */
06276                         add edx, 8      /* move pointer to other 4 words */
06277                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06278                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06279                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06280                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06281                                 /* --- 2 */
06282                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06283                         movq mm2, mm1           /* copy MM1 into MM2 */
06284                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06285                                 movq mm3, [edx]         /* load 4 words of Kernel */
06286                         add edx, 8      /* move pointer to other 4 words */
06287                                 movq mm4, [edx]         /* load 4 words of Kernel */
06288                         add edx, 8      /* move pointer to other 4 words */
06289                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06290                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06291                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06292                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06293                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06294                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06295                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06296                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06297                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06298                         dec              esi
06299                                 add esi, eax    /* move Src pointer 1 row below */
06300                                 movq mm3, [edx]         /* load 4 words of Kernel */
06301                         add edx, 8      /* move pointer to other 4 words */
06302                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06303                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06304                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06305                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06306                                 /* --- 3 */
06307                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06308                         movq mm2, mm1           /* copy MM1 into MM2 */
06309                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06310                                 movq mm3, [edx]         /* load 4 words of Kernel */
06311                         add edx, 8      /* move pointer to other 4 words */
06312                                 movq mm4, [edx]         /* load 4 words of Kernel */
06313                         add edx, 8      /* move pointer to other 4 words */
06314                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06315                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06316                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06317                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06318                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06319                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06320                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06321                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06322                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06323                         dec              esi
06324                                 add esi, eax    /* move Src pointer 1 row below */
06325                                 movq mm3, [edx]         /* load 4 words of Kernel */
06326                         add edx, 8      /* move pointer to other 4 words */
06327                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06328                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06329                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06330                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06331                                 /* --- 4 */
06332                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06333                         movq mm2, mm1           /* copy MM1 into MM2 */
06334                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06335                                 movq mm3, [edx]         /* load 4 words of Kernel */
06336                         add edx, 8      /* move pointer to other 4 words */
06337                                 movq mm4, [edx]         /* load 4 words of Kernel */
06338                         add edx, 8      /* move pointer to other 4 words */
06339                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06340                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06341                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06342                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06343                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06344                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06345                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06346                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06347                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06348                         dec              esi
06349                                 add esi, eax    /* move Src pointer 1 row below */
06350                                 movq mm3, [edx]         /* load 4 words of Kernel */
06351                         add edx, 8      /* move pointer to other 4 words */
06352                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06353                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06354                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06355                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06356                                 /* --- 5 */
06357                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06358                         movq mm2, mm1           /* copy MM1 into MM2 */
06359                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06360                                 movq mm3, [edx]         /* load 4 words of Kernel */
06361                         add edx, 8      /* move pointer to other 4 words */
06362                                 movq mm4, [edx]         /* load 4 words of Kernel */
06363                         add edx, 8      /* move pointer to other 4 words */
06364                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06365                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06366                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06367                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06368                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06369                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06370                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06371                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06372                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06373                         dec              esi
06374                                 add esi, eax    /* move Src pointer 1 row below */
06375                                 movq mm3, [edx]         /* load 4 words of Kernel */
06376                         add edx, 8      /* move pointer to other 4 words */
06377                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06378                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06379                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06380                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06381                                 /* --- 6 */
06382                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06383                         movq mm2, mm1           /* copy MM1 into MM2 */
06384                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06385                                 movq mm3, [edx]         /* load 4 words of Kernel */
06386                         add edx, 8      /* move pointer to other 4 words */
06387                                 movq mm4, [edx]         /* load 4 words of Kernel */
06388                         add edx, 8      /* move pointer to other 4 words */
06389                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06390                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06391                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06392                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06393                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06394                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06395                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06396                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06397                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06398                         dec              esi
06399                                 add esi, eax    /* move Src pointer 1 row below */
06400                                 movq mm3, [edx]         /* load 4 words of Kernel */
06401                         add edx, 8      /* move pointer to other 4 words */
06402                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06403                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06404                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06405                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06406                                 /* --- 7 */
06407                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06408                         movq mm2, mm1           /* copy MM1 into MM2 */
06409                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06410                                 movq mm3, [edx]         /* load 4 words of Kernel */
06411                         add edx, 8      /* move pointer to other 4 words */
06412                                 movq mm4, [edx]         /* load 4 words of Kernel */
06413                         add edx, 8      /* move pointer to other 4 words */
06414                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06415                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06416                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06417                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06418                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06419                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06420                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06421                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06422                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06423                         dec              esi
06424                                 add esi, eax    /* move Src pointer 1 row below */
06425                                 movq mm3, [edx]         /* load 4 words of Kernel */
06426                         add edx, 8      /* move pointer to other 4 words */
06427                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06428                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06429                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06430                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06431                                 /* --- 8 */
06432                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06433                         movq mm2, mm1           /* copy MM1 into MM2 */
06434                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06435                                 movq mm3, [edx]         /* load 4 words of Kernel */
06436                         add edx, 8      /* move pointer to other 4 words */
06437                                 movq mm4, [edx]         /* load 4 words of Kernel */
06438                         add edx, 8      /* move pointer to other 4 words */
06439                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06440                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06441                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06442                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06443                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06444                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06445                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06446                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06447                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06448                         dec              esi
06449                                 add esi, eax    /* move Src pointer 1 row below */
06450                                 movq mm3, [edx]         /* load 4 words of Kernel */
06451                         add edx, 8      /* move pointer to other 4 words */
06452                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06453                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06454                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06455                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06456                                 /* --- 9 */
06457                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06458                         movq mm2, mm1           /* copy MM1 into MM2 */
06459                                 inc              esi            /* move pointer to the next 8 bytes of Src */
06460                                 movq mm3, [edx]         /* load 4 words of Kernel */
06461                         add edx, 8      /* move pointer to other 4 words */
06462                                 movq mm4, [edx]         /* load 4 words of Kernel */
06463                         add edx, 8      /* move pointer to other 4 words */
06464                                 punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06465                                 punpckhbw mm2, mm0      /* unpack second 4 bytes into words */
06466                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06467                                 psrlw mm2, mm5          /* shift right each pixel NshiftRight times */
06468                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06469                                 pmullw mm2, mm4         /* mult 4 high words of Src and Kernel */
06470                                 paddsw mm1, mm2         /* add 4 words of the high and low bytes */
06471                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06472                                 movq mm1, [esi]         /* load 8 bytes of the Src */
06473                         movq mm3, [edx]         /* load 4 words of Kernel */
06474                         punpcklbw mm1, mm0      /* unpack first  4 bytes into words */
06475                                 psrlw mm1, mm5          /* shift right each pixel NshiftRight times */
06476                                 pmullw mm1, mm3         /* mult 4 low  words of Src and Kernel */
06477                                 paddsw mm7, mm1         /* add MM1 to accumulator MM7 */
06478                                 /* ---, */
06479                                 movq mm3, mm7           /* copy MM7 into MM3 */
06480                                 psrlq mm7, 32           /* shift 2 left words to the right */
06481                                 paddsw mm7, mm3         /* add 2 left and 2 right result words */
06482                                 movq mm2, mm7           /* copy MM7 into MM2 */
06483                                 psrlq mm7, 16           /* shift 1 left word to the right */
06484                                 paddsw mm7, mm2         /* add 1 left and 1 right result words */
06485                                 movd mm1, eax           /* save EAX in MM1 */
06486                                 packuswb mm7, mm0       /* pack division result with saturation */
06487                                 movd eax, mm7           /* copy saturated result into EAX */
06488                                 mov [edi], al           /* copy a byte result into Dest */
06489                                 movd eax, mm1           /* restore saved EAX */
06490                                 /* --, */
06491                                 movd esi, mm6           /* move Src pointer to the top pixel */
06492                                 sub edx, 208    /* EDX = Kernel address */
06493                                 inc              esi            /* move Src  pointer to the next pixel */
06494                                 inc              edi            /* move Dest pointer to the next pixel */
06495                                 /* ---, */
06496                                 dec              ecx            /* decrease loop counter COLUMNS */
06497                                 jnz            L10392           /* check loop termination, proceed if required */
06498                                 add esi, 8      /* move to the next row in Src */
06499                                 add edi, 8      /* move to the next row in Dest */
06500                                 dec              ebx            /* decrease loop counter ROWS */
06501                                 jnz            L10390           /* check loop termination, proceed if required */
06502                                 /* ---, */
06503                                 emms                            /* exit MMX state */
06504                                 popa
06505                 }
06506 #else
06507                 asm volatile
06508                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06509                         "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
06510                         "mov           %5, %%bl \n\t"   /* load NRightShift into BL */
06511                         "movd      %%ebx, %%mm5 \n\t"   /* copy NRightShift into MM5 */
06512                         "mov          %4, %%edx \n\t"   /* load Kernel address into EDX */
06513                         "mov          %1, %%esi \n\t"   /* load Src  address to ESI */
06514                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06515                         "add          $4, %%edi \n\t"   /* 4 column offset from the left edge */
06516                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06517                         "add       %%eax, %%edi \n\t"   /* 4 row offset from the top edge */
06518                         "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t" /* initialize ROWS counter */
06519                         "sub          $8, %%ebx \n\t"   /* do not use first 4 and last 4 rows */
06520                         /* --- */
06521                         ".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"     /* initialize COLUMNS counter */
06522                         "sub          $8, %%ecx \n\t"   /* do not use first 4 and last 4 columns */
06523                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06524                         ".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"     /* zero MM7 (accumulator) */
06525                         "movd      %%esi, %%mm6 \n\t"   /* save ESI in MM6 */
06526                         /* --- 1 */
06527                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06528                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06529                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06530                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06531                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06532                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06533                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06534                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06535                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06536                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06537                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06538                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06539                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06540                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06541                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06542                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06543                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06544                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06545                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06546                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06547                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06548                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06549                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06550                         /* --- 2 */
06551                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06552                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06553                         "inc              %%esi \n\t"   /* move pointer to the next 8 bytes of Src */
06554                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06555                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06556                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06557                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06558                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06559                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06560                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06561                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06562                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06563                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06564                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06565                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06566                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06567                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06568                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06569                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06570                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06571                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06572                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06573                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06574                         /* --- 3 */
06575                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06576                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06577                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06578                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06579                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06580                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06581                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06582                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06583                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06584                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06585                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06586                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06587                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06588                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06589                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06590                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06591                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06592                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06593                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06594                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06595                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06596                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06597                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06598                         /* --- 4 */
06599                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06600                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06601                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06602                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06603                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06604                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06605                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06606                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06607                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06608                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06609                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06610                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06611                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06612                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06613                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06614                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06615                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06616                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06617                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06618                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06619                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06620                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06621                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06622                         /* --- 5 */
06623                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06624                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06625                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06626                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06627                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06628                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06629                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06630                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06631                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06632                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06633                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06634                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06635                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06636                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06637                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06638                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06639                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06640                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06641                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06642                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06643                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06644                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06645                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06646                         /* --- 6 */
06647                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06648                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06649                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06650                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06651                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06652                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06653                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06654                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06655                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06656                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06657                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06658                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06659                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06660                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06661                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06662                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06663                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06664                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06665                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06666                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06667                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06668                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06669                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06670                         /* --- 7 */
06671                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06672                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06673                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06674                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06675                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06676                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06677                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06678                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06679                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06680                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06681                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06682                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06683                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06684                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06685                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06686                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06687                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06688                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06689                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06690                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06691                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06692                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06693                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06694                         /* --- 8 */
06695                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06696                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06697                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06698                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06699                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06700                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06701                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06702                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06703                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06704                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06705                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06706                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06707                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06708                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06709                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06710                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06711                         "dec              %%esi \n\t" "add       %%eax, %%esi \n\t"     /* move Src pointer 1 row below */
06712                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06713                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06714                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06715                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06716                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06717                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06718                         /* --- 9 */
06719                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06720                         "movq      %%mm1, %%mm2 \n\t"   /* copy MM1 into MM2 */
06721                         "inc              %%esi \n\t"   /* advance Src pointer by 1 byte */
06722                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06723                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06724                         "movq    (%%edx), %%mm4 \n\t"   /* load 4 words of Kernel */
06725                         "add          $8, %%edx \n\t"   /* move pointer to other 4 words */
06726                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06727                         "punpckhbw %%mm0, %%mm2 \n\t"   /* unpack second 4 bytes into words */
06728                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06729                         "psrlw     %%mm5, %%mm2 \n\t"   /* shift right each pixel NshiftRight times */
06730                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06731                         "pmullw    %%mm4, %%mm2 \n\t"   /* mult. 4 high words of Src and Kernel */
06732                         "paddsw    %%mm2, %%mm1 \n\t"   /* add 4 words of the high and low bytes */
06733                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06734                         "movq    (%%esi), %%mm1 \n\t"   /* load 8 bytes of the Src */
06735                         "movq    (%%edx), %%mm3 \n\t"   /* load 4 words of Kernel */
06736                         "punpcklbw %%mm0, %%mm1 \n\t"   /* unpack first  4 bytes into words */
06737                         "psrlw     %%mm5, %%mm1 \n\t"   /* shift right each pixel NshiftRight times */
06738                         "pmullw    %%mm3, %%mm1 \n\t"   /* mult. 4 low  words of Src and Kernel */
06739                         "paddsw    %%mm1, %%mm7 \n\t"   /* add MM1 to accumulator MM7 */
06740                         /* --- */
06741                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06742                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06743                         "paddsw    %%mm3, %%mm7 \n\t"   /* add 2 left and 2 right result words */
06744                         "movq      %%mm7, %%mm2 \n\t"   /* copy MM7 into MM2 */
06745                         "psrlq       $16, %%mm7 \n\t"   /* shift 1 left word to the right */
06746                         "paddsw    %%mm2, %%mm7 \n\t"   /* add 1 left and 1 right result words */
06747                         "movd      %%eax, %%mm1 \n\t"   /* save EAX in MM1 */
06748                         "packuswb  %%mm0, %%mm7 \n\t"   /* pack division result with saturation */
06749                         "movd      %%mm7, %%eax \n\t"   /* copy saturated result into EAX */
06750                         "mov      %%al, (%%edi) \n\t"   /* copy a byte result into Dest */
06751                         "movd      %%mm1, %%eax \n\t"   /* restore saved EAX */
06752                         /* -- */
06753                         "movd      %%mm6, %%esi \n\t"   /* move Src pointer to the top pixel */
06754                         "sub        $208, %%edx \n\t"   /* EDX = Kernel address */
06755                         "inc              %%esi \n\t"   /* move Src  pointer to the next pixel */
06756                         "inc              %%edi \n\t"   /* move Dest pointer to the next pixel */
06757                         /* --- */
06758                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
06759                         "jnz            .L10392 \n\t"   /* check loop termination, proceed if required */
06760                         "add          $8, %%esi \n\t"   /* move to the next row in Src */
06761                         "add          $8, %%edi \n\t"   /* move to the next row in Dest */
06762                         "dec              %%ebx \n\t"   /* decrease loop counter ROWS */
06763                         "jnz            .L10390 \n\t"   /* check loop termination, proceed if required */
06764                         /* --- */
06765                         "emms                   \n\t"   /* exit MMX state */
06766                         "popa                   \n\t":"=m" (Dest)       /* %0 */
06767                         :"m"(Src),              /* %1 */
06768                         "m"(rows),              /* %2 */
06769                         "m"(columns),           /* %3 */
06770                         "m"(Kernel),            /* %4 */
06771                         "m"(NRightShift)        /* %5 */
06772                         );
06773 #endif
06774 #endif
06775                 return (0);
06776         } else {
06777                 /* No non-MMX implementation yet */
06778                 return (-1);
06779         }
06780 }
06781 
06782 /* ------------------------------------------------------------------------------------ */
06783 
06796 int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
06797 {
06798         /* Validate input parameters */
06799         if ((Src == NULL) || (Dest == NULL))
06800                 return(-1);
06801 
06802         if ((columns < 8) || (rows < 3))
06803                 return (-1);
06804 
06805         if ((SDL_imageFilterMMXdetect())) {
06806 //#ifdef USE_MMX
06807 #if defined(USE_MMX) && defined(i386)
06808 #if !defined(GCC__)
06809                 __asm
06810                 {
06811                         pusha
06812                                 pxor mm0, mm0           /* zero MM0 */
06813                                 mov eax, columns        /* load columns into EAX */
06814                                 /* ---, */
06815                                 mov esi, Src    /* ESI = Src row 0 address */
06816                                 mov edi, Dest           /* load Dest address to EDI */
06817                                 add edi, eax    /* EDI = EDI + columns */
06818                                 inc              edi            /* 1 byte offset from the left edge */
06819                                 mov edx, rows           /* initialize ROWS counter */
06820                                 sub edx, 2      /* do not use first and last rows */
06821                                 /* ---, */
06822 L10400:
06823                         mov ecx, eax    /* initialize COLUMNS counter */
06824                                 shr ecx, 3      /* ECX/8 (MMX loads 8 bytes at a time) */
06825                                 mov ebx, esi    /* save ESI in EBX */
06826                                 movd mm1, edi           /* save EDI in MM1 */
06827                                 align 16                        /* 16 byte alignment of the loop entry */
06828 L10402:
06829                         /* ---, */
06830                         movq mm4, [esi]         /* load 8 bytes from Src */
06831                         movq mm5, mm4           /* save MM4 in MM5 */
06832                                 add esi, 2      /* move ESI pointer 2 bytes right */
06833                                 punpcklbw mm4, mm0      /* unpack 4 low  bytes into words */
06834                                 punpckhbw mm5, mm0      /* unpack 4 high bytes into words */
06835                                 movq mm6, [esi]         /* load 8 bytes from Src */
06836                         movq mm7, mm6           /* save MM6 in MM7 */
06837                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
06838                                 punpcklbw mm6, mm0      /* unpack 4 low  bytes into words */
06839                                 punpckhbw mm7, mm0      /* unpack 4 high bytes into words */
06840                                 add esi, eax    /* move to the next row of Src */
06841                                 movq mm2, [esi]         /* load 8 bytes from Src */
06842                         movq mm3, mm2           /* save MM2 in MM3 */
06843                                 add esi, 2      /* move ESI pointer 2 bytes right */
06844                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06845                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06846                                 paddw mm4, mm2          /* add 4 low  bytes to accumulator MM4 */
06847                                 paddw mm5, mm3          /* add 4 high bytes to accumulator MM5 */
06848                                 paddw mm4, mm2          /* add 4 low  bytes to accumulator MM4 */
06849                                 paddw mm5, mm3          /* add 4 high bytes to accumulator MM5 */
06850                                 movq mm2, [esi]         /* load 8 bytes from Src */
06851                         movq mm3, mm2           /* save MM2 in MM3 */
06852                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
06853                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06854                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06855                                 paddw mm6, mm2          /* add 4 low  bytes to accumulator MM6 */
06856                                 paddw mm7, mm3          /* add 4 high bytes to accumulator MM7 */
06857                                 paddw mm6, mm2          /* add 4 low  bytes to accumulator MM6 */
06858                                 paddw mm7, mm3          /* add 4 high bytes to accumulator MM7 */
06859                                 add esi, eax    /* move to the next row of Src */
06860                                 movq mm2, [esi]         /* load 8 bytes from Src */
06861                         movq mm3, mm2           /* save MM2 in MM3 */
06862                                 add esi, 2      /* move ESI pointer 2 bytes right */
06863                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06864                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06865                                 paddw mm4, mm2          /* add 4 low  bytes to accumulator MM4 */
06866                                 paddw mm5, mm3          /* add 4 high bytes to accumulator MM5 */
06867                                 movq mm2, [esi]         /* load 8 bytes from Src */
06868                         movq mm3, mm2           /* save MM2 in MM3 */
06869                                 sub esi, 2      /* move ESI pointer back 2 bytes left */
06870                                 punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
06871                                 punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
06872                                 paddw mm6, mm2          /* add 4 low  bytes to accumulator MM6 */
06873                                 paddw mm7, mm3          /* add 4 high bytes to accumulator MM7 */
06874                                 /* ---, */
06875                                 movq mm2, mm4           /* copy MM4 into MM2 */
06876                                 psrlq mm4, 32           /* shift 2 left words to the right */
06877                                 psubw mm4, mm2          /* MM4 = MM4 - MM2 */
06878                                 movq mm3, mm6           /* copy MM6 into MM3 */
06879                                 psrlq mm6, 32           /* shift 2 left words to the right */
06880                                 psubw mm6, mm3          /* MM6 = MM6 - MM3 */
06881                                 punpckldq mm4, mm6      /* combine 2 words of MM6 and 2 words of MM4 */
06882                                 movq mm2, mm5           /* copy MM5 into MM2 */
06883                                 psrlq mm5, 32           /* shift 2 left words to the right */
06884                                 psubw mm5, mm2          /* MM5 = MM5 - MM2 */
06885                                 movq mm3, mm7           /* copy MM7 into MM3 */
06886                                 psrlq mm7, 32           /* shift 2 left words to the right */
06887                                 psubw mm7, mm3          /* MM7 = MM7 - MM3 */
06888                                 punpckldq mm5, mm7      /* combine 2 words of MM7 and 2 words of MM5 */
06889                                 /* Take abs values of MM4 and MM5 */
06890                                 movq mm6, mm4           /* copy MM4 into MM6 */
06891                                 movq mm7, mm5           /* copy MM5 into MM7 */
06892                                 psraw mm6, 15           /* fill MM6 words with word sign bit */
06893                                 psraw mm7, 15           /* fill MM7 words with word sign bit */
06894                                 pxor mm4, mm6           /* take 1's complement of only neg words */
06895                                 pxor mm5, mm7           /* take 1's complement of only neg words */
06896                                 psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
06897                                 psubsw mm5, mm7         /* add 1 to only neg words, W-(-1) or W-0 */
06898                                 packuswb mm4, mm5       /* combine and pack/saturate MM5 and MM4 */
06899                                 movq [edi], mm4         /* store result in Dest */
06900                                 /* ---, */
06901                                 sub esi, eax    /* move to the current top row in Src */
06902                                 sub esi, eax
06903                                 add esi, 8      /* move Src  pointer to the next 8 pixels */
06904                                 add edi, 8      /* move Dest pointer to the next 8 pixels */
06905                                 /* ---, */
06906                                 dec              ecx            /* decrease loop counter COLUMNS */
06907                                 jnz            L10402           /* check loop termination, proceed if required */
06908                                 mov esi, ebx    /* restore most left current row Src  address */
06909                                 movd edi, mm1           /* restore most left current row Dest address */
06910                                 add esi, eax    /* move to the next row in Src */
06911                                 add edi, eax    /* move to the next row in Dest */
06912                                 dec              edx            /* decrease loop counter ROWS */
06913                                 jnz            L10400           /* check loop termination, proceed if required */
06914                                 /* ---, */
06915                                 emms                            /* exit MMX state */
06916                                 popa
06917                 }
06918 #else
06919                 asm volatile
06920                         ("pusha              \n\t" "pxor      %%mm0, %%mm0 \n\t"        /* zero MM0 */
06921                         "mov          %3, %%eax \n\t"   /* load columns into EAX */
06922                         /* --- */
06923                         "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
06924                         "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
06925                         "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns */
06926                         "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
06927                         "mov          %2, %%edx \n\t"   /* initialize ROWS counter */
06928                         "sub          $2, %%edx \n\t"   /* do not use first and last rows */
06929                         /* --- */
06930                         ".L10400:                \n\t" "mov       %%eax, %%ecx \n\t"    /* initialize COLUMNS counter */
06931                         "shr          $3, %%ecx \n\t"   /* ECX/8 (MMX loads 8 bytes at a time) */
06932                         "mov       %%esi, %%ebx \n\t"   /* save ESI in EBX */
06933                         "movd      %%edi, %%mm1 \n\t"   /* save EDI in MM1 */
06934                         ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
06935                         ".L10402:               \n\t"
06936                         /* --- */
06937                         "movq    (%%esi), %%mm4 \n\t"   /* load 8 bytes from Src */
06938                         "movq      %%mm4, %%mm5 \n\t"   /* save MM4 in MM5 */
06939                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
06940                         "punpcklbw %%mm0, %%mm4 \n\t"   /* unpack 4 low  bytes into words */
06941                         "punpckhbw %%mm0, %%mm5 \n\t"   /* unpack 4 high bytes into words */
06942                         "movq    (%%esi), %%mm6 \n\t"   /* load 8 bytes from Src */
06943                         "movq      %%mm6, %%mm7 \n\t"   /* save MM6 in MM7 */
06944                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
06945                         "punpcklbw %%mm0, %%mm6 \n\t"   /* unpack 4 low  bytes into words */
06946                         "punpckhbw %%mm0, %%mm7 \n\t"   /* unpack 4 high bytes into words */
06947                         "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
06948                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06949                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06950                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
06951                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06952                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06953                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
06954                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
06955                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
06956                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
06957                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06958                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06959                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
06960                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06961                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06962                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
06963                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
06964                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
06965                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
06966                         "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
06967                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06968                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06969                         "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
06970                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06971                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06972                         "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  bytes to accumolator MM4 */
06973                         "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high bytes to accumolator MM5 */
06974                         "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
06975                         "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
06976                         "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
06977                         "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
06978                         "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
06979                         "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  bytes to accumolator MM6 */
06980                         "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high bytes to accumolator MM7 */
06981                         /* --- */
06982                         "movq      %%mm4, %%mm2 \n\t"   /* copy MM4 into MM2 */
06983                         "psrlq       $32, %%mm4 \n\t"   /* shift 2 left words to the right */
06984                         "psubw     %%mm2, %%mm4 \n\t"   /* MM4 = MM4 - MM2 */
06985                         "movq      %%mm6, %%mm3 \n\t"   /* copy MM6 into MM3 */
06986                         "psrlq       $32, %%mm6 \n\t"   /* shift 2 left words to the right */
06987                         "psubw     %%mm3, %%mm6 \n\t"   /* MM6 = MM6 - MM3 */
06988                         "punpckldq %%mm6, %%mm4 \n\t"   /* combine 2 words of MM6 and 2 words of MM4 */
06989                         "movq      %%mm5, %%mm2 \n\t"   /* copy MM6 into MM2 */
06990                         "psrlq       $32, %%mm5 \n\t"   /* shift 2 left words to the right */
06991                         "psubw     %%mm2, %%mm5 \n\t"   /* MM5 = MM5 - MM2 */
06992                         "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
06993                         "psrlq       $32, %%mm7 \n\t"   /* shift 2 left words to the right */
06994                         "psubw     %%mm3, %%mm7 \n\t"   /* MM7 = MM7 - MM3 */
06995                         "punpckldq %%mm7, %%mm5 \n\t"   /* combine 2 words of MM7 and 2 words of MM5 */
06996                         /* Take abs values of MM4 and MM5 */
06997                         "movq      %%mm4, %%mm6 \n\t"   /* copy MM4 into MM6 */
06998                         "movq      %%mm5, %%mm7 \n\t"   /* copy MM5 into MM7 */
06999                         "psraw       $15, %%mm6 \n\t"   /* fill MM6 words with word sign bit */
07000                         "psraw       $15, %%mm7 \n\t"   /* fill MM7 words with word sign bit */
07001                         "pxor      %%mm6, %%mm4 \n\t"   /* take 1's compliment of only neg. words */
07002                         "pxor      %%mm7, %%mm5 \n\t"   /* take 1's compliment of only neg. words */
07003                         "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
07004                         "psubsw    %%mm7, %%mm5 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
07005                         "packuswb  %%mm5, %%mm4 \n\t"   /* combine and pack/saturate MM5 and MM4 */
07006                         "movq    %%mm4, (%%edi) \n\t"   /* store result in Dest */
07007                         /* --- */
07008                         "sub       %%eax, %%esi \n\t"   /* move to the current top row in Src */
07009                         "sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"     /* move Src  pointer to the next 8 pixels */
07010                         "add $8,          %%edi \n\t"   /* move Dest pointer to the next 8 pixels */
07011                         /* --- */
07012                         "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
07013                         "jnz            .L10402 \n\t"   /* check loop termination, proceed if required */
07014                         "mov       %%ebx, %%esi \n\t"   /* restore most left current row Src  address */
07015                         "movd      %%mm1, %%edi \n\t"   /* restore most left current row Dest address */
07016                         "add       %%eax, %%esi \n\t"   /* move to the next row in Src */
07017                         "add       %%eax, %%edi \n\t"   /* move to the next row in Dest */
07018                         "dec              %%edx \n\t"   /* decrease loop counter ROWS */
07019                         "jnz            .L10400 \n\t"   /* check loop termination, proceed if required */
07020                         /* --- */
07021                         "emms                   \n\t"   /* exit MMX state */
07022                         "popa                   \n\t":"=m" (Dest)       /* %0 */
07023                         :"m"(Src),              /* %1 */
07024                         "m"(rows),              /* %2 */
07025                         "m"(columns)            /* %3 */
07026                         );
07027 #endif
07028 #endif
07029                 return (0);
07030         } else {
07031                 /* No non-MMX implementation yet */
07032                 return (-1);
07033         }
07034 }
07035 
/*!
\brief Sobel-X edge filter on a byte image, with every source pixel right-shifted first.

The image is treated as rows * columns bytes with a row stride of columns.
Each source byte is shifted right by NRightShift bits as it is loaded. For a
centre pixel in row r, column x (with V[k] = S[r-1][k] + 2*S[r][k] + S[r+1][k]
built from the shifted pixels S), the kernel stores

    Dest[r][x] = saturate255( | V[x+1] - V[x-1] | )

Pixels are produced 8 at a time: columns/8 groups per row, for the rows-2
centre rows 1 .. rows-2. The group that reads source columns c .. c+9 writes
destination columns c+1 .. c+8 of the centre row.

\param Src          Source byte image (rows * columns bytes). Must not be NULL.
\param Dest         Destination byte image. Must not be NULL.
                    NOTE(review): presumably must not overlap Src, since later
                    Src rows are read after earlier Dest rows are written.
\param rows         Number of rows; must be >= 3. Used as the row loop counter
                    by the kernel (the local copy is decremented in place).
\param columns      Number of columns; must be >= 8.
\param NRightShift  Right shift applied to each source pixel; must be 0..7.

\return 0 if the MMX kernel ran. -1 if a pointer is NULL, a size/shift is out
        of range, SDL_imageFilterMMXdetect() reports no MMX, or the MMX kernel
        was not compiled into this build (USE_MMX and i386 not both defined).
*/
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
                                                                        unsigned char NRightShift)
{
        /* Validate input parameters */
        if ((Src == NULL) || (Dest == NULL))
                return(-1);
        if ((columns < 8) || (rows < 3) || (NRightShift > 7))
                return (-1);

        if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
                /*
                 * NOTE(review): the last 8-pixel group of a row loads 8 bytes
                 * starting 2 bytes to the right of the group, so when columns
                 * is a multiple of 8 the loads on the bottom row appear to
                 * extend 2 bytes past rows*columns. Confirm callers pad Src.
                 */
#if !defined(GCC__)
                __asm
                {
                        pusha
                        pxor mm0, mm0           /* zero MM0 (used to widen bytes to words) */
                        mov eax, columns        /* EAX = columns (row stride) */
                        xor ebx, ebx            /* zero EBX */
                        mov bl, NRightShift     /* load NRightShift into BL */
                        movd mm1, ebx           /* MM1 = shift count for psrlw */
                        /* --- */
                        mov esi, Src            /* ESI = Src row 0 address */
                        mov edi, Dest           /* load Dest address to EDI */
                        add edi, eax            /* EDI = EDI + columns (row 1) */
                        inc edi                 /* 1 byte offset from the left edge */
                        /* initialize ROWS counter */
                        sub rows, 2             /* do not use first and last rows as centre rows */
                        /* --- */
                L10410:
                        mov ecx, eax            /* initialize COLUMNS counter */
                        shr ecx, 3              /* ECX = columns/8 (MMX handles 8 bytes at a time) */
                        mov ebx, esi            /* save ESI in EBX */
                        mov edx, edi            /* save EDI in EDX */
                        align 16                /* 16 byte alignment of the loop entry */
                L10412:
                        /* --- top row: left taps in MM4/MM5, right taps (2 bytes over) in MM6/MM7 */
                        movq mm4, [esi]         /* load 8 bytes from Src */
                        movq mm5, mm4           /* save MM4 in MM5 */
                        add esi, 2              /* move ESI pointer 2 bytes right */
                        punpcklbw mm4, mm0      /* unpack 4 low  bytes into words */
                        punpckhbw mm5, mm0      /* unpack 4 high bytes into words */
                        psrlw mm4, mm1          /* shift each pixel right NRightShift bits */
                        psrlw mm5, mm1          /* shift each pixel right NRightShift bits */
                        movq mm6, [esi]         /* load 8 bytes from Src */
                        movq mm7, mm6           /* save MM6 in MM7 */
                        sub esi, 2              /* move ESI pointer back 2 bytes left */
                        punpcklbw mm6, mm0      /* unpack 4 low  bytes into words */
                        punpckhbw mm7, mm0      /* unpack 4 high bytes into words */
                        psrlw mm6, mm1          /* shift each pixel right NRightShift bits */
                        psrlw mm7, mm1          /* shift each pixel right NRightShift bits */
                        /* --- middle row: added twice (weight 2) */
                        add esi, eax            /* move to the next row of Src */
                        movq mm2, [esi]         /* load 8 bytes from Src */
                        movq mm3, mm2           /* save MM2 in MM3 */
                        add esi, 2              /* move ESI pointer 2 bytes right */
                        punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
                        punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
                        psrlw mm2, mm1          /* shift each pixel right NRightShift bits */
                        psrlw mm3, mm1          /* shift each pixel right NRightShift bits */
                        paddw mm4, mm2          /* add 4 low  words to accumulator MM4 */
                        paddw mm5, mm3          /* add 4 high words to accumulator MM5 */
                        paddw mm4, mm2          /* add 4 low  words to accumulator MM4 (again) */
                        paddw mm5, mm3          /* add 4 high words to accumulator MM5 (again) */
                        movq mm2, [esi]         /* load 8 bytes from Src */
                        movq mm3, mm2           /* save MM2 in MM3 */
                        sub esi, 2              /* move ESI pointer back 2 bytes left */
                        punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
                        punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
                        psrlw mm2, mm1          /* shift each pixel right NRightShift bits */
                        psrlw mm3, mm1          /* shift each pixel right NRightShift bits */
                        paddw mm6, mm2          /* add 4 low  words to accumulator MM6 */
                        paddw mm7, mm3          /* add 4 high words to accumulator MM7 */
                        paddw mm6, mm2          /* add 4 low  words to accumulator MM6 (again) */
                        paddw mm7, mm3          /* add 4 high words to accumulator MM7 (again) */
                        /* --- bottom row: added once */
                        add esi, eax            /* move to the next row of Src */
                        movq mm2, [esi]         /* load 8 bytes from Src */
                        movq mm3, mm2           /* save MM2 in MM3 */
                        add esi, 2              /* move ESI pointer 2 bytes right */
                        punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
                        punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
                        psrlw mm2, mm1          /* shift each pixel right NRightShift bits */
                        psrlw mm3, mm1          /* shift each pixel right NRightShift bits */
                        paddw mm4, mm2          /* add 4 low  words to accumulator MM4 */
                        paddw mm5, mm3          /* add 4 high words to accumulator MM5 */
                        movq mm2, [esi]         /* load 8 bytes from Src */
                        movq mm3, mm2           /* save MM2 in MM3 */
                        sub esi, 2              /* move ESI pointer back 2 bytes left */
                        punpcklbw mm2, mm0      /* unpack 4 low  bytes into words */
                        punpckhbw mm3, mm0      /* unpack 4 high bytes into words */
                        psrlw mm2, mm1          /* shift each pixel right NRightShift bits */
                        psrlw mm3, mm1          /* shift each pixel right NRightShift bits */
                        paddw mm6, mm2          /* add 4 low  words to accumulator MM6 */
                        paddw mm7, mm3          /* add 4 high words to accumulator MM7 */
                        /* --- horizontal difference: (column sum 2 to the right) - (column sum) */
                        movq mm2, mm4           /* copy MM4 into MM2 */
                        psrlq mm4, 32           /* move the 2 upper words into the 2 lower words */
                        psubw mm4, mm2          /* MM4 = MM4 - MM2 (only the 2 low words are used) */
                        movq mm3, mm6           /* copy MM6 into MM3 */
                        psrlq mm6, 32           /* move the 2 upper words into the 2 lower words */
                        psubw mm6, mm3          /* MM6 = MM6 - MM3 (only the 2 low words are used) */
                        punpckldq mm4, mm6      /* MM4 = 2 low words of MM4 and 2 low words of MM6 */
                        movq mm2, mm5           /* copy MM5 into MM2 */
                        psrlq mm5, 32           /* move the 2 upper words into the 2 lower words */
                        psubw mm5, mm2          /* MM5 = MM5 - MM2 (only the 2 low words are used) */
                        movq mm3, mm7           /* copy MM7 into MM3 */
                        psrlq mm7, 32           /* move the 2 upper words into the 2 lower words */
                        psubw mm7, mm3          /* MM7 = MM7 - MM3 (only the 2 low words are used) */
                        punpckldq mm5, mm7      /* MM5 = 2 low words of MM5 and 2 low words of MM7 */
                        /* Take abs values of MM4 and MM5 */
                        movq mm6, mm4           /* copy MM4 into MM6 */
                        movq mm7, mm5           /* copy MM5 into MM7 */
                        psraw mm6, 15           /* fill MM6 words with word sign bit */
                        psraw mm7, 15           /* fill MM7 words with word sign bit */
                        pxor mm4, mm6           /* take 1's complement of only neg words */
                        pxor mm5, mm7           /* take 1's complement of only neg words */
                        psubsw mm4, mm6         /* add 1 to only neg words, W-(-1) or W-0 */
                        psubsw mm5, mm7         /* add 1 to only neg words, W-(-1) or W-0 */
                        packuswb mm4, mm5       /* pack MM4 and MM5 to 8 bytes with unsigned saturation */
                        movq [edi], mm4         /* store result in Dest */
                        /* --- */
                        sub esi, eax            /* move back to the current top row in Src */
                        sub esi, eax
                        add esi, 8              /* move Src  pointer to the next 8 pixels */
                        add edi, 8              /* move Dest pointer to the next 8 pixels */
                        /* --- */
                        dec ecx                 /* decrease loop counter COLUMNS */
                        jnz L10412              /* check loop termination, proceed if required */
                        mov esi, ebx            /* restore leftmost current row Src  address */
                        mov edi, edx            /* restore leftmost current row Dest address */
                        add esi, eax            /* move to the next row in Src */
                        add edi, eax            /* move to the next row in Dest */
                        dec rows                /* decrease loop counter ROWS */
                        jnz L10410              /* check loop termination, proceed if required */
                        /* --- */
                        emms                    /* exit MMX state */
                        popa
                }
#else
                /*
                 * NOTE(review): the operand lists below are kept exactly as
                 * found. Dest is declared output-only ("=m") although the
                 * template only reads it, rows is declared as an input ("m")
                 * although subl/decl modify it, and no "memory"/"cc" clobbers
                 * are given. Confirm against the GCC Extended Asm rules before
                 * relying on this with optimisation or inlining.
                 */
                asm volatile
                        ("pusha              \n\t"
                        "pxor      %%mm0, %%mm0 \n\t"   /* zero MM0 (used to widen bytes to words) */
                        "mov          %3, %%eax \n\t"   /* EAX = columns (row stride) */
                        "xor       %%ebx, %%ebx \n\t"   /* zero EBX */
                        "mov           %4, %%bl \n\t"   /* load NRightShift into BL */
                        "movd      %%ebx, %%mm1 \n\t"   /* MM1 = shift count for psrlw */
                        /* --- */
                        "mov          %1, %%esi \n\t"   /* ESI = Src row 0 address */
                        "mov          %0, %%edi \n\t"   /* load Dest address to EDI */
                        "add       %%eax, %%edi \n\t"   /* EDI = EDI + columns (row 1) */
                        "inc              %%edi \n\t"   /* 1 byte offset from the left edge */
                        /* initialize ROWS counter */
                        "subl            $2, %2 \n\t"   /* do not use first and last rows as centre rows */
                        /* --- */
                        ".L10410:                \n\t"
                        "mov       %%eax, %%ecx \n\t"   /* initialize COLUMNS counter */
                        "shr          $3, %%ecx \n\t"   /* ECX = columns/8 (MMX handles 8 bytes at a time) */
                        "mov       %%esi, %%ebx \n\t"   /* save ESI in EBX */
                        "mov       %%edi, %%edx \n\t"   /* save EDI in EDX */
                        ".align 16              \n\t"   /* 16 byte alignment of the loop entry */
                        ".L10412:               \n\t"
                        /* --- top row: left taps in MM4/MM5, right taps (2 bytes over) in MM6/MM7 */
                        "movq    (%%esi), %%mm4 \n\t"   /* load 8 bytes from Src */
                        "movq      %%mm4, %%mm5 \n\t"   /* save MM4 in MM5 */
                        "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
                        "punpcklbw %%mm0, %%mm4 \n\t"   /* unpack 4 low  bytes into words */
                        "punpckhbw %%mm0, %%mm5 \n\t"   /* unpack 4 high bytes into words */
                        "psrlw     %%mm1, %%mm4 \n\t"   /* shift each pixel right NRightShift bits */
                        "psrlw     %%mm1, %%mm5 \n\t"   /* shift each pixel right NRightShift bits */
                        "movq    (%%esi), %%mm6 \n\t"   /* load 8 bytes from Src */
                        "movq      %%mm6, %%mm7 \n\t"   /* save MM6 in MM7 */
                        "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
                        "punpcklbw %%mm0, %%mm6 \n\t"   /* unpack 4 low  bytes into words */
                        "punpckhbw %%mm0, %%mm7 \n\t"   /* unpack 4 high bytes into words */
                        "psrlw     %%mm1, %%mm6 \n\t"   /* shift each pixel right NRightShift bits */
                        "psrlw     %%mm1, %%mm7 \n\t"   /* shift each pixel right NRightShift bits */
                        /* --- middle row: added twice (weight 2) */
                        "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
                        "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
                        "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
                        "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
                        "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
                        "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
                        "psrlw     %%mm1, %%mm2 \n\t"   /* shift each pixel right NRightShift bits */
                        "psrlw     %%mm1, %%mm3 \n\t"   /* shift each pixel right NRightShift bits */
                        "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  words to accumulator MM4 */
                        "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high words to accumulator MM5 */
                        "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  words to accumulator MM4 (again) */
                        "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high words to accumulator MM5 (again) */
                        "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
                        "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
                        "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
                        "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
                        "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
                        "psrlw     %%mm1, %%mm2 \n\t"   /* shift each pixel right NRightShift bits */
                        "psrlw     %%mm1, %%mm3 \n\t"   /* shift each pixel right NRightShift bits */
                        "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  words to accumulator MM6 */
                        "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high words to accumulator MM7 */
                        "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  words to accumulator MM6 (again) */
                        "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high words to accumulator MM7 (again) */
                        /* --- bottom row: added once */
                        "add       %%eax, %%esi \n\t"   /* move to the next row of Src */
                        "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
                        "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
                        "add          $2, %%esi \n\t"   /* move ESI pointer 2 bytes right */
                        "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
                        "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
                        "psrlw     %%mm1, %%mm2 \n\t"   /* shift each pixel right NRightShift bits */
                        "psrlw     %%mm1, %%mm3 \n\t"   /* shift each pixel right NRightShift bits */
                        "paddw     %%mm2, %%mm4 \n\t"   /* add 4 low  words to accumulator MM4 */
                        "paddw     %%mm3, %%mm5 \n\t"   /* add 4 high words to accumulator MM5 */
                        "movq    (%%esi), %%mm2 \n\t"   /* load 8 bytes from Src */
                        "movq      %%mm2, %%mm3 \n\t"   /* save MM2 in MM3 */
                        "sub          $2, %%esi \n\t"   /* move ESI pointer back 2 bytes left */
                        "punpcklbw %%mm0, %%mm2 \n\t"   /* unpack 4 low  bytes into words */
                        "punpckhbw %%mm0, %%mm3 \n\t"   /* unpack 4 high bytes into words */
                        "psrlw     %%mm1, %%mm2 \n\t"   /* shift each pixel right NRightShift bits */
                        "psrlw     %%mm1, %%mm3 \n\t"   /* shift each pixel right NRightShift bits */
                        "paddw     %%mm2, %%mm6 \n\t"   /* add 4 low  words to accumulator MM6 */
                        "paddw     %%mm3, %%mm7 \n\t"   /* add 4 high words to accumulator MM7 */
                        /* --- horizontal difference: (column sum 2 to the right) - (column sum) */
                        "movq      %%mm4, %%mm2 \n\t"   /* copy MM4 into MM2 */
                        "psrlq       $32, %%mm4 \n\t"   /* move the 2 upper words into the 2 lower words */
                        "psubw     %%mm2, %%mm4 \n\t"   /* MM4 = MM4 - MM2 (only the 2 low words are used) */
                        "movq      %%mm6, %%mm3 \n\t"   /* copy MM6 into MM3 */
                        "psrlq       $32, %%mm6 \n\t"   /* move the 2 upper words into the 2 lower words */
                        "psubw     %%mm3, %%mm6 \n\t"   /* MM6 = MM6 - MM3 (only the 2 low words are used) */
                        "punpckldq %%mm6, %%mm4 \n\t"   /* MM4 = 2 low words of MM4 and 2 low words of MM6 */
                        "movq      %%mm5, %%mm2 \n\t"   /* copy MM5 into MM2 */
                        "psrlq       $32, %%mm5 \n\t"   /* move the 2 upper words into the 2 lower words */
                        "psubw     %%mm2, %%mm5 \n\t"   /* MM5 = MM5 - MM2 (only the 2 low words are used) */
                        "movq      %%mm7, %%mm3 \n\t"   /* copy MM7 into MM3 */
                        "psrlq       $32, %%mm7 \n\t"   /* move the 2 upper words into the 2 lower words */
                        "psubw     %%mm3, %%mm7 \n\t"   /* MM7 = MM7 - MM3 (only the 2 low words are used) */
                        "punpckldq %%mm7, %%mm5 \n\t"   /* MM5 = 2 low words of MM5 and 2 low words of MM7 */
                        /* Take abs values of MM4 and MM5 */
                        "movq      %%mm4, %%mm6 \n\t"   /* copy MM4 into MM6 */
                        "movq      %%mm5, %%mm7 \n\t"   /* copy MM5 into MM7 */
                        "psraw       $15, %%mm6 \n\t"   /* fill MM6 words with word sign bit */
                        "psraw       $15, %%mm7 \n\t"   /* fill MM7 words with word sign bit */
                        "pxor      %%mm6, %%mm4 \n\t"   /* take 1's complement of only neg. words */
                        "pxor      %%mm7, %%mm5 \n\t"   /* take 1's complement of only neg. words */
                        "psubsw    %%mm6, %%mm4 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
                        "psubsw    %%mm7, %%mm5 \n\t"   /* add 1 to only neg. words, W-(-1) or W-0 */
                        "packuswb  %%mm5, %%mm4 \n\t"   /* pack MM4 and MM5 to 8 bytes with unsigned saturation */
                        "movq    %%mm4, (%%edi) \n\t"   /* store result in Dest */
                        /* --- */
                        "sub       %%eax, %%esi \n\t"   /* move back to the current top row in Src */
                        "sub       %%eax, %%esi \n\t"
                        "add $8,          %%esi \n\t"   /* move Src  pointer to the next 8 pixels */
                        "add $8,          %%edi \n\t"   /* move Dest pointer to the next 8 pixels */
                        /* --- */
                        "dec              %%ecx \n\t"   /* decrease loop counter COLUMNS */
                        "jnz            .L10412 \n\t"   /* check loop termination, proceed if required */
                        "mov       %%ebx, %%esi \n\t"   /* restore leftmost current row Src  address */
                        "mov       %%edx, %%edi \n\t"   /* restore leftmost current row Dest address */
                        "add       %%eax, %%esi \n\t"   /* move to the next row in Src */
                        "add       %%eax, %%edi \n\t"   /* move to the next row in Dest */
                        "decl                %2 \n\t"   /* decrease loop counter ROWS */
                        "jnz            .L10410 \n\t"   /* check loop termination, proceed if required */
                        /* --- */
                        "emms                   \n\t"   /* exit MMX state */
                        "popa                   \n\t":"=m" (Dest)       /* %0 */
                        :"m"(Src),              /* %1 */
                        "m"(rows),              /* %2 */
                        "m"(columns),           /* %3 */
                        "m"(NRightShift)        /* %4 */
                        );
#endif
                /* The MMX kernel was compiled in and has filled Dest */
                return (0);
#else
                /*
                 * MMX was reported at run time, but the MMX kernel is not part
                 * of this build and there is no C fallback: Dest is untouched,
                 * so report failure instead of success.
                 */
                return (-1);
#endif
        } else {
                /* No non-MMX implementation yet */
                return (-1);
        }
}
07319 
/*
 * SDL_imageFilterAlignStack: move the stack pointer down to a 32-byte boundary.
 *
 * When USE_MMX is defined, the inline assembly computes (ESP - 4) rounded down
 * to a multiple of 32, stores the previous ESP value at that address and then
 * makes it the new ESP, so that on exit [ESP] holds the old stack pointer.
 * When USE_MMX is not defined the function body is empty.
 *
 * Takes no parameters and returns nothing. EBX is overwritten; the GCC variant
 * declares no outputs, inputs or clobbers.
 *
 * NOTE(review): this changes ESP from inside a C function, so its effect
 * depends on the prologue/epilogue the compiler emits for this function;
 * confirm it behaves as intended with the compilers in use.
 * NOTE(review): the guard here is USE_MMX only, while
 * SDL_imageFilterSobelXShiftRight in this file also requires i386 for its
 * assembly; confirm the 32-bit register code is never built for other targets.
 */
07323 void SDL_imageFilterAlignStack(void)
07324 {
07325 #ifdef USE_MMX
07326 #if !defined(GCC__)
07327         __asm
07328         {                               /* --- stack alignment --- */
07329                 mov ebx, esp    /* load ESP into EBX */
07330                         sub ebx, 4      /* reserve 4 bytes for the old value of ESP */
07331                         and ebx, -32    /* round EBX down to a 32 byte boundary */
07332                         mov [ebx], esp          /* save old value of ESP at the aligned address */
07333                         mov esp, ebx    /* ESP now sits on the 32 byte boundary */
07334         }
07335 #else
07336         asm volatile
07337                 (                               /* --- stack alignment --- */
07338                 "mov       %%esp, %%ebx \n\t"   /* load ESP into EBX */
07339                 "sub          $4, %%ebx \n\t"   /* reserve 4 bytes for the old value of ESP */
07340                 "and        $-32, %%ebx \n\t"   /* round EBX down to a 32 byte boundary */
07341                 "mov     %%esp, (%%ebx) \n\t"   /* save old value of ESP at the aligned address */
07342                 "mov       %%ebx, %%esp \n\t"   /* ESP now sits on the 32 byte boundary */
07343                 ::);
07344 #endif
07345 #endif
07346 }
07347 
/*
 * SDL_imageFilterRestoreStack: reload the stack pointer from the value stored
 * at the top of the stack.
 *
 * When USE_MMX is defined, the inline assembly reads the 32-bit value at [ESP]
 * into EBX and copies it into ESP. When USE_MMX is not defined the function
 * body is empty.
 *
 * Takes no parameters and returns nothing. EBX is overwritten; the GCC variant
 * declares no outputs, inputs or clobbers.
 *
 * NOTE(review): this only restores a meaningful value if [ESP] still holds a
 * saved stack pointer at the moment the assembly runs (presumably the one
 * written by SDL_imageFilterAlignStack); whether that holds depends on what
 * the compiler pushes in this function's prologue - confirm before use.
 */
07351 void SDL_imageFilterRestoreStack(void)
07352 {
07353 #ifdef USE_MMX
07354 #if !defined(GCC__)
07355         __asm
07356         {                               /* --- restoring old stack --- */
07357                 mov ebx, [esp]          /* load the saved (old) value of ESP */
07358                 mov esp, ebx    /* make it the current ESP again */
07359         }
07360 #else
07361         asm volatile
07362                 (                               /* --- restoring old stack --- */
07363                 "mov     (%%esp), %%ebx \n\t"   /* load the saved (old) value of ESP */
07364                 "mov       %%ebx, %%esp \n\t"   /* make it the current ESP again */
07365                 ::);
07366 #endif
07367 #endif
07368 }