/******************************************************************** * * * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * * * * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * * by the Xiph.Org Foundation http://www.xiph.org/ * * * ******************************************************************** function: last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ ********************************************************************/ #include #include "codec_internal.h" #include "dsp.h" #if 0 //These are to let me selectively enable the C versions, these are needed #define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) #define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) #define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b)))) #endif static const ogg_int64_t V128 = 0x0080008000800080LL; static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr, ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, ogg_uint32_t ReconPixelsPerLine) { //Make non-zero to use the C-version #if 0 int i; /* For each block row */ for (i=8; i; i--) { DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]); DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]); DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]); DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]); DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]); DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]); DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]); DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]); /* Start next row */ FiltPtr += PixelsPerLine; ReconPtr += ReconPixelsPerLine; DctInputPtr += 8; } #else __asm { align 16 pxor mm7, mm7 mov eax, FiltPtr mov ebx, ReconPtr mov edx, DctInputPtr /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/ /* ITERATION 1 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 2 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 3 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 4 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 5 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 6 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 7 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine /* ITERATION 8 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm1, [ebx] /* mm1 = ReconPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ movq mm3, mm1 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */ movq [edx], mm0 /* write answer out */ movq [8 + edx], mm2 /* write answer out */ /* Increment pointers */ add edx, 16 add eax, PixelsPerLine add ebx, ReconPixelsPerLine }; #endif } static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine) { #if 0 int i; /* For each block row */ for (i=8; i; i--) { /* INTRA mode so code raw image data */ /* We convert the data to 8 bit signed (by subtracting 128) as this reduces the internal precision requirments in the DCT transform. */ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128); DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128); DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128); DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128); DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128); DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128); DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128); DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128); /* Start next row */ FiltPtr += PixelsPerLine; DctInputPtr += 8; } #else __asm { align 16 pxor mm7, mm7 mov eax, FiltPtr mov ebx, DctInputPtr movq mm1, V128 /* ITERATION 1 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 2 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 3 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 4 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 5 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 6 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 7 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine /* ITERATION 8 */ movq mm0, [eax] /* mm0 = FiltPtr */ movq mm2, mm0 /* dup to prepare for up conversion */ /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */ /* start calculation */ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */ movq [ebx], mm0 /* write answer out */ movq [8 + ebx], mm2 /* write answer out */ /* Increment pointers */ add ebx, 16 add eax, PixelsPerLine }; #endif } static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1, unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, ogg_uint32_t ReconPixelsPerLine) { #if 0 int i; /* For each block row */ for (i=8; i; i--) { DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0])); DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1])); DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2])); DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3])); DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4])); DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5])); DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6])); DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7])); /* Start next row */ FiltPtr += PixelsPerLine; ReconPtr1 += ReconPixelsPerLine; ReconPtr2 += ReconPixelsPerLine; DctInputPtr += 8; } #else __asm { align 16 pxor mm7, mm7 mov eax, FiltPtr mov ebx, ReconPtr1 mov ecx, ReconPtr2 mov edx, DctInputPtr /* ITERATION 1 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 2 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 3 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 4 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 5 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 6 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 7 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; /* ITERATION 8 */ movq mm0, [eax] ; /* mm0 = FiltPtr */ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */ movq mm2, mm0 ; /* dup to prepare for up conversion */ movq mm3, mm1 ; /* dup to prepare for up conversion */ movq mm5, mm4 ; /* dup to prepare for up conversion */ ; /* convert from UINT8 to INT16 */ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */ ; /* average ReconPtr1 and ReconPtr2 */ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ movq [edx], mm0 ; /* write answer out */ movq [8 + edx], mm2 ; /* write answer out */ ; /* Increment pointers */ add edx, 16 ; add eax, PixelsPerLine ; add ebx, ReconPixelsPerLine ; add ecx, ReconPixelsPerLine ; }; #endif } static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2) { #if 0 ogg_uint32_t SadValue; ogg_uint32_t SadValue1; SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + DSP_OP_ABS_DIFF (Src1[1], Src2[1]) + DSP_OP_ABS_DIFF (Src1[2], Src2[2]) + DSP_OP_ABS_DIFF (Src1[3], Src2[3]); SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + DSP_OP_ABS_DIFF (Src1[5], Src2[5]) + DSP_OP_ABS_DIFF (Src1[6], Src2[6]) + DSP_OP_ABS_DIFF (Src1[7], Src2[7]); SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1; return SadValue; #else ogg_uint32_t MaxSad; __asm { align 16 mov ebx, Src1 mov ecx, Src2 pxor mm6, mm6 ; /* zero out mm6 for unpack */ pxor mm7, mm7 ; /* zero out mm7 for unpack */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [ecx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */ punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */ movq mm2, mm0 ; movq mm3, mm1 ; psrlq mm2, 32 ; /* fold and add */ psrlq mm3, 32 ; paddw mm0, mm2 ; paddw mm1, mm3 ; movq mm2, mm0 ; movq mm3, mm1 ; psrlq mm2, 16 ; psrlq mm3, 16 ; paddw mm0, mm2 ; paddw mm1, mm3 ; psubusw mm1, mm0 ; paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */ movd eax, mm1 ; and eax, 0xffff mov MaxSad, eax }; return MaxSad; #endif } static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2, ogg_uint32_t stride) { #if 0 ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0}; ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0}; ogg_uint32_t MaxSad = 0; ogg_uint32_t i; for ( i = 0; i < 4; i++ ){ SadValue[0] += abs(Src1[0] - Src2[0]); SadValue[1] += abs(Src1[1] - Src2[1]); SadValue[2] += abs(Src1[2] - Src2[2]); SadValue[3] += abs(Src1[3] - Src2[3]); SadValue[4] += abs(Src1[4] - Src2[4]); SadValue[5] += abs(Src1[5] - Src2[5]); SadValue[6] += abs(Src1[6] - Src2[6]); SadValue[7] += abs(Src1[7] - Src2[7]); Src1 += stride; Src2 += stride; } for ( i = 0; i < 4; i++ ){ SadValue2[0] += abs(Src1[0] - Src2[0]); SadValue2[1] += abs(Src1[1] - Src2[1]); SadValue2[2] += abs(Src1[2] - Src2[2]); SadValue2[3] += abs(Src1[3] - Src2[3]); SadValue2[4] += abs(Src1[4] - Src2[4]); SadValue2[5] += abs(Src1[5] - Src2[5]); SadValue2[6] += abs(Src1[6] - Src2[6]); SadValue2[7] += abs(Src1[7] - Src2[7]); Src1 += stride; Src2 += stride; } for ( i = 0; i < 8; i++ ){ if ( SadValue[i] > MaxSad ) MaxSad = SadValue[i]; if ( SadValue2[i] > MaxSad ) MaxSad = SadValue2[i]; } return MaxSad; #else ogg_uint32_t MaxSad; __asm { align 16 mov ebx, Src1 mov ecx, Src2 pxor mm3, mm3 ; /* zero out mm3 for unpack */ pxor mm4, mm4 ; /* mm4 low sum */ pxor mm5, mm5 ; /* mm5 high sum */ pxor mm6, mm6 ; /* mm6 low sum */ pxor mm7, mm7 ; /* mm7 high sum */ mov edi, 4 ; /* 4 rows */ label_1: ; movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [ecx] ; /* take 8 bytes */ movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */ paddw mm4, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */ paddw mm5, mm1 ; /* accumulate difference... */ add ebx, stride ; /* Inc pointer into the new data */ add ecx, stride ; /* Inc pointer into the new data */ dec edi ; jnz label_1 ; mov edi, 4 ; /* 4 rows */ label_2: ; movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [ecx] ; /* take 8 bytes */ movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */ paddw mm6, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */ paddw mm7, mm1 ; /* accumulate difference... */ add ebx, stride ; /* Inc pointer into the new data */ add ecx, stride ; /* Inc pointer into the new data */ dec edi ; jnz label_2 ; psubusw mm7, mm6 ; paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */ psubusw mm5, mm4 ; paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */ psubusw mm7, mm5 ; paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */ movq mm6, mm7 ; psrlq mm6, 32 ; psubusw mm7, mm6 ; paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */ movq mm6, mm7 ; psrlq mm6, 16 ; psubusw mm7, mm6 ; paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */ movd eax, mm7 ; and eax, 0xffff ; mov MaxSad, eax }; return MaxSad; #endif } static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1, unsigned char *ptr2, ogg_uint32_t stride2) { #if 0 ogg_uint32_t i; ogg_uint32_t sad = 0; for (i=8; i; i--) { sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); /* Step to next row of block. */ ptr1 += stride1; ptr2 += stride2; } return sad; #else ogg_uint32_t DiffVal; __asm { align 16 mov ebx, ptr1 mov edx, ptr2 pxor mm6, mm6 ; /* zero out mm6 for unpack */ pxor mm7, mm7 ; /* mm7 contains the result */ ; /* ITERATION 1 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 2 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 3 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 4 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 5 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 6 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 7 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ITERATION 8 */ movq mm0, [ebx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, stride1 ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add edx, stride2 ; /* Inc pointer into ref data */ ; /* ------ */ movq mm0, mm7 ; psrlq mm7, 32 ; paddw mm7, mm0 ; movq mm0, mm7 ; psrlq mm7, 16 ; paddw mm7, mm0 ; movd eax, mm7 ; and eax, 0xffff ; mov DiffVal, eax }; return DiffVal; #endif } static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1, unsigned char *ptr2, ogg_uint32_t stride2, ogg_uint32_t thres) { #if 0 ogg_uint32_t i; ogg_uint32_t sad = 0; for (i=8; i; i--) { sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); if (sad > thres ) break; /* Step to next row of block. */ ptr1 += stride1; ptr2 += stride2; } return sad; #else return sad8x8__mmx (ptr1, stride1, ptr2, stride2); #endif } static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, unsigned char *RefDataPtr1, unsigned char *RefDataPtr2, ogg_uint32_t RefStride, ogg_uint32_t thres) { #if 0 ogg_uint32_t i; ogg_uint32_t sad = 0; for (i=8; i; i--) { sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); if ( sad > thres ) break; /* Step to next row of block. */ SrcData += SrcStride; RefDataPtr1 += RefStride; RefDataPtr2 += RefStride; } return sad; #else ogg_uint32_t DiffVal; __asm { align 16 mov ebx, SrcData mov ecx, RefDataPtr1 mov edx, RefDataPtr2 pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */ paddb mm5, mm5 ; ; pxor mm6, mm6 ; /* zero out mm6 for unpack */ pxor mm7, mm7 ; /* mm7 contains the result */ mov edi, 8 ; /* 8 rows */ loop_start: ; movq mm0, [ebx] ; /* take 8 bytes */ movq mm2, [ecx] ; movq mm3, [edx] ; /* take average of mm2 and mm3 */ movq mm1, mm2 ; pand mm1, mm3 ; pxor mm3, mm2 ; pand mm3, mm5 ; psrlq mm3, 1 ; paddb mm1, mm3 ; movq mm2, mm0 ; psubusb mm0, mm1 ; /* A - B */ psubusb mm1, mm2 ; /* B - A */ por mm0, mm1 ; /* and or gives abs difference */ movq mm1, mm0 ; punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */ paddw mm7, mm0 ; /* accumulate difference... */ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */ add ebx, SrcStride ; /* Inc pointer into the new data */ paddw mm7, mm1 ; /* accumulate difference... */ add ecx, RefStride ; /* Inc pointer into ref data */ add edx, RefStride ; /* Inc pointer into ref data */ dec edi ; jnz loop_start ; movq mm0, mm7 ; psrlq mm7, 32 ; paddw mm7, mm0 ; movq mm0, mm7 ; psrlq mm7, 16 ; paddw mm7, mm0 ; movd eax, mm7 ; and eax, 0xffff ; mov DiffVal, eax }; return DiffVal; #endif } static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride) { #if 0 ogg_uint32_t i; ogg_uint32_t XSum=0; ogg_uint32_t XXSum=0; for (i=8; i; i--) { /* Examine alternate pixel locations. */ XSum += DataPtr[0]; XXSum += DataPtr[0]*DataPtr[0]; XSum += DataPtr[1]; XXSum += DataPtr[1]*DataPtr[1]; XSum += DataPtr[2]; XXSum += DataPtr[2]*DataPtr[2]; XSum += DataPtr[3]; XXSum += DataPtr[3]*DataPtr[3]; XSum += DataPtr[4]; XXSum += DataPtr[4]*DataPtr[4]; XSum += DataPtr[5]; XXSum += DataPtr[5]*DataPtr[5]; XSum += DataPtr[6]; XXSum += DataPtr[6]*DataPtr[6]; XSum += DataPtr[7]; XXSum += DataPtr[7]*DataPtr[7]; /* Step to next row of block. */ DataPtr += Stride; } /* Compute population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum ) ); #else ogg_uint32_t XSum; ogg_uint32_t XXSum; __asm { align 16 mov ecx, DataPtr pxor mm5, mm5 ; pxor mm6, mm6 ; pxor mm7, mm7 ; mov edi, 8 ; loop_start: movq mm0, [ecx] ; /* take 8 bytes */ movq mm2, mm0 ; punpcklbw mm0, mm6 ; punpckhbw mm2, mm6 ; paddw mm5, mm0 ; paddw mm5, mm2 ; pmaddwd mm0, mm0 ; pmaddwd mm2, mm2 ; ; paddd mm7, mm0 ; paddd mm7, mm2 ; add ecx, Stride ; /* Inc pointer into src data */ dec edi ; jnz loop_start ; movq mm0, mm5 ; psrlq mm5, 32 ; paddw mm5, mm0 ; movq mm0, mm5 ; psrlq mm5, 16 ; paddw mm5, mm0 ; movd edi, mm5 ; movsx edi, di ; mov eax, edi ; movq mm0, mm7 ; psrlq mm7, 32 ; paddd mm7, mm0 ; movd ebx, mm7 ; mov XSum, eax mov XXSum, ebx; }; /* Compute population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum ) ); #endif } static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, unsigned char *RefDataPtr, ogg_uint32_t RefStride) { #if 0 ogg_uint32_t i; ogg_uint32_t XSum=0; ogg_uint32_t XXSum=0; ogg_int32_t DiffVal; for (i=8; i; i--) { DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]); XSum += DiffVal; XXSum += DiffVal*DiffVal; /* Step to next row of block. */ SrcData += SrcStride; RefDataPtr += RefStride; } /* Compute and return population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum )); #else ogg_uint32_t XSum; ogg_uint32_t XXSum; __asm { align 16 mov ecx, SrcData mov edx, RefDataPtr pxor mm5, mm5 ; pxor mm6, mm6 ; pxor mm7, mm7 ; mov edi, 8 ; loop_start: ; movq mm0, [ecx] ; /* take 8 bytes */ movq mm1, [edx] ; movq mm2, mm0 ; movq mm3, mm1 ; punpcklbw mm0, mm6 ; punpcklbw mm1, mm6 ; punpckhbw mm2, mm6 ; punpckhbw mm3, mm6 ; psubsw mm0, mm1 ; psubsw mm2, mm3 ; paddw mm5, mm0 ; paddw mm5, mm2 ; pmaddwd mm0, mm0 ; pmaddwd mm2, mm2 ; ; paddd mm7, mm0 ; paddd mm7, mm2 ; add ecx, SrcStride ; /* Inc pointer into src data */ add edx, RefStride ; /* Inc pointer into ref data */ dec edi ; jnz loop_start ; movq mm0, mm5 ; psrlq mm5, 32 ; paddw mm5, mm0 ; movq mm0, mm5 ; psrlq mm5, 16 ; paddw mm5, mm0 ; movd edi, mm5 ; movsx edi, di ; mov eax, edi ; movq mm0, mm7 ; psrlq mm7, 32 ; paddd mm7, mm0 ; movd ebx, mm7 ; mov XSum, eax mov XXSum, ebx }; /* Compute and return population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum )); #endif } static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, unsigned char *RefDataPtr1, unsigned char *RefDataPtr2, ogg_uint32_t RefStride) { #if 0 ogg_uint32_t i; ogg_uint32_t XSum=0; ogg_uint32_t XXSum=0; ogg_int32_t DiffVal; for (i=8; i; i--) { DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); XSum += DiffVal; XXSum += DiffVal*DiffVal; DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); XSum += DiffVal; XXSum += DiffVal*DiffVal; /* Step to next row of block. */ SrcData += SrcStride; RefDataPtr1 += RefStride; RefDataPtr2 += RefStride; } /* Compute and return population variance as mis-match metric. */ return (( (XXSum<<6) - XSum*XSum )); #else ogg_uint32_t XSum; ogg_uint32_t XXSum; __asm { align 16 mov ebx, SrcData mov ecx, RefDataPtr1 mov edx, RefDataPtr2 pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */ paddb mm4, mm4 ; pxor mm5, mm5 ; pxor mm6, mm6 ; pxor mm7, mm7 ; mov edi, 8 ; loop_start: ; movq mm0, [ebx] ; /* take 8 bytes */ movq mm2, [ecx] ; movq mm3, [edx] ; /* take average of mm2 and mm3 */ movq mm1, mm2 ; pand mm1, mm3 ; pxor mm3, mm2 ; pand mm3, mm4 ; psrlq mm3, 1 ; paddb mm1, mm3 ; movq mm2, mm0 ; movq mm3, mm1 ; punpcklbw mm0, mm6 ; punpcklbw mm1, mm6 ; punpckhbw mm2, mm6 ; punpckhbw mm3, mm6 ; psubsw mm0, mm1 ; psubsw mm2, mm3 ; paddw mm5, mm0 ; paddw mm5, mm2 ; pmaddwd mm0, mm0 ; pmaddwd mm2, mm2 ; ; paddd mm7, mm0 ; paddd mm7, mm2 ; add ebx, SrcStride ; /* Inc pointer into src data */ add ecx, RefStride ; /* Inc pointer into ref data */ add edx, RefStride ; /* Inc pointer into ref data */ dec edi ; jnz loop_start ; movq mm0, mm5 ; psrlq mm5, 32 ; paddw mm5, mm0 ; movq mm0, mm5 ; psrlq mm5, 16 ; paddw mm5, mm0 ; movd edi, mm5 ; movsx edi, di ; mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/ movq mm0, mm7 ; psrlq mm7, 32 ; paddd mm7, mm0 ; movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */ }; return (( (XXSum<<6) - XSum*XSum )); #endif } static void restore_fpu (void) { __asm { emms } } void dsp_mmx_init(DspFunctions *funcs) { TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n"); funcs->restore_fpu = restore_fpu; funcs->sub8x8 = sub8x8__mmx; funcs->sub8x8_128 = sub8x8_128__mmx; funcs->sub8x8avg2 = sub8x8avg2__mmx; funcs->row_sad8 = row_sad8__mmx; funcs->col_sad8x8 = col_sad8x8__mmx; funcs->sad8x8 = sad8x8__mmx; funcs->sad8x8_thres = sad8x8_thres__mmx; funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx; funcs->intra8x8_err = intra8x8_err__mmx; funcs->inter8x8_err = inter8x8_err__mmx; funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx; }