#include #include #include "dct.h" #include "fdct.h" /*Performs a forward 8 point Type-II DCT transform. The output is scaled by a factor of 2 from the orthonormal version of the transform. _y: The buffer to store the result in. Data will be placed in every 8th entry (e.g., in a column of an 8x8 block). _x: The input coefficients. The first 8 entries are used (e.g., from a row of an 8x8 block).*/ static void fdct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){ ogg_int32_t t[9]; ogg_int32_t r; /*Stage 1:*/ /*0-7 butterfly.*/ t[0]=_x[0]+(ogg_int32_t)_x[7]; /*1-6 butterfly.*/ t[1]=_x[1]+(ogg_int32_t)_x[6]; /*2-5 butterfly.*/ t[2]=_x[2]+(ogg_int32_t)_x[5]; /*3-4 butterfly.*/ t[3]=_x[3]+(ogg_int32_t)_x[4]; t[4]=_x[3]-(ogg_int32_t)_x[4]; t[5]=_x[2]-(ogg_int32_t)_x[5]; t[6]=_x[1]-(ogg_int32_t)_x[6]; t[7]=_x[0]-(ogg_int32_t)_x[7]; /*Stage 2:*/ /*0-3 butterfly.*/ r=t[0]+t[3]; t[3]=t[0]-t[3]; t[0]=r; /*1-2 butterfly.*/ r=t[1]+t[2]; t[2]=t[1]-t[2]; t[1]=r; /*6-5 butterfly.*/ r=t[6]-t[5]; t[6]=OC_DIV2_16(OC_C4S4*(t[6]+t[5])); t[5]=OC_DIV2_16(OC_C4S4*r); /*Stage 3:*/ /*4-5 butterfly.*/ r=t[4]+t[5]; t[5]=t[4]-t[5]; t[4]=r; /*7-6 butterfly.*/ r=t[7]+t[6]; t[6]=t[7]-t[6]; t[7]=r; /*0-1 butterfly.*/ _y[0<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(t[0]+t[1]))); _y[4<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(t[0]-t[1]))); /*3-2 rotation by 6pi/16*/ _y[2<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C2S6*t[3])+OC_DIV2_16(OC_C6S2*t[2])); _y[6<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C6S2*t[3])-OC_DIV2_16(OC_C2S6*t[2])); /*Stage 4:*/ /*7-4 rotation by 7pi/16*/ _y[1<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C1S7*t[7])+OC_DIV2_16(OC_C7S1*t[4])); /*6-5 rotation by 3pi/16*/ _y[5<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C5S3*t[6])+OC_DIV2_16(OC_C3S5*t[5])); _y[3<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C3S5*t[6])-OC_DIV2_16(OC_C5S3*t[5])); _y[7<<3]=(ogg_int16_t)(OC_DIV2_16(OC_C7S1*t[7])-OC_DIV2_16(OC_C1S7*t[4])); } /*Performs a forward 8x8 Type-II DCT transform. The output is scaled by a factor of 4 relative to the orthonormal version of the transform. 
_y: The buffer to store the result in.
    This may be the same as _x.
_x: The input coefficients.*/
void oc_fdct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  const ogg_int16_t *in;
  ogg_int16_t *end;
  ogg_int16_t *out;
  /*Intermediate 8x8 block, written by the row pass described below.*/
  ogg_int16_t w[64];
  /*Transform rows of x into columns of w.*/
  /*NOTE(review): the source text is damaged starting inside this loop header.
    The loop condition runs straight into na==1, and _e (used below) is never
     declared in the visible text, so the rest of this function and the head
     of the following one appear to be missing.
    Judging by the later call fdct8_ext(out,in,rext), the following code is
     presumably the body of fdct8_ext -- TODO confirm.
    Restore the lost text from the upstream source before building; the code
     is left untouched here.*/
  for(in=_x,out=w,end=out+8;outna==1){
    int ci;
    /*While the branch below is still correct for shapes with na==1, we can
       perform the entire transform with just 1 multiply in this case instead
       of 23.*/
    /*Only the first output is computed, from the single input _x[_e->pi[0]];
       the other seven outputs (8 entries apart) are set to zero.*/
    /*NOTE(review): if _x holds signed values (its declaration is not visible
       here), left-shifting a negative value is undefined behavior in ISO C.
      Confirm, and consider multiplying by 8 instead.*/
    _y[0]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(_x[_e->pi[0]]<<3)));
    for(ci=8;ci<64;ci+=8)_y[ci]=0;
  }
  else{
    int zpi;
    int api;
    int nz;
    /*First multiply by the extension matrix to compute the padding values.*/
    nz=8-_e->na;
    /*NOTE(review): damaged text.
      The outer loop over zpi and the head of the inner loop over api are
       fused, and the accumulator v is never declared in view.
      What survives shows v accumulating _e->ext[zpi][api]*_x[_e->pi[api]].*/
    for(zpi=0;zpina;api++)v+=_e->ext[zpi][api]*_x[_e->pi[api]];
    /*The rounded, shifted sum is stored back into _x at index
       _e->pi[zpi+_e->na].*/
    /*NOTE(review): this statement is cut off after the final 1 and fused
       with text that appears to belong to a later function (it refers to
       mask, rmask, cmask, rext and cext, none of which are declared in view).
      Presumably that later function is oc_fdct8x8_border -- TODO confirm.*/
    _x[_e->pi[zpi+_e->na]]= (ogg_int16_t)OC_DIV_ROUND_POW2(v,OC_EXT_SHIFT,1<mask;
  for(ri=0;ri<8;ri++){
    /*Fold the low 8 bits of mask into rmask.*/
    rmask|=mask&0xFF;
    /*NOTE(review): damaged text.
      The shift amount applied to the non-zero flag and the statement that
       advances mask (only its trailing >>=8 survives) are fused.*/
    cmask|=((mask&0xFF)!=0)<>=8;
  }
  /*Find the associated extension info for these shapes.*/
  /*A mask of 0xFF selects a NULL extension pointer; any other mask is looked
     up by a linear scan of OC_EXTENSION_INFO.*/
  if(rmask==0xFF)rext=NULL;
  else for(rext=OC_EXTENSION_INFO;rext->mask!=rmask;){
    /*If we somehow can't find the shape, then just do an unpadded fDCT.
      It won't be efficient, but it should still be correct.*/
    if(++rext>=OC_EXTENSION_INFO+OC_NSHAPES){
      oc_fdct8x8(_y,_x);
      return;
    }
  }
  /*Same lookup for the second mask, with the same fallback.*/
  if(cmask==0xFF)cext=NULL;
  else for(cext=OC_EXTENSION_INFO;cext->mask!=cmask;){
    /*If we somehow can't find the shape, then just do an unpadded fDCT.
      It won't be efficient, but it should still be correct.*/
    if(++cext>=OC_EXTENSION_INFO+OC_NSHAPES){
      oc_fdct8x8(_y,_x);
      return;
    }
  }
  /*Transform the rows.
    We can ignore zero rows without a problem.*/
  /*NOTE(review): damaged text.
    The loop used when rext is NULL is fused with the tail of another loop
     header (only >>=1 survives).
    The surviving body calls fdct8_ext(out,in,rext) when the low bit of ri is
     set.*/
  if(rext==NULL)for(in=_x,out=w,end=out+8;out>=1){
    if(ri&1)fdct8_ext(out,in,rext);
  }
  /*Transform the columns.
We transform even columns that are supposedly zero, because rounding errors
   may make them slightly non-zero, and this will give a more precise
   reconstruction with very small quantizers.*/
  /*NOTE(review): the source text is damaged in the statement below.
    The column loop header runs straight into buffer[_framei], so the end of
     this function and the signature and local declarations of the next one
     are missing from view.
    The code that follows uses _frag, _dct_vals, _ystride, _framei, pix_buf,
     pixels, pixi, x and y, none of which are declared in the visible text.
    Restore the lost text from the upstream source before building; the code
     is left untouched here.*/
  if(cext==NULL)for(in=w,out=_y,end=out+8;outbuffer[_framei];
  /*For border fragments, only copy pixels that are in the displayable region
     of the image.
    The DCT function will compute optimal padding values for the other
     pixels.*/
  if(_frag->border!=NULL){
    ogg_int64_t mask;
    mask=_frag->border->mask;
    /*One mask bit is consumed per pixel, lowest bit first, in the order the
       loops visit the pixels (x varies fastest).
      A set bit stores the pixel value minus 128; a clear bit stores 0 and
       does not read pixels[x] at all.*/
    for(pixi=y=0;y<8;y++){
      for(x=0;x<8;x++,pixi++){
        pix_buf[pixi]=(ogg_int16_t)(((int)mask&1)?pixels[x]-128:0);
        /*This branchless code is (almost) equivalent to the previous line:
           int pmask;
           pmask=-(int)mask&1;
           pix_buf[pixi]=(ogg_int16_t)(pmask&pixels[x]);
          We don't use this code to allow the user to pass in a buffer that
           is the exact size of the displayed image, not the size padded to a
           multiple of 16.
          In the latter case, we might segfault on pixels[x] if it is not
           mapped to a valid page, even though we would discard the value we
           were attempting to read.*/
        mask>>=1;
      }
      /*Advance to the next row of the source image.*/
      pixels+=_ystride;
    }
    oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
  }
  /*Otherwise, copy all the pixels in the fragment and do a normal DCT.*/
  else{
    for(pixi=y=0;y<8;y++){
      /*Every pixel is re-centered by subtracting 128 before the transform.*/
      for(x=0;x<8;x++,pixi++)pix_buf[pixi]=(ogg_int16_t)(pixels[x]-128);
      pixels+=_ystride;
    }
    oc_fdct8x8(_dct_vals,pix_buf);
  }
}

/*A pipeline stage for applying an fDCT to each (non-motion compensated) block
   in a frame.*/

/*Starts the fDCT stage.
  The progress counter y_procd of this stage is reset to 0 for each of the
   three plane indices, and then the next stage, if any, is started.
  _stage: The pipeline stage being started.
  Return: The value returned by the pipe_start callback of the next stage, or
           0 if there is no next stage.*/
static int oc_fdct_pipe_start(oc_enc_pipe_stage *_stage){
  int pli;
  for(pli=0;pli<3;pli++)_stage->y_procd[pli]=0;
  return _stage->next!=NULL?(*_stage->next->pipe_start)(_stage->next):0;
}

/*Does the fDCTs.
This pipeline stage proceeds in a planar fashion.
  _stage:   The fDCT pipeline stage.
  _y_avail: One availability value per plane index (3 entries).
            Only whole multiples of 8 are consumed; the low three bits of each
             entry are masked off.
  Return: 0 on success, or the first negative value returned by the pipe_proc
           callback of the next stage.*/
static int oc_fdct_pipe_process(oc_enc_pipe_stage *_stage,int _y_avail[3]){
  int pli;
  for(pli=0;pli<3;pli++){
    int y_procd;
    int y_avail;
    /*Compute how far we can get in complete fragment rows.*/
    y_procd=_stage->y_procd[pli];
    y_avail=_y_avail[pli]&~7;
    /*If that's farther than we've already gotten, do some fDCTs.*/
    if(y_avail>y_procd){
      oc_fragment_plane *fplane;
      oc_fragment *frags;
      oc_fragment *frag_end;
      oc_fragment_enc_info *efrags;
      int ystride;
      int yfrag0;
      int yrows;
      fplane=_stage->enc->state.fplanes+pli;
      ystride=_stage->enc->state.input[pli].ystride;
      /*Index of the first fragment of the first unprocessed fragment row in
         this plane.*/
      yfrag0=fplane->froffset+(y_procd>>3)*fplane->nhfrags;
      /*Subtraction binds tighter than >>, so this is (y_avail-y_procd)>>3:
         the number of fragment rows to process now.*/
      yrows=y_avail-y_procd>>3;
      frags=_stage->enc->state.frags+yfrag0;
      efrags=_stage->enc->frinfo+yfrag0;
      /*NOTE(review): the do/while below assumes yrows>=1 on entry.
        That holds when y_procd is a multiple of 8; the visible code resets it
         to 0 and only ever advances it by 8.*/
      do{
        /*NOTE(review): the source text is damaged in the loop header below.
          The loop condition, the increment and the start of the per-fragment
           call are missing; only the tail of the call (its last arguments
           dct_coeffs, ystride and OC_FRAME_IO) survives.
          Restore the lost text from the upstream source before building; the
           code is left untouched here.*/
        for(frag_end=frags+fplane->nhfrags;fragsdct_coeffs,ystride,OC_FRAME_IO);
        }
        /*One more fragment row (8 pixel rows) of this plane is done.*/
        _stage->y_procd[pli]+=8;
        /*Hand the updated progress to the next stage after every fragment
           row, and stop immediately if it reports an error.*/
        if(_stage->next!=NULL){
          int ret;
          ret=(*_stage->next->pipe_proc)(_stage->next,_stage->y_procd);
          if(ret<0)return ret;
        }
      }
      while(--yrows);
    }
  }
  return 0;
}

/*Finishes the fDCT stage.
  This stage does no work of its own here; it only forwards the call.
  _stage: The pipeline stage being finished.
  Return: The value returned by the pipe_end callback of the next stage, or 0
           if there is no next stage.*/
static int oc_fdct_pipe_end(oc_enc_pipe_stage *_stage){
  return _stage->next!=NULL?(*_stage->next->pipe_end)(_stage->next):0;
}

/*Initialize the fDCT stage of the pipeline.
  The stage is left with no successor (next is set to NULL) and with its
   three callbacks pointing at the fDCT start/process/end functions above.
  _stage: The stage structure to fill in.
  _enc:   The encoding context.*/
void oc_fdct_pipe_init(oc_enc_pipe_stage *_stage,oc_enc_ctx *_enc){
  _stage->enc=_enc;
  _stage->next=NULL;
  _stage->pipe_start=oc_fdct_pipe_start;
  _stage->pipe_proc=oc_fdct_pipe_process;
  _stage->pipe_end=oc_fdct_pipe_end;
}