# base run length
# number of extra run length bits (0 - 12)
# base value
# number of extra value bits (0 - 9)
# number of extra sign bits (0 or 1)
token_array = [
[1, 0, 'eob', 0, 0],
[2, 0, 'eob', 0, 0],
[3, 0, 'eob', 0, 0],
[4, 2, 'eob', 0, 0],
[8, 3, 'eob', 0, 0],
[16, 4, 'eob', 0, 0],
[0, 12, 'eob', 0, 0],
[0, 3, 0, 0, 0],
[0, 6, 0, 0, 0],
[0, 0, 1, 0, 0],
[0, 0, -1, 0, 0],
[0, 0, 2, 0, 0],
[0, 0, -2, 0, 0],
[0, 0, 3, 0, 1],
[0, 0, 4, 0, 1],
[0, 0, 5, 0, 1],
[0, 0, 6, 0, 1],
[0, 0, 7, 1, 1],
[0, 0, 9, 2, 1],
[0, 0, 13, 3, 1],
[0, 0, 21, 4, 1],
[0, 0, 37, 5, 1],
[0, 0, 69, 9, 1],
[1, 0, 1, 0, 1],
[2, 0, 1, 0, 1],
[3, 0, 1, 0, 1],
[4, 0, 1, 0, 1],
[5, 0, 1, 0, 1],
[6, 2, 1, 0, 1],
[10, 3, 1, 0, 1],
[1, 0, 2, 1, 1],
[2, 1, 2, 1, 1] ]
parse the tokens
this function returns a run length & value based on the token & extended bits:
def parsetoken(huf):
global token_array #(not strictly necessary for read-only in Python)
token = readtoken(huf) #read a token from the stream
table = token_array[token] #get our table of parameters for this token
run = table[0] #base run length
run_extra = table[1] #number of extra bits for run length
value = table[2] #actual value
val_extra = table[3] #number of extra value bits
sign_extra = table[4] #number of sign bits
sign = 1
if sign_extra:
if readbits(1):
sign = -1 #if there's a sign bit, get it. 1 means negative
#note that value may be negative to begin with, in
#which case there are no extra value or sign bits
if val_extra:
value += readbits(val_extra) #get extra value bits
if run_extra:
run += readbits(run_extra) #get extra run bits
return [run, value * sign] #return run length and value
#note: 'eob' * 1 == 'eob' in Python, so the string marker passes through the multiply unharmed
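to make the table concrete, here is a small self-contained sketch (illustrative only -- demo_parse and its hand-fed bit list are not part of the decoder). It walks table entry 18, [0, 0, 9, 2, 1], by hand: no extra run bits, base value 9, two extra value bits, one sign bit, consumed in the same order parsetoken() reads them (sign first, then value bits, then run bits), assuming readbits() delivers multi-bit fields most-significant-bit first:

def demo_parse(entry, bits):
    # 'bits' is a list of the extra bits, in read order, mirroring parsetoken()
    run, run_extra, value, val_extra, sign_extra = entry
    pos = 0
    sign = 1
    if sign_extra:
        if bits[pos]: sign = -1          # 1 means negative
        pos += 1
    if val_extra:                        # extra value bits, MSB first
        extra = 0
        for b in bits[pos:pos+val_extra]:
            extra = (extra << 1) | b
        value += extra
        pos += val_extra
    if run_extra:                        # extra run-length bits, MSB first
        extra = 0
        for b in bits[pos:pos+run_extra]:
            extra = (extra << 1) | b
        run += extra
    return [run, value * sign]

# sign bit = 1 (negative), extra value bits = '10' (2): value 9 + 2 = 11 -> -11
assert demo_parse([0, 0, 9, 2, 1], [1, 1, 0]) == [0, -11]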
routines to read & parse codec headers
routine to parse the Theora info header:
def read_info_header():
global huffs, encoded_width, encoded_height, decode_width, decode_height, offset_x, offset_y
global fps_numerator, fps_denominator, aspect_numerator, aspect_denominator, quality, bitrate
global version_major, version_minor, version_subminor, colorspace, infoflag
version_major = readbits(8) #major & minor version must be exact match
if version_major != 3:
spec_err("incompatible major version#")
version_minor = readbits(8)
if version_minor != 2:
spec_err("incompatible minor version#")
version_subminor = readbits(8)
encoded_width = readbits(16) << 4 #encoded width & height are in block units of 16x16
encoded_height = readbits(16) << 4
decode_width = readbits(24) #decode width & height are in actual pixels
decode_height = readbits(24)
offset_x = readbits(8) #offset for cropping if decode != full encoded frame
offset_y = readbits(8)
fps_numerator = readbits(32) #frames per second encoded as a fraction
fps_denominator = readbits(32)
aspect_numerator = readbits(24) #aspect not used now
aspect_denominator = readbits(24)
colorspace = readbits(8) #colorspace flag defines YUV to RGB mapping
bitrate = readbits(24) #target bitrate; not used for decode
quality = readbits(6) #target quality also not used for decode
keyframe_granulepos_shift = readbits(5) #P-frame granulepos field size; not used for decode
readbits(5) #5 bits set aside for future use
infoflag = 1
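as a hedged, numbers-invented example of how these fields relate: a 324x242 picture would be carried in a 336x256 coded frame (the next multiples of 16), with the decode size and offsets describing the crop window:

# hypothetical geometry (local names so the real globals are not disturbed)
ew, eh = 21 << 4, 16 << 4      # header stores 21 and 16 -> 336 x 256 coded frame
dw, dh = 324, 242              # the displayable picture
ox, oy = 0, 0                  # crop offset of the picture inside the coded frame
assert dw <= ew and dh <= eh
assert ox + dw <= ew and oy + dh <= eh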
parse the comment header:
def read_comment_header():
global vendor_string, vendor_string_len, comment_string, comment_string_len
vendor_string_len = readOgg32()
vendor_string = readstring(vendor_string_len)
comment_string_len = readOgg32()
comment_string = readstring(comment_string_len)
read & parse the table header:
def read_table_header():
global scale_table_AC, scale_table_DC, Y_quantizer, UV_quantizer, IF_quantizer
global frequency_counts, hufftokens, tableflag
scale_table_AC = [] #64 possible quantizer scalers for AC coeffs
for x in range(64): scale_table_AC.append(readbits(16))
scale_table_DC = [] #64 possible quantizer scalers for DC coeffs
#Note this is unrelated to 64 coeffs in an 8x8 block!
for x in range(64): scale_table_DC.append(readbits(16))
Y_quantizer = [] #quantizers for intra Y coeff (this IS about 8x8 blocks!)
for x in range(64): Y_quantizer.append(readbits(8))
UV_quantizer = [] #quantizers for intra U or V coeff
for x in range(64): UV_quantizer.append(readbits(8))
IF_quantizer = [] #quantizers for interframe coeffs (Y, U, or V)
for x in range(64): IF_quantizer.append(readbits(8))
for x in range(80): #Read in huffman tables
huffs.append([])
hufftokens=0
read_hufftable(huffs[x])
tableflag = 1
def decode_header():
header_type = readbits(7)
cid = readstring(6)
if cid != "theora":
spec_err("not a theora stream header %s", cid)
if header_type == 0:
read_info_header()
return "info"
elif header_type == 1:
read_comment_header()
return "comment"
elif header_type == 2:
read_table_header()
return "table"
else:
print "unknown stream header type -- skipping"
return "unknown"
routine to process a new stream
def decode_new_stream() :
global page, stream, buffer, streams, streamid
codecs = (("\x80theora", "Theora"), ("\x01vorbis", "Vorbis"),
("Speex", "Speex"), ("fLaC", "FLAC"), ("\x00writ", "Writ"))
serialno = page.serialno
temp_stream = ogg2.OggStreamState(serialno)
temp_stream.pagein(page)
page = None
packet = temp_stream.packetout()
if packet == None :
spec_err('first page of a bitstream must contain one complete packet')
buffer = ogg2.OggPackBuffB(packet)
magic = readstring(7)
for c in codecs:
if magic[:len(c[0])] == c[0] :
streams[serialno] = c[1]
if c[1] == "Theora" :
if streamid == None :
print "Theora stream found (serialno %d), will decode" % serialno
stream = temp_stream
streamid = serialno
read_info_header()
output("info")
else :
print "another Theora stream found (serialno %d), will ignore" % serialno
else :
print "%s stream found (serialno %d), will ignore" % (c[1], serialno)
break
Routines that decode video
[NOTE: for now, these routines only handle keyframes. We may modify or add routines to support interframe data]
[NOTE: each frame of video resides in a single logical Ogg packet; how packets are split across Ogg pages is irrelevant to the decoder.]
Hilbert ordering
All data in Theora is organized into 8x8 blocks. When encoding the data, these blocks are further grouped into 'super-blocks' of 16 blocks each, and encoded in 'Hilbert' order. Each super-block consists of up to 16 blocks encoded in the following order:
X -> X    X -> X
     |    ^
     v    |
X <- X    X <- X
|              ^
v              |
X    X -> X    X
|    ^    |    ^
v    |    v    |
X -> X    X -> X
(thanks to Mike Melanson for the diagram)
If any block is not coded (due to clipping, for instance -- encoded images can include partial super-blocks), the pattern simply continues until the next coded block is reached. Each pixel plane -- Y, V, and U -- is encoded using its own pattern of super-blocks.
By way of example: if a plane consisted of 32x16 pixels, only the top half of the Hilbert pattern would be used. If the eight 8x8 blocks in this example are labeled like this:
A B C D
E F G H
then they will be encoded in the following order: A, B, F, E, H, G, C, D.
the following array & function are used to 'de-hilbertize' the data. [note: this is a keyframe-only routine right now]
hilbert = [
[0,0], [1,0], [1,1], [0,1],
[0,2], [0,3], [1,3], [1,2],
[2,2], [2,3], [3,3], [3,2],
[3,1], [2,1], [2,0], [3,0] ]
note the use of Python style integer division, //
(this is important as Python's / operator behaves differently in various versions!)
def de_hilbert(w, h, colist): #width, height, coefficient list
sbw = (w+3) // 4 #super-block width (width in sb's)
sbh = (h+3) // 4 #super-block height (height in sb's)
ii = 0
comap = []
for x in range(w): #initialize coefficient map
comap.append([])
for y in range(h):
comap[x].append([])
for y in range(sbh):
for x in range(sbw):
for i in range(16):
p, q = hilbert[i] #nice Python syntax
xx = x*4 + p
yy = y*4 + q
if (xx < w) & (yy < h): #skip stuff out of range
comap[xx][yy] = colist[ii] #if in range, get a coeff
ii += 1
return comap
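as a quick check of the routine (a usage sketch only, using the 32x16-pixel example above): the plane is 4 blocks wide and 2 blocks high, and feeding the blocks in their coded order A, B, F, E, H, G, C, D reproduces the raster layout:

# sketch: verify the 32x16 example above (w and h are in blocks here)
labels_in_coded_order = ['A', 'B', 'F', 'E', 'H', 'G', 'C', 'D']
m = de_hilbert(4, 2, labels_in_coded_order)
# m[x][y] now holds the label of the block at column x, row y:
assert [m[x][0] for x in range(4)] == ['A', 'B', 'C', 'D']
assert [m[x][1] for x in range(4)] == ['E', 'F', 'G', 'H']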
decoding the DC coefficients
The DC values are simply the zero-order coefficients of each 8x8 block. These values tend to have more entropy than most AC components, so in addition to quantization, it is desirable to use delta coding to reduce the data requirement of encoding them.
Theora uses a scheme where each encoded DC value is in fact a difference between a predicted value and the actual value. Since the blocks are coded in raster order, the predicted value can be any combination of DC values of blocks to the left, up, upper left, and upper right.
routine to do DC prediction on a single color plane:
def DCpredict(comap, w, h):
first row is straight delta coding:
l = 0 #l = DC coeff to my left; init to zero
for x in range(w):
l += comap[x][0][0]
comap[x][0][0] = l
now the rest:
for y in range(h):
for x in range(w):
if y>0: #already got the first row
if x == 0: #left column?
u = comap[0][y-1][0] #u = upper
ur = comap[1][y-1][0] #ur = upper-right (fetched but not used by this keyframe-only predictor)
p = u
comap[0][y][0] += p #add predictor to decoded value
else: #general case -- neither left column nor top row
l = comap[x-1][y][0] #l = left
ul = comap[x-1][y-1][0] #ul = upper-left
u = comap[x][y-1][0] #u = upper
p = (l*29 + u*29 - ul*26) #compute weighted predictor
if p < 0:
p += 31 #round towards zero
p >>= 5 #shift by weight multiplier
if abs(p-u) > 128: #range checking
p = u;
if abs(p-l) > 128:
p = l;
if abs(p-ul) > 128:
p = ul;
comap[x][y][0] += p #add predictor to decoded value
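for example (numbers invented), with a left neighbor of 10, an upper neighbor of 12, and an upper-left neighbor of 11, the weighted predictor works out to 11:

# worked example of the weighted predictor (values invented)
l, u, ul = 10, 12, 11
p = l*29 + u*29 - ul*26        # 290 + 348 - 286 = 352
if p < 0:
    p += 31                    # (not needed here; p is positive)
p >>= 5                        # 352 >> 5 = 11
assert p == 11                 # this predictor is added to the decoded delta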
Dequantization logic
(thanks again to Mike Melanson for this exposition)
Setting Up The Dequantizers
Theora has three static tables for dequantizing fragments -- one for intra Y fragments, one for intra C fragments, and one for inter Y or C fragments. In the following code, these tables are loaded as Y_quantizer[], UV_quantizer[], and IF_quantizer[] (see the definition of read_table_header above). However, these tables are adjusted according to the value of quality_index.
quality_index is an index into 2 64-element tables: scale_table_DC[] and scale_table_AC[]. Each dequantizer from the three dequantization tables is adjusted by the appropriate scale factor according to this formula:
Scale dequantizers:
dequantizer * sf
----------------
100
where sf = scale_table_DC[quality_index] for DC dequantizer
scale_table_AC[quality_index] for AC dequantizer
(Note that the dequantize routine also multiplies each coefficient by 4. This is to facilitate the iDCT later on.)
def dequant(data, scaleDC, scaleAC, dqtable):
mul = ( (scaleDC * dqtable[0]) // 100 ) * 4
for x in range(64):
if x>0:
mul = ( (scaleAC * dqtable[x]) // 100 ) * 4
data[x] *= mul
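a hypothetical example of the scaling (all numbers invented -- the real ones come from the table header and the frame's quality_index):

scaleDC, scaleAC = 40, 30
dqtable0, dqtable1 = 16, 12                   # first two entries of a quantizer table
mul_dc = ((scaleDC * dqtable0) // 100) * 4    # (640 // 100) * 4 = 24
mul_ac = ((scaleAC * dqtable1) // 100) * 4    # (360 // 100) * 4 = 12
assert (mul_dc, mul_ac) == (24, 12)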
ZIG-ZAG order
The coefficients in each 8x8 DCT coded block are laid out in 'zig-zag' order.
The following table shows the order in which the 64 coefficients are coded:
zigzag=[
0, 1, 5, 6, 14, 15, 27, 28,
2, 4, 7, 13, 16, 26, 29, 42,
3, 8, 12, 17, 25, 30, 41, 43,
9, 11, 18, 24, 31, 40, 44, 53,
10, 19, 23, 32, 39, 45, 52, 54,
20, 22, 33, 38, 46, 51, 55, 60,
21, 34, 37, 47, 50, 56, 59, 61,
35, 36, 48, 49, 57, 58, 62, 63]
this routine remaps the coefficients to their original order:
def unzig(data,un):
for x in range(64):
un[x] = data[zigzag[x]]
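a small usage sketch: label the coefficients by the order they were decoded in, and unzig puts each one at its natural row-major position:

coded = list(range(64))            # coefficient i was the i-th one decoded
natural = [0] * 64
unzig(coded, natural)
assert natural[0] == 0             # DC stays put
assert natural[1] == 1             # natural (row 0, col 1) was coded 2nd
assert natural[8] == 2             # natural (row 1, col 0) was coded 3rd
assert natural[16] == 3            # natural (row 2, col 0) was coded 4th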
inverse DCT (iDCT)
Once coefficients are re-ordered and dequantized, the iDCT is performed on the 8x8 matrix to produce the actual pixel values (or differentials for predicted blocks).
Theora's particular choice of iDCT computation involves intermediate values that are calculated using 32-bit, fixed-point arithmetic. Multiplication is defined as follows (assuming 32-bit integer parameters):
def Mul_fix(a, b):
return (a * b) >> 16
addition and subtraction can be performed normally.
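the multiplier constants used below are simply cosine values scaled by 2^16 (for instance 46341 is 65536/sqrt(2), rounded), so Mul_fix(46341, x) computes roughly x/sqrt(2). A quick illustrative check:

assert Mul_fix(46341, 65536) == 46341          # 1.0 * 0.7071... in 16.16 fixed point
assert Mul_fix(46341, 1000) == 707             # ~ 1000 / sqrt(2)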
First we define a one-dimensional iDCT. Note that there are many implementations of iDCT available. Theora compatibility requires that the output of your iDCT routine be bitwise equivalent to the one outlined here:
def idct_1D(data, i, stride):
A = Mul_fix(64277, data[i+stride]) + Mul_fix(12785, data[i+7*stride])
B = Mul_fix(12785, data[i+stride]) - Mul_fix(64277, data[i+7*stride])
C = Mul_fix(54491, data[i+3*stride]) + Mul_fix(36410, data[i+5*stride])
D = Mul_fix(54491, data[i+5*stride]) - Mul_fix(36410, data[i+3*stride])
A2 = Mul_fix(46341, A - C)
B2 = Mul_fix(46341, B - D)
C2 = A + C
D2 = B + D
E = Mul_fix(46341, data[i] + data[i+4*stride])
F = Mul_fix(46341, data[i] - data[i+4*stride])
G = Mul_fix(60547, data[i+2*stride]) + Mul_fix(25080, data[i+6*stride])
H = Mul_fix(25080, data[i+2*stride]) - Mul_fix(60547, data[i+6*stride])
E2 = E - G
G2 = E + G
A3 = F + A2
B3 = B2 - H
F2 = F - A2
H2 = B2 + H
data[i] = G2 + C2
data[i+stride] = A3 + H2
data[i+2*stride] = A3 - H2
data[i+3*stride] = E2 + D2
data[i+4*stride] = E2 - D2
data[i+5*stride] = F2 + B3
data[i+6*stride] = F2 - B3
data[i+7*stride] = G2 - C2
2D iDCT is performed first on rows, then columns (note order can affect lower bits!)
(Note that we dequantized all coefficients to 4 times their real value. Since each coefficient is run through the iDCT twice (horizontal & vertical), the final values must be divided by 16.)
def idct(data):
for y in range(8):
idct_1D(data, y*8, 1);
for x in range(8):
idct_1D(data, x, 8);
for x in range(64):
data[x] = (data[x] + 8) >> 4 #add for rounding; /16 (remember dequant was *4!)
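as a sanity sketch using the routines above (the input value is invented): a block whose only nonzero coefficient is the DC term comes out flat, at roughly 1/32 of the value fed in (which already carries the extra factor of 4 from dequant):

block = [0] * 64
block[0] = 800             # a dequantized DC term (already includes the *4)
idct(block)
assert block == [25] * 64  # flat output, roughly 800 / 32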
loop filter
the Theora loop filter is run along every horizontal and vertical edge between blocks where at least one of the two adjacent blocks is coded. In the keyframe case, this means every edge except the borders of the frame. For predicted frames, the only edges that are not filtered are those between two uncoded blocks (because they were filtered at some point previously, when those blocks were originally reconstructed)...
clamping routine
def clampit(n, lo, hi):
if n < lo:
return lo
if n > hi:
return hi
return n
first, define an array of quality-dependent filter parameters:
loopfilter_array = [
30, 25, 20, 20, 15, 15, 14, 14,
13, 13, 12, 12, 11, 11, 10, 10,
9, 9, 8, 8, 7, 7, 7, 7,
6, 6, 6, 6, 5, 5, 5, 5,
4, 4, 4, 4, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 ]
next, we define a step function with five linear segments:
def loopfunc(n):
global loopfilter_array
global quality_index #from the frame header
K = loopfilter_array[quality_index]
if (n <= -K*2) | (n >= K*2):
return 0
elif (n > -K*2) & (n < -K):
return -2*K - n
elif (n >= -K) & (n <= K):
return n
else:
return 2*K - n
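for illustration only (this is not decoder code -- we just poke the function with a quality_index that gives K = 10): small differences pass through unchanged, medium ones are ramped back down, and large ones are ignored:

quality_index = 14                 # loopfilter_array[14] == 10
assert loopfunc(4)   == 4          # small differences pass through
assert loopfunc(15)  == 5          # medium ones are pulled back: 2*10 - 15
assert loopfunc(-15) == -5         # symmetric on the negative side
assert loopfunc(25)  == 0          # large differences are left alone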
now the 1D filter functions:
def filter_horiz(pixmap, x, y):
for i in range(8):
A = pixmap[x-2][y+i]
B = pixmap[x-1][y+i]
C = pixmap[x][y+i]
D = pixmap[x+1][y+i]
A = clampit(A, -128, 127)
B = clampit(B, -128, 127)
C = clampit(C, -128, 127)
D = clampit(D, -128, 127)
N = ( 4+(A - B*3 + C*3 - D)) >> 3
delta = loopfunc(N)
pixmap[x-1][y+i] = B + delta
pixmap[x][y+i] = C - delta
def filter_vert(pixmap, x, y):
for i in range(8):
A = pixmap[x+i][y-2]
B = pixmap[x+i][y-1]
C = pixmap[x+i][y]
D = pixmap[x+i][y+1]
A = clampit(A, -128, 127)
B = clampit(B, -128, 127)
C = clampit(C, -128, 127)
D = clampit(D, -128, 127)
N = ( 4+(A - B*3 + C*3 - D)) >> 3
delta = loopfunc(N)
pixmap[x+i][y-1] = B + delta
pixmap[x+i][y] = C - delta
full filter (keyframe case only):
def loopfilter(pixmap, w, h):
for y in range(h>>3):
for x in range(w>>3):
xx = x*8
yy = y*8
if xx > 0: #filter the vertical edge to the left of this block (skip the frame's left border)
filter_horiz(pixmap, xx, yy)
if yy > 0: #filter the horizontal edge above this block (skip the top border)
filter_vert(pixmap, xx, yy)
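a usage sketch of the whole filter (illustrative only): two 8x8 blocks side by side with a hard step at the boundary get the two columns nearest the edge pulled toward each other:

# a 16x8 pixel map with a hard step at the block boundary; values are in the
# signed domain used by the decoder (before the +128 in pixels2chars)
quality_index = 14                               # K = 10, as above
pixmap = [[(-20 if x < 8 else 20) for y in range(8)] for x in range(16)]
loopfilter(pixmap, 16, 8)
# the columns on either side of the boundary move toward each other (delta = 10 here):
assert pixmap[7][0] == -10 and pixmap[8][0] == 10
assert pixmap[6][0] == -20 and pixmap[9][0] == 20   # outer columns untouched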
Pixel management routines
this routine converts an array of 8x8 blocks of data into a pixel array
def blocks2pixels(data, w, h, dx, dy):
pix = [ [0 for y in range(h)] for x in range(w) ] #initialize 2D array, pix[w][h]
for y in range(h):
for x in range(w):
xx = x+dx
yy = y+dy
bx = xx >> 3
by = yy >> 3
ix = xx % 8
iy = yy % 8
p = data[bx][by][ix + iy*8]
pix[x][h-y-1] = p # The h-y-1 trick inverts the frame
return pix
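a sketch of the index arithmetic for a single pixel (offsets invented): with dx = 8 and dy = 0, output pixel (3, 10) is fetched from block (1, 1), intra-block position (3, 2), and stored upside-down:

dx, dy = 8, 0
x, y = 3, 10
xx, yy = x + dx, y + dy         # position inside the padded frame: (11, 10)
bx, by = xx >> 3, yy >> 3       # -> block column 1, block row 1
ix, iy = xx % 8, yy % 8         # -> position (3, 2) inside that 8x8 block
assert (bx, by, ix, iy) == (1, 1, 3, 2)
# the sample is then written to pix[x][h - y - 1], flipping the image vertically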
one last helper -- turns a pixel map (a 2D array indexed [x][y]) into a flat array of byte values, clamped to 0-255:
def pixels2chars(pixels):
h = len(pixels[0])
w = len(pixels)
data = [] #initialize flat array, width * height entries long
for y in range(h):
for x in range(w):
p = pixels[x][y]
p += 128
p = clampit(p, 0, 255)
data.append(p) #add it to the list
return data
*** DECODING THE FRAME ***
ok, let's do it!
def decode_frame():
print
print "DECODING FRAME"
global quality_index, infoflag, tableflag
if (infoflag == 0) | (tableflag == 0): #if info & table not initialized
spec_err("stream parameters not initialized -- missing info or table headers?")
First, we decode the frame header:
is_predicted = readbits(1)
print "is_predicted:", is_predicted
quality_index = readbits(6)
print "quality_index =", quality_index
spare_q_bit = readbits(1)
if spare_q_bit == 1 :
spec_err("spare QI bit is set, I don't know what to do!")
scalefactor_AC = scale_table_AC[quality_index] #(ThisFrameQualityValue in C)
print "scalefactor_AC =", scalefactor_AC
scalefactor_DC = scale_table_DC[quality_index]
print "scalefactor_DC =", scalefactor_DC
if is_predicted == 0: #0 = keyframe, 1 = interframe
OK, this is a keyframe. That means we just have 'intra' coded blocks.
print "decoding keyframe"
keyframe_type = readbits(1) #keyframe type always == 1 (DCT) (for now)
readbits(2) #2 unused bits
compute some values based on width & height:
blocks_Y = encoded_height*encoded_width // 64 #Y blocks coded
blocks_UV = blocks_Y // 2
blocks_U = blocks_UV // 2
blocks_V = blocks_U
blocks_tot = blocks_Y + blocks_UV #Y and UV blocks coded
initialize a map of coefficients. For each coded block, we will eventually have 64:
global coeffs
coeffs = [ [] for x in range(blocks_tot) ]
Theora encodes each coefficient for every block in sequence, i.e. first we decode all the DC coefficients, then coefficient #1, then #2, and so on.
Also, as we go higher into the coefficient index, we will use different huffman tables:
for i in range(64): #For each coefficient index,
if i == 0: #get DC huffman tables for coeff=0
huff_Y = readbits(4)
huff_UV = readbits(4)
elif i == 1:
huff_Y = readbits(4)+16 #get AC huff tables at i=1
huff_UV = readbits(4)+16
elif i == 6: #update at 6, 15, and 28
huff_Y += 16
huff_UV += 16
elif i == 15:
huff_Y += 16
huff_UV += 16
elif i == 28:
huff_Y += 16
huff_UV += 16
now, for every block, decode our coefficient:
for x in range(blocks_tot):
if x < blocks_Y: #if we're in the Y plane,
huff = huff_Y #use huff_Y
else: #or,
huff = huff_UV #huff_UV for chroma planes
first check whether this coefficient was already decoded (because of an end-of-block run or a zero run):
if len(coeffs[x]) <= i: #if this coeff has not been set
if not, get a token:
run, val = parsetoken(huffs[huff])
if this is an end-of-block token, that means we have a run of blocks to mark as fully decoded:
if val == 'eob': #eob = End Of Block run
xx = x #temporary block index starts with x
for r in range(run): #clear (run) blocks
done = len(coeffs[xx]) #this many coeffs are set in this block
remain = 64 - done #this many remain
for j in range (remain): #for all remaining coeffs
coeffs[xx].append(0) #set to zero
ii = i #temporary coeff index starts with i
while (len(coeffs[xx]) >ii) and ii<64: #find next candidate block for eob treatment
xx += 1 #next block
if xx == blocks_tot: #if we wrapped around,
xx = 0 #back to block zero
ii += 1 #and next coeff
otherwise the token represents a run of zeros followed by a value:
else: #zero run + value
for r in range (run): #a run of zeros
coeffs[x].append(0)
coeffs[x].append(val) #followed by a val
now 'de-hilbertize' coefficient blocks:
Yheight = encoded_height//8
Ywidth = encoded_width//8
UVheight = Yheight//2
UVwidth = Ywidth//2
global comapY, comapU, comapV
comapY = de_hilbert(Ywidth, Yheight, coeffs)
comapU = de_hilbert(UVwidth, UVheight, coeffs[blocks_Y:])
comapV = de_hilbert(UVwidth, UVheight, coeffs[(blocks_Y+blocks_U):])
next, we need to reverse the DC prediction-based delta coding:
DCpredict(comapY, Ywidth, Yheight)
DCpredict(comapV, UVwidth, UVheight)
DCpredict(comapU, UVwidth, UVheight)
finally reorder, dequantize, and iDCT all the coefficients, for each color plane:
temp = [0 for x in range(64)] #temporary array
for y in range(Yheight):
for x in range(Ywidth):
unzig (comapY[x][y], temp)
dequant(temp, scalefactor_DC, scalefactor_AC, Y_quantizer)
idct(temp)
comapY[x][y] = temp + [] #Python quirk -- force assignment by copy (not reference)
for y in range(UVheight):
for x in range(UVwidth):
unzig (comapU[x][y], temp)
dequant(temp, scalefactor_DC, scalefactor_AC, UV_quantizer)
idct(temp)
comapU[x][y] = temp + []
for y in range(UVheight):
for x in range(UVwidth):
unzig (comapV[x][y], temp)
dequant(temp, scalefactor_DC, scalefactor_AC, UV_quantizer)
idct(temp)
comapV[x][y] = temp + []
convert the image into a raw byte array of planar Y, V, U:
pixY = blocks2pixels(comapY, decode_width, decode_height, offset_x, offset_y)
pixU = blocks2pixels(comapU, decode_width>>1, decode_height>>1, offset_x>>1, offset_y>>1)
pixV = blocks2pixels(comapV, decode_width>>1, decode_height>>1, offset_x>>1, offset_y>>1)
run loop filter:
loopfilter(pixY, decode_width, decode_height)
loopfilter(pixU, decode_width>>1, decode_height>>1)
loopfilter(pixV, decode_width>>1, decode_height>>1)
return the three color planes:
return pixels2chars(pixY), pixels2chars(pixU), pixels2chars(pixV)
Decode Predicted Frame: THIS SECTION UNFINISHED
else:
spec_err("decoding interframe (NOT!)")
coding_scheme = readbits(3)
if coding_scheme == 0:
mode_alphabet = [] #define a list (think of it as an array)
for x in range(8):
mode_alphabet.append(readbits(3)) #add another mode to the list
--end of definition for decode_frame()
Parse Theora packets
Define a function to parse the packet type & call appropriate functions. Returns either a string for header packets, or a tuple of Y, U, and V data for frames:
def decode_packet():
packet_type = readbits(1)
if packet_type == 0:
return decode_frame()
else:
return decode_header()
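for reference (a sketch, not decoder code): combined with the leading packet bit, this is why the three header packets begin with the bytes 0x80, 0x81, and 0x82 followed by the string 'theora' (compare the magic strings in decode_new_stream above):

# the top bit of the first byte is packet_type (1 = header),
# the remaining 7 bits are header_type:
assert 0x80 >> 7 == 1 and 0x80 & 0x7f == 0    # info header
assert 0x81 >> 7 == 1 and 0x81 & 0x7f == 1    # comment header
assert 0x82 >> 7 == 1 and 0x82 & 0x7f == 2    # table header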
MAIN TEST SEQUENCE
let's test our routines by parsing the stream headers and the first frame.
Main Loop
while 1 :
get_page()
if page == None :
print 'No more pages left, done!'
outfile.close()
exit()
if streams.has_key(page.serialno) :
if page.serialno == streamid :
stream.pagein(page)
packet = stream.packetout()
while packet != None :
buffer = ogg2.OggPackBuffB(packet)
output(decode_packet())
packet = stream.packetout()
else :
decode_new_stream()
page = None
output() receives the parsed headers and each decoded frame as the loop above runs.