~shabbyrobe/grugdct

6c0c1c5364b08c597156852586f030b30f528d74 — Blake Williams 1 year, 8 months ago b9304e5
Try to shave some cycles out of cached cosines
3 files changed, 218 insertions(+), 2 deletions(-)

M cachedcos_8x8.go
A cachedcos_8x8_faster.go
A cachedcos_8x8_faster_test.go
M cachedcos_8x8.go => cachedcos_8x8.go +3 -2
@@ 35,7 35,7 @@ func (cos CachedCosines8x8) DCT8x8Into(img Image, xpos int, ypos int, mat *Matri
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)
	const scale = 0.25                      // 1 / (sqrt(8*8) / 2)

	for v, voff := 0, 0; v < 8; v, voff = v+1, voff+8 {
	for v, mi := 0, 0; v < 8; v++ {
		for u := 0; u < 8; u++ {
			var z float32
			for y := 0; y < 8; y++ {


@@ 55,7 55,8 @@ func (cos CachedCosines8x8) DCT8x8Into(img Image, xpos int, ypos int, mat *Matri
			if v == 0 {
				q *= oneOverSqrt2
			}
			mat[voff+u] = q
			mat[mi] = q
			mi++
		}
	}
}

A cachedcos_8x8_faster.go => cachedcos_8x8_faster.go +102 -0
@@ 0,0 1,102 @@
package grugdct

import (
	"image"
	"math"
)

// cosines8x8FasterLUT32 holds the precomputed cosine terms shared by the
// forward and inverse transforms in this file. Entry [n][k] is
// cos((2n+1) * k * pi / 16), where n is a sample position (0-7) inside the
// block and k is a frequency index (0-7). The value is computed in float64
// and then narrowed to float32.
var cosines8x8FasterLUT32 [8][8]float32 = func() (lut [8][8]float32) {
	const step = math.Pi / 16 // pi / (2 * block width)
	for n := range lut {
		for k := range lut[n] {
			lut[n][k] = float32(math.Cos(float64(2*n+1) * float64(k) * step))
		}
	}
	return lut
}()

// CachedCosines8x8Faster is a space to play around with shaving ops off the
// cached-cosine implementation. It carries no state of its own.
// For Gruggier code, use CachedCosines8x8.
type CachedCosines8x8Faster struct{}

// Compile-time check that CachedCosines8x8Faster satisfies ImageTransform8x8F32.
var _ ImageTransform8x8F32 = CachedCosines8x8Faster{}

// NewCachedCosines8x8Faster returns a ready-to-use CachedCosines8x8Faster.
// The type has no fields, so this is simply its zero value.
func NewCachedCosines8x8Faster() CachedCosines8x8Faster {
	var transform CachedCosines8x8Faster
	return transform
}

// Matrix returns a fresh, zero-valued Matrix8x8F32 that callers can pass to
// DCT8x8Into and IDCT8x8Into as coefficient storage.
func (cos CachedCosines8x8Faster) Matrix() Matrix8x8F32 {
	var blank Matrix8x8F32
	return blank
}

// DCTSize reports the dimensions of the block handled by one transform call:
// X is 8 and Y is 8.
func (cos CachedCosines8x8Faster) DCTSize() image.Point {
	return image.Pt(8, 8)
}

// DCT8x8Into computes the 64 DCT coefficients of the 8x8 block of img whose
// top-left pixel is (xpos, ypos) and stores them in mat.
//
// Coefficients are written in row-major order: the coefficient for vertical
// frequency v and horizontal frequency u lands at mat[v*8+u]. Pixels are read
// from img.Pix at index y*img.W+x. Block positions with x >= img.W or
// y >= img.H are skipped, so they contribute nothing to the sums.
func (cos CachedCosines8x8Faster) DCT8x8Into(img Image, xpos int, ypos int, mat *Matrix8x8F32) {
	const (
		invSqrt2  = 0.7071067811865475 // 1 / sqrt(2)
		normalize = 0.25               // 1 / (sqrt(8*8) / 2)
	)

	for coef := 0; coef < 64; coef++ {
		freqY, freqX := coef/8, coef%8

		// Accumulate row by row, left to right, so the float32 summation
		// order is fixed.
		var acc float32
		for by := 0; by < 8; by++ {
			srcY := ypos + by
			if srcY >= img.H {
				continue
			}
			for bx := 0; bx < 8; bx++ {
				srcX := xpos + bx
				if srcX >= img.W {
					continue
				}
				sample := img.Pix[srcY*img.W+srcX]
				acc += float32(sample) * cosines8x8FasterLUT32[bx][freqX] * cosines8x8FasterLUT32[by][freqY]
			}
		}

		// The zero-frequency row and column each get an extra 1/sqrt(2).
		out := normalize * acc
		if freqX == 0 {
			out *= invSqrt2
		}
		if freqY == 0 {
			out *= invSqrt2
		}
		mat[coef] = out
	}
}

// IDCT8x8Into reconstructs an 8x8 block of pixels from the 64 coefficients in
// mat and writes it into `into` with its top-left pixel at (xpos, ypos).
//
// mat is read in row-major order: the coefficient for vertical frequency v and
// horizontal frequency u is mat[v*8+u]. Each reconstructed value is clamped to
// [0, 255] and rounded to the nearest integer before being stored at
// into.Pix[y*into.W+x]. Block positions with x >= into.W or y >= into.H are
// computed but not written.
func (cos CachedCosines8x8Faster) IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image) {
	const (
		invSqrt2 = 0.7071067811865475 // 1 / sqrt(2)
		denorm   = 4.0                // sqrt(8*8) / 2
	)

	for by := 0; by < 8; by++ {
		for bx := 0; bx < 8; bx++ {
			// Sum every coefficient's contribution to this pixel, walking
			// mat front to back so the float32 summation order is fixed.
			var acc float32
			for k := 0; k < 64; k++ {
				freqY, freqX := k/8, k%8

				term := mat[k] * cosines8x8FasterLUT32[bx][freqX] * cosines8x8FasterLUT32[by][freqY]
				if freqX == 0 {
					term *= invSqrt2
				}
				if freqY == 0 {
					term *= invSqrt2
				}
				acc += term
			}

			acc /= denorm
			switch {
			case acc > 255.0:
				acc = 255.0
			case acc < 0:
				acc = 0.0
			}

			dstX, dstY := xpos+bx, ypos+by
			if dstX < into.W && dstY < into.H {
				// Adding 0.5 rounds to nearest; without it the result is
				// inaccurate. 255.5 truncates to 255, it does not wrap.
				into.Pix[dstY*into.W+dstX] = uint8(acc + 0.5)
			}
		}
	}
}

A cachedcos_8x8_faster_test.go => cachedcos_8x8_faster_test.go +113 -0
@@ 0,0 1,113 @@
package grugdct

import (
	"reflect"
	"testing"
)

// TestCachedCosines8x8FasterRoundtripCases runs every image in TestImages
// through DCT8x8Into followed by IDCT8x8Into, one tile at a time, and requires
// the reconstructed pixels to equal the source pixels exactly.
func TestCachedCosines8x8FasterRoundtripCases(t *testing.T) {
	transform := NewCachedCosines8x8Faster()
	tile := transform.DCTSize()

	for _, timg := range TestImages {
		t.Run(timg.CaseName("cachedcos8x8"), func(t *testing.T) {
			result := NewImage(timg.W, timg.H)
			mat := transform.Matrix()
			for y := 0; y < timg.H; y += tile.Y {
				for x := 0; x < timg.W; x += tile.X {
					transform.DCT8x8Into(timg.Image, x, y, &mat)
					transform.IDCT8x8Into(&mat, x, y, result)
				}
			}

			if !reflect.DeepEqual(timg.Pix, result.Pix) {
				// When debugging, dump both buffers side by side:
				// Print2DUint8s(timg.Pix, timg.W, timg.H, 0)
				// fmt.Println()
				// Print2DUint8s(result.Pix, result.W, result.H, 0)
				// fmt.Println()

				t.Fatalf("DCT/IDCT round trip changed the pixels of the %dx%d image", timg.W, timg.H)
			}
		})
	}
}

// BenchmarkCachedCosines8x8FasterWith8x8 measures the forward ("dct") and
// inverse ("idct") transforms over a 9x9 image. The image is one pixel wider
// and taller than a tile, so each pass covers four tiles, three of which hang
// over the image edge and exercise the bounds checks.
func BenchmarkCachedCosines8x8FasterWith8x8(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewCachedCosines8x8Faster()
	// X is the tile width and Y the tile height.
	tile := transform.DCTSize()
	dctW, dctH := tile.X, tile.Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8Into(img, x, y, &mat)
				}
			}
		}
	})

	// Precompute one coefficient matrix per tile so the "idct" benchmark
	// times only the inverse transform. Tile counts round up to include the
	// partial tiles at the right and bottom edges.
	tilesX := (img.W + dctW - 1) / dctW
	tilesY := (img.H + dctH - 1) / dctH
	result := NewImage(img.W, img.H)
	mats := make([]Matrix8x8F32, 0, tilesX*tilesY)
	for y := 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mat := transform.Matrix()
			transform.DCT8x8Into(img, x, y, &mat)
			mats = append(mats, mat)
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8Into(&mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

// BenchmarkCachedCosines8x8FasterWithASingle8x8DCTCall measures one forward
// transform of an image that is exactly one 8x8 tile, so every sample lies
// inside the image bounds.
func BenchmarkCachedCosines8x8FasterWithASingle8x8DCTCall(b *testing.B) {
	pixels := []uint8{
		0, 128, 255, 22, 0, 128, 255, 22,
		131, 11, 12, 44, 131, 11, 12, 44,
		94, 51, 88, 66, 94, 51, 88, 66,
		0, 128, 255, 22, 131, 11, 12, 44,
		131, 11, 12, 44, 94, 51, 88, 66,
		94, 51, 88, 66, 0, 128, 255, 22,
		131, 11, 12, 44, 131, 11, 12, 44,
		94, 51, 88, 66, 131, 11, 12, 44,
	}
	src := Image{W: 8, H: 8, Pix: pixels}

	dct := NewCachedCosines8x8Faster()
	b.Run("dct", func(sub *testing.B) {
		sub.ReportAllocs()
		coeffs := dct.Matrix()
		for remaining := sub.N; remaining > 0; remaining-- {
			dct.DCT8x8Into(src, 0, 0, &coeffs)
		}
	})
}