~shabbyrobe/grugdct

6b7d1494dc92f208562196c50e0b1d5347cad06d — Blake Williams 1 year, 8 months ago 934b9cb
API cleanup to allow CGo version to be competitive
17 files changed, 336 insertions(+), 56 deletions(-)

M aan_cgo.go
M aan_cgo_test.go
M aan_flat.go
M aan_flat_test.go
M aan_std.go
M aan_std_test.go
A basic_8x8.go
A basic_8x8_test.go
R basic.go => basic_std.go
R basic_test.go => basic_std_test.go
R cachedcos8x8.go => cachedcos_8x8.go
R cachedcos8x8_test.go => cachedcos_8x8_test.go
R cachedcosstd.go => cachedcos_std.go
R cachedcosstd_test.go => cachedcos_std_test.go
M quantize.go
M tool.go
M transform.go
M aan_cgo.go => aan_cgo.go +6 -3
@@ 20,18 20,21 @@ func NewAAN8x8C() AAN8x8C {

var _ ImageTransform8x8F32 = AAN8x8C{}

func (aan AAN8x8C) Matrix() Matrix8x8F32 {
	return Matrix8x8F32{}
}

func (aan AAN8x8C) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (aan AAN8x8C) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
func (aan AAN8x8C) DCT8x8Into(img Image, xpos int, ypos int, mat *Matrix8x8F32) {
	matp := (*C.float)(&mat[0])
	luma := (*C.uint8_t)(&img.Pix[0])
	C.dct_8x8(luma, C.size_t(len(img.Pix)), C.int(img.W), C.int(xpos), C.int(ypos), matp)
	return mat
}

func (aan AAN8x8C) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
func (aan AAN8x8C) IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image) {
	matp := (*C.float)(&mat[0])
	luma := (*C.uint8_t)(&into.Pix[0])
	C.idct_8x8(matp, C.int(xpos), C.int(ypos), luma, C.size_t(len(into.Pix)), C.int(into.W))

M aan_cgo_test.go => aan_cgo_test.go +9 -5
@@ 22,15 22,16 @@ func TestAANC(t *testing.T) {
	}

	transform := NewAAN8x8C()
	mat := transform.Matrix()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			transform.DCT8x8Into(img, x, y, &mat)
			dct.PutMat328x8(x, y, mat)
			transform.IDCT8x8(mat, x, y, result)
			transform.IDCT8x8Into(&mat, x, y, result)
		}
	}



@@ 77,10 78,11 @@ func BenchmarkAANCWith8x8(b *testing.B) {

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8(img, x, y)
					transform.DCT8x8Into(img, x, y, &mat)
				}
			}
		}


@@ 92,7 94,9 @@ func BenchmarkAANCWith8x8(b *testing.B) {

	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			mat := transform.Matrix()
			transform.DCT8x8Into(img, x, y, &mat)
			mats[m] = mat
			m++
		}
	}


@@ 102,7 106,7 @@ func BenchmarkAANCWith8x8(b *testing.B) {
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					transform.IDCT8x8Into(&mats[m], x, y, result)
					m++
				}
			}

M aan_flat.go => aan_flat.go +6 -4
@@ 12,6 12,10 @@ func NewAAN8x8Flat() AAN8x8Flat {

var _ ImageTransform8x8F32 = AAN8x8Flat{}

func (aan AAN8x8Flat) Matrix() Matrix8x8F32 {
	return Matrix8x8F32{}
}

func (dct AAN8x8Flat) DCTSize() image.Point {
	return image.Point{8, 8}
}


@@ 23,7 27,7 @@ func (dct AAN8x8Flat) px(img Image, x, y int) uint8 {
	return img.Pix[y*img.W+x]
}

func (aan AAN8x8Flat) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
func (aan AAN8x8Flat) DCT8x8Into(img Image, xpos int, ypos int, mat *Matrix8x8F32) {
	const (
		a1 = 0.7071067811865476  // sqrt(0.5)
		a2 = 0.5411961001461971  // sqrt(2) * cos(3/16 * 2 * pi)


@@ 193,11 197,9 @@ func (aan AAN8x8Flat) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
		mat[yoff+7] = g6 * s7
		mat[yoff+3] = g7 * s3
	}

	return mat
}

func (aan AAN8x8Flat) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
func (aan AAN8x8Flat) IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image) {
	const (
		m0 = 1.8477590650225735 // 2 * cos(1/16 * 2 * pi)
		m1 = 1.4142135623730951 // 2 * cos(2/16 * 2 * pi)

M aan_flat_test.go => aan_flat_test.go +9 -5
@@ 25,12 25,13 @@ func TestAANFlat(t *testing.T) {
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)
	mat := transform.Matrix()

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			transform.DCT8x8Into(img, x, y, &mat)
			dct.PutMat328x8(x, y, mat)
			transform.IDCT8x8(mat, x, y, result)
			transform.IDCT8x8Into(&mat, x, y, result)
		}
	}



@@ 66,10 67,11 @@ func BenchmarkAANFlat(b *testing.B) {

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8(img, x, y)
					transform.DCT8x8Into(img, x, y, &mat)
				}
			}
		}


@@ 81,7 83,9 @@ func BenchmarkAANFlat(b *testing.B) {

	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			mat := transform.Matrix()
			transform.DCT8x8Into(img, x, y, &mat)
			mats[m] = mat
			m++
		}
	}


@@ 91,7 95,7 @@ func BenchmarkAANFlat(b *testing.B) {
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					transform.IDCT8x8Into(&mats[m], x, y, result)
					m++
				}
			}

M aan_std.go => aan_std.go +6 -4
@@ 12,6 12,10 @@ func NewAAN8x8() AAN8x8 {

var _ ImageTransform8x8F32 = AAN8x8{}

func (aan AAN8x8) Matrix() Matrix8x8F32 {
	return Matrix8x8F32{}
}

func (aan AAN8x8) DCTSize() image.Point {
	return image.Point{8, 8}
}


@@ 30,7 34,7 @@ func (aan AAN8x8) putpx(img Image, x, y int, v uint8) {
	img.Pix[y*img.W+x] = v
}

func (aan AAN8x8) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
func (aan AAN8x8) DCT8x8Into(img Image, xpos int, ypos int, mat *Matrix8x8F32) {
	const (
		a1 = 0.7071067811865476  // sqrt(0.5)
		a2 = 0.5411961001461971  // sqrt(2) * cos(3/16 * 2 * pi)


@@ 113,12 117,10 @@ func (aan AAN8x8) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
		mat[yoff+7] = g6 * s7
		mat[yoff+3] = g7 * s3
	}

	return mat
}

// NOTE: This was slower with a pointer to the matrix.
func (aan AAN8x8) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
func (aan AAN8x8) IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image) {
	const (
		m0 = 1.8477590650225735 // 2 * cos(1/16 * 2 * pi)
		m1 = 1.4142135623730951 // 2 * cos(2/16 * 2 * pi)

M aan_std_test.go => aan_std_test.go +13 -7
@@ 25,12 25,13 @@ func TestAANStd(t *testing.T) {
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)
	mat := transform.Matrix()

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			transform.DCT8x8Into(img, x, y, &mat)
			dct.PutMat328x8(x, y, mat)
			transform.IDCT8x8(mat, x, y, result)
			transform.IDCT8x8Into(&mat, x, y, result)
		}
	}



@@ 52,11 53,13 @@ func TestAANStdRoundtripCases(t *testing.T) {
			dct := NewBuffer32(timg.Size(), transform.DCTSize())
			tw, th := transform.DCTSize().X, transform.DCTSize().Y
			result := NewImage(timg.W, timg.H)
			mat := transform.Matrix()

			for y := 0; y < timg.H; y += th {
				for x := 0; x < timg.W; x += tw {
					mat := transform.DCT8x8(timg.Image, x, y)
					transform.DCT8x8Into(timg.Image, x, y, &mat)
					dct.PutMat328x8(x, y, mat)
					transform.IDCT8x8(mat, x, y, result)
					transform.IDCT8x8Into(&mat, x, y, result)
				}
			}



@@ 89,10 92,11 @@ func BenchmarkAANStdWith8x8(b *testing.B) {

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8(img, x, y)
					transform.DCT8x8Into(img, x, y, &mat)
				}
			}
		}


@@ 103,7 107,9 @@ func BenchmarkAANStdWith8x8(b *testing.B) {

	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			mat := transform.Matrix()
			transform.DCT8x8Into(img, x, y, &mat)
			mats[m] = mat
			m++
		}
	}


@@ 113,7 119,7 @@ func BenchmarkAANStdWith8x8(b *testing.B) {
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					transform.IDCT8x8Into(&mats[m], x, y, result)
					m++
				}
			}

A basic_8x8.go => basic_8x8.go +98 -0
@@ 0,0 1,98 @@
package grugdct

import (
	"image"
	"math"
)

type Basic8x8 struct{}

var _ ImageTransform8x8F32 = Basic8x8{}

func NewBasic8x8() Basic8x8 {
	return Basic8x8{}
}

func (bas Basic8x8) Matrix() Matrix8x8F32 {
	return Matrix8x8F32{}
}

func (bas Basic8x8) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (bas Basic8x8) DCT8x8Into(img Image, xpos int, ypos int, mat *Matrix8x8F32) {
	const oneOverSqrt2 = 0.7071067811865475      // 1 / sqrt(2)
	const scale = 0.25                           // 1 / (sqrt(8*8) / 2)
	const piDivByDoubleDim = 0.19634954084936207 // pi / 8*2

	for v, voff := 0, 0; v < 8; v, voff = v+1, voff+8 {
		for u := 0; u < 8; u++ {
			var cu, cv, z float32 = 1, 1, 0
			if u == 0 {
				cu = oneOverSqrt2
			}
			if v == 0 {
				cv = oneOverSqrt2
			}

			for y := 0; y < 8; y++ {
				for x := 0; x < 8; x++ {
					xp, yp := xpos+x, ypos+y
					if xp < img.W && yp < img.H {
						px := img.Pix[yp*img.W+xp]
						q := float64(px) *
							math.Cos(float64(2*x+1)*float64(u)*piDivByDoubleDim) *
							math.Cos(float64(2*y+1)*float64(v)*piDivByDoubleDim)
						z += float32(q)
					}

				}
			}

			mat[voff+u] = scale * cu * cv * z
		}
	}
}

func (bas Basic8x8) IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image) {
	const oneOverSqrt2 = 0.7071067811865475      // 1 / sqrt(2)
	const scale = 0.25                           // 1 / sqrt(8*8) / 2
	const piDivByDoubleDim = 0.19634954084936207 // pi / 8*2

	for y := 0; y < 8; y++ {
		for x := 0; x < 8; x++ {
			var z float32

			for v, voff := 0, 0; v < 8; v, voff = v+1, voff+8 {
				for u := 0; u < 8; u++ {
					s := mat[voff+u]
					q := s *
						float32(math.Cos(float64(2*x+1)*float64(u)*piDivByDoubleDim)) *
						float32(math.Cos(float64(2*y+1)*float64(v)*piDivByDoubleDim))
					if u == 0 {
						q *= oneOverSqrt2
					}
					if v == 0 {
						q *= oneOverSqrt2
					}
					z += q
				}
			}

			z *= scale
			if z > 255.0 {
				z = 255.0
			}
			if z < 0 {
				z = 0.0
			}

			xp, yp := xpos+x, ypos+y
			if xp < into.W && yp < into.H {
				// It's inaccurate without the round. 255.5 will truncate to 255, not wrap.
				into.Pix[yp*into.W+xp] = uint8(z + 0.5)
			}
		}
	}
}

A basic_8x8_test.go => basic_8x8_test.go +116 -0
@@ 0,0 1,116 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestBasic8x8(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewBasic8x8()
	result := NewImage(img.W, img.H)
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	mat := transform.Matrix()

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			transform.DCT8x8Into(img, x, y, &mat)
			transform.IDCT8x8Into(&mat, x, y, result)
		}
	}

	if !reflect.DeepEqual(img.Pix, result.Pix) {
		t.Fatal()
	}
}

func TestBasic8x8RoundtripCases(t *testing.T) {
	transform := NewBasic8x8()

	for _, timg := range TestImages {
		t.Run(timg.CaseName("basic8x8"), func(t *testing.T) {
			tw, th := transform.DCTSize().X, transform.DCTSize().Y
			result := NewImage(timg.W, timg.H)
			mat := transform.Matrix()
			for y := 0; y < timg.H; y += th {
				for x := 0; x < timg.W; x += tw {
					transform.DCT8x8Into(timg.Image, x, y, &mat)
					transform.IDCT8x8Into(&mat, x, y, result)
				}
			}

			if !reflect.DeepEqual(timg.Pix, result.Pix) {
				t.Fatal()
			}
		})
	}
}

func BenchmarkBasic8x8With8x8DCT(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewBasic8x8()
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8Into(img, x, y, &mat)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)
	mats := make([]Matrix8x8F32, dctH*dctW)
	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mat := transform.Matrix()
			transform.DCT8x8Into(img, x, y, &mat)
			mats[m] = mat
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8Into(&mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

R basic.go => basic_std.go +0 -0
R basic_test.go => basic_std_test.go +0 -0
R cachedcos8x8.go => cachedcos_8x8.go +6 -3
@@ 23,11 23,15 @@ func NewCachedCosines8x8() CachedCosines8x8 {
	return CachedCosines8x8{}
}

func (cos CachedCosines8x8) Matrix() Matrix8x8F32 {
	return Matrix8x8F32{}
}

func (cos CachedCosines8x8) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (cos CachedCosines8x8) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
func (cos CachedCosines8x8) DCT8x8Into(img Image, xpos int, ypos int, mat *Matrix8x8F32) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)
	const scale = 0.25                      // 1 / (sqrt(8*8) / 2)



@@ 53,10 57,9 @@ func (cos CachedCosines8x8) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8
			mat[voff+u] = scale * cu * cv * z
		}
	}
	return mat
}

func (cos CachedCosines8x8) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
func (cos CachedCosines8x8) IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)
	const scale = 4.0                       // sqrt(8*8) / 2


R cachedcos8x8_test.go => cachedcos_8x8_test.go +11 -6
@@ 12,10 12,11 @@ func TestCachedCosines8x8RoundtripCases(t *testing.T) {
		t.Run(timg.CaseName("cachedcos8x8"), func(t *testing.T) {
			tw, th := transform.DCTSize().X, transform.DCTSize().Y
			result := NewImage(timg.W, timg.H)
			mat := transform.Matrix()
			for y := 0; y < timg.H; y += th {
				for x := 0; x < timg.W; x += tw {
					mat := transform.DCT8x8(timg.Image, x, y)
					transform.IDCT8x8(mat, x, y, result)
					transform.DCT8x8Into(timg.Image, x, y, &mat)
					transform.IDCT8x8Into(&mat, x, y, result)
				}
			}



@@ 52,10 53,11 @@ func BenchmarkCachedCosines8x8With8x8(b *testing.B) {

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctH {
					transform.DCT8x8(img, x, y)
					transform.DCT8x8Into(img, x, y, &mat)
				}
			}
		}


@@ 65,7 67,9 @@ func BenchmarkCachedCosines8x8With8x8(b *testing.B) {
	mats := make([]Matrix8x8F32, dctH*dctW)
	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			mat := transform.Matrix()
			transform.DCT8x8Into(img, x, y, &mat)
			mats[m] = mat
			m++
		}
	}


@@ 75,7 79,7 @@ func BenchmarkCachedCosines8x8With8x8(b *testing.B) {
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					transform.IDCT8x8Into(&mats[m], x, y, result)
					m++
				}
			}


@@ 101,8 105,9 @@ func BenchmarkCachedCosines8x8WithASingle8x8DCTCall(b *testing.B) {
	transform := NewCachedCosines8x8()
	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			transform.DCT8x8(img, 0, 0)
			transform.DCT8x8Into(img, 0, 0, &mat)
		}
	})
}

R cachedcosstd.go => cachedcos_std.go +0 -0
R cachedcosstd_test.go => cachedcos_std_test.go +0 -0
M quantize.go => quantize.go +19 -12
@@ 48,17 48,22 @@ func Scale8x8F32(mat Matrix8x8F32, quality int) Matrix8x8F32 {
	return mat
}

func GrugQuantize8x8F32InPlace(img Image, transform ImageTransform8x8F32, table Matrix8x8F32) {
func GrugQuantize8x8F32InPlace(img Image, transform ImageTransform8x8F32, quality int, table *Matrix8x8F32) {
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)
	if table == nil {
		table = &DefaultLumaQuantizationMatrix8x8F32
	}
	qTable := Scale8x8F32(*table, quality)

	var mat Matrix8x8F32
	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			// for i := 63; i >= 63-strip && i >= 0; i-- {
			//     mat[ZigZag8x8Order[i]] = 0
			// }
			transform.IDCT8x8(mat, x, y, result)
			transform.DCT8x8Into(img, x, y, &mat)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qTable[i]))) * qTable[i]
			}
			transform.IDCT8x8Into(&mat, x, y, result)
		}
	}
}


@@ 92,33 97,35 @@ func GrugQuantizeYCbCr8x8F32InPlace(
	imgCb := Image{Pix: yCbCr.Cb, W: chromaSz.X, H: chromaSz.Y}
	imgCr := Image{Pix: yCbCr.Cr, W: chromaSz.X, H: chromaSz.Y}

	var mat Matrix8x8F32

	for y := 0; y < imgCy.H; y += th {
		for x := 0; x < imgCy.W; x += tw {
			mat := transform.DCT8x8(imgCy, x, y)
			transform.DCT8x8Into(imgCy, x, y, &mat)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qLumaTable[i]))) * qLumaTable[i]
			}
			transform.IDCT8x8(mat, x, y, imgCy)
			transform.IDCT8x8Into(&mat, x, y, imgCy)
		}
	}

	for y := 0; y < imgCb.H; y += th {
		for x := 0; x < imgCb.W; x += tw {
			mat := transform.DCT8x8(imgCb, x, y)
			transform.DCT8x8Into(imgCb, x, y, &mat)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qChromaTable[i]))) * qChromaTable[i]
			}
			transform.IDCT8x8(mat, x, y, imgCb)
			transform.IDCT8x8Into(&mat, x, y, imgCb)
		}
	}

	for y := 0; y < imgCr.H; y += th {
		for x := 0; x < imgCr.W; x += tw {
			mat := transform.DCT8x8(imgCr, x, y)
			transform.DCT8x8Into(imgCr, x, y, &mat)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qChromaTable[i]))) * qChromaTable[i]
			}
			transform.IDCT8x8(mat, x, y, imgCr)
			transform.IDCT8x8Into(&mat, x, y, imgCr)
		}
	}
}

M tool.go => tool.go +29 -4
@@ 4,6 4,7 @@ package main

import (
	"bytes"
	"flag"
	"fmt"
	"image"
	"image/jpeg"


@@ 23,11 24,16 @@ func main() {
}

func run() error {
	if len(os.Args) < 3 {
	var transformType string

	flag.StringVar(&transformType, "transform", "aan", "Transform type (aan, cos, basic)")
	flag.Parse()

	if len(flag.Args()) < 2 {
		return fmt.Errorf("Usage: tool.go <image> <output>")
	}

	input, output := os.Args[1], os.Args[2]
	input, output := flag.Arg(0), flag.Arg(1)

	in, err := os.ReadFile(input)
	if err != nil {


@@ 55,13 61,32 @@ func run() error {
		return fmt.Errorf("unsupported file extension %q", input)
	}

	transform := grugdct.NewAAN8x8()
	var transform any
	switch transformType {
	case "aan":
		transform = grugdct.NewAAN8x8()
	case "aancgo":
		transform = grugdct.NewAAN8x8C()
	case "cos":
		transform = grugdct.NewCachedCosines8x8()
	case "basic":
		transform = grugdct.NewBasic8x8()
	default:
		return fmt.Errorf("unknown transform type")
	}

	s := time.Now()
	subsampleRatio := image.YCbCrSubsampleRatio410
	quality := 90
	yCbCr := grugdct.ConvertImageToYCbCr(img, subsampleRatio)
	grugdct.GrugQuantizeYCbCr8x8F32InPlace(yCbCr, transform, quality, nil, nil)

	switch transform := transform.(type) {
	case grugdct.ImageTransform8x8F32:
		grugdct.GrugQuantizeYCbCr8x8F32InPlace(yCbCr, transform, quality, nil, nil)
	default:
		return fmt.Errorf("unimplemented transform type %T", transform)
	}

	took := time.Since(s)
	fmt.Println("grug quantize", yCbCr.Bounds().Size(), took)


M transform.go => transform.go +8 -3
@@ 4,13 4,18 @@ import "image"

type ImageTransform8x8F32 interface {
	DCTSize() image.Point
	DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32)
	IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image)
	Matrix() Matrix8x8F32

	// These benchmarked the same with pointers to matrices as they did when returning a
	// matrix value from the DCT and accepting a non-pointer matrix in the IDCT... except
	// for cgo, where it escaped to the heap and butchered performance:
	DCT8x8Into(img Image, xpos int, ypos int, into *Matrix8x8F32)
	IDCT8x8Into(mat *Matrix8x8F32, xpos int, ypos int, into Image)
}

type ImageTransformF64 interface {
	DCTSize() image.Point
	Matrix() MatrixF64
	DCTInto(img Image, xpos int, ypos int, mat MatrixF64)
	DCTInto(img Image, xpos int, ypos int, into MatrixF64)
	IDCTInto(mat MatrixF64, xpos int, ypos int, into Image)
}