~shabbyrobe/grugdct

140f1745f99a96c566c050d4f8be825895e6f9e9 — Blake Williams 1 year, 8 months ago
GRUG DCT
A  => LICENSE.0BSD +12 -0
@@ 1,12 @@
Copyright (C) 2022 by Blake Williams <code@shabbyrobe.org>

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

A  => LICENSE.BlueOak +56 -0
@@ 1,56 @@
# Blue Oak Model License

Version 1.0.0

## Purpose

This license gives everyone as much permission to work with
this software as possible, while protecting contributors
from liability.

## Acceptance

In order to receive this license, you must agree to its
rules.  The rules of this license are both obligations
under that agreement and conditions to your license.
You must not do anything with this software that triggers
a rule that you cannot or will not follow.

## Copyright

Each contributor licenses you to do everything with this
software that would otherwise infringe that contributor's
copyright in it.

## Notices

You must ensure that everyone who gets a copy of
any part of this software from you, with or without
changes, also gets the text of this license or a link to
<https://blueoakcouncil.org/license/1.0.0>.

## Excuse

If anyone notifies you in writing that you have not
complied with [Notices](#notices), you can keep your
license by taking all practical steps to comply within 30
days after the notice.  If you do not do so, your license
ends immediately.

## Patent

Each contributor licenses you to do everything with this
software that would otherwise infringe any patent claims
they can license or become able to license.

## Reliability

No contributor can revoke this license.

## No Liability

***As far as the law allows, this software comes as is,
without any warranty or condition, and no contributor
will be liable to anyone for any damages related to this
software or this license, under any kind of legal claim.***


A  => LICENSE.MIT +19 -0
@@ 1,19 @@
Copyright 2022 Blake Williams <code@shabbyrobe.org>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

A  => README.md +41 -0
@@ 1,41 @@
# GRUG DCT

I tried to write a [Grug-brained](https://grugbrain.dev/) DCT and IDCT from
scratch using the Wikipedia description. I got most of the way, but I couldn't
quite get the IDCT working on my own. It's working now though.

There will be lots of different DCT/IDCT implementations in here, choose one
that suits you best and copy/paste it into your project.


## Issues

- There's some visual noise creeping in. I think there's some int overflow
  hiding in here somewhere.


## Expectation Management

**NOTE: This is NOT INTENDED TO BE IMPORTED DIRECTLY. PLEASE copy the bits you
want into your own code.**

This repo is not guaranteed to be stable in any way, shape or form. If you
import the code directly and it changes or disappears and your code breaks,
that's your fault.

There's no copyright here, these are just bog-standard implementations of
widely-known algorithms with nothing novel whatsoever added. I have included
several permissive licenses just in case you need that.

Contributions will not be accepted.


## Got by with a little help from:

- https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II
- http://i.stack.imgur.com/fvnEF.gif
- https://stackoverflow.com/questions/8310749/discrete-cosine-transform-dct-implementation-c
- https://github.com/prtsh/aan_dct/blob/master/8point_DCT_AAN.c
- http://unix4lyfe.org/dct/
- https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
- https://github.com/mypopydev/dct/

A  => aan_buffer.go +205 -0
@@ 1,205 @@
package grugdct

import (
	"image"
)

type AAN8x8Buffer struct{}

func NewAAN8x8Buffer() AAN8x8Buffer {
	return AAN8x8Buffer{}
}

func (dct AAN8x8Buffer) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (dct AAN8x8Buffer) px(img Image, x, y int) uint8 {
	if x >= img.W || y >= img.H {
		return 0
	}
	return img.Pix[y*img.W+x]
}

func (dct AAN8x8Buffer) DCT8x8Buffer(img Image, xpos int, ypos int, into *Buffer32) {
	const (
		a1 = 0.7071067811865476  // sqrt(0.5)
		a2 = 0.5411961001461971  // sqrt(2) * cos(3/16 * 2 * pi)
		a3 = a1                  //
		a4 = 1.3065629648763766  // sqrt(2) * cos(1/16 * 2 * pi)
		a5 = 0.38268343236508984 // cos(3/16 * 2 * pi)

		s0 = 0.3535533905932738  // (cos(0)*sqrt(0.5) / 2) / (1)
		s1 = 0.2548977895520796  // (cos(1*pi/16) / 2) / (-a5+a4+1)
		s2 = 0.2705980500730985  // (cos(2*pi/16) / 2) / (a1+1)
		s3 = 0.30067244346752264 // (cos(3*pi/16) / 2) / (a5+1)
		s4 = s0                  // (cos(4*pi/16) / 2) / (1)
		s5 = 0.44998811156820795 // (cos(5*pi/16) / 2) / (1-a5)
		s6 = 0.6532814824381885  // (cos(6*pi/16) / 2) / (1-a1)
		s7 = 1.2814577238707532  // (cos(7*pi/16) / 2) / (a5-a4+1)

		__ = 0
	)

	var woff = ypos*into.W + xpos
	var tmp [64]float32

	// Columns
	for i := 0; i < 8; i++ {
		xposi := xpos + i
		y0 := float32(dct.px(img, xposi, ypos+0))
		y1 := float32(dct.px(img, xposi, ypos+1))
		y2 := float32(dct.px(img, xposi, ypos+2))
		y3 := float32(dct.px(img, xposi, ypos+3))
		y4 := float32(dct.px(img, xposi, ypos+4))
		y5 := float32(dct.px(img, xposi, ypos+5))
		y6 := float32(dct.px(img, xposi, ypos+6))
		y7 := float32(dct.px(img, xposi, ypos+7))

		/*                                i, 0        i, 1        i, 2        i, 3        i, 4        i, 5        i, 6        i, 7       */
		/*                                ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		b0, b1, b2, b3, b4, b5, b6, b7 := (+y0 + y7), (+y1 + y6), (+y2 + y5), (+y3 + y4), (-y4 + y3), (-y5 + y2), (-y6 + y1), (-y7 + y0)
		c0, c1, c2, c3, c4, c5, c6, c7 := (+b0 + b3), (+b1 + b2), (-b2 + b1), (-b3 + b0), (-b4 - b5), (+b5 + b6), (+b6 + b7), (+b7)
		d0, d1, d2, d3, d4, d5, d6, d7 := (+c0 + c1), (-c1 + c0), (+c2 + c3), (+c3) /**/, (+c4) /**/, (+c5) /**/, (+c6) /**/, (+c7)
		d8 := (d4 + d6) * a5 /*           ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓          */
		e0, e1, e2, e3, e4, e5, e6, e7 := (+d0) /**/, (+d1) /**/, (+d2 * a1), (+d3) /**/, -d4*+a2-d8, (+d5 * a3), +d6*+a4-d8, (+d7)
		f0, f1, f2, f3, f4, f5, f6, f7 := (+e0) /**/, (+e1) /**/, (+e2 + e3), (+e3 - e2), (+e4) /**/, (+e5 + e7), (+e6) /**/, (+e7 - e5)
		g0, g1, g2, g3, g4, g5, g6, g7 := (+f0) /**/, (+f1) /**/, (+f2) /**/, (+f3) /**/, (+f4 + f7), (+f5 + f6), (-f6 + f5), (+f7 - f4)

		tmp[(0*8)+i] = g0 * s0
		tmp[(4*8)+i] = g1 * s4
		tmp[(2*8)+i] = g2 * s2
		tmp[(6*8)+i] = g3 * s6
		tmp[(5*8)+i] = g4 * s5
		tmp[(1*8)+i] = g5 * s1
		tmp[(7*8)+i] = g6 * s7
		tmp[(3*8)+i] = g7 * s3
	}

	// Rows
	for i, yoff := 0, woff; i < 8; i, yoff = i+1, yoff+into.W {
		yi := i * 8
		x0 := tmp[yi+0]
		x1 := tmp[yi+1]
		x2 := tmp[yi+2]
		x3 := tmp[yi+3]
		x4 := tmp[yi+4]
		x5 := tmp[yi+5]
		x6 := tmp[yi+6]
		x7 := tmp[yi+7]

		/*                                0, i        1, i        2, i        3, i        4, i        5, i        6, i        7, i       */
		/*                                ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		b0, b1, b2, b3, b4, b5, b6, b7 := (+x0 + x7), (+x1 + x6), (+x2 + x5), (+x3 + x4), (-x4 + x3), (-x5 + x2), (-x6 + x1), (-x7 + x0)
		c0, c1, c2, c3, c4, c5, c6, c7 := (+b0 + b3), (+b1 + b2), (-b2 + b1), (-b3 + b0), (-b4 - b5), (+b5 + b6), (+b6 + b7), (+b7)
		d0, d1, d2, d3, d4, d5, d6, d7 := (+c0 + c1), (-c1 + c0), (+c2 + c3), (+c3) /**/, (+c4) /**/, (+c5) /**/, (+c6) /**/, (+c7)
		d8 := (d4 + d6) * a5 /*           ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓          */
		e0, e1, e2, e3, e4, e5, e6, e7 := (+d0) /**/, (+d1) /**/, (+d2 * a1), (+d3) /**/, -d4*+a2-d8, (+d5 * a3), +d6*+a4-d8, (+d7)
		f0, f1, f2, f3, f4, f5, f6, f7 := (+e0) /**/, (+e1) /**/, (+e2 + e3), (+e3 - e2), (+e4) /**/, (+e5 + e7), (+e6) /**/, (+e7 - e5)
		g0, g1, g2, g3, g4, g5, g6, g7 := (+f0) /**/, (+f1) /**/, (+f2) /**/, (+f3) /**/, (+f4 + f7), (+f5 + f6), (-f6 + f5), (+f7 - f4)

		into.Data[yoff+0] = g0 * s0
		into.Data[yoff+4] = g1 * s4
		into.Data[yoff+2] = g2 * s2
		into.Data[yoff+6] = g3 * s6
		into.Data[yoff+5] = g4 * s5
		into.Data[yoff+1] = g5 * s1
		into.Data[yoff+7] = g6 * s7
		into.Data[yoff+3] = g7 * s3
	}
}

func (dct AAN8x8Buffer) IDCT8x8Buffer(buf *Buffer32, xpos int, ypos int, into Image) {
	const (
		m0 = 1.8477590650225735 // 2 * cos(1/16 * 2 * pi)
		m1 = 1.4142135623730951 // 2 * cos(2/16 * 2 * pi)
		m3 = m1                 //
		m5 = 0.7653668647301797 // 2 * cos(3/16 * 2 * pi)
		m2 = m0 - m5
		m4 = m0 + m5

		s0 = 0.35355339059327373 // cos(0/16*pi) / sqrt(8)
		s1 = 0.4903926402016152  // cos(1/16*pi) / 2.0
		s2 = 0.46193976625564337 // cos(2/16*pi) / 2.0
		s3 = 0.4157348061512726  // cos(3/16*pi) / 2.0
		s4 = 0.3535533905932738  // cos(4/16*pi) / 2.0
		s5 = 0.27778511650980114 // cos(5/16*pi) / 2.0
		s6 = 0.19134171618254492 // cos(6/16*pi) / 2.0
		s7 = 0.09754516100806417 // cos(7/16*pi) / 2.0

		__ = 0
	)

	var tmp [8][8]float32

	woff := ypos*into.W + xpos

	// Columns
	for i := 0; i < 8; i++ {
		xposi := xpos + i

		g0 := buf.At(xposi, ypos+0) * s0
		g1 := buf.At(xposi, ypos+4) * s4
		g2 := buf.At(xposi, ypos+2) * s2
		g3 := buf.At(xposi, ypos+6) * s6
		g4 := buf.At(xposi, ypos+5) * s5
		g5 := buf.At(xposi, ypos+1) * s1
		g6 := buf.At(xposi, ypos+7) * s7
		g7 := buf.At(xposi, ypos+3) * s3

		/*                                    i, 0        i, 1        i, 2        i, 3        i, 4        i, 5        i, 6        i, 7        extra      */
		/*                                    ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		f0, f1, f2, f3, f4, f5, f6, f7, _8 := (+g0) /**/, (+g1) /**/, (+g2) /**/, (+g3) /**/, (+g4 - g7), (+g5 + g6), (+g5 - g6), (+g4 + g7), __
		e0, e1, e2, e3, e4, e5, e6, e7, e8 := (+f0) /**/, (+f1) /**/, (+f2 - f3), (+f2 + f3), (+f4) /**/, (+f5 - f7), (+f6) /**/, (+f5 + f7), (+f4 + f6)
		d0, d1, d2, d3, d4, d5, d6, d7, d8 := (+e0) /**/, (+e1) /**/, (+e2 * m1), (+e3) /**/, (+e4 * m2), (+e5 * m3), (+e6 * m4), (+e7) /**/, (+e8 * m5)
		c0, c1, c2, c3, c4, c5, c6, c7, _8 := (+d0 + d1), (+d0 - d1), (+d2 - d3), (+d3) /**/, (+d4 + d8), (+d5 + d7), (+d6 - d8), (+d7) /**/, __
		c8 := (+c5 - c6) /*                   ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓                      */
		b0, b1, b2, b3, b4, b5, b6, b7, _8 := (+c0 + c3), (+c1 + c2), (+c1 - c2), (+c0 - c3), (+c4 - c8), (+c8) /**/, (+c6 - c7), (+c7) /**/, __

		_ = _8

		tmp[i][0] = b0 + b7
		tmp[i][1] = b1 + b6
		tmp[i][2] = b2 + b5
		tmp[i][3] = b3 + b4
		tmp[i][4] = b3 - b4
		tmp[i][5] = b2 - b5
		tmp[i][6] = b1 - b6
		tmp[i][7] = b0 - b7
	}

	// Rows
	for i, yoff := 0, woff; i < 8; i, yoff = i+1, yoff+into.W {
		g0 := tmp[0][i] * s0
		g1 := tmp[4][i] * s4
		g2 := tmp[2][i] * s2
		g3 := tmp[6][i] * s6
		g4 := tmp[5][i] * s5
		g5 := tmp[1][i] * s1
		g6 := tmp[7][i] * s7
		g7 := tmp[3][i] * s3

		/*                                    i, 0        i, 1        i, 2        i, 3        i, 4        i, 5        i, 6        i, 7        extra      */
		/*                                    ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		f0, f1, f2, f3, f4, f5, f6, f7, _8 := (+g0) /**/, (+g1) /**/, (+g2) /**/, (+g3) /**/, (+g4 - g7), (+g5 + g6), (+g5 - g6), (+g4 + g7), __
		e0, e1, e2, e3, e4, e5, e6, e7, e8 := (+f0) /**/, (+f1) /**/, (+f2 - f3), (+f2 + f3), (+f4) /**/, (+f5 - f7), (+f6) /**/, (+f5 + f7), (+f4 + f6)
		d0, d1, d2, d3, d4, d5, d6, d7, d8 := (+e0) /**/, (+e1) /**/, (+e2 * m1), (+e3) /**/, (+e4 * m2), (+e5 * m3), (+e6 * m4), (+e7) /**/, (+e8 * m5)
		c0, c1, c2, c3, c4, c5, c6, c7, _8 := (+d0 + d1), (+d0 - d1), (+d2 - d3), (+d3) /**/, (+d4 + d8), (+d5 + d7), (+d6 - d8), (+d7) /**/, __
		c8 := (+c5 - c6) /*                   ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓                      */
		b0, b1, b2, b3, b4, b5, b6, b7, _8 := (+c0 + c3), (+c1 + c2), (+c1 - c2), (+c0 - c3), (+c4 - c8), (+c8) /**/, (+c6 - c7), (+c7) /**/, __

		_ = _8

		yposi := ypos + i

		// XXX: +0.5 is "grug rounding"
		into.Put(xpos+0, yposi, uint8(b0+b7+0.5))
		into.Put(xpos+1, yposi, uint8(b1+b6+0.5))
		into.Put(xpos+2, yposi, uint8(b2+b5+0.5))
		into.Put(xpos+3, yposi, uint8(b3+b4+0.5))
		into.Put(xpos+4, yposi, uint8(b3-b4+0.5))
		into.Put(xpos+5, yposi, uint8(b2-b5+0.5))
		into.Put(xpos+6, yposi, uint8(b1-b6+0.5))
		into.Put(xpos+7, yposi, uint8(b0-b7+0.5))
	}
}

A  => aan_buffer_test.go +94 -0
@@ 1,94 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestAANBuffer(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8Buffer()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			transform.DCT8x8Buffer(img, x, y, dct)
		}
	}

	result := NewImage(img.W, img.H)
	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			transform.IDCT8x8Buffer(dct, x, y, result)
		}
	}

	if !reflect.DeepEqual(img.Pix, result.Pix) {
		// Print2DUint8s(img.Luma, img.W, img.H, 0)
		// fmt.Println()
		// Print2DUint8s(result.Luma, result.W, result.H, 0)
		// fmt.Println()

		t.Fatal()
	}
}

func BenchmarkAANBuffer(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8Buffer()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8Buffer(img, x, y, dct)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8Buffer(dct, x, y, result)
				}
			}
		}
	})
}

A  => aan_cgo.c +328 -0
@@ 1,328 @@
#include "aan_cgo.h"
#include <stdio.h>

/* #define px(luma, luma_size, luma_width, x, y) ({ \ */
/*     int pos = (y)*luma_width+x; \ */
/*     ((x >= luma_width || ((size_t)pos >= luma_size)) ? 0 : (float)(luma[pos])); \ */
/* }) */

static inline float px(uint8_t *luma, size_t luma_size, int luma_width, int x, int y) {
    if (x >= luma_width) return 0;
    int pos = y*luma_width+x;
    if ((size_t)pos >= luma_size) return 0;
    return (float)(luma[pos]);
}

static inline void put(uint8_t *luma, size_t luma_size, int luma_width, int x, int y, float v) {
    if (x >= luma_width) return;
    int pos = y*luma_width+x;
    if ((size_t)pos >= luma_size) return;
    // XXX: +0.5 is "grug rounding"
    luma[pos] = (uint8_t)(v+0.5);
}

void do_nothing(void) {
    // Just here for cgo baseline benchmarking
}

void dct_8x8(uint8_t *luma, size_t luma_size, int luma_width, int xpos, int ypos, float *mat) {
    static const float a1 = 0.7071067811865476;  // sqrt(0.5)
    static const float a2 = 0.5411961001461971;  // sqrt(2) * cos(3/16 * 2 * pi)
    static const float a3 = a1;                  //
    static const float a4 = 1.3065629648763766;  // sqrt(2) * cos(1/16 * 2 * pi)
    static const float a5 = 0.38268343236508984; // cos(3/16 * 2 * pi)

    static const float s0 = 0.3535533905932738;  // (cos(0)*sqrt(0.5) / 2) / (1)
    static const float s1 = 0.2548977895520796;  // (cos(1*pi/16) / 2) / (-a5+a4+1)
    static const float s2 = 0.2705980500730985;  // (cos(2*pi/16) / 2) / (a1+1)
    static const float s3 = 0.30067244346752264; // (cos(3*pi/16) / 2) / (a5+1)
    static const float s4 = s0;                  // (cos(4*pi/16) / 2) / (1)
    static const float s5 = 0.44998811156820795; // (cos(5*pi/16) / 2) / (1-a5)
    static const float s6 = 0.6532814824381885;  // (cos(6*pi/16) / 2) / (1-a1)
    static const float s7 = 1.2814577238707532;  // (cos(7*pi/16) / 2) / (a5-a4+1)

	// Columns
	for (int i = 0; i < 8; i++) {
		int xposi = xpos + i;
		const float y0 = px(luma, luma_size, luma_width, xposi, ypos+0);
		const float y1 = px(luma, luma_size, luma_width, xposi, ypos+1);
		const float y2 = px(luma, luma_size, luma_width, xposi, ypos+2);
		const float y3 = px(luma, luma_size, luma_width, xposi, ypos+3);
		const float y4 = px(luma, luma_size, luma_width, xposi, ypos+4);
		const float y5 = px(luma, luma_size, luma_width, xposi, ypos+5);
		const float y6 = px(luma, luma_size, luma_width, xposi, ypos+6);
		const float y7 = px(luma, luma_size, luma_width, xposi, ypos+7);

		const float b0 = +y0 + y7;
		const float b1 = +y1 + y6;
		const float b2 = +y2 + y5;
		const float b3 = +y3 + y4;
		const float b4 = -y4 + y3;
		const float b5 = -y5 + y2;
		const float b6 = -y6 + y1;
		const float bcde7 = -y7 + y0;

		const float c0 = +b0 + b3;
		const float c1 = +b1 + b2;
		const float c2 = -b2 + b1;
		const float c3 = -b3 + b0;
		const float cd4 = -b4 - b5;
		const float cd5 = +b5 + b6;
		const float cd6 = +b6 + bcde7;

		const float d0 = +c0 + c1;
		const float defg1 = -c1 + c0;
		const float d2 = +c2 + c3;
		const float de3 = +c3;
		const float d8 = (cd4 + cd6) * a5;

		const float efg0 = d0;
		const float e2 = d2 * a1;
		const float ef4 = -cd4*a2 - d8;
		const float e5 = cd5 * a3;
		const float ef6 = cd6*a4 - d8;

		const float fg2 = e2 + de3;
		const float fg3 = de3 - e2;
		const float f5 = e5 + bcde7;
		const float f7 = bcde7 - e5;

		const float g4 = ef4 + f7;
		const float g5 = f5 + ef6;
		const float g6 = -ef6 + f5;
		const float g7 = f7 - ef4;

		mat[(0*8)+i] = efg0 * s0;
		mat[(4*8)+i] = defg1 * s4;
		mat[(2*8)+i] = fg2 * s2;
		mat[(6*8)+i] = fg3 * s6;
		mat[(5*8)+i] = g4 * s5;
		mat[(1*8)+i] = g5 * s1;
		mat[(7*8)+i] = g6 * s7;
		mat[(3*8)+i] = g7 * s3;
	}

	// Rows
    int yoff = 0;
	for (int i = 0; i < 8; i++) {
		const float x0 = mat[yoff+0];
		const float x1 = mat[yoff+1];
		const float x2 = mat[yoff+2];
		const float x3 = mat[yoff+3];
		const float x4 = mat[yoff+4];
		const float x5 = mat[yoff+5];
		const float x6 = mat[yoff+6];
		const float x7 = mat[yoff+7];

		const float b0 = +x0 + x7;
		const float b1 = +x1 + x6;
		const float b2 = +x2 + x5;
		const float b3 = +x3 + x4;
		const float b4 = -x4 + x3;
		const float b5 = -x5 + x2;
		const float b6 = -x6 + x1;
		const float bcde7 = -x7 + x0;

		const float c0 = +b0 + b3;
		const float c1 = +b1 + b2;
		const float c2 = -b2 + b1;
		const float c3 = -b3 + b0;
		const float cd4 = -b4 - b5;
		const float cd5 = +b5 + b6;
		const float cd6 = +b6 + bcde7;

		const float d0 = +c0 + c1;
		const float defg1 = -c1 + c0;
		const float d2 = +c2 + c3;
		const float de3 = +c3;
		const float d8 = (cd4 + cd6) * a5;

		const float efg0 = d0;
		const float e2 = d2 * a1;
		const float ef4 = -cd4*a2 - d8;
		const float e5 = cd5 * a3;
		const float ef6 = cd6*a4 - d8;

		const float fg2 = e2 + de3;
		const float fg3 = de3 - e2;
		const float f5 = e5 + bcde7;
		const float f7 = bcde7 - e5;

		const float g4 = ef4 + f7;
		const float g5 = f5 + ef6;
		const float g6 = -ef6 + f5;
		const float g7 = f7 - ef4;

		mat[yoff+0] = efg0 * s0;
		mat[yoff+4] = defg1 * s4;
		mat[yoff+2] = fg2 * s2;
		mat[yoff+6] = fg3 * s6;
		mat[yoff+5] = g4 * s5;
		mat[yoff+1] = g5 * s1;
		mat[yoff+7] = g6 * s7;
		mat[yoff+3] = g7 * s3;

        yoff += 8;
	}
}

void idct_8x8(float *mat, int xpos, int ypos, uint8_t *luma, size_t luma_size, int luma_width) {
    static const float m0 = 1.8477590650225735; // 2 * cos(1/16 * 2 * pi)
    static const float m1 = 1.4142135623730951; // 2 * cos(2/16 * 2 * pi)
    static const float m3 = m1;                 //
    static const float m5 = 0.7653668647301797; // 2 * cos(3/16 * 2 * pi)
    static const float m2 = m0 - m5;
    static const float m4 = m0 + m5;

    static const float s0 = 0.35355339059327373; // cos(0/16*pi) / sqrt(8)
    static const float s1 = 0.49039264020161520; // cos(1/16*pi) / 2.0
    static const float s2 = 0.46193976625564337; // cos(2/16*pi) / 2.0
    static const float s3 = 0.41573480615127260; // cos(3/16*pi) / 2.0
    static const float s4 = 0.35355339059327380; // cos(4/16*pi) / 2.0
    static const float s5 = 0.27778511650980114; // cos(5/16*pi) / 2.0
    static const float s6 = 0.19134171618254492; // cos(6/16*pi) / 2.0
    static const float s7 = 0.09754516100806417; // cos(7/16*pi) / 2.0

	// Columns
	for (int i = 0; i < 8; i++) {
		float g0 = mat[8*0+i] * s0;
		float g1 = mat[4*8+i] * s4;
		float g2 = mat[2*8+i] * s2;
		float g3 = mat[6*8+i] * s6;
		float g4 = mat[5*8+i] * s5;
		float g5 = mat[1*8+i] * s1;
		float g6 = mat[7*8+i] * s7;
		float g7 = mat[3*8+i] * s3;

		float f0 = g0;
		float f1 = g1;
		float f2 = g2;
		float f3 = g3;
		float f4 = g4 - g7;
		float f5 = g5 + g6;
		float f6 = g5 - g6;
		float f7 = g4 + g7;

		float e0 = f0;
		float e1 = f1;
		float e2 = f2 - f3;
		float e3 = f2 + f3;
		float e4 = f4;
		float e5 = f5 - f7;
		float e6 = f6;
		float e7 = f5 + f7;
		float e8 = f4 + f6;

		float d0 = e0;
		float d1 = e1;
		float d2 = e2 * m1;
		float d3 = e3;
		float d4 = e4 * m2;
		float d5 = e5 * m3;
		float d6 = e6 * m4;
		float d7 = e7;
		float d8 = e8 * m5;

		float c0 = d0 + d1;
		float c1 = d0 - d1;
		float c2 = d2 - d3;
		float c3 = d3;
		float c4 = d4 + d8;
		float c5 = d5 + d7;
		float c6 = d6 - d8;
		float c7 = d7;
		float c8 = c5 - c6;

		float b0 = c0 + c3;
		float b1 = c1 + c2;
		float b2 = c1 - c2;
		float b3 = c0 - c3;
		float b4 = c4 - c8;
		float b5 = c8;
		float b6 = c6 - c7;
		float b7 = c7;

		mat[0*8+i] = b0 + b7;
		mat[1*8+i] = b1 + b6;
		mat[2*8+i] = b2 + b5;
		mat[3*8+i] = b3 + b4;
		mat[4*8+i] = b3 - b4;
		mat[5*8+i] = b2 - b5;
		mat[6*8+i] = b1 - b6;
		mat[7*8+i] = b0 - b7;
	}

	// Rows
    int yoff = 0;
	for (int i = 0; i < 8; i++) {
		float g0 = mat[yoff+0] * s0;
		float g1 = mat[yoff+4] * s4;
		float g2 = mat[yoff+2] * s2;
		float g3 = mat[yoff+6] * s6;
		float g4 = mat[yoff+5] * s5;
		float g5 = mat[yoff+1] * s1;
		float g6 = mat[yoff+7] * s7;
		float g7 = mat[yoff+3] * s3;

		float f0 = g0;
		float f1 = g1;
		float f2 = g2;
		float f3 = g3;
		float f4 = g4 - g7;
		float f5 = g5 + g6;
		float f6 = g5 - g6;
		float f7 = g4 + g7;

		float e0 = f0;
		float e1 = f1;
		float e2 = f2 - f3;
		float e3 = f2 + f3;
		float e4 = f4;
		float e5 = f5 - f7;
		float e6 = f6;
		float e7 = f5 + f7;
		float e8 = f4 + f6;

		float d0 = e0;
		float d1 = e1;
		float d2 = e2 * m1;
		float d3 = e3;
		float d4 = e4 * m2;
		float d5 = e5 * m3;
		float d6 = e6 * m4;
		float d7 = e7;
		float d8 = e8 * m5;

		float c0 = d0 + d1;
		float c1 = d0 - d1;
		float c2 = d2 - d3;
		float c3 = d3;
		float c4 = d4 + d8;
		float c5 = d5 + d7;
		float c6 = d6 - d8;
		float c7 = d7;
		float c8 = c5 - c6;

		float b0 = c0 + c3;
		float b1 = c1 + c2;
		float b2 = c1 - c2;
		float b3 = c0 - c3;
		float b4 = c4 - c8;
		float b5 = c8;
		float b6 = c6 - c7;
		float b7 = c7;

		int yposi = ypos + i;

		put(luma, luma_size, luma_width, xpos+0, yposi, b0+b7);
		put(luma, luma_size, luma_width, xpos+1, yposi, b1+b6);
		put(luma, luma_size, luma_width, xpos+2, yposi, b2+b5);
		put(luma, luma_size, luma_width, xpos+3, yposi, b3+b4);
		put(luma, luma_size, luma_width, xpos+4, yposi, b3-b4);
		put(luma, luma_size, luma_width, xpos+5, yposi, b2-b5);
		put(luma, luma_size, luma_width, xpos+6, yposi, b1-b6);
		put(luma, luma_size, luma_width, xpos+7, yposi, b0-b7);

        yoff += 8;
	}
}

A  => aan_cgo.go +42 -0
@@ 1,42 @@
package grugdct

// Can't use -Wall, need to disable -Wunused-variable:
// https://github.com/golang/go/issues/6883#issuecomment-383919125

// #include <stdio.h>
// #include "aan_cgo.h"
// #cgo CFLAGS: -std=c11 -O3 -ffast-math -Wall -Wno-unused-variable -Wpedantic
import "C"

import (
	"image"
)

type AAN8x8C struct{}

func NewAAN8x8C() AAN8x8C {
	return AAN8x8C{}
}

var _ ImageTransform8x8F32 = AAN8x8C{}

func (aan AAN8x8C) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (aan AAN8x8C) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
	matp := (*C.float)(&mat[0])
	luma := (*C.uint8_t)(&img.Pix[0])
	C.dct_8x8(luma, C.size_t(len(img.Pix)), C.int(img.W), C.int(xpos), C.int(ypos), matp)
	return mat
}

func (aan AAN8x8C) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
	matp := (*C.float)(&mat[0])
	luma := (*C.uint8_t)(&into.Pix[0])
	C.idct_8x8(matp, C.int(xpos), C.int(ypos), luma, C.size_t(len(into.Pix)), C.int(into.W))
}

func (aan AAN8x8C) DoNothing() {
	C.do_nothing()
}

A  => aan_cgo.h +7 -0
@@ 1,7 @@
#include <stdint.h>
#include <stdlib.h>

void dct_8x8(uint8_t *luma, size_t luma_size, int width, int xpos, int ypos, float *mat);
void idct_8x8(float *mat, int xpos, int ypos, uint8_t *luma, size_t luma_size, int luma_width);

void do_nothing(void);

A  => aan_cgo_test.go +111 -0
@@ 1,111 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestAANC(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8C()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			dct.PutMat328x8(x, y, mat)
			transform.IDCT8x8(mat, x, y, result)
		}
	}

	if !reflect.DeepEqual(img.Pix, result.Pix) {
		// Print2DUint8s(img.Luma, img.W, img.H, 0)
		// fmt.Println()
		// Print2DUint8s(result.Luma, result.W, result.H, 0)
		// fmt.Println()

		t.Fatal()
	}
}

func BenchmarkAANCWith8x8(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8C()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("baseline", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DoNothing()
				}
			}
		}
	})

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8(img, x, y)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)

	mats := make([]Matrix8x8F32, dct.DCTDims.X*dct.DCTDims.Y)

	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

A  => aan_flat.go +359 -0
@@ 1,359 @@
package grugdct

import "image"

// A version of the AAN algorithm with the fancy formatting ironed out. Vastly
// less readable.
type AAN8x8Flat struct{}

func NewAAN8x8Flat() AAN8x8Flat {
	return AAN8x8Flat{}
}

var _ ImageTransform8x8F32 = AAN8x8Flat{}

func (dct AAN8x8Flat) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (dct AAN8x8Flat) px(img Image, x, y int) uint8 {
	if x >= img.W || y >= img.H {
		return 0
	}
	return img.Pix[y*img.W+x]
}

func (aan AAN8x8Flat) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
	const (
		a1 = 0.7071067811865476  // sqrt(0.5)
		a2 = 0.5411961001461971  // sqrt(2) * cos(3/16 * 2 * pi)
		a3 = a1                  //
		a4 = 1.3065629648763766  // sqrt(2) * cos(1/16 * 2 * pi)
		a5 = 0.38268343236508984 // cos(3/16 * 2 * pi)

		s0 = 0.3535533905932738  // (cos(0)*sqrt(0.5) / 2) / (1)
		s1 = 0.2548977895520796  // (cos(1*pi/16) / 2) / (-a5+a4+1)
		s2 = 0.2705980500730985  // (cos(2*pi/16) / 2) / (a1+1)
		s3 = 0.30067244346752264 // (cos(3*pi/16) / 2) / (a5+1)
		s4 = s0                  // (cos(4*pi/16) / 2) / (1)
		s5 = 0.44998811156820795 // (cos(5*pi/16) / 2) / (1-a5)
		s6 = 0.6532814824381885  // (cos(6*pi/16) / 2) / (1-a1)
		s7 = 1.2814577238707532  // (cos(7*pi/16) / 2) / (a5-a4+1)
	)

	// Columns
	for i := 0; i < 8; i++ {
		xposi := xpos + i
		y0 := float32(aan.px(img, xposi, ypos+0))
		y1 := float32(aan.px(img, xposi, ypos+1))
		y2 := float32(aan.px(img, xposi, ypos+2))
		y3 := float32(aan.px(img, xposi, ypos+3))
		y4 := float32(aan.px(img, xposi, ypos+4))
		y5 := float32(aan.px(img, xposi, ypos+5))
		y6 := float32(aan.px(img, xposi, ypos+6))
		y7 := float32(aan.px(img, xposi, ypos+7))

		b0 := +y0 + y7
		b1 := +y1 + y6
		b2 := +y2 + y5
		b3 := +y3 + y4
		b4 := -y4 + y3
		b5 := -y5 + y2
		b6 := -y6 + y1
		b7 := -y7 + y0

		c0 := +b0 + b3
		c1 := +b1 + b2
		c2 := -b2 + b1
		c3 := -b3 + b0
		c4 := -b4 - b5
		c5 := +b5 + b6
		c6 := +b6 + b7
		c7 := +b7

		d0 := +c0 + c1
		d1 := -c1 + c0
		d2 := +c2 + c3
		d3 := +c3
		d4 := +c4
		d5 := +c5
		d6 := +c6
		d7 := +c7
		d8 := (d4 + d6) * a5

		e0 := d0
		e1 := d1
		e2 := d2 * a1
		e3 := d3
		e4 := -d4*a2 - d8
		e5 := d5 * a3
		e6 := d6*a4 - d8
		e7 := d7

		f0 := e0
		f1 := e1
		f2 := e2 + e3
		f3 := e3 - e2
		f4 := e4
		f5 := e5 + e7
		f6 := e6
		f7 := e7 - e5

		g0 := f0
		g1 := f1
		g2 := f2
		g3 := f3
		g4 := f4 + f7
		g5 := f5 + f6
		g6 := -f6 + f5
		g7 := f7 - f4

		mat[(0*8)+i] = g0 * s0
		mat[(4*8)+i] = g1 * s4
		mat[(2*8)+i] = g2 * s2
		mat[(6*8)+i] = g3 * s6
		mat[(5*8)+i] = g4 * s5
		mat[(1*8)+i] = g5 * s1
		mat[(7*8)+i] = g6 * s7
		mat[(3*8)+i] = g7 * s3
	}

	// Rows
	for i, yoff := 0, 0; i < 8; i, yoff = i+1, yoff+8 {
		x0 := mat[yoff+0]
		x1 := mat[yoff+1]
		x2 := mat[yoff+2]
		x3 := mat[yoff+3]
		x4 := mat[yoff+4]
		x5 := mat[yoff+5]
		x6 := mat[yoff+6]
		x7 := mat[yoff+7]

		b0 := +x0 + x7
		b1 := +x1 + x6
		b2 := +x2 + x5
		b3 := +x3 + x4
		b4 := -x4 + x3
		b5 := -x5 + x2
		b6 := -x6 + x1
		b7 := -x7 + x0

		c0 := +b0 + b3
		c1 := +b1 + b2
		c2 := -b2 + b1
		c3 := -b3 + b0
		c4 := -b4 - b5
		c5 := +b5 + b6
		c6 := +b6 + b7
		c7 := +b7

		d0 := +c0 + c1
		d1 := -c1 + c0
		d2 := +c2 + c3
		d3 := +c3
		d4 := +c4
		d5 := +c5
		d6 := +c6
		d7 := +c7
		d8 := (d4 + d6) * a5

		e0 := d0
		e1 := d1
		e2 := d2 * a1
		e3 := d3
		e4 := -d4*a2 - d8
		e5 := d5 * a3
		e6 := d6*a4 - d8
		e7 := d7

		f0 := e0
		f1 := e1
		f2 := e2 + e3
		f3 := e3 - e2
		f4 := e4
		f5 := e5 + e7
		f6 := e6
		f7 := e7 - e5

		g0 := f0
		g1 := f1
		g2 := f2
		g3 := f3
		g4 := f4 + f7
		g5 := f5 + f6
		g6 := -f6 + f5
		g7 := f7 - f4

		mat[yoff+0] = g0 * s0
		mat[yoff+4] = g1 * s4
		mat[yoff+2] = g2 * s2
		mat[yoff+6] = g3 * s6
		mat[yoff+5] = g4 * s5
		mat[yoff+1] = g5 * s1
		mat[yoff+7] = g6 * s7
		mat[yoff+3] = g7 * s3
	}

	return mat
}

func (aan AAN8x8Flat) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
	const (
		m0 = 1.8477590650225735 // 2 * cos(1/16 * 2 * pi)
		m1 = 1.4142135623730951 // 2 * cos(2/16 * 2 * pi)
		m3 = m1                 //
		m5 = 0.7653668647301797 // 2 * cos(3/16 * 2 * pi)
		m2 = m0 - m5
		m4 = m0 + m5

		s0 = 0.35355339059327373 // cos(0/16*pi) / sqrt(8)
		s1 = 0.4903926402016152  // cos(1/16*pi) / 2.0
		s2 = 0.46193976625564337 // cos(2/16*pi) / 2.0
		s3 = 0.4157348061512726  // cos(3/16*pi) / 2.0
		s4 = 0.3535533905932738  // cos(4/16*pi) / 2.0
		s5 = 0.27778511650980114 // cos(5/16*pi) / 2.0
		s6 = 0.19134171618254492 // cos(6/16*pi) / 2.0
		s7 = 0.09754516100806417 // cos(7/16*pi) / 2.0
	)

	// Columns
	for i := 0; i < 8; i++ {
		g0 := mat[8*0+i] * s0
		g1 := mat[4*8+i] * s4
		g2 := mat[2*8+i] * s2
		g3 := mat[6*8+i] * s6
		g4 := mat[5*8+i] * s5
		g5 := mat[1*8+i] * s1
		g6 := mat[7*8+i] * s7
		g7 := mat[3*8+i] * s3

		f0 := g0
		f1 := g1
		f2 := g2
		f3 := g3
		f4 := g4 - g7
		f5 := g5 + g6
		f6 := g5 - g6
		f7 := g4 + g7

		e0 := f0
		e1 := f1
		e2 := f2 - f3
		e3 := f2 + f3
		e4 := f4
		e5 := f5 - f7
		e6 := f6
		e7 := f5 + f7
		e8 := f4 + f6

		d0 := e0
		d1 := e1
		d2 := e2 * m1
		d3 := e3
		d4 := e4 * m2
		d5 := e5 * m3
		d6 := e6 * m4
		d7 := e7
		d8 := e8 * m5

		c0 := d0 + d1
		c1 := d0 - d1
		c2 := d2 - d3
		c3 := d3
		c4 := d4 + d8
		c5 := d5 + d7
		c6 := d6 - d8
		c7 := d7
		c8 := c5 - c6

		b0 := c0 + c3
		b1 := c1 + c2
		b2 := c1 - c2
		b3 := c0 - c3
		b4 := c4 - c8
		b5 := c8
		b6 := c6 - c7
		b7 := c7

		mat[0*8+i] = b0 + b7
		mat[1*8+i] = b1 + b6
		mat[2*8+i] = b2 + b5
		mat[3*8+i] = b3 + b4
		mat[4*8+i] = b3 - b4
		mat[5*8+i] = b2 - b5
		mat[6*8+i] = b1 - b6
		mat[7*8+i] = b0 - b7
	}

	// Rows
	for i, yoff := 0, 0; i < 8; i, yoff = i+1, yoff+8 {
		g0 := mat[yoff+0] * s0
		g1 := mat[yoff+4] * s4
		g2 := mat[yoff+2] * s2
		g3 := mat[yoff+6] * s6
		g4 := mat[yoff+5] * s5
		g5 := mat[yoff+1] * s1
		g6 := mat[yoff+7] * s7
		g7 := mat[yoff+3] * s3

		f0 := g0
		f1 := g1
		f2 := g2
		f3 := g3
		f4 := g4 - g7
		f5 := g5 + g6
		f6 := g5 - g6
		f7 := g4 + g7

		e0 := f0
		e1 := f1
		e2 := f2 - f3
		e3 := f2 + f3
		e4 := f4
		e5 := f5 - f7
		e6 := f6
		e7 := f5 + f7
		e8 := f4 + f6

		d0 := e0
		d1 := e1
		d2 := e2 * m1
		d3 := e3
		d4 := e4 * m2
		d5 := e5 * m3
		d6 := e6 * m4
		d7 := e7
		d8 := e8 * m5

		c0 := d0 + d1
		c1 := d0 - d1
		c2 := d2 - d3
		c3 := d3
		c4 := d4 + d8
		c5 := d5 + d7
		c6 := d6 - d8
		c7 := d7
		c8 := c5 - c6

		b0 := c0 + c3
		b1 := c1 + c2
		b2 := c1 - c2
		b3 := c0 - c3
		b4 := c4 - c8
		b5 := c8
		b6 := c6 - c7
		b7 := c7

		yposi := ypos + i

		// XXX: +0.5 is "grug rounding"
		into.Put(xpos+0, yposi, uint8(b0+b7+0.5))
		into.Put(xpos+1, yposi, uint8(b1+b6+0.5))
		into.Put(xpos+2, yposi, uint8(b2+b5+0.5))
		into.Put(xpos+3, yposi, uint8(b3+b4+0.5))
		into.Put(xpos+4, yposi, uint8(b3-b4+0.5))
		into.Put(xpos+5, yposi, uint8(b2-b5+0.5))
		into.Put(xpos+6, yposi, uint8(b1-b6+0.5))
		into.Put(xpos+7, yposi, uint8(b0-b7+0.5))
	}
}

A  => aan_flat_test.go +100 -0
@@ 1,100 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestAANFlat(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			dct.PutMat328x8(x, y, mat)
			transform.IDCT8x8(mat, x, y, result)
		}
	}

	if !reflect.DeepEqual(img.Pix, result.Pix) {
		// Print2DUint8s(img.Pix, img.W, img.H, 0)
		// fmt.Println()
		// Print2DUint8s(result.Pix, result.W, result.H, 0)
		// fmt.Println()

		t.Fatal()
	}
}

func BenchmarkAANFlat(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8(img, x, y)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)

	mats := make([]Matrix8x8F32, dct.DCTDims.X*dct.DCTDims.Y)

	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

A  => aan_std.go +208 -0
@@ 1,208 @@
package grugdct

import (
	"image"
)

type AAN8x8 struct{}

func NewAAN8x8() AAN8x8 {
	return AAN8x8{}
}

var _ ImageTransform8x8F32 = AAN8x8{}

func (aan AAN8x8) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (aan AAN8x8) px(img Image, x, y int) uint8 {
	if x >= img.W || y >= img.H {
		return 0
	}
	return img.Pix[y*img.W+x]
}

func (aan AAN8x8) putpx(img Image, x, y int, v uint8) {
	if x >= img.W || y >= img.H {
		return
	}
	img.Pix[y*img.W+x] = v
}

func (aan AAN8x8) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
	const (
		a1 = 0.7071067811865476  // sqrt(0.5)
		a2 = 0.5411961001461971  // sqrt(2) * cos(3/16 * 2 * pi)
		a3 = a1                  //
		a4 = 1.3065629648763766  // sqrt(2) * cos(1/16 * 2 * pi)
		a5 = 0.38268343236508984 // cos(3/16 * 2 * pi)

		s0 = 0.3535533905932738  // (cos(0)*sqrt(0.5) / 2) / (1)
		s1 = 0.2548977895520796  // (cos(1*pi/16) / 2) / (-a5+a4+1)
		s2 = 0.2705980500730985  // (cos(2*pi/16) / 2) / (a1+1)
		s3 = 0.30067244346752264 // (cos(3*pi/16) / 2) / (a5+1)
		s4 = s0                  // (cos(4*pi/16) / 2) / (1)
		s5 = 0.44998811156820795 // (cos(5*pi/16) / 2) / (1-a5)
		s6 = 0.6532814824381885  // (cos(6*pi/16) / 2) / (1-a1)
		s7 = 1.2814577238707532  // (cos(7*pi/16) / 2) / (a5-a4+1)

		__ = 0
	)

	// Columns
	for i := 0; i < 8; i++ {
		xposi := xpos + i
		y0 := float32(aan.px(img, xposi, ypos+0))
		y1 := float32(aan.px(img, xposi, ypos+1))
		y2 := float32(aan.px(img, xposi, ypos+2))
		y3 := float32(aan.px(img, xposi, ypos+3))
		y4 := float32(aan.px(img, xposi, ypos+4))
		y5 := float32(aan.px(img, xposi, ypos+5))
		y6 := float32(aan.px(img, xposi, ypos+6))
		y7 := float32(aan.px(img, xposi, ypos+7))

		/*                                i, 0        i, 1        i, 2        i, 3        i, 4        i, 5        i, 6        i, 7       */
		/*                                ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		b0, b1, b2, b3, b4, b5, b6, b7 := (+y0 + y7), (+y1 + y6), (+y2 + y5), (+y3 + y4), (-y4 + y3), (-y5 + y2), (-y6 + y1), (-y7 + y0)
		c0, c1, c2, c3, c4, c5, c6, c7 := (+b0 + b3), (+b1 + b2), (-b2 + b1), (-b3 + b0), (-b4 - b5), (+b5 + b6), (+b6 + b7), (+b7)
		d0, d1, d2, d3, d4, d5, d6, d7 := (+c0 + c1), (-c1 + c0), (+c2 + c3), (+c3) /**/, (+c4) /**/, (+c5) /**/, (+c6) /**/, (+c7)
		d8 := (d4 + d6) * a5 /*           ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓          */
		e0, e1, e2, e3, e4, e5, e6, e7 := (+d0) /**/, (+d1) /**/, (+d2 * a1), (+d3) /**/, -d4*+a2-d8, (+d5 * a3), +d6*+a4-d8, (+d7)
		f0, f1, f2, f3, f4, f5, f6, f7 := (+e0) /**/, (+e1) /**/, (+e2 + e3), (+e3 - e2), (+e4) /**/, (+e5 + e7), (+e6) /**/, (+e7 - e5)
		g0, g1, g2, g3, g4, g5, g6, g7 := (+f0) /**/, (+f1) /**/, (+f2) /**/, (+f3) /**/, (+f4 + f7), (+f5 + f6), (-f6 + f5), (+f7 - f4)

		mat[(0*8)+i] = g0 * s0
		mat[(4*8)+i] = g1 * s4
		mat[(2*8)+i] = g2 * s2
		mat[(6*8)+i] = g3 * s6
		mat[(5*8)+i] = g4 * s5
		mat[(1*8)+i] = g5 * s1
		mat[(7*8)+i] = g6 * s7
		mat[(3*8)+i] = g7 * s3
	}

	// Rows
	for i, yoff := 0, 0; i < 8; i, yoff = i+1, yoff+8 {
		yi := i * 8
		x0 := mat[yi+0]
		x1 := mat[yi+1]
		x2 := mat[yi+2]
		x3 := mat[yi+3]
		x4 := mat[yi+4]
		x5 := mat[yi+5]
		x6 := mat[yi+6]
		x7 := mat[yi+7]

		/*                                0, i        1, i        2, i        3, i        4, i        5, i        6, i        7, i       */
		/*                                ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		b0, b1, b2, b3, b4, b5, b6, b7 := (+x0 + x7), (+x1 + x6), (+x2 + x5), (+x3 + x4), (-x4 + x3), (-x5 + x2), (-x6 + x1), (-x7 + x0)
		c0, c1, c2, c3, c4, c5, c6, c7 := (+b0 + b3), (+b1 + b2), (-b2 + b1), (-b3 + b0), (-b4 - b5), (+b5 + b6), (+b6 + b7), (+b7)
		d0, d1, d2, d3, d4, d5, d6, d7 := (+c0 + c1), (-c1 + c0), (+c2 + c3), (+c3) /**/, (+c4) /**/, (+c5) /**/, (+c6) /**/, (+c7)
		d8 := (d4 + d6) * a5 /*           ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓          */
		e0, e1, e2, e3, e4, e5, e6, e7 := (+d0) /**/, (+d1) /**/, (+d2 * a1), (+d3) /**/, -d4*+a2-d8, (+d5 * a3), +d6*+a4-d8, (+d7)
		f0, f1, f2, f3, f4, f5, f6, f7 := (+e0) /**/, (+e1) /**/, (+e2 + e3), (+e3 - e2), (+e4) /**/, (+e5 + e7), (+e6) /**/, (+e7 - e5)
		g0, g1, g2, g3, g4, g5, g6, g7 := (+f0) /**/, (+f1) /**/, (+f2) /**/, (+f3) /**/, (+f4 + f7), (+f5 + f6), (-f6 + f5), (+f7 - f4)

		mat[yoff+0] = g0 * s0
		mat[yoff+4] = g1 * s4
		mat[yoff+2] = g2 * s2
		mat[yoff+6] = g3 * s6
		mat[yoff+5] = g4 * s5
		mat[yoff+1] = g5 * s1
		mat[yoff+7] = g6 * s7
		mat[yoff+3] = g7 * s3
	}

	return mat
}

// NOTE: This was slower with a pointer to the matrix.
func (aan AAN8x8) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
	const (
		m0 = 1.8477590650225735 // 2 * cos(1/16 * 2 * pi)
		m1 = 1.4142135623730951 // 2 * cos(2/16 * 2 * pi)
		m3 = m1                 //
		m5 = 0.7653668647301797 // 2 * cos(3/16 * 2 * pi)
		m2 = m0 - m5
		m4 = m0 + m5

		s0 = 0.35355339059327373 // cos(0/16*pi) / sqrt(8)
		s1 = 0.4903926402016152  // cos(1/16*pi) / 2.0
		s2 = 0.46193976625564337 // cos(2/16*pi) / 2.0
		s3 = 0.4157348061512726  // cos(3/16*pi) / 2.0
		s4 = 0.3535533905932738  // cos(4/16*pi) / 2.0
		s5 = 0.27778511650980114 // cos(5/16*pi) / 2.0
		s6 = 0.19134171618254492 // cos(6/16*pi) / 2.0
		s7 = 0.09754516100806417 // cos(7/16*pi) / 2.0

		__ = 0
	)

	// Columns
	for i := 0; i < 8; i++ {
		g0 := mat[8*0+i] * s0
		g1 := mat[4*8+i] * s4
		g2 := mat[2*8+i] * s2
		g3 := mat[6*8+i] * s6
		g4 := mat[5*8+i] * s5
		g5 := mat[1*8+i] * s1
		g6 := mat[7*8+i] * s7
		g7 := mat[3*8+i] * s3

		/*                                    i, 0        i, 1        i, 2        i, 3        i, 4        i, 5        i, 6        i, 7        extra      */
		/*                                    ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		f0, f1, f2, f3, f4, f5, f6, f7, _8 := (+g0) /**/, (+g1) /**/, (+g2) /**/, (+g3) /**/, (+g4 - g7), (+g5 + g6), (+g5 - g6), (+g4 + g7), __
		e0, e1, e2, e3, e4, e5, e6, e7, e8 := (+f0) /**/, (+f1) /**/, (+f2 - f3), (+f2 + f3), (+f4) /**/, (+f5 - f7), (+f6) /**/, (+f5 + f7), (+f4 + f6)
		d0, d1, d2, d3, d4, d5, d6, d7, d8 := (+e0) /**/, (+e1) /**/, (+e2 * m1), (+e3) /**/, (+e4 * m2), (+e5 * m3), (+e6 * m4), (+e7) /**/, (+e8 * m5)
		c0, c1, c2, c3, c4, c5, c6, c7, _8 := (+d0 + d1), (+d0 - d1), (+d2 - d3), (+d3) /**/, (+d4 + d8), (+d5 + d7), (+d6 - d8), (+d7) /**/, __
		c8 := (+c5 - c6) /*                   ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓                      */
		b0, b1, b2, b3, b4, b5, b6, b7, _8 := (+c0 + c3), (+c1 + c2), (+c1 - c2), (+c0 - c3), (+c4 - c8), (+c8) /**/, (+c6 - c7), (+c7) /**/, __

		_ = _8

		mat[0*8+i] = b0 + b7
		mat[1*8+i] = b1 + b6
		mat[2*8+i] = b2 + b5
		mat[3*8+i] = b3 + b4
		mat[4*8+i] = b3 - b4
		mat[5*8+i] = b2 - b5
		mat[6*8+i] = b1 - b6
		mat[7*8+i] = b0 - b7
	}

	// Rows
	for i, yoff := 0, 0; i < 8; i, yoff = i+1, yoff+8 {
		g0 := mat[yoff+0] * s0
		g1 := mat[yoff+4] * s4
		g2 := mat[yoff+2] * s2
		g3 := mat[yoff+6] * s6
		g4 := mat[yoff+5] * s5
		g5 := mat[yoff+1] * s1
		g6 := mat[yoff+7] * s7
		g7 := mat[yoff+3] * s3

		/*                                    i, 0        i, 1        i, 2        i, 3        i, 4        i, 5        i, 6        i, 7        extra      */
		/*                                    ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ──────────  ────────── */
		f0, f1, f2, f3, f4, f5, f6, f7, _8 := (+g0) /**/, (+g1) /**/, (+g2) /**/, (+g3) /**/, (+g4 - g7), (+g5 + g6), (+g5 - g6), (+g4 + g7), __
		e0, e1, e2, e3, e4, e5, e6, e7, e8 := (+f0) /**/, (+f1) /**/, (+f2 - f3), (+f2 + f3), (+f4) /**/, (+f5 - f7), (+f6) /**/, (+f5 + f7), (+f4 + f6)
		d0, d1, d2, d3, d4, d5, d6, d7, d8 := (+e0) /**/, (+e1) /**/, (+e2 * m1), (+e3) /**/, (+e4 * m2), (+e5 * m3), (+e6 * m4), (+e7) /**/, (+e8 * m5)
		c0, c1, c2, c3, c4, c5, c6, c7, _8 := (+d0 + d1), (+d0 - d1), (+d2 - d3), (+d3) /**/, (+d4 + d8), (+d5 + d7), (+d6 - d8), (+d7) /**/, __
		c8 := (+c5 - c6) /*                   ↓           ↓           ↓           ↓           ↓           ↓           ↓           ↓                      */
		b0, b1, b2, b3, b4, b5, b6, b7, _8 := (+c0 + c3), (+c1 + c2), (+c1 - c2), (+c0 - c3), (+c4 - c8), (+c8) /**/, (+c6 - c7), (+c7) /**/, __

		_ = _8

		yposi := ypos + i

		// XXX: +0.5 is "grug rounding"
		aan.putpx(into, xpos+0, yposi, uint8(b0+b7+0.5))
		aan.putpx(into, xpos+1, yposi, uint8(b1+b6+0.5))
		aan.putpx(into, xpos+2, yposi, uint8(b2+b5+0.5))
		aan.putpx(into, xpos+3, yposi, uint8(b3+b4+0.5))
		aan.putpx(into, xpos+4, yposi, uint8(b3-b4+0.5))
		aan.putpx(into, xpos+5, yposi, uint8(b2-b5+0.5))
		aan.putpx(into, xpos+6, yposi, uint8(b1-b6+0.5))
		aan.putpx(into, xpos+7, yposi, uint8(b0-b7+0.5))
	}
}

A  => aan_std_test.go +122 -0
@@ 1,122 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestAANStd(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			dct.PutMat328x8(x, y, mat)
			transform.IDCT8x8(mat, x, y, result)
		}
	}

	// Print2DFloats(dct.Data, dct.W, dct.H, 6, 1)
	// Print2DUint8s(img.Pix, img.W, img.H, 0)
	// fmt.Println()
	// Print2DUint8s(result.Pix, result.W, result.H, 0)

	if !reflect.DeepEqual(img.Pix, result.Pix) {
		t.Fatal()
	}
}

func TestAANStdRoundtripCases(t *testing.T) {
	transform := NewAAN8x8()

	for _, timg := range TestImages {
		t.Run(timg.CaseName("aanstd"), func(t *testing.T) {
			dct := NewBuffer32(timg.Size(), transform.DCTSize())
			tw, th := transform.DCTSize().X, transform.DCTSize().Y
			result := NewImage(timg.W, timg.H)
			for y := 0; y < timg.H; y += th {
				for x := 0; x < timg.W; x += tw {
					mat := transform.DCT8x8(timg.Image, x, y)
					dct.PutMat328x8(x, y, mat)
					transform.IDCT8x8(mat, x, y, result)
				}
			}

			if !reflect.DeepEqual(timg.Pix, result.Pix) {
				t.Fatal()
			}
		})
	}
}

func BenchmarkAANStdWith8x8(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewAAN8x8()
	dct := NewBuffer32(img.Size(), transform.DCTSize())
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.DCT8x8(img, x, y)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)
	mats := make([]Matrix8x8F32, dct.DCTDims.X*dct.DCTDims.Y)

	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

A  => basic.go +114 -0
@@ 1,114 @@
package grugdct

import (
	"image"
	"math"
)

type Basic struct {
	W, H                int
	piDivByDoubleWidth  float64
	piDivByDoubleHeight float64
	dctScale            float64
	idctScale           float64
}

var _ ImageTransformF64 = Basic{}

func NewBasic(w, h int) Basic {
	idctScale := math.Sqrt(float64(w)*float64(h)) / 2
	return Basic{
		W:                   w,
		H:                   h,
		dctScale:            1 / idctScale,
		idctScale:           idctScale,
		piDivByDoubleWidth:  math.Pi / float64(w*2),
		piDivByDoubleHeight: math.Pi / float64(h*2),
	}
}

func (bas Basic) DCTSize() image.Point {
	return image.Point{bas.W, bas.W}
}

func (bas Basic) Matrix() MatrixF64 {
	return NewMatrixF64(bas.W, bas.H)
}

func (bas Basic) DCTInto(img Image, xpos int, ypos int, mat MatrixF64) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)

	w, h := bas.W, bas.H

	for v, voff := 0, 0; v < h; v, voff = v+1, voff+bas.W {
		for u := 0; u < w; u++ {
			var cu, cv, z float64 = 1, 1, 0
			if u == 0 {
				cu = oneOverSqrt2
			}
			if v == 0 {
				cv = oneOverSqrt2
			}

			for y := 0; y < h; y++ {
				for x := 0; x < w; x++ {
					xp, yp := xpos+x, ypos+y
					if xp < img.W && yp < img.H {
						px := img.Pix[yp*img.W+xp]
						q := float64(px) *
							math.Cos(float64(2*x+1)*float64(u)*bas.piDivByDoubleWidth) *
							math.Cos(float64(2*y+1)*float64(v)*bas.piDivByDoubleHeight)
						z += q
					}

				}
			}

			mat.V[voff+u] = bas.dctScale * cu * cv * z
		}
	}
}

func (bas Basic) IDCTInto(mat MatrixF64, xpos int, ypos int, into Image) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)

	w, h := bas.W, bas.H

	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			var z float64

			for v, voff := 0, 0; v < h; v, voff = v+1, voff+bas.W {
				for u := 0; u < w; u++ {
					var cu, cv float64 = 1, 1
					if u == 0 {
						cu = oneOverSqrt2
					}
					if v == 0 {
						cv = oneOverSqrt2
					}
					s := mat.V[voff+u]
					q := cu * cv * s *
						math.Cos(float64(2*x+1)*float64(u)*bas.piDivByDoubleWidth) *
						math.Cos(float64(2*y+1)*float64(v)*bas.piDivByDoubleHeight)

					z += q
				}
			}

			z /= bas.idctScale
			if z > 255.0 {
				z = 255.0
			}
			if z < 0 {
				z = 0.0
			}

			xp, yp := xpos+x, ypos+y
			if xp < into.W && yp < into.H {
				// It's inaccurate without the round. 255.5 will truncate to 255, not wrap.
				into.Pix[yp*into.W+xp] = uint8(z + 0.5)
			}
		}
	}
}

A  => basic_test.go +93 -0
@@ 1,93 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestBasic(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewBasic(4, 3)
	mat := transform.Matrix()
	result := NewImage(img.W, img.H)

	for y := 0; y < img.H; y += transform.H {
		for x := 0; x < img.W; x += transform.W {
			transform.DCTInto(img, x, y, mat)
			transform.IDCTInto(mat, x, y, result)
		}
	}

	if !reflect.DeepEqual(img.Pix, result.Pix) {
		t.Fatal()
	}
}

func BenchmarkBasicWith8x8(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewBasic(8, 8)
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += transform.H {
				for x := 0; x < img.W; x += transform.W {
					transform.DCTInto(img, x, y, mat)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)
	mats := make([]MatrixF64, dctH*dctW)
	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mat := transform.Matrix()
			transform.DCTInto(img, x, y, mat)
			mats[m] = mat
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += transform.H {
				for x := 0; x < img.W; x += transform.W {
					transform.IDCTInto(mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

A  => buffer.go +115 -0
@@ 1,115 @@
package grugdct

import (
	"image"
)

// NOTE: using a generic for this slows it down considerably if you use the At() method
// (or any other helpers we might add). Not worth the trade.
type Buffer32 struct {
	W, H      int
	ImageSize image.Point
	DCTSize   image.Point
	DCTDims   image.Point // Number of DCT cells in the image's grid
	Data      []float32
}

func NewBuffer32(imgSize image.Point, dctSize image.Point) *Buffer32 {
	imgW, imgH := imgSize.X, imgSize.Y
	dctW, dctH := dctSize.X, dctSize.Y

	w := ((imgW + dctW - 1) / dctW)
	h := ((imgH + dctH - 1) / dctH)

	dctDims := image.Pt(w, h)

	w *= dctW
	h *= dctH

	return &Buffer32{
		Data:      make([]float32, w*h),
		W:         w,
		H:         h,
		ImageSize: imgSize,
		DCTSize:   dctSize,
		DCTDims:   dctDims,
	}
}

func (b *Buffer32) At(x, y int) float32 {
	return b.Data[y*b.W+x]
}

func (b *Buffer32) Mat328x8(x, y int) (mat Matrix8x8F32) {
	off := y*b.W + x
	jump := b.W - 7

	for i := 0; i < len(mat); i++ {
		mat[i] = b.Data[off]
		if i&0b111 == 0b111 {
			off += jump // Move to the first x-offset column on the next row
		} else {
			off++ // Move to the next column
		}
	}
	return mat
}

func (b *Buffer32) Mat328x8Into(x, y int, mat *Matrix8x8F32) {
	off := y*b.W + x
	jump := b.W - 7

	for i := 0; i < len(mat); i++ {
		mat[i] = b.Data[off]
		if i&0b111 == 0b111 {
			off += jump // Move to the first x-offset column on the next row
		} else {
			off++ // Move to the next column
		}
	}
}

func (b *Buffer32) PutMat328x8(x, y int, mat Matrix8x8F32) {
	off := y*b.W + x
	jump := b.W - 7

	for i := 0; i < len(mat); i++ {
		b.Data[off] = mat[i]
		if i&0b111 == 0b111 {
			off += jump // Move to the first x-offset column on the next row
		} else {
			off++ // Move to the next column
		}
	}
}

// NOTE: using a generic for this slows it down considerably if you use the At() method
// (or any other helpers we might add). Not worth the trade.
type Buffer64 struct {
	W, H           int
	ImageW, ImageH int
	DCTW, DCTH     int
	Data           []float64
}

func NewBuffer64(imgSize image.Point, dctSize image.Point) *Buffer64 {
	imgW, imgH := imgSize.X, imgSize.Y
	dctW, dctH := dctSize.X, dctSize.Y

	w := ((imgW + dctW - 1) / dctW)
	h := ((imgH + dctH - 1) / dctH)

	w *= dctW
	h *= dctH

	return &Buffer64{
		Data: make([]float64, w*h),
		W:    w, H: h,
		ImageW: imgW, ImageH: imgH,
		DCTW: dctW, DCTH: dctH,
	}
}

func (b *Buffer64) At(x, y int) float64 {
	return b.Data[y*b.W+x]
}

A  => cachedcos8x8.go +96 -0
@@ 1,96 @@
package grugdct

import (
	"image"
	"math"
)

var cosines8x8LUT32 [8][8]float32 = func() (out [8][8]float32) {
	piDivByDoubleWidth := math.Pi / float64(8*2)
	for y := 0; y < 8; y++ {
		for x := 0; x < 8; x++ {
			out[x][y] = float32(math.Cos(float64(2*x+1) * float64(y) * piDivByDoubleWidth))
		}
	}
	return out
}()

type CachedCosines8x8 struct{}

var _ ImageTransform8x8F32 = CachedCosines8x8{}

func NewCachedCosines8x8() CachedCosines8x8 {
	return CachedCosines8x8{}
}

func (cos CachedCosines8x8) DCTSize() image.Point {
	return image.Point{8, 8}
}

func (cos CachedCosines8x8) DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)
	const scale = 0.25                      // 1 / (sqrt(8*8) / 2)

	for v, voff := 0, 0; v < 8; v, voff = v+1, voff+8 {
		for u := 0; u < 8; u++ {
			var cu, cv, z float32 = 1, 1, 0
			if u == 0 {
				cu = oneOverSqrt2
			}
			if v == 0 {
				cv = oneOverSqrt2
			}
			for y := 0; y < 8; y++ {
				for x := 0; x < 8; x++ {
					xp, yp := xpos+x, ypos+y
					if xp < img.W && yp < img.H {
						px := img.Pix[yp*img.W+xp]
						z += float32(px) * cosines8x8LUT32[x][u] * cosines8x8LUT32[y][v]
					}
				}
			}

			mat[voff+u] = scale * cu * cv * z
		}
	}
	return mat
}

func (cos CachedCosines8x8) IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)
	const scale = 4.0                       // sqrt(8*8) / 2

	for y := 0; y < 8; y++ {
		for x := 0; x < 8; x++ {
			var z float32

			for v, voff := 0, 0; v < 8; v, voff = v+1, voff+8 {
				for u := 0; u < 8; u++ {
					var cu, cv float32 = 1, 1
					if u == 0 {
						cu = oneOverSqrt2
					}
					if v == 0 {
						cv = oneOverSqrt2
					}
					s := mat[voff+u]
					z += cu * cv * s * cosines8x8LUT32[x][u] * cosines8x8LUT32[y][v]
				}
			}

			z /= scale
			if z > 255.0 {
				z = 255.0
			}
			if z < 0 {
				z = 0.0
			}

			xp, yp := xpos+x, ypos+y
			if xp < into.W && yp < into.H {
				// It's inaccurate without the round. 255.5 will truncate to 255, not wrap.
				into.Pix[yp*into.W+xp] = uint8(z + 0.5)
			}
		}
	}
}

A  => cachedcos8x8_test.go +108 -0
@@ 1,108 @@
package grugdct

import (
	"reflect"
	"testing"
)

func TestCachedCosines8x8RoundtripCases(t *testing.T) {
	transform := NewCachedCosines8x8()

	for _, timg := range TestImages {
		t.Run(timg.CaseName("cachedcos8x8"), func(t *testing.T) {
			tw, th := transform.DCTSize().X, transform.DCTSize().Y
			result := NewImage(timg.W, timg.H)
			for y := 0; y < timg.H; y += th {
				for x := 0; x < timg.W; x += tw {
					mat := transform.DCT8x8(timg.Image, x, y)
					transform.IDCT8x8(mat, x, y, result)
				}
			}

			if !reflect.DeepEqual(timg.Pix, result.Pix) {
				// Print2DUint8s(timg.Pix, timg.W, timg.H, 0)
				// fmt.Println()
				// Print2DUint8s(result.Pix, result.W, result.H, 0)
				// fmt.Println()

				t.Fatal()
			}
		})
	}
}

func BenchmarkCachedCosines8x8With8x8(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewCachedCosines8x8()
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctH {
					transform.DCT8x8(img, x, y)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)
	mats := make([]Matrix8x8F32, dctH*dctW)
	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mats[m] = transform.DCT8x8(img, x, y)
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += dctH {
				for x := 0; x < img.W; x += dctW {
					transform.IDCT8x8(mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

func BenchmarkCachedCosines8x8WithASingle8x8DCTCall(b *testing.B) {
	img := Image{
		W: 8, H: 8,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22,
			131, 11, 12, 44, 131, 11, 12, 44,
			94, 51, 88, 66, 94, 51, 88, 66,
			0, 128, 255, 22, 131, 11, 12, 44,
			131, 11, 12, 44, 94, 51, 88, 66,
			94, 51, 88, 66, 0, 128, 255, 22,
			131, 11, 12, 44, 131, 11, 12, 44,
			94, 51, 88, 66, 131, 11, 12, 44,
		},
	}

	transform := NewCachedCosines8x8()
	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			transform.DCT8x8(img, 0, 0)
		}
	})
}

A  => cachedcosstd.go +139 -0
@@ 1,139 @@
package grugdct

import (
	"image"
	"math"
)

var (
	cachedCos8x8 = newCachedCosines(8, 8)
)

type CachedCosines struct {
	W, H      int
	dctScale  float64
	idctScale float64
	wcos      []float64
	hcos      []float64
}

var _ ImageTransformF64 = CachedCosines{}

func NewCachedCosines(w, h int) CachedCosines {
	if w == 8 && h == 8 {
		return cachedCos8x8
	}
	return newCachedCosines(w, h)
}

func newCachedCosines(w, h int) CachedCosines {
	idctScale := math.Sqrt(float64(w)*float64(h)) / 2
	b := CachedCosines{
		W:         w,
		H:         h,
		dctScale:  1 / idctScale,
		idctScale: idctScale,
		wcos:      make([]float64, w*w),
		hcos:      make([]float64, h*h),
	}

	piDivByDoubleWidth := math.Pi / float64(w*2)
	for i, u := 0, 0; u < w; u++ {
		for x := 0; x < w; x++ {
			b.wcos[i] = math.Cos(float64(2*x+1) * float64(u) * piDivByDoubleWidth)
			i++
		}
	}

	if w == h {
		b.hcos = b.wcos
	} else {
		piDivByDoubleHeight := math.Pi / float64(h*2)
		for i, v := 0, 0; v < h; v++ {
			for y := 0; y < h; y++ {
				b.hcos[i] = math.Cos(float64(2*y+1) * float64(v) * piDivByDoubleHeight)
				i++
			}
		}
	}

	return b
}

func (dct CachedCosines) DCTSize() image.Point {
	return image.Point{dct.W, dct.H}
}

func (cos CachedCosines) Matrix() MatrixF64 {
	return NewMatrixF64(cos.W, cos.H)
}

func (cos CachedCosines) DCTInto(img Image, xpos int, ypos int, mat MatrixF64) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)

	w, h := cos.W, cos.H

	for v, voff := 0, 0; v < h; v, voff = v+1, voff+w {
		for u := 0; u < w; u++ {
			var cu, cv, z float64 = 1, 1, 0
			if u == 0 {
				cu = oneOverSqrt2
			}
			if v == 0 {
				cv = oneOverSqrt2
			}

			for y := 0; y < h; y++ {
				for x := 0; x < w; x++ {
					xp, yp := xpos+x, ypos+y
					if xp < img.W && yp < img.H {
						px := img.Pix[yp*img.W+xp]
						z += float64(px) * cos.wcos[u*w+x] * cos.hcos[v*h+y]
					}
				}
			}

			mat.V[voff+u] = cos.dctScale * cu * cv * z
		}
	}
}

func (dct CachedCosines) IDCTInto(mat MatrixF64, xpos int, ypos int, into Image) {
	const oneOverSqrt2 = 0.7071067811865475 // 1 / sqrt(2)

	w, h := dct.W, dct.H

	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			var z float64

			for v, voff := 0, 0; v < h; v, voff = v+1, voff+dct.W {
				for u := 0; u < w; u++ {
					var cu, cv float64 = 1, 1
					if u == 0 {
						cu = oneOverSqrt2
					}
					if v == 0 {
						cv = oneOverSqrt2
					}
					s := mat.V[voff+u]
					z += cu * cv * s * dct.wcos[u*w+x] * dct.hcos[v*h+y]
				}
			}

			z /= dct.idctScale
			if z > 255.0 {
				z = 255.0
			}
			if z < 0 {
				z = 0.0
			}

			xp, yp := xpos+x, ypos+y
			if xp < into.W && yp < into.H {
				// It's inaccurate without the round. 255.5 will truncate to 255, not wrap.
				into.Pix[yp*into.W+xp] = uint8(z + 0.5)
			}
		}
	}
}

A  => cachedcosstd_test.go +145 -0
@@ 1,145 @@
package grugdct

import (
	"fmt"
	"image"
	"reflect"
	"testing"
)

func TestCachedCosines(t *testing.T) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	t.Run("4x3", func(t *testing.T) {
		transform := NewCachedCosines(4, 3)
		mat := transform.Matrix()
		result := NewImage(img.W, img.H)

		for y := 0; y < img.H; y += transform.H {
			for x := 0; x < img.W; x += transform.W {
				transform.DCTInto(img, x, y, mat)
				transform.IDCTInto(mat, x, y, result)
			}
		}

		if !reflect.DeepEqual(img.Pix, result.Pix) {
			t.Fatal()
		}
	})

	t.Run("8x8", func(t *testing.T) {
		transform := NewCachedCosines(8, 8)
		mat := transform.Matrix()
		result := NewImage(img.W, img.H)

		for y := 0; y < img.H; y += transform.H {
			for x := 0; x < img.W; x += transform.W {
				transform.DCTInto(img, x, y, mat)
				transform.IDCTInto(mat, x, y, result)
			}
		}

		if !reflect.DeepEqual(img.Pix, result.Pix) {
			t.Fatal()
		}
	})
}

func TestCachedCosinesRoundtripCases(t *testing.T) {
	dctSizes := []image.Point{
		image.Pt(1, 1),
		image.Pt(4, 3),
		image.Pt(8, 8),
		image.Pt(8, 8),
	}

	for _, dctSize := range dctSizes {
		transform := NewCachedCosines(dctSize.X, dctSize.Y)
		mat := transform.Matrix()

		for _, timg := range TestImages {
			t.Run(timg.CaseName(fmt.Sprintf("cachedcos-%s", dctSize)), func(t *testing.T) {
				tw, th := transform.DCTSize().X, transform.DCTSize().Y
				result := NewImage(timg.W, timg.H)
				for y := 0; y < timg.H; y += th {
					for x := 0; x < timg.W; x += tw {
						transform.DCTInto(timg.Image, x, y, mat)
						transform.IDCTInto(mat, x, y, result)
					}
				}

				if !reflect.DeepEqual(timg.Pix, result.Pix) {
					t.Fatal()
				}
			})
		}
	}
}

func BenchmarkCachedCosinesWith8x8(b *testing.B) {
	img := Image{
		W: 9, H: 9,
		Pix: []uint8{
			0, 128, 255, 22, 0, 128, 255, 22, 99,
			131, 11, 12, 44, 131, 11, 12, 44, 88,
			94, 51, 88, 66, 94, 51, 88, 66, 77,
			0, 128, 255, 22, 131, 11, 12, 44, 66,
			131, 11, 12, 44, 94, 51, 88, 66, 55,
			94, 51, 88, 66, 0, 128, 255, 22, 44,
			131, 11, 12, 44, 131, 11, 12, 44, 33,
			94, 51, 88, 66, 131, 11, 12, 44, 22,
			84, 41, 78, 56, 121, 1, 2, 34, 12,
		},
	}

	transform := NewCachedCosines(8, 8)
	dctH, dctW := transform.DCTSize().X, transform.DCTSize().Y

	b.Run("dct", func(b *testing.B) {
		b.ReportAllocs()
		mat := transform.Matrix()
		for i := 0; i < b.N; i++ {
			for y := 0; y < img.H; y += transform.H {
				for x := 0; x < img.W; x += transform.W {
					transform.DCTInto(img, x, y, mat)
				}
			}
		}
	})

	result := NewImage(img.W, img.H)
	mats := make([]MatrixF64, dctH*dctW)
	for m, y := 0, 0; y < img.H; y += dctH {
		for x := 0; x < img.W; x += dctW {
			mat := transform.Matrix()
			transform.DCTInto(img, x, y, mat)
			mats[m] = mat
			m++
		}
	}

	b.Run("idct", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			for m, y := 0, 0; y < img.H; y += transform.H {
				for x := 0; x < img.W; x += transform.W {
					transform.IDCTInto(mats[m], x, y, result)
					m++
				}
			}
		}
	})
}

A  => debug.go +79 -0
@@ 1,79 @@
package grugdct

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strconv"
)

func Sprint2DUint8s(vs []uint8, width, height int, base int) string {
	var buf bytes.Buffer
	Fprint2DUint8s(&buf, vs, width, height, base)
	return buf.String()
}

func Print2DUint8s(vs []uint8, width, height int, base int) (n int, err error) {
	return Fprint2DUint8s(os.Stdout, vs, width, height, base)
}

func Fprint2DUint8s(w io.Writer, vs []uint8, width, height int, base int) (n int, err error) {
	if base == 0 {
		base = 16
	}
	for y := 0; y < height; y++ {
		ln := ""
		for x := 0; x < width; x++ {
			if x == 0 {
				ln += " "
			}
			u := strconv.FormatUint(uint64(vs[y*width+x]), base)
			ln += fmt.Sprintf("%3s", u)
		}
		wn, err := fmt.Fprintln(w, ln)
		n += wn
		if err != nil {
			return n, err
		}
	}
	return n, nil
}

func Sprint2DFloats[T ~float64 | ~float32](vs []T, width, height int, digits, prec int) string {
	var buf bytes.Buffer
	Fprint2DFloats(&buf, vs, width, height, digits, prec)
	return buf.String()
}

func Print2DFloats[T ~float64 | ~float32](vs []T, width, height int, digits, prec int) (n int, err error) {
	return Fprint2DFloats(os.Stdout, vs, width, height, digits, prec)
}

func Fprint2DFloats[T ~float64 | ~float32](w io.Writer, vs []T, width, height int, digits, prec int) (n int, err error) {
	if prec == 0 {
		prec = 3
	}
	const first = "%*.*f"
	const rest = " %*.*f"

	for y := 0; y < height; y++ {
		for x := 0; x < width; x++ {
			fs := rest
			if x == 0 {
				fs = first
			}
			wn, err := fmt.Fprintf(w, fs, digits, prec, vs[y*width+x])
			n += wn
			if err != nil {
				return n, err
			}
		}
		wn, err := fmt.Fprintln(w)
		n += wn
		if err != nil {
			return n, err
		}
	}
	return n, nil
}

A  => go.mod +3 -0
@@ 1,3 @@
module go.shabbyrobe.org/grugdct

go 1.19

A  => image.go +30 -0
@@ 1,30 @@
package grugdct

import (
	"image"
)

type Image struct {
	W, H int
	Pix  []uint8
}

func NewImage(w, h int) Image {
	return Image{
		W:   w,
		H:   h,
		Pix: make([]uint8, w*h),
	}
}

func (i Image) Size() image.Point {
	return image.Point{i.W, i.H}
}

// Deprecated
func (i Image) Put(x, y int, v uint8) {
	if x >= i.W || y >= i.H {
		return
	}
	i.Pix[y*i.W+x] = v
}

A  => init_test.go +89 -0
@@ 1,89 @@
package grugdct

import (
	"fmt"
	"math/rand"
)

type TestImage struct {
	Image
	Name string
}

func (ti TestImage) CaseName(prefix string) string {
	return fmt.Sprintf("%s/%s", prefix, ti.Name)
}

var TestImages = BuildTestImages(nil)

func BuildTestImages(rng *rand.Rand) []TestImage {
	if rng == nil {
		rng = rand.New(rand.NewSource(0))
	}
	return []TestImage{
		BuildGridImage(rng, 1, 1, 1, 1),

		BuildGridImage(rng, 2, 1, 1, 1),
		BuildGridImage(rng, 1, 2, 1, 1),
		BuildGridImage(rng, 2, 1, 2, 2),

		// 7x9 == 8x8 boundary hover
		BuildGridImage(rng, 7, 9, 1, 1),
		BuildGridImage(rng, 7, 9, 2, 2),
		BuildGridImage(rng, 7, 9, 7, 7),
		BuildGridImage(rng, 7, 9, 8, 8),
		BuildGridImage(rng, 7, 9, 9, 9),

		// 8x8
		BuildGridImage(rng, 8, 8, 1, 1),
		BuildGridImage(rng, 8, 8, 2, 2),
		BuildGridImage(rng, 8, 8, 7, 7),
		BuildGridImage(rng, 8, 8, 8, 8),

		// 9x7 == 8x8 boundary hover
		BuildGridImage(rng, 9, 7, 1, 1),
		BuildGridImage(rng, 9, 7, 2, 2),
		BuildGridImage(rng, 9, 7, 7, 7),
		BuildGridImage(rng, 9, 7, 8, 8),
		BuildGridImage(rng, 9, 7, 9, 9),

		// 16x16 == 2x 8x8
		BuildGridImage(rng, 16, 16, 1, 1),
		BuildGridImage(rng, 16, 16, 2, 2),
		BuildGridImage(rng, 16, 16, 7, 7),
		BuildGridImage(rng, 16, 16, 8, 8),
		BuildGridImage(rng, 16, 16, 9, 9),
		BuildGridImage(rng, 16, 16, 16, 16),

		// 17x17 == 2x 8x8 ++
		BuildGridImage(rng, 17, 17, 1, 1),
		BuildGridImage(rng, 17, 17, 2, 2),
		BuildGridImage(rng, 17, 17, 7, 7),
		BuildGridImage(rng, 17, 17, 8, 8),
		BuildGridImage(rng, 17, 17, 9, 9),
		BuildGridImage(rng, 17, 17, 16, 16),
		BuildGridImage(rng, 17, 17, 17, 17),

		BuildGridImage(rng, 1920, 1080, 1, 1),
	}
}

func BuildGridImage(rng *rand.Rand, w, h int, gw, gh int) TestImage {
	img := NewImage(w, h)
	RenderGridImage(rng, img, w, h, gw, gh)
	return TestImage{Image: img, Name: fmt.Sprintf("gridimage:%dx%d,g=%dx%d", w, h, gw, gh)}
}

func RenderGridImage(rng *rand.Rand, img Image, w, h int, gw, gh int) {
	for y := 0; y < h; y += gh {
		for x := 0; x < w; x += gw {
			c := uint8(rng.Intn(256))
			for gy := 0; gy < gh && y+gy < h; gy++ {
				for gx := 0; gx < gw && x+gx < w; gx++ {
					off := (y+gy)*w + (x + gx)
					img.Pix[off] = c
				}
			}
		}
	}
}

A  => luma.go +73 -0
@@ 1,73 @@
package grugdct

import (
	"image"
	"image/color"
)

func ConvertImageToLuma(img image.Image) Image {
	bounds := img.Bounds()
	w, h := bounds.Dx(), bounds.Dy()

	switch img := img.(type) {
	case *image.Gray:
		px := make([]uint8, len(img.Pix))
		copy(px, img.Pix)
		return Image{W: w, H: h, Pix: px}

	case *image.RGBA:
		px := make([]uint8, len(img.Pix))
		for i := 0; i < len(img.Pix); i += 4 {
			r, g, b := img.Pix[i], img.Pix[i+1], img.Pix[i+2]
			px[i] = uint8((19595*(uint32(r)<<8|uint32(r)) +
				38470*(uint32(g)<<8|uint32(g)) +
				7471*(uint32(b)<<8|uint32(b)) + 1<<15) >> 24)
		}
		return Image{W: w, H: h, Pix: px}

	case interface{ RGBAAt(x, y int) color.RGBA }:
		px := make([]uint8, w*h)
		i := 0
		for y := 0; y < w; y++ {
			for x := 0; x < w; x++ {
				rgba := img.RGBAAt(x, y)
				px[i] = uint8((19595*(uint32(rgba.R)<<8|uint32(rgba.R)) +
					38470*(uint32(rgba.G)<<8|uint32(rgba.G)) +
					7471*(uint32(rgba.B)<<8|uint32(rgba.B)) + 1<<15) >> 24)
				i++
			}
		}
		return Image{W: w, H: h, Pix: px}

	default:
		px := make([]uint8, w*h)
		i := 0
		for y := 0; y < w; y++ {
			for x := 0; x < w; x++ {
				col := img.At(x, y)
				r, g, b, _ := col.RGBA()
				px[i] = uint8((19595 * r) +
					(38470 * g) +
					(7471 * b) +
					(1<<15)>>24)
				i++
			}
		}
		return Image{W: w, H: h, Pix: px}
	}
}

// Casts an image.Image to a luma Image if possible, re-using the memory.
// If an image type is not supported, it is converted.
func CastImageToLuma(img image.Image) Image {
	bounds := img.Bounds()
	w, h := bounds.Dx(), bounds.Dy()

	switch img := img.(type) {
	case *image.Gray:
		return Image{W: w, H: h, Pix: img.Pix}

	default:
		return ConvertImageToLuma(img)
	}
}

A  => matrix.go +37 -0
@@ 1,37 @@
package grugdct

type MatrixF32 struct {
	V []float32
	W int
	H int
}

func NewMatrixF32(w, h int) MatrixF32 {
	return MatrixF32{V: make([]float32, w*h), W: w, H: h}
}

func (d MatrixF32) Dx() int { return d.W }
func (d MatrixF32) Dy() int { return d.H }

type Matrix8x8F32 [64]float32

func (d Matrix8x8F32) Dx() int { return 8 }
func (d Matrix8x8F32) Dy() int { return 8 }

type MatrixF64 struct {
	V []float64
	W int
	H int
}

func NewMatrixF64(w, h int) MatrixF64 {
	return MatrixF64{V: make([]float64, w*h), W: w, H: h}
}

func (d MatrixF64) Dx() int { return d.W }
func (d MatrixF64) Dy() int { return d.H }

type Matrix8x8F64 [64]float64

func (d Matrix8x8F64) Dx() int { return 8 }
func (d Matrix8x8F64) Dy() int { return 8 }

A  => quantize.go +124 -0
@@ 1,124 @@
package grugdct

import (
	"image"
	"math"
)

var DefaultLumaQuantizationMatrix8x8F32 = Matrix8x8F32{
	16, 11, 10, 16, 24, 40, 51, 61,
	12, 12, 14, 19, 26, 58, 60, 55,
	14, 13, 16, 24, 40, 57, 69, 56,
	14, 17, 22, 29, 51, 87, 80, 62,
	18, 22, 37, 56, 68, 109, 103, 77,
	24, 35, 55, 64, 81, 104, 113, 92,
	49, 64, 78, 87, 103, 121, 120, 101,
	72, 92, 95, 98, 112, 100, 103, 99,
}

var DefaultChromaQuantizationMatrix8x8F32 = Matrix8x8F32{
	17, 18, 24, 47, 99, 99, 99, 99,
	18, 21, 26, 66, 99, 99, 99, 99,
	24, 26, 56, 99, 99, 99, 99, 99,
	47, 66, 99, 99, 99, 99, 99, 99,
	99, 99, 99, 99, 99, 99, 99, 99,
	99, 99, 99, 99, 99, 99, 99, 99,
	99, 99, 99, 99, 99, 99, 99, 99,
	99, 99, 99, 99, 99, 99, 99, 99,
}

func Scale8x8F32(mat Matrix8x8F32, quality int) Matrix8x8F32 {
	// quality: [0, 100]
	// http://old.dfrws.org/2008/proceedings/p21-kornblum_pres.pdf
	// https://stackoverflow.com/a/29216609

	var s float32
	if quality < 50 {
		s = 5000 / float32(quality)
	} else {
		s = 200 - (2 * float32(quality))
	}
	for i := 0; i < 64; i++ {
		ts := math.Floor(float64((s*mat[i] + 50) / 100))
		if ts == 0 {
			ts = 1
		}
		mat[i] = float32(ts)
	}
	return mat
}

func GrugQuantize8x8F32InPlace(img Image, transform ImageTransform8x8F32, table Matrix8x8F32) {
	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	result := NewImage(img.W, img.H)

	for y := 0; y < img.H; y += th {
		for x := 0; x < img.W; x += tw {
			mat := transform.DCT8x8(img, x, y)
			// for i := 63; i >= 63-strip && i >= 0; i-- {
			//     mat[ZigZag8x8Order[i]] = 0
			// }
			transform.IDCT8x8(mat, x, y, result)
		}
	}
}

func GrugQuantizeYCbCr8x8F32InPlace(
	yCbCr *image.YCbCr,
	transform ImageTransform8x8F32,
	quality int,
	lumaTable *Matrix8x8F32,
	chromaTable *Matrix8x8F32,
) {
	if lumaTable == nil {
		lumaTable = &DefaultLumaQuantizationMatrix8x8F32
	}
	if chromaTable == nil {
		chromaTable = &DefaultChromaQuantizationMatrix8x8F32
	}
	if quality == 0 {
		quality = 80
	}

	qLumaTable := Scale8x8F32(*lumaTable, quality)
	qChromaTable := Scale8x8F32(*chromaTable, quality)

	tw, th := transform.DCTSize().X, transform.DCTSize().Y
	rect := yCbCr.Bounds()
	imgw, imgh := rect.Dx(), rect.Dy()

	chromaSz := SubsampleSize(rect.Size(), yCbCr.SubsampleRatio)
	imgCy := Image{Pix: yCbCr.Y, W: imgw, H: imgh}
	imgCb := Image{Pix: yCbCr.Cb, W: chromaSz.X, H: chromaSz.Y}
	imgCr := Image{Pix: yCbCr.Cr, W: chromaSz.X, H: chromaSz.Y}

	for y := 0; y < imgCy.H; y += th {
		for x := 0; x < imgCy.W; x += tw {
			mat := transform.DCT8x8(imgCy, x, y)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qLumaTable[i]))) * qLumaTable[i]
			}
			transform.IDCT8x8(mat, x, y, imgCy)
		}
	}

	for y := 0; y < imgCb.H; y += th {
		for x := 0; x < imgCb.W; x += tw {
			mat := transform.DCT8x8(imgCb, x, y)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qChromaTable[i]))) * qChromaTable[i]
			}
			transform.IDCT8x8(mat, x, y, imgCb)
		}
	}

	for y := 0; y < imgCr.H; y += th {
		for x := 0; x < imgCr.W; x += tw {
			mat := transform.DCT8x8(imgCr, x, y)
			for i := 0; i < 64; i++ {
				mat[i] = float32(math.Round(float64(mat[i]/qChromaTable[i]))) * qChromaTable[i]
			}
			transform.IDCT8x8(mat, x, y, imgCr)
		}
	}
}

A  => tool.go +81 -0
@@ 1,81 @@
//go:build tool

package main

import (
	"bytes"
	"fmt"
	"image"
	"image/jpeg"
	"image/png"
	"log"
	"os"
	"path/filepath"
	"time"

	"go.shabbyrobe.org/grugdct"
)

func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

func run() error {
	if len(os.Args) < 3 {
		return fmt.Errorf("Usage: tool.go <image> <output>")
	}

	input, output := os.Args[1], os.Args[2]

	in, err := os.ReadFile(input)
	if err != nil {
		return err
	}

	var img image.Image

	switch filepath.Ext(input) {
	case ".png":
		p, err := png.Decode(bytes.NewReader(in))
		if err != nil {
			return err
		}
		img = p

	case ".jpg", ".jpeg":
		j, err := jpeg.Decode(bytes.NewReader(in))
		if err != nil {
			return err
		}
		img = j

	default:
		return fmt.Errorf("unsupported file extension %q", input)
	}

	transform := grugdct.NewAAN8x8()

	s := time.Now()
	subsampleRatio := image.YCbCrSubsampleRatio410
	quality := 90
	yCbCr := grugdct.ConvertImageToYCbCr(img, subsampleRatio)
	grugdct.GrugQuantizeYCbCr8x8F32InPlace(yCbCr, transform, quality, nil, nil)
	took := time.Since(s)
	fmt.Println("grug quantize", yCbCr.Bounds().Size(), took)

	s = time.Now()
	var buf bytes.Buffer
	if err := png.Encode(&buf, yCbCr); err != nil {
		return err
	}
	took = time.Since(s)
	fmt.Println("grug png enc", took)

	if err := os.WriteFile(output, buf.Bytes(), 0600); err != nil {
		return err
	}

	return nil
}

A  => transform.go +16 -0
@@ 1,16 @@
package grugdct

import "image"

type ImageTransform8x8F32 interface {
	DCTSize() image.Point
	DCT8x8(img Image, xpos int, ypos int) (mat Matrix8x8F32)
	IDCT8x8(mat Matrix8x8F32, xpos int, ypos int, into Image)
}

type ImageTransformF64 interface {
	DCTSize() image.Point
	Matrix() MatrixF64
	DCTInto(img Image, xpos int, ypos int, mat MatrixF64)
	IDCTInto(mat MatrixF64, xpos int, ypos int, into Image)
}

A  => ycbcr.go +88 -0
@@ 1,88 @@
package grugdct

import (
	"image"
	"image/color"
)

func ConvertImageToYCbCr(img image.Image, subsampleRatio image.YCbCrSubsampleRatio) *image.YCbCr {
	bounds := img.Bounds()
	w, h := bounds.Dx(), bounds.Dy()
	out := image.NewYCbCr(image.Rect(0, 0, w, h), subsampleRatio)

	cyidx, cbidx, cridx := 0, 0, 0

	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			r, g, b, _ := img.At(x, y).RGBA()
			cy, cb, cr := color.RGBToYCbCr(uint8(r>>8), uint8(g>>8), uint8(b>>8))

			chroma := false

			switch subsampleRatio {
			case image.YCbCrSubsampleRatio422:
				// Half horizontal color resolution, full vertical:
				chroma = x&1 == 0

			case image.YCbCrSubsampleRatio420:
				// Half horizontal color resolution, half vertical:
				chroma = x&1 == 0 && y&1 == 0

			case image.YCbCrSubsampleRatio440:
				// Full horizontal color resolution, half vertical:
				chroma = y&1 == 0

			case image.YCbCrSubsampleRatio411:
				// One-fourth the horizontal color resolution, full vertical:
				chroma = x&3 == 0

			case image.YCbCrSubsampleRatio410:
				// One-fourth the horizontal color resolution and half of the vertical:
				chroma = x&3 == 0 && y&1 == 0

			default: // 4:4:4
				chroma = true
			}

			out.Y[cyidx] = cy
			cyidx++

			if chroma {
				out.Cb[cbidx] = cb
				cbidx++
				out.Cr[cridx] = cr
				cridx++
			}
		}
	}

	return out
}

func SubsampleSize(r image.Point, subsampleRatio image.YCbCrSubsampleRatio) image.Point {
	w, h := r.X, r.Y
	switch subsampleRatio {
	case image.YCbCrSubsampleRatio422:
		// Half horizontal color resolution, full vertical:
		return image.Point{(w + 1) / 2, h}

	case image.YCbCrSubsampleRatio420:
		// Half horizontal color resolution, half vertical:
		return image.Point{(w + 1) / 2, (h + 1) / 2}

	case image.YCbCrSubsampleRatio440:
		// Full horizontal color resolution, half vertical:
		return image.Point{w, (h + 1) / 2}

	case image.YCbCrSubsampleRatio411:
		// One-fourth the horizontal color resolution, full vertical:
		return image.Point{(w + 3) / 4, h}

	case image.YCbCrSubsampleRatio410:
		// One-fourth the horizontal color resolution and half of the vertical:
		return image.Point{(w + 3) / 4, (h + 1) / 2}

	default: // 4:4:4
		return image.Point{w, h}
	}
}

A  => zigzag.go +26 -0
@@ 1,26 @@
package grugdct

var ZigZag8x8 = [64]int{
	/**/ +0, +1, +5, +6, 14, 15, 27, 28,
	//      ↙   ↗   ↙   ↗   ↙   ↗   ↙
	/**/ +2, +4, +7, 13, 16, 26, 29, 42,
	//    ↓ ↗   ↙   ↗   ↙   ↗   ↙   ↗ ↓
	/**/ +3, +8, 12, 17, 25, 30, 41, 43,
	//      ↙   ↗   ↙   ↗   ↙   ↗   ↙
	/**/ +9, 11, 18, 24, 31, 40, 44, 53,
	//    ↓ ↗   ↙   ↗   ↙   ↗   ↙   ↗ ↓
	/**/ 10, 19, 23, 32, 39, 45, 52, 54,
	//      ↙   ↗   ↙   ↗   ↙   ↗   ↙
	/**/ 20, 22, 33, 38, 46, 51, 55, 60,
	//    ↓ ↗   ↙   ↗   ↙   ↗   ↙   ↗ ↓
	/**/ 21, 34, 37, 47, 50, 56, 59, 61,
	//      ↙   ↗   ↙   ↗   ↙   ↗   ↙
	/**/ 35, 36, 48, 49, 57, 58, 62, 63,
}

var ZigZag8x8Order [64]int = func() (out [64]int) {
	for i := 0; i < 64; i++ {
		out[ZigZag8x8[i]] = i
	}
	return out
}()