~rycwo/ispc-bench

f38a66186b80290bd03c73dfad10acec8fa6ccaf — Ryan Chan 2 years ago f10ba0b
Add source files for benchmark program
6 files changed, 167 insertions(+), 0 deletions(-)

A .gitignore
A Makefile
A README.md
A add.ispc
A main.cc
A shell.nix
A .gitignore => .gitignore +3 -0
@@ 0,0 1,3 @@
ispc_add.o
ispc_add.h
bench

A Makefile => Makefile +44 -0
@@ 0,0 1,44 @@
# Boilerplate variables from: http://clarkgrubb.com/makefile-style-guide
# Warn whenever make expands a variable that was never defined.
MAKEFLAGS += --warn-undefined-variables
# This is necessary for invocation of echo with "-e" support.
SHELL := /usr/bin/env bash
# Run every recipe with errexit (-e), nounset (-u) and pipefail enabled.
.SHELLFLAGS := -eu -o pipefail -c
# A bare `make` builds the `all` target.
.DEFAULT_GOAL := all
# Delete a target file when the recipe that was producing it fails.
.DELETE_ON_ERROR:
# Empty the suffix list so the built-in suffix rules are disabled.
.SUFFIXES:

# https://www.gnu.org/prep/standards/html_node/Command-Variables.html
RM := rm
# NOTE(review): RM_DIR and MKDIR are not referenced by any rule in this Makefile.
RM_DIR := $(RM) --recursive --force

MKDIR := mkdir

#------------------------------------------------------------------------------
# Build configuration.
#------------------------------------------------------------------------------
# Libraries passed to the link step: Google Benchmark, pthreads and TBB.
LIBS := -lbenchmark -lpthread -ltbb
# C++ compiler flags: strict warnings promoted to errors, AVX2 code generation
# for the host CPU, C++11, full optimisation.
CXXFLAGS := \
	-Wall -Wextra -Wpedantic -Werror \
	-march=native -mavx2 \
	--std=c++11 -O3

# ispc compiler and its flags: x86-64 / Haswell, 8-wide AVX2 target, -O3.
ISPC := ispc
ISPCFLAGS := --arch=x86-64 --cpu=haswell --target=avx2-i32x8 -O3

# Path of the benchmark executable produced by the default goal.
target_binary := ./bench

# Default goal: build the benchmark executable.
.PHONY : all
all : $(target_binary) ;

# Remove every generated artifact. `--force` makes rm succeed silently when a
# file is already absent, so the target can be run repeatedly without noise,
# while genuine failures (e.g. permission errors) are still reported.
.PHONY : clean
clean :
	@$(RM) --force $(target_binary) ispc_add.o ispc_add.h

# Compile main.cc and link it against the ispc object in one step. The
# generated header only has to exist before compilation starts, so it is an
# order-only prerequisite and therefore not part of $^.
$(target_binary) : main.cc ispc_add.o | ispc_add.h
	@$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

# Object code for the ispc kernel.
ispc_add.o : add.ispc
	@$(ISPC) $(ISPCFLAGS) --outfile=$@ $<

# Header generated by ispc from the same kernel source.
ispc_add.h : add.ispc
	@$(ISPC) $(ISPCFLAGS) --header-outfile=$@ $<

A README.md => README.md +21 -0
@@ 0,0 1,21 @@
# `ispc` Benchmarks

[![built with nix](https://builtwithnix.org/badge.svg)](https://builtwithnix.org)

Here lie some naive benchmarks that measure the performance benefit of using `ispc` to add
large arrays of floating-point values.

## Dependencies

- GNU Make
- GCC (At least 4.9)
- [`ispc`](http://ispc.github.io/)
- [Intel Threading Building Blocks](https://www.threadingbuildingblocks.org)
- [Google Benchmark](https://github.com/google/benchmark)

You will also need a CPU that supports the AVX2 instruction set (the build flags target it).
You can replace the intrinsics to target SSE or any other architecture as required. Here are a
couple of helpful links for finding the appropriate functions and headers:

- https://software.intel.com/sites/landingpage/IntrinsicsGuide/
- http://www.g-truc.net/post-0359.html

A add.ispc => add.ispc +10 -0
@@ 0,0 1,10 @@
// Element-wise sum of two float arrays: result[i] = lhs[i] + rhs[i] for every
// i in [0, size). Exported so that it can be called from the C++ benchmark.
export void add(
    uniform float const lhs[],
    uniform float const rhs[],
    uniform float result[],
    uniform int const size) {

    // `foreach` lets ispc map the iterations onto its program instances.
    foreach (i = 0 ... size) {
        result[i] = lhs[i] + rhs[i];
    }
}

A main.cc => main.cc +79 -0
@@ 0,0 1,79 @@
#include <cstdlib>

#include <immintrin.h>

#include <benchmark/benchmark.h>

#include <tbb/tbb.h>

#include "ispc_add.h"

namespace {

// Number of floats in each benchmark array.
int const DATA_COUNT{2 << 16}; // 131072.
// Size in bytes of one benchmark array.
std::size_t const DATA_SIZE{sizeof(float) * DATA_COUNT};
// Byte alignment of every array; the aligned 256-bit loads and stores
// (_mm256_load_ps / _mm256_store_ps) require 32-byte aligned addresses.
std::size_t const DATA_ALIGNMENT{32};

// The intrinsics benchmark steps through the arrays eight floats at a time and
// has no scalar tail loop.
static_assert(DATA_COUNT % 8 == 0, "DATA_COUNT must be a multiple of 8");

// Benchmark fixture that owns three aligned float arrays of DATA_COUNT
// elements: two inputs (`lhs`, `rhs`) and one output (`result`).
struct FloatArraysFixture : public benchmark::Fixture {
    // Allocates the arrays and fills the inputs so that lhs[i] + rhs[i] is
    // DATA_COUNT - 1 for every i. If an allocation fails the benchmark is
    // marked as skipped instead of writing through a null pointer.
    void SetUp(benchmark::State& state) override {
        lhs = static_cast<float*>(_mm_malloc(DATA_SIZE, DATA_ALIGNMENT));
        rhs = static_cast<float*>(_mm_malloc(DATA_SIZE, DATA_ALIGNMENT));
        result = static_cast<float*>(_mm_malloc(DATA_SIZE, DATA_ALIGNMENT));

        if (lhs == nullptr || rhs == nullptr || result == nullptr) {
            // Buffers that were obtained are released by TearDown().
            state.SkipWithError("failed to allocate benchmark buffers");
            return;
        }

        for (int index = 0; index < DATA_COUNT; ++index) {
            lhs[index] = static_cast<float>(index);
            rhs[index] = static_cast<float>(DATA_COUNT - index - 1);
        }
    }

    // Releases the arrays and resets the pointers so a stale address can never
    // be freed twice.
    void TearDown(benchmark::State&) override {
        _mm_free(lhs);
        _mm_free(rhs);
        _mm_free(result);
        lhs = nullptr;
        rhs = nullptr;
        result = nullptr;
    }

    float* lhs{nullptr};    // First input array.
    float* rhs{nullptr};    // Second input array.
    float* result{nullptr}; // Output array written by each benchmark.
};

}

// Baseline: element-wise addition written as a plain indexed loop in C++.
BENCHMARK_F(FloatArraysFixture, AdditionScalar)(benchmark::State& state) {
    for (auto _ : state) {
        for (int i = 0; i != DATA_COUNT; ++i) {
            result[i] = lhs[i] + rhs[i];
        }
    }
}

// Hand-written AVX version: adds eight floats per step using aligned 256-bit
// loads and stores.
BENCHMARK_F(FloatArraysFixture, AdditionVector)(benchmark::State& state) {
    for (auto _ : state) {
        for (int offset = 0; offset < DATA_COUNT; offset += 8) {
            __m256 const left = _mm256_load_ps(&lhs[offset]);
            __m256 const right = _mm256_load_ps(&rhs[offset]);
            _mm256_store_ps(&result[offset], _mm256_add_ps(left, right));
        }
    }
}

// Calls the ispc-generated `add` function once per iteration over the whole
// arrays.
BENCHMARK_F(FloatArraysFixture, AdditionIspc)(benchmark::State& state) {
    for (auto _ : state) {
        float const* const left = lhs;
        float const* const right = rhs;
        float* const sum = result;
        ispc::add(left, right, sum, DATA_COUNT);
    }
}

// Splits [0, DATA_COUNT) with tbb::parallel_for and runs the ispc `add`
// function on each sub-range it hands out.
BENCHMARK_F(FloatArraysFixture, AdditionTbb)(benchmark::State& state) {
    using Range = tbb::blocked_range<int>;

    for (auto _ : state) {
        auto const add_chunk = [&](Range const& chunk) {
            int const first = chunk.begin();
            // Same value as chunk.size(), kept as an int.
            int const count = chunk.end() - first;
            ispc::add(lhs + first, rhs + first, result + first, count);
        };
        tbb::parallel_for(Range(0, DATA_COUNT), add_chunk);
    }
}

// Google Benchmark macro that supplies the program's main().
// NOTE(review): newer Google Benchmark releases appear to expect a trailing
// semicolon after this macro — confirm against the library version in use.
BENCHMARK_MAIN()

A shell.nix => shell.nix +10 -0
@@ 0,0 1,10 @@
# Development shell with the compiler, ispc and the libraries the benchmark
# links against.
{ pkgs ? import <nixpkgs> {} }:

pkgs.mkShell {
  buildInputs = [
    pkgs.gbenchmark
    pkgs.gcc49
    pkgs.ispc
    pkgs.tbb
  ];
}