~jmbr/libpme6

3c73b6328ec0d5065ab8d2cfd089de87138f6a5b — Juan M. Bello-Rivas 8 years ago
Initial revision.
A  => .gitignore +1 -0
@@ 1,1 @@
build/*

A  => CMakeLists.txt +29 -0
@@ 1,29 @@
project(particle-mesh-ewald CXX)

cmake_minimum_required(VERSION 2.8)

find_package(Armadillo REQUIRED)

find_package(PkgConfig REQUIRED)
pkg_check_modules(FFTW REQUIRED fftw3)

find_package(OpenMP)

include_directories(${ARMADILLO_INCLUDE_DIRS} ${FFTW_INCLUDE_DIRS})
add_definitions(${FFTW_CFLAGS})
# set(CXX_WARNING_FLAGS "-W -Wall -Wextra -Wconversion -Wshadow -Wpointer-arith -Wcast-qual -Wwrite-strings")
set(CXX_WARNING_FLAGS "-W -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall ${OpenMP_CXX_FLAGS} ${CXX_WARNING_FLAGS} -std=c++11")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -finline-functions -funroll-loops")

add_library(pme pme.cpp pme.h cube.cpp cube.h linear-algebra.cpp linear-algebra.h spline.cpp spline.h)

add_executable(make-box make-box.cpp)
target_link_libraries(make-box ${ARMADILLO_LIBRARIES})

add_executable(dispersive dispersive.cpp)
target_link_libraries(dispersive pme ${ARMADILLO_LIBRARIES} ${FFTW_LDFLAGS} ${FFTW_LDFLAGS_OTHER} -lfftw3_omp)

add_executable(run-pme run-pme.cpp)
target_link_libraries(run-pme pme ${ARMADILLO_LIBRARIES} ${FFTW_LDFLAGS} ${FFTW_LDFLAGS_OTHER} -lfftw3_omp)

A  => abort_unless.h +21 -0
@@ 1,21 @@
#ifndef ABORT_UNLESS_H
#define ABORT_UNLESS_H

#include <cstdio>
#include <cstdlib>

/*
 * Unlike the assert() macro, abort_unless() is not disabled by
 * defining the preprocessor macro NDEBUG.
 */

#define abort_unless(expr) do {                                         \
    if (!(expr)) {                                                      \
      std::fprintf(stderr, "%s:%u (%s): Assertion `%s' failed.\n",      \
                   __FILE__, __LINE__, __func__, __STRING(expr));       \
      std::fflush(stderr);                                              \
      abort();                                                          \
    }                                                                   \
  } while (0)

#endif

A  => cube.cpp +64 -0
@@ 1,64 @@
#include <cassert>

#include <algorithm>

#include "cube.h"

namespace ewald {

using std::complex;

template <typename T>
cube<T>::cube(int d0, int d1, int d2)
    : dim0(d0), dim1(d1), dim2(d2) {
  memory = new T[dim0 * dim1 * dim2];
}

template <typename T>
cube<T>::~cube() {
  delete [] memory;
}

template <typename T>
void cube<T>::zeros() {
  std::fill_n(memory, dim0 * dim1 * dim2, T(0));
}

template <typename T>
T cube<T>::operator()(int k0, int k1, int k2) const {
  assert(0 <= k0 && k0 < dim0);
  assert(0 <= k1 && k1 < dim1);
  assert(0 <= k2 && k2 < dim2);

  return memory[k0 * dim2 * dim1 + k1 * dim2 + k2];
}

template <typename T>
T& cube<T>::operator()(int k0, int k1, int k2) {
  assert(0 <= k0 && k0 < dim0);
  assert(0 <= k1 && k1 < dim1);
  assert(0 <= k2 && k2 < dim2);

  return memory[k0 * dim2 * dim1 + k1 * dim2 + k2];
}

template <typename T>
cube<T>& cube<T>::operator*=(cube const& rhs) {
  assert(rhs.dim0 == dim0);
  assert(rhs.dim1 == dim1);
  assert(rhs.dim2 == dim2);

  for (int k0 = 0; k0 < dim0; ++k0)
    for (int k1 = 0; k1 < dim1; ++k1)
      for (int k2 = 0; k2 < dim2; ++k2)
        operator()(k0, k1, k2) *= rhs(k0, k1, k2);

  return *this;
}

template <typename T>
T* cube<T>::memptr() {
  return memory;
}

}

A  => cube.h +33 -0
@@ 1,33 @@
#ifndef CUBE_H
#define CUBE_H

#include <complex>

namespace ewald {

template <typename T>
class cube {
 public:
  cube(int d0, int d1, int d2);
  ~cube();

  void zeros();

  T operator()(int k0, int k1, int k2) const;
  T& operator()(int k0, int k1, int k2);

  cube<T>& operator*=(cube const& rhs);

  T* memptr();

 private:
  const int dim0, dim1, dim2;
  T* memory;
};

typedef cube<double> real_cube;
typedef cube<std::complex<double> > complex_cube;

}

#endif

A  => dispersive.cpp +332 -0
@@ 1,332 @@
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cfenv>

#include <complex>
#include <iomanip>
#include <iostream>
#include <vector>

#include <armadillo>

#ifdef HAVE_OMP
#include <omp.h>
#endif

#include "pme.h"
#include "abort_unless.h"
#include "energies.h"

using namespace std;
using namespace arma;

const complex<double> I(0.0, 1.0);

ewald::exclusion_vector exclusions;

static bool excluded(size_t i, size_t j);

int main(int argc, char* argv[]) {
  if (argc < 8) {
    cout << "Usage: " << argv[0]
         << " dimension cell-file positions-file charges-file cut-off tolerance num-shells\n";
    return EXIT_FAILURE;
  }

  const long dim0 = atoi(argv[1]);
  const long dim1 = dim0;
  const long dim2 = dim0;

  mat::fixed<3, 3> L;
  L.load(argv[2]);
  const mat::fixed<3, 3> LinvT = trans(inv(L));
  L.print();

  mat r;
  r.load(argv[3]);
  abort_unless(r.n_rows == 3);
  const mat x = inv(L) * r;

  vec qq;
  qq.load(argv[4]);
  std::vector<double> q(r.n_cols);
  {
    for (size_t k = 0; k < q.size(); ++k)
      q[k] = qq(k);
  }

  const size_t num_particles = q.size();
  clog << "Read " << num_particles << " particles into memory.\n";

  std::vector<double> coor_x(num_particles);
  std::vector<double> coor_y(num_particles);
  std::vector<double> coor_z(num_particles);
  {
    for (size_t k = 0; k < num_particles; ++k) {
      coor_x[k] = r(0, k);
      coor_y[k] = r(1, k);
      coor_z[k] = r(2, k);
    }
  }

  const double cut_off = atof(argv[5]);
  const double cut_off2 = cut_off * cut_off;
  const double tolerance = atof(argv[6]);
  clog << "Real space cut-off: " << cut_off << "\n"
       << "Real space tolerance: " << tolerance << "\n\n";


  exclusions = ewald::exclusion_vector(num_particles);
  {                                     // Populate exclusion vector.
/*
    for (size_t i = 0; i < num_particles; ++i)
      for (size_t j = 0; j < num_particles; ++j)
        // if (i != j && ((i % 2) != (j % 2))) {
        if (abs(int(i) - int(j)) <= 2) {
          exclusions[i].push_back(j);
        }
*/
    /*
    for (size_t i = 0; i < num_particles; ++i) {
      std::cout << i << ": ";
      ewald::index_vector const& is = exclusions[i];
      for (size_t j = 0; j < is.size(); ++j)
        std::cout << is[j] << " ";
      std::cout << std::endl;
    }
    */
  }

  const double max_shell = int(atof(argv[7]));

  mat forces = zeros<mat>(3, num_particles);
  mat force_cutoff = zeros<mat>(3, num_particles);
  mat force_direct = zeros<mat>(3, num_particles);
  mat force_recip = zeros<mat>(3, num_particles);
  mat force_excluded = zeros<mat>(3, num_particles);
  mat force_excluded_pme = zeros<mat>(3, num_particles);

  std::vector<double> force_x(num_particles, 0.0);
  std::vector<double> force_y(num_particles, 0.0);
  std::vector<double> force_z(num_particles, 0.0);
  mat force_recip_pme = zeros<mat>(3, num_particles);
  double LL[3 * 3];
  for (size_t i = 0; i < 3; ++i)
    for (size_t j = 0; j < 3; ++j)
      LL[3*i+j] = L(i, j);

  energies energy_pme, energy_ewald;

  wall_clock timer;
  timer.tic();

#ifdef HAVE_OMP
  ewald::pme p(dim0, dim1, dim2, LL, num_particles, &q[0], cut_off, tolerance, omp_get_max_threads());
#else
  ewald::pme p(dim0, dim1, dim2, LL, num_particles, &q[0], cut_off, tolerance, 1);
#endif
  double seconds = timer.toc();

  clog << "PME initialization took " << seconds << " seconds.\n\n";

  const double t = p.get_threshold();
  if (std::isnan(t)) {
    cerr << "Unable to run PME for the prescribed values of cut off and tolerance.\n";
    return EXIT_FAILURE;
  }

  timer.tic();

  energy_pme.recip = p.energy(&coor_x[0], &coor_y[0], &coor_z[0],
                              &force_x[0], &force_y[0], &force_z[0]);

  for (size_t k = 0; k < num_particles; ++k) {
    force_recip_pme(0, k) = force_x[k];
    force_recip_pme(1, k) = force_y[k];
    force_recip_pme(2, k) = force_z[k];
  }

  seconds = timer.toc();
  clog << "PME took " << seconds << " seconds.\n\n";

  std::vector<double> force_excl_x(num_particles, 0.0);
  std::vector<double> force_excl_y(num_particles, 0.0);
  std::vector<double> force_excl_z(num_particles, 0.0);

  energy_pme.masked = p.excluded(exclusions,
                                 &coor_x[0],
                                 &coor_y[0],
                                 &coor_z[0],
                                 &force_excl_x[0],
                                 &force_excl_y[0],
                                 &force_excl_z[0]);

  for (size_t k = 0; k < num_particles; ++k) {
    force_excluded_pme(0, k) = force_excl_x[k];
    force_excluded_pme(1, k) = force_excl_y[k];
    force_excluded_pme(2, k) = force_excl_z[k];
  }

  energy_ewald.extra = energy_pme.extra = p.energy_extra();
  energy_ewald.self = energy_pme.self = p.energy_self();

  double Eexhaustive = 0.0;
  double Ecutoff = 0.0;

  mat force_ewald, force_pme;

  double elapsed = 0.0;

  for (int n = 0; n <= max_shell; n++) {
    wall_clock timer;
    timer.tic();
    for (int nx = -n; nx <= n; nx++) {
      for (int ny = -n; ny <= n; ny++) {
        for (int nz = -n; nz <= n; nz++) {
          if (abs(nx) != n && abs(ny) != n && abs(nz) != n)
            continue;

          const bool origin = (n == 0);

          const vec::fixed<3> m = { double(nx), double(ny), double(nz) };
          const vec::fixed<3> Lm = L * m;
          const vec::fixed<3> LinvTm = LinvT * m;

          // Compute direct space contribution.
          for (size_t i = 0; i < num_particles; i++) {
            for (size_t j = 0; j < num_particles; j++) {
              if (origin && i == j)
                continue;

              const vec::fixed<3> v = r.col(i) - r.col(j) + Lm;
              const double r2 = dot(v, v);
              const double r6 = r2 * r2 * r2;
              const double r8 = r6 * r2;

              const double qij = q[i] * q[j];

              const bool is_excluded = excluded(i, j);

              const double ener = 0.5 * qij / r6;
              const vec::fixed<3> f = 6.0 * qij / r8 * v;

              if (origin) {
                if (!is_excluded) {     // Unmasked pair.
                  Eexhaustive  += ener;
                  forces.col(i) += f;

                  if (r2 < cut_off2) {
                    Ecutoff += ener;
                    force_cutoff.col(i) += f;

                    vec::fixed<3> dv;
                    const double pot = 0.5 * qij
                        * p.direct_convergence_term(v.memptr(), dv.memptr());
                    energy_pme.direct += pot;
                    energy_ewald.direct += pot;
                    force_direct.col(i) += -qij * dv;
                  }
                } else {                // Masked pair.
                  // Note that we don't do cut offs here because we
                  // want to cancel the Fourier space contribution,
                  // which does not involve cut offs.
                  vec::fixed<3> dv;
                  energy_ewald.masked += (ener - 0.5 * qij
                                          * p.direct_convergence_term(v.memptr(),
                                                                      dv.memptr()));
                  force_excluded.col(i) += f + qij * dv;
                }
              } else {                  // Outer shells.
                Eexhaustive  += ener;
                forces.col(i) += f;

                if (r2 < cut_off2) {
                  Ecutoff += ener;
                  force_cutoff.col(i) += f;

                  vec::fixed<3> dv;
                  const double pot = 0.5 * qij
                      * p.direct_convergence_term(v.memptr(), dv.memptr());
                  energy_pme.direct += pot;
                  energy_ewald.direct += pot;
                  force_direct.col(i) += -qij * dv;
                }
              }
            }
          }

          // Compute contribution from Fourier space.
          if (!origin) {
            complex<double> rho(0.0, 0.0);

            for (size_t i = 0; i < num_particles; i++)
              rho += q[i] * exp(2.0 * M_PI * I * dot(m, x.col(i)));

            const double h2 = dot(LinvTm, LinvTm);
            energy_ewald.recip += p.recip_convergence_term(h2, rho);

            // force_recip.col(0) = -4.0 * M_PI * q[0] * q[1] * p.psi(h2) * sin(2.0 * M_PI * dot(LinvTm, r.col(0) - r.col(1))) * (-m);
            // force_recip.col(1) = -4.0 * M_PI * q[0] * q[1] * p.psi(h2) * sin(2.0 * M_PI * dot(LinvTm, r.col(0) - r.col(1))) * m;
          }
        }
      }
    }

    elapsed += timer.toc();

    force_ewald = force_direct + force_recip;
    force_pme   = force_direct + force_recip_pme - force_excluded_pme;

    {
      const double pme_total   = energy_pme.total();
      const double ewald_total = energy_ewald.total();

      const double rel_error_ewald_vs_pme    = fabsl(ewald_total - pme_total) / fabsl(ewald_total);
      const double rel_error_ewald_vs_cutoff = fabsl(ewald_total - Ecutoff) / fabsl(ewald_total);

      const double rel_error_exhaustive_vs_cutoff = fabsl(Eexhaustive - Ecutoff) / fabsl(Eexhaustive);
      const double rel_error_exhaustive_vs_ewald  = fabsl(Eexhaustive - ewald_total) / fabsl(Eexhaustive);
      const double rel_error_exhaustive_vs_pme    = fabsl(Eexhaustive - pme_total) / fabsl(Eexhaustive);

      cout << "==============================================\n"
           << fixed << setprecision(4)
           << "Shell " << n << " (" << elapsed << " seconds)" << "\n"
           << "==============================================\n"
           << scientific << setprecision(9)
           << "\n"
           << "Eexhaustive = " << Eexhaustive << "\n"
           << "Ecutoff     = " << Ecutoff << "\n"
           << "Ewald       = " << energy_ewald << "\n"
           << "PME         = " << energy_pme << "\n"
           << "\n"
           << "Abs. error in excluded forces = " << norm(force_excluded - force_excluded_pme, "inf") << "\n"
           << "\n"
           << "Abs. error in forces (Exhaustive vs. cutoff) = " << norm(forces - force_cutoff, "inf") << "\n"
           << "Abs. error in forces (Exhaustive vs. PME)    = " << norm(forces - force_pme, "inf") << "\n"
           << "\n"
           << "Rel. error in total energies (Ewald vs. cutoff) = " << rel_error_ewald_vs_cutoff << "\n"
           << "Rel. error in total energies (Ewald vs. PME)    = " << rel_error_ewald_vs_pme << "\n"
           << "\n"
           << "Rel. error in total energies (Exhaustive vs. cutoff) = " << rel_error_exhaustive_vs_cutoff << "\n"
           << "Rel. error in total energies (Exhaustive vs. Ewald)  = " << rel_error_exhaustive_vs_ewald << "\n"
           << "Rel. error in total energies (Exhaustive vs. PME)    = " << rel_error_exhaustive_vs_pme << "\n"
           << endl;
    }
  }

  return 0;
}

bool excluded(size_t i, size_t j) {
  // assert(i < num_particles);
  // assert(j < num_particles);

  ewald::index_vector const& is = exclusions[i];
  for (size_t k = 0; k < is.size(); ++k)
    if (j == is[k])
      return true;

  return false;
}

A  => energies.h +32 -0
@@ 1,32 @@
#ifndef ENERGIES_H
#define ENERGIES_H

#include <ostream>

struct energies {
  double direct;
  double recip;
  double extra;
  double self;
  double masked;

  energies()
      : direct(0.0), recip(0.0), extra(0.0),
        self(0.0), masked(0.0) {}

  double total() const {
    return direct + recip + extra - self - masked;
  }

  friend std::ostream& operator<<(std::ostream& stream, energies& ener) {
    return stream << ener.total()
                  << " = direct + recip + extra - self - masked = "
                  << ener.direct << " + " 
                  << ener.recip  << " + "
                  << ener.extra  << " - " 
                  << ener.self   << " - "
                  << ener.masked;
  }
};

#endif

A  => linear-algebra.cpp +68 -0
@@ 1,68 @@
#include <cstdlib>
#include <cstring>

#include <sys/types.h>

#include "linear-algebra.h"

namespace ewald {

void matrix_vector_product(double const* A, double* X) {
  double Y[] = { 0.0, 0.0, 0.0 };

  for (size_t i = 0; i < 3; ++i)
    for (size_t j = 0; j < 3; ++j)
      Y[i] += A[3*i+j] * X[j];

  memcpy(X, Y, 3 * sizeof(double));
}

void matrix_vector_product_transpose(double const* A, double* X) {
  double Y[] = { 0.0, 0.0, 0.0 };

  for (size_t i = 0; i < 3; ++i)
    for (size_t j = 0; j < 3; ++j)
      Y[i] += A[3*j+i] * X[j];

  memcpy(X, Y, 3 * sizeof(double));
}

void matrix_matrix_product(double const* A, double const* B, double* C) {
  memset(C, 0, 3 * 3 * sizeof(double));

  for (size_t i = 0; i < 3; ++i)
    for (size_t j = 0; j < 3; ++j)
      for (size_t k = 0; k < 3; ++k)
        C[3*i+j] += A[3*i+k] * B[3*k+j];
}

double determinant(double const* A) {
  return A[3*0+0] * (A[3*1+1] * A[3*2+2] - A[3*1+2] * A[3*2+1])
       - A[3*0+1] * (A[3*1+0] * A[3*2+2] - A[3*1+2] * A[3*2+0])
       + A[3*0+2] * (A[3*1+0] * A[3*2+1] - A[3*1+1] * A[3*2+0]);
}

void matrix_inverse(double const* A, double* Ainv) {
  const double invdet = 1.0 / determinant(A);
  memset(Ainv, 0, 3 * 3 * sizeof(double));
  Ainv[3*0+0] = ( A[3*1+1]*A[3*2+2] - A[3*1+2]*A[3*2+1]) * invdet;
  Ainv[3*0+1] = (-A[3*0+1]*A[3*2+2] + A[3*0+2]*A[3*2+1]) * invdet;
  Ainv[3*0+2] = ( A[3*0+1]*A[3*1+2] - A[3*0+2]*A[3*1+1]) * invdet;
  Ainv[3*1+0] = (-A[3*1+0]*A[3*2+2] + A[3*1+2]*A[3*2+0]) * invdet;
  Ainv[3*1+1] = ( A[3*0+0]*A[3*2+2] - A[3*0+2]*A[3*2+0]) * invdet;
  Ainv[3*1+2] = (-A[3*0+0]*A[3*1+2] + A[3*0+2]*A[3*1+0]) * invdet;
  Ainv[3*2+0] = ( A[3*1+0]*A[3*2+1] - A[3*1+1]*A[3*2+0]) * invdet;
  Ainv[3*2+1] = (-A[3*0+0]*A[3*2+1] + A[3*0+1]*A[3*2+0]) * invdet;
  Ainv[3*2+2] = ( A[3*0+0]*A[3*1+1] - A[3*0+1]*A[3*1+0]) * invdet;
}

double dot_product(double const* u, double const* v) {
  double result = 0.0;

  for (size_t k = 0; k < 3; ++k)
    result += u[k] * v[k];

  return result;
}

}

A  => linear-algebra.h +20 -0
@@ 1,20 @@
#ifndef LINEAR_ALGEBRA_H
#define LINEAR_ALGEBRA_H

namespace ewald {

void matrix_vector_product(double const* A, double* X);

void matrix_vector_product_transpose(double const* A, double* X);

void matrix_matrix_product(double const* A, double const* B, double* C);

double determinant(double const* A);

void matrix_inverse(double const* A, double* Ainv);

double dot_product(double const* u, double const* v);

}

#endif

A  => make-box.cpp +37 -0
@@ 1,37 @@
#include <cstdio>
#include <cstdlib>

#include <iostream>

#include <armadillo>

using namespace std;
using namespace arma;

int main(int argc, char* argv[]) {
  if (argc < 3) {
    cerr << "Usage: " << argv[0] << " num-particles rand-seed\n";
    return EXIT_FAILURE;
  }

  const size_t num_particles = atoi(argv[1]);
  
  const ulong rand_seed = strtoul(argv[2], nullptr, 10);
  clog << "Using random seed: " << rand_seed << "\n";
  srand(rand_seed);

  const double sigma = pow(2.0, 1.0 / 6.0);
  mat::fixed<3, 3> L = 40.0 * sigma * eye(3, 3);
  L.save("cell.mat", arma::csv_ascii);
  
  mat x = randu<mat>(3, num_particles);
  mat r = L * x;
  r.save("positions.mat", arma::csv_ascii);
  
  vec q = 2e1 * randu<vec>(num_particles);
  // vec q = ones<vec>(num_particles);
  for (size_t n = 0; n < num_particles; ++n) if (n % 3 != 0) q[n] = 0.0;
  q.save("charges.mat", arma::csv_ascii);

  return EXIT_SUCCESS;
}

A  => pme.cpp +436 -0
@@ 1,436 @@
#include <cassert>
#include <cstdlib>
#include <cstring>

#include <limits>
#include <numeric>
#include <functional>
#include <iostream>

#include <fftw3.h>

#include "cube.cpp"
#include "pme.h"
#include "spline.h"

#include "linear-algebra.h"

namespace ewald {

const spline4 spline;
const int order = spline.order;

const double epsilon = std::numeric_limits<double>::epsilon();

pme::pme(long d0, long d1, long d2,
         double const unit_cell[],
         size_t N_,
         double const* q_,
         double cut_off_,
         double tolerance,
         int num_threads)
    : N(N_),
      q(q_),
      cut_off(cut_off_),
      threshold(compute_threshold(cut_off, tolerance)),
      threshold2(threshold * threshold),
      dim0(d0), dim1(d1), dim2(d2),
      Q(d0, d1, d2),
      Qhat(d0, d1, d2),
      BC(d0, d1, d2),
      s(3, N, order),
      ds(3, N, order)
{
  assert(dim0 > 0 && dim1 > 0 && dim2 > 0);

  std::memcpy(L, unit_cell, 3 * 3 * sizeof(double));
  volume = determinant(L);
  assert(volume > 1e2 * epsilon);
  matrix_inverse(L, Linv);

  std::memset(K, 0, 3 * 3 * sizeof(double));
  K[3*0+0] = dim0;
  K[3*1+1] = dim1;
  K[3*2+2] = dim2;

  matrix_matrix_product(K, Linv, H);

  initialize_table();

  fftw_init_threads();
  fftw_plan_with_nthreads(num_threads);

  {
    // TODO: Benchmark r2c.
    fftw_complex* in  = reinterpret_cast<fftw_complex*>(Q.memptr());
    fftw_complex* out = reinterpret_cast<fftw_complex*>(Qhat.memptr());
    forward_plan = fftw_plan_dft_3d(dim0, dim1, dim2,
                                    in, out,
                                    FFTW_FORWARD, FFTW_MEASURE);
  }

  {
    fftw_complex* in  = reinterpret_cast<fftw_complex*>(Qhat.memptr());
    fftw_complex* out = reinterpret_cast<fftw_complex*>(Qhat.memptr());
    backward_plan = fftw_plan_dft_3d(dim0, dim1, dim2,
                                     in, out,
                                     FFTW_BACKWARD, FFTW_MEASURE);
  }
}

pme::~pme() {
  fftw_destroy_plan(forward_plan);
  fftw_destroy_plan(backward_plan);

  fftw_cleanup_threads();
}

void pme::initialize_table() {
  long k0, k1, k2;
  for (k0 = 0; k0 < dim0; ++k0) {
    const double h0 = k0 < dim0 / 2 ? k0 : k0 - dim0;

    for (k1 = 0; k1 < dim1; ++k1) {
      const double h1 = k1 < dim1 / 2 ? k1 : k1 - dim1;

      for (k2 = 0; k2 < dim2; ++k2) {

        if (k0 == 0 && k1 == 0 && k2 == 0)
          continue;

        const double h2 = k2 < dim2 / 2 ? k2 : k2 - dim2;

        double LinvTm[] = { h0, h1, h2 };
        matrix_vector_product_transpose(Linv, LinvTm);

        const double hh = dot_product(LinvTm, LinvTm);

        const double nrm =  norm(spline.factor(k0, dim0)
                                 * spline.factor(k1, dim1)
                                 * spline.factor(k2, dim2));

        BC(k0, k1, k2) = psi(hh) * nrm;
      }
    }
  }
}

static void compute_spline(double u,
                           double* __restrict__ s,
                           double* __restrict__ ds,
                           double& floor_u) {
  floor_u = floor(u);
  double x = u - floor_u + double(order) - 1.0;

  // Compute splines and their derivatives using Horner's method.
  const double a[] = {
    32.0 / 3.0, -8.0, 2.0, -1.0 / 6.0,
    -22.0 / 3.0, 10.0, -4.0, 1.0 / 2.0,
    2.0 / 3.0, -2.0, 2.0, -1.0 / 2.0,
    0.0, 0.0, 0.0, 1.0 / 6.0
  };

  for (size_t k = 0; k < 4; ++k, --x) {
     s[k] = a[4*k] + (a[4*k+1] + (a[4*k+2] + a[4*k+3] * x) * x) * x;
    ds[k] = a[4*k+1] + (2.0 * a[4*k+2] + 3.0 * a[4*k+3] * x) * x;
  }
}

void pme::interpolate(double const* __restrict__ x,
                      double const* __restrict__ y,
                      double const* __restrict__ z) {
  for (size_t n = 0; n < N; ++n) {
    double u[] = { x[n], y[n], z[n] };
    matrix_vector_product(H, u);

    const double q_n = q[n];

    double floor_u[3];
    for (size_t d = 0; d < 3; ++d)
      compute_spline(u[d], &s(d, n, 0), &ds(d, n, 0), floor_u[d]);

    for (long c0 = 0; c0 < order; ++c0) {
      const long l0 = floor_u[0] + c0 - order + 1;
      const long k0 = l0 < 0 ? l0 + dim0 : l0;

      const double q_n_s0 = q_n * s(0, n, c0);

      for (long c1 = 0; c1 < order; ++c1) {
        const long l1 = floor_u[1] + c1 - order + 1;
        const long k1 = l1 < 0 ? l1 + dim1 : l1;

        const double q_n_s0_s1 = q_n_s0 * s(1, n, c1);

        for (long c2 = 0; c2 < order; ++c2) {
          const long l2 = floor_u[2] + c2 - order + 1;
          const long k2 = l2 < 0 ? l2 + dim2 : l2;

          Q(k0, k1, k2) += q_n_s0_s1 * s(2, n, c2);
          // printf("cube (%s): %d %d %d %d %g\n", __func__, n, k0, k1, k2, std::real(Qhat(k0, k1, k2)));
        }
      }
    }
  }
}

double pme::energy(double const* __restrict__ x,
                   double const* __restrict__ y,
                   double const* __restrict__ z,
                   double* __restrict__ force_x,
                   double* __restrict__ force_y,
                   double* __restrict__ force_z) {
  Q.zeros();
  Qhat.zeros();
  s.zeros();
  ds.zeros();

  interpolate(x, y, z);
  /*
  {
    std::complex<double> const* Qptr = Q.memptr();
    std::complex<double> total { 0.0, 0.0 };
    for (long r = 0; r < dim0 * dim1 * dim2; ++r)
      total += Qptr[r];
    std::printf("%s: Interpolated a total \"charge\" of %g + %g i\n", __func__, real(total), imag(total));
  }
  */


  fftw_execute(forward_plan);

  Qhat *= BC;

  fftw_execute(backward_plan);

  double Erecip = 0.0;
  for (long k0 = 0; k0 < dim0; ++k0)
    for (long k1 = 0; k1 < dim1; ++k1)
      for (long k2 = 0; k2 < dim2; ++k2)
        Erecip += real(Q(k0, k1, k2)) * real(Qhat(k0, k1, k2));

  double Erecip1 = 0.0;
  for (size_t n = 0; n < N; ++n) {
    double u[] = { x[n], y[n], z[n] };
    matrix_vector_product(H, u);

    for (long c0 = 0; c0 < order; ++c0) {
      const long l0 = floor(u[0]) + c0 - order + 1;
      const long k0 = l0 < 0 ? l0 + dim0 : l0;

      for (long c1 = 0; c1 < order; ++c1) {
        const long l1 = floor(u[1]) + c1 - order + 1;
        const long k1 = l1 < 0 ? l1 + dim1 : l1;

        const double ds0_s1 = ds(0, n, c0) *  s(1, n, c1);
        const double s0_ds1 =  s(0, n, c0) * ds(1, n, c1);
        const double s0_s1  =  s(0, n, c0) *  s(1, n, c1);

        for (long c2 = 0; c2 < order; ++c2) {
          const long l2 = floor(u[2]) + c2 - order + 1;
          const long k2 = l2 < 0 ? l2 + dim2 : l2;

          const double Q_k = std::real(Qhat(k0, k1, k2));

          Erecip1 += Q_k * std::real(Q(k0, k1, k2));

          double w[] = {
            ds0_s1 *  s(2, n, c2),
            s0_ds1 *  s(2, n, c2),
             s0_s1 * ds(2, n, c2)
          };
          matrix_vector_product_transpose(H, w);

          // if (n == 1)
          //   printf("cube (%s): %3d\t%3d\t%3d\t%3d\t%3.6f\t%3.6f\t%3.6f\t%3.6f\t%3.9f\n",
          //          __func__, n, k0, k1, k2, 
          //          std::real(Q(k0, k1, k2)), std::imag(Q(k0, k1, k2)),
          //          std::real(Qhat(k0, k1, k2)), std::imag(Qhat(k0, k1, k2)),
          //          Q_k * std::real(Q(k0, k1, k2)));

          const double factor = -2.0 * Q_k * q[n];
          force_x[n] += factor * w[0];
          force_y[n] += factor * w[1];
          force_z[n] += factor * w[2];
        }
      }
    }
  }

  printf("%s: Erecip = %3.16f vs. %3.16f\n", __func__, Erecip, Erecip1);
  return double(Erecip1);
}

double pme::energy_extra() const {
  const double sum_q = std::accumulate(q, q + N, 0.0, std::plus<double>());

  return 1.0 / 6.0 * pow(M_PI * threshold, 3.0) / volume * sum_q * sum_q;
}

double pme::energy_self() const {
  const double q2 = std::inner_product(q, q + N, q, 0.0);

  return 1.0 / 12.0 * pow(M_PI, 3.0) * pow(threshold, 6.0) * q2;
}

double pme::psi(double h2) const {
  const double b2 = M_PI * h2 / threshold2;
  const double b = sqrt(b2);
  const double b3 = b2 * b;

  const double h = sqrt(h2);
  const double h3 = h2 * h;

  return pow(M_PI, 9.0 / 2.0) / (3.0 * volume) * h3
      * (sqrt(M_PI) * erfc(b) + (1.0 / (2.0 * b3) - 1.0 / b) * exp(-b2));
}

double pme::recip_convergence_term(double h2,
                                   std::complex<double> const& rho) const {
  const double norm_rho2 = norm(rho);
  return norm_rho2 * psi(h2);
}

static inline double direct_term(double r2, double threshold2, double& fac) {
  const double r6 = r2 * r2 * r2;
  const double r8 = r6 * r2;
  const double a2 = M_PI * threshold2 * r2;
  const double a4 = a2 * a2;
  const double a6 = a4 * a2;
  const double exp_m_a2 = exp(-a2);

  fac = -(a6 + 3.0 * a4 + 6.0 * a2 + 6.0) * exp_m_a2 / r8;

  return (1.0 + a2 + 0.5 * a4) * exp_m_a2 / r6;
}

double pme::direct_convergence_term(double const* v, double* dv) const {
  const double r2 = dot_product(v, v);
  double fac;

  const double energy = direct_term(r2, threshold2, fac);

  for (size_t k = 0; k < 3; ++k)
    dv[k] = fac * v[k];

  return energy;
}

static double diff_direct_term(double threshold, double cut_off) {
  // Computes the derivative of direct_term() with respect to the
  // variable threshold. Note that the input is threshold, not
  // threshold squared (threshold2).

  const double threshold2 = threshold * threshold;
  const double threshold4 = threshold2 * threshold2;
  const double threshold5 = threshold4 * threshold;
  const double cut_off2 = cut_off * cut_off;

  return -threshold5 * exp(-threshold2 * cut_off2);
}

/** Obtains a suitable value of the threshold parameter. The threshold
 * parameter (beta) is obtained by solving the non-linear equation
 * direct_term(beta) = tol for a given tolerance tol.
 *
 * \param cut_off Cut off distance.
 *
 * \param tol Maximum value of the real space contribution for the
 * distance given by cut_off.
 *
 * \return The solution beta of the non-linear equation.
 */
double pme::compute_threshold(double cut_off, double tol) const {
  // XXX Maybe throw exception rather than return an integer status flag.
  const double r2 = cut_off * cut_off;
  double aux;

  // Our initial guess comes from imposing that the value of exp(-a2)
  // in direct_term should be greater than or equal to machine
  // epsilon.
  double beta_init = sqrt(-log(epsilon) / M_PI) / cut_off;
  double beta = beta_init;
  double beta2 = beta * beta;
  double val = direct_term(r2, beta2, aux);

  std::clog << "Starting non-linear solver with beta = " << beta << std::endl;

  size_t iter;
  const size_t max_iters = size_t(1e6);
  for (iter = 0; iter < max_iters; ++iter) {
    beta -= (direct_term(r2, beta2, aux) - tol) / diff_direct_term(beta, cut_off);
    beta2 = beta * beta;
    if (isinf(beta) || isnan(beta)) {
      beta_init /= 2.0;
      beta = beta_init;

      if (beta_init == 0.0) {
        std::clog << "Non-linear solver failed to find a suitable value of beta."
                  << std::endl;
        // Red alert. XXX This should fail more gracefully.
        beta = NAN;
        break;
      }
    }

    val = direct_term(r2, beta2, aux);
    if (fabs(val - tol) < epsilon)
      break;
  }

  std::clog << "Solver stopped after " << iter << " iterations.\n"
            << "Using beta = " << beta << " with value = " << val << "\n";

  if (iter == max_iters)
    std::clog << "Warning: Non-linear solver did not converge."
              << std::endl;

  return beta;
}

double pme::excluded(exclusion_vector const& exclusions,
                     double const* __restrict__ x,
                     double const* __restrict__ y,
                     double const* __restrict__ z,
                     double* __restrict__ force_x,
                     double* __restrict__ force_y,
                     double* __restrict__ force_z) const {
  double excluded_energy = 0.0;

  for (size_t i = 0; i < N; ++i) {
    const double q_i = q[i];

    index_vector const& excl = exclusions[i];
    for (size_t r = 0; r < excl.size(); ++r) {
      const size_t j = excl[r];
      if (j <= i)
        continue;

      const double v[] = { x[i] - x[j], y[i] - y[j], z[i] - z[j] };
      const double r2 = dot_product(v, v);
      // Note that we must NOT enforce a cut off here.  We are
      // compensating a masked contribution to the reciprocal part,
      // and that contribution does not involve cut offs.
      const double r6 = r2 * r2 * r2;
      const double invr6 = 1.0 / (r2 * r2 * r2);
      const double invr8 = 1.0 / (r6 * r2);

      double term;
      double f[3];
      const double qi_qj = q_i * q[j];
      excluded_energy += qi_qj * (invr6 - direct_term(r2, threshold2, term));
      const double factor = qi_qj * (6.0 * invr8 + term);
      for (size_t k = 0; k < 3; ++k)
        f[k] = factor * v[k];
      force_x[i] += f[0];
      force_y[i] += f[1];
      force_z[i] += f[2];
      force_x[j] -= f[0];
      force_y[j] -= f[1];
      force_z[j] -= f[2];
    }
  }

  return excluded_energy;
}

}

A  => pme.h +87 -0
@@ 1,87 @@
#ifndef PME_H
#define PME_H

#include <vector>
#include <complex>

#include <fftw3.h>

#include "cube.h"

namespace ewald {

typedef std::vector<size_t> index_vector;
typedef std::vector<index_vector> exclusion_vector;

class pme {
 public:
  pme(long dimension0, long dimension1, long dimension2,
      double const* unit_cell,
      size_t num_particles,
      double const* q,
      double cut_off, double tolerance,
      int num_threads);

  ~pme();

  double get_threshold() const { return threshold; }

  double energy(double const* x, double const* y, double const* z,
                double* force_x, double* force_y, double* force_z);

  double energy_extra() const;

  double energy_self() const;

  double direct_convergence_term(double const* v,
                                 double* dv) const;

  // XXX Make the following two methods private at some point.
  double recip_convergence_term(double h2,
                                std::complex<double> const& rho) const;

  double psi(double h2) const;

  double excluded(exclusion_vector const& exclusions,
                  double const* x, double const* y, double const* z,
                  double* force_x, double* force_y, double* force_z) const;

 private:
  double compute_threshold(double cut_off, double tol) const;

  void initialize_table();

  void interpolate(double const* x, double const* y, double const* z);

 private:
  const size_t N;                       // Number of particles.

  double L[3 * 3];
  double Linv[3 * 3];
  double volume;

  double const* q;                      // B params. in OPLS LJ model.

  const double cut_off;

  const double threshold;               // Real-space threshold.
  const double threshold2;

  double K[3 * 3];
  double H[3 * 3];                      // Coordinate transformation.

  const long dim0, dim1, dim2;

  complex_cube Q;
  complex_cube Qhat;
  complex_cube BC;

  real_cube s;
  real_cube ds;

  fftw_plan forward_plan, backward_plan;
};

}

#endif

A  => run-pme.cpp +182 -0
@@ 1,182 @@
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cfenv>

#include <complex>
#include <iomanip>
#include <iostream>

#include <armadillo>

#include <omp.h>

#include "pme.h"
#include "abort_unless.h"

using namespace std;
using namespace arma;

const complex<double> I(0.0, 1.0);

const double sigma = pow(2.0, 1.0 / 6.0);

int main(int argc, char* argv[]) {
  if (argc < 7) {
    cout << "Usage: " << argv[0]
         << " dimension cell-file positions-file charges-file cut-off tolerance\n";
    return EXIT_FAILURE;
  }

  const long dim0 = atoi(argv[1]);
  const long dim1 = dim0;
  const long dim2 = dim0;

  mat::fixed<3, 3> L;
  L.load(argv[2]);
  const mat::fixed<3, 3> LinvT = trans(inv(L));
  // const double V = det(L);
  L.print("L = ");

  mat r;
  r.load(argv[3]);
  abort_unless(r.n_rows == 3);
  // r.print("r = ");
  const mat x = inv(L) * r;

  vec qq;
  qq.load(argv[4]);
  std::vector<double> q(r.n_cols);
  {
    for (size_t k = 0; k < q.size(); ++k)
      q[k] = qq(k);
  }

  const size_t num_particles = q.size();
  clog << "Read " << num_particles << " particles into memory.\n";

  std::vector<double> coor_x(num_particles);
  std::vector<double> coor_y(num_particles);
  std::vector<double> coor_z(num_particles);
  {
    for (size_t k = 0; k < num_particles; ++k) {
      coor_x[k] = r(0, k);
      coor_y[k] = r(1, k); 
      coor_z[k] = r(2, k); 
    }
  }

  const double cut_off = atof(argv[5]);
  const double tolerance = atof(argv[6]);
  clog << "Real space cut-off: " << cut_off << "\n"
       << "Real space tolerance: " << tolerance << "\n\n";

  mat forces = zeros<mat>(3, num_particles);
  mat forces_cutoff = zeros<mat>(3, num_particles);
  mat forces_direct = zeros<mat>(3, num_particles);
  mat forces_recip = zeros<mat>(3, num_particles);

  long double ErecipPME = 0.0L;
  std::vector<double> forces_x(num_particles);
  std::vector<double> forces_y(num_particles);
  std::vector<double> forces_z(num_particles);
  mat forcesPME = zeros<mat>(3, num_particles);
    wall_clock timer;
    timer.tic();

    double LL[3 * 3];
    for (size_t i = 0; i < 3; ++i)
      for (size_t j = 0; j < 3; ++j)
        LL[3*i+j] = L(i, j);

    ewald::pme p(dim0, dim1, dim2, LL, 
                 num_particles, &q[0], 
                 cut_off, tolerance, 
                 omp_get_max_threads());

    double seconds = timer.toc();
    clog << "PME initialization took " << seconds << " seconds.\n\n";

    timer.tic();

    ErecipPME = p.energy(&coor_x[0], &coor_y[0], &coor_z[0],
                         &forces_x[0], &forces_y[0], &forces_z[0]);

    for (size_t k = 0; k < num_particles; ++k) {
      forcesPME(0, k) = forces_x[k];
      forcesPME(1, k) = forces_y[k];
      forcesPME(2, k) = forces_z[k];
    }

    seconds = timer.toc();
    clog << "PME took " << seconds << " seconds.\n\n";

  long double Ebrute = 0.0L, Edirect = 0.0L;
  long double Ecutoff = 0.0L;
  const long double Eextra = p.energy_extra();
  const long double Eself  = p.energy_self();

  mat forces_ewald, forces_pme;
  
  double elapsed = 0.0;
  for (int n = 0; n < 1; n++) {
    wall_clock timer;
    timer.tic();
    for (int nx = -n; nx <= n; nx++) {
      for (int ny = -n; ny <= n; ny++) {
        for (int nz = -n; nz <= n; nz++) {
          if (abs(nx) != n && abs(ny) != n && abs(nz) != n)
            continue;

          const bool origin = (nx == 0 && ny == 0 && nz == 0);
          
          const vec::fixed<3> m = { double(nx), double(ny), double(nz) };
          const vec::fixed<3> Lm = L * m;
          const vec::fixed<3> LinvTm = LinvT * m;
          
          for (size_t i = 0; i < num_particles; i++) {
            for (size_t j = 0; j < num_particles; j++) {
              if (origin && i == j)
                continue;

              const vec::fixed<3> v = r.col(i) - r.col(j) + Lm;
              const double r2 = dot(v, v);
              const double r6 = r2 * r2 * r2;
              const double r8 = r6 * r2;

              assert(r2 >= 1e-16);

              Ebrute  += 0.5 * q[i] * q[j] / r6;
              forces.col(i) -= -6.0 * q[i] * q[j] / r8 * v;

              if (abs(nx) < 2 && abs(ny) < 2 && abs(nz) < 2) {
                forces_cutoff.col(i) -= -6.0 * q[i] * q[j] / r8 * v;
                Ecutoff = Ebrute;
              }

              vec::fixed<3> dv;
              Edirect += 0.5 * q[i] * q[j] * p.direct_convergence_term(v.memptr(), dv.memptr());
              forces_direct.col(i) -= q[i] * q[j] * dv;
            }
          }
        }
      }
    }

    elapsed += timer.toc();

    const long double EtotalPME = Edirect + ErecipPME + Eextra - Eself;

    forces_pme   = forces_direct + forcesPME;

    cout << "Real space computation over one shell took " << elapsed << " seconds." << "\n"
         << fixed << setprecision(16)
         << "Ecutoff      = " << Ecutoff << "\n"
         << "Etotal (PME) = " << EtotalPME  << "\n"
         << "Edirect + Erecip + Eextra - Eself = " 
         << Edirect << " + " << ErecipPME << " + " << Eextra << " - " << Eself << "\n";
  }
  
  return 0;
}

A  => spline.cpp +48 -0
@@ 1,48 @@
#include <cmath>

#include <complex>

#include "spline.h"

using std::floor;
using std::complex;

const complex<double> I(0.0, 1.0);

spline4::spline4() {}

double spline4::operator()(double x) const {
  if (0.0 < x && x < 1.0)
    return 1.0 / 6.0 * x * x * x;
  else if (1.0 <= x && x < 2.0)
    return 2.0 / 3.0 + (-2.0 + (2.0 - 1.0 / 2.0 * x) * x) * x;
  else if (2.0 <= x && x < 3.0)
    return -22.0 / 3.0 + (10.0 + (-4.0 + 1.0 / 2.0 * x) * x) * x;
  else if (3.0 <= x && x < 4.0)
    return 32.0 / 3.0 + (-8.0 + (2.0 - 1.0 / 6.0 * x) * x) * x;
  else
    return 0.0;
}

double spline4::diff(double x) const {
  if (0.0 < x && x < 1.0)
    return 1.0 / 2.0 * x * x;
  else if (1.0 <= x && x < 2.0)
    return -2.0 + (4.0 - 3.0 / 2.0 * x) * x;
  else if (2.0 <= x && x < 3.0)
    return 10.0 + (-8.0 + 3.0 / 2.0 * x) * x;
  else if (3.0 <= x && x < 4.0)
    return -8.0 + (4.0 - 1.0 / 2.0 * x) * x;
  else
    return 0.0;
}

complex<double> spline4::factor(double m, double K) const {
  const double m_over_K = m / K;

  complex<double> denom = 0.0;
  for (double k = 0.0; k <= order - 2.0; k++)
    denom += operator()(k + 1.0) * exp(2.0 * M_PI * I * m_over_K * k);

  return exp(2.0 * M_PI * I * (order - 1.0) * m_over_K) / denom;
}

A  => spline.h +18 -0
@@ 1,18 @@
#ifndef SPLINE_H
#define SPLINE_H

#include <complex>

struct spline4 {
  spline4();

  double operator()(double x) const;

  double diff(double x) const;

  std::complex<double> factor(double m, double K) const;
  
  static const unsigned int order = 4;
};

#endif