Commit a2f78e7c authored by Sievers
Browse files

Updated whitespace and implemented low-hanging performance boosts

parent 30d9ffba
Loading
Loading
Loading
Loading
+246 −239
Original line number Diff line number Diff line
@@ -480,9 +480,6 @@ template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_zi(const int& iter)
{
  int idouble = 0;
  for(int elem1 = 0; elem1 < nelements; elem1++)
    for(int elem2 = 0; elem2 < nelements; elem2++) {
  const int iatom = iter / idxz_max;
  const int jjz = iter % idxz_max;

@@ -496,15 +493,19 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iter)
  const int na = idxz(jjz, 7);
  const int nb = idxz(jjz, 8);

      const int jjza = idouble*idxz_max + jjz;

  const double *cgblock = cglist.data() + idxcg_block(j1, j2, j);

  int idouble = 0;
  for(int elem1 = 0; elem1 < nelements; elem1++)
  for(int elem2 = 0; elem2 < nelements; elem2++) {

  const int jjza = idouble*idxz_max + jjz;

  zlist(jjza, iatom).re = 0.0;
  zlist(jjza, iatom).im = 0.0;

      int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
      int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
  int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min;
  int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max;
  int icgb = mb1min * (j2 + 1) + mb2max;
  for (int ib = 0; ib < nb; ib++) {

@@ -515,10 +516,10 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iter)
    int ma2 = ma2max;
    int icga = ma1min * (j2 + 1) + ma2max;
    for (int ia = 0; ia < na; ia++) {
          suma1_r += cgblock[icga] * (ulisttot(elem1*idxu_max+jju1 + ma1, iatom).re * ulisttot(elem2*idxu_max+jju2 + ma2, iatom).re -
                                      ulisttot(elem1*idxu_max+jju1 + ma1, iatom).im * ulisttot(elem2*idxu_max+jju2 + ma2, iatom).im);
          suma1_i += cgblock[icga] * (ulisttot(elem1*idxu_max+jju1 + ma1, iatom).re * ulisttot(elem2*idxu_max+jju2 + ma2, iatom).im +
                                      ulisttot(elem1*idxu_max+jju1 + ma1, iatom).im * ulisttot(elem2*idxu_max+jju2 + ma2, iatom).re);
      suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).re -
                                  ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).im);
      suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).im +
                                  ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).re);
      ma1++;
      ma2--;
      icga += j2;
@@ -560,9 +561,6 @@ KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_yi(int iter,
 const Kokkos::View<F_FLOAT**, DeviceType> &beta)
{
  int itriple;
  for(int elem1 = 0; elem1 < nelements; elem1++)
    for (int elem2 = 0; elem2 < nelements; elem2++) {
  double betaj;
  const int iatom = iter / idxz_max;
  const int jjz = iter % idxz_max;
@@ -582,11 +580,15 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
  //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;

  int itriple;
  for(int elem1 = 0; elem1 < nelements; elem1++)
  for (int elem2 = 0; elem2 < nelements; elem2++) {

  double ztmp_r = 0.0;
  double ztmp_i = 0.0;

      int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
      int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
  int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min;
  int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max;
  int icgb = mb1min * (j2 + 1) + mb2max;
  for (int ib = 0; ib < nb; ib++) {

@@ -598,14 +600,14 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
    int icga = ma1min * (j2 + 1) + ma2max;

    for (int ia = 0; ia < na; ia++) {
          suma1_r += cgblock[icga] * (ulisttot(elem1 * idxu_max + jju1 + ma1, iatom).re *
                                      ulisttot(elem2 * idxu_max + jju2 + ma2, iatom).re -
                                      ulisttot(elem1 * idxu_max + jju1 + ma1, iatom).im *
                                      ulisttot(elem2 * idxu_max + jju2 + ma2, iatom).im);
          suma1_i += cgblock[icga] * (ulisttot(elem1 * idxu_max + jju1 + ma1, iatom).re *
                                      ulisttot(elem2 * idxu_max + jju2 + ma2, iatom).im +
                                      ulisttot(elem1 * idxu_max + jju1 + ma1, iatom).im *
                                      ulisttot(elem2 * idxu_max + jju2 + ma2, iatom).re);
      suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re *
                                  ulisttot(jju2 + ma2, iatom).re -
                                  ulisttot(jju1 + ma1, iatom).im *
                                  ulisttot(jju2 + ma2, iatom).im);
      suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re *
                                  ulisttot(jju2 + ma2, iatom).im +
                                  ulisttot(jju1 + ma1, iatom).im *
                                  ulisttot(jju2 + ma2, iatom).re);
      ma1++;
      ma2--;
      icga += j2;
@@ -630,6 +632,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,

  // pick out right beta value
  for (int elem3 = 0; elem3 < nelements; elem3++) {
    const int jjuy = elem3 * idxu_max + jju;
    if (j >= j1) {
      const int jjb = idxb_block(j1, j2, j);
      itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
@@ -651,8 +654,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
    if (!bnorm_flag && j1 > j)
      betaj *= (j1 + 1) / (j + 1.0);

        Kokkos::atomic_add(&(ylist(elem3 * idxu_max + jju, iatom).re), betaj * ztmp_r);
        Kokkos::atomic_add(&(ylist(elem3 * idxu_max + jju, iatom).im), betaj * ztmp_i);
    Kokkos::atomic_add(&(ylist(jjuy, iatom).re), betaj * ztmp_r);
    Kokkos::atomic_add(&(ylist(jjuy, iatom).im), betaj * ztmp_i);
  }
  }
}
@@ -755,7 +758,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
      int ma = m % n_ma;
      int mb = m / n_ma;

      const int jju_index = jju+m;
      const int jju_index = jpos+jju+m;

      // Load y_local, apply the symmetry scaling factor
      // The "secret" of the shared memory optimization is it eliminates
@@ -763,7 +766,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
      // shared memory and otherwise always writing, making the kernel
      // ultimately compute bound. We take advantage of that by adding
      // some reads back in.
      auto y_local = ylist(jpos+jju_index,iatom);
      auto y_local = ylist(jju_index,iatom);
      if (j % 2 == 0 && 2*mb == j) {
        if (ma == mb) { y_local = 0.5*y_local; }
        else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
@@ -867,13 +870,15 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
      [&] (const int& j, t_scalar3<double>& sum_tmp) {
    int jju = idxu_block[j];
    int jjuy = idxu_block[j] + jelem*idxu_max;

    for(int mb = 0; 2*mb < j; mb++)
      for(int ma = 0; ma <= j; ma++) {
        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jelem*idxu_max+jju,iatom).im;
        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jelem*idxu_max+jju,iatom).im;
        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jelem*idxu_max+jju,iatom).im;
        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im;
        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im;
        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im;
        jju++;
        jjuy++;
      } //end loop over ma mb

    // For j even, handle middle column
@@ -882,16 +887,17 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy

      int mb = j/2;
      for(int ma = 0; ma < mb; ma++) {
        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jelem*idxu_max+jju,iatom).im;
        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jelem*idxu_max+jju,iatom).im;
        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jelem*idxu_max+jju,iatom).im;
        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im;
        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im;
        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im;
        jju++;
        jjuy++;
      }

      //int ma = mb;
      sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jelem*idxu_max+jju,iatom).im)*0.5;
      sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jelem*idxu_max+jju,iatom).im)*0.5;
      sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jelem*idxu_max+jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jelem*idxu_max+jju,iatom).im)*0.5;
      sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im)*0.5;
      sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im)*0.5;
      sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im)*0.5;
    } // end if jeven

  },final_sum); // end loop over j
@@ -931,6 +937,7 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, idxb_max),
                       [&](const int &jjb) {
                         //for(int jjb = 0; jjb < idxb_max; jjb++) {
                         const int jjballoy = itriple*idxb_max+jjb;
                         const int j1 = idxb(jjb, 0);
                         const int j2 = idxb(jjb, 1);
                         const int j = idxb(jjb, 2);
@@ -946,14 +953,14 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
                                                   //for(int ma = 0; ma <= j; ma++) {
                                                   const int ma = mbma % (j + 1);
                                                   const int mb = mbma / (j + 1);
                                                         const int jju_index = jju + mb * (j + 1) + ma;
                                                         const int jjz_index = jjz + mb * (j + 1) + ma;
                                                   const int jju_index = elem3 * idxu_max + jju + mb * (j + 1) + ma;
                                                   const int jjz_index = jalloy + jjz + mb * (j + 1) + ma;
                                                   if (2 * mb == j) return;
                                                   sum +=
                                                             ulisttot(elem3*idxu_max+jju_index, iatom).re *
                                                             zlist(jalloy+jjz_index, iatom).re +
                                                             ulisttot(elem3*idxu_max+jju_index, iatom).im *
                                                             zlist(jalloy+jjz_index, iatom).im;
                                                       ulisttot(jju_index, iatom).re *
                                                       zlist(jjz_index, iatom).re +
                                                       ulisttot(jju_index, iatom).im *
                                                       zlist(jjz_index, iatom).im;
                                                 }, sumzu_temp); // end loop over ma, mb
                         sumzu += sumzu_temp;

@@ -965,25 +972,25 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
                                                   [&](const int ma, double &sum) {
                                                     //for(int ma = 0; ma < mb; ma++) {
                                                     const int jju_index =
                                                               jju + (mb - 1) * (j + 1) + (j + 1) + ma;
                                                         elem3 * idxu_max + jju + (mb - 1) * (j + 1) + (j + 1) + ma;
                                                     const int jjz_index =
                                                               jjz + (mb - 1) * (j + 1) + (j + 1) + ma;
                                                         jalloy + jjz + (mb - 1) * (j + 1) + (j + 1) + ma;
                                                     sum +=
                                                               ulisttot(elem3*idxu_max+jju_index, iatom).re *
                                                               zlist(jalloy+jjz_index, iatom).re +
                                                               ulisttot(elem3*idxu_max+jju_index, iatom).im *
                                                               zlist(jalloy+jjz_index, iatom).im;
                                                         ulisttot(jju_index, iatom).re *
                                                         zlist(jjz_index, iatom).re +
                                                         ulisttot(jju_index, iatom).im *
                                                         zlist(jjz_index, iatom).im;
                                                   }, sumzu_temp); // end loop over ma
                           sumzu += sumzu_temp;

                           const int ma = mb;
                                 const int jju_index = jju + (mb - 1) * (j + 1) + (j + 1) + ma;
                                 const int jjz_index = jjz + (mb - 1) * (j + 1) + (j + 1) + ma;
                           const int jju_index = elem3 * idxu_max + jju + (mb - 1) * (j + 1) + (j + 1) + ma;
                           const int jjz_index = jalloy + jjz + (mb - 1) * (j + 1) + (j + 1) + ma;
                           sumzu += 0.5 *
                                          (ulisttot(elem3*idxu_max+jju_index, iatom).re *
                                           zlist(jalloy+jjz_index, iatom).re +
                                           ulisttot(elem3*idxu_max+jju_index, iatom).im *
                                           zlist(jalloy+jjz_index, iatom).im);
                                    (ulisttot(jju_index, iatom).re *
                                     zlist(jjz_index, iatom).re +
                                     ulisttot(jju_index, iatom).im *
                                     zlist(jjz_index, iatom).im);
                         } // end if jeven

                         Kokkos::single(Kokkos::PerThread(team), [&]() {
@@ -998,7 +1005,7 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
                             } else sumzu -= bzero[j];
                           }

                                 blist(itriple*idxb_max+jjb, iatom) = sumzu;
                           blist(jjballoy, iatom) = sumzu;
                         });
                       });
  itriple++;