Commit 439c2fd9 authored by Steve Plimpton's avatar Steve Plimpton Committed by GitHub
Browse files

Merge pull request #677 from stanmoore1/kk_update

Update to Kokkos r2.04.04 and add workaround for performance regression
parents 15853a0e a55adf4a
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
# Change Log

## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)

**Implemented enhancements:**

- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)

**Fixed bugs:**

- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
- (Experimental) HBWSpace  Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
- (Experimental) ROCm:  algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)

## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)

+25 −7
Original line number Diff line number Diff line
@@ -443,7 +443,7 @@ endif
# When memkind support is enabled: add the memkind include and library
# search paths, link against the memkind libraries, and append the
# KOKKOS_HAVE_HBWSPACE define to KokkosCore_config.tmp.
# NOTE(review): -lmemkind is appended on two consecutive lines below; the
# first one looks like the pre-change line of the diff - confirm.
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
  KOKKOS_LIBS += -lmemkind
  KOKKOS_LIBS += -lmemkind -lnuma
  tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
endif

@@ -614,11 +614,20 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

  else
    # Assume that this is really a GNU compiler, or it could be XL on P8.
    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
    else
      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)

      else 
        # Assume that this is really a GNU compiler on P8.
        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
      endif
    endif
  endif
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
  tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
@@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

  else
    # Assume that this is really a GNU compiler, or it could be XL on P9.
    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
    else
      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)

      else 
        # Assume that this is really a GNU compiler on P9.
        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
      endif
    endif
  endif
endif

+237 −0
Original line number Diff line number Diff line
@@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
}


#endif

#if defined(KOKKOS_ENABLE_ROCM) 

  // Random_XorShift1024 generator specialised for the ROCm execution space.
  //
  // The generator does not own its state: state_ points at element
  // (state_idx, 0) of the pool's state view, and the 64-bit state words are
  // stride_ elements apart. p_ selects the current word and is advanced
  // modulo 16 by every raw draw (urand() / urand64()).
  template<>
  class Random_XorShift1024<Kokkos::Experimental::ROCm> {
  private:
    int p_;                // index of the current state word (masked to 0..15 on each draw)
    const int state_idx_;  // slot index handed in by the pool
    uint64_t* state_;      // address of state(state_idx, 0)
    const int stride_;     // state.stride_1(): distance between consecutive words
    friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
  public:

    typedef Kokkos::Experimental::ROCm device_type;
    typedef Random_XorShift1024_Pool<device_type> pool_type;

    // Scaling / rejection bounds used by the draw functions below.
    enum {MAX_URAND = 0xffffffffU};
    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};

    // Binds the generator to slot state_idx of the pool's state view, starting
    // at word index p.
    KOKKOS_INLINE_FUNCTION
    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
    }

    // Raw 32-bit draw: performs one xor-shift update of the state, multiplies
    // the new word by the scrambling constant and returns bits 16..47 of the
    // product.
    KOKKOS_INLINE_FUNCTION
    uint32_t urand() {
      uint64_t state_0 = state_[ p_ * stride_ ];
      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
      state_1 ^= state_1 << 31;
      state_1 ^= state_1 >> 11;
      state_0 ^= state_0 >> 30;
      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
      tmp = tmp>>16;
      return static_cast<uint32_t>(tmp&MAX_URAND);
    }

    // Raw 64-bit draw: same state update as urand(); returns the scrambled
    // word minus one.
    KOKKOS_INLINE_FUNCTION
    uint64_t urand64() {
      uint64_t state_0 = state_[ p_ * stride_ ];
      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
      state_1 ^= state_1 << 31;
      state_1 ^= state_1 >> 11;
      state_0 ^= state_0 >> 30;
      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL) - 1;
    }

    // Unsigned 32-bit value in [0, range). range must be nonzero.
    // Draws at or above the largest multiple of range are rejected and redrawn
    // so that the final modulo does not favour small values.
    KOKKOS_INLINE_FUNCTION
    uint32_t urand(const uint32_t& range) {
      const uint32_t max_val = (MAX_URAND/range)*range;
      uint32_t tmp = urand();
      while(tmp>=max_val)
        tmp = urand();  // keep the redraw; dropping it would spin forever
      return tmp%range;
    }

    // Unsigned 32-bit value in [start, end); requires end > start.
    KOKKOS_INLINE_FUNCTION
    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
      return urand(end-start)+start;
    }

    // Unsigned 64-bit value in [0, range). range must be nonzero.
    // Uses the same rejection scheme as urand(range).
    KOKKOS_INLINE_FUNCTION
    uint64_t urand64(const uint64_t& range) {
      const uint64_t max_val = (MAX_URAND64/range)*range;
      uint64_t tmp = urand64();
      while(tmp>=max_val)
        tmp = urand64();  // keep the redraw; dropping it would spin forever
      return tmp%range;
    }

    // Unsigned 64-bit value in [start, end); requires end > start.
    KOKKOS_INLINE_FUNCTION
    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
      return urand64(end-start)+start;
    }

    // Non-negative int: half of a raw 32-bit draw.
    KOKKOS_INLINE_FUNCTION
    int rand() {
      return static_cast<int>(urand()/2);
    }

    // int in [0, range). range must be nonzero.
    // Uses the same rejection scheme as urand(range).
    KOKKOS_INLINE_FUNCTION
    int rand(const int& range) {
      const int max_val = (MAX_RAND/range)*range;
      int tmp = rand();
      while(tmp>=max_val)
        tmp = rand();  // keep the redraw; dropping it would spin forever
      return tmp%range;
    }

    // int in [start, end); requires end > start.
    KOKKOS_INLINE_FUNCTION
    int rand(const int& start, const int& end ) {
      return rand(end-start)+start;
    }

    // Non-negative int64_t: half of a raw 64-bit draw.
    KOKKOS_INLINE_FUNCTION
    int64_t rand64() {
      return static_cast<int64_t>(urand64()/2);
    }

    // int64_t in [0, range). range must be nonzero.
    // Uses the same rejection scheme as urand(range).
    KOKKOS_INLINE_FUNCTION
    int64_t rand64(const int64_t& range) {
      const int64_t max_val = (MAX_RAND64/range)*range;
      int64_t tmp = rand64();
      while(tmp>=max_val)
        tmp = rand64();  // keep the redraw; dropping it would spin forever
      return tmp%range;
    }

    // int64_t in [start, end); requires end > start.
    KOKKOS_INLINE_FUNCTION
    int64_t rand64(const int64_t& start, const int64_t& end ) {
      return rand64(end-start)+start;
    }

    // float obtained by scaling a raw 64-bit draw by 1/MAX_URAND64.
    KOKKOS_INLINE_FUNCTION
    float frand() {
      return 1.0f * urand64()/MAX_URAND64;
    }

    // float obtained by scaling a raw 64-bit draw by range/MAX_URAND64.
    KOKKOS_INLINE_FUNCTION
    float frand(const float& range) {
      return range * urand64()/MAX_URAND64;
    }

    // float drawn from the interval starting at start with width end-start.
    KOKKOS_INLINE_FUNCTION
    float frand(const float& start, const float& end ) {
      return frand(end-start)+start;
    }

    // double obtained by scaling a raw 64-bit draw by 1/MAX_URAND64.
    KOKKOS_INLINE_FUNCTION
    double drand() {
      return 1.0 * urand64()/MAX_URAND64;
    }

    // double obtained by scaling a raw 64-bit draw by range/MAX_URAND64.
    KOKKOS_INLINE_FUNCTION
    double drand(const double& range) {
      return range * urand64()/MAX_URAND64;
    }

    // double drawn from the interval starting at start with width end-start.
    // Delegates to drand(range) so the result keeps double precision.
    KOKKOS_INLINE_FUNCTION
    double drand(const double& start, const double& end ) {
      return drand(end-start)+start;
    }

    //Marsaglia polar method for drawing a standard normal distributed random number
    KOKKOS_INLINE_FUNCTION
    double normal() {
      double S = 2.0;
      double U;
      // Draw points in the square [-1,1]x[-1,1] until one has squared
      // length below 1.
      while(S>=1.0) {
        U = 2.0*drand() - 1.0;
        const double V = 2.0*drand() - 1.0;
        S = U*U+V*V;
      }
      return U*std::sqrt(-2.0*log(S)/S);
    }

    // Normally distributed value with the given mean and standard deviation.
    KOKKOS_INLINE_FUNCTION
    double normal(const double& mean, const double& std_dev=1.0) {
      return mean + normal()*std_dev;
    }
  };

// ROCm specialisation of the XorShift64 pool constructor: clears the state
// count, then lets init() set the pool up from the seed.
template<>
inline
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
  // Second argument handed to init(); presumably the number of generator
  // states to create - confirm against init().
  const int init_count = 4*32768;

  num_states_ = 0;
  init(seed,init_count);
}

// Hands out a generator from the ROCm XorShift64 pool.
//
// Accelerator path (__HCC_ACCELERATOR__): starts at a slot computed from the
// block and thread indices (modulo num_states_) and tries to take that slot's
// lock. While the attempt fails it steps forward by the number of threads in
// a block, falling back to the thread's in-block offset once the index
// reaches num_states_.
// Other path: returns a generator built from slot 0 without touching locks_.
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
  // Linearised index of this thread inside its block.
  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
  // First candidate slot: linearised block index times block size, plus the
  // in-block offset, folded into the pool size.
  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
  // NOTE(review): presumably atomic_compare_exchange returns the previous
  // lock value, so a nonzero result means the slot is already held - confirm.
  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
      i+=blockDim_x*blockDim_y*blockDim_z;
      if(i>=num_states_) {i = i_offset;}
  }

  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
#else
  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
#endif
}

// Returns a generator to the ROCm XorShift64 pool.
//
// Accelerator path only: copies the generator's state back into the pool
// slot it came from and then resets that slot's lock entry to 0. Outside
// __HCC_ACCELERATOR__ the function does nothing.
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
  // Store the state before clearing the lock entry.
  state_(state.state_idx_) = state.state_;
  locks_(state.state_idx_) = 0;
  return;
#endif
}


// ROCm specialisation of the XorShift1024 pool constructor: clears the state
// count, then lets init() set the pool up from the seed.
template<>
inline
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
  // Second argument handed to init(); presumably the number of generator
  // states to create - confirm against init().
  const int init_count = 4*32768;

  num_states_ = 0;
  init(seed,init_count);
}

// Hands out a generator from the ROCm XorShift1024 pool.
//
// Accelerator path (__HCC_ACCELERATOR__): starts at a slot computed from the
// block and thread indices (modulo num_states_) and tries to take that slot's
// lock. While the attempt fails it steps forward by the number of threads in
// a block, falling back to the thread's in-block offset once the index
// reaches num_states_. The generator is built from the whole state view, the
// slot's stored word index p_(i) and the slot number.
// Other path: returns a generator for slot 0 without touching locks_.
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
  // Linearised index of this thread inside its block.
  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
  // First candidate slot: linearised block index times block size, plus the
  // in-block offset, folded into the pool size.
  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
  // NOTE(review): presumably atomic_compare_exchange returns the previous
  // lock value, so a nonzero result means the slot is already held - confirm.
  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
      i+=blockDim_x*blockDim_y*blockDim_z;
      if(i>=num_states_) {i = i_offset;}
  }

  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}

// Returns a generator to the ROCm XorShift1024 pool.
//
// Accelerator path only: copies 16 words from the generator's state pointer
// into the pool slot it came from and then resets that slot's lock entry
// to 0. Outside __HCC_ACCELERATOR__ the function does nothing.
//
// NOTE(review): the generator reads and writes its words as
// state_[p_ * stride_], but the copy below indexes state.state_[i] without
// the stride - confirm this is correct for layouts where stride_ != 1.
// NOTE(review): the generator's current word index p_ is not written back to
// the pool here - confirm whether that is intended.
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
  // Store the state words before clearing the lock entry.
  for(int i=0; i<16; i++)
    state_(state.state_idx_,i) = state.state_[i];
  locks_(state.state_idx_) = 0;
  return;
#endif
}


#endif


+12 −0
Original line number Diff line number Diff line
@@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
	TEST_TARGETS += test-cuda
endif

# ROCm backend enabled: define the object list for the ROCm unit-test
# executable and register its build target and its test-run target.
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
	OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
	TARGETS += KokkosAlgorithms_UnitTest_ROCm
	TEST_TARGETS += test-rocm
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
	TARGETS += KokkosAlgorithms_UnitTest_Threads
@@ -51,6 +57,9 @@ endif
# Link rules: each unit-test executable is linked from its backend's object
# list together with the Kokkos libraries and linker flags.

# Cuda unit-test executable.
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda

# ROCm unit-test executable.
KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm

# Threads unit-test executable.
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads

@@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
# Test-run rules: each target builds the matching unit-test executable (via
# its prerequisite) and then runs it from the current directory.

# Run the Cuda unit tests.
test-cuda: KokkosAlgorithms_UnitTest_Cuda
	./KokkosAlgorithms_UnitTest_Cuda

# Run the ROCm unit tests.
test-rocm: KokkosAlgorithms_UnitTest_ROCm
	./KokkosAlgorithms_UnitTest_ROCm

# Run the Threads unit tests.
test-threads: KokkosAlgorithms_UnitTest_Threads
	./KokkosAlgorithms_UnitTest_Threads

+112 −0
Original line number Diff line number Diff line
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM

#include <cstdint>
#include <iostream>
#include <iomanip>

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

#include <TestRandom.hpp>
#include <TestSort.hpp>

namespace Test {

class rocm : public ::testing::Test {
protected:
  static void SetUpTestCase()
  {
    std::cout << std::setprecision(5) << std::scientific;
    Kokkos::HostSpace::execution_space::initialize();
    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
  }
  static void TearDownTestCase()
  {
    Kokkos::Experimental::ROCm::finalize();
    Kokkos::HostSpace::execution_space::finalize();
  }
};

void rocm_test_random_xorshift64( int num_draws  )
{
  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}

void rocm_test_random_xorshift1024( int num_draws  )
{
  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}


// Test cases registered on the rocm fixture, written out directly instead of
// going through one-shot helper macros. Names, order and arguments are the
// same as the macro-generated tests.

// XorShift64 pool, 132141141 draws.
TEST_F( rocm, Random_XorShift64 ) {
  rocm_test_random_xorshift64( 132141141 );
}

// XorShift1024 pool, 52428813 draws.
TEST_F( rocm, Random_XorShift1024 ) {
  rocm_test_random_xorshift1024( 52428813 );
}

// Sort test on unsigned keys, size argument 171.
TEST_F( rocm, SortUnsigned ) {
  Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >( 171 );
}
}
#else
// Empty function compiled when KOKKOS_ENABLE_ROCM is not defined, so this
// translation unit still defines a symbol (per its name, presumably to avoid
// a link error from an otherwise empty object file).
void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
#endif  /* #ifdef KOKKOS_ENABLE_ROCM */
Loading