Commit ec228331 authored by peastman
Browse files

Fixed tests for hindsight experience replay

parent 52bce871
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -110,7 +110,8 @@ class A2C(object):
   The method receives the list of states generated during the rollout, the action taken for each one,
   and a new goal state.  It should generate a new list of states that are identical to the input ones,
   except specifying the new goal.  It should return that list of states, and the rewards that would
-  have been received for taking the specified actions from those states.
+  have been received for taking the specified actions from those states.  The output arrays may be
+  shorter than the input ones, if the modified rollout would have terminated sooner.
   """

   def __init__(self,
@@ -488,7 +489,7 @@ class A2C(object):
     outputs = self._compute_model(inputs)
     values = outputs[self._value_index].numpy()
     values = np.append(values.flatten(), 0.0)
-    self._process_rollout(hindsight_states, actions,
+    self._process_rollout(hindsight_states, actions[:len(rewards)],
                           np.array(rewards, dtype=np.float32),
                           np.array(values, dtype=np.float32),
                           initial_rnn_states)
+3 −1
Original line number Diff line number Diff line
@@ -84,7 +84,8 @@ class PPO(object):
   The method receives the list of states generated during the rollout, the action taken for each one,
   and a new goal state.  It should generate a new list of states that are identical to the input ones,
   except specifying the new goal.  It should return that list of states, and the rewards that would
-  have been received for taking the specified actions from those states.
+  have been received for taking the specified actions from those states.  The output arrays may be
+  shorter than the input ones, if the modified rollout would have terminated sooner.
   """

   def __init__(self,
@@ -543,6 +544,7 @@ class _Worker(object):
     values = outputs[self.ppo._value_index].numpy()
     values = np.append(values.flatten(), 0.0)
     probabilities = outputs[self.ppo._action_prob_index].numpy()
+    actions = actions[:len(rewards)]
     action_prob = probabilities[np.arange(len(actions)), actions]
     return self.process_rollout(hindsight_states, actions, action_prob,
                                 np.array(rewards, dtype=np.float32),
+3 −4
Original line number Diff line number Diff line
@@ -206,6 +206,7 @@ class TestA2C(unittest.TestCase):
           pos_after_action = new_state[:2] + self.moves[action]
           if np.array_equal(pos_after_action, goal_pos):
             rewards.append(1)
+            break
           else:
             rewards.append(0)
         return new_states, rewards
@@ -228,14 +229,12 @@ class TestA2C(unittest.TestCase):
     # Optimize it.

     env = TestEnvironment()
-    learning_rate = PolynomialDecay(
-        initial_rate=0.0005, final_rate=0.0002, decay_steps=2000000)
     a2c = dc.rl.A2C(
         env,
         TestPolicy(),
         use_hindsight=True,
-        optimizer=Adam(learning_rate=learning_rate))
-    a2c.fit(2000000)
+        optimizer=Adam(learning_rate=0.001))
+    a2c.fit(1000000)

     # Try running it a few times and see if it succeeds.

+4 −2
Original line number Diff line number Diff line
@@ -206,6 +206,7 @@ class TestPPO(unittest.TestCase):
           pos_after_action = new_state[:2] + self.moves[action]
           if np.array_equal(pos_after_action, goal_pos):
             rewards.append(1)
+            break
           else:
             rewards.append(0)
         return new_states, rewards
@@ -234,8 +235,9 @@ class TestPPO(unittest.TestCase):
         env,
         TestPolicy(),
         use_hindsight=True,
-        optimization_epochs=8,
-        optimizer=Adam(learning_rate=learning_rate))
+        optimization_epochs=1,
+        batch_size=0,
+        optimizer=Adam(learning_rate=0.001))
     ppo.fit(1500000)

     # Try running it a few times and see if it succeeds.