Fixed tests for hindsight experience replay (ec228331) · Commits · 钟慕尧 / deepchem

deepchem/rl/a2c.py

+3 −2

Original line number	Diff line number	Diff line
		@@ -110,7 +110,8 @@ class A2C(object):
		The method receives the list of states generated during the rollout, the action taken for each one,
		and a new goal state. It should generate a new list of states that are identical to the input ones,
		except specifying the new goal. It should return that list of states, and the rewards that would
		- have been received for taking the specified actions from those states.
		+ have been received for taking the specified actions from those states. The output arrays may be
		+ shorter than the input ones, if the modified rollout would have terminated sooner.
		"""

		def __init__(self,
		@@ -488,7 +489,7 @@ class A2C(object):
		outputs = self._compute_model(inputs)
		values = outputs[self._value_index].numpy()
		values = np.append(values.flatten(), 0.0)
		- self._process_rollout(hindsight_states, actions,
		+ self._process_rollout(hindsight_states, actions[:len(rewards)],
		np.array(rewards, dtype=np.float32),
		np.array(values, dtype=np.float32),
		initial_rnn_states)

+3 −1

Original line number	Diff line number	Diff line
		@@ -84,7 +84,8 @@ class PPO(object):
		The method receives the list of states generated during the rollout, the action taken for each one,
		and a new goal state. It should generate a new list of states that are identical to the input ones,
		except specifying the new goal. It should return that list of states, and the rewards that would
		- have been received for taking the specified actions from those states.
		+ have been received for taking the specified actions from those states. The output arrays may be
		+ shorter than the input ones, if the modified rollout would have terminated sooner.
		"""

		def __init__(self,
		@@ -543,6 +544,7 @@ class _Worker(object):
		values = outputs[self.ppo._value_index].numpy()
		values = np.append(values.flatten(), 0.0)
		probabilities = outputs[self.ppo._action_prob_index].numpy()
		+ actions = actions[:len(rewards)]
		action_prob = probabilities[np.arange(len(actions)), actions]
		return self.process_rollout(hindsight_states, actions, action_prob,
		np.array(rewards, dtype=np.float32),

+3 −4

Original line number	Diff line number	Diff line
		@@ -206,6 +206,7 @@ class TestA2C(unittest.TestCase):
		pos_after_action = new_state[:2] + self.moves[action]
		if np.array_equal(pos_after_action, goal_pos):
		rewards.append(1)
		+ break
		else:
		rewards.append(0)
		return new_states, rewards
		@@ -228,14 +229,12 @@ class TestA2C(unittest.TestCase):
		# Optimize it.

		env = TestEnvironment()
		- learning_rate = PolynomialDecay(
		- initial_rate=0.0005, final_rate=0.0002, decay_steps=2000000)
		a2c = dc.rl.A2C(
		env,
		TestPolicy(),
		use_hindsight=True,
		- optimizer=Adam(learning_rate=learning_rate))
		- a2c.fit(2000000)
		+ optimizer=Adam(learning_rate=0.001))
		+ a2c.fit(1000000)

		# Try running it a few times and see if it succeeds.

+4 −2

Original line number	Diff line number	Diff line
		@@ -206,6 +206,7 @@ class TestPPO(unittest.TestCase):
		pos_after_action = new_state[:2] + self.moves[action]
		if np.array_equal(pos_after_action, goal_pos):
		rewards.append(1)
		+ break
		else:
		rewards.append(0)
		return new_states, rewards
		@@ -234,8 +235,9 @@ class TestPPO(unittest.TestCase):
		env,
		TestPolicy(),
		use_hindsight=True,
		- optimization_epochs=8,
		- optimizer=Adam(learning_rate=learning_rate))
		+ optimization_epochs=1,
		+ batch_size=0,
		+ optimizer=Adam(learning_rate=0.001))
		ppo.fit(1500000)

		# Try running it a few times and see if it succeeds.