Commit a51752c9 authored by peastman

A3C supports Generalized Advantage Estimation

parent 0efa3f60
+31 −9
@@ -37,9 +37,10 @@ class A3C(object):
   """
   Implements the Asynchronous Advantage Actor-Critic (A3C) algorithm for reinforcement learning.
 
-  This algorithm requires the policy to output two quantities: a vector giving the probability of
-  taking each action, and an estimate of the value function for the current state.  It optimizes
-  both outputs at once using a loss that is the sum of three terms:
+  The algorithm is described in Mnih et al., "Asynchronous Methods for Deep Reinforcement Learning"
+  (https://arxiv.org/abs/1602.01783).  This class requires the policy to output two quantities:
+  a vector giving the probability of taking each action, and an estimate of the value function for
+  the current state.  It optimizes both outputs at once using a loss that is the sum of three terms:
 
   1. The policy loss, which seeks to maximize the discounted reward for each action.
   2. The value loss, which tries to make the value estimate match the actual discounted reward
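
The hunk above is truncated before the third loss term, but together with the value_weight and entropy_weight constructor arguments added further down, the docstring suggests a combined objective of the usual A3C form. A minimal NumPy sketch of that combination for a single rollout, assuming the third term is an entropy bonus; this is an illustration, not the class's actual TensorFlow loss:

import numpy as np

def a3c_loss_sketch(action_probs, chosen, advantages, discounted_rewards, values,
                    value_weight=1.0, entropy_weight=0.01):
  # Illustrative combined loss; all arguments are per-step arrays for one rollout.
  log_probs = np.log(action_probs)                       # shape (steps, n_actions)
  # 1. Policy loss: raise the log-probability of actions with positive advantage.
  policy_loss = -np.mean(advantages * np.sum(chosen * log_probs, axis=1))
  # 2. Value loss: make the value estimate match the observed discounted reward.
  value_loss = np.mean((discounted_rewards - values) ** 2)
  # 3. Entropy bonus (assumed third term): discourage a prematurely deterministic policy.
  entropy = -np.mean(np.sum(action_probs * log_probs, axis=1))
  return policy_loss + value_weight * value_loss - entropy_weight * entropy
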
@@ -48,6 +49,11 @@ class A3C(object):
 
   This class only supports environments with discrete action spaces, not continuous ones.  The
   "action" argument passed to the environment is an integer, giving the index of the action to perform.
+
+  This class supports Generalized Advantage Estimation as described in Schulman et al., "High-Dimensional
+  Continuous Control Using Generalized Advantage Estimation" (https://arxiv.org/abs/1506.02438).
+  This is a method of trading off bias and variance in the advantage estimate, which can sometimes
+  improve the rate of convergence.  Use the advantage_lambda parameter to adjust the tradeoff.
   """
 
   def __init__(self,
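
The new advantage_lambda parameter controls the estimator from the Schulman et al. paper cited above. A standalone sketch of what it computes (the function name and arguments here are illustrative, not part of the class API): each step's advantage is a lambda-weighted sum of one-step TD errors, so advantage_lambda=0 gives the low-variance but biased one-step estimate, while advantage_lambda=1 recovers the plain discounted-return advantage.

import numpy as np

def gae_sketch(rewards, values, bootstrap_value, gamma=0.99, lam=0.98):
  # One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
  # with bootstrap_value standing in for V of the state after the last step.
  next_values = np.append(np.asarray(values)[1:], bootstrap_value)
  deltas = np.asarray(rewards) + gamma * next_values - np.asarray(values)
  # advantage_t = sum_k (gamma * lam)**k * delta_{t+k}, accumulated backwards.
  advantages = deltas.copy()
  for t in range(len(deltas) - 2, -1, -1):
    advantages[t] += gamma * lam * advantages[t + 1]
  return advantages
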
@@ -55,6 +61,7 @@ class A3C(object):
                policy,
                max_rollout_length=20,
                discount_factor=0.99,
+               advantage_lambda=0.98,
                value_weight=1.0,
                entropy_weight=0.01,
                optimizer=None,
@@ -85,6 +92,7 @@ class A3C(object):
     self._policy = policy
     self.max_rollout_length = max_rollout_length
     self.discount_factor = discount_factor
+    self.advantage_lambda = advantage_lambda
     self.value_weight = value_weight
     self.entropy_weight = entropy_weight
     if optimizer is None:
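
With the parameter stored on the object, construction might look like the snippet below; env and policy are placeholders for whatever Environment and Policy objects the surrounding framework expects, since the first constructor argument is not shown in this hunk.

# Hypothetical usage; `env` and `policy` are placeholders, not names confirmed by this diff.
a3c = A3C(env, policy,
          max_rollout_length=20,
          discount_factor=0.99,
          advantage_lambda=0.98,  # closer to 1.0: less bias, more variance
          value_weight=1.0,
          entropy_weight=0.01)
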
@@ -335,6 +343,9 @@ class _Worker(object):
     actions = []
     rewards = []
     values = []
+
+    # Generate the rollout.
+
     for i in range(self.a3c.max_rollout_length):
       if self.env.terminated:
         break
@@ -353,19 +364,30 @@ class _Worker(object):
       actions[i][action] = 1.0
       values.append(float(value))
       rewards.append(self.env.step(action))
+
+    # Compute an estimate of the reward for the rest of the episode.
+
     if not self.env.terminated:
-      # Add an estimate of the reward for the rest of the episode.
       feed_dict = self.create_feed_dict(self.env.state)
-      rewards[-1] += self.a3c.discount_factor * float(
+      final_value = self.a3c.discount_factor * float(
           session.run(self.value.out_tensor, feed_dict))
-    for j in range(len(rewards) - 1, 0, -1):
-      rewards[j - 1] += self.a3c.discount_factor * rewards[j]
+    else:
+      final_value = 0.0
+
+    # Compute the output arrays.
+
     rewards_array = np.array(rewards)
-    advantages = rewards_array - np.array(values)
+    values_array = np.array(values)
+    discounted_rewards = rewards_array.copy()
+    discounted_rewards[-1] += final_value
+    advantages = rewards_array - values_array + self.a3c.discount_factor * np.array(values[1:]+[final_value])
+    for j in range(len(rewards) - 1, 0, -1):
+      discounted_rewards[j - 1] += self.a3c.discount_factor * discounted_rewards[j]
+      advantages[j - 1] += self.a3c.discount_factor * self.a3c.advantage_lambda * advantages[j]
     if self.env.terminated:
       self.env.reset()
       self.rnn_states = self.graph.rnn_zero_states
-    return np.array(states), np.array(actions), rewards_array, advantages
+    return np.array(states), np.array(actions), discounted_rewards, advantages
 
   def create_feed_dict(self, state):
     """Create a feed dict for use during a rollout."""