Cleanup and documentation (71962e35) · Commits · 钟慕尧 / deepchem

deepchem/rl/init.py

+45 −7

Original line number	Diff line number	Diff line
		@@ -10,20 +10,47 @@ class Environment(object):

		An environment has a current state, which is represented as either a single NumPy
		array, or optionally a list of NumPy arrays. When an action is taken, that causes
		the state to be updated. Exactly what is meant by an "action" is defined by each
		subclass. As far as this interface is concerned, it is simply an arbitrary object.
		The environment also computes a reward for each action, and reports when the task
		has been terminated (meaning that no more actions may be taken).
		the state to be updated. The environment also computes a reward for each action,
		and reports when the task has been terminated (meaning that no more actions may
		be taken).

		Two types of actions are supported. For environments with discrete action spaces,
		the action is an integer specifying the index of the action to perform (out of a
		fixed list of possible actions). For environments with continuous action spaces,
		the action is a NumPy array.

		Environment objects should be written to support pickle and deepcopy operations.
		Many algorithms involve creating multiple copies of the Environment, possibly
		running in different processes or even on different computers.
		"""

		def __init__(self, state_shape, n_actions, state_dtype=None):
		"""Subclasses should call the superclass constructor in addition to doing their own initialization."""
		def __init__(self,
		state_shape,
		n_actions=None,
		state_dtype=None,
		action_shape=None):
		"""Subclasses should call the superclass constructor in addition to doing their own initialization.

		A value should be provided for either n_actions (for discrete action spaces)
		or action_shape (for continuous action spaces), but not both.

		Parameters
		----------
		state_shape: tuple or list of tuples
		the shape(s) of the array(s) making up the state
		n_actions: int
		the number of discrete actions that can be performed. If the action space
		is continuous, this should be None.
		state_dtype: dtype or list of dtypes
		the type(s) of the array(s) making up the state. If this is None, all
		arrays are assumed to be float32.
		action_shape: tuple
		the shape of the array describing an action. If the action space
		is discrete, this should be None.
		"""
		self._state_shape = state_shape
		self._n_actions = n_actions
		self._action_shape = action_shape
		self._state = None
		self._terminated = None
		if state_dtype is None:
		@@ -74,9 +101,20 @@ class Environment(object):

		@property
		def n_actions(self):
		"""The number of possible actions that can be performed in this Environment."""
		"""The number of possible actions that can be performed in this Environment.

		If the environment uses a continuous action space, this returns None.
		"""
		return self._n_actions

		@property
		def action_shape(self):
		"""The expected shape of NumPy arrays representing actions.

		If the environment uses a discrete action space, this returns None.
		"""
		return self._action_shape

		def reset(self):
		"""Initialize the environment in preparation for doing calculations with it.

deepchem/rl/a3c.py

+29 −20

Original line number	Diff line number	Diff line
		@@ -62,18 +62,23 @@ class A3C(object):
		Implements the Asynchronous Advantage Actor-Critic (A3C) algorithm for reinforcement learning.

		The algorithm is described in Mnih et al, "Asynchronous Methods for Deep Reinforcement Learning"
		(https://arxiv.org/abs/1602.01783). This class requires the policy to output two quantities:
		a vector giving the probability of taking each action, and an estimate of the value function for
		the current state. It optimizes both outputs at once using a loss that is the sum of three terms:
		(https://arxiv.org/abs/1602.01783). This class supports environments with both discrete and
		continuous action spaces. For discrete action spaces, the "action" argument passed to the
		environment is an integer giving the index of the action to perform. The policy must output
		a vector called "action_prob" giving the probability of taking each action. For continuous
		action spaces, the action is an array where each element is chosen independently from a
		normal distribution. The policy must output two arrays of the same shape: "action_mean"
		gives the mean value for each element, and "action_std" gives the standard deviation for
		each element. In either case, the policy must also output a scalar called "value" which
		is an estimate of the value function for the current state.

		The algorithm optimizes all outputs at once using a loss that is the sum of three terms:

		1. The policy loss, which seeks to maximize the discounted reward for each action.
		2. The value loss, which tries to make the value estimate match the actual discounted reward
		that was attained at each step.
		3. An entropy term to encourage exploration.

		This class only supports environments with discrete action spaces, not continuous ones. The
		"action" argument passed to the environment is an integer, giving the index of the action to perform.

		This class supports Generalized Advantage Estimation as described in Schulman et al., "High-Dimensional
		Continuous Control Using Generalized Advantage Estimation" (https://arxiv.org/abs/1506.02438).
		This is a method of trading off bias and variance in the advantage estimate, which can sometimes
		@@ -119,7 +124,8 @@ class A3C(object):
		the Environment to interact with
		policy: Policy
		the Policy to optimize. Its create_layers() method must return a dict containing the
		keys 'action_prob' and 'value', corresponding to the action probabilities and value estimate
		keys 'action_prob' and 'value' (for discrete action spaces) or 'action_mean', 'action_std',
		and 'value' (for continuous action spaces)
		max_rollout_length: int
		the maximum length of rollouts to generate
		discount_factor: float
		@@ -153,7 +159,8 @@ class A3C(object):
		fields = self._build_graph(None, 'global', model_dir)
		if self.continuous:
		(self._graph, self._features, self._rewards, self._actions,
		self._action_mean, self._action_std, self._value, self._advantages) = fields
		self._action_mean, self._action_std, self._value,
		self._advantages) = fields
		else:
		(self._graph, self._features, self._rewards, self._actions,
		self._action_prob, self._value, self._advantages) = fields
		@@ -195,11 +202,13 @@ class A3C(object):
		self.continuous = True
		action_mean = policy_layers['action_mean']
		action_std = policy_layers['action_std']
		actions = Label(shape=[None]+list(action_mean.shape))
		actions = Label(shape=[None] + list(self._env.action_shape))
		loss = A3CLossContinuous(
		self.value_weight,
		self.entropy_weight,
		in_layers=[rewards, actions, action_mean, action_std, value, advantages])
		in_layers=[
		rewards, actions, action_mean, action_std, value, advantages
		])
		graph.add_output(action_mean)
		graph.add_output(action_std)
		graph.add_output(value)
		@@ -330,7 +339,8 @@ class A3C(object):
		tensors = [self._action_mean, self._action_std]
		else:
		tensors = [self._action_prob]
		outputs = self._predict_outputs(tensors, state, use_saved_states, save_states)
		outputs = self._predict_outputs(tensors, state, use_saved_states,
		save_states)
		return self._select_action_from_outputs(outputs, deterministic)

		def restore(self):
		@@ -376,9 +386,9 @@ class A3C(object):
		if self.continuous:
		action_mean, action_std = outputs
		if deterministic:
		return action_mean
		return action_mean[0]
		else:
		return np.random.normal(action_mean, action_std)
		return np.random.normal(action_mean[0], action_std[0])
		else:
		action_prob = outputs[0]
		if deterministic:
		@@ -451,11 +461,11 @@ class _Worker(object):
		else:
		tensors = [self.action_prob, self.value]
		results = session.run(
		tensors + self.graph.rnn_final_states,
		feed_dict=feed_dict)
		tensors + self.graph.rnn_final_states, feed_dict=feed_dict)
		value = results[len(tensors) - 1]
		self.rnn_states = results[len(tensors):]
		action = self.a3c._select_action_from_outputs(results[:len(tensors)-1], False)
		action = self.a3c._select_action_from_outputs(results[:len(tensors) - 1],
		False)
		actions.append(action)
		values.append(float(value))
		rewards.append(self.env.step(action))
		@@ -551,8 +561,7 @@ class _Worker(object):
		feed_dict[f.out_tensor] = s
		values = self.a3c._session.run(self.value.out_tensor, feed_dict=feed_dict)
		values = np.append(values.flatten(), 0.0)
		self.process_rollout(hindsight_states, actions,
		np.array(rewards),
		self.process_rollout(hindsight_states, actions, np.array(rewards),
		np.array(values), initial_rnn_states, step_count)

		def create_feed_dict(self, state):

deepchem/rl/tests/test_a3c.py

+15 −10

Original line number	Diff line number	Diff line
		@@ -231,7 +231,6 @@ class TestA3C(unittest.TestCase):
		pass_count += 1
		assert pass_count >= 3


		def test_continuous(self):
		"""Test A3C on an environment with a continuous action space."""

		@@ -242,7 +241,7 @@ class TestA3C(unittest.TestCase):
		class TestEnvironment(dc.rl.Environment):

		def __init__(self):
		super(TestEnvironment, self).__init__((2,), 0)
		super(TestEnvironment, self).__init__((2,), action_shape=(1,))

		def reset(self):
		target = np.random.uniform(-50, 50)
		@@ -252,9 +251,9 @@ class TestA3C(unittest.TestCase):

		def step(self, action):
		target = self._state[1]
		dist = np.abs(target - action[0][0])
		dist = np.abs(target - action[0])
		old_dist = np.abs(target - self._state[0])
		new_state = np.array([action[0][0], target])
		new_state = np.array([action[0], target])
		self._state = new_state
		self.count += 1
		reward = old_dist - dist
		@@ -266,10 +265,15 @@ class TestA3C(unittest.TestCase):
		class TestPolicy(dc.rl.Policy):

		def create_layers(self, state, **kwargs):
		action_mean = Dense(1, in_layers=state, weights_initializer=tf.zeros_initializer)
		action_std = Constant(10.0)
		action_mean = Dense(
		1, in_layers=state, weights_initializer=tf.zeros_initializer)
		action_std = Constant([10.0])
		value = Dense(1, in_layers=state)
		return {'action_mean': action_mean, 'action_std': action_std, 'value': value}
		return {
		'action_mean': action_mean,
		'action_std': action_std,
		'value': value
		}

		# Optimize it.

		@@ -278,7 +282,8 @@ class TestA3C(unittest.TestCase):
		initial_rate=0.005, final_rate=0.0005, decay_steps=25000)
		a3c = dc.rl.A3C(
		env,
		TestPolicy(), discount_factor=0,
		TestPolicy(),
		discount_factor=0,
		optimizer=Adam(learning_rate=learning_rate))
		a3c.fit(25000)

Admin message