State is represented as a list of arrays (c8342e9c) · Commits · 钟慕尧 / deepchem

deepchem/rl/__init__.py

+21 −15

Original line number	Diff line number	Diff line
		@@ -5,10 +5,10 @@ from deepchem.rl.a3c import A3C
		class Environment(object):
		"""An environment in which an actor performs actions to accomplish a task.

		An environment has a current state that can be queried. When an action
		is taken, that causes the state to be updated. Exactly what is meant by
		a "state" or "action" is defined by each subclass. As far as this interface
		is concerned, they are simply arbitrary objects. The environment also computes
		An environment has a current state, which is represented as a list of NumPy
		arrays. When an action is taken, that causes the state to be updated. Exactly
		what is meant by an "action" is defined by each subclass. As far as this interface
		is concerned, it is simply an arbitrary object. The environment also computes
		a reward for each action, and reports when the task has been terminated
		(meaning that no more actions may be taken).

		@@ -26,7 +26,7 @@ class Environment(object):

		@property
		def state(self):
		"""The current state of the environment, represented as a class-specific object.
		"""The current state of the environment, represented as a list of NumPy arrays.

		If reset() has not yet been called at least once, this is undefined.
		"""
		@@ -42,7 +42,10 @@ class Environment(object):

		@property
		def state_shape(self):
		"""The shape of the values that describe a state."""
		"""The shape of the arrays that describe a state.

		This returns a list of tuples, where each tuple is the shape of one array.
		"""
		return self._state_shape

		@property
		@@ -83,14 +86,16 @@ class GymEnvironment(Environment):
		import gym
		self.env = gym.make(name)
		self.name = name
		super().__init__(self.env.observation_space.shape, self.env.action_space.n)
		super().__init__([self.env.observation_space.shape], self.env.action_space.n)

		def reset(self):
		self._state = self.env.reset()
		state = self.env.reset()
		self._state = [state]
		self._terminated = False

		def step(self, action):
		self._state, reward, self._terminated, info = self.env.step(action)
		state, reward, self._terminated, info = self.env.step(action)
		self._state = [state]
		return reward

		def __deepcopy__(self, memo):
		@@ -113,12 +118,13 @@ class Policy(object):
		or even on different computers.
		"""

		def create_layers(self, features, **kwargs):
		def create_layers(self, state, **kwargs):
		"""Create the TensorGraph Layers that define the policy.

		The arguments always include a Feature layer representing the current state of
		the environment. Depending on the algorithm being used, other arguments might
		get passed as well. It is up to each algorithm to document that.
		The arguments always include a list of Feature layers representing the current
		state of the environment (one layer for each array in the state). Depending on
		the algorithm being used, other arguments might get passed as well. It is up
		to each algorithm to document that.

		This method should construct and return a dict that maps strings to Layer
		objects. Each algorithm must document what Layers it expects the policy to

deepchem/rl/a3c.py

+17 −9

Original line number	Diff line number	Diff line
		@@ -27,6 +27,11 @@ class A3CLoss(Layer):
		self.out_tensor = policy_loss + self.value_weight*value_loss - self.entropy_weight*entropy
		return self.out_tensor


		def _create_feed_dict(features, state):
		    """Build a feed dict that presents one state as a batch of size one.

		    Each entry of `features` is paired positionally with the entry of
		    `state` at the same index. The key is the feature's `out_tensor`
		    attribute; the value is the state array with a new leading axis of
		    length 1 inserted by `np.expand_dims(..., axis=0)`.

		    Pairing uses `zip`, so if the two sequences differ in length the
		    extra entries of the longer one are silently ignored.
		    """
		    return {
		        feature.out_tensor: np.expand_dims(array, axis=0)
		        for feature, array in zip(features, state)
		    }


		class A3C(object):
		"""
		Implements the Asynchronous Advantage Actor-Critic (A3C) algorithm for reinforcement learning.
		@@ -78,7 +83,7 @@ class A3C(object):

		def _build_graph(self, tf_graph, scope, model_dir):
		"""Construct a TensorGraph containing the policy and loss calculations."""
		features = Feature(shape=[None]+list(self._env.state_shape))
		features = [Feature(shape=[None]+list(s)) for s in self._env.state_shape]
		policy_layers = self._policy.create_layers(features)
		action_prob = policy_layers['action_prob']
		value = policy_layers['value']
		@@ -145,7 +150,7 @@ class A3C(object):
		the array of action probabilities, and the estimated value function
		"""
		with self._graph._get_tf("Graph").as_default():
		feed_dict = {self._features.out_tensor: np.expand_dims(state, axis=0)}
		feed_dict = _create_feed_dict(self._features, state)
		return self._session.run([self._action_prob.out_tensor, self._value.out_tensor], feed_dict=feed_dict)

		def select_action(self, state, deterministic=False):
		@@ -164,7 +169,7 @@ class A3C(object):
		the index of the selected action
		"""
		with self._graph._get_tf("Graph").as_default():
		feed_dict = {self._features.out_tensor: np.expand_dims(state, axis=0)}
		feed_dict = _create_feed_dict(self._features, state)
		probabilities = self._session.run(self._action_prob.out_tensor, feed_dict=feed_dict)
		if deterministic:
		return probabilities.argmax()
		@@ -206,24 +211,27 @@ class _Worker(object):
		session.run(self.update_local_variables)
		episode_states, episode_actions, episode_rewards = self.create_rollout()
		feed_dict = {}
		feed_dict[self.features.out_tensor] = episode_states
		for f,s in zip(self.features, episode_states):
		feed_dict[f.out_tensor] = s
		feed_dict[self.rewards.out_tensor] = episode_rewards
		feed_dict[self.actions.out_tensor] = episode_actions
		session.run(self.train_op, feed_dict=feed_dict)
		step_count[0] += len(episode_states)
		step_count[0] += len(episode_actions)

		def create_rollout(self):
		"""Generate a rollout."""
		n_actions = self.env.n_actions
		session = self.a3c._session
		states = []
		states = [[] for i in range(len(self.features))]
		actions = []
		rewards = []
		for i in range(self.a3c.max_rollout_length):
		if self.env.terminated:
		break
		states.append(self.env.state)
		feed_dict = {self.features.out_tensor: np.expand_dims(self.env.state, axis=0)}
		state = self.env.state
		for j in range(len(state)):
		states[j].append(state[j])
		feed_dict = _create_feed_dict(self.features, state)
		probabilities = session.run(self.action_prob.out_tensor, feed_dict=feed_dict)
		action = np.random.choice(np.arange(n_actions), p=probabilities[0])
		actions.append(np.zeros(n_actions))
		@@ -231,7 +239,7 @@ class _Worker(object):
		rewards.append(self.env.step(action))
		if not self.env.terminated:
		# Add an estimate of the reward for the rest of the episode.
		feed_dict = {self.features.out_tensor: np.expand_dims(self.env.state, axis=0)}
		feed_dict = _create_feed_dict(self.features, self.env.state)
		rewards[-1] += self.a3c.discount_factor*session.run(self.value.out_tensor, feed_dict)
		for j in range(len(rewards)-1, 0, -1):
		rewards[j-1] += self.a3c.discount_factor*rewards[j]

deepchem/rl/tests/test_a3c.py

+10 −10

Original line number	Diff line number	Diff line
		@@ -17,8 +17,8 @@ class TestA3C(unittest.TestCase):

		class RouletteEnvironment(dc.rl.Environment):
		def __init__(self):
		super().__init__([1], 38)
		self._state = np.array([0])
		super().__init__([(1,)], 38)
		self._state = [np.array([0])]

		def step(self, action):
		if action == 37:
		@@ -41,30 +41,30 @@ class TestA3C(unittest.TestCase):
		# This policy just learns a constant probability for each action, and a constant for the value.

		class TestPolicy(dc.rl.Policy):
		def create_layers(self, features, **kwargs):
		action = Dense(in_layers=[features], out_channels=env.n_actions)
		def create_layers(self, state, **kwargs):
		action = Dense(in_layers=state, out_channels=env.n_actions)
		output = SoftMax(in_layers=[Reshape(in_layers=[action], shape=(-1, env.n_actions))])
		value = Dense(in_layers=[features], out_channels=1)
		value = Dense(in_layers=state, out_channels=1)
		return {'action_prob':output, 'value':value}

		# Optimize it.

		a3c = dc.rl.A3C(env, TestPolicy(), value_weight=100.0)
		a3c.optimizer = dc.models.tensorgraph.TFWrapper(tf.train.AdamOptimizer, learning_rate=0.1)
		a3c = dc.rl.A3C(env, TestPolicy(), value_weight=100.0, max_rollout_length=50)
		a3c.optimizer = dc.models.tensorgraph.TFWrapper(tf.train.AdamOptimizer, learning_rate=0.2)
		a3c.fit(100000)

		# It should have learned that the expected value is very close to zero, and that the best
		# action is to walk away.

		action_prob, value = a3c.predict([0])
		action_prob, value = a3c.predict([[0]])
		assert -0.5 < value[0] < 0.5
		assert action_prob.argmax() == 37
		assert a3c.select_action([0], deterministic=True) == 37
		assert a3c.select_action([[0]], deterministic=True) == 37

		# Verify that we can create a new A3C object, reload the parameters from the first one, and
		# get the same result.

		new_a3c = dc.rl.A3C(env, TestPolicy(), model_dir=a3c._graph.model_dir)
		new_a3c.restore()
		action_prob2, value2 = new_a3c.predict([0])
		action_prob2, value2 = new_a3c.predict([[0]])
		assert value2 == value

Admin message