Reformatted with yapf (53e5a8f5) · Commits · 钟慕尧 / deepchem

deepchem/rl/__init__.py

+4 −2

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@

		from deepchem.rl.a3c import A3C


		class Environment(object):
		"""An environment in which an actor performs actions to accomplish a task.

		@@ -81,12 +82,14 @@ class Environment(object):

		class GymEnvironment(Environment):
		"""This is a convenience class for working with environments from OpenAI Gym."""

		def __init__(self, name):
		"""Create an Environment wrapping the OpenAI Gym environment with a specified name."""
		import gym
		self.env = gym.make(name)
		self.name = name
		super().__init__([self.env.observation_space.shape], self.env.action_space.n)
		super().__init__([self.env.observation_space.shape],
		self.env.action_space.n)

		def reset(self):
		state = self.env.reset()
		@@ -132,4 +135,3 @@ class Policy(object):
		of Layers every time.
		"""
		raise NotImplemented("Subclasses must implement this")

deepchem/rl/a3c.py

+64 −28

Original line number	Diff line number	Diff line
		@@ -11,8 +11,10 @@ import os
		import re
		import threading


		class A3CLoss(Layer):
		"""This layer computes the loss function for A3C."""

		def __init__(self, value_weight, entropy_weight, **kwargs):
		super(A3CLoss, self).__init__(**kwargs)
		self.value_weight = value_weight
		@@ -21,7 +23,8 @@ class A3CLoss(Layer):
		def create_tensor(self, **kwargs):
		reward, action, prob, value = [layer.out_tensor for layer in self.in_layers]
		log_prob = tf.log(prob)
		policy_loss = -tf.reduce_sum((reward-value)*tf.reduce_sum(action*log_prob))
		policy_loss = -tf.reduce_sum(
		(reward - value) * tf.reduce_sum(action * log_prob))
		value_loss = tf.reduce_sum(tf.square(reward - value))
		entropy = -tf.reduce_sum(prob * log_prob)
		self.out_tensor = policy_loss + self.value_weight * value_loss - self.entropy_weight * entropy
		@@ -29,7 +32,8 @@ class A3CLoss(Layer):


		def _create_feed_dict(features, state):
		return dict((f.out_tensor, np.expand_dims(s, axis=0)) for f,s in zip(features, state))
		return dict((f.out_tensor, np.expand_dims(s, axis=0))
		for f, s in zip(features, state))


		class A3C(object):
		@@ -49,7 +53,14 @@ class A3C(object):
		"action" argument passed to the environment is an integer, giving the index of the action to perform.
		"""

		def __init__(self, env, policy, max_rollout_length=20, discount_factor=0.99, value_weight=1.0, entropy_weight=0.01, model_dir=None):
		def __init__(self,
		env,
		policy,
		max_rollout_length=20,
		discount_factor=0.99,
		value_weight=1.0,
		entropy_weight=0.01,
		model_dir=None):
		"""Create an object for optimizing a policy.

		Parameters
		@@ -76,8 +87,10 @@ class A3C(object):
		self.discount_factor = discount_factor
		self.value_weight = value_weight
		self.entropy_weight = entropy_weight
		self.optimizer = TFWrapper(tf.train.AdamOptimizer, learning_rate=0.001, beta1=0.9, beta2=0.999)
		(self._graph, self._features, rewards, actions, self._action_prob, self._value) = self._build_graph(None, 'global', model_dir)
		self.optimizer = TFWrapper(
		tf.train.AdamOptimizer, learning_rate=0.001, beta1=0.9, beta2=0.999)
		(self._graph, self._features, rewards, actions, self._action_prob,
		self._value) = self._build_graph(None, 'global', model_dir)
		with self._graph._get_tf("Graph").as_default():
		self._session = tf.Session()

		@@ -89,8 +102,15 @@ class A3C(object):
		value = policy_layers['value']
		rewards = Weights(shape=(None, 1))
		actions = Label(shape=(None, self._env.n_actions))
		loss = A3CLoss(self.value_weight, self.entropy_weight, in_layers=[rewards, actions, action_prob, value])
		graph = TensorGraph(batch_size=self.max_rollout_length, use_queue=False, graph=tf_graph, model_dir=model_dir)
		loss = A3CLoss(
		self.value_weight,
		self.entropy_weight,
		in_layers=[rewards, actions, action_prob, value])
		graph = TensorGraph(
		batch_size=self.max_rollout_length,
		use_queue=False,
		graph=tf_graph,
		model_dir=model_dir)
		graph.add_output(action_prob)
		graph.add_output(value)
		graph.set_loss(loss)
		@@ -100,7 +120,8 @@ class A3C(object):
		graph.build()
		return graph, features, rewards, actions, action_prob, value

		def fit(self, total_steps, max_checkpoints_to_keep=5, checkpoint_interval=600):
		def fit(self, total_steps, max_checkpoints_to_keep=5,
		checkpoint_interval=600):
		"""Train the policy.

		Parameters
		@@ -123,7 +144,9 @@ class A3C(object):
		for i in range(multiprocessing.cpu_count()):
		workers.append(_Worker(self, i))
		for worker in workers:
		thread = threading.Thread(name=worker.scope, target=lambda: worker.run(step_count, total_steps))
		thread = threading.Thread(
		name=worker.scope,
		target=lambda: worker.run(step_count, total_steps))
		threads.append(thread)
		thread.start()
		saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
		@@ -133,7 +156,8 @@ class A3C(object):
		if len(threads) > 0:
		threads[0].join(checkpoint_interval)
		checkpoint_index += 1
		saver.save(self._session, self._graph.save_file, global_step=checkpoint_index)
		saver.save(
		self._session, self._graph.save_file, global_step=checkpoint_index)
		if len(threads) == 0:
		break

		@@ -151,7 +175,9 @@ class A3C(object):
		"""
		with self._graph._get_tf("Graph").as_default():
		feed_dict = _create_feed_dict(self._features, state)
		return self._session.run([self._action_prob.out_tensor, self._value.out_tensor], feed_dict=feed_dict)
		return self._session.run(
		[self._action_prob.out_tensor, self._value.out_tensor],
		feed_dict=feed_dict)

		def select_action(self, state, deterministic=False):
		"""Select an action to perform based on the environment's state.
		@@ -170,11 +196,13 @@ class A3C(object):
		"""
		with self._graph._get_tf("Graph").as_default():
		feed_dict = _create_feed_dict(self._features, state)
		probabilities = self._session.run(self._action_prob.out_tensor, feed_dict=feed_dict)
		probabilities = self._session.run(
		self._action_prob.out_tensor, feed_dict=feed_dict)
		if deterministic:
		return probabilities.argmax()
		else:
		return np.random.choice(np.arange(self._env.n_actions), p=probabilities[0])
		return np.random.choice(
		np.arange(self._env.n_actions), p=probabilities[0])

		def restore(self):
		"""Reload the model parameters from the most recent checkpoint file."""
		@@ -195,14 +223,19 @@ class _Worker(object):
		self.scope = 'worker%d' % index
		self.env = copy.deepcopy(a3c._env)
		self.env.reset()
		self.graph, self.features, self.rewards, self.actions, self.action_prob, self.value = a3c._build_graph(a3c._graph._get_tf('Graph'), self.scope, None)
		self.graph, self.features, self.rewards, self.actions, self.action_prob, self.value = a3c._build_graph(
		a3c._graph._get_tf('Graph'), self.scope, None)
		with a3c._graph._get_tf("Graph").as_default():
		local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
		global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
		local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
		self.scope)
		global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
		'global')
		gradients = tf.gradients(self.graph.loss.out_tensor, local_vars)
		grads_and_vars = list(zip(gradients, global_vars))
		self.train_op = a3c._graph._get_tf('Optimizer').apply_gradients(grads_and_vars)
		self.update_local_variables = tf.group(*[tf.assign(v1, v2) for v1, v2 in zip(local_vars, global_vars)])
		self.train_op = a3c._graph._get_tf('Optimizer').apply_gradients(
		grads_and_vars)
		self.update_local_variables = tf.group(
		*[tf.assign(v1, v2) for v1, v2 in zip(local_vars, global_vars)])

		def run(self, step_count, total_steps):
		with self.graph._get_tf("Graph").as_default():
		@@ -232,7 +265,8 @@ class _Worker(object):
		for j in range(len(state)):
		states[j].append(state[j])
		feed_dict = _create_feed_dict(self.features, state)
		probabilities = session.run(self.action_prob.out_tensor, feed_dict=feed_dict)
		probabilities = session.run(
		self.action_prob.out_tensor, feed_dict=feed_dict)
		action = np.random.choice(np.arange(n_actions), p=probabilities[0])
		actions.append(np.zeros(n_actions))
		actions[i][action] = 1.0
		@@ -240,9 +274,11 @@ class _Worker(object):
		if not self.env.terminated:
		# Add an estimate of the reward for the rest of the episode.
		feed_dict = _create_feed_dict(self.features, self.env.state)
		rewards[-1] += self.a3c.discount_factor*session.run(self.value.out_tensor, feed_dict)
		rewards[-1] += self.a3c.discount_factor * session.run(
		self.value.out_tensor, feed_dict)
		for j in range(len(rewards) - 1, 0, -1):
		rewards[j - 1] += self.a3c.discount_factor * rewards[j]
		if self.env.terminated:
		self.env.reset()
		return np.array(states), np.array(actions), np.array(rewards).reshape((len(rewards), 1))
		return np.array(states), np.array(actions), np.array(rewards).reshape(
		(len(rewards), 1))

deepchem/rl/tests/test_a3c.py

+11 −6

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@ class TestA3C(unittest.TestCase):
		# strategy is to walk away.

		class RouletteEnvironment(dc.rl.Environment):

		def __init__(self):
		super().__init__([(1,)], 38)
		self._state = [np.array([0])]
		@@ -41,16 +42,20 @@ class TestA3C(unittest.TestCase):
		# This policy just learns a constant probability for each action, and a constant for the value.

		class TestPolicy(dc.rl.Policy):

		def create_layers(self, state, **kwargs):
		action = Dense(in_layers=state, out_channels=env.n_actions)
		output = SoftMax(in_layers=[Reshape(in_layers=[action], shape=(-1, env.n_actions))])
		output = SoftMax(
		in_layers=[Reshape(in_layers=[action], shape=(-1, env.n_actions))])
		value = Dense(in_layers=state, out_channels=1)
		return {'action_prob': output, 'value': value}

		# Optimize it.

		a3c = dc.rl.A3C(env, TestPolicy(), value_weight=100.0, max_rollout_length=50)
		a3c.optimizer = dc.models.tensorgraph.TFWrapper(tf.train.AdamOptimizer, learning_rate=0.2)
		a3c = dc.rl.A3C(
		env, TestPolicy(), value_weight=100.0, max_rollout_length=50)
		a3c.optimizer = dc.models.tensorgraph.TFWrapper(
		tf.train.AdamOptimizer, learning_rate=0.2)
		a3c.fit(100000)

		# It should have learned that the expected value is very close to zero, and that the best

Admin message