Commit 53e5a8f5 authored by Peter Eastman
Browse files

Reformatted with yapf

parent c8342e9c
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@

from deepchem.rl.a3c import A3C


class Environment(object):
  """An environment in which an actor performs actions to accomplish a task.

@@ -81,12 +82,14 @@ class Environment(object):

class GymEnvironment(Environment):
  """This is a convenience class for working with environments from OpenAI Gym."""

  def __init__(self, name):
    """Create an Environment wrapping the OpenAI Gym environment with a specified name.

    Parameters
    ----------
    name
      the identifier passed to gym.make() to create the wrapped environment
    """
    # Imported here rather than at module level (presumably so that gym is only
    # required when this class is actually used).
    import gym
    self.env = gym.make(name)
    self.name = name
    # Initialise the base class exactly once, with a single state shape taken
    # from the observation space and the action count from `action_space.n`
    # (assumes the Gym action space exposes `.n` - TODO confirm for non-discrete
    # spaces).
    super().__init__([self.env.observation_space.shape],
                     self.env.action_space.n)

  def reset(self):
    state = self.env.reset()
@@ -132,4 +135,3 @@ class Policy(object):
    of Layers every time.
    """
    # NotImplemented is a non-callable singleton used for binary operators;
    # the exception class that signals an abstract method is NotImplementedError.
    raise NotImplementedError("Subclasses must implement this")
+64 −28
Original line number Diff line number Diff line
@@ -11,8 +11,10 @@ import os
import re
import threading


class A3CLoss(Layer):
  """This layer computes the loss function for A3C."""

  def __init__(self, value_weight, entropy_weight, **kwargs):
    super(A3CLoss, self).__init__(**kwargs)
    self.value_weight = value_weight
@@ -21,7 +23,8 @@ class A3CLoss(Layer):
  def create_tensor(self, **kwargs):
    reward, action, prob, value = [layer.out_tensor for layer in self.in_layers]
    log_prob = tf.log(prob)
    policy_loss = -tf.reduce_sum((reward-value)*tf.reduce_sum(action*log_prob))
    policy_loss = -tf.reduce_sum(
        (reward - value) * tf.reduce_sum(action * log_prob))
    value_loss = tf.reduce_sum(tf.square(reward - value))
    entropy = -tf.reduce_sum(prob * log_prob)
    self.out_tensor = policy_loss + self.value_weight * value_loss - self.entropy_weight * entropy
@@ -29,7 +32,8 @@ class A3CLoss(Layer):


def _create_feed_dict(features, state):
  return dict((f.out_tensor, np.expand_dims(s, axis=0)) for f,s in zip(features, state))
  return dict((f.out_tensor, np.expand_dims(s, axis=0))
              for f, s in zip(features, state))


class A3C(object):
@@ -49,7 +53,14 @@ class A3C(object):
  "action" argument passed to the environment is an integer, giving the index of the action to perform.
  """

  def __init__(self, env, policy, max_rollout_length=20, discount_factor=0.99, value_weight=1.0, entropy_weight=0.01, model_dir=None):
  def __init__(self,
               env,
               policy,
               max_rollout_length=20,
               discount_factor=0.99,
               value_weight=1.0,
               entropy_weight=0.01,
               model_dir=None):
    """Create an object for optimizing a policy.

    Parameters
@@ -76,8 +87,10 @@ class A3C(object):
    self.discount_factor = discount_factor
    self.value_weight = value_weight
    self.entropy_weight = entropy_weight
    self.optimizer = TFWrapper(tf.train.AdamOptimizer, learning_rate=0.001, beta1=0.9, beta2=0.999)
    (self._graph, self._features, rewards, actions, self._action_prob, self._value) = self._build_graph(None, 'global', model_dir)
    self.optimizer = TFWrapper(
        tf.train.AdamOptimizer, learning_rate=0.001, beta1=0.9, beta2=0.999)
    (self._graph, self._features, rewards, actions, self._action_prob,
     self._value) = self._build_graph(None, 'global', model_dir)
    with self._graph._get_tf("Graph").as_default():
      self._session = tf.Session()

@@ -89,8 +102,15 @@ class A3C(object):
    value = policy_layers['value']
    rewards = Weights(shape=(None, 1))
    actions = Label(shape=(None, self._env.n_actions))
    loss = A3CLoss(self.value_weight, self.entropy_weight, in_layers=[rewards, actions, action_prob, value])
    graph = TensorGraph(batch_size=self.max_rollout_length, use_queue=False, graph=tf_graph, model_dir=model_dir)
    loss = A3CLoss(
        self.value_weight,
        self.entropy_weight,
        in_layers=[rewards, actions, action_prob, value])
    graph = TensorGraph(
        batch_size=self.max_rollout_length,
        use_queue=False,
        graph=tf_graph,
        model_dir=model_dir)
    graph.add_output(action_prob)
    graph.add_output(value)
    graph.set_loss(loss)
@@ -100,7 +120,8 @@ class A3C(object):
        graph.build()
    return graph, features, rewards, actions, action_prob, value

  def fit(self, total_steps, max_checkpoints_to_keep=5, checkpoint_interval=600):
  def fit(self, total_steps, max_checkpoints_to_keep=5,
          checkpoint_interval=600):
    """Train the policy.

    Parameters
@@ -123,7 +144,9 @@ class A3C(object):
      for i in range(multiprocessing.cpu_count()):
        workers.append(_Worker(self, i))
      for worker in workers:
        # Hand the bound method and its arguments to the thread directly.
        # A `lambda: worker.run(...)` would look up the loop variable `worker`
        # only when the thread actually starts running, by which point the loop
        # may already have rebound it to a different worker.
        thread = threading.Thread(
            name=worker.scope,
            target=worker.run,
            args=(step_count, total_steps))
        threads.append(thread)
        thread.start()
      saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
@@ -133,7 +156,8 @@ class A3C(object):
        if len(threads) > 0:
          threads[0].join(checkpoint_interval)
        checkpoint_index += 1
        saver.save(self._session, self._graph.save_file, global_step=checkpoint_index)
        saver.save(
            self._session, self._graph.save_file, global_step=checkpoint_index)
        if len(threads) == 0:
          break

@@ -151,7 +175,9 @@ class A3C(object):
    """
    with self._graph._get_tf("Graph").as_default():
      feed_dict = _create_feed_dict(self._features, state)
      return self._session.run([self._action_prob.out_tensor, self._value.out_tensor], feed_dict=feed_dict)
      return self._session.run(
          [self._action_prob.out_tensor, self._value.out_tensor],
          feed_dict=feed_dict)

  def select_action(self, state, deterministic=False):
    """Select an action to perform based on the environment's state.
@@ -170,11 +196,13 @@ class A3C(object):
    """
    with self._graph._get_tf("Graph").as_default():
      feed_dict = _create_feed_dict(self._features, state)
      probabilities = self._session.run(self._action_prob.out_tensor, feed_dict=feed_dict)
      probabilities = self._session.run(
          self._action_prob.out_tensor, feed_dict=feed_dict)
      if deterministic:
        return probabilities.argmax()
      else:
        return np.random.choice(np.arange(self._env.n_actions), p=probabilities[0])
        return np.random.choice(
            np.arange(self._env.n_actions), p=probabilities[0])

  def restore(self):
    """Reload the model parameters from the most recent checkpoint file."""
@@ -195,14 +223,19 @@ class _Worker(object):
    self.scope = 'worker%d' % index
    self.env = copy.deepcopy(a3c._env)
    self.env.reset()
    self.graph, self.features, self.rewards, self.actions, self.action_prob, self.value = a3c._build_graph(a3c._graph._get_tf('Graph'), self.scope, None)
    self.graph, self.features, self.rewards, self.actions, self.action_prob, self.value = a3c._build_graph(
        a3c._graph._get_tf('Graph'), self.scope, None)
    with a3c._graph._get_tf("Graph").as_default():
      local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
      global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
      local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     self.scope)
      global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      'global')
      gradients = tf.gradients(self.graph.loss.out_tensor, local_vars)
      grads_and_vars = list(zip(gradients, global_vars))
      self.train_op = a3c._graph._get_tf('Optimizer').apply_gradients(grads_and_vars)
      self.update_local_variables = tf.group(*[tf.assign(v1, v2) for v1, v2 in zip(local_vars, global_vars)])
      self.train_op = a3c._graph._get_tf('Optimizer').apply_gradients(
          grads_and_vars)
      self.update_local_variables = tf.group(
          *[tf.assign(v1, v2) for v1, v2 in zip(local_vars, global_vars)])

  def run(self, step_count, total_steps):
    with self.graph._get_tf("Graph").as_default():
@@ -232,7 +265,8 @@ class _Worker(object):
      for j in range(len(state)):
        states[j].append(state[j])
      feed_dict = _create_feed_dict(self.features, state)
      probabilities = session.run(self.action_prob.out_tensor, feed_dict=feed_dict)
      probabilities = session.run(
          self.action_prob.out_tensor, feed_dict=feed_dict)
      action = np.random.choice(np.arange(n_actions), p=probabilities[0])
      actions.append(np.zeros(n_actions))
      actions[i][action] = 1.0
@@ -240,9 +274,11 @@ class _Worker(object):
    if not self.env.terminated:
      # Add an estimate of the reward for the rest of the episode.
      feed_dict = _create_feed_dict(self.features, self.env.state)
      rewards[-1] += self.a3c.discount_factor*session.run(self.value.out_tensor, feed_dict)
      rewards[-1] += self.a3c.discount_factor * session.run(
          self.value.out_tensor, feed_dict)
    for j in range(len(rewards) - 1, 0, -1):
      rewards[j - 1] += self.a3c.discount_factor * rewards[j]
    if self.env.terminated:
      self.env.reset()
    return np.array(states), np.array(actions), np.array(rewards).reshape((len(rewards), 1))
    return np.array(states), np.array(actions), np.array(rewards).reshape(
        (len(rewards), 1))
+11 −6
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ class TestA3C(unittest.TestCase):
    # strategy is to walk away.

    class RouletteEnvironment(dc.rl.Environment):

      def __init__(self):
        super().__init__([(1,)], 38)
        self._state = [np.array([0])]
@@ -41,16 +42,20 @@ class TestA3C(unittest.TestCase):
    # This policy just learns a constant probability for each action, and a constant for the value.

    class TestPolicy(dc.rl.Policy):
      """Policy built from two Dense layers applied directly to the state."""

      def create_layers(self, state, **kwargs):
        """Return the 'action_prob' and 'value' layers for this policy."""
        # One output per action, reshaped to (-1, n_actions) and normalised
        # with a softmax; built once so no unused layers are created.
        action = Dense(in_layers=state, out_channels=env.n_actions)
        output = SoftMax(
            in_layers=[Reshape(in_layers=[action], shape=(-1, env.n_actions))])
        # A single output channel for the value estimate.
        value = Dense(in_layers=state, out_channels=1)
        return {'action_prob': output, 'value': value}

    # Optimize it.

    a3c = dc.rl.A3C(env, TestPolicy(), value_weight=100.0, max_rollout_length=50)
    a3c.optimizer = dc.models.tensorgraph.TFWrapper(tf.train.AdamOptimizer, learning_rate=0.2)
    a3c = dc.rl.A3C(
        env, TestPolicy(), value_weight=100.0, max_rollout_length=50)
    a3c.optimizer = dc.models.tensorgraph.TFWrapper(
        tf.train.AdamOptimizer, learning_rate=0.2)
    a3c.fit(100000)

    # It should have learned that the expected value is very close to zero, and that the best