Commit 515e01ed authored by peastman

Recurrent models are working again with A3C

parent 1d1878ed
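
For context, the Policy API this commit moves to works roughly as sketched below: a subclass passes its output names (and any initial RNN states) to the new base-class constructor and returns a tf.keras.Model from create_model(). The class name, state shape, and layer sizes here are illustrative assumptions modeled on the tests further down in this diff, not part of the commit itself.

import tensorflow as tf
import deepchem as dc

class SimpleDiscretePolicy(dc.rl.Policy):
  """Hypothetical discrete-action policy written against the new API."""

  def __init__(self, n_actions):
    # Declare, in order, the outputs that create_model() will return.
    super(SimpleDiscretePolicy, self).__init__(['action_prob', 'value'])
    self.n_actions = n_actions

  def create_model(self, **kwargs):
    # An ordinary Keras model; A3C locates outputs by their position in output_names.
    state = tf.keras.layers.Input(shape=(4,))
    hidden = tf.keras.layers.Dense(16, activation=tf.nn.relu)(state)
    action_prob = tf.keras.layers.Dense(
        self.n_actions, activation=tf.nn.softmax)(hidden)
    value = tf.keras.layers.Dense(1)(hidden)
    return tf.keras.Model(inputs=state, outputs=[action_prob, value])

# Assumed usage, mirroring the tests below (env is any dc.rl.Environment):
#   a3c = dc.rl.A3C(env, SimpleDiscretePolicy(env.n_actions))
#   a3c.fit(10000)
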
+4 −5
@@ -63,7 +63,6 @@ class Environment(object):
        self._state_dtype = numpy.float32
    else:
      self._state_dtype = state_dtype
    print(self._state_dtype)

  @property
  def state(self):
@@ -186,11 +185,11 @@ class Policy(object):
  or even on different computers.
  """

  def create_model(self, **kwargs):
    raise NotImplementedError("Subclasses must implement this")
  def __init__(self, output_names, rnn_initial_states=[]):
    self.output_names = output_names
    self.rnn_initial_states = rnn_initial_states

  @property
  def output_names(self):
  def create_model(self, **kwargs):
    raise NotImplementedError("Subclasses must implement this")

  def create_layers(self, state, **kwargs):
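
Under the new constructor, a recurrent policy declares its initial RNN state and an extra 'rnn_state' output, which A3C feeds back in between steps and resets when the environment terminates. A rough sketch, closely following the GRU-based TestPolicy in the test changes later in this diff (the hidden size of 10 and the single GRU layer are simply the values used there):

import numpy as np
import tensorflow as tf
import deepchem as dc
from tensorflow.keras.layers import Input, GRU, Reshape, Softmax

class RecurrentPolicy(dc.rl.Policy):
  """Sketch of a policy that carries GRU state between calls."""

  def __init__(self):
    # The extra 'rnn_state' output marks the tensor holding the final RNN state,
    # and rnn_initial_states supplies the value to use at the start of a rollout.
    super(RecurrentPolicy, self).__init__(
        ['action_prob', 'value', 'rnn_state'], [np.zeros(10)])

  def create_model(self, **kwargs):
    state = Input(shape=(10,))
    rnn_state = Input(shape=(10,))
    reshaped = Reshape((1, 10))(state)
    gru, rnn_final_state = GRU(
        10, return_state=True, return_sequences=True)(
            reshaped, initial_state=rnn_state)
    action_prob = Softmax()(gru)
    value = dc.models.layers.Variable([0.0])([])
    return tf.keras.Model(
        inputs=[state, rnn_state],
        outputs=[action_prob, value, rnn_final_state])
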
+78 −108
@@ -2,7 +2,6 @@

from deepchem.models import KerasModel
from deepchem.models.tensorgraph.optimizers import Adam
#from deepchem.models.tensorgraph.layers import Feature, Weights, Label, Layer
import numpy as np
import tensorflow as tf
import collections
@@ -16,7 +15,8 @@ import threading
class A3CLossDiscrete(object):
  """This layer computes the loss function for A3C with discrete action spaces."""

  def __init__(self, value_weight, entropy_weight, action_prob_index, value_index, **kwargs):
  def __init__(self, value_weight, entropy_weight, action_prob_index,
               value_index, **kwargs):
    self.value_weight = value_weight
    self.entropy_weight = entropy_weight
    self.action_prob_index = action_prob_index
@@ -27,7 +27,7 @@ class A3CLossDiscrete(object):
    value = outputs[self.value_index]
    reward, advantage = weights
    action = labels[0]
    # reward, action, prob, value, advantage = inputs
    advantage = tf.expand_dims(advantage, axis=1)
    prob = prob + np.finfo(np.float32).eps
    log_prob = tf.log(prob)
    policy_loss = -tf.reduce_mean(
@@ -40,7 +40,8 @@ class A3CLossDiscrete(object):
class A3CLossContinuous(object):
  """This layer computes the loss function for A3C with continuous action spaces."""

  def __init__(self, value_weight, entropy_weight, mean_index, std_index, value_index, **kwargs):
  def __init__(self, value_weight, entropy_weight, mean_index, std_index,
               value_index, **kwargs):
    self.value_weight = value_weight
    self.entropy_weight = entropy_weight
    self.mean_index = mean_index
@@ -53,7 +54,6 @@ class A3CLossContinuous(object):
    value = outputs[self.value_index]
    reward, advantage = weights
    action = labels[0]
    # reward, action, mean, std, value, advantage = inputs
    distrib = tf.distributions.Normal(mean, std)
    reduce_axes = list(range(1, len(action.shape)))
    log_prob = tf.reduce_sum(distrib.log_prob(action), reduce_axes)
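
For reference, the objectives these two loss classes compute can be restated outside of TensorFlow roughly as follows. The epsilon and policy-gradient terms match the snippets visible above; the value, entropy, and weighting terms are the standard A3C objective, so treat this as an approximation of the actual TF implementation rather than a copy of it.

import numpy as np

def discrete_a3c_loss(prob, value, action_one_hot, reward, advantage,
                      value_weight, entropy_weight):
  # prob, action_one_hot: (batch, n_actions); value, reward, advantage: (batch,)
  eps = np.finfo(np.float32).eps
  log_prob = np.log(prob + eps)
  # Policy-gradient term: -E[advantage * log pi(a|s)]
  policy_loss = -np.mean(advantage * np.sum(action_one_hot * log_prob, axis=1))
  # Value term: squared error against the discounted reward.
  value_loss = np.mean((reward - value)**2)
  # Entropy bonus encourages exploration.
  entropy = -np.mean(np.sum(prob * log_prob, axis=1))
  return policy_loss + value_weight * value_loss - entropy_weight * entropy

def continuous_a3c_loss(mean, std, value, action, reward, advantage,
                        value_weight, entropy_weight):
  # mean, std, action: (batch, action_dims); value, reward, advantage: (batch,)
  # Log-density of the action under a diagonal Gaussian, summed over action dims.
  log_prob = np.sum(
      -0.5 * ((action - mean) / std)**2 - np.log(std) - 0.5 * np.log(2 * np.pi),
      axis=1)
  policy_loss = -np.mean(advantage * log_prob)
  value_loss = np.mean((reward - value)**2)
  # Entropy of a diagonal Gaussian: 0.5 * log(2 * pi * e * sigma^2) per dimension.
  entropy = np.mean(np.sum(0.5 * np.log(2 * np.pi * np.e * std**2), axis=1))
  return policy_loss + value_weight * value_loss - entropy_weight * entropy
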
@@ -166,19 +166,19 @@ class A3C(object):
    output_names = policy.output_names
    self._value = self._model._output_tensors[output_names.index('value')]
    if self.continuous:
      self._action_mean = self._model._output_tensors[output_names.index('action_mean')]
      self._action_std = self._model._output_tensors[output_names.index('action_std')]
      self._action_mean = self._model._output_tensors[output_names.index(
          'action_mean')]
      self._action_std = self._model._output_tensors[output_names.index(
          'action_std')]
    else:
      self._action_prob = self._model._output_tensors[output_names.index('action_prob')]
    # if self.continuous:
    #   (self._graph, self._features, self._rewards, self._actions,
    #    self._action_mean, self._action_std, self._value,
    #    self._advantages) = fields
    # else:
    #   (self._graph, self._features, self._rewards, self._actions,
    #    self._action_prob, self._value, self._advantages) = fields
      self._action_prob = self._model._output_tensors[output_names.index(
          'action_prob')]
    rnn_outputs = [i for i, n in enumerate(output_names) if n == 'rnn_state']
    self._rnn_final_states = [
        self._model._output_tensors[i] for i in rnn_outputs
    ]
    self._session = self._model.session
    # self._rnn_states = self._graph.rnn_zero_states
    self._rnn_states = policy.rnn_initial_states
    with tf.variable_scope('global'):
      self._checkpoint = tf.train.Checkpoint()
      self._checkpoint.save_counter  # Ensure the variable has been created
@@ -194,57 +194,41 @@ class A3C(object):
      state_dtype = [state_dtype]
    features = []
    for s, d in zip(state_shape, state_dtype):
      features.append(tf.keras.layers.Input(shape=list(s), dtype=tf.as_dtype(d)))
    # policy_layers = self._policy.create_layers(features)
    # value = policy_layers['value']
    # rewards = Weights(shape=(None,))
    # advantages = Weights(shape=(None,))
      features.append(
          tf.keras.layers.Input(shape=list(s), dtype=tf.as_dtype(d)))
    policy_model = self._policy.create_model()
    # graph = TensorGraph(
    #     batch_size=self.max_rollout_length,
    #     use_queue=False,
    #     graph=tf_graph,
    #     model_dir=model_dir)
    # for f in features:
    #   graph._add_layer(f)
    output_names = self._policy.output_names
    if 'action_prob' in output_names:
      self.continuous = False
      # action_prob = policy_layers['action_prob']
      # actions = Label(shape=(None, self._env.n_actions))
      loss = A3CLossDiscrete(
          self.value_weight,
          self.entropy_weight,
      loss = A3CLossDiscrete(self.value_weight, self.entropy_weight,
                             output_names.index('action_prob'),
                             output_names.index('value'))
      # graph.add_output(action_prob)
    else:
      self.continuous = True
      # action_mean = policy_layers['action_mean']
      # action_std = policy_layers['action_std']
      # actions = Label(shape=[None] + list(self._env.action_shape))
      loss = A3CLossContinuous(
          self.value_weight,
          self.entropy_weight,
      loss = A3CLossContinuous(self.value_weight, self.entropy_weight,
                               output_names.index('action_mean'),
                               output_names.index('action_std'),
                               output_names.index('value'))
    # graph.set_optimizer(self._optimizer)
    # with tf.variable_scope(scope):
    #   graph.build()
    model = KerasModel(policy_model, loss, batch_size=self.max_rollout_length, model_dir=model_dir, optimize=self._optimizer)
    # if self.continuous:
    #   return graph, features, rewards, actions, action_mean, action_std, value, advantages
    # else:
    #   return graph, features, rewards, actions, action_prob, value, advantages
    model = KerasModel(
        policy_model,
        loss,
        batch_size=self.max_rollout_length,
        model_dir=model_dir,
        optimize=self._optimizer)
    env = self._env
    example_inputs = [np.zeros([model.batch_size]+list(shape), dtype) for shape,dtype in zip(state_shape, state_dtype)]
    example_inputs = [
        np.zeros([model.batch_size] + list(shape), dtype)
        for shape, dtype in zip(state_shape, state_dtype)
    ]
    if self.continuous:
      example_labels = [np.zeros([model.batch_size]+list(env.action_shape), np.float32)]
      example_labels = [
          np.zeros([model.batch_size] + list(env.action_shape), np.float32)
      ]
    else:
      example_labels = [np.zeros((model.batch_size, env.n_actions), np.float32)]
    example_weights = [np.zeros(model.batch_size, np.float32)] * 2
    model._create_training_ops((example_inputs, example_labels, example_weights))
    model._create_training_ops((example_inputs, example_labels,
                                example_weights))
    return model

  def fit(self,
@@ -278,8 +262,7 @@ class A3C(object):
      self.restore()
    for worker in workers:
      thread = threading.Thread(
          name=worker.scope,
          target=lambda: worker.run(step_count, total_steps))
          name=worker.scope, target=lambda: worker.run(step_count, total_steps))
      threads.append(thread)
      thread.start()
    manager = tf.train.CheckpointManager(
@@ -371,31 +354,24 @@ class A3C(object):
      raise ValueError('No checkpoint found')
    self._checkpoint.restore(last_checkpoint).run_restore_ops(self._session)

  def _create_feed_dict(self, state, use_saved_states):
    """Create a feed dict for use by predict() or select_action()."""
    feed_dict = dict((f, np.expand_dims(s, axis=0))
                     for f, s in zip(self._model._input_placeholders, state))
    # if use_saved_states:
    #   rnn_states = self._rnn_states
    # else:
    #   rnn_states = self._graph.rnn_zero_states
    # for (placeholder, value) in zip(self._graph.rnn_initial_states, rnn_states):
    #   feed_dict[placeholder] = value
    return feed_dict

  def _predict_outputs(self, outputs, state, use_saved_states, save_states):
    """Compute a set of outputs for a state. """
    if not self._state_is_list:
      state = [state]
    feed_dict = self._create_feed_dict(state, use_saved_states)
    if use_saved_states:
      state = state + list(self._rnn_states)
    else:
      state = state + list(self._policy.rnn_initial_states)
    feed_dict = dict((f, np.expand_dims(s, axis=0))
                     for f, s in zip(self._model._input_placeholders, state))
    tensors = outputs
    if save_states:
      tensors = outputs + self._rnn_final_states
    else:
      tensors = outputs
    # if save_states:
    #   tensors = outputs + self._model.rnn_final_states
    # else:
    #   tensors = outputs
    results = self._session.run(tensors, feed_dict=feed_dict)
    # if save_states:
    #   self._rnn_states = results[len(outputs):]
    if save_states:
      self._rnn_states = [np.squeeze(r, 0) for r in results[len(outputs):]]
    return results[:len(outputs)]

  def _select_action_from_outputs(self, outputs, deterministic):
@@ -428,23 +404,21 @@ class _Worker(object):
    output_names = a3c._policy.output_names
    self.value = self.model._output_tensors[output_names.index('value')]
    if a3c.continuous:
      self.action_mean = self.model._output_tensors[output_names.index('action_mean')]
      self.action_std = self.model._output_tensors[output_names.index('action_std')]
    #   self.graph, self.features, self.rewards, self.actions, self.action_mean, self.action_std, self.value, self.advantages = fields
      self.action_mean = self.model._output_tensors[output_names.index(
          'action_mean')]
      self.action_std = self.model._output_tensors[output_names.index(
          'action_std')]
    else:
      self.action_prob = self.model._output_tensors[output_names.index('action_prob')]
    #   self.graph, self.features, self.rewards, self.actions, self.action_prob, self.value, self.advantages = fields
    # self.rnn_states = self.graph.rnn_zero_states
      self.action_prob = self.model._output_tensors[output_names.index(
          'action_prob')]
    rnn_outputs = [i for i, n in enumerate(output_names) if n == 'rnn_state']
    self.rnn_final_states = [self.model._output_tensors[i] for i in rnn_outputs]
    self.rnn_states = a3c._policy.rnn_initial_states
    local_vars = self.model.model.trainable_variables
    global_vars = a3c._model.model.trainable_variables
    # local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
    #                                self.scope)
    # global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
    #                                 'global')
    gradients = tf.gradients(self.model._loss_tensor, local_vars)
    grads_and_vars = list(zip(gradients, global_vars))
    self.train_op = a3c._model._tf_optimizer.apply_gradients(
        grads_and_vars)
    self.train_op = a3c._model._tf_optimizer.apply_gradients(grads_and_vars)
    self.update_local_variables = tf.group(
        *[tf.assign(v1, v2) for v1, v2 in zip(local_vars, global_vars)])
    self.global_step = self.model.get_global_step()
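
The two ops built above are the heart of the asynchronous update: gradients of the loss are taken with respect to the worker's local variables but applied to the shared global variables, and a separate op copies the global values back into the local copy before each rollout. A condensed TF1-style restatement (the helper name and standalone form are illustrative, not part of the code):

import tensorflow as tf

def make_worker_ops(loss, local_vars, global_vars, optimizer):
  # Differentiate the worker's loss w.r.t. its own copy of the weights...
  gradients = tf.gradients(loss, local_vars)
  # ...but apply the resulting gradients to the shared global weights.
  train_op = optimizer.apply_gradients(list(zip(gradients, global_vars)))
  # Op that resynchronizes the local copy from the global weights.
  update_local = tf.group(
      *[tf.assign(local, glob) for local, glob in zip(local_vars, global_vars)])
  return train_op, update_local
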
@@ -452,13 +426,13 @@ class _Worker(object):
  def run(self, step_count, total_steps):
    while step_count[0] < total_steps:
      self.a3c._session.run(self.update_local_variables)
      initial_rnn_states = None#self.rnn_states
      initial_rnn_states = self.rnn_states
      states, actions, rewards, values = self.create_rollout()
      self.process_rollout(states, actions, rewards, values,
                           initial_rnn_states, step_count[0])
      self.process_rollout(states, actions, rewards, values, initial_rnn_states,
                           step_count[0])
      if self.a3c.use_hindsight:
        self.process_rollout_with_hindsight(states, actions,
                                            initial_rnn_states, step_count[0])
        self.process_rollout_with_hindsight(states, actions, initial_rnn_states,
                                            step_count[0])
      step_count[0] += len(actions)

  def create_rollout(self):
@@ -482,10 +456,10 @@ class _Worker(object):
        tensors = [self.action_mean, self.action_std, self.value]
      else:
        tensors = [self.action_prob, self.value]
      # results = session.run(tensors + self.graph.rnn_final_states, feed_dict=feed_dict)
      results = session.run(tensors, feed_dict=feed_dict)
      results = session.run(
          tensors + self.rnn_final_states, feed_dict=feed_dict)
      value = results[len(tensors) - 1]
      # self.rnn_states = results[len(tensors):]
      self.rnn_states = [np.squeeze(r, 0) for r in results[len(tensors):]]
      action = self.a3c._select_action_from_outputs(results[:len(tensors) - 1],
                                                    False)
      actions.append(action)
@@ -503,7 +477,7 @@ class _Worker(object):
    values.append(final_value)
    if self.env.terminated:
      self.env.reset()
      # self.rnn_states = self.graph.rnn_zero_states
      self.rnn_states = self.a3c._policy.rnn_initial_states
    return states, actions, np.array(
        rewards, dtype=np.float32), np.array(
            values, dtype=np.float32)
@@ -551,11 +525,11 @@ class _Worker(object):
    # Build the feed dict and apply gradients.

    feed_dict = {}
    # for placeholder, value in zip(self.graph.rnn_initial_states,
    #                               initial_rnn_states):
    #   feed_dict[placeholder] = value
    for f, s in zip(self.model._input_placeholders, state_arrays):
      feed_dict[f] = s
    for f, s in zip(self.model._input_placeholders[len(state_arrays):],
                    initial_rnn_states):
      feed_dict[f] = np.expand_dims(s, axis=0)
    feed_dict[self.model._weights_placeholders[0]] = discounted_rewards
    feed_dict[self.model._label_placeholders[0]] = actions_matrix
    feed_dict[self.model._weights_placeholders[1]] = advantages
@@ -574,10 +548,8 @@ class _Worker(object):
          state_arrays[j].append(state[j])
    else:
      state_arrays = [hindsight_states]
    state_arrays += initial_rnn_states
    feed_dict = {}
    # for placeholder, value in zip(self.graph.rnn_initial_states,
    #                               initial_rnn_states):
    #   feed_dict[placeholder] = value
    for f, s in zip(self.model._input_placeholders, state_arrays):
      feed_dict[f] = s
    values = self.a3c._session.run(self.value, feed_dict=feed_dict)
@@ -589,9 +561,7 @@ class _Worker(object):
    """Create a feed dict for use during a rollout."""
    if not self.a3c._state_is_list:
      state = [state]
    state = state + self.rnn_states
    feed_dict = dict((f, np.expand_dims(s, axis=0))
                     for f, s in zip(self.model._input_placeholders, state))
    # for (placeholder, value) in zip(self.graph.rnn_initial_states,
    #                                 self.rnn_states):
    #   feed_dict[placeholder] = value
    return feed_dict
+36 −42
from flaky import flaky

import deepchem as dc
from deepchem.models.tensorgraph.layers import Reshape, Variable, SoftMax, GRU, Dense, Constant
#from deepchem.models.tensorgraph.layers import Reshape, Variable, SoftMax, GRU, Dense, Constant
from deepchem.models.tensorgraph.optimizers import Adam, PolynomialDecay
from tensorflow.keras.layers import Input, Dense, GRU, Reshape, Softmax
import numpy as np
import tensorflow as tf
import unittest
@@ -48,23 +49,24 @@ class TestA3C(unittest.TestCase):

    class TestPolicy(dc.rl.Policy):

      def __init__(self):
        super(TestPolicy, self).__init__(['action_prob', 'value'])

      def create_model(self, **kwargs):

        class TestModel(tf.keras.Model):

          def __init__(self):
            super(TestModel, self).__init__(**kwargs)
            self.action = tf.Variable(np.ones(env.n_actions, np.float32))
            self.value = tf.Variable([0.0], tf.float32)

          def call(self, inputs, **kwargs):
            prob = tf.nn.softmax(tf.reshape(self.action, (-1, env.n_actions)))
            return (prob, self.value)

        return TestModel()

      @property
      def output_names(self):
        return ['action_prob', 'value']

    # Optimize it.

    a3c = dc.rl.A3C(
@@ -119,14 +121,21 @@ class TestA3C(unittest.TestCase):

    class TestPolicy(dc.rl.Policy):

      def create_layers(self, state, **kwargs):
      def __init__(self):
        super(TestPolicy, self).__init__(['action_prob', 'value', 'rnn_state'],
                                         [np.zeros(10)])

        reshaped = Reshape(shape=(1, -1, 10), in_layers=state)
        gru = GRU(n_hidden=10, batch_size=1, in_layers=reshaped)
        output = SoftMax(
            in_layers=[Reshape(in_layers=[gru], shape=(-1, env.n_actions))])
        value = Variable([0.0])
        return {'action_prob': output, 'value': value}
      def create_model(self, **kwargs):
        state = Input(shape=(10,))
        rnn_state = Input(shape=(10,))
        reshaped = Reshape((1, 10))(state)
        gru, rnn_final_state = GRU(
            10, return_state=True, return_sequences=True)(
                reshaped, initial_state=rnn_state)
        output = Softmax()(gru)
        value = dc.models.layers.Variable([0.0])([])
        return tf.keras.Model(
            inputs=[state, rnn_state], outputs=[output, value, rnn_final_state])

    # We don't care about actually optimizing it, so just run a few rollouts to make
    # sure fit() doesn't crash, then check the behavior of the GRU state.
@@ -206,33 +215,17 @@ class TestA3C(unittest.TestCase):

    class TestPolicy(dc.rl.Policy):

      def __init__(self):
        super(TestPolicy, self).__init__(['action_prob', 'value'])

      def create_model(self, **kwargs):
        state = tf.keras.layers.Input(shape=(4,))
        dense1 = tf.keras.layers.Dense(6, activation=tf.nn.relu)(state)
        dense2 = tf.keras.layers.Dense(6, activation=tf.nn.relu)(dense1)
        output = tf.keras.layers.Dense(
            4,
            activation=tf.nn.softmax,
            use_bias=False)(dense2)
        value = tf.keras.layers.Dense(1)(dense2)
        state = Input(shape=(4,))
        dense1 = Dense(6, activation=tf.nn.relu)(state)
        dense2 = Dense(6, activation=tf.nn.relu)(dense1)
        output = Dense(4, activation=tf.nn.softmax, use_bias=False)(dense2)
        value = Dense(1)(dense2)
        return tf.keras.Model(inputs=state, outputs=[output, value])

      @property
      def output_names(self):
        return ['action_prob', 'value']

      def create_layers(self, state, **kwargs):

        dense1 = Dense(6, activation_fn=tf.nn.relu, in_layers=state)
        dense2 = Dense(6, activation_fn=tf.nn.relu, in_layers=dense1)
        output = Dense(
            4,
            activation_fn=tf.nn.softmax,
            biases_initializer=None,
            in_layers=dense2)
        value = Dense(1, in_layers=dense2)
        return {'action_prob': output, 'value': value}

    # Optimize it.

    env = TestEnvironment()
@@ -289,23 +282,24 @@ class TestA3C(unittest.TestCase):

    class TestPolicy(dc.rl.Policy):

      def __init__(self):
        super(TestPolicy, self).__init__(['action_mean', 'action_std', 'value'])

      def create_model(self, **kwargs):

        class TestModel(tf.keras.Model):

          def __init__(self):
            super(TestModel, self).__init__(**kwargs)
            self.mean = tf.keras.layers.Dense(1, kernel_initializer='zeros')
            self.mean = Dense(1, kernel_initializer='zeros')
            self.std = tf.constant([10.0])
            self.value = tf.keras.layers.Dense(1)
            self.value = Dense(1)

          def call(self, inputs, **kwargs):
            return (self.mean(inputs), self.std, self.value(inputs))

        return TestModel()

      @property
      def output_names(self):
        return ['action_mean', 'action_std', 'value']

    # Optimize it.

    env = TestEnvironment()
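
Taken together, a typical way to exercise the recurrent path this commit fixes would look roughly like the sketch below. The class and method names come from the code above; the specific argument values are illustrative, and the use_saved_states/save_states flags reflect the parameters threaded through _predict_outputs() and _create_feed_dict(), so the exact public signatures should be treated as assumptions.

import deepchem as dc

# Assumes the TestEnvironment and recurrent TestPolicy defined in the GRU test above.
env = TestEnvironment()
a3c = dc.rl.A3C(env, TestPolicy(), max_rollout_length=20)
a3c.fit(1000)  # runs the asynchronous worker threads

# Acting with saved RNN state: each call feeds the final state saved by the
# previous call, then stores the new one.
action = a3c.select_action(env.state, use_saved_states=True, save_states=True)

# Starting a fresh sequence from policy.rnn_initial_states instead:
outputs = a3c.predict(env.state, use_saved_states=False)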