Commit 11220926 authored by peastman

Updated documentation and minor code cleanup

parent 515e01ed
+41 −20
@@ -172,13 +172,28 @@ class GymEnvironment(Environment):
class Policy(object):
  """A policy for taking actions within an environment.

  A policy is defined by a set of TensorGraph Layer objects that perform the
  necessary calculations.  There are many algorithms for reinforcement learning,
  and they differ in what values they require a policy to compute.  That makes
  it impossible to define a single interface allowing any policy to be optimized
  with any algorithm.  Instead, this interface just tries to be as flexible and
  generic as possible.  Each algorithm must document what values it expects
  create_layers() to return.
  A policy is defined by a tf.keras.Model that takes the current state as input
  and performs the necessary calculations.  There are many algorithms for
  reinforcement learning, and they differ in what values they require a policy to
  compute.  That makes it impossible to define a single interface allowing any
  policy to be optimized with any algorithm.  Instead, this interface just tries
  to be as flexible and generic as possible.  Each algorithm must document what
  values it expects the model to output.

  Special handling is needed for models that include recurrent layers.  In that
  case, the model has its own internal state which the learning algorithm must
  be able to specify and query.  To support this, the Policy must do three things:

  1. The Model must take additional inputs that specify the initial states of
     all its recurrent layers.  These will be appended to the list of arrays
     specifying the environment state.

  2. The Model must also return the final states of all its recurrent layers as
     outputs.

  3. The constructor argument rnn_initial_states must be specified to define
     the states to use for the Model's recurrent layers at the start of a new
     rollout.
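
  For example, a policy wrapping a single GRU layer might be sketched roughly as
  follows.  The layer sizes, the environment state shape, and the output names
  other than 'rnn_state' are assumptions made up for the illustration; each
  algorithm defines which outputs it actually requires.

    import numpy as np
    import tensorflow as tf

    class GRUPolicy(Policy):
      """Hypothetical policy with one GRU layer whose state the algorithm manages."""

      def __init__(self):
        # 'rnn_state' marks the output carrying the GRU's final state, and
        # rnn_initial_states supplies its value at the start of each rollout.
        super(GRUPolicy, self).__init__(
            output_names=['action_prob', 'value', 'rnn_state'],
            rnn_initial_states=[np.zeros(16, dtype=np.float32)])

      def create_model(self, **kwargs):
        state = tf.keras.Input(shape=(10,))        # one input per environment state array
        rnn_initial = tf.keras.Input(shape=(16,))  # appended input: initial GRU state
        sequence = tf.keras.layers.Reshape((1, 10))(state)
        hidden, final_state = tf.keras.layers.GRU(16, return_state=True)(
            sequence, initial_state=rnn_initial)
        action_prob = tf.keras.layers.Dense(4, activation='softmax')(hidden)
        value = tf.keras.layers.Dense(1)(hidden)
        # Outputs are listed in the same order as output_names above.
        return tf.keras.Model(inputs=[state, rnn_initial],
                              outputs=[action_prob, value, final_state])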

  Policy objects should be written to support pickling.  Many algorithms involve
  creating multiple copies of the Policy, possibly running in different processes
@@ -186,23 +201,29 @@ class Policy(object):
  """

  def __init__(self, output_names, rnn_initial_states=[]):
    """Subclasses should call the superclass constructor in addition to doing
    their own initialization.

    Parameters
    ----------
    output_names: list of strings
      the names of the Model's outputs, in order.  It is up to each reinforcement
      learning algorithm to document what outputs it expects policies to compute.
      Outputs that return the final states of recurrent layers should have the
      name 'rnn_state'.
    rnn_initial_states: list of NumPy arrays
      the initial states of the Model's recurrent layers at the start of a new
      rollout
    """
    self.output_names = output_names
    self.rnn_initial_states = rnn_initial_states

  def create_model(self, **kwargs):
    raise NotImplementedError("Subclasses must implement this")

  def create_layers(self, state, **kwargs):
    """Create the TensorGraph Layers that define the policy.

    The arguments always include a list of Feature layers representing the current
    state of the environment (one layer for each array in the state).  Depending on
    the algorithm being used, other arguments might get passed as well.  It is up
    to each algorithm to document that.
    """Construct and return a tf.keras.Model that computes the policy.

    This method should construct and return a dict that maps strings to Layer
    objects.  Each algorithm must document what Layers it expects the policy to
    create.  If this method is called multiple times, it should create a new set
    of Layers every time.
    The inputs to the model consist of the arrays representing the current state
    of the environment, followed by the initial states for all recurrent layers.
    Depending on the algorithm being used, other inputs might get passed as
    well.  It is up to each algorithm to document that.
    """
    raise NotImplementedError("Subclasses must implement this")
+26 −33
@@ -13,7 +13,7 @@ import threading


class A3CLossDiscrete(object):
  """This layer computes the loss function for A3C with discrete action spaces."""
  """This class computes the loss function for A3C with discrete action spaces."""

  def __init__(self, value_weight, entropy_weight, action_prob_index,
               value_index, **kwargs):
@@ -38,7 +38,7 @@ class A3CLossDiscrete(object):


class A3CLossContinuous(object):
  """This layer computes the loss function for A3C with continuous action spaces."""
  """This class computes the loss function for A3C with continuous action spaces."""

  def __init__(self, value_weight, entropy_weight, mean_index, std_index,
               value_index, **kwargs):
@@ -71,7 +71,7 @@ class A3C(object):
  (https://arxiv.org/abs/1602.01783).  This class supports environments with both discrete and
  continuous action spaces.  For discrete action spaces, the "action" argument passed to the
  environment is an integer giving the index of the action to perform.  The policy must output
  a vector called "action_prob" giving the probability of taking each action.  For continous
  a vector called "action_prob" giving the probability of taking each action.  For continuous
  action spaces, the action is an array where each element is chosen independently from a
  normal distribution.  The policy must output two arrays of the same shape: "action_mean"
  gives the mean value for each element, and "action_std" gives the standard deviation for
@@ -129,8 +129,8 @@ class A3C(object):
    env: Environment
      the Environment to interact with
    policy: Policy
      the Policy to optimize.  Its create_layers() method must return a dict containing the
      keys 'action_prob' and 'value' (for discrete action spaces) or 'action_mean', 'action_std',
      the Policy to optimize.  It must have outputs with the names 'action_prob'
      and 'value' (for discrete action spaces) or 'action_mean', 'action_std',
      and 'value' (for continuous action spaces)
    max_rollout_length: int
      the maximum length of rollouts to generate
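
    For illustration, a policy meeting the discrete-action contract above could
    be sketched as follows; the state shape, hidden layer width, and four-action
    space are assumptions made for the example.

      import tensorflow as tf

      class DiscretePolicy(Policy):
        """Hypothetical policy producing the 'action_prob' and 'value' outputs."""

        def __init__(self):
          super(DiscretePolicy, self).__init__(output_names=['action_prob', 'value'])

        def create_model(self, **kwargs):
          state = tf.keras.Input(shape=(8,))  # assumed environment state shape
          hidden = tf.keras.layers.Dense(64, activation='relu')(state)
          action_prob = tf.keras.layers.Dense(4, activation='softmax')(hidden)
          value = tf.keras.layers.Dense(1)(hidden)
          return tf.keras.Model(inputs=state, outputs=[action_prob, value])

      a3c = A3C(env, DiscretePolicy(), max_rollout_length=20)  # 'env': an Environment defined elsewhere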
@@ -162,30 +162,25 @@ class A3C(object):
      self._optimizer = Adam(learning_rate=0.001, beta1=0.9, beta2=0.999)
    else:
      self._optimizer = optimizer
    self._model = self._build_graph('global', model_dir)
    self._model = self._build_model(model_dir)
    output_names = policy.output_names
    self._value = self._model._output_tensors[output_names.index('value')]
    output_tensors = self._model._output_tensors
    self._value = output_tensors[output_names.index('value')]
    if self.continuous:
      self._action_mean = self._model._output_tensors[output_names.index(
          'action_mean')]
      self._action_std = self._model._output_tensors[output_names.index(
          'action_std')]
      self._action_mean = output_tensors[output_names.index('action_mean')]
      self._action_std = output_tensors[output_names.index('action_std')]
    else:
      self._action_prob = self._model._output_tensors[output_names.index(
          'action_prob')]
      self._action_prob = output_tensors[output_names.index('action_prob')]
    rnn_outputs = [i for i, n in enumerate(output_names) if n == 'rnn_state']
    self._rnn_final_states = [
        self._model._output_tensors[i] for i in rnn_outputs
    ]
    self._rnn_final_states = [output_tensors[i] for i in rnn_outputs]
    self._session = self._model.session
    self._rnn_states = policy.rnn_initial_states
    with tf.variable_scope('global'):
    self._checkpoint = tf.train.Checkpoint()
    self._checkpoint.save_counter  # Ensure the variable has been created
    self._checkpoint.listed = self._model.model.trainable_variables
    self._session.run(self._checkpoint.save_counter.initializer)

  def _build_graph(self, scope, model_dir):
  def _build_model(self, model_dir):
    """Construct a KerasModel containing the policy and loss calculations."""
    state_shape = self._env.state_shape
    state_dtype = self._env.state_dtype
@@ -285,12 +280,12 @@ class A3C(object):

    Parameters
    ----------
    state: array
    state: array or list of arrays
      the state of the environment for which to generate predictions
    use_saved_states: bool
      if True, the states most recently saved by a previous call to predict() or select_action()
      will be used as the initial states.  If False, the internal states of all recurrent layers
      will be set to all zeros before computing the predictions.
      will be set to the initial values defined by the policy before computing the predictions.
    save_states: bool
      if True, the internal states of all recurrent layers at the end of the calculation
      will be saved, and any previously saved states will be discarded.  If False, the
@@ -320,7 +315,7 @@ class A3C(object):

    Parameters
    ----------
    state: array
    state: array or list of arrays
      the state of the environment for which to select an action
    deterministic: bool
      if True, always return the best action (that is, the one with highest probability).
@@ -328,7 +323,7 @@ class A3C(object):
    use_saved_states: bool
      if True, the states most recently saved by a previous call to predict() or select_action()
      will be used as the initial states.  If False, the internal states of all recurrent layers
      will be set to all zeros before computing the predictions.
      will be set to the initial values defined by the policy before computing the predictions.
    save_states: bool
      if True, the internal states of all recurrent layers at the end of the calculation
      will be saved, and any previously saved states will be discarded.  If False, the
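
      For instance, a simple evaluation rollout might drive these flags as shown
      below, assuming a trained A3C instance a3c and an Environment env exposing
      the usual state, step(), and terminated members:

        env.reset()
        while not env.terminated:
          # Saved recurrent states carry over from one call to the next, so each
          # step sees the history of the rollout; deterministic=True picks the
          # most probable action instead of sampling one.
          action = a3c.select_action(env.state, deterministic=True,
                                     use_saved_states=True, save_states=True)
          env.step(action)

      predict() accepts the same use_saved_states and save_states arguments when
      only the policy's outputs are needed rather than a sampled action.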
@@ -400,19 +395,17 @@ class _Worker(object):
    self.scope = 'worker%d' % index
    self.env = copy.deepcopy(a3c._env)
    self.env.reset()
    self.model = a3c._build_graph(self.scope, None)
    self.model = a3c._build_model(None)
    output_names = a3c._policy.output_names
    self.value = self.model._output_tensors[output_names.index('value')]
    output_tensors = self.model._output_tensors
    self.value = output_tensors[output_names.index('value')]
    if a3c.continuous:
      self.action_mean = self.model._output_tensors[output_names.index(
          'action_mean')]
      self.action_std = self.model._output_tensors[output_names.index(
          'action_std')]
      self.action_mean = output_tensors[output_names.index('action_mean')]
      self.action_std = output_tensors[output_names.index('action_std')]
    else:
      self.action_prob = self.model._output_tensors[output_names.index(
          'action_prob')]
      self.action_prob = output_tensors[output_names.index('action_prob')]
    rnn_outputs = [i for i, n in enumerate(output_names) if n == 'rnn_state']
    self.rnn_final_states = [self.model._output_tensors[i] for i in rnn_outputs]
    self.rnn_final_states = [output_tensors[i] for i in rnn_outputs]
    self.rnn_states = a3c._policy.rnn_initial_states
    local_vars = self.model.model.trainable_variables
    global_vars = a3c._model.model.trainable_variables