Commit 71962e35 authored by peastman's avatar peastman
Browse files

Cleanup and documentation

parent f05c4e19
Loading
Loading
Loading
Loading
+45 −7
Original line number Diff line number Diff line
@@ -10,20 +10,47 @@ class Environment(object):

  An environment has a current state, which is represented as either a single NumPy
  array, or optionally a list of NumPy arrays.  When an action is taken, that causes
  the state to be updated.  Exactly what is meant by an "action" is defined by each
  subclass.  As far as this interface is concerned, it is simply an arbitrary object.
  The environment also computes a reward for each action, and reports when the task
  has been terminated (meaning that no more actions may be taken).
  the state to be updated.  The environment also computes a reward for each action,
  and reports when the task has been terminated (meaning that no more actions may
  be taken).

  Two types of actions are supported.  For environments with discrete action spaces,
  the action is an integer specifying the index of the action to perform (out of a
  fixed list of possible actions).  For environments with continuous action spaces,
  the action is a NumPy array.

  Environment objects should be written to support pickle and deepcopy operations.
  Many algorithms involve creating multiple copies of the Environment, possibly
  running in different processes or even on different computers.
  """

  def __init__(self, state_shape, n_actions, state_dtype=None):
    """Subclasses should call the superclass constructor in addition to doing their own initialization."""
  def __init__(self,
               state_shape,
               n_actions=None,
               state_dtype=None,
               action_shape=None):
    """Subclasses should call the superclass constructor in addition to doing their own initialization.

    A value should be provided for either n_actions (for discrete action spaces)
    or action_shape (for continuous action spaces), but not both.

    Parameters
    ----------
    state_shape: tuple or list of tuples
      the shape(s) of the array(s) making up the state
    n_actions: int
      the number of discrete actions that can be performed.  If the action space
      is continuous, this should be None.
    state_dtype: dtype or list of dtypes
      the type(s) of the array(s) making up the state.  If this is None, all
      arrays are assumed to be float32.
    action_shape: tuple
      the shape of the array describing an action.  If the action space
      is discrete, this should be None.
    """
    self._state_shape = state_shape
    self._n_actions = n_actions
    self._action_shape = action_shape
    self._state = None
    self._terminated = None
    if state_dtype is None:
@@ -74,9 +101,20 @@ class Environment(object):

  @property
  def n_actions(self):
    """The number of possible actions that can be performed in this Environment."""
    """The number of possible actions that can be performed in this Environment.

    If the environment uses a continuous action space, this returns None.
    """
    return self._n_actions

  @property
  def action_shape(self):
    """The expected shape of NumPy arrays representing actions.

    This is the value that was passed to the constructor as action_shape.
    If the environment uses a discrete action space, this returns None.
    """
    return self._action_shape

  def reset(self):
    """Initialize the environment in preparation for doing calculations with it.

+29 −20
Original line number Diff line number Diff line
@@ -62,18 +62,23 @@ class A3C(object):
  Implements the Asynchronous Advantage Actor-Critic (A3C) algorithm for reinforcement learning.

  The algorithm is described in Mnih et al, "Asynchronous Methods for Deep Reinforcement Learning"
  (https://arxiv.org/abs/1602.01783).  This class requires the policy to output two quantities:
  a vector giving the probability of taking each action, and an estimate of the value function for
  the current state.  It optimizes both outputs at once using a loss that is the sum of three terms:
  (https://arxiv.org/abs/1602.01783).  This class supports environments with both discrete and
  continuous action spaces.  For discrete action spaces, the "action" argument passed to the
  environment is an integer giving the index of the action to perform.  The policy must output
  a vector called "action_prob" giving the probability of taking each action.  For continuous
  action spaces, the action is an array where each element is chosen independently from a
  normal distribution.  The policy must output two arrays of the same shape: "action_mean"
  gives the mean value for each element, and "action_std" gives the standard deviation for
  each element.  In either case, the policy must also output a scalar called "value" which
  is an estimate of the value function for the current state.

  The algorithm optimizes all outputs at once using a loss that is the sum of three terms:

  1. The policy loss, which seeks to maximize the discounted reward for each action.
  2. The value loss, which tries to make the value estimate match the actual discounted reward
     that was attained at each step.
  3. An entropy term to encourage exploration.

  This class only supports environments with discrete action spaces, not continuous ones.  The
  "action" argument passed to the environment is an integer, giving the index of the action to perform.

  This class supports Generalized Advantage Estimation as described in Schulman et al., "High-Dimensional
  Continuous Control Using Generalized Advantage Estimation" (https://arxiv.org/abs/1506.02438).
  This is a method of trading off bias and variance in the advantage estimate, which can sometimes
@@ -119,7 +124,8 @@ class A3C(object):
      the Environment to interact with
    policy: Policy
      the Policy to optimize.  Its create_layers() method must return a dict containing the
      keys 'action_prob' and 'value', corresponding to the action probabilities and value estimate
      keys 'action_prob' and 'value' (for discrete action spaces) or 'action_mean', 'action_std',
      and 'value' (for continuous action spaces).
    max_rollout_length: int
      the maximum length of rollouts to generate
    discount_factor: float
@@ -153,7 +159,8 @@ class A3C(object):
    fields = self._build_graph(None, 'global', model_dir)
    if self.continuous:
      (self._graph, self._features, self._rewards, self._actions,
       self._action_mean, self._action_std, self._value, self._advantages) = fields
       self._action_mean, self._action_std, self._value,
       self._advantages) = fields
    else:
      (self._graph, self._features, self._rewards, self._actions,
       self._action_prob, self._value, self._advantages) = fields
@@ -195,11 +202,13 @@ class A3C(object):
      self.continuous = True
      action_mean = policy_layers['action_mean']
      action_std = policy_layers['action_std']
      actions = Label(shape=[None]+list(action_mean.shape))
      actions = Label(shape=[None] + list(self._env.action_shape))
      loss = A3CLossContinuous(
          self.value_weight,
          self.entropy_weight,
          in_layers=[rewards, actions, action_mean, action_std, value, advantages])
          in_layers=[
              rewards, actions, action_mean, action_std, value, advantages
          ])
      graph.add_output(action_mean)
      graph.add_output(action_std)
    graph.add_output(value)
@@ -330,7 +339,8 @@ class A3C(object):
      tensors = [self._action_mean, self._action_std]
    else:
      tensors = [self._action_prob]
    outputs = self._predict_outputs(tensors, state, use_saved_states, save_states)
    outputs = self._predict_outputs(tensors, state, use_saved_states,
                                    save_states)
    return self._select_action_from_outputs(outputs, deterministic)

  def restore(self):
@@ -376,9 +386,9 @@ class A3C(object):
    if self.continuous:
      action_mean, action_std = outputs
      if deterministic:
        return action_mean
        return action_mean[0]
      else:
        return np.random.normal(action_mean, action_std)
        return np.random.normal(action_mean[0], action_std[0])
    else:
      action_prob = outputs[0]
      if deterministic:
@@ -451,11 +461,11 @@ class _Worker(object):
      else:
        tensors = [self.action_prob, self.value]
      results = session.run(
          tensors + self.graph.rnn_final_states,
          feed_dict=feed_dict)
          tensors + self.graph.rnn_final_states, feed_dict=feed_dict)
      value = results[len(tensors) - 1]
      self.rnn_states = results[len(tensors):]
      action = self.a3c._select_action_from_outputs(results[:len(tensors)-1], False)
      action = self.a3c._select_action_from_outputs(results[:len(tensors) - 1],
                                                    False)
      actions.append(action)
      values.append(float(value))
      rewards.append(self.env.step(action))
@@ -551,8 +561,7 @@ class _Worker(object):
      feed_dict[f.out_tensor] = s
    values = self.a3c._session.run(self.value.out_tensor, feed_dict=feed_dict)
    values = np.append(values.flatten(), 0.0)
    self.process_rollout(hindsight_states, actions,
                         np.array(rewards),
    self.process_rollout(hindsight_states, actions, np.array(rewards),
                         np.array(values), initial_rnn_states, step_count)

  def create_feed_dict(self, state):
+15 −10
Original line number Diff line number Diff line
@@ -231,7 +231,6 @@ class TestA3C(unittest.TestCase):
        pass_count += 1
    assert pass_count >= 3


  def test_continuous(self):
    """Test A3C on an environment with a continuous action space."""

@@ -242,7 +241,7 @@ class TestA3C(unittest.TestCase):
    class TestEnvironment(dc.rl.Environment):

      def __init__(self):
        super(TestEnvironment, self).__init__((2,), 0)
        super(TestEnvironment, self).__init__((2,), action_shape=(1,))

      def reset(self):
        target = np.random.uniform(-50, 50)
@@ -252,9 +251,9 @@ class TestA3C(unittest.TestCase):

      def step(self, action):
        target = self._state[1]
        dist = np.abs(target - action[0][0])
        dist = np.abs(target - action[0])
        old_dist = np.abs(target - self._state[0])
        new_state = np.array([action[0][0], target])
        new_state = np.array([action[0], target])
        self._state = new_state
        self.count += 1
        reward = old_dist - dist
@@ -266,10 +265,15 @@ class TestA3C(unittest.TestCase):
    class TestPolicy(dc.rl.Policy):

      def create_layers(self, state, **kwargs):
        action_mean = Dense(1, in_layers=state, weights_initializer=tf.zeros_initializer)
        action_std = Constant(10.0)
        action_mean = Dense(
            1, in_layers=state, weights_initializer=tf.zeros_initializer)
        action_std = Constant([10.0])
        value = Dense(1, in_layers=state)
        return {'action_mean': action_mean, 'action_std': action_std, 'value': value}
        return {
            'action_mean': action_mean,
            'action_std': action_std,
            'value': value
        }

    # Optimize it.

@@ -278,7 +282,8 @@ class TestA3C(unittest.TestCase):
        initial_rate=0.005, final_rate=0.0005, decay_steps=25000)
    a3c = dc.rl.A3C(
        env,
        TestPolicy(), discount_factor=0,
        TestPolicy(),
        discount_factor=0,
        optimizer=Adam(learning_rate=learning_rate))
    a3c.fit(25000)