Commit c8342e9c authored by Peter Eastman
Browse files

State is represented as a list of arrays

parent 2d9a7197
Loading
Loading
Loading
Loading
+21 −15
Original line number Diff line number Diff line
@@ -5,10 +5,10 @@ from deepchem.rl.a3c import A3C
class Environment(object):
  """An environment in which an actor performs actions to accomplish a task.

  An environment has a current state that can be queried.  When an action
  is taken, that causes the state to be updated.  Exactly what is meant by
  a "state" or "action" is defined by each subclass.  As far as this interface
  is concerned, they are simply arbitrary objects.  The environment also computes
  An environment has a current state, which is represented as a list of NumPy
  arrays.  When an action is taken, that causes the state to be updated.  Exactly
  what is meant by an "action" is defined by each subclass.  As far as this interface
  is concerned, it is simply an arbitrary object.  The environment also computes
  a reward for each action, and reports when the task has been terminated
  (meaning that no more actions may be taken).

@@ -26,7 +26,7 @@ class Environment(object):

  @property
  def state(self):
    """The current state of the environment, represented as a class-specific object.
    """The current state of the environment, represented as a list of NumPy arrays.

    If reset() has not yet been called at least once, this is undefined.
    """
@@ -42,7 +42,10 @@ class Environment(object):

  @property
  def state_shape(self):
    """The shape of the values that describe a state."""
    """The shape of the arrays that describe a state.

    This returns a list of tuples, where each tuple is the shape of one array.
    """
    return self._state_shape

  @property
@@ -83,14 +86,16 @@ class GymEnvironment(Environment):
    import gym
    self.env = gym.make(name)
    self.name = name
    super().__init__(self.env.observation_space.shape, self.env.action_space.n)
    super().__init__([self.env.observation_space.shape], self.env.action_space.n)

  def reset(self):
    self._state = self.env.reset()
    state = self.env.reset()
    self._state = [state]
    self._terminated = False

  def step(self, action):
    self._state, reward, self._terminated, info = self.env.step(action)
    state, reward, self._terminated, info = self.env.step(action)
    self._state = [state]
    return reward

  def __deepcopy__(self, memo):
@@ -113,12 +118,13 @@ class Policy(object):
  or even on different computers.
  """

  def create_layers(self, features, **kwargs):
  def create_layers(self, state, **kwargs):
    """Create the TensorGraph Layers that define the policy.

    The arguments always include a Feature layer representing the current state of
    the environment.  Depending on the algorithm being used, other arguments might
    get passed as well.  It is up to each algorithm to document that.
    The arguments always include a list of Feature layers representing the current
    state of the environment (one layer for each array in the state).  Depending on
    the algorithm being used, other arguments might get passed as well.  It is up
    to each algorithm to document that.

    This method should construct and return a dict that maps strings to Layer
    objects.  Each algorithm must document what Layers it expects the policy to
+17 −9
Original line number Diff line number Diff line
@@ -27,6 +27,11 @@ class A3CLoss(Layer):
    self.out_tensor = policy_loss + self.value_weight*value_loss - self.entropy_weight*entropy
    return self.out_tensor


def _create_feed_dict(features, state):
  """Build a feed dict that supplies a single state to a list of feature layers.

  Parameters
  ----------
  features: iterable of layers
    each one must expose an `out_tensor` attribute, which becomes a dict key
  state: iterable of arrays
    the arrays describing one state, paired with `features` by position

  Returns
  -------
  a dict mapping each layer's `out_tensor` to the matching array with a new
  leading axis of length 1 inserted (so one state is fed as a batch of one).

  Pairing is done with zip(), so if the two arguments differ in length the
  surplus entries of the longer one are silently ignored.
  """
  return {
      f.out_tensor: np.expand_dims(s, axis=0)
      for f, s in zip(features, state)
  }


class A3C(object):
  """
  Implements the Asynchronous Advantage Actor-Critic (A3C) algorithm for reinforcement learning.
@@ -78,7 +83,7 @@ class A3C(object):

  def _build_graph(self, tf_graph, scope, model_dir):
    """Construct a TensorGraph containing the policy and loss calculations."""
    features = Feature(shape=[None]+list(self._env.state_shape))
    features = [Feature(shape=[None]+list(s)) for s in self._env.state_shape]
    policy_layers = self._policy.create_layers(features)
    action_prob = policy_layers['action_prob']
    value = policy_layers['value']
@@ -145,7 +150,7 @@ class A3C(object):
    the array of action probabilities, and the estimated value function
    """
    with self._graph._get_tf("Graph").as_default():
      feed_dict = {self._features.out_tensor: np.expand_dims(state, axis=0)}
      feed_dict = _create_feed_dict(self._features, state)
      return self._session.run([self._action_prob.out_tensor, self._value.out_tensor], feed_dict=feed_dict)

  def select_action(self, state, deterministic=False):
@@ -164,7 +169,7 @@ class A3C(object):
    the index of the selected action
    """
    with self._graph._get_tf("Graph").as_default():
      feed_dict = {self._features.out_tensor: np.expand_dims(state, axis=0)}
      feed_dict = _create_feed_dict(self._features, state)
      probabilities = self._session.run(self._action_prob.out_tensor, feed_dict=feed_dict)
      if deterministic:
        return probabilities.argmax()
@@ -206,24 +211,27 @@ class _Worker(object):
        session.run(self.update_local_variables)
        episode_states, episode_actions, episode_rewards = self.create_rollout()
        feed_dict = {}
        feed_dict[self.features.out_tensor] = episode_states
        for f,s in zip(self.features, episode_states):
          feed_dict[f.out_tensor] = s
        feed_dict[self.rewards.out_tensor] = episode_rewards
        feed_dict[self.actions.out_tensor] = episode_actions
        session.run(self.train_op, feed_dict=feed_dict)
        step_count[0] += len(episode_states)
        step_count[0] += len(episode_actions)

  def create_rollout(self):
    """Generate a rollout."""
    n_actions = self.env.n_actions
    session = self.a3c._session
    states = []
    states = [[] for i in range(len(self.features))]
    actions = []
    rewards = []
    for i in range(self.a3c.max_rollout_length):
      if self.env.terminated:
        break
      states.append(self.env.state)
      feed_dict = {self.features.out_tensor: np.expand_dims(self.env.state, axis=0)}
      state = self.env.state
      for j in range(len(state)):
        states[j].append(state[j])
      feed_dict = _create_feed_dict(self.features, state)
      probabilities = session.run(self.action_prob.out_tensor, feed_dict=feed_dict)
      action = np.random.choice(np.arange(n_actions), p=probabilities[0])
      actions.append(np.zeros(n_actions))
@@ -231,7 +239,7 @@ class _Worker(object):
      rewards.append(self.env.step(action))
    if not self.env.terminated:
      # Add an estimate of the reward for the rest of the episode.
      feed_dict = {self.features.out_tensor: np.expand_dims(self.env.state, axis=0)}
      feed_dict = _create_feed_dict(self.features, self.env.state)
      rewards[-1] += self.a3c.discount_factor*session.run(self.value.out_tensor, feed_dict)
    for j in range(len(rewards)-1, 0, -1):
      rewards[j-1] += self.a3c.discount_factor*rewards[j]
+10 −10
Original line number Diff line number Diff line
@@ -17,8 +17,8 @@ class TestA3C(unittest.TestCase):

    class RouletteEnvironment(dc.rl.Environment):
      def __init__(self):
        super().__init__([1], 38)
        self._state = np.array([0])
        super().__init__([(1,)], 38)
        self._state = [np.array([0])]

      def step(self, action):
        if action == 37:
@@ -41,30 +41,30 @@ class TestA3C(unittest.TestCase):
    # This policy just learns a constant probability for each action, and a constant for the value.

    class TestPolicy(dc.rl.Policy):
      def create_layers(self, features, **kwargs):
        action = Dense(in_layers=[features], out_channels=env.n_actions)
      def create_layers(self, state, **kwargs):
        action = Dense(in_layers=state, out_channels=env.n_actions)
        output = SoftMax(in_layers=[Reshape(in_layers=[action], shape=(-1, env.n_actions))])
        value = Dense(in_layers=[features], out_channels=1)
        value = Dense(in_layers=state, out_channels=1)
        return {'action_prob':output, 'value':value}

    # Optimize it.

    a3c = dc.rl.A3C(env, TestPolicy(), value_weight=100.0)
    a3c.optimizer = dc.models.tensorgraph.TFWrapper(tf.train.AdamOptimizer, learning_rate=0.1)
    a3c = dc.rl.A3C(env, TestPolicy(), value_weight=100.0, max_rollout_length=50)
    a3c.optimizer = dc.models.tensorgraph.TFWrapper(tf.train.AdamOptimizer, learning_rate=0.2)
    a3c.fit(100000)

    # It should have learned that the expected value is very close to zero, and that the best
    # action is to walk away.

    action_prob, value = a3c.predict([0])
    action_prob, value = a3c.predict([[0]])
    assert -0.5 < value[0] < 0.5
    assert action_prob.argmax() == 37
    assert a3c.select_action([0], deterministic=True) == 37
    assert a3c.select_action([[0]], deterministic=True) == 37

    # Verify that we can create a new A3C object, reload the parameters from the first one, and
    # get the same result.

    new_a3c = dc.rl.A3C(env, TestPolicy(), model_dir=a3c._graph.model_dir)
    new_a3c.restore()
    action_prob2, value2 = new_a3c.predict([0])
    action_prob2, value2 = new_a3c.predict([[0]])
    assert value2 == value