Copyright 2023 The TF-Agents Authors.

Copyright 2023 The TF-Agents Authors.#

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

环境#

在 tensorflow.google.cn 上查看

在 Google Colab 运行

在 Github 上查看源代码

下载笔记本

简介#

强化学习 (RL) 的目标是设计可通过与环境交互进行学习的代理。在标准 RL 设置中，代理在每个时间步骤都会收到一个观测值并选择一个操作。该操作将应用于环境，而环境会返回奖励和新的观测值。代理会训练策略以选择合适的操作，旨在使奖励总和（即回报）最大化。

在 TF-Agents 中，可以使用 Python 或 TensorFlow 实现环境。Python 环境通常更易于实现、理解和调试，但 TensorFlow 环境则更为高效并且支持自然并行化。最常见的工作流是在 Python 中实现环境，然后使用我们的包装器之一将其自动转换为 TensorFlow。

让我们首先看一下 Python 环境。TensorFlow 环境采用非常相似的 API。

设置#

如果尚未安装 TF-Agents 或 Gym，请运行以下命令：

!pip install tf-agents[reverb]

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

Python 环境#

Python 环境的 step(action) -> next_time_step 方法可将操作应用于环境，并返回有关下一步的以下信息：

observation：此为环境状态的一部分，可供代理观测以选择下一步的操作。
reward：代理会进行学习，目标是实现多个步骤奖励总和的最大化。
step_type：与环境的交互通常是序列/片段的一部分。例如，下象棋时多次移动棋子。step_type 可以是 FIRST、MID 或 LAST 之一，分别指示该时间步骤是序列中的第一步、中间步或最后一步。
discount：此为一个浮点数，表示下一个时间步骤的奖励相对于当前时间步骤的奖励的权重。

它们被分组到一个命名元组 TimeStep(step_type, reward, discount, observation)。

environments/py_environment.PyEnvironment 内包含了所有 python 环境必须实现的接口。主要方法为：

class PyEnvironment(object):
  """Abridged interface that every Python environment has to implement.

  Subclasses override `_reset()` and `_step()`. Callers use the public
  `reset()` / `step()` wrappers, which cache the most recent time step so it
  can be read back through `current_time_step()`.

  NOTE: `abc.abstractmethod` is only enforced for classes whose metaclass is
  `abc.ABCMeta`; with the plain `object` base used here the decorators merely
  mark which methods a subclass is expected to provide.
  """

  # Class-level default so that `step()` and `current_time_step()` work
  # before the first `reset()`, even when a subclass defines its own
  # `__init__` without calling the base one.
  _current_time_step = None

  def reset(self):
    """Start a new sequence and return its initial time step.

    Returns:
      The value produced by `_reset()`, which is also cached.
    """
    self._current_time_step = self._reset()
    return self._current_time_step

  def step(self, action):
    """Apply `action` and return the resulting time step.

    If no time step has been produced yet, the environment is reset and the
    initial time step is returned; `action` is ignored in that case.

    Args:
      action: The action forwarded to `_step()`.

    Returns:
      The value produced by `_step(action)` (or by `reset()` on first use),
      which is also cached.
    """
    if self._current_time_step is None:
      return self.reset()
    self._current_time_step = self._step(action)
    return self._current_time_step

  def current_time_step(self):
    """Return the cached time step, or None if none was produced yet."""
    return self._current_time_step

  def time_step_spec(self):
    """Return time_step_spec."""

  @abc.abstractmethod
  def observation_spec(self):
    """Return observation_spec."""

  @abc.abstractmethod
  def action_spec(self):
    """Return action_spec."""

  @abc.abstractmethod
  def _reset(self):
    """Return initial_time_step."""

  @abc.abstractmethod
  def _step(self, action):
    """Apply action and return new time_step."""

除了 step() 方法外，环境还提供了一个 reset() 方法，该方法可以启动新的序列并提供初始 TimeStep。不必显式调用 reset 方法。我们假定在片段结束或首次调用 step() 时环境均会自动重置。

请注意，子类不会直接实现 step() 或 reset()。相反，它们会重写 _step() 和 _reset() 方法。这些方法返回的时间步骤将通过 current_time_step() 缓存和公开。

observation_spec 和 action_spec 方法会返回一组 (Bounded)ArraySpecs 嵌套，分别描述观测值和操作的名称、形状、数据类型和范围。

我们在 TF-Agents 中反复提及嵌套，其定义为由列表、元组、命名元组或字典组成的任何树状结构。这些内容可以任意组合以保持观测值和操作的结构。我们发现，对于包含许多观测值和操作的更复杂环境而言，这种结构非常实用。

使用标准环境#

TF Agents 针对许多标准环境（如 OpenAI Gym、DeepMind-control 和 Atari）内置了包装器，因此它们支持我们的 py_environment.PyEnvironment 接口。这些包装的环境可以使用我们的环境套件轻松加载。让我们通过 OpenAI Gym 加载 CartPole 环境，并查看操作和 time_step_spec。

# Load the Gym CartPole task through the TF-Agents suite and show its specs.
environment = suite_gym.load('CartPole-v0')
print('action_spec:', environment.action_spec())

# Fetch the time-step spec once, then print each of its fields.
cartpole_time_step_spec = environment.time_step_spec()
print('time_step_spec.observation:', cartpole_time_step_spec.observation)
print('time_step_spec.step_type:', cartpole_time_step_spec.step_type)
print('time_step_spec.discount:', cartpole_time_step_spec.discount)
print('time_step_spec.reward:', cartpole_time_step_spec.reward)

可以看到，环境所预期的操作类型为 [0, 1] 区间内的 int64，并返回 TimeSteps，其中观测值为长度等于 4 的 float32 向量，折扣因子为 [0.0, 1.0] 区间内的 float32。现在，让我们尝试对整个片段采取固定操作 (1,)。

# Always take action 1 until the episode finishes, printing every time step
# (including the initial one returned by reset()).
action = np.array(1, dtype=np.int32)
time_step = environment.reset()
while True:
  print(time_step)
  if time_step.is_last():
    break
  time_step = environment.step(action)

创建自己的 Python 环境#

对于许多客户而言，一个常见用例是采用 TF-Agents 中的一个标准代理（请参见 agents/）解决他们的问题。为此，客户需要将问题视为环境。那么，让我们看一下如何在 Python 中实现环境。

假设我们要训练一个代理来玩以下纸牌游戏（受 21 点玩法启发）：

使用无限张数字为 1 到 10 的纸牌进行游戏。
代理每个回合可以做两件事：随机抽取一张新的纸牌，或者停止当前回合。
目标是在回合结束时使您的纸牌上数字的总和尽可能接近 21，但不大于 21。

代表游戏的环境可能如下所示：

操作：有 2 个操作。操作 0 为抽取一张新的纸牌；操作 1 为终止当前回合。
观测值：当前回合的纸牌上数字的总和。
奖励：目标是尽可能接近 21 但不超过 21，因此我们可以在回合结束时使用以下奖励实现这一目标：sum_of_cards - 21 if sum_of_cards <= 21, else -21

class CardGameEnv(py_environment.PyEnvironment):
  """Blackjack-inspired card game exposed as a Python environment.

  Action 0 draws a card with a value from 1 to 10 and adds it to the running
  sum; action 1 ends the round. The episode terminates when the round is
  ended or when the sum reaches 21 or more. The terminal reward is
  `sum - 21` when the sum is at most 21 and -21 otherwise; every
  non-terminal step yields a reward of 0.
  """

  def __init__(self):
    # Let the base class set up its own bookkeeping before adding ours.
    super(CardGameEnv, self).__init__()
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(1,), dtype=np.int32, minimum=0, name='observation')
    self._state = 0  # Sum of the cards drawn in the current round.
    self._episode_ended = False

  def action_spec(self):
    """Return the spec of the scalar int32 action (0 = draw, 1 = stop)."""
    return self._action_spec

  def observation_spec(self):
    """Return the spec of the length-1 int32 observation (the card sum)."""
    return self._observation_spec

  def _reset(self):
    """Clear the round and return the initial time step."""
    self._state = 0
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.int32))

  def _step(self, action):
    """Apply `action` and return the next time step.

    Args:
      action: 0 to draw a new card, 1 to end the round.

    Returns:
      A termination time step when the round ends or the sum reaches 21 or
      more, otherwise a transition time step with zero reward.

    Raises:
      ValueError: If `action` is neither 0 nor 1.
    """
    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    if action == 1:
      self._episode_ended = True
    elif action == 0:
      self._state += np.random.randint(1, 11)
    else:
      raise ValueError('`action` should be 0 or 1.')

    # Make sure episodes don't go on forever: reaching 21 or more also ends
    # the episode. Record it so that the next call starts a fresh episode
    # instead of continuing this finished one.
    if self._state >= 21:
      self._episode_ended = True

    observation = np.array([self._state], dtype=np.int32)
    if self._episode_ended:
      reward = self._state - 21 if self._state <= 21 else -21
      return ts.termination(observation, reward)
    return ts.transition(observation, reward=0.0, discount=1.0)

让我们确保已正确地定义了上述环境。创建自己的环境时，您必须确保生成的观测值和 time_step 符合规范中定义的正确形状和类型。这些内容用于生成 TensorFlow 计算图，因此如有差错，可能会造成难以调试的问题。

为了验证我们的环境，我们将使用随机策略来生成操作，并将迭代 5 个片段以确保按预期进行。如果我们收到的 time_step 不符合环境规范，则会提示错误。

# Drive the environment for a few episodes and check that every time step
# matches the declared specs.
num_validation_episodes = 5
environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=num_validation_episodes)

现在我们可以确定环境正在按预期工作，让我们使用固定策略运行此环境：抽取 3 张纸牌，然后结束该回合。

# Fixed policy: draw up to three cards, then end the round.
get_new_card_action = np.array(0, dtype=np.int32)
end_round_action = np.array(1, dtype=np.int32)

environment = CardGameEnv()
time_step = environment.reset()
print(time_step)
# Accumulate with `+` rather than `+=` so that, if the reward is a mutable
# array, the one owned by the first time step is not modified in place.
cumulative_reward = time_step.reward

for _ in range(3):
  time_step = environment.step(get_new_card_action)
  print(time_step)
  cumulative_reward = cumulative_reward + time_step.reward
  if time_step.is_last():
    # The sum already reached 21 or more; the episode is over, so stop
    # drawing instead of stepping a finished episode.
    break

# Only end the round explicitly when the episode has not already terminated.
if not time_step.is_last():
  time_step = environment.step(end_round_action)
  print(time_step)
  cumulative_reward = cumulative_reward + time_step.reward
print('Final Reward = ', cumulative_reward)

环境包装器#

环境包装器使用 python 环境，并返回该环境的修改版本。原始环境和修改后的环境均为 py_environment.PyEnvironment 的实例，并且可以将多个包装器链接在一起。

可以在 environments/wrappers.py 中找到一些常用的包装器。例如：

ActionDiscretizeWrapper：将连续操作空间转换成离散操作空间。
RunStats：捕获环境的运行统计信息，例如采用的步数、完成的片段数等。
TimeLimit：在固定步数后终止片段。

示例 1：操作离散化包装器#

Pendulum 是一个接受 [-2, 2] 区间内连续操作的 Gym 环境。如果要在此环境中训练离散操作代理（例如 DQN），则必须离散化（量化）操作空间。这正是 ActionDiscretizeWrapper 的工作。请对比包装前后的 action_spec：

# Show the action spec of the raw environment ...
env = suite_gym.load('Pendulum-v1')
continuous_action_spec = env.action_spec()
print('Action Spec:', continuous_action_spec)

# ... and of the same environment wrapped to expose 5 discrete actions.
discrete_action_env = wrappers.ActionDiscretizeWrapper(env, num_actions=5)
discretized_action_spec = discrete_action_env.action_spec()
print('Discretized Action Spec:', discretized_action_spec)

包装后的 discrete_action_env 为 py_environment.PyEnvironment 的实例，可视为常规 python 环境。

TensorFlow 环境#

TF 环境的接口在 environments/tf_environment.TFEnvironment 中定义，其与 Python 环境非常相似。TF 环境与 python 环境在以下两个方面有所不同：

TF 环境生成张量对象而非数组
与规范相比，TF 环境会为生成的张量添加批次维度。

将 python 环境转换为 TF 环境可以使 tensorflow 支持并行化运算。例如，用户可以定义 collect_experience_op 从环境中收集数据并添加到 replay_buffer，并定义 train_op 从 replay_buffer 中读取数据并训练代理，然后在 TensorFlow 中自然地并行运行二者。

class TFEnvironment(object):
  """Abridged interface of a TensorFlow environment.

  The public `reset()`, `current_time_step()` and `step()` methods simply
  delegate to the underscore-prefixed hooks that subclasses provide.
  """

  @abc.abstractmethod
  def _reset(self):
    """Hook: reset the environment and return the current `TimeStep`."""

  @abc.abstractmethod
  def _current_time_step(self):
    """Hook: return the current `TimeStep`."""

  @abc.abstractmethod
  def _step(self, action):
    """Hook: apply `action` and return the new `TimeStep`."""

  def time_step_spec(self):
    """Describes the `TimeStep` tensors that `step()` returns."""

  def observation_spec(self):
    """Defines the `TensorSpec` of the observations the environment provides."""

  def action_spec(self):
    """Describes the TensorSpecs of the action that `step(action)` expects."""

  def reset(self):
    """Resets the environment and returns the current `TimeStep`."""
    initial_time_step = self._reset()
    return initial_time_step

  def current_time_step(self):
    """Returns the current `TimeStep`."""
    latest_time_step = self._current_time_step()
    return latest_time_step

  def step(self, action):
    """Applies `action` and returns the new `TimeStep`."""
    new_time_step = self._step(action)
    return new_time_step

current_time_step() 方法会返回当前 time_step 并在需要时初始化环境。

reset() 方法会在环境中强制执行重置并返回当前的 time_step。

如果 action 不依赖于上一个 time_step，则在 Graph 模式下将需要 tf.control_dependencies。

现在，让我们看看如何创建 TFEnvironments。

创建自己的 TensorFlow 环境#

此操作比在 Python 中创建环境复杂得多，因此，我们将不会在本 Colab 中进行介绍。此处提供了一个示例。更常见的用例是在 Python 中实现您的环境，并使用我们的 TFPyEnvironment 包装器将其包装为 TensorFlow 环境（请参见下文）。

将 Python 环境包装为 TensorFlow 环境#

我们可以使用 TFPyEnvironment 包装器将任何 Python 环境轻松包装为 TensorFlow 环境。

# Wrap the Python CartPole environment so that it produces tensors.
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

# The wrapper is a TFEnvironment and reports tensor-based specs.
is_tf_environment = isinstance(tf_env, tf_environment.TFEnvironment)
print(is_tf_environment)
wrapped_time_step_spec = tf_env.time_step_spec()
print("TimeStep Specs:", wrapped_time_step_spec)
wrapped_action_spec = tf_env.action_spec()
print("Action Specs:", wrapped_action_spec)

请注意，规范的类型现在为：(Bounded)TensorSpec。

用法示例#

简单示例#

env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

num_steps = 3
transitions = []
reward = 0

# Resetting the environment yields the initial time_step.
time_step = tf_env.reset()
for i in range(num_steps):
  # Alternate between action 0 and action 1.
  action = tf.constant([i % 2])
  # Stepping applies the action and yields the following TimeStep.
  next_time_step = tf_env.step(action)
  transitions.append([time_step, action, next_time_step])
  reward = reward + next_time_step.reward
  time_step = next_time_step

# Convert every tensor in the recorded transitions to a NumPy value.
np_transitions = tf.nest.map_structure(lambda x: x.numpy(), transitions)
print('\n'.join(str(transition) for transition in np_transitions))
print('Total reward:', reward.numpy())

整个片段#

env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

num_episodes = 5
rewards = []  # Total reward collected in each episode.
steps = []  # Number of steps taken in each episode.

time_step = tf_env.reset()
for _ in range(num_episodes):
  episode_reward = 0
  episode_steps = 0
  # Take uniformly random actions (0 or 1) until the episode finishes.
  while True:
    if time_step.is_last():
      break
    action = tf.random.uniform([1], 0, 2, dtype=tf.int32)
    time_step = tf_env.step(action)
    episode_steps = episode_steps + 1
    episode_reward = episode_reward + time_step.reward.numpy()
  rewards.append(episode_reward)
  steps.append(episode_steps)
  # Begin the next episode from a fresh initial time step.
  time_step = tf_env.reset()

# Aggregate statistics over all episodes.
num_steps = np.sum(steps)
avg_length = np.mean(steps)
avg_reward = np.mean(rewards)

print('num_episodes:', num_episodes, 'num_steps:', num_steps)
print('avg_length', avg_length, 'avg_reward:', avg_reward)