Merged
Changes from 5 commits
2 changes: 2 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -28,7 +28,9 @@ and this project adheres to
1. env_params.max_lifetime_restarts (--max-lifetime-restarts) [default=10]
2. env_params.restarts_rate_limit_n (--restarts-rate-limit-n) [default=1]
3. env_params.restarts_rate_limit_period_s (--restarts-rate-limit-period-s) [default=60]

### Bug Fixes
- Fixed a bug where the critics were not being normalized during training. (#5595)
- Fixed a bug where curriculum learning would crash because of incorrect run_options parsing. (#5586)

#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
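For context on the one-line trainer changes below: ML-Agents keeps running mean/variance statistics for vector observations when normalization is enabled, and the bug was that only the policy's normalizer was fed new trajectories while the critic's statistics went stale. The sketch below is only an illustration of the kind of running normalizer such an update_normalization call maintains; the class and method names are hypothetical, not the actual ml-agents implementation.

import numpy as np


class RunningNormalizer:
    """Illustrative running mean/variance tracker, not the ml-agents API."""

    def __init__(self, size: int):
        self.steps = 0
        self.mean = np.zeros(size, dtype=np.float64)
        self.m2 = np.zeros(size, dtype=np.float64)  # accumulated squared deviations

    def update(self, batch: np.ndarray) -> None:
        # Fold a batch of observations into the running statistics
        # (batched Welford / Chan et al. update).
        batch_count = batch.shape[0]
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        delta = batch_mean - self.mean
        total = self.steps + batch_count
        self.mean += delta * batch_count / total
        self.m2 += batch_var * batch_count + delta ** 2 * self.steps * batch_count / total
        self.steps = total

    def normalize(self, obs: np.ndarray) -> np.ndarray:
        var = self.m2 / max(self.steps, 1)
        return (obs - self.mean) / np.sqrt(var + 1e-8)

If the policy and the critic each own such a normalizer, both have to be updated from every processed trajectory, which is exactly what the added optimizer.critic.update_normalization calls in the diffs below do.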
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/poca/trainer.py
@@ -73,6 +73,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)
            self.optimizer.critic.update_normalization(agent_buffer_trajectory)

        # Get all value estimates
        (
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -74,6 +74,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)
            self.optimizer.critic.update_normalization(agent_buffer_trajectory)

        # Get all value estimates
        (
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/sac/trainer.py
@@ -140,6 +140,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)
            self.optimizer.critic.update_normalization(agent_buffer_trajectory)

        # Evaluate all reward functions for reporting purposes
        self.collected_rewards["environment"][agent_id] += np.sum(
164 changes: 164 additions & 0 deletions ml-agents/mlagents/trainers/tests/test_trainers.py
@@ -0,0 +1,164 @@
from unittest.mock import patch

import pytest

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.dummy_config import (
    create_observation_specs_with_shapes,
    ppo_dummy_config,
    poca_dummy_config,
    sac_dummy_config,
)
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents.trainers.trainer import TrainerFactory


@pytest.fixture
def ppo_config():
    return RunOptions(behaviors={"test_brain": ppo_dummy_config()})


@pytest.fixture
def sac_config():
    return RunOptions(behaviors={"test_brain": sac_dummy_config()})


@pytest.fixture
def poca_config():
    return RunOptions(behaviors={"test_brain": poca_dummy_config()})


def test_ppo_trainer_update_normalization(ppo_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = ppo_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    ppo_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = ppo_trainer.create_policy(parsed_behavior_id0, mock_specs)
    ppo_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    ppo_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        ppo_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()


def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    ppo_trainer = trainer_factory.generate(brain_name)
Contributor (review comment): Nit: these should be sac_trainers in this method
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = ppo_trainer.create_policy(parsed_behavior_id0, mock_specs)
    ppo_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    ppo_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        ppo_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()


def test_poca_trainer_update_normalization(poca_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = poca_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    poca_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = poca_trainer.create_policy(parsed_behavior_id0, mock_specs)
    poca_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    poca_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.poca.optimizer_torch.TorchPOCAOptimizer.POCAValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        poca_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()
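
The three tests above share one assertion pattern: patch update_normalization on the policy class and on the critic/value-network class, push a fake trajectory through the trainer's queue, call advance(), and check that each mock was invoked exactly once. The following is a self-contained sketch of that unittest.mock pattern, with hypothetical Trainer, Policy, and Critic classes standing in for the real ml-agents types.

from unittest.mock import patch


class Policy:
    def update_normalization(self, buffer) -> None:
        pass


class Critic:
    def update_normalization(self, buffer) -> None:
        pass


class Trainer:
    """Hypothetical stand-in for an ml-agents trainer."""

    def __init__(self) -> None:
        self.policy = Policy()
        self.critic = Critic()

    def process_trajectory(self, buffer) -> None:
        # Mirrors the fixed behaviour: both normalizers get updated.
        self.policy.update_normalization(buffer)
        self.critic.update_normalization(buffer)


def test_both_normalizers_updated() -> None:
    trainer = Trainer()
    # Patching the methods on the classes means the trainer's instances pick up
    # the mocks, just as the real tests patch TorchPolicy and ValueNetwork by
    # import path.
    with patch.object(Policy, "update_normalization") as policy_mock, patch.object(
        Critic, "update_normalization"
    ) as critic_mock:
        trainer.process_trajectory(buffer=[])
        policy_mock.assert_called_once()
        critic_mock.assert_called_once()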