Revert "RockPaperScissors Pettingzoo" (#16886)
This reverts commit bf3e322.
amogkam authored Jul 6, 2021
1 parent a27a817 commit ecb6321
Showing 3 changed files with 123 additions and 43 deletions.
91 changes: 91 additions & 0 deletions rllib/examples/env/rock_paper_scissors.py
@@ -0,0 +1,91 @@
from gym.spaces import Discrete

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class RockPaperScissors(MultiAgentEnv):
"""Two-player environment for the famous rock paper scissors game.
The observation is simply the last opponent action."""

ROCK = 0
PAPER = 1
SCISSORS = 2
LIZARD = 3
SPOCK = 4

def __init__(self, config):
self.sheldon_cooper = config.get("sheldon_cooper", False)
self.action_space = Discrete(5 if self.sheldon_cooper else 3)
self.observation_space = Discrete(5 if self.sheldon_cooper else 3)
self.player1 = "player1"
self.player2 = "player2"
self.last_move = None
self.num_moves = 0

# For test-case inspections (compare both players' scores).
self.player1_score = self.player2_score = 0

def reset(self):
self.last_move = (0, 0)
self.num_moves = 0
return {
self.player1: self.last_move[1],
self.player2: self.last_move[0],
}

def step(self, action_dict):
move1 = action_dict[self.player1]
move2 = action_dict[self.player2]
if self.sheldon_cooper is False:
assert move1 not in [self.LIZARD, self.SPOCK]
assert move2 not in [self.LIZARD, self.SPOCK]

self.last_move = (move1, move2)
obs = {
self.player1: self.last_move[1],
self.player2: self.last_move[0],
}
r1, r2 = {
(self.ROCK, self.ROCK): (0, 0),
(self.ROCK, self.PAPER): (-1, 1),
(self.ROCK, self.SCISSORS): (1, -1),
(self.PAPER, self.ROCK): (1, -1),
(self.PAPER, self.PAPER): (0, 0),
(self.PAPER, self.SCISSORS): (-1, 1),
(self.SCISSORS, self.ROCK): (-1, 1),
(self.SCISSORS, self.PAPER): (1, -1),
(self.SCISSORS, self.SCISSORS): (0, 0),
# Sheldon Cooper extension:
(self.LIZARD, self.LIZARD): (0, 0),
(self.LIZARD, self.SPOCK): (1, -1), # Lizard poisons Spock
(self.LIZARD, self.ROCK): (-1, 1), # Rock crushes lizard
(self.LIZARD, self.PAPER): (1, -1), # Lizard eats paper
            (self.LIZARD, self.SCISSORS): (-1, 1),  # Scissors decapitate Lizard
(self.ROCK, self.LIZARD): (1, -1), # Rock crushes lizard
(self.PAPER, self.LIZARD): (-1, 1), # Lizard eats paper
            (self.SCISSORS, self.LIZARD): (1, -1),  # Scissors decapitate Lizard
(self.SPOCK, self.SPOCK): (0, 0),
(self.SPOCK, self.LIZARD): (-1, 1), # Lizard poisons Spock
(self.SPOCK, self.ROCK): (1, -1), # Spock vaporizes rock
(self.SPOCK, self.PAPER): (-1, 1), # Paper disproves Spock
(self.SPOCK, self.SCISSORS): (1, -1), # Spock smashes scissors
(self.ROCK, self.SPOCK): (-1, 1), # Spock vaporizes rock
(self.PAPER, self.SPOCK): (1, -1), # Paper disproves Spock
(self.SCISSORS, self.SPOCK): (-1, 1), # Spock smashes scissors
}[move1, move2]
rew = {
self.player1: r1,
self.player2: r2,
}
self.num_moves += 1
done = {
"__all__": self.num_moves >= 10,
}

if rew["player1"] > rew["player2"]:
self.player1_score += 1
elif rew["player2"] > rew["player1"]:
self.player2_score += 1

return obs, rew, done, {}
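For orientation, here is a minimal driver for the restored environment. This sketch is not part of the commit; the config value and the random-action loop are illustrative only:

# Minimal usage sketch for the restored env (illustrative, not part of the commit).
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors

env = RockPaperScissors(config={"sheldon_cooper": False})
obs = env.reset()
done = {"__all__": False}
while not done["__all__"]:
    # Both players act randomly; a real setup would query trained policies here.
    actions = {
        env.player1: env.action_space.sample(),
        env.player2: env.action_space.sample(),
    }
    obs, rew, done, info = env.step(actions)
print("Final scores:", env.player1_score, env.player2_score)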
27 changes: 14 additions & 13 deletions rllib/examples/policy/rock_paper_scissors_dummies.py
@@ -1,13 +1,11 @@
import gym
import numpy as np
import random

from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.view_requirement import ViewRequirement

ROCK = 0
PAPER = 1
SCISSORS = 2


class AlwaysSameHeuristic(Policy):
"""Pick a random move and stick with it for the entire episode."""
@@ -23,7 +21,12 @@ def __init__(self, *args, **kwargs):
})

def get_initial_state(self):
return [random.choice([ROCK, PAPER, SCISSORS])]
return [
random.choice([
RockPaperScissors.ROCK, RockPaperScissors.PAPER,
RockPaperScissors.SCISSORS
])
]

def compute_actions(self,
obs_batch,
@@ -52,14 +55,12 @@ def compute_actions(self,
episodes=None,
**kwargs):
def successor(x):
if x[ROCK] == 1:
return PAPER
elif x[PAPER] == 1:
return SCISSORS
elif x[SCISSORS] == 1:
return ROCK
elif x[-1] == 1:
return random.choice([ROCK, PAPER, SCISSORS])
if x[RockPaperScissors.ROCK] == 1:
return RockPaperScissors.PAPER
elif x[RockPaperScissors.PAPER] == 1:
return RockPaperScissors.SCISSORS
elif x[RockPaperScissors.SCISSORS] == 1:
return RockPaperScissors.ROCK

return [successor(x) for x in obs_batch], [], {}

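Context for the heuristics above (not part of the diff): RLlib's default preprocessor one-hot encodes Discrete observations before they reach compute_actions, so each x in obs_batch is a one-hot vector and x[RockPaperScissors.ROCK] == 1 means the opponent's last move was ROCK. A tiny illustration of the successor logic under that assumption:

import numpy as np

# One-hot observation meaning "opponent's last move was ROCK" in a Discrete(3) space.
last_move = np.array([1.0, 0.0, 0.0])
assert last_move[0] == 1.0  # index 0 == RockPaperScissors.ROCK
# successor(last_move) above would return RockPaperScissors.PAPER, which beats ROCK.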
48 changes: 18 additions & 30 deletions rllib/examples/rock_paper_scissors_multiagent.py
@@ -8,19 +8,18 @@
"""

import argparse
from gym.spaces import Discrete
import os
import random

from ray import tune
from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy
from ray.rllib.agents.registry import get_trainer_class
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
BeatLastHeuristic, AlwaysSameHeuristic
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.registry import register_env
from ray.rllib.env import PettingZooEnv
from pettingzoo.classic import rps_v1

tf1, tf, tfv = try_import_tf()
torch, _ = try_import_torch()
@@ -53,23 +52,10 @@
help="Reward at which we stop training.")


def env_creator(args):
env = rps_v1.env()
return env


register_env("RockPaperScissors",
lambda config: PettingZooEnv(env_creator(config)))

env_for_spaces = PettingZooEnv(env_creator({}))
obs_space = env_for_spaces.observation_space
act_space = env_for_spaces.action_space


def run_same_policy(args, stop):
"""Use the same policy for both agents (trivial case)."""
config = {
"env": "RockPaperScissors",
"env": RockPaperScissors,
"framework": args.framework,
}

@@ -90,27 +76,27 @@ def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
"""

def select_policy(agent_id, episode, **kwargs):
if agent_id == "player_0":
if agent_id == "player1":
return "learned"
else:
return random.choice(["always_same", "beat_last"])

config = {
"env": "RockPaperScissors",
"env": RockPaperScissors,
"gamma": 0.9,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"num_envs_per_worker": 4,
"rollout_fragment_length": 10,
"train_batch_size": 200,
"metrics_smoothing_episodes": 200,
"multiagent": {
"policies_to_train": ["learned"],
"policies": {
"always_same": (AlwaysSameHeuristic, obs_space, act_space, {}),
"beat_last": (BeatLastHeuristic, obs_space, act_space, {}),
"learned": (None, obs_space, act_space, {
"always_same": (AlwaysSameHeuristic, Discrete(3), Discrete(3),
{}),
"beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3), {}),
"learned": (None, Discrete(3), Discrete(3), {
"model": {
"use_lstm": use_lstm
},
@@ -123,24 +109,22 @@ def select_policy(agent_id, episode, **kwargs):
}
cls = get_trainer_class(trainer) if isinstance(trainer, str) else trainer
trainer_obj = cls(config=config)
env = trainer_obj.workers.local_worker().env
for _ in range(args.stop_iters):
results = trainer_obj.train()
print(results)
# Timesteps reached.
if "policy_always_same_reward" not in results["hist_stats"]:
reward_diff = 0
continue
reward_diff = sum(results["hist_stats"]["policy_learned_reward"])
if results["timesteps_total"] > args.stop_timesteps:
break
# Reward (difference) reached -> all good, return.
elif reward_diff > args.stop_reward:
elif env.player1_score - env.player2_score > args.stop_reward:
return

# Reward (difference) not reached: Error if `as_test`.
if args.as_test:
raise ValueError(
"Desired reward difference ({}) not reached! Only got to {}.".
format(args.stop_reward, reward_diff))
format(args.stop_reward, env.player1_score - env.player2_score))


def run_with_custom_entropy_loss(args, stop):
@@ -173,7 +157,7 @@ def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG)


if __name__ == "__main__":
def main():
args = parser.parse_args()

stop = {
@@ -193,3 +177,7 @@ def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):

run_with_custom_entropy_loss(args, stop=stop)
print("run_with_custom_entropy_loss: ok.")


if __name__ == "__main__":
main()
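To make the wiring in run_heuristic_vs_learned easier to follow, here is a condensed sketch of the multi-agent setup it builds. This is not part of the commit, and it assumes the elided portion of the config registers select_policy as the "policy_mapping_fn":

# Condensed multi-agent wiring sketch (assumptions noted inline; not part of the commit).
import random

from gym.spaces import Discrete

from ray.rllib.agents.pg import PGTrainer
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
    AlwaysSameHeuristic, BeatLastHeuristic


def select_policy(agent_id, episode, **kwargs):
    # "player1" is trained; "player2" plays one of the fixed heuristics.
    if agent_id == "player1":
        return "learned"
    return random.choice(["always_same", "beat_last"])


config = {
    "env": RockPaperScissors,
    "num_workers": 0,
    "multiagent": {
        "policies_to_train": ["learned"],
        "policies": {
            "always_same": (AlwaysSameHeuristic, Discrete(3), Discrete(3), {}),
            "beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3), {}),
            "learned": (None, Discrete(3), Discrete(3), {}),
        },
        "policy_mapping_fn": select_policy,  # assumed from the elided hunk
    },
    "framework": "tf",
}
trainer = PGTrainer(config=config)
print(trainer.train())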
