From 5384dff1956ff4c0f1f3601f5b34df2f250529ae Mon Sep 17 00:00:00 2001
From: sven1977
Date: Mon, 22 Apr 2024 14:05:19 +0200
Subject: [PATCH 1/2] wip

Signed-off-by: sven1977
---
 rllib/env/multi_agent_episode.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rllib/env/multi_agent_episode.py b/rllib/env/multi_agent_episode.py
index 559dfcb9dd5e..f13375c7fc64 100644
--- a/rllib/env/multi_agent_episode.py
+++ b/rllib/env/multi_agent_episode.py
@@ -27,7 +27,7 @@


 # TODO (simon): Include cases in which the number of agents in an
-# episode are shrinking or growing during the episode itself.
+# episode are shrinking or growing during the episode itself.
 @PublicAPI(stability="alpha")
 class MultiAgentEpisode:
     """Stores multi-agent episode data.

From a61cb72b949a6c46dbec2c8cb3e0662a8f85ee94 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Mon, 22 Apr 2024 15:18:05 +0200
Subject: [PATCH 2/2] wip

Signed-off-by: sven1977
---
 rllib/env/multi_agent_episode.py            | 23 ++++--
 rllib/env/tests/test_multi_agent_episode.py | 85 ++++++++++++++++-----
 rllib/env/utils/infinite_lookback_buffer.py |  2 +-
 3 files changed, 85 insertions(+), 25 deletions(-)

diff --git a/rllib/env/multi_agent_episode.py b/rllib/env/multi_agent_episode.py
index f13375c7fc64..1faac818ae6e 100644
--- a/rllib/env/multi_agent_episode.py
+++ b/rllib/env/multi_agent_episode.py
@@ -1633,21 +1633,30 @@ def __repr__(self):
         )

     def print(self) -> None:
+        """Prints this MultiAgentEpisode as a table of observations for the agents."""
+
         # Find the maximum timestep across all agents to determine the grid width.
-        max_ts = max(len(ts) for ts in self.env_t_to_agent_t.values())
+        max_ts = max(ts.len_incl_lookback() for ts in self.env_t_to_agent_t.values())
+        lookback = next(iter(self.env_t_to_agent_t.values())).lookback
+        longest_agent = max(len(aid) for aid in self.agent_ids)

         # Construct the header.
-        header = "ts " + " ".join(str(i) for i in range(max_ts)) + "\n"
+        header = (
+            "ts"
+            + (" " * longest_agent)
+            + " ".join(str(i) for i in range(-lookback, max_ts - lookback))
+            + "\n"
+        )

         # Construct each agent's row.
         rows = []
-        for agent, timesteps in self.env_t_to_agent_t.items():
-            row = f"{agent} "
-            for t in timesteps:
+        for agent, inf_buffer in self.env_t_to_agent_t.items():
+            row = f"{agent} " + (" " * (longest_agent - len(agent)))
+            for t in inf_buffer.data:
                 # Two spaces for alignment.
                 if t == "S":
-                    row += "  "
+                    row += "   "
                 # Mark the step with an x.
                 else:
-                    row += "x "
+                    row += " x "
             # Remove trailing space for alignment.
             rows.append(row.rstrip())
diff --git a/rllib/env/tests/test_multi_agent_episode.py b/rllib/env/tests/test_multi_agent_episode.py
index 7f0a77e99e0b..15318dfeb89c 100644
--- a/rllib/env/tests/test_multi_agent_episode.py
+++ b/rllib/env/tests/test_multi_agent_episode.py
@@ -975,7 +975,7 @@ def test_get_actions(self):
             check(act, actions[i])
         # Access >=0 integer indices (expect index error as everything is in
         # lookback buffer).
-        for i in range(1, 5):
+        for i in range(0, 5):
             with self.assertRaises(IndexError):
                 episode.get_actions(i)
         # Access <= -5 integer indices (expect index error as this goes beyond length of
@@ -1023,6 +1023,50 @@ def test_get_actions(self):
         act = episode.get_actions(-4, env_steps=False, fill="skip")
         check(act, {"a0": "skip", "a1": 0})

+        episode.add_env_step(
+            observations={"a0": 5, "a1": 5}, actions={"a1": 4}, rewards={"a1": 4}
+        )
+        check(episode.get_actions(0), {"a1": 4})
+        check(episode.get_actions(-1), {"a1": 4})
+        check(episode.get_actions(-2), {"a1": 3})
+        episode.add_env_step(
+            observations={"a1": 6},
+            actions={"a0": 5, "a1": 5},
+            rewards={"a0": 5, "a1": 5},
+        )
+        check(episode.get_actions(0), {"a1": 4})
+        check(episode.get_actions(1), {"a0": 5, "a1": 5})
+        check(episode.get_actions(-1), {"a0": 5, "a1": 5})
+
+        # Generate a simple multi-agent episode, where a hanging action is at the end.
+        observations = [
+            {"a0": 0, "a1": 0},
+            {"a0": 0, "a1": 1},
+            {"a0": 2},
+        ]
+        actions = [{"a0": 0, "a1": 0}, {"a0": 1, "a1": 1}]
+        rewards = [{"a0": 0.0, "a1": 0.0}, {"a0": 0.1, "a1": 0.1}]
+        episode = MultiAgentEpisode(
+            observations=observations,
+            actions=actions,
+            rewards=rewards,
+            len_lookback_buffer=0,
+        )
+        # Test, whether the hanging action of a1 at the end gets returned properly
+        # for idx=-1.
+        act = episode.get_actions(-1)
+        check(act, {"a0": 1, "a1": 1})
+        act = episode.get_actions(-2)
+        check(act, {"a0": 0, "a1": 0})
+        act = episode.get_actions(0)
+        check(act, {"a0": 0, "a1": 0})
+        act = episode.get_actions(1)
+        check(act, {"a0": 1, "a1": 1})
+        with self.assertRaises(IndexError):
+            episode.get_actions(2)
+        with self.assertRaises(IndexError):
+            episode.get_actions(-3)
+
         # Generate a simple multi-agent episode, where one agent is done.
         # observations = [
         #     {"a0": 0, "a1": 0},
@@ -1132,15 +1176,23 @@ def test_get_actions(self):
         check(
             act,
             {
-                "agent_1": [-10, 0],
-                "agent_2": [-10, 0],
-                "agent_3": [-10, 0],
-                "agent_4": [-10, -10],
+                "agent_1": [0, 1],
+                "agent_2": [0, -10],
+                "agent_3": [0, 1],
+                "agent_4": [-10, 1],
             },
+        )
+        # Same, but w/o fill.
+        act = episode.get_actions(indices=[-2, -1], neg_indices_left_of_zero=True)
+        check(
+            act,
+            {
+                "agent_1": [0, 1],
+                "agent_2": [0],
+                "agent_3": [0, 1],
+                "agent_4": [1],
+            },
         )
-        # Same, but w/o fill (should produce error as the lookback is only 1 long).
-        with self.assertRaises(IndexError):
-            episode.get_actions(indices=[-2, -1], neg_indices_left_of_zero=True)

         # Get last actions for each individual agent.
         act = episode.get_actions(indices=-1, env_steps=False)
@@ -1158,7 +1210,7 @@ def test_get_actions(self):
         act = episode.get_actions(-1, env_steps=False, agent_ids=["agent_1", "agent_2"])
         check(act, {"agent_1": 1, "agent_2": 0})
         act = episode.get_actions(-2, env_steps=True, agent_ids={"agent_4"})
-        check(act, {"agent_4": 1})
+        check(act, {})
         act = episode.get_actions([-1, -2], env_steps=True, agent_ids={"agent_4"})
         check(act, {"agent_4": [1]})
         # Agent 4 has only acted 2x, so there is no (local) ts=-2 for it.
@@ -1173,7 +1225,7 @@ def test_get_actions(self):
         # actions are in these buffers (and won't get returned here).
         act = episode.get_actions(return_list=True)
         self.assertTrue(act == [])
-        # Expect error when calling with env_steps=False.
+        # Expect error when calling with env_steps=False AND return_list=True.
         with self.assertRaises(ValueError):
             episode.get_actions(env_steps=False, return_list=True)
         # List of indices.
@@ -1364,15 +1416,14 @@ def test_get_rewards(self):
         check(
             rew,
             {
-                "agent_1": [-10, 0.5],
-                "agent_2": [-10, 0.6],
-                "agent_3": [-10, 0.7],
-                "agent_4": [-10, -10],
+                "agent_1": [0.5, 1.1],
+                "agent_2": [0.6, -10],
+                "agent_3": [0.7, 1.2],
+                "agent_4": [-10, 1.3],
             },
         )
-        # Same, but w/o fill (should produce error as the lookback is only 1 long).
-        with self.assertRaises(IndexError):
-            episode.get_rewards(indices=[-2, -1], neg_indices_left_of_zero=True)
+        # Same, but w/o fill.
+        episode.get_rewards(indices=[-2, -1], neg_indices_left_of_zero=True)

         # Get last rewards for each individual agent.
         rew = episode.get_rewards(indices=-1, env_steps=False)
diff --git a/rllib/env/utils/infinite_lookback_buffer.py b/rllib/env/utils/infinite_lookback_buffer.py
index ff9129cb5e71..b1264305a3a6 100644
--- a/rllib/env/utils/infinite_lookback_buffer.py
+++ b/rllib/env/utils/infinite_lookback_buffer.py
@@ -464,7 +464,7 @@ def _get_int_index(
         # If index >= 0 -> Ignore lookback buffer.
         # Otherwise, include lookback buffer.
         if idx >= 0 or neg_indices_left_of_zero:
-            idx = self.lookback + idx - (_ignore_last_ts is True)
+            idx = self.lookback + idx
         # Negative indices mean: Go to left into lookback buffer starting from idx=0.
         # But if we pass the lookback buffer, the index should be invalid and we will
         # have to fill, if required. Invalidate the index by setting it to one larger
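
Note on the print() rework in the first hunk: the new version widens each grid cell to three characters, pads agent ids to a common column width, and starts the header at -lookback so the lookback portion of every agent's env_t-to-agent_t mapping shows up as negative timesteps. Below is a minimal standalone sketch of that rendering idea; print_grid, AGENT_STEPS, and LOOKBACK are made-up names and example data for illustration only, not RLlib's actual MultiAgentEpisode API.

LOOKBACK = 2  # assumed lookback length shared by all agents

AGENT_STEPS = {
    # "x" marks env steps at which the agent stepped, "S" marks skipped env steps.
    "agent_1": ["x", "x", "S", "x", "x"],
    "agent_22": ["S", "x", "x", "S", "x"],
}


def print_grid(agent_steps, lookback):
    # Grid width = longest per-agent mapping (incl. its lookback portion).
    max_ts = max(len(steps) for steps in agent_steps.values())
    longest_agent = max(len(aid) for aid in agent_steps)
    # Header counts from -lookback so lookback steps appear as negative timesteps.
    header = (
        "ts"
        + (" " * longest_agent)
        + " ".join(str(i) for i in range(-lookback, max_ts - lookback))
    )
    rows = []
    for agent, steps in agent_steps.items():
        # Pad the agent id so every row starts its cells at the same column.
        row = f"{agent} " + (" " * (longest_agent - len(agent)))
        for t in steps:
            # Three-char cells: blank for skipped env steps, " x " for steps taken.
            row += "   " if t == "S" else " x "
        rows.append(row.rstrip())
    print("\n".join([header] + rows))


print_grid(AGENT_STEPS, LOOKBACK)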
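
Note on the infinite_lookback_buffer hunk: dropping the "- (_ignore_last_ts is True)" term means a non-negative index (or a negative one passed with neg_indices_left_of_zero=True) is simply shifted right by the lookback length before the lookup. A rough sketch of that indexing rule on a plain list follows; resolve() and the example data are illustrative only and do not reproduce InfiniteLookbackBuffer's real signature or fill semantics.

def resolve(data, lookback, idx, neg_indices_left_of_zero=False, fill=None):
    """Resolves idx against data, whose first `lookback` items are lookback-only."""
    if idx >= 0 or neg_indices_left_of_zero:
        # Non-negative indices (and negative ones meant to reach left of ts=0)
        # are shifted right by the lookback length, as in the hunk above.
        idx = lookback + idx
    else:
        # Plain negative indices count back from the end of the data.
        idx = len(data) + idx
    if 0 <= idx < len(data):
        return data[idx]
    if fill is not None:
        return fill
    raise IndexError(idx)


data = ["a", "b", "c", "d"]  # the first two items sit in the lookback section
print(resolve(data, lookback=2, idx=0))                                  # -> c
print(resolve(data, lookback=2, idx=-1))                                 # -> d
print(resolve(data, lookback=2, idx=-2, neg_indices_left_of_zero=True))  # -> a
print(resolve(data, lookback=2, idx=5, fill="F"))                        # -> F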