Spaces:

openenv-testing
/

maze_env-pr-106

Sleeping

App Files Files Community

burtenshaw HF Staff committed on Nov 2

Commit

be46a16

verified ·

1 Parent(s): 29a7cec

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

src/envs/maze_env/client.py +11 -4
src/envs/maze_env/models.py +1 -0
src/envs/maze_env/server/__init__.py +1 -1
src/envs/maze_env/server/app.py +2 -1
src/envs/maze_env/server/maze.py +145 -73
src/envs/maze_env/server/maze_environment.py +19 -10
src/envs/maze_env/server/mazearray.py +12 -10

src/envs/maze_env/client.py CHANGED Viewed

@@ -23,10 +23,17 @@ from .models import MazeAction, MazeObservation, MazeState
 if TYPE_CHECKING:
     pass
 class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]):
     """HTTP client for Maze Environment."""
-    def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: List[int], goal: List[int]) -> None:
         """
         Render the maze grid as ASCII art in the terminal.
         - 0 = free cell
@@ -49,8 +56,8 @@ class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]):
                     line += "G "
                 elif maze[r][c] == 1:
                     line += "█ "
-                elif r == rows-1 and c == cols-1:
-                    line+= "E "
                 else:
                     line += ". "
             print(line)
@@ -82,4 +89,4 @@ class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]):
             episode_id=payload.get("episode_id", ""),
             step_count=payload.get("step_count", 0),
             done=payload.get("done", False),
-        )

 if TYPE_CHECKING:
     pass
 class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]):
     """HTTP client for Maze Environment."""
+    def render_ascii_maze(
+        self,
+        maze: List[List[int]],
+        position: List[int],
+        start: List[int],
+        goal: List[int],
+    ) -> None:
         """
         Render the maze grid as ASCII art in the terminal.
         - 0 = free cell
                     line += "G "
                 elif maze[r][c] == 1:
                     line += "█ "
+                elif r == rows - 1 and c == cols - 1:
+                    line += "E "
                 else:
                     line += ". "
             print(line)
             episode_id=payload.get("episode_id", ""),
             step_count=payload.get("step_count", 0),
             done=payload.get("done", False),
+        )

src/envs/maze_env/models.py CHANGED Viewed

@@ -29,6 +29,7 @@ class MazeObservation(Observation):
     total_reward: float
     legal_actions: List[int] = field(default_factory=list)
 @dataclass
 class MazeState(State):
     episode_id: str

     total_reward: float
     legal_actions: List[int] = field(default_factory=list)
 @dataclass
 class MazeState(State):
     episode_id: str

src/envs/maze_env/server/__init__.py CHANGED Viewed

@@ -8,4 +8,4 @@
 from .maze import Maze, Status
 from .maze_environment import MazeEnvironment
-__all__ = ["Maze","MazeEnvironment","Status"]

 from .maze import Maze, Status
 from .maze_environment import MazeEnvironment
+__all__ = ["Maze", "MazeEnvironment", "Status"]

src/envs/maze_env/server/app.py CHANGED Viewed

@@ -28,10 +28,11 @@ from core.env_server import create_app
 from ..models import MazeAction, MazeObservation
 from .maze_environment import MazeEnvironment
 from .mazearray import maze
 # Get game configuration from environment variables
 # Create the environment instance
-env = MazeEnvironment(maze_array=maze,start_cell=(0,0),exit_cell=(7,7))
 # Create the FastAPI app with web interface and README integration
 app = create_app(env, MazeAction, MazeObservation, env_name="maze_env")

 from ..models import MazeAction, MazeObservation
 from .maze_environment import MazeEnvironment
 from .mazearray import maze
 # Get game configuration from environment variables
 # Create the environment instance
+env = MazeEnvironment(maze_array=maze, start_cell=(0, 0), exit_cell=(7, 7))
 # Create the FastAPI app with web interface and README integration
 app = create_app(env, MazeAction, MazeObservation, env_name="maze_env")

src/envs/maze_env/server/maze.py CHANGED Viewed

@@ -39,52 +39,73 @@ class Status(Enum):
 class Maze:
-    """ A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze.
-        The layout of the maze and the rules how to move through it are called the environment. An agent is placed
-        at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every
-        action results in a reward or penalty which are accumulated during the game. Every move gives a small
-        penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into
-        a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The
-        game always reaches a terminal state; the agent either wins or looses. Obviously reaching the exit means
-        winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is
-        assumed to wander around clueless and looses.
-        A note on cell coordinates:
-        The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze.
-        This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze
-        itself is stored as a 2D numpy array so cells are accessed via [row, col]. To convert a (col, row) tuple
-        to (row, col) use (col, row)[::-1]
     """
-    actions = [Action.MOVE_LEFT, Action.MOVE_RIGHT, Action.MOVE_UP, Action.MOVE_DOWN]  # all possible actions
     reward_exit = 10.0  # reward for reaching the exit cell
-    penalty_move = -0.05  # penalty for a move which did not result in finding the exit cell
     penalty_visited = -0.25  # penalty for returning to a cell which was visited earlier
-    penalty_impossible_move = -0.75  # penalty for trying to enter an occupied cell or moving out of the maze
     def __init__(self, maze, start_cell=(0, 0), exit_cell=None):
-        """ Create a new maze game.
-            :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1)
-            :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left)
-            :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right)
         """
         self.maze = maze
-        self.__minimum_reward = -0.5 * self.maze.size  # stop game if accumulated reward is below this threshold
         nrows, ncols = self.maze.shape
         self.cells = [(col, row) for col in range(ncols) for row in range(nrows)]
-        self.empty = [(col, row) for col in range(ncols) for row in range(nrows) if self.maze[row, col] == Cell.EMPTY]
         self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell
         self.empty.remove(self.__exit_cell)
         # Check for impossible maze layout
         if self.__exit_cell not in self.cells:
-            raise Exception("Error: exit cell at {} is not inside maze".format(self.__exit_cell))
         if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED:
-            raise Exception("Error: exit cell at {} is not free".format(self.__exit_cell))
         # Variables for rendering using Matplotlib
         self.__render = Render.NOTHING  # what to render
@@ -94,17 +115,21 @@ class Maze:
         self.reset(start_cell)
     def reset(self, start_cell=(0, 0)):
-        """ Reset the maze to its initial state and place the agent at start_cell.
-            :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left)
-            :return: new state after reset
         """
         if start_cell not in self.cells:
-            raise Exception("Error: start cell at {} is not inside maze".format(start_cell))
         if self.maze[start_cell[::-1]] == Cell.OCCUPIED:
             raise Exception("Error: start cell at {} is not free".format(start_cell))
         if start_cell == self.__exit_cell:
-            raise Exception("Error: start- and exit cell cannot be the same {}".format(start_cell))
         self.__previous_cell = self.__current_cell = start_cell
         self.__total_reward = 0.0  # accumulated reward
@@ -119,10 +144,18 @@ class Maze:
             self.__ax1.set_yticks(np.arange(0.5, ncols, step=1))
             self.__ax1.set_yticklabels([])
             self.__ax1.grid(True)
-            self.__ax1.plot(*self.__current_cell, "rs", markersize=30)  # start is a big red square
-            self.__ax1.text(*self.__current_cell, "Start", ha="center", va="center", color="white")
-            self.__ax1.plot(*self.__exit_cell, "gs", markersize=30)  # exit is a big green square
-            self.__ax1.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white")
             self.__ax1.imshow(self.maze, cmap="binary")
             self.__ax1.get_figure().canvas.draw()
             self.__ax1.get_figure().canvas.flush_events()
@@ -130,35 +163,43 @@ class Maze:
         return self.__observe()
     def __draw(self):
-        """ Draw a line from the agents previous cell to its current cell. """
-        self.__ax1.plot(*zip(*[self.__previous_cell, self.__current_cell]), "bo-")  # previous cells are blue dots
         self.__ax1.plot(*self.__current_cell, "ro")  # current cell is a red dot
         self.__ax1.get_figure().canvas.draw()
         self.__ax1.get_figure().canvas.flush_events()
     def step(self, action):
-        """ Move the agent according to 'action' and return the new state, reward and game status.
-            :param Action action: the agent will move in this direction
-            :return: state, reward, status
         """
         reward = self.__execute(action)
         self.__total_reward += reward
         status = self.__status()
         state = self.__observe()
-        logging.debug("action: {:10s} | reward: {: .2f} | status: {}".format(Action(action).name, reward, status))
         return state, reward, status
     def __execute(self, action):
-        """ Execute action and collect the reward or penalty.
-            :param Action action: direction in which the agent will move
-            :return float: reward or penalty which results from the action
         """
         possible_actions = self.__possible_actions(self.__current_cell)
         if not possible_actions:
-            reward = self.__minimum_reward - 1  # cannot move anywhere, force end of game
         elif action in possible_actions:
             col, row = self.__current_cell
             if action == Action.MOVE_LEFT:
@@ -179,21 +220,27 @@ class Maze:
             if self.__current_cell == self.__exit_cell:
                 reward = Maze.reward_exit  # maximum reward when reaching the exit cell
             elif self.__current_cell in self.__visited:
-                reward = Maze.penalty_visited  # penalty when returning to a cell which was visited earlier
             else:
-                reward = Maze.penalty_move  # penalty for a move which did not result in finding the exit cell
             self.__visited.add(self.__current_cell)
         else:
-            reward = Maze.penalty_impossible_move  # penalty for trying to enter an occupied cell or move out of the maze
         return reward
     def __possible_actions(self, cell=None):
-        """ Create a list with all possible actions from 'cell', avoiding the maze's edges and walls.
-            :param tuple cell: location of the agent (optional, else use current cell)
-            :return list: all possible actions
         """
         if cell is None:
             col, row = self.__current_cell
@@ -206,42 +253,48 @@ class Maze:
         nrows, ncols = self.maze.shape
         if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED):
             possible_actions.remove(Action.MOVE_UP)
-        if row == nrows - 1 or (row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED):
             possible_actions.remove(Action.MOVE_DOWN)
         if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED):
             possible_actions.remove(Action.MOVE_LEFT)
-        if col == ncols - 1 or (col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED):
             possible_actions.remove(Action.MOVE_RIGHT)
         return possible_actions
     def __status(self):
-        """ Return the game status.
-            :return Status: current game status (WIN, LOSE, PLAYING)
         """
         if self.__current_cell == self.__exit_cell:
             return Status.WIN
-        if self.__total_reward < self.__minimum_reward:  # force end of game after too much loss
             return Status.LOSE
         return Status.PLAYING
     def __observe(self):
-        """ Return the state of the maze - in this game the agents current location.
-            :return numpy.array [1][2]: agents current location
         """
         return np.array([[*self.__current_cell]])
     def play(self, model, start_cell=(0, 0)):
-        """ Play a single game, choosing the next move based a prediction from 'model'.
-            :param class AbstractModel model: the prediction model to use
-            :param tuple start_cell: agents initial cell (optional, else upper left)
-            :return Status: WIN, LOSE
         """
         self.reset(start_cell)
@@ -254,9 +307,11 @@ class Maze:
                 return status
     def check_win_all(self, model):
-        """ Check if the model wins from all possible starting cells. """
         previous = self.__render
-        self.__render = Render.NOTHING  # avoid rendering anything during execution of the check games
         win = 0
         lose = 0
@@ -269,14 +324,18 @@ class Maze:
         self.__render = previous  # restore previous rendering setting
-        logging.info("won: {} | lost: {} | win rate: {:.5f}".format(win, lose, win / (win + lose)))
         result = True if lose == 0 else False
         return result, win / (win + lose)
     def render_q(self, model):
-        """ Render the recommended action(s) for each cell as provided by 'model'.
         :param class AbstractModel model: the prediction model to use
         """
@@ -293,8 +352,12 @@ class Maze:
             self.__ax2.set_yticks(np.arange(0.5, ncols, step=1))
             self.__ax2.set_yticklabels([])
             self.__ax2.grid(True)
-            self.__ax2.plot(*self.__exit_cell, "gs", markersize=30)  # exit is a big green square
-            self.__ax2.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white")
             for cell in self.empty:
                 q = model.q(cell) if model is not None else [0, 0, 0, 0]
@@ -315,9 +378,18 @@ class Maze:
                     # color (from red to green) represents the certainty of the preferred action(s)
                     maxv = 1
                     minv = -1
-                    color = clip((q[action] - minv) / (maxv - minv))  # normalize in [-1, 1]
-                    self.__ax2.arrow(*cell, dx, dy, color=(1 - color, color, 0), head_width=0.2, head_length=0.1)
             self.__ax2.imshow(self.maze, cmap="binary")
-            self.__ax2.get_figure().canvas.draw()

 class Maze:
+    """A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze.
+    The layout of the maze and the rules how to move through it are called the environment. An agent is placed
+    at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every
+    action results in a reward or penalty which are accumulated during the game. Every move gives a small
+    penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into
+    a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The
+    game always reaches a terminal state; the agent either wins or loses. Obviously reaching the exit means
+    winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is
+    assumed to wander around clueless and loses.
+    A note on cell coordinates:
+    The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze.
+    This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze
+    itself is stored as a 2D numpy array so cells are accessed via [row, col]. To convert a (col, row) tuple
+    to (row, col) use (col, row)[::-1]
     """
+    actions = [
+        Action.MOVE_LEFT,
+        Action.MOVE_RIGHT,
+        Action.MOVE_UP,
+        Action.MOVE_DOWN,
+    ]  # all possible actions
     reward_exit = 10.0  # reward for reaching the exit cell
+    penalty_move = (
+        -0.05
+    )  # penalty for a move which did not result in finding the exit cell
     penalty_visited = -0.25  # penalty for returning to a cell which was visited earlier
+    penalty_impossible_move = (
+        -0.75
+    )  # penalty for trying to enter an occupied cell or moving out of the maze
     def __init__(self, maze, start_cell=(0, 0), exit_cell=None):
+        """Create a new maze game.
+        :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1)
+        :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left)
+        :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right)
         """
         self.maze = maze
+        self.__minimum_reward = (
+            -0.5 * self.maze.size
+        )  # stop game if accumulated reward is below this threshold
         nrows, ncols = self.maze.shape
         self.cells = [(col, row) for col in range(ncols) for row in range(nrows)]
+        self.empty = [
+            (col, row)
+            for col in range(ncols)
+            for row in range(nrows)
+            if self.maze[row, col] == Cell.EMPTY
+        ]
         self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell
         self.empty.remove(self.__exit_cell)
         # Check for impossible maze layout
         if self.__exit_cell not in self.cells:
+            raise Exception(
+                "Error: exit cell at {} is not inside maze".format(self.__exit_cell)
+            )
         if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED:
+            raise Exception(
+                "Error: exit cell at {} is not free".format(self.__exit_cell)
+            )
         # Variables for rendering using Matplotlib
         self.__render = Render.NOTHING  # what to render
         self.reset(start_cell)
     def reset(self, start_cell=(0, 0)):
+        """Reset the maze to its initial state and place the agent at start_cell.
+        :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left)
+        :return: new state after reset
         """
         if start_cell not in self.cells:
+            raise Exception(
+                "Error: start cell at {} is not inside maze".format(start_cell)
+            )
         if self.maze[start_cell[::-1]] == Cell.OCCUPIED:
             raise Exception("Error: start cell at {} is not free".format(start_cell))
         if start_cell == self.__exit_cell:
+            raise Exception(
+                "Error: start- and exit cell cannot be the same {}".format(start_cell)
+            )
         self.__previous_cell = self.__current_cell = start_cell
         self.__total_reward = 0.0  # accumulated reward
             self.__ax1.set_yticks(np.arange(0.5, ncols, step=1))
             self.__ax1.set_yticklabels([])
             self.__ax1.grid(True)
+            self.__ax1.plot(
+                *self.__current_cell, "rs", markersize=30
+            )  # start is a big red square
+            self.__ax1.text(
+                *self.__current_cell, "Start", ha="center", va="center", color="white"
+            )
+            self.__ax1.plot(
+                *self.__exit_cell, "gs", markersize=30
+            )  # exit is a big green square
+            self.__ax1.text(
+                *self.__exit_cell, "Exit", ha="center", va="center", color="white"
+            )
             self.__ax1.imshow(self.maze, cmap="binary")
             self.__ax1.get_figure().canvas.draw()
             self.__ax1.get_figure().canvas.flush_events()
         return self.__observe()
     def __draw(self):
+        """Draw a line from the agent's previous cell to its current cell."""
+        self.__ax1.plot(
+            *zip(*[self.__previous_cell, self.__current_cell]), "bo-"
+        )  # previous cells are blue dots
         self.__ax1.plot(*self.__current_cell, "ro")  # current cell is a red dot
         self.__ax1.get_figure().canvas.draw()
         self.__ax1.get_figure().canvas.flush_events()
     def step(self, action):
+        """Move the agent according to 'action' and return the new state, reward and game status.
+        :param Action action: the agent will move in this direction
+        :return: state, reward, status
         """
         reward = self.__execute(action)
         self.__total_reward += reward
         status = self.__status()
         state = self.__observe()
+        logging.debug(
+            "action: {:10s} | reward: {: .2f} | status: {}".format(
+                Action(action).name, reward, status
+            )
+        )
         return state, reward, status
     def __execute(self, action):
+        """Execute action and collect the reward or penalty.
+        :param Action action: direction in which the agent will move
+        :return float: reward or penalty which results from the action
         """
         possible_actions = self.__possible_actions(self.__current_cell)
         if not possible_actions:
+            reward = (
+                self.__minimum_reward - 1
+            )  # cannot move anywhere, force end of game
         elif action in possible_actions:
             col, row = self.__current_cell
             if action == Action.MOVE_LEFT:
             if self.__current_cell == self.__exit_cell:
                 reward = Maze.reward_exit  # maximum reward when reaching the exit cell
             elif self.__current_cell in self.__visited:
+                reward = (
+                    Maze.penalty_visited
+                )  # penalty when returning to a cell which was visited earlier
             else:
+                reward = (
+                    Maze.penalty_move
+                )  # penalty for a move which did not result in finding the exit cell
             self.__visited.add(self.__current_cell)
         else:
+            reward = (
+                Maze.penalty_impossible_move
+            )  # penalty for trying to enter an occupied cell or move out of the maze
         return reward
     def __possible_actions(self, cell=None):
+        """Create a list with all possible actions from 'cell', avoiding the maze's edges and walls.
+        :param tuple cell: location of the agent (optional, else use current cell)
+        :return list: all possible actions
         """
         if cell is None:
             col, row = self.__current_cell
         nrows, ncols = self.maze.shape
         if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED):
             possible_actions.remove(Action.MOVE_UP)
+        if row == nrows - 1 or (
+            row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED
+        ):
             possible_actions.remove(Action.MOVE_DOWN)
         if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED):
             possible_actions.remove(Action.MOVE_LEFT)
+        if col == ncols - 1 or (
+            col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED
+        ):
             possible_actions.remove(Action.MOVE_RIGHT)
         return possible_actions
     def __status(self):
+        """Return the game status.
+        :return Status: current game status (WIN, LOSE, PLAYING)
         """
         if self.__current_cell == self.__exit_cell:
             return Status.WIN
+        if (
+            self.__total_reward < self.__minimum_reward
+        ):  # force end of game after too much loss
             return Status.LOSE
         return Status.PLAYING
     def __observe(self):
+        """Return the state of the maze - in this game the agent's current location.
+        :return numpy.array [1][2]: agent's current location
         """
         return np.array([[*self.__current_cell]])
     def play(self, model, start_cell=(0, 0)):
+        """Play a single game, choosing the next move based on a prediction from 'model'.
+        :param class AbstractModel model: the prediction model to use
+        :param tuple start_cell: agent's initial cell (optional, else upper left)
+        :return Status: WIN, LOSE
         """
         self.reset(start_cell)
                 return status
     def check_win_all(self, model):
+        """Check if the model wins from all possible starting cells."""
         previous = self.__render
+        self.__render = (
+            Render.NOTHING
+        )  # avoid rendering anything during execution of the check games
         win = 0
         lose = 0
         self.__render = previous  # restore previous rendering setting
+        logging.info(
+            "won: {} | lost: {} | win rate: {:.5f}".format(
+                win, lose, win / (win + lose)
+            )
+        )
         result = True if lose == 0 else False
         return result, win / (win + lose)
     def render_q(self, model):
+        """Render the recommended action(s) for each cell as provided by 'model'.
         :param class AbstractModel model: the prediction model to use
         """
             self.__ax2.set_yticks(np.arange(0.5, ncols, step=1))
             self.__ax2.set_yticklabels([])
             self.__ax2.grid(True)
+            self.__ax2.plot(
+                *self.__exit_cell, "gs", markersize=30
+            )  # exit is a big green square
+            self.__ax2.text(
+                *self.__exit_cell, "Exit", ha="center", va="center", color="white"
+            )
             for cell in self.empty:
                 q = model.q(cell) if model is not None else [0, 0, 0, 0]
                     # color (from red to green) represents the certainty of the preferred action(s)
                     maxv = 1
                     minv = -1
+                    color = clip(
+                        (q[action] - minv) / (maxv - minv)
+                    )  # rescale q[action] from [minv, maxv] = [-1, 1] onto [0, 1]
+                    self.__ax2.arrow(
+                        *cell,
+                        dx,
+                        dy,
+                        color=(1 - color, color, 0),
+                        head_width=0.2,
+                        head_length=0.1,
+                    )
             self.__ax2.imshow(self.maze, cmap="binary")
+            self.__ax2.get_figure().canvas.draw()

src/envs/maze_env/server/maze_environment.py CHANGED Viewed

@@ -42,7 +42,7 @@ class MazeEnvironment(Environment):
         self,
         maze_array: np.ndarray,
         start_cell: Tuple[int, int] = (0, 0),
-        exit_cell: Optional[Tuple[int, int]] = (7,7),
     ):
         # Create underlying Maze instance (matches your working code)
         self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell)
@@ -54,16 +54,26 @@ class MazeEnvironment(Environment):
     def reset(self) -> MazeObservation:
         """Reset environment and return initial observation (MazeObservation)."""
-        observation = self.env.reset()  # typically returns np.array([row, col]) or similar
         # initialize episode state
         self.state = MazeState(episode_id="episode_1", step_count=0, done=False)
         # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields
-        pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation)
         self.total_reward = 0
         legal_actions = self._compute_legal_actions(pos_list[0])
-        return MazeObservation(position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions)
     def step(self, action: MazeAction) -> MazeObservation:
         """
@@ -91,9 +101,9 @@ class MazeEnvironment(Environment):
         }
         # --- Reward settings ---
-        reward_exit = 10.0          # reward for reaching the exit cell
-        reward_move = 0.05        # reward for a move that didn't find the exit but is valid
-        penalty_visited = -0.25     # penalty for revisiting a cell
         penalty_impossible = -0.75  # penalty for invalid move (wall/outside)
         dr, dc = move_map.get(action.action, (0, 0))
@@ -153,10 +163,9 @@ class MazeEnvironment(Environment):
             position=pos_list,
             total_reward=self.total_reward,
             legal_actions=legal_actions,
-            done=done
         )
     def state(self) -> Optional[MazeState]:
         """Return the current MazeState object."""
         return self.state
@@ -186,4 +195,4 @@ class MazeEnvironment(Environment):
         if col < ncols - 1 and self.env.maze[row, col + 1] == 0:
             actions.append(3)
-        return actions

         self,
         maze_array: np.ndarray,
         start_cell: Tuple[int, int] = (0, 0),
+        exit_cell: Optional[Tuple[int, int]] = (7, 7),
     ):
         # Create underlying Maze instance (matches your working code)
         self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell)
     def reset(self) -> MazeObservation:
         """Reset environment and return initial observation (MazeObservation)."""
+        observation = (
+            self.env.reset()
+        )  # Maze.reset() returns np.array([[col, row]]), i.e. shape (1, 2)
         # initialize episode state
         self.state = MazeState(episode_id="episode_1", step_count=0, done=False)
         # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields
+        pos_list = (
+            observation.tolist()
+            if hasattr(observation, "tolist")
+            else list(observation)
+        )
         self.total_reward = 0
         legal_actions = self._compute_legal_actions(pos_list[0])
+        return MazeObservation(
+            position=pos_list,
+            total_reward=self.total_reward,
+            legal_actions=legal_actions,
+        )
     def step(self, action: MazeAction) -> MazeObservation:
         """
         }
         # --- Reward settings ---
+        reward_exit = 10.0  # reward for reaching the exit cell
+        reward_move = 0.05  # reward for a move that didn't find the exit but is valid
+        penalty_visited = -0.25  # penalty for revisiting a cell
         penalty_impossible = -0.75  # penalty for invalid move (wall/outside)
         dr, dc = move_map.get(action.action, (0, 0))
             position=pos_list,
             total_reward=self.total_reward,
             legal_actions=legal_actions,
+            done=done,
         )
     def state(self) -> Optional[MazeState]:
         """Return the current MazeState object."""
         return self.state
         if col < ncols - 1 and self.env.maze[row, col + 1] == 0:
             actions.append(3)
+        return actions

src/envs/maze_env/server/mazearray.py CHANGED Viewed

@@ -1,13 +1,15 @@
 import numpy as np
 # Maze
-maze = np.array([
-    [0, 1, 0, 0, 0, 0, 0, 0],
-    [0, 1, 0, 1, 0, 1, 0, 0],
-    [0, 0, 0, 1, 1, 0, 1, 0],
-    [0, 1, 0, 1, 0, 0, 0, 0],
-    [1, 0, 0, 1, 0, 1, 0, 0],
-    [0, 0, 0, 1, 0, 1, 1, 1],
-    [0, 1, 1, 0, 0, 0, 0, 0],
-    [0, 0, 0, 0, 0, 1, 0, 0]
-])

 import numpy as np
 # Maze
+maze = np.array(
+    [
+        [0, 1, 0, 0, 0, 0, 0, 0],
+        [0, 1, 0, 1, 0, 1, 0, 0],
+        [0, 0, 0, 1, 1, 0, 1, 0],
+        [0, 1, 0, 1, 0, 0, 0, 0],
+        [1, 0, 0, 1, 0, 1, 0, 0],
+        [0, 0, 0, 1, 0, 1, 1, 1],
+        [0, 1, 1, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 1, 0, 0],
+    ]
+)