# Лабораторная работа 6

**Крестики-нолики** — исходная среда: <https://github.com/nczempin/gym-tic-tac-toe/tree/master>

#### **Перевод среды на Gymnasium**

In [1]:
import gymnasium as gym
from gymnasium import spaces

class TicTacToeEnv(gym.Env):
    """Two-player tic-tac-toe environment.

    The state is a dict with two entries:

    * ``'board'``   -- list of nine cells in row-major order; ``0`` is an
      empty cell, ``1`` is rendered as ``X`` and ``-1`` as ``O``.
    * ``'on_move'`` -- the player (``1`` or ``-1``) whose turn it is.

    An action is a pair ``(player, square)`` with ``square`` in ``0..8``.

    NOTE(review): ``action_space`` and ``observation_space`` are declared as
    ``Discrete`` even though actions are pairs and the observation is a dict,
    and ``step`` returns a 4-tuple ``(state, reward, done, info)`` rather than
    the 5-tuple described in the Gymnasium API. Both are kept as they are so
    that existing callers that unpack four values keep working.
    """

    # 'render_modes' is the Gymnasium spelling; the legacy key is kept so that
    # code reading the old name does not break.
    metadata = {'render_modes': ['human'], 'render.modes': ['human']}

    # Indexed by cell value + 1: -1 -> 'O', 0 -> ' ', 1 -> 'X'.
    symbols = ['O', ' ', 'X']

    # Every row, column and diagonal of the 3x3 board as index triples.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(9)
        self.observation_space = spaces.Discrete(9 * 3 * 2)
        self.reset()

    def step(self, action):
        """Apply ``action`` and report the outcome.

        Args:
            action: pair ``(player, square)``.

        Returns:
            ``(state, reward, done, info)``. ``reward`` is the mover's value
            (``1`` or ``-1``) when the move completes a line, ``-1 * on_move``
            for an illegal move, and ``0`` otherwise. ``done`` is true after a
            win, an illegal move, or when the board has no empty cell left.
        """
        p, square = action
        board = self.state['board']
        om = self.state['on_move']

        # Report every rule that is violated, then stop without touching the
        # board: an illegal move must not overwrite a cell or pass the turn.
        illegal = False
        if board[square] != 0:
            print(f"Незаконный ход: Клетка {square} занята.")
            illegal = True
        if p != om:
            print(f"Незаконный ход: игрок {p} не ходит")
            illegal = True
        if illegal:
            return self.state, -1 * om, True, {}

        board[square] = p
        self.state['on_move'] = -p

        if self._has_line(board, p):
            return self.state, p, True, {}

        # No winner: the episode ends as a draw once every cell is taken.
        done = all(cell != 0 for cell in board)
        return self.state, 0, done, {}

    @classmethod
    def _has_line(cls, board, p):
        """Return True if player ``p`` occupies a full row, column or diagonal."""
        return any(all(board[i] == p for i in line) for line in cls._WIN_LINES)

    def reset(self, *, seed=None, options=None):
        """Start a new game with an empty board and player ``1`` on move.

        ``seed`` is forwarded to ``gym.Env.reset``; ``options`` is accepted
        for API compatibility and not used.

        Returns:
            ``(state, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.state = {'board': [0] * 9, 'on_move': 1}
        return self.state, {}

    def render(self, close=False):
        """Print the side to move followed by the board, one row per line."""
        if close:
            return
        print("on move: ", self.symbols[self.state['on_move'] + 1])
        board = self.state['board']
        for row_start in range(0, 9, 3):
            for cell in board[row_start:row_start + 3]:
                print(self.symbols[cell + 1], end=" ")
            print()

    def move_generator(self):
        """Return ``[player, square]`` for every empty square, in index order."""
        p = self.state['on_move']
        return [[p, i] for i, cell in enumerate(self.state['board']) if cell == 0]

#### **Реализация агента**

In [2]:
import random

class RandomTicTacToeAgent:
    """Agent that plays by picking one of the offered moves at random."""

    def __init__(self, symbol):
        # Kept on the instance only; get_action does not read it.
        self.symbol = symbol

    def get_action(self, moves):
        """Return one element of ``moves`` selected with ``random.choice``."""
        chosen = random.choice(moves)
        return chosen


#### **Основной цикл обучения**

In [3]:
# Play `num_episodes` games in which a single random agent chooses the moves
# for whichever side is on move, printing the board after every step.
env = TicTacToeEnv()
agent = RandomTicTacToeAgent(symbol=1)
num_episodes = 500
collected_rewards = []
oom = 1  # `om` is reset to this value at the start of every episode

for i in range(num_episodes):
    state, _ = env.reset()
    total_reward, done, om = 0, False, oom

    # The board has nine cells, so an episode takes at most nine moves.
    for j in range(9):
        moves = env.move_generator()
        if not moves:
            break

        # With a single move left there is nothing to choose from.
        move = moves[0] if len(moves) == 1 else agent.get_action(moves)

        next_state, reward, done, info = env.step(move)
        total_reward = total_reward + reward
        state = next_state
        env.render()

        if done:
            break

        om = -om

    collected_rewards.append(total_reward)
    print(f"Episode {i+1}, Total Reward: {total_reward}")

    # Mean of the rewards over all episodes finished so far.
    average_reward = sum(collected_rewards) / len(collected_rewards)
    print(f"Average Reward: {average_reward}")



on move:  O
      
      
    X 
on move:  X
  O   
      
    X 
on move:  O
  O   
X     
    X 
on move:  X
  O   
X O   
    X 
on move:  O
  O   
X O   
  X X 
on move:  X
  O O 
X O   
  X X 
on move:  O
  O O 
X O X 
  X X 
on move:  X
  O O 
X O X 
O X X 
Episode 1, Total Reward: -1
Average Reward: -1.0
on move:  O
      
X     
      
on move:  X
      
X     
O     
on move:  O
      
X     
O X   
on move:  X
      
X O   
O X   
on move:  O
  X   
X O   
O X   
on move:  X
  X   
X O   
O X O 
on move:  O
X X   
X O   
O X O 
on move:  X
X X O 
X O   
O X O 
Episode 2, Total Reward: -1
Average Reward: -1.0
on move:  O
X     
      
      
on move:  X
X     
      
O     
on move:  O
X     
  X   
O     
on move:  X
X     
  X O 
O     
on move:  O
X     
  X O 
O X   
on move:  X
X     
  X O 
O X O 
on move:  O
X X   
  X O 
O X O 
Episode 3, Total Reward: 1
Average Reward: -0.3333333333333333
on move:  O
      
      
  X   
on move:  X
      
O     
  X   
on move:  O
  