The "Tic-Tac-Toe" Game.


Original project: GitHub.

Project description:

The project implements an environment for playing tic-tac-toe using the Gymnasium library. The environment models the game logic and enforces the rules: move legality, winner detection, and alternation of turns between the players. Its main methods are step(action) to make a move, reset() to clear the board, and render() to display the current state of the game. A generator of the available moves is also implemented.

For testing and demonstration, moves are chosen at random with the random_move() function. The game is simulated over a series of episodes (300 games) in which the players move until the game ends. After each episode the rewards are computed and the results are displayed.


Environment implementation:

An environment is the setting in which an agent performs actions to solve a task.

In [131]:
import gymnasium as gym
from gymnasium import spaces


class TicTacToeEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    symbols = ['O', ' ', 'X']

    def __init__(self) -> None:
        super().__init__()
        self.action_space = spaces.Discrete(9)
        self.observation_space = spaces.Discrete(9*3*2)  # flattened
        self.reset()
    
    def step(self, action):
        done = False
        reward = 0

        p, square = action
        
        # check move legality
        board = self.state['board']
        proposed = board[square]
        om = self.state['on_move']
        
        if proposed != 0:  # illegal move: the square is already occupied
            print("illegal move ", action, ". (square occupied): ", square)
            done = True
            reward = -1 * om  # the reward goes to the player who did NOT make the illegal move
        elif p != om:  # illegal move: it is not this player's turn
            print("illegal move  ", action, " not on move: ", p)
            done = True
            reward = -1 * om  # the reward goes to the player who did NOT make the illegal move
        else:
            board[square] = p
            self.state['on_move'] = -p

        # check game over
        for i in range(3):
            # horizontals and verticals
            if ((board[i * 3] == p and board[i * 3 + 1] == p and board[i * 3 + 2] == p)
                or (board[i + 0] == p and board[i + 3] == p and board[i + 6] == p)):
                reward = p
                done = True
                break
        # diagonals
        if ((board[0] == p and board[4] == p and board[8] == p)
                or (board[2] == p and board[4] == p and board[6] == p)):
            reward = p
            done = True
                
        return self.state, reward, done, {}
    
    def reset(self):
        self.state = {}
        self.state['board'] = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.state['on_move'] = 1
        return self.state
    
    def render(self, close=False):
        if close:
            return
        print("on move: " , self.symbols[self.state['on_move']+1])
        for i in range(9):
            print(self.symbols[self.state['board'][i]+1], end=" ")
            if i % 3 == 2:
                print()
        print()
                
    def move_generator(self):
        moves = []
        for i in range(9):
            if self.state['board'][i] == 0:
                p = self.state['on_move']
                m = [p, i]
                moves.append(m)
        return moves

Implementation of the main simulation loop:

In [132]:
import random


def random_move(moves):
    m = random.choice(moves)
    return m


env = TicTacToeEnv()

alpha = 0.01  # unused in this random-play baseline
beta = 0.01   # unused in this random-play baseline

num_episodes = 300

collected_rewards = []
oom = 1

for i in range(num_episodes):
    state = env.reset()
    
    total_reward = 0
    
    done = False
    om = oom

    for j in range(9):
        moves = env.move_generator()
        if not moves:
            break
        
        if len(moves) == 1:
            # only a single possible move
            move = moves[0]
        else:
            move = random_move(moves)
            
        next_state, reward, done, info = env.step(move)
        total_reward += reward
        state = next_state
        
        if (i + 1) % 50 == 0: 
            env.render()
        
        if done:
            break
        
        om = -om

    collected_rewards.append(total_reward)
    
    if (i + 1) % 50 == 0: 
        print(f"Episode {i+1}, Total Reward: {total_reward}")
        average_reward = sum(collected_rewards) / len(collected_rewards)
        print(f"Average Reward: {average_reward}\n")
on move:  O
      
      
X     

on move:  X
      
O     
X     

on move:  O
  X   
O     
X     

on move:  X
O X   
O     
X     

on move:  O
O X   
O     
X   X 

on move:  X
O X O 
O     
X   X 

on move:  O
O X O 
O X   
X   X 

on move:  X
O X O 
O X O 
X   X 

on move:  O
O X O 
O X O 
X X X 

Episode 50, Total Reward: 1
Average Reward: 0.34

on move:  O
  X   
      
      

on move:  X
  X   
      
O     

on move:  O
  X   
X     
O     

on move:  X
O X   
X     
O     

on move:  O
O X X 
X     
O     

on move:  X
O X X 
X   O 
O     

on move:  O
O X X 
X X O 
O     

on move:  X
O X X 
X X O 
O   O 

on move:  O
O X X 
X X O 
O X O 

Episode 100, Total Reward: 1
Average Reward: 0.36

on move:  O
X     
      
      

on move:  X
X     
      
O     

on move:  O
X     
X     
O     

on move:  X
X     
X     
O   O 

on move:  O
X X   
X     
O   O 

on move:  X
X X   
X     
O O O 

Episode 150, Total Reward: -1
Average Reward: 0.35333333333333333

on move:  O
      
      
  X   

on move:  X
      
    O 
  X   

on move:  O
  X   
    O 
  X   

on move:  X
O X   
    O 
  X   

on move:  O
O X X 
    O 
  X   

on move:  X
O X X 
O   O 
  X   

on move:  O
O X X 
O   O 
X X   

on move:  X
O X X 
O   O 
X X O 

on move:  O
O X X 
O X O 
X X O 

Episode 200, Total Reward: 1
Average Reward: 0.355

on move:  O
  X   
      
      

on move:  X
  X   
      
O     

on move:  O
  X   
      
O   X 

on move:  X
O X   
      
O   X 

on move:  O
O X   
      
O X X 

on move:  X
O X O 
      
O X X 

on move:  O
O X O 
X     
O X X 

on move:  X
O X O 
X O   
O X X 

Episode 250, Total Reward: -1
Average Reward: 0.384

on move:  O
X     
      
      

on move:  X
X     
  O   
      

on move:  O
X     
  O   
  X   

on move:  X
X     
  O O 
  X   

on move:  O
X     
X O O 
  X   

on move:  X
X O   
X O O 
  X   

on move:  O
X O   
X O O 
  X X 

on move:  X
X O O 
X O O 
  X X 

on move:  O
X O O 
X O O 
X X X 

Episode 300, Total Reward: 1
Average Reward: 0.36333333333333334
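
Since the loop above only reports the average reward, the per-episode outcomes can also be tallied directly. A minimal sketch, assuming collected_rewards from the cell above (the final reward of an episode is +1 when X wins, -1 when O wins, and 0 for a draw):

from collections import Counter

counts = Counter(collected_rewards)  # per-episode final rewards
print(f"X wins: {counts[1]}, O wins: {counts[-1]}, draws: {counts[0]}")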

Updated project implementation.

Project description:

The environment is rewritten against the current Gymnasium API, and the random player is replaced by a Q-learning agent that learns by interacting with the environment.

Main changes: the board is now stored as a NumPy array with a Box observation space; step() follows the new API and returns (observation, reward, terminated, truncated, info), with draws detected explicitly; win detection is factored out into check_winner(); moves are chosen by a TicTacToeAgent with an epsilon-greedy policy and a tabular Q-learning update; per-episode statistics (wins of X, wins of O, draws) are collected and plotted cumulatively with matplotlib.


Environment implementation:

An environment is the setting in which an agent performs actions to solve a task.

In [133]:
import numpy as np


class TicTacToeEnv(gym.Env):
    metadata: dict[str, list[str]] = {"render_modes": ["ansi"]}
    
    def __init__(self) -> None:
        self.action_space = spaces.Discrete(9)  # 9 cells
        self.observation_space = spaces.Box(low=-1, high=1, shape=(9,), dtype=int)
        self.symbols: dict[int, str] = {1: "X", -1: "O", 0: " "}

        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.board = np.zeros(9, dtype=int)  # empty board
        self.current_player = 1  # player X moves first
        return self.board.copy(), {}  # new-style API: (observation, info)

    def step(self, action):
        if self.board[action] != 0:
            # Illegal move (the cell is already occupied)
            reward = -self.current_player  # penalty: the reward goes to the player who did NOT make the illegal move
            self.current_player *= -1  # pass the turn to the next player
            return self.board.copy(), reward, False, False, {}

        # Make the move
        self.board[action] = self.current_player

        # Check for a win
        if self.check_winner(self.current_player):
            reward = self.current_player
            terminated = True
        elif np.all(self.board != 0):
            # Draw
            reward = 0
            terminated = True
        else:
            # The game continues
            reward = 0
            terminated = False
            self.current_player *= -1  # switch turns

        # Return a copy of the board so that observations held by the caller
        # are not mutated in place by subsequent moves
        return self.board.copy(), reward, terminated, False, {}

    def check_winner(self, player):
        winning_positions: list[tuple[int, int, int]] = [
            (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
            (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
            (0, 4, 8), (2, 4, 6),             # diagonals
        ]
        for positions in winning_positions:
            if all(self.board[pos] == player for pos in positions):
                return True
        return False

    def render(self):
        board = self.board.reshape(3, 3)
        print("\n".join(" | ".join(self.symbols[cell] for cell in row) for row in board))
        print()
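
A quick sanity check of the updated environment. This is a minimal sketch with an arbitrary move sequence, meant only to illustrate the five-element return value of step() and the (observation, info) pair returned by reset():

env = TicTacToeEnv()
obs, info = env.reset()
for a in (0, 3, 1, 4, 2):  # X plays the top row, O replies in the middle row
    obs, reward, terminated, truncated, info = env.step(a)
    if terminated:
        break
env.render()
print("reward:", reward, "terminated:", terminated)  # expected: reward 1 (X wins), terminated True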

Agent implementation:

An agent is the learning entity that performs actions in an environment to receive rewards, making decisions based on its goals and the information it observes.

In [134]:
class TicTacToeAgent:
    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.q_table = {}  # Q-table keyed by (state, action) pairs
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon

    def get_state_key(self, state):
        return tuple(state)

    def select_action(self, state, possible_actions):
        state_key = self.get_state_key(state)
        # The Q-table is keyed by (state, action) pairs, so fall back to a random
        # move when none of the available actions has been tried in this state yet
        unseen = all((state_key, a) not in self.q_table for a in possible_actions)
        if np.random.rand() < self.epsilon or unseen:
            # Exploration: random move
            return np.random.choice(possible_actions)
        # Exploitation: pick the action with the highest Q-value
        return max(possible_actions, key=lambda a: self.q_table.get((state_key, a), 0))

    def update(self, state, action, reward, next_state, next_possible_actions, terminated):
        state_key = self.get_state_key(state)
        next_state_key = self.get_state_key(next_state)

        if (state_key, action) not in self.q_table:
            self.q_table[(state_key, action)] = 0

        if terminated:
            future_reward = 0
        else:
            future_reward = max(self.q_table.get((next_state_key, a), 0) for a in next_possible_actions)

        # Q-learning update rule
        td_target = reward + self.discount_factor * future_reward
        td_error = td_target - self.q_table[(state_key, action)]
        self.q_table[(state_key, action)] += self.learning_rate * td_error
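
The update() method implements the standard tabular Q-learning rule, where learning_rate is $\alpha$ and discount_factor is $\gamma$:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \bigl[\, r + \gamma \max_{a'} Q(s', a') - Q(s, a) \,\bigr]$$

The bracketed term is td_error in the code; the maximum runs over the actions available in the next state and is taken to be 0 once the episode has terminated.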

Implementation of the main training loop:

In [135]:
import matplotlib.pyplot as plt


# Training parameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1
episodes = 300

# Initialize the environment and the agent
env = TicTacToeEnv()
agent = TicTacToeAgent(learning_rate, discount_factor, epsilon)

# Initialize the metrics
statistics = {
    "Episode": [],
    "Total Reward": [],
    "Wins_X": [],
    "Wins_O": [],
    "Draws": [],
}

for episode in range(episodes):
    state, _ = env.reset()  # reset() now returns (observation, info)
    done = False
    total_reward = 0
    wins_X = 0
    wins_O = 0
    draws = 0

    while not done:
        possible_actions = [i for i in range(9) if state[i] == 0]
        action = agent.select_action(state, possible_actions)
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Update the Q-table using the actions available in the next state
        next_possible_actions = [i for i in range(9) if next_state[i] == 0]
        agent.update(state, action, reward, next_state, next_possible_actions, terminated)

        # Statistics
        total_reward += reward
        if reward == 1:
            wins_X += 1
        elif reward == -1:
            wins_O += 1
        elif reward == 0 and terminated:  # draw
            draws += 1

        state = next_state
        done = terminated

    # Save the per-episode data
    statistics["Episode"].append(episode + 1)
    statistics["Total Reward"].append(total_reward)
    statistics["Wins_X"].append(wins_X)
    statistics["Wins_O"].append(wins_O)
    statistics["Draws"].append(draws)

    # Report progress every 50 episodes
    if (episode + 1) % 50 == 0:
        average_reward = sum(statistics["Total Reward"][-50:]) / 50
        print(f"Average Reward (last 50 episodes): {average_reward}")


# Plot the cumulative results
plt.figure(figsize=(12, 6))

episodes_range = statistics["Episode"]
wins_X = np.cumsum(statistics["Wins_X"])
wins_O = np.cumsum(statistics["Wins_O"])
draws = np.cumsum(statistics["Draws"])

plt.plot(episodes_range, wins_X, label="Wins X", color="red")
plt.plot(episodes_range, wins_O, label="Wins O", color="blue")
plt.plot(episodes_range, draws, label="Draws", color="green")

plt.xlabel("Episodes")
plt.ylabel("Cumulative Count")
plt.title("Cumulative Performance Over Episodes")
plt.legend()
plt.grid()

plt.tight_layout()
plt.show()
Average Reward (last 50 episodes): 0.26
Average Reward (last 50 episodes): 0.26
Average Reward (last 50 episodes): 0.42
Average Reward (last 50 episodes): 0.38
Average Reward (last 50 episodes): 0.38
Average Reward (last 50 episodes): 0.26
[Figure: Cumulative Performance Over Episodes, cumulative counts of X wins, O wins, and draws over the 300 training episodes.]
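
The final cumulative totals shown in the plot can also be printed directly. A minimal sketch, assuming the cumulative arrays computed in the cell above:

print(f"After {episodes} episodes: X wins: {wins_X[-1]}, O wins: {wins_O[-1]}, draws: {draws[-1]}")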