# Source code for chipiron.scripts.generate_datasets.generate_over_boards

"""Generate chess over boards dataset from Lichess PGN dumps."""

import random
from pathlib import Path
from typing import Any

import chess
import chess.pgn
import pandas as pd

from chipiron.scripts.generate_datasets.generate_boards import (
    DEFAULT_RANDOM_SEED,
    ensure_month_pgn,
    iterate_months,
    save_dataset_progress,
)
from chipiron.utils.path_variables import EXTERNAL_DATA_DIR


def is_game_over_position(board: chess.Board) -> bool:
    """
    Tell whether a position is a terminal one that this dataset records.

    Three kinds of ending are recognised: checkmate, stalemate and
    insufficient material. The checks run in that order and stop at the
    first one that holds.

    Args:
        board: The chess board to check

    Returns:
        True if the position is a checkmate, a stalemate, or a draw by
        insufficient material; False otherwise
    """
    return (
        board.is_checkmate()
        or board.is_stalemate()
        or board.is_insufficient_material()
    )


def process_game_for_over_positions(
    game: chess.pgn.Game,
    total_count_move: int,
    the_dic: list[dict[str, Any]],
) -> int:
    """
    Record the final position of a game when it is a game-ending position.

    Only the position at the end of the mainline is inspected, since a
    checkmate, stalemate or insufficient-material draw can only be the last
    position of a game. The mainline moves are counted for every game,
    whether or not its final position is recorded.

    Args:
        game: The chess game to process
        total_count_move: Running total of mainline moves across all games so far
        the_dic: List to append the board record to (mutated in place; at most
            one record is appended per call)

    Returns:
        ``total_count_move`` increased by the number of mainline moves in this game
    """
    # Board at the end of the mainline.
    final_board: chess.Board = game.end().board()

    # Count lazily; the moves themselves are not needed, only how many there are.
    moves_processed = sum(1 for _ in game.mainline_moves())
    total_count_move += moves_processed

    if not is_game_over_position(final_board):
        return total_count_move

    # Evaluate each predicate exactly once and reuse the results for both the
    # summary label and the per-flag columns.
    is_checkmate = final_board.is_checkmate()
    is_stalemate = final_board.is_stalemate()
    is_insufficient_material = final_board.is_insufficient_material()

    # Label priority: checkmate, then stalemate, then insufficient material.
    # "unknown" is kept as a fallback in case the gate above accepts a
    # position that none of the three flags explains.
    if is_checkmate:
        reason = "checkmate"
    elif is_stalemate:
        reason = "stalemate"
    elif is_insufficient_material:
        reason = "insufficient_material"
    else:
        reason = "unknown"

    the_dic.append(
        {
            "fen": final_board.fen(),
            "game_over_reason": reason,
            "result": game.headers.get("Result", None),
            "is_checkmate": is_checkmate,
            "is_stalemate": is_stalemate,
            "is_insufficient_material": is_insufficient_material,
            "is_final_position": True,  # Always true since we only check final positions
            "move_number": moves_processed,
        }
    )

    return total_count_move


def generate_over_boards_dataset_multi_months(
    output_file_path: str,
    max_boards: int = 1_000_000,
    seed: int | None = DEFAULT_RANDOM_SEED,
    start_month: str = "2015-03",
    max_months: int | None = None,
    delete_pgn_after_use: bool = True,
    intermediate_every_games: int = 10_000,
    dest_dir: Path | None = None,
) -> None:
    """Generate the over boards dataset by streaming through monthly Lichess PGN dumps.

    Months are taken one at a time from ``iterate_months(start_month)`` and the
    PGN for each is obtained with ``ensure_month_pgn``. Processing stops as
    soon as ``max_boards`` positions are recorded, ``max_months`` months have
    been processed, or the month iterator is exhausted. Whatever was collected
    is then written with a final ``save_dataset_progress`` call.

    Args:
        output_file_path: Output location forwarded to ``save_dataset_progress``.
        max_boards: Stop once this many game-ending positions are recorded.
        seed: If not None, seeds the ``random`` module; also forwarded to
            ``save_dataset_progress``.
        start_month: First month to process, e.g. ``"2015-03"``.
        max_months: Maximum number of months to process; None means no limit.
        delete_pgn_after_use: Delete each month's PGN file once it has been read.
        intermediate_every_games: Write an intermediate checkpoint every this
            many games; 0 disables intermediate checkpoints.
        dest_dir: Directory for the monthly PGN files. Defaults to
            ``EXTERNAL_DATA_DIR / "lichess_pgn"``; created if missing.
    """
    if seed is not None:
        random.seed(seed)
    if dest_dir is None:
        dest_dir = EXTERNAL_DATA_DIR / "lichess_pgn"
    dest_dir.mkdir(parents=True, exist_ok=True)

    the_dic: list[dict[str, Any]] = []
    months_used: list[str] = []
    count_game = 0
    total_count_move = 0
    recorded_board = 0
    months_processed = 0

    month_iter = iterate_months(start_month)

    while recorded_board < max_boards:
        # Check the month budget before pulling another month off the iterator.
        if max_months is not None and months_processed >= max_months:
            print("Reached max_months limit.")
            break

        # A bare next() would raise StopIteration on an exhausted iterator and
        # skip the final save below; treat exhaustion as a normal stop instead.
        month = next(month_iter, None)
        if month is None:
            print("No more months available.")
            break

        print(f"\n=== Processing month {month} for over boards ===")
        pgn_path = ensure_month_pgn(month, dest_dir)
        months_used.append(month)
        months_processed += 1

        with open(pgn_path, "r", encoding="utf-8") as pgn_file:
            while recorded_board < max_boards:
                game = chess.pgn.read_game(pgn_file)
                if game is None:
                    # End of this month's file.
                    break
                count_game += 1

                # Guard against a zero interval (modulo by zero).
                if (
                    intermediate_every_games
                    and count_game % intermediate_every_games == 0
                ):
                    # The return value is not used: recorded_board is always
                    # refreshed from len(the_dic) right after the game below.
                    save_dataset_progress(
                        the_dic,
                        output_file_path,
                        count_game,
                        total_count_move,
                        max_boards,
                        None,
                        None,
                        f"months:{','.join(months_used)}",
                        0,  # no sampling frequency for over boards
                        0,  # no offset for over boards
                        seed,
                        is_final=False,
                    )

                total_count_move = process_game_for_over_positions(
                    game, total_count_move, the_dic
                )
                recorded_board = len(the_dic)

        if delete_pgn_after_use:
            print(f"Deleting processed PGN for month {month}: {pgn_path}")
            try:
                Path(pgn_path).unlink(missing_ok=True)
            except OSError as exc:
                # Best effort: a leftover file must not abort the run.
                print(f"Warning: could not delete {pgn_path}: {exc}")

    if not the_dic:
        print("No over board positions found to save.")
        return

    # NOTE(review): dataset-level metadata (dataset type, filter criteria,
    # reason distribution) is not persisted here; save_dataset_progress only
    # receives the raw records. Setting DataFrame.attrs on a local frame had
    # no effect on the saved output, so it is not done.
    save_dataset_progress(
        the_dic,
        output_file_path,
        count_game,
        total_count_move,
        max_boards,
        None,
        None,
        f"months:{','.join(months_used)}",
        0,  # no sampling frequency for over boards
        0,  # no offset for over boards
        seed,
        is_final=True,
        months_used=months_used,
    )

    # Print distribution of game-ending reasons
    df = pd.DataFrame.from_dict(the_dic)
    total_rows = len(df)
    print("\nGame-ending reason distribution:")
    for reason, count in df["game_over_reason"].value_counts().items():
        print(f"  {reason}: {count:,} ({count / total_rows * 100:.1f}%)")


def generate_over_boards_dataset_legacy(
    input_pgn_file_path: str,
    output_file_path: str,
    max_boards: int = 1_000_000,
    total_games_in_file: int | None = None,
    total_moves_in_file: int | None = None,
    intermediate_every_games: int = 10_000,
) -> None:
    """
    Generate a dataset of game-ending chess board positions from a single PGN file.
    Legacy function for backwards compatibility.

    Games are read one at a time until the file ends or ``max_boards``
    positions have been recorded. Whatever was collected is then written with
    a final ``save_dataset_progress`` call.

    Args:
        input_pgn_file_path: Path of the PGN file to read (opened as UTF-8).
        output_file_path: Output location forwarded to ``save_dataset_progress``.
        max_boards: Stop once this many game-ending positions are recorded.
        total_games_in_file: Optional total game count, forwarded to
            ``save_dataset_progress``.
        total_moves_in_file: Optional total move count, forwarded to
            ``save_dataset_progress``.
        intermediate_every_games: Write an intermediate checkpoint every this
            many games; 0 disables intermediate checkpoints.
    """
    the_dic: list[dict[str, Any]] = []
    count_game: int = 0
    total_count_move: int = 0
    recorded_board = 0

    with open(input_pgn_file_path, "r", encoding="utf-8") as pgn:
        while recorded_board < max_boards:
            game: chess.pgn.Game | None = chess.pgn.read_game(pgn)
            if game is None:
                print("GAME NONE")
                break

            # Count only games that were actually read, so the end-of-file
            # read neither inflates the total nor triggers a checkpoint.
            count_game += 1

            # Guard against a zero interval (modulo by zero).
            if (
                intermediate_every_games
                and count_game % intermediate_every_games == 0
            ):
                save_dataset_progress(
                    the_dic,
                    output_file_path,
                    count_game,
                    total_count_move,
                    max_boards,
                    total_games_in_file,
                    total_moves_in_file,
                    input_pgn_file_path,
                    0,  # no sampling frequency for over boards
                    0,  # no offset for over boards
                    None,  # no seed for legacy
                    is_final=False,
                )

            total_count_move = process_game_for_over_positions(
                game, total_count_move, the_dic
            )
            # Refresh after every game so max_boards is enforced exactly
            # rather than only at checkpoint boundaries.
            recorded_board = len(the_dic)

    if not the_dic:
        print("No over board positions found to save.")
        return

    # NOTE(review): dataset-level metadata (dataset type, filter criteria,
    # reason distribution) is not persisted here; save_dataset_progress only
    # receives the raw records. Setting DataFrame.attrs on a local frame had
    # no effect on the saved output, so it is not done.
    save_dataset_progress(
        the_dic,
        output_file_path,
        count_game,
        total_count_move,
        max_boards,
        total_games_in_file,
        total_moves_in_file,
        input_pgn_file_path,
        0,  # no sampling frequency for over boards
        0,  # no offset for over boards
        None,  # no seed for legacy
        is_final=True,
    )

    # Print distribution of game-ending reasons
    df = pd.DataFrame.from_dict(the_dic)
    total_rows = len(df)
    print("\nGame-ending reason distribution:")
    for reason, count in df["game_over_reason"].value_counts().items():
        print(f"  {reason}: {count:,} ({count / total_rows * 100:.1f}%)")


# --- Command-line entry point: monthly download mode by default,
# --- single-file mode when --legacy-file is given ---
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        description="Generate chess over boards dataset via on-the-fly monthly Lichess downloads (dynamic only)"
    )

    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help output keeps the same option ordering.
    option_table = (
        (
            "--start-month",
            {"default": "2015-03", "help": "Start month YYYY-MM for dynamic mode"},
        ),
        (
            "--max-months",
            {
                "type": int,
                "default": None,
                "help": "Maximum number of months to fetch",
            },
        ),
        ("--max-boards", {"type": int, "default": 1_000_000}),
        ("--seed", {"type": int, "default": 0}),
        (
            "--output",
            {"type": str, "default": str(EXTERNAL_DATA_DIR / "over_boards_2025")},
        ),
        (
            "--keep-pgn",
            {
                "action": "store_true",
                "help": "Keep monthly PGN files after processing",
            },
        ),
        (
            "--intermediate-games",
            {
                "type": int,
                "default": 10_000,
                "help": "Games interval for intermediate saves",
            },
        ),
        (
            "--legacy-file",
            {
                "type": str,
                "help": "Use legacy single file mode with specified PGN file path",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    if not opts.legacy_file:
        # Default path: stream monthly dumps.
        print("Running dynamic monthly download mode for over boards")
        generate_over_boards_dataset_multi_months(
            output_file_path=opts.output,
            max_boards=opts.max_boards,
            seed=opts.seed,
            start_month=opts.start_month,
            max_months=opts.max_months,
            delete_pgn_after_use=not opts.keep_pgn,
            intermediate_every_games=opts.intermediate_games,
        )
    else:
        # A PGN path was supplied: process that single file only.
        print(f"Running legacy single-file mode with: {opts.legacy_file}")
        generate_over_boards_dataset_legacy(
            input_pgn_file_path=opts.legacy_file,
            output_file_path=opts.output,
            max_boards=opts.max_boards,
            intermediate_every_games=opts.intermediate_games,
        )