#!/usr/bin/env python3

# Check if we should be using the virtual environment's Python interpreter
import os
import sys
from pathlib import Path

def check_and_reexec_with_venv():
    """Re-execute this script with the r1setup virtual environment's Python.

    Nothing happens when the current interpreter already runs inside a
    virtual environment or when the ``R1SETUP_NO_VENV`` environment
    variable is set to a non-empty value. Otherwise the interpreter at
    ``<home>/.ratio1/r1_setup/.r1_venv/bin/python3`` is looked up, where
    ``<home>`` is the home directory of ``SUDO_USER`` (POSIX only) or of
    the current user. If it exists and differs from ``sys.executable``,
    the process is replaced via ``os.execv`` with the same arguments.

    Returns:
        False whenever the function returns at all. After a successful
        ``os.execv`` the process image is replaced and this call never
        returns.
    """
    # Skip if we're already running from venv or explicitly told not to use venv
    in_venv = hasattr(sys, 'real_prefix') or (
        hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix
    )
    if in_venv or os.environ.get('R1SETUP_NO_VENV'):
        return False

    # Get the real user's home directory (handles sudo scenarios)
    real_home = None
    sudo_user = os.environ.get('SUDO_USER')
    if sudo_user and os.name == 'posix':
        import pwd
        try:
            real_home = Path(pwd.getpwnam(sudo_user).pw_dir)
        except KeyError:
            # SUDO_USER names an account missing from the password database;
            # fall back to the current user's home instead of crashing at startup.
            real_home = None
    if real_home is None:
        real_home = Path.home()

    # Check if virtual environment Python exists
    venv_python = real_home / '.ratio1' / 'r1_setup' / '.r1_venv' / 'bin' / 'python3'

    if venv_python.exists() and str(venv_python) != sys.executable:
        # Re-execute with virtual environment Python
        try:
            os.execv(str(venv_python), [str(venv_python)] + sys.argv)
        except OSError:
            # If exec fails, continue with current Python
            pass

    return False

# Try to re-execute with virtual environment Python.
# This runs at import time, before the remaining imports below (including the
# third-party `yaml`). On a successful re-exec the call does not return;
# otherwise execution simply continues with the current interpreter.
check_and_reexec_with_venv()

import subprocess
import copy
import yaml
import getpass
import re
import json
import shlex
import urllib.request
import urllib.error
import shutil
import tempfile
import stat
import ssl
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Any, Optional, List, Tuple
from datetime import datetime

# Import version from ver.py
# CLI_VERSION is the version string of this CLI; it is sent in the
# User-Agent header of the update requests made by VersionManager.
try:
    # Try to import from the same directory as this script
    script_dir = Path(__file__).parent
    # Prepending puts the script directory first on the module search path,
    # so the sibling ver.py is found before any other module named `ver`.
    # The entry stays on sys.path for the rest of the run.
    sys.path.insert(0, str(script_dir))
    from ver import __VER__ as CLI_VERSION
except ImportError:
    # Fallback to hardcoded version if ver.py is not available
    CLI_VERSION = "1.8.0"

# Debug configuration
# Besides general debug output, this flag gates several print_debug calls
# in VersionManager.
DEBUG = False  # Set to True to enable debug output, or use --debug flag

# Version information
# Raw ver.py on the main branch. VersionManager._check_latest_version downloads
# it and reads the latest version from its `__VER__ = ...` line.
UPDATE_CHECK_URL = "https://raw.githubusercontent.com/Ratio1/r1setup/refs/heads/main/mnl_factory/scripts/ver.py"
# Release assets are published to this repo by .github/workflows/release.yml.
# (Previously pointed at Ratio1/multi_node_launcher which 404s; the raw-main
# fallback in _perform_update masked the bug. Fixed in 1.8.0.)
# Per-release asset URLs are built as f"{DOWNLOAD_BASE_URL}/v{version}/{filename}".
DOWNLOAD_BASE_URL = "https://github.com/Ratio1/r1setup/releases/download"

# Whitelist of service template variables that can be overridden via Advanced → Customize Service
# Keys are the template variable names. Each value is display metadata with
# three string fields:
#   'description' - short human-readable label (also used by
#                   VersionManager._clear_overrides_on_update when it reports
#                   which overrides were cleared)
#   'default'     - human-readable hint about the default; several entries are
#                   placeholders or explanations rather than literal values
#   'example'     - a sample override value
CUSTOMIZABLE_VARS = {
    'mnl_docker_image_url': {
        'description': 'Docker image URL',
        'default': '<from group_vars>',
        'example': 'ratio1/edge_node:devnet',
    },
    'mnl_docker_gpus': {
        'description': 'GPU flags passed to docker run',
        'default': '--gpus all',
        'example': '--gpus "device=0"',
    },
    'mnl_port_forward': {
        'description': 'Port forwarding flags',
        'default': '(empty or -p 1883:1883 in dev mode)',
        'example': '-p 1883:1883',
    },
    'mnl_commented_restart': {
        'description': 'Auto-reboot toggle (empty = enabled, "#" = disabled)',
        'default': '(empty — reboot enabled)',
        'example': '#',
    },
    'mnl_docker_container_name': {
        'description': 'Docker container name',
        'default': 'edge_node',
        'example': 'edge_node_canary',
    },
    'mnl_docker_volume_path': {
        'description': 'Host volume path for persistent data',
        'default': '/var/cache/edge_node/_local_cache',
        'example': '/data/edge_node/_local_cache',
    },
}

# --- SSH configuration constants ---
# NOTE(review): the code that reads these is outside this part of the file;
# the descriptions below are taken from the names and values only.
SSH_SCHEMA_VERSION = 2
# String identifiers for the SSH authentication modes (one constant per mode).
SSH_AUTH_MODE_PASSWORD_ONLY = 'password_only'
SSH_AUTH_MODE_KEY_CONFIGURED_LEGACY = 'key_configured_legacy'
SSH_AUTH_MODE_KEY_INSTALLED_UNVERIFIED = 'key_installed_unverified'
SSH_AUTH_MODE_KEY_VERIFIED = 'key_verified'
SSH_AUTH_MODE_PASSWORD_DISABLED = 'password_disabled'
SSH_AUTH_MODE_VERIFICATION_FAILED = 'verification_failed'
# Field name under which a service file version is recorded, and the version
# string used as the default.
SERVICE_FILE_VERSION_FIELD = 'r1setup_service_file_version'
DEFAULT_SERVICE_FILE_VERSION = 'v0'
# All markers are lowercase; presumably values are lowercased before the
# membership test — TODO confirm against the caller.
UNKNOWN_SERVICE_FILE_VERSION_MARKERS = {'', 'unknown', 'not found', 'n/a', 'none', 'null'}

# Per-host install tracking fields (see docs/_todos/20260418T183123_r1setup_mixed_cpu_gpu_fleets.md).
# Two groups of field names: "last_install_*" and "last_attempted_*".
INSTALL_LAST_VARIANT_FIELD = 'r1setup_last_install_variant'
INSTALL_LAST_DRIVER_OWNER_FIELD = 'r1setup_last_install_driver_owner'
INSTALL_LAST_AT_FIELD = 'r1setup_last_install_at'
INSTALL_LAST_COLLECTION_VERSION_FIELD = 'r1setup_last_install_collection_version'
INSTALL_ATTEMPTED_VARIANT_FIELD = 'r1setup_last_attempted_variant'
INSTALL_ATTEMPTED_DRIVER_OWNER_FIELD = 'r1setup_last_attempted_driver_owner'
INSTALL_ATTEMPTED_AT_FIELD = 'r1setup_last_attempted_at'
INSTALL_ATTEMPTED_RESULT_FIELD = 'r1setup_last_attempted_result'
# Tuple collecting all eight tracking field names defined above.
INSTALL_TRACKING_FIELDS = (
    INSTALL_LAST_VARIANT_FIELD,
    INSTALL_LAST_DRIVER_OWNER_FIELD,
    INSTALL_LAST_AT_FIELD,
    INSTALL_LAST_COLLECTION_VERSION_FIELD,
    INSTALL_ATTEMPTED_VARIANT_FIELD,
    INSTALL_ATTEMPTED_DRIVER_OWNER_FIELD,
    INSTALL_ATTEMPTED_AT_FIELD,
    INSTALL_ATTEMPTED_RESULT_FIELD,
)
# --- Configuration / machine / runtime defaults ---
CONFIG_SCHEMA_VERSION = 1
DEFAULT_MACHINE_TOPOLOGY_MODE = 'standard'
DEFAULT_MACHINE_DEPLOYMENT_STATE = 'unknown'
# The runtime defaults below share the 'edge_node' name; the container name and
# volume path equal the 'default' hints listed in CUSTOMIZABLE_VARS.
DEFAULT_RUNTIME_SERVICE_NAME = 'edge_node'
DEFAULT_RUNTIME_CONTAINER_NAME = 'edge_node'
DEFAULT_RUNTIME_VOLUME_PATH = '/var/cache/edge_node/_local_cache'
DEFAULT_RUNTIME_EXIT_STATUS_PATH = '/tmp/edge_node.exit'
HELPER_MODE_STANDARD = 'standard_helpers'
HELPER_MODE_EXPERT = 'expert_dispatcher'
DEFAULT_HELPER_REGISTRY_DIR = '/var/lib/ratio1/r1setup/helpers'
# Node sizing thresholds: CPU in cores, memory in GiB (units per the names).
# NOTE(review): the tolerated memory value is below the recommended one,
# presumably to allow for machines reporting slightly under 16 GiB — confirm.
MIN_RECOMMENDED_NODE_CPU_CORES = 4
MIN_RECOMMENDED_NODE_MEMORY_GIB = 16.0
MIN_TOLERATED_NODE_MEMORY_GIB = 15.0

# Playbook paths (relative) that SSH key management needs.
SSH_KEY_MANAGEMENT_REQUIRED_PLAYBOOKS = (
    'playbooks/ssh_install_key.yml',
    'playbooks/ssh_add_extra_keys.yml',
    'playbooks/ssh_disable_password_auth.yml',
)

# Maps each required executable name to a human-readable label.
SSH_KEY_MANAGEMENT_REQUIRED_TOOLS = {
    'ssh': 'OpenSSH client',
    'ssh-keygen': 'OpenSSH key management',
    'openssl': 'OpenSSL',
}

# Environment names recognised by discovery.
DISCOVERY_KNOWN_ENVIRONMENTS = ('mainnet', 'testnet', 'devnet')


def _get_gpu_hosts(inventory: dict) -> dict:
    """Extract gpu_nodes hosts from an inventory dict."""
    return inventory.get('all', {}).get('children', {}).get('gpu_nodes', {}).get('hosts', {})


def _parse_iso_to_datetime(value) -> Optional[datetime]:
    """Parse ISO datetime string to datetime object. Returns None on failure."""
    if not value:
        return None
    try:
        if isinstance(value, str):
            return datetime.fromisoformat(value.replace('Z', '+00:00'))
        return datetime.fromtimestamp(value)
    except (ValueError, TypeError, OSError):
        return None


def _parse_iso_datetime(value, fmt: str = '%Y-%m-%d %H:%M') -> Optional[str]:
    """Format *value* with *fmt* after parsing it via ``_parse_iso_to_datetime``.

    Returns ``None`` when the value cannot be parsed.
    """
    parsed = _parse_iso_to_datetime(value)
    if not parsed:
        return None
    return parsed.strftime(fmt)


class VersionManager:
    """Handles CLI and Ansible collection version checking and updates.

    Accesses from self.app (R1Setup):
        - print_colored(), print_debug()
        - os_type
        - ansible_config_root
    """

    def __init__(self, app):
        self.app = app
        self._version_cache = {
            'collection_version': None,
            'collection_check_time': None,
            'cache_duration': 300  # 5 minutes cache
        }

    def _create_ssl_context(self) -> ssl.SSLContext:
        """Create SSL context for secure connections, with certifi support"""
        try:
            # Try to create a default SSL context
            context = ssl.create_default_context()

            # Try to use certifi for certificate verification (should be installed in venv)
            try:
                import certifi
                context.load_verify_locations(certifi.where())
                # If we reach here, certifi is working properly
                return context
            except ImportError:
                # Certifi not available, try alternative approaches
                if self.app.os_type == "macos":
                    try:
                        # Try to load system root certificates on macOS
                        context.load_default_certs()
                        return context
                    except Exception:
                        # If all else fails, disable certificate verification for GitHub
                        # This is safe since we're only downloading from GitHub releases
                        self.app.print_colored("Warning: Could not load system certificates, disabling SSL verification for updates", 'yellow')
                        context.check_hostname = False
                        context.verify_mode = ssl.CERT_NONE
                else:
                    # On Linux, the default context should work
                    try:
                        context.load_default_certs()
                        return context
                    except Exception:
                        self.app.print_colored("Warning: Could not load system certificates, disabling SSL verification for updates", 'yellow')
                        context.check_hostname = False
                        context.verify_mode = ssl.CERT_NONE

            return context

        except Exception as e:
            # Fallback: create unverified context
            self.app.print_colored(f"SSL context creation failed ({e}), using unverified context for updates", 'yellow')
            context = ssl.create_default_context()
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return context

    def _check_latest_version(self) -> Tuple[Optional[str], Optional[Dict[str, str]], Optional[Dict[str, str]]]:
        """Look up the latest published CLI version.

        Downloads the ``ver.py`` at ``UPDATE_CHECK_URL`` and reads the version
        from its ``__VER__ = '...'`` line.

        Returns:
            A 3-tuple ``(latest_version, download_urls, fallback_urls)``.
            ``download_urls`` maps ``'r1setup'``, ``'ver.py'`` and
            ``'update.py'`` to release-asset URLs for that version;
            ``fallback_urls`` maps the same names to the raw files on the
            main branch. On any failure (network, SSL, unparsable content)
            an error is printed and ``(None, None, None)`` is returned, so
            callers can always unpack three values.
        """
        try:
            req = urllib.request.Request(UPDATE_CHECK_URL)
            req.add_header('User-Agent', f'Ratio1-CLI/{CLI_VERSION}')

            # Create SSL context for secure connection
            ssl_context = self._create_ssl_context()

            with urllib.request.urlopen(req, timeout=10, context=ssl_context) as response:
                content = response.read().decode('utf-8')

            # Parse version from ver.py content
            latest_version = None
            for line in content.splitlines():
                line = line.strip()
                if line.startswith('__VER__') and '=' in line:
                    # Extract version from line like: __VER__ = '1.1.6'
                    # Drop any trailing inline comment before removing quotes.
                    version_part = line.split('=', 1)[1].split('#', 1)[0].strip()
                    latest_version = version_part.strip('\'"')
                    break

            if not latest_version:
                self.app.print_colored("Could not parse version from repository", 'red')
                return None, None, None

            # Construct download URLs for the latest version
            # First try release assets, fall back to raw content if needed
            download_urls = {
                'r1setup': f"{DOWNLOAD_BASE_URL}/v{latest_version}/r1setup",
                'ver.py': f"{DOWNLOAD_BASE_URL}/v{latest_version}/ver.py",
                'update.py': f"{DOWNLOAD_BASE_URL}/v{latest_version}/update.py"
            }

            # Fallback URLs using raw GitHub content
            fallback_urls = {
                'r1setup': "https://raw.githubusercontent.com/Ratio1/r1setup/refs/heads/main/mnl_factory/scripts/r1setup",
                'ver.py': "https://raw.githubusercontent.com/Ratio1/r1setup/refs/heads/main/mnl_factory/scripts/ver.py",
                'update.py': "https://raw.githubusercontent.com/Ratio1/r1setup/refs/heads/main/mnl_factory/scripts/update.py"
            }

            return latest_version, download_urls, fallback_urls

        except urllib.error.URLError as e:
            error_msg = str(e)
            if "CERTIFICATE_VERIFY_FAILED" in error_msg or "SSL" in error_msg:
                self.app.print_colored("SSL certificate verification failed.", 'red')
                self.app.print_colored("This is a common issue on macOS. Possible solutions:", 'yellow')
                self.app.print_colored("1. Install certificates: /Applications/Python\\ 3.x/Install\\ Certificates.command", 'white')
                self.app.print_colored("2. Install certifi: pip install certifi", 'white')
                self.app.print_colored("3. Update macOS and Python to latest versions", 'white')
            else:
                self.app.print_colored(f"Network error checking for updates: {e}", 'red')
            return None, None, None
        except Exception as e:
            self.app.print_colored(f"Error checking for updates: {e}", 'red')
            return None, None, None

    @staticmethod
    def _compare_versions(version1: str, version2: str) -> int:
        """Compare two version strings. Returns: 1 if v1 > v2, -1 if v1 < v2, 0 if equal"""
        def normalize_version(v):
            # Handle pre-release versions by splitting on '-' and taking first part
            v = v.split('-')[0]
            # Split version into parts and convert to integers
            parts = []
            for part in v.split('.'):
                try:
                    parts.append(int(part))
                except ValueError:
                    # If conversion fails, treat as 0
                    parts.append(0)
            return parts

        v1_parts = normalize_version(version1)
        v2_parts = normalize_version(version2)

        # Pad shorter version with zeros
        max_len = max(len(v1_parts), len(v2_parts))
        v1_parts.extend([0] * (max_len - len(v1_parts)))
        v2_parts.extend([0] * (max_len - len(v2_parts)))

        for i in range(max_len):
            if v1_parts[i] > v2_parts[i]:
                return 1
            elif v1_parts[i] < v2_parts[i]:
                return -1

        return 0

    def _download_update_file(self, url: str, destination: Path, ssl_context: ssl.SSLContext) -> bool:
        """Stream *url* into *destination*.

        Returns:
            True when a non-empty file was written, False when the response
            body was empty.

        Raises:
            urllib.error.URLError: (including ``HTTPError``) when the request fails.
        """
        req = urllib.request.Request(url)
        req.add_header('User-Agent', f'Ratio1-CLI/{CLI_VERSION}')

        with urllib.request.urlopen(req, timeout=30, context=ssl_context) as response:
            with open(destination, 'wb') as f:
                shutil.copyfileobj(response, f)

        return destination.exists() and destination.stat().st_size > 0

    def _perform_update(self, latest_version: str, download_urls: Dict[str, str], fallback_urls: Optional[Dict[str, str]] = None) -> bool:
        """Download and install the update.

        Each file in *download_urls* is fetched into a temporary directory;
        when a request fails and *fallback_urls* has an entry for the same
        name, that URL is tried next. ``r1setup`` is mandatory, the other
        files are optional. Downloaded files then replace the running script
        (``sys.argv[0]``) and its siblings ``ver.py`` / ``update.py``, and the
        new script is run once with ``--version`` as a smoke test.

        Args:
            latest_version: Version being installed (not used by the download logic).
            download_urls: Maps file name to its primary download URL.
            fallback_urls: Optional map of file name to an alternative URL.

        Returns:
            True when the files were installed (a failed smoke test only
            produces a warning), False when a mandatory download failed or
            an unexpected error occurred.
        """
        try:
            self.app.print_colored("Downloading update files...", 'yellow')

            # Get current script path and directory
            current_script = Path(sys.argv[0]).resolve()
            script_dir = current_script.parent

            # Create temporary directory for downloads
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                downloaded_files = {}

                # One SSL context serves every request of this update run.
                ssl_context = self._create_ssl_context()

                # Download r1setup, ver.py, and update.py
                for filename, url in download_urls.items():
                    self.app.print_colored(f"Downloading {filename}...", 'yellow')

                    temp_file = temp_path / filename
                    is_critical = filename == 'r1setup'

                    try:
                        if not self._download_update_file(url, temp_file, ssl_context):
                            self.app.print_colored(f"Download failed - {filename} is empty", 'red')
                            if is_critical:
                                return False  # r1setup is critical
                            continue  # Other files are optional

                        downloaded_files[filename] = temp_file
                        self.app.print_colored(f"✅ Downloaded {filename}", 'green')

                    except urllib.error.URLError as e:
                        # Try fallback URL if available
                        if fallback_urls and filename in fallback_urls:
                            self.app.print_colored(f"Trying fallback URL for {filename}...", 'yellow')
                            try:
                                if not self._download_update_file(fallback_urls[filename], temp_file, ssl_context):
                                    self.app.print_colored(f"Fallback download failed - {filename} is empty", 'red')
                                    if is_critical:
                                        return False  # r1setup is critical
                                    continue  # Other files are optional

                                downloaded_files[filename] = temp_file
                                self.app.print_colored(f"✅ Downloaded {filename} (fallback)", 'green')
                                continue  # Success with fallback, move to next file

                            except urllib.error.URLError as fallback_error:
                                self.app.print_colored(f"Fallback also failed for {filename}: {fallback_error}", 'red')

                        if is_critical:
                            # r1setup is critical, fail the update
                            self.app.print_colored(f"Failed to download critical file {filename}: {e}", 'red')
                            return False

                        # Other files are optional, continue without them
                        self.app.print_colored(f"Warning: Could not download {filename}: {e}", 'yellow')
                        continue

                # Install new files
                self.app.print_colored("Installing new version...", 'yellow')

                # Install r1setup
                if 'r1setup' in downloaded_files:
                    current_stat = current_script.stat()
                    shutil.move(str(downloaded_files['r1setup']), str(current_script))
                    current_script.chmod(current_stat.st_mode)  # Restore executable permissions
                    self.app.print_colored("✅ Installed new r1setup", 'green')

                # Install ver.py
                if 'ver.py' in downloaded_files:
                    ver_py_path = script_dir / 'ver.py'
                    shutil.move(str(downloaded_files['ver.py']), str(ver_py_path))
                    ver_py_path.chmod(0o644)  # Set readable permissions
                    self.app.print_colored("✅ Installed new ver.py", 'green')

                # Install update.py
                if 'update.py' in downloaded_files:
                    update_py_path = script_dir / 'update.py'
                    shutil.move(str(downloaded_files['update.py']), str(update_py_path))
                    update_py_path.chmod(0o755)  # Set executable permissions
                    self.app.print_colored("✅ Installed new update.py", 'green')

                # Verify the new script works. The files are already in place
                # at this point, so a slow or unlaunchable smoke test must not
                # be reported as a failed installation.
                self.app.print_colored("Validating installation...", 'yellow')
                try:
                    result = subprocess.run([str(current_script), '--version'],
                                            capture_output=True, text=True, timeout=5)
                    validated = result.returncode == 0
                except (subprocess.TimeoutExpired, OSError) as e:
                    self.app.print_debug(f"Validation command could not complete: {e}")
                    validated = False
                if not validated:
                    self.app.print_colored("Warning: New version validation failed, but files have been updated", 'yellow')

                self.app.print_colored("✅ Installation completed successfully", 'green')

                # Show information about the update script
                update_py_path = script_dir / 'update.py'
                if update_py_path.exists():
                    self.app.print_colored(f"\n💡 Update script available at: {update_py_path}", 'cyan')
                    self.app.print_colored("   You can run 'python update.py --help' for future update options", 'white')

                return True

        except Exception as e:
            self.app.print_colored(f"Update installation failed: {e}", 'red')
            return False

    def _read_installed_galaxy_version(self, collection_dir: Path) -> Optional[str]:
        """Return the ``version`` value from ``collection_dir/galaxy.yml``.

        Returns None when the file does not exist and ``'unknown'`` when it
        exists but has no ``version`` key (including an empty file). I/O and
        YAML errors propagate to the caller.
        """
        galaxy_yml = collection_dir / 'galaxy.yml'
        if not galaxy_yml.exists():
            return None
        with open(galaxy_yml, 'r') as f:
            galaxy_data = yaml.safe_load(f) or {}
        return galaxy_data.get('version', 'unknown')

    def _settle_collection_backup(self, collection_dir: Path, backup_root: Optional[Path],
                                  backup_dir: Optional[Path], updated: bool) -> None:
        """Restore the set-aside collection after a failed update, or discard it after a good one."""
        if backup_root is None:
            return

        if not updated and backup_dir is not None and backup_dir.exists():
            try:
                if collection_dir.exists():
                    # Drop whatever a failed install left behind.
                    shutil.rmtree(collection_dir)
                collection_dir.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(backup_dir), str(collection_dir))
                self.app.print_colored("  Previous collection restored after failed update", 'yellow')
            except Exception as e:
                # Keep the backup on disk so it can be recovered manually.
                self.app.print_colored(f"  Warning: Could not restore previous collection from {backup_dir}: {e}", 'yellow')
                return

        shutil.rmtree(backup_root, ignore_errors=True)

    def _update_ansible_collection(self) -> bool:
        """Update the Ansible collection to the latest version.

        The installed ``ratio1.multi_node_launcher`` collection is moved out
        of the way (so the install cannot reuse it), then reinstalled with
        ``ansible-galaxy collection install --force``. The result is checked
        on the filesystem and, as a secondary check, with
        ``ansible-galaxy collection list``. After a verified update the stale
        service overrides are cleared.

        If the update does not complete (``ansible-galaxy`` missing, timeout,
        non-zero exit status, collection directory absent afterwards), the
        previously installed collection is put back, so a failed update does
        not leave the machine without a collection.

        Returns:
            True when the collection directory exists after a successful
            install, False otherwise.
        """
        collection_dir = None
        backup_root = None   # temporary directory holding the previous collection
        backup_dir = None    # the previous collection inside backup_root
        updated = False
        try:
            # Use the same collection path as the setup scripts
            ansible_dir = self.app.ansible_config_root
            collections_path = ansible_dir / 'collections'
            collection_dir = collections_path / 'ansible_collections' / 'ratio1' / 'multi_node_launcher'

            # Ensure collections directory exists
            collections_path.mkdir(parents=True, exist_ok=True)

            # Set environment variables for the subprocess
            env = os.environ.copy()
            env['ANSIBLE_CONFIG'] = str(self.app.ansible_config_root / 'ansible.cfg')
            env['ANSIBLE_COLLECTIONS_PATH'] = str(collections_path)
            env['ANSIBLE_HOME'] = str(self.app.ansible_config_root)

            # Get current version if installed
            current_version = None
            if collection_dir.exists():
                try:
                    current_version = self._read_installed_galaxy_version(collection_dir)
                    if current_version is not None:
                        self.app.print_colored(f"  Current collection version: {current_version}", 'cyan')
                except Exception as e:
                    self.app.print_debug(f"Could not read current version: {e}")

            # The issue with ansible-galaxy is that --upgrade doesn't actually force update
            # So we need to first uninstall and then reinstall to get the latest version

            # Method 1: Take the existing collection out of the way, then reinstall.
            # It is moved into a temporary directory next to 'ansible_collections'
            # (same filesystem, outside the collection search tree) rather than
            # deleted, so it can be restored if the install fails.
            if collection_dir.exists():
                self.app.print_colored("  Removing existing collection to force update...", 'yellow')
                try:
                    backup_root = Path(tempfile.mkdtemp(prefix='.r1setup_collection_backup_',
                                                        dir=str(collections_path)))
                    backup_dir = backup_root / collection_dir.name
                    shutil.move(str(collection_dir), str(backup_dir))
                    self.app.print_colored("  Existing collection removed successfully", 'green')
                except Exception as e:
                    self.app.print_colored(f"  Warning: Could not remove existing collection: {e}", 'yellow')
                    backup_dir = None
                    # Continue anyway - maybe the install will work

            # Now install the latest version
            self.app.print_colored("  Installing latest collection from Ansible Galaxy...", 'yellow')

            cmd = [
                'ansible-galaxy', 'collection', 'install',
                'ratio1.multi_node_launcher',
                '--collections-path', str(collections_path),
                '--force'
            ]

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,  # 5 minute timeout
                env=env
            )

            if result.returncode == 0:
                self.app.print_colored("  Collection installation completed successfully", 'green')

                # Get the new version and verify the update
                if collection_dir.exists():
                    try:
                        new_version = self._read_installed_galaxy_version(collection_dir)
                        if new_version is not None:
                            self.app.print_colored(f"  Updated to collection version: {new_version}", 'cyan')

                            # Show version change if we had a previous version
                            if current_version and current_version != 'unknown':
                                if new_version != current_version:
                                    self.app.print_colored(f"  Version changed: {current_version} → {new_version}", 'green')
                                else:
                                    self.app.print_colored("  Collection was already at the latest version", 'green')
                    except Exception as e:
                        self.app.print_debug(f"Could not read new version: {e}")

                # Verify the collection exists and is functional
                verification_success = False

                # Method 1: Check filesystem directly
                if collection_dir.exists():
                    verification_success = True
                    self.app.print_colored("  Collection verified via filesystem check", 'cyan')

                # Method 2: Check with ansible-galaxy list as backup verification
                if verification_success:
                    try:
                        verify_cmd = [
                            'ansible-galaxy', 'collection', 'list',
                            'ratio1.multi_node_launcher',
                            '--collections-path', str(collections_path)
                        ]

                        verify_result = subprocess.run(
                            verify_cmd,
                            capture_output=True,
                            text=True,
                            timeout=30,
                            env=env
                        )

                        if DEBUG:
                            self.app.print_debug(f"Verification command output: {verify_result.stdout}")
                            self.app.print_debug(f"Verification command stderr: {verify_result.stderr}")

                        if verify_result.returncode == 0 and 'ratio1' in verify_result.stdout.lower():
                            self.app.print_colored("  Collection verified via ansible-galaxy list", 'cyan')
                        else:
                            self.app.print_colored("  Warning: ansible-galaxy list verification failed, but filesystem check passed", 'yellow')
                    except Exception as e:
                        self.app.print_debug(f"Verification command failed: {e}")

                # From here on the backup is no longer needed when verification passed.
                updated = verification_success

                # Clear stale service overrides after template files may have changed.
                # Skipped when the update did not take effect and the previous
                # collection (with its previous templates) is restored.
                if verification_success:
                    self._clear_overrides_on_update()

                return verification_success
            else:
                # Log the error details
                error_msg = result.stderr.strip() if result.stderr else "Unknown error"
                self.app.print_colored(f"  ansible-galaxy command failed: {error_msg}", 'red')

                # Show command output for debugging
                if DEBUG and result.stdout:
                    self.app.print_debug(f"Command output: {result.stdout}")

                # Check if it's a common issue and provide helpful hints
                if "timeout" in error_msg.lower() or "connection" in error_msg.lower():
                    self.app.print_colored("  This might be a network connectivity issue. Try again later.", 'yellow')
                elif "permission" in error_msg.lower():
                    self.app.print_colored("  This might be a permissions issue. Check directory permissions.", 'yellow')
                elif "not found" in error_msg.lower():
                    self.app.print_colored("  The collection might not exist on Galaxy. Check the collection name.", 'yellow')

                return False

        except subprocess.TimeoutExpired:
            self.app.print_colored("  Collection update timed out. This might be due to slow network.", 'yellow')
            return False
        except FileNotFoundError:
            self.app.print_colored("  ansible-galaxy command not found. Please ensure Ansible is installed.", 'yellow')
            return False
        except Exception as e:
            self.app.print_colored(f"  Unexpected error updating collection: {e}", 'red')
            if DEBUG:
                import traceback
                self.app.print_debug(f"Full traceback: {traceback.format_exc()}")
            return False
        finally:
            # Runs on every exit path: restore the previous collection when the
            # update did not complete, otherwise delete the backup.
            if collection_dir is not None:
                self._settle_collection_backup(collection_dir, backup_root, backup_dir, updated)

    def _clear_overrides_on_update(self) -> None:
        """Remove stale service overrides after an Ansible collection update.

        Templates may have changed, so old overrides could be invalid.
        """
        try:
            overrides = self.app._get_service_overrides()
            if not overrides:
                return

            cleared_vars = list(overrides.keys())
            self.app._save_service_overrides({})

            self.app.print_colored("\n⚠️  Service overrides cleared after collection update:", 'yellow', bold=True)
            for var in cleared_vars:
                desc = CUSTOMIZABLE_VARS.get(var, {}).get('description', var)
                self.app.print_colored(f"   • {desc} ({var}) = {overrides[var]}", 'yellow')
            self.app.print_colored("  Re-apply via Advanced Menu → Customize Service if needed.", 'yellow')
        except Exception as e:
            self.app.print_debug(f"Could not clear overrides on update: {e}")

    def _get_current_collection_version(self, force_refresh: bool = False) -> Optional[str]:
        """Get current collection version using multiple smart methods with caching"""
        # Check cache first (unless force refresh)
        if not force_refresh and self._version_cache['collection_version'] is not None:
            import time
            cache_time = self._version_cache.get('collection_check_time', 0)
            current_time = time.time()

            # Use cached version if within cache duration
            if current_time - cache_time < self._version_cache['cache_duration']:
                if DEBUG:
                    self.app.print_debug(f"Using cached collection version: {self._version_cache['collection_version']}")
                return self._version_cache['collection_version']

        current_version = None

        # Set environment variables for consistency
        env = os.environ.copy()
        env['ANSIBLE_CONFIG'] = str(self.app.ansible_config_root / 'ansible.cfg')
        env['ANSIBLE_COLLECTIONS_PATH'] = str(self.app.ansible_config_root / 'collections')
        env['ANSIBLE_HOME'] = str(self.app.ansible_config_root)

        # Method 1: ansible-galaxy collection list with specific collection (most targeted)
        try:
            cmd = [
                'ansible-galaxy', 'collection', 'list',
                'ratio1.multi_node_launcher',
                '--collections-path', str(self.app.ansible_config_root / 'collections')
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=15, env=env)

            if result.returncode == 0:
                # Parse output more carefully
                for line in result.stdout.split('\n'):
                    line = line.strip()
                    if 'ratio1.multi_node_launcher' in line:
                        parts = line.split()
                        if len(parts) >= 2:
                            current_version = parts[1]
                            if DEBUG:
                                self.app.print_debug(f"Method 1 (targeted list): Found version {current_version}")
                            return current_version
            elif DEBUG:
                self.app.print_debug(f"Method 1 failed: {result.stderr}")
        except Exception as e:
            if DEBUG:
                self.app.print_debug(f"Method 1 error: {e}")

        # Method 2: ansible-galaxy collection list (broad search)
        if not current_version:
            try:
                cmd = [
                    'ansible-galaxy', 'collection', 'list',
                    '--collections-path', str(self.app.ansible_config_root / 'collections')
                ]

                result = subprocess.run(cmd, capture_output=True, text=True, timeout=20, env=env)

                if result.returncode == 0:
                    for line in result.stdout.split('\n'):
                        if 'ratio1.multi_node_launcher' in line:
                            parts = line.split()
                            if len(parts) >= 2:
                                current_version = parts[1]
                                if DEBUG:
                                    self.app.print_debug(f"Method 2 (broad list): Found version {current_version}")
                                break
                elif DEBUG:
                    self.app.print_debug(f"Method 2 failed: {result.stderr}")
            except Exception as e:
                if DEBUG:
                    self.app.print_debug(f"Method 2 error: {e}")

        # Method 3: Read galaxy.yml directly (filesystem approach)
        if not current_version:
            try:
                collection_dir = self.app.ansible_config_root / 'collections' / 'ansible_collections' / 'ratio1' / 'multi_node_launcher'
                galaxy_yml = collection_dir / 'galaxy.yml'

                if galaxy_yml.exists():
                    with open(galaxy_yml, 'r') as f:
                        galaxy_data = yaml.safe_load(f)
                        current_version = galaxy_data.get('version')
                        if current_version and DEBUG:
                            self.app.print_debug(f"Method 3 (galaxy.yml): Found version {current_version}")
            except Exception as e:
                if DEBUG:
                    self.app.print_debug(f"Method 3 error: {e}")

        # Method 4: Read MANIFEST.json (alternative metadata file)
        if not current_version:
            try:
                collection_dir = self.app.ansible_config_root / 'collections' / 'ansible_collections' / 'ratio1' / 'multi_node_launcher'
                manifest_json = collection_dir / 'MANIFEST.json'

                if manifest_json.exists():
                    with open(manifest_json, 'r') as f:
                        manifest_data = json.load(f)
                        # MANIFEST.json has different structure
                        collection_info = manifest_data.get('collection_info', {})
                        current_version = collection_info.get('version')
                        if current_version and DEBUG:
                            self.app.print_debug(f"Method 4 (MANIFEST.json): Found version {current_version}")
            except Exception as e:
                if DEBUG:
                    self.app.print_debug(f"Method 4 error: {e}")

        # Method 5: Test if collection modules are importable (Python approach)
        if not current_version:
            try:
                # Try to use Python to check if collection is available
                python_check = f"""
import sys
sys.path.insert(0, '{self.app.ansible_config_root / "collections"}')
try:
    from ansible_collections.ratio1.multi_node_launcher import __version__
    print(__version__)
except Exception:
    try:
        import os
        galaxy_path = '{self.app.ansible_config_root / "collections" / "ansible_collections" / "ratio1" / "multi_node_launcher" / "galaxy.yml"}'
        if os.path.exists(galaxy_path):
            import yaml
            with open(galaxy_path) as f:
                data = yaml.safe_load(f)
                print(data.get('version', ''))
    except Exception:
        pass
"""
                result = subprocess.run([sys.executable, '-c', python_check],
                                      capture_output=True, text=True, timeout=10)

                if result.returncode == 0 and result.stdout.strip():
                    current_version = result.stdout.strip()
                    if DEBUG:
                        self.app.print_debug(f"Method 5 (Python import): Found version {current_version}")
            except Exception as e:
                if DEBUG:
                    self.app.print_debug(f"Method 5 error: {e}")

        # Method 6: Use ansible-doc to check collection availability (functional test)
        if not current_version:
            try:
                # Try to get documentation for a known module in the collection
                cmd = [
                    'ansible-doc', '--list',
                    '--type', 'module',
                    'ratio1.multi_node_launcher'
                ]

                result = subprocess.run(cmd, capture_output=True, text=True, timeout=15, env=env)

                if result.returncode == 0 and 'ratio1.multi_node_launcher' in result.stdout:
                    # Collection is available, try to get version from filesystem
                    collection_dir = self.app.ansible_config_root / 'collections' / 'ansible_collections' / 'ratio1' / 'multi_node_launcher'
                    if collection_dir.exists():
                        # If ansible-doc found it, collection exists, try galaxy.yml again
                        galaxy_yml = collection_dir / 'galaxy.yml'
                        if galaxy_yml.exists():
                            with open(galaxy_yml, 'r') as f:
                                galaxy_data = yaml.safe_load(f)
                                current_version = galaxy_data.get('version', 'detected')
                                if DEBUG:
                                    self.app.print_debug(f"Method 6 (ansible-doc + filesystem): Found version {current_version}")
            except Exception as e:
                if DEBUG:
                    self.app.print_debug(f"Method 6 error: {e}")

        # Clean up version string
        if current_version:
            current_version = str(current_version).strip('\'"')

        # Cache the result
        import time
        self._version_cache['collection_version'] = current_version
        self._version_cache['collection_check_time'] = time.time()

        if DEBUG:
            self.app.print_debug(f"Final current version result: {current_version}")

        return current_version

    def _check_ansible_collection_version(self) -> Tuple[Optional[str], Optional[str], bool]:
        """Report the installed and latest collection versions and whether to update.

        The installed version comes from ``_get_current_collection_version()``.
        The latest version is requested from the Galaxy v3 endpoint; when that
        request fails for any reason, the older v1 endpoint is tried instead.

        Returns:
            ``(current_version, latest_version, update_available)``. Either
            version is ``None`` when it could not be determined. If an
            unexpected error escapes the individual steps, a warning is printed
            and ``(None, None, False)`` is returned.
        """
        try:
            # Use the smart method to get current version
            current_version = self._get_current_collection_version()
            latest_version = None

            # Build the SSL context up front so BOTH requests can use it. If it
            # cannot be built, None makes urlopen use its default HTTPS handling
            # instead of leaving the fallback request with an unbound name.
            try:
                ssl_context = self._create_ssl_context()
            except Exception as e:
                ssl_context = None
                if DEBUG:
                    self.app.print_debug(f"Could not create SSL context, using urllib default: {e}")

            def fetch_json(api_url):
                """GET ``api_url`` and return the decoded JSON payload."""
                req = urllib.request.Request(api_url)
                req.add_header('User-Agent', f'Ratio1-CLI/{CLI_VERSION}')
                with urllib.request.urlopen(req, timeout=10, context=ssl_context) as response:
                    return json.loads(response.read().decode('utf-8'))

            # Check latest available version from Galaxy API
            try:
                data = fetch_json(
                    "https://galaxy.ansible.com/api/v3/plugin/ansible/content/published/collections/index/ratio1/multi_node_launcher/"
                )
                # The API response has highest_version.version structure
                highest_version = data.get('highest_version', {})
                latest_version = highest_version.get('version')
                if DEBUG:
                    self.app.print_debug(f"Found latest version via Galaxy API v3: {latest_version}")
            except Exception as e:
                if DEBUG:
                    self.app.print_debug(f"Galaxy API v3 query failed: {e}")

                # Fallback: Try the older API format (keep as backup)
                try:
                    data = fetch_json("https://galaxy.ansible.com/api/v1/collections/ratio1/multi_node_launcher/")
                    # Try different possible fields for v1 API
                    latest_version = (data.get('latest_version') or
                                      data.get('version') or
                                      data.get('current_version'))
                    if DEBUG:
                        self.app.print_debug(f"Found latest version via fallback API v1: {latest_version}")
                except Exception as e2:
                    if DEBUG:
                        self.app.print_debug(f"Fallback API v1 also failed: {e2}")
                    # If all else fails, we can't determine the latest version
                    latest_version = None

            # The payload may carry the version as a nested object rather than a
            # plain string; normalise so the string handling below cannot fail
            # with AttributeError and throw away the known installed version.
            if isinstance(latest_version, dict):
                latest_version = latest_version.get('version')
            if latest_version is not None:
                latest_version = str(latest_version)
            if current_version is not None:
                current_version = str(current_version)

            # Compare versions if we have both
            update_available = False
            if current_version and latest_version:
                try:
                    # Remove any extra quotes or whitespace
                    current_version = current_version.strip('\'"')
                    latest_version = latest_version.strip('\'"')

                    # Simple numeric dotted-version comparison
                    current_parts = [int(x) for x in current_version.split('.')]
                    latest_parts = [int(x) for x in latest_version.split('.')]

                    # Pad shorter version with zeros so 1.2 == 1.2.0
                    max_len = max(len(current_parts), len(latest_parts))
                    current_parts.extend([0] * (max_len - len(current_parts)))
                    latest_parts.extend([0] * (max_len - len(latest_parts)))

                    # Equal-length integer lists compare component by component
                    update_available = latest_parts > current_parts

                    if DEBUG:
                        self.app.print_debug(f"Version comparison: current={current_version}, latest={latest_version}, update_available={update_available}")

                except (ValueError, IndexError) as e:
                    if DEBUG:
                        self.app.print_debug(f"Version comparison failed: {e}")
                    # If we can't compare, assume update might be available
                    update_available = True
            elif current_version and not latest_version:
                # We have current but not latest - assume update might be available
                update_available = True
                if DEBUG:
                    self.app.print_debug("Could not determine latest version, assuming update available")
            elif not current_version and latest_version:
                # No current version means collection is not installed
                update_available = True
                if DEBUG:
                    self.app.print_debug("Collection not installed, update available")
            else:
                # Neither version available - can't determine
                update_available = False
                if DEBUG:
                    self.app.print_debug("Could not determine any versions")

            return current_version, latest_version, update_available

        except Exception as e:
            self.app.print_colored(f"Error checking Ansible collection version: {e}", 'yellow')
            if DEBUG:
                import traceback
                self.app.print_debug(f"Full traceback: {traceback.format_exc()}")
            return None, None, False

    def _auto_update_check(self) -> None:
        """Startup hook: look for newer CLI and collection releases and install them.

        Does nothing when the ``R1SETUP_SKIP_AUTO_UPDATE`` environment variable
        is ``1``. A successful CLI update re-executes the current interpreter
        with the original arguments, so the remainder of this method only runs
        when no CLI update was applied. A successful collection update ends by
        waiting for the operator to press Enter.
        """
        skip_flag = str(os.environ.get("R1SETUP_SKIP_AUTO_UPDATE", "")).strip()
        if skip_flag == "1":
            if DEBUG:
                self.app.print_debug("Skipping auto-update check because R1SETUP_SKIP_AUTO_UPDATE=1")
            return

        print("  Checking for updates...", end='\r')

        # --- CLI release lookup (failures are reported, never fatal) ---
        newest_cli = None
        primary_urls = None
        backup_urls = None
        cli_outdated = False
        try:
            newest_cli, primary_urls, backup_urls = self._check_latest_version()
            if newest_cli:
                cli_outdated = self._compare_versions(CLI_VERSION, newest_cli) < 0
        except Exception as exc:
            self.app.print_colored(f"Could not check for CLI updates: {exc}", 'yellow')

        # --- Collection release lookup ---
        installed_coll, newest_coll, coll_outdated = self._check_ansible_collection_version()

        if DEBUG:
            self.app.print_debug(f"Auto-update check: current_collection_version={installed_coll}, latest_collection_version={newest_coll}, collection_update_available={coll_outdated}")

        # Overwrite the transient "Checking for updates..." line
        print("                              ", end='\r')

        # Announce only what is actually outdated
        if cli_outdated:
            self.app.print_colored(f"\U0001f195 CLI update available: {CLI_VERSION} \u2192 {newest_cli}", 'green')
        if coll_outdated:
            self.app.print_colored("\U0001f195 Ansible collection update available", 'green')

        # --- CLI self-update ---
        if cli_outdated and newest_cli and primary_urls:
            self.app.print_colored(f"\n🚀 Auto-updating CLI to version {newest_cli}...", 'cyan', bold=True)
            if self._perform_update(newest_cli, primary_urls, backup_urls):
                self.app.print_colored(f"✅ Successfully updated CLI to version {newest_cli}!", 'green')
                self.app.print_colored("Restarting with the new version...", 'cyan')
                # Leave the message on screen briefly before the process is replaced
                import time
                time.sleep(2)
                os.execv(sys.executable, [sys.executable] + sys.argv)
            else:
                self.app.print_colored("❌ CLI auto-update failed. Continuing with current version.", 'red')

        # --- Collection update ---
        if not coll_outdated:
            return

        self.app.print_colored("\n🚀 Auto-updating Ansible collection...", 'cyan', bold=True)
        if installed_coll:
            self.app.print_colored(f"  Current version: {installed_coll}", 'cyan')
        if newest_coll:
            self.app.print_colored(f"  Updating to: {newest_coll}", 'cyan')

        if not self._update_ansible_collection():
            self.app.print_colored("❌ Ansible collection auto-update failed.", 'red')
            return

        self.app.print_colored("✅ Ansible collection updated successfully!", 'green')
        self.app.print_colored("\n\u2705 Auto-update completed successfully!", 'green')
        self.app.wait_for_enter()


class SettingsManager:
    """Manages global user preferences stored in ~/.ratio1/r1_setup/settings.json.

    Separate from active_config.json (per-configuration metadata).
    Settings are global user preferences that persist across config switches.
    """

    DEFAULT_SETTINGS = {
        'show_node_status': False,
        'status_refresh_cooldown': 60,  # seconds
        'connection_timeout': 30,  # seconds (30-600); base timeout for node-facing operations
    }

    def __init__(self, app):
        """Bind to ``app`` and start from a copy of the defaults.

        Nothing is read from disk here; call ``load_settings()`` for that.
        """
        self.app = app
        self.settings_file = app.r1_setup_dir / 'settings.json'
        self.settings = dict(self.DEFAULT_SETTINGS)
        self._last_status_refresh = None  # in-memory only; resets on CLI restart

    def load_settings(self):
        """Load settings from disk, merging with defaults for forward-compat.

        A missing file leaves the current in-memory settings untouched. An
        unreadable file, invalid JSON, or JSON whose top level is not an
        object resets the settings to the defaults.
        """
        if not self.settings_file.exists():
            self.app.print_debug("No settings.json found, using defaults")
            return

        try:
            with open(self.settings_file, 'r') as f:
                loaded = json.load(f)
            # Valid JSON is not necessarily an object ("[]", "42", ...); merging
            # anything else into the defaults would raise an uncaught TypeError.
            if not isinstance(loaded, dict):
                raise ValueError(f"expected a JSON object, got {type(loaded).__name__}")
            self.settings = {**self.DEFAULT_SETTINGS, **loaded}
            self.app.print_debug(f"Settings loaded from {self.settings_file}")
        except (ValueError, OSError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses
            self.app.print_debug(f"Warning: corrupt settings.json, using defaults: {e}")
            self.settings = dict(self.DEFAULT_SETTINGS)

    def save_settings(self):
        """Persist current settings to disk; failures are only logged via debug."""
        try:
            # Serialize before opening the file so a serialization error cannot
            # leave a truncated settings.json behind.
            payload = json.dumps(self.settings, indent=2)
            self.settings_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.settings_file, 'w') as f:
                f.write(payload)
            self.app.print_debug(f"Settings saved to {self.settings_file}")
        except (OSError, TypeError, ValueError) as e:
            self.app.print_debug(f"Warning: could not save settings: {e}")

    def get(self, key):
        """Return the stored value for ``key``, else its default, else ``None``."""
        return self.settings.get(key, self.DEFAULT_SETTINGS.get(key))

    def set(self, key, value):
        """Store ``value`` under ``key`` and immediately persist all settings."""
        self.settings[key] = value
        self.save_settings()

    def should_refresh_status(self):
        """Check if enough time has elapsed since last status probe.

        Returns ``False`` when the status display is disabled and ``True`` when
        no probe has been recorded yet in this process.
        """
        if not self.get('show_node_status'):
            return False
        if self._last_status_refresh is None:
            return True
        import time
        # settings.json is hand-editable: tolerate a non-numeric cooldown
        try:
            cooldown = float(self.get('status_refresh_cooldown'))
        except (TypeError, ValueError):
            cooldown = self.DEFAULT_SETTINGS['status_refresh_cooldown']
        elapsed = time.time() - self._last_status_refresh
        return elapsed >= cooldown

    def mark_status_refreshed(self):
        """Record that a status probe just completed."""
        import time
        self._last_status_refresh = time.time()

    @property
    def connection_timeout(self) -> int:
        """Base timeout (seconds) for node-facing operations. Clamped to [30, 600]."""
        val = self.get('connection_timeout')
        try:
            val = int(val)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float('inf')), which JSON "Infinity" decodes to
            val = self.DEFAULT_SETTINGS['connection_timeout']
        return max(30, min(600, val))

    @property
    def ssh_connect_timeout(self) -> int:
        """SSH ConnectTimeout derived from connection_timeout. Floor of 10s."""
        return max(10, self.connection_timeout // 3)

    def settings_menu(self):
        """Interactive toggle UI for user preferences.

        Loops until the operator picks ``0``. Option 1 flips the node-status
        display; option 2 prompts for a new connection timeout and stores it
        only when it is an integer within 30-600.
        """
        while True:
            self.app.print_header("Settings")

            current_status = "ON" if self.get('show_node_status') else "OFF"
            cooldown = self.get('status_refresh_cooldown')

            self.app.print_section("Preferences")
            self.app.print_colored(
                f"  1) Show Node Status on Main Menu: [{current_status}]", 'white'
            )
            self.app.print_colored(
                f"     When ON, auto-checks live node statuses on main menu ({cooldown}s cooldown)",
                'white'
            )
            print()
            self.app.print_colored(
                f"  2) Connection Timeout: [{self.connection_timeout}s]", 'white'
            )
            self.app.print_colored(
                "     Base timeout for SSH/playbook operations on remote nodes (30-600s)",
                'white'
            )
            print()
            self.app.print_colored("  0) Back", 'white')
            print()

            choice = self.app.get_input("Select option", "0")

            if choice == '0':
                break
            elif choice == '1':
                new_value = not self.get('show_node_status')
                self.set('show_node_status', new_value)
                state = "ON" if new_value else "OFF"
                self.app.print_colored(f"Live node status display: {state}", 'green')
                self.app.wait_for_enter()
            elif choice == '2':
                val = self.app.get_input(
                    f"Enter connection timeout in seconds (30-600, current: {self.connection_timeout}s)",
                    str(self.connection_timeout)
                )
                try:
                    val = int(val)
                    if 30 <= val <= 600:
                        self.set('connection_timeout', val)
                        self.app.print_colored(f"Connection timeout set to {val}s (SSH connect timeout: {self.ssh_connect_timeout}s)", 'green')
                    else:
                        self.app.print_colored("Invalid value. Must be between 30 and 600 seconds.", 'red')
                except ValueError:
                    self.app.print_colored("Invalid input. Please enter a number.", 'red')
                self.app.wait_for_enter()
            else:
                self.app.print_colored("Invalid option. Valid choices are 0-2.", 'red')
                self.app.wait_for_enter()


class ConfigurationManager:
    """Handles all configuration persistence: load/save/switch/export/import.

    Accesses from self.app (R1Setup):
        - active_config_file, configs_dir, config_file, config_dir, vars_file (paths)
        - inventory (read/write)
        - real_user
        - print_colored(), print_debug(), print_section(), print_header()
        - get_input()
        - check_hosts_config()
        - _select_network_environment(), _get_valid_hostname(), _configure_single_node()
        - set_mnl_app_env()
    """

    def __init__(self, app):
        """Attach to ``app`` and start with default fleet state and blank metadata.

        Nothing is read from disk here; ``_load_active_config()`` does that.
        """
        self.app = app
        self.fleet_state = self._default_fleet_state()
        # Metadata of a configuration that has never been named or deployed
        blank_metadata = dict(
            config_name=None,
            environment=None,
            created_at=None,
            nodes_count=0,
            config_schema_version=CONFIG_SCHEMA_VERSION,
            last_deployed_date=None,
            last_deployed_network=None,
            deployment_status='never_deployed',
            last_deleted_date=None,
            last_written_cli_version=None,
            last_written_collection_version=None,
            last_written_at=None,
        )
        self.active_config = blank_metadata

    def _load_active_config(self) -> None:
        """Load the active configuration settings.

        ``self.active_config`` is first reset to defaults and then overlaid
        with the contents of ``self.app.active_config_file``; ``self.fleet_state``
        is rebuilt from the file's ``fleet_state`` entry. When the file is
        missing, or cannot be read as a JSON object, the fleet state falls back
        to the default so that it never carries data from a previously loaded
        configuration. Load problems are reported as a warning, not raised.
        """
        self.app.print_debug(f"Loading active config from: {self.app.active_config_file}")

        self.active_config = {
            'config_name': None,
            'environment': None,
            'created_at': None,
            'nodes_count': 0,
            'config_schema_version': CONFIG_SCHEMA_VERSION,
            'last_deployed_date': None,
            'last_deployed_network': None,
            'deployment_status': 'never_deployed',
            'last_deleted_date': None,
            'last_written_cli_version': None,
            'last_written_collection_version': None,
            'last_written_at': None,
        }

        if not self.app.active_config_file.exists():
            self.app.print_debug(f"Active config file does not exist: {self.app.active_config_file}")
            self.fleet_state = self._default_fleet_state()
            return

        fleet_loaded = False
        try:
            with open(self.app.active_config_file) as f:
                loaded_config = json.load(f)
            # Valid JSON that is not an object cannot be a config; fail with a
            # clear message rather than an obscure dict.update() error.
            if not isinstance(loaded_config, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(loaded_config).__name__}"
                )
            self.app.print_debug(f"Loaded active config from file: {loaded_config}")
            self.active_config.update(loaded_config)
            self.fleet_state = self._normalize_fleet_state(loaded_config.get('fleet_state'))
            fleet_loaded = True
            self.app.print_debug(f"Final active config after update: {self.active_config}")

            # Ensure network environment is synchronized after loading active config
            env = self.active_config.get('environment')
            if env:
                self.set_mnl_app_env(env)
                self.app.print_debug(f"Network environment synchronized after loading active config: {env}")
        except Exception as e:
            self.app.print_colored(f"Warning: Could not load active config: {e}", 'yellow')
            self.app.print_debug(f"Exception loading active config: {e}")
            if not fleet_loaded:
                # Keep fleet state consistent with the reset metadata above
                self.fleet_state = self._default_fleet_state()

    def _save_active_config(self) -> None:
        """Save the active configuration settings.

        Stamps the current schema version and a deep copy of the fleet state
        into ``self.active_config`` and writes it as indented JSON to
        ``self.app.active_config_file``. Errors are printed, not raised.
        """
        try:
            self.active_config['config_schema_version'] = CONFIG_SCHEMA_VERSION
            self.active_config['fleet_state'] = copy.deepcopy(self.fleet_state)
            self.app.print_debug(f"Saving active config to: {self.app.active_config_file}")
            self.app.print_debug(f"Active config being saved: {self.active_config}")
            # Serialize first: opening with 'w' truncates the file, so a value
            # that cannot be serialized must be detected before that happens.
            payload = json.dumps(self.active_config, indent=2)
            with open(self.app.active_config_file, 'w') as f:
                f.write(payload)
            self.app.print_debug("Successfully saved active config")
        except Exception as e:
            self.app.print_colored(f"Error saving active config: {e}", 'red')
            self.app.print_debug(f"Exception saving active config: {e}")

    # -- Shared config-creation primitives (Phase 1) --

    def _prompt_new_config_name(self) -> str:
        """Prompt the operator for a valid configuration base name."""
        self.app.print_colored("\n📝 Configuration Naming", 'cyan', bold=True)
        self.app.print_colored("Give your configuration a meaningful name to identify it later.", 'yellow')
        self.app.print_colored("Examples: 'production-cluster', 'test-env', 'gpu-farm-1'", 'white')
        while True:
            custom_name = self.app.get_input("Enter configuration name (letters, numbers, -, _)", required=True)
            if re.match(r'^[a-zA-Z0-9_-]+$', custom_name):
                return custom_name
            self.app.print_colored("Invalid name. Use only letters, numbers, hyphens (-), and underscores (_)", 'red')

    def _prompt_new_config_environment(self) -> str:
        """Prompt for network environment and persist the selection."""
        env = self.app._select_network_environment()
        self.set_mnl_app_env(env)
        return env

    def _reset_inventory_for_new_config(self) -> None:
        """Reset inventory to an empty gpu_nodes structure."""
        self.app.inventory = {
            'all': {
                'vars': {},
                'children': {
                    'gpu_nodes': {
                        'hosts': {}
                    }
                }
            }
        }

    # -- Machine-first config-creation primitives --

    def _select_configuration_mode(self) -> str:
        """Prompt the operator for simple or advanced configuration mode.

        Returns ``'simple'`` or ``'advanced'``.
        """
        self.app.print_colored("\nConfiguration Mode:", 'cyan', bold=True)
        self.app.print_colored("  Simple   — One edge node instance per machine (recommended)", 'white')
        self.app.print_colored("  Advanced — Multiple edge node instances per machine", 'white')
        choice = self.app.get_input(
            "\nType 'advanced' to enable advanced mode, or press Enter for simple", "",
        ).strip().lower()
        if choice == 'advanced':
            return 'advanced'
        return 'simple'

    def _prompt_machine_count(self) -> int:
        """Prompt the operator for the number of machines to register."""
        while True:
            try:
                num_machines = int(self.app.get_input("How many machines do you want to register", "1"))
                if num_machines <= 0:
                    self.app.print_colored("Please enter a positive number", 'red')
                    continue
                return num_machines
            except ValueError:
                self.app.print_colored("Please enter a valid number", 'red')

    def _collect_machine_registration_entries(self, num_machines: int) -> List[str]:
        """Register machines interactively. Returns list of registered machine_ids."""
        registered_ids = []
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        existing_ids = set(fleet_state.get('fleet', {}).get('machines', {}).keys())

        for i in range(num_machines):
            self.app.print_section(f"Registering Machine {i + 1} of {num_machines}")

            # Machine label
            default_label = f"machine-{i + 1}"
            while True:
                label = self.app.get_input(
                    f"Enter label for machine {i + 1}", default_label, required=True,
                ).strip()
                if not re.match(r'^[a-zA-Z0-9_-]+$', label):
                    self.app.print_colored(
                        "Invalid label. Use only letters, numbers, hyphens (-), and underscores (_).", 'red',
                    )
                    continue
                if label in existing_ids or label in registered_ids:
                    self.app.print_colored(
                        f"Machine '{label}' already registered. Choose a different label.", 'red',
                    )
                    continue
                break

            # SSH details
            node_config = self.app._configure_single_node()
            machine_access = self.app._extract_machine_access_config(node_config)

            # Build and persist machine record (no specs yet)
            machine_data = {
                'topology_mode': 'standard',
                'deployment_state': 'empty',
                'instance_names': [],
                **machine_access,
            }

            self.upsert_machine_record(label, machine_data)
            registered_ids.append(label)
            self.app.print_colored(f"\u2713 Machine '{label}' registered.", 'green')

        # Batch probe specs for all registered machines in parallel
        if registered_ids:
            probe_choice = self.app.get_input(
                f"Probe specs for all {len(registered_ids)} machine(s)? (Y/n)", "Y",
            )
            if probe_choice.lower() != 'n':
                self._batch_probe_machine_specs(registered_ids)

        return registered_ids

    def _batch_probe_machine_specs(self, machine_ids: List[str]) -> Dict[str, Dict[str, Any]]:
        """Probe specs for multiple machines in parallel. Returns {mid: probe_result}."""
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        results: Dict[str, Dict[str, Any]] = {}

        tasks: Dict[str, Dict[str, Any]] = {}
        for mid in machine_ids:
            record = fleet_state['fleet']['machines'].get(mid)
            if record:
                tasks[mid] = record

        if not tasks:
            return results

        self.app.print_colored(f"  Probing specs for {len(tasks)} machine(s) in parallel...", 'cyan')

        def _probe_one(mid: str, record: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
            return mid, self.app._probe_machine_specs(record)

        with ThreadPoolExecutor(max_workers=min(len(tasks), 10)) as pool:
            futures = {pool.submit(_probe_one, mid, rec): mid for mid, rec in tasks.items()}
            for future in as_completed(futures):
                mid, probe_result = future.result()
                results[mid] = probe_result
                if probe_result.get('status') == 'success':
                    machine_specs = {
                        'hostname': probe_result['hostname'],
                        'cpu_total': probe_result['cpu_total'],
                        'memory_gb_total': probe_result['memory_gb_total'],
                        'last_checked_at': probe_result['last_checked_at'],
                    }
                    self.upsert_machine_record(mid, {'machine_specs': machine_specs})
                    self.app.print_colored(
                        f"    {mid}: \u2713 {probe_result['hostname']}: {probe_result['cpu_total']} CPU, "
                        f"{probe_result['memory_gb_total']:.1f} GiB RAM", 'green',
                    )
                    recommendation = self.assess_machine_resource_recommendation(machine_specs)
                    if recommendation.get('status') != 'meets_recommendation':
                        self.app.print_colored(f"    {mid}: {recommendation['message']}", recommendation['color'])
                else:
                    self.app.print_colored(
                        f"    {mid}: \u26a0 {probe_result.get('message', 'unknown error')}", 'yellow',
                    )

        return results

    def _collect_advanced_instance_counts(self, registered_ids: List[str]) -> Dict[str, int]:
        """For advanced mode, ask for desired instance count per machine.

        Uses spec probe results to compute recommended max.
        Returns ``{machine_id: desired_count}``.
        """
        counts: Dict[str, int] = {}
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        for mid in registered_ids:
            machine = fleet_state['fleet']['machines'].get(mid, {})
            specs = machine.get('machine_specs') or {}
            cpu = specs.get('cpu_total')
            mem = specs.get('memory_gb_total')

            max_recommended = 1
            specs_available = False
            if cpu is not None and mem is not None:
                try:
                    max_recommended = max(1, min(
                        int(cpu) // MIN_RECOMMENDED_NODE_CPU_CORES,
                        int(float(mem) // MIN_RECOMMENDED_NODE_MEMORY_GIB),
                    ))
                    specs_available = True
                except (TypeError, ValueError):
                    pass

            if specs_available:
                specs_summary = self._format_machine_specs_summary(specs)
                self.app.print_colored(
                    f"\nMachine '{mid}': {specs_summary}", 'cyan',
                )
                self.app.print_colored(
                    f"  Recommended max: {max_recommended} instance(s) "
                    f"({MIN_RECOMMENDED_NODE_CPU_CORES} CPU / {MIN_RECOMMENDED_NODE_MEMORY_GIB:.0f} GiB each)",
                    'cyan',
                )
            else:
                self.app.print_colored(
                    f"\nMachine '{mid}': specs unavailable — enter desired count manually.", 'yellow',
                )

            while True:
                try:
                    default = str(max_recommended) if specs_available else "1"
                    count = int(self.app.get_input(
                        f"How many instances on '{mid}'", default,
                    ))
                    if count < 1:
                        self.app.print_colored("Must be at least 1.", 'red')
                        continue
                    if count > max_recommended and specs_available:
                        confirm = self.app.get_input(
                            f"  This exceeds the recommended max ({max_recommended}). Continue? (y/n)", "n",
                        )
                        if confirm.lower() != 'y':
                            continue
                    counts[mid] = count
                    break
                except ValueError:
                    self.app.print_colored("Please enter a valid number.", 'red')
        return counts

    def _create_machine_first_configuration(self) -> None:
        """Run the interactive machine-first onboarding flow.

        Prompts for the configuration name, environment, mode and machine
        count, ensures the configuration shell, registers the machines, then
        offers batch discovery/import and fresh-instance creation on machines
        that scanned clean.  Finishes with a summary and, when fresh instances
        were created, an optional jump to the deployment menu.
        """
        self.app.print_section("Create New Configuration")

        requested_name = self._prompt_new_config_name()
        env = self._prompt_new_config_environment()
        mode = self._select_configuration_mode()
        machine_total = self._prompt_machine_count()
        config_name = self._generate_config_name(machine_total, requested_name, unit='m')

        self._reset_inventory_for_new_config()

        # The config shell has to be saved before upsert_machine_record is used.
        self.ensure_configuration_shell(config_name, env)

        machine_ids = self._collect_machine_registration_entries(machine_total)

        # Only advanced mode asks for per-machine instance counts.
        planned_counts: Dict[str, int] = (
            self._collect_advanced_instance_counts(machine_ids) if mode == 'advanced' else {}
        )

        discovery = self._onboarding_batch_discovery_and_import(
            machine_ids, env, session_mode=mode,
        )

        # Discovery can switch the session to advanced; the gap fill must see that.
        mode = discovery.get('session_mode', mode)

        fresh_count = self._onboarding_gap_fill_clean_machines(
            discovery.get('clean_machine_ids', []), env,
            config_mode=mode,
            desired_counts=planned_counts,
        )

        imported = discovery.get('imported_total', 0)
        total_instances = imported + fresh_count

        # Final summary
        self.app.print_section("Configuration Summary")
        self.app.print_colored(f"Configuration '{config_name}' created and activated.", 'green')
        if total_instances > 0:
            breakdown = []
            if imported > 0:
                breakdown.append(f"{imported} imported")
            if fresh_count > 0:
                breakdown.append(f"{fresh_count} new")
            summary_line = (
                f"  {len(machine_ids)} machine(s), {total_instances} instance(s) ({', '.join(breakdown)})."
            )
        else:
            summary_line = f"  {len(machine_ids)} machine(s) registered, 0 instances."
        self.app.print_colored(summary_line, 'cyan')

        # The deploy prompt is only shown when something new was created (default: no).
        if fresh_count > 0:
            wants_deploy = self.app.get_input("Would you like to deploy now? (y/n)", "n").lower() == 'y'
            if wants_deploy:
                self.app.wait_for_enter("Press Enter to continue to deployment...")
                self.app.deployment_menu()
                return

        if total_instances == 0:
            self.app.print_colored(
                "\n\U0001f4a1 Use 'Discover Services' to scan for existing edge_node services,\n"
                "   or 'Fleet Summary' to review your registered machines.",
                'cyan',
            )
        self.app.wait_for_enter()

    # -- Batch discovery primitives (Phase 3) --

    def _batch_discover_machines(self, machine_ids: List[str]) -> Dict[str, Dict[str, Any]]:
        """Scan registered machines for existing edge_node services in parallel.

        Returns ``{machine_id: {status, candidates, error}}`` without persisting.
        """
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        scan_buffer: Dict[str, Dict[str, Any]] = {}

        # Separate valid machines from unknown ids
        tasks: Dict[str, Dict[str, Any]] = {}
        for mid in machine_ids:
            machine_record = fleet_state['fleet']['machines'].get(mid)
            if not machine_record:
                scan_buffer[mid] = {'status': 'skipped', 'candidates': [], 'error': 'not registered'}
            else:
                tasks[mid] = machine_record

        if not tasks:
            return scan_buffer

        self.app.print_colored(f"  Scanning {len(tasks)} machine(s) in parallel...", 'cyan')

        def _scan_one(mid: str, record: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
            try:
                result = self.app.discover_existing_edge_node_services(record)
                if result.get('status') == 'success':
                    return mid, {'status': 'success', 'candidates': result.get('candidates', []), 'error': None}
                else:
                    return mid, {'status': 'error', 'candidates': [], 'error': result.get('message', 'unknown error')}
            except Exception as e:
                return mid, {'status': 'error', 'candidates': [], 'error': str(e)}

        with ThreadPoolExecutor(max_workers=min(len(tasks), 10)) as pool:
            futures = {pool.submit(_scan_one, mid, rec): mid for mid, rec in tasks.items()}
            for future in as_completed(futures):
                mid, result = future.result()
                scan_buffer[mid] = result
                if result['status'] == 'success':
                    count = len(result['candidates'])
                    self.app.print_colored(
                        f"    {mid}: \u2713 {count} service(s) found" if count else f"    {mid}: \u2713 clean",
                        'green',
                    )
                else:
                    self.app.print_colored(f"    {mid}: \u2717 {result['error']}", 'red')

        return scan_buffer

    def _persist_batch_discovery_results(self, scan_buffer: Dict[str, Dict[str, Any]]) -> None:
        """Persist all successful scan results after the batch completes."""
        for mid, result in scan_buffer.items():
            if result['status'] == 'success':
                self.record_machine_discovery_scan(mid, result['candidates'])

    def _classify_scan_results(self, scan_buffer: Dict[str, Dict[str, Any]]) -> Dict[str, List[str]]:
        """Categorize scan results into clean / discovered / failed / skipped."""
        classified: Dict[str, List[str]] = {'clean': [], 'discovered': [], 'failed': [], 'skipped': []}
        for mid, data in scan_buffer.items():
            if data['status'] == 'skipped':
                classified['skipped'].append(mid)
            elif data['status'] == 'error':
                classified['failed'].append(mid)
            elif data['candidates']:
                classified['discovered'].append(mid)
            else:
                classified['clean'].append(mid)
        return classified

    def _onboarding_review_machine_candidates(
        self,
        machine_id: str,
        candidates: List[Dict[str, Any]],
        config_env: str,
        session_mode: str,
    ) -> Dict[str, Any]:
        """Review and optionally import cached candidates for one machine during onboarding.

        Candidates whose ``environment`` equals *config_env*, or is ``''`` or
        ``'unknown'``, are offered for import; all others are only listed.
        A machine with two or more discovered services (matching or not) is
        handled as expert topology; in simple mode the user must first agree
        to switch to advanced mode.

        Args:
            machine_id: Id of the machine the candidates were found on.
            candidates: Discovery candidates for that machine.
            config_env: Environment of the configuration being built.
            session_mode: ``'simple'`` or ``'advanced'``.

        Returns:
            ``{action, imported_count, mode_switched_to}`` where *action* is
            ``'imported'``, ``'skipped'`` or ``'cancelled'`` and
            *mode_switched_to* is ``'advanced'`` or ``None``.
        """
        # 1. Separate env-matching vs mismatched candidates
        matching = [
            c for c in candidates
            if c.get('environment') == config_env or c.get('environment') in ('', 'unknown')
        ]
        mismatched = [c for c in candidates if c not in matching]
        total_count = len(candidates)

        # 2. Display summary with env filtering
        self.app.print_section(f"Machine {machine_id}: {total_count} service(s) found")
        for c in matching:
            self.app.print_colored(
                f"  {c['service_name']} [{c['service_state']}] {c.get('environment', '?')}"
                f"  \u2190 matches this config",
                'green',
            )
        for c in mismatched:
            self.app.print_colored(
                f"  {c['service_name']} [{c['service_state']}] {c.get('environment', '?')}"
                f"  \u2190 different environment",
                'yellow',
            )
        if mismatched and matching:
            self.app.print_colored(
                f"\nOnly {config_env} services can be imported into this config.", 'white',
            )

        if not matching:
            self.app.print_colored(
                f"\nNo {config_env} services on this machine. Skipping.", 'yellow',
            )
            return {'action': 'skipped', 'imported_count': 0, 'mode_switched_to': None}

        # 3. Multi-service topology rule: 2+ total services = expert
        needs_expert = total_count >= 2

        # 4. Simple-mode guardrail
        if needs_expert and session_mode == 'simple':
            self.app.print_colored(
                f"\nMachine {machine_id} has {total_count} running services "
                f"({len(matching)} match this config's environment).\n"
                "This machine requires Advanced mode because multiple services share its resources.",
                'yellow',
            )
            self.app.print_colored("\n  1) Switch to Advanced mode to import matching services")
            self.app.print_colored("  2) Skip this machine for now")
            choice = self.app.get_input("\nSelect (1-2)", "2")
            if choice != '1':
                return {'action': 'skipped', 'imported_count': 0, 'mode_switched_to': None}
            session_mode = 'advanced'

        # 5. Candidate selection (reuse existing UI)
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        machine_record = fleet_state['fleet']['machines'].get(machine_id, {})
        selected = self.app._select_discovery_candidates(machine_record, matching)
        if not selected:
            return {
                'action': 'skipped', 'imported_count': 0,
                'mode_switched_to': 'advanced' if needs_expert and session_mode == 'advanced' else None,
            }

        # 6. Cross-config duplicate warnings
        claims_warnings: List[str] = []
        for candidate in selected:
            claims = self.find_runtime_identity_claims(
                machine_record, candidate['service_name'],
                exclude_config_name=self.active_config.get('config_name'),
            )
            if claims:
                for claim in claims:
                    claims_warnings.append(
                        f"  {candidate['service_name']} is also tracked in "
                        f"'{claim['config_name']}' as '{claim['instance_name']}'"
                    )
        if claims_warnings:
            self.app.print_colored("\n\u26a0 Cross-config warnings:", 'yellow')
            for w in claims_warnings:
                self.app.print_colored(w, 'yellow')
            if self.app.get_input("Continue with import? (y/n)", "y").lower() != 'y':
                return {'action': 'skipped', 'imported_count': 0, 'mode_switched_to': None}

        # 7. Topology promotion if needed
        final_topology = 'expert' if needs_expert else str(machine_record.get('topology_mode') or 'standard')
        existing_instances = machine_record.get('instance_names', [])
        if final_topology != 'expert' and (len(existing_instances) + len(selected)) > 1:
            final_topology = 'expert'

        if final_topology == 'expert' and str(machine_record.get('topology_mode') or 'standard') != 'expert':
            self.upsert_machine_record(machine_id, {'topology_mode': 'expert'})

        # 8. Logical name mapping
        # NOTE(review): the fresh-instance flow adds hosts through the mapping
        # returned by _get_gpu_hosts, so it is presumably the live inventory.
        # Placeholders written here therefore must not outlive this call:
        # they are tracked and removed again unless the import replaced them.
        hosts = _get_gpu_hosts(self.app.inventory)
        logical_name_map: Dict[str, str] = {}
        reserved_names: List[str] = []
        try:
            for candidate in selected:
                name = self.app._prompt_discovery_import_name(candidate, hosts, machine_id=machine_id)
                if name is None:
                    return {'action': 'cancelled', 'imported_count': 0, 'mode_switched_to': None}
                logical_name_map[candidate['service_name']] = name
                # Reserve to prevent duplicates, but never overwrite an
                # existing host entry with an empty placeholder.
                if name not in hosts:
                    hosts[name] = {}
                    reserved_names.append(name)

            # 9. Import
            result = self.app.import_discovery_candidates(machine_id, selected, logical_name_map)
        finally:
            # Drop placeholders that are still empty (cancelled naming, failed
            # or raising import) so no blank host entries stay behind.
            for reserved in reserved_names:
                if hosts.get(reserved) == {}:
                    del hosts[reserved]
        imported_count = len(result.get('imported_names', [])) if result.get('status') == 'success' else 0

        return {
            'action': 'imported',
            'imported_count': imported_count,
            'mode_switched_to': 'advanced' if needs_expert else None,
        }

    def _onboarding_batch_discovery_and_import(
        self,
        registered_ids: List[str],
        config_env: str,
        *,
        session_mode: str = 'simple',
    ) -> Dict[str, Any]:
        """Offer batch discovery and import during onboarding.

        Returns ``{scanned, imported_total, session_mode, clean_machine_ids}``.
        """
        scan_choice = self.app.get_input(
            "Check these machines for existing edge_node services now? (Y/n)", "Y",
        )
        if scan_choice.lower() == 'n':
            return {'scanned': False, 'imported_total': 0, 'session_mode': session_mode, 'clean_machine_ids': []}

        # Batch scan
        self.app.print_section("Scanning Machines")
        scan_buffer = self._batch_discover_machines(registered_ids)

        # Persist all scan results after the batch completes
        self._persist_batch_discovery_results(scan_buffer)

        # Summary
        classified = self._classify_scan_results(scan_buffer)
        self.app.print_section("Discovery Summary")
        if classified['clean']:
            self.app.print_colored(f"  \u2713 Clean: {', '.join(classified['clean'])}", 'green')
        if classified['discovered']:
            for mid in classified['discovered']:
                count = len(scan_buffer[mid]['candidates'])
                self.app.print_colored(f"  \U0001f4e1 Found services: {mid} ({count} service(s))", 'cyan')
        if classified['failed']:
            self.app.print_colored(f"  \u2717 Failed: {', '.join(classified['failed'])}", 'red')
        if classified['skipped']:
            self.app.print_colored(f"  \u23ed Skipped: {', '.join(classified['skipped'])}", 'yellow')

        # Import flow for machines with candidates
        imported_total = 0
        for mid in classified['discovered']:
            candidates = scan_buffer[mid]['candidates']
            result = self._onboarding_review_machine_candidates(
                mid, candidates, config_env, session_mode,
            )
            imported_total += result.get('imported_count', 0)
            if result.get('mode_switched_to') == 'advanced':
                session_mode = 'advanced'

        return {
            'scanned': True,
            'imported_total': imported_total,
            'session_mode': session_mode,
            'clean_machine_ids': classified['clean'],
        }

    # -- Gap fill primitives (Phase 4) --

    def _build_fresh_host_entry(self, machine_id: str, host_name: str, *, topology_mode: str = 'standard') -> Dict[str, Any]:
        """Assemble the inventory entry for a brand-new instance on a registered machine.

        The entry starts from the machine's access configuration and is then
        stamped with the r1setup bookkeeping fields, a ``never_deployed``
        status and the current timestamp, before the runtime snapshot is
        applied to it.

        Raises:
            ValueError: if *machine_id* has no record in the fleet state.
        """
        machines = self._normalize_fleet_state(self.get_fleet_state_copy())['fleet']['machines']
        record = machines.get(machine_id)
        if not record:
            raise ValueError(f"Machine '{machine_id}' not found in fleet state")

        entry = dict(self.app._extract_machine_access_config(record))
        entry['r1setup_machine_id'] = machine_id
        entry['r1setup_topology_mode'] = topology_mode
        entry['r1setup_machine_deployment_state'] = 'active'
        entry['r1setup_runtime_name_policy'] = 'normalize_to_target'
        entry['r1setup_instance_logical_name'] = host_name
        entry['node_status'] = 'never_deployed'
        entry['last_status_update'] = datetime.now().isoformat()
        entry['r1setup_service_file_version'] = DEFAULT_SERVICE_FILE_VERSION

        self.apply_runtime_snapshot_to_host_config(host_name, entry)
        return entry

    def _onboarding_gap_fill_clean_machines(
        self,
        clean_ids: List[str],
        config_env: str,
        *,
        config_mode: str = 'simple',
        desired_counts: Optional[Dict[str, int]] = None,
    ) -> int:
        """Offer to create fresh instances on machines that scanned clean.

        Each machine gets ``desired_counts[machine_id]`` instances (one when
        the machine has no entry), reduced by the instances already linked to
        it in the fleet state.  A machine is switched to expert topology only
        in advanced mode when more than one instance is still to be created.
        The configuration is saved afterwards.

        Returns the number of fresh instances created.
        """
        if not clean_ids:
            return 0
        desired_counts = desired_counts or {}

        self.app.print_section("Fresh Instance Creation")
        self.app.print_colored(
            f"{len(clean_ids)} machine(s) have no existing services and are ready for fresh instances:",
            'cyan',
        )
        for machine_id in clean_ids:
            planned = desired_counts.get(machine_id, 1)
            if planned <= 1:
                line = f"  {machine_id}"
            else:
                line = f"  {machine_id} ({planned} planned)"
            self.app.print_colored(line, 'white')

        answer = self.app.get_input(
            "Create fresh edge_node instance(s) on clean machines? (Y/n)", "Y",
        )
        if answer.lower() == 'n':
            return 0

        created_total = 0
        hosts = _get_gpu_hosts(self.app.inventory)
        for machine_id in clean_ids:
            # How many instances are still missing on this machine?
            machines = self._normalize_fleet_state(self.get_fleet_state_copy()).get('fleet', {}).get('machines', {})
            already_linked = len(machines.get(machine_id, {}).get('instance_names', []))
            remaining = max(0, desired_counts.get(machine_id, 1) - already_linked)
            if remaining <= 0:
                continue

            expert = config_mode == 'advanced' and remaining > 1
            topology = 'expert' if expert else 'standard'
            if expert:
                self.upsert_machine_record(machine_id, {'topology_mode': 'expert'})

            name_prompt = f"Instance name for machine '{machine_id}'"
            for ordinal in range(1, remaining + 1):
                # Suggest the machine id itself for a single instance,
                # otherwise a numbered variant; avoid names already taken.
                suggested = machine_id if remaining == 1 else f"{machine_id}-{ordinal}"
                if suggested in hosts:
                    suffix = 2
                    while f"{suggested}_{suffix}" in hosts:
                        suffix += 1
                    suggested = f"{suggested}_{suffix}"

                chosen = self.app._get_valid_hostname(name_prompt, suggested)
                while chosen in hosts:
                    self.app.print_colored(f"Instance '{chosen}' already exists.", 'red')
                    chosen = self.app._get_valid_hostname(name_prompt, suggested)

                hosts[chosen] = self._build_fresh_host_entry(machine_id, chosen, topology_mode=topology)

                # Link the new instance to its machine and mark the machine active.
                linked_names = (
                    self.get_fleet_state_copy()['fleet']['machines']
                    .get(machine_id, {})
                    .get('instance_names', [])
                )
                self.upsert_machine_record(machine_id, {
                    'deployment_state': 'active',
                    'instance_names': sorted(set(linked_names + [chosen])),
                })

                self.app.print_colored(
                    f"  \u2713 Instance '{chosen}' created on machine '{machine_id}'.", 'green',
                )
                created_total += 1

        # Persist the inventory together with the updated node count.
        config_name = self.active_config.get('config_name')
        nodes_count = len(_get_gpu_hosts(self.app.inventory))
        self._save_config_with_metadata(config_name, config_env, nodes_count)

        return created_total

    def _generate_config_name(self, nodes_count: int, custom_name: str = None, *, unit: str = 'n') -> str:
        """Generate a configuration name with user input, timestamp and metadata.

        The *unit* suffix defaults to ``'n'`` (node count) for existing callers.
        Machine-first flows pass ``unit='m'`` so the resulting name reflects
        machine count instead of the (initially zero) node count.
        """
        if not custom_name:
            custom_name = self._prompt_new_config_name()

        # Generate timestamp components
        now = datetime.now()
        date_str = now.strftime('%Y%m%d')  # YYYYMMDD
        time_str = now.strftime('%H%M')    # HHMM

        config_name = f"{custom_name}_{date_str}_{time_str}_{nodes_count}{unit}"

        return config_name

    def _list_available_configs(self) -> List[Tuple[str, Dict]]:
        """List all available configurations with their metadata, newest first.

        Every ``*.yml`` file in the configs directory that has the
        ``all -> children -> gpu_nodes`` structure is reported.  Metadata
        starts from defaults (host count, ``'unknown'`` environment, file
        mtime as ``created_at``) and is overridden by the ``.json`` sidecar
        next to the file when one can be read.  Unreadable or malformed files
        are skipped.

        Returns:
            ``(file_name, metadata)`` tuples sorted by ``created_at``,
            newest first.
        """
        def _created_at_sort_key(entry: Tuple[str, Dict]) -> float:
            # The sidecar may replace the default mtime (a float) with any
            # JSON value.  Map everything onto a float so that mixed types
            # cannot make the sort raise TypeError.
            created_at = entry[1].get('created_at')
            try:
                if isinstance(created_at, (int, float)):
                    return float(created_at)
                if isinstance(created_at, str):
                    return datetime.fromisoformat(created_at).timestamp()
            except (ValueError, OverflowError, OSError):
                pass
            # Missing or unparseable timestamps sort as oldest.
            return 0.0

        configs = []
        # Look for all .yml files in configs directory (not just hosts_config_*)
        for config_file in self.app.configs_dir.glob("*.yml"):
            try:
                with open(config_file) as f:
                    config_data = yaml.safe_load(f)

                # Check if this is a valid configuration file by looking for the expected structure
                if not config_data or 'all' not in config_data:
                    continue
                if 'children' not in config_data.get('all', {}) or 'gpu_nodes' not in config_data.get('all', {}).get('children', {}):
                    continue

                # Extract metadata from config
                hosts = _get_gpu_hosts(config_data)
                metadata_file = config_file.with_suffix('.json')

                metadata = {
                    'nodes_count': len(hosts),
                    'environment': 'unknown',
                    'created_at': config_file.stat().st_mtime
                }

                if metadata_file.exists():
                    # Best effort: a broken sidecar leaves the defaults in place.
                    try:
                        with open(metadata_file) as f:
                            metadata.update(json.load(f))
                    except Exception:
                        pass

                configs.append((config_file.name, metadata))
            except Exception:
                # Anything that cannot be parsed as a configuration is ignored.
                continue

        # Sort by creation time, newest first
        configs.sort(key=_created_at_sort_key, reverse=True)
        return configs

    def find_runtime_identity_claims(
        self,
        machine_config: Dict[str, Any],
        service_name: str,
        *,
        exclude_config_name: Optional[str] = None,
    ) -> List[Dict[str, str]]:
        """Return saved config claims for one runtime identity across all known configs.

        A runtime identity is the pair (machine endpoint, service name).  Each
        known configuration is loaded, its fleet state taken from the metadata
        sidecar (or rebuilt from the inventory when the sidecar has none), and
        every instance whose machine endpoint and runtime service name both
        match is reported.

        Args:
            machine_config: Machine or host record providing the endpoint.
            service_name: Runtime service name to look for.
            exclude_config_name: Configuration name (file stem) to leave out.

        Returns:
            A list of ``{config_name, instance_name, machine_id}`` dicts;
            empty when the endpoint or service name is blank.
        """
        endpoint = self._machine_endpoint_from_record(machine_config)
        normalized_service_name = str(service_name or '').strip()
        if not endpoint or not normalized_service_name:
            return []

        claims: List[Dict[str, str]] = []
        for config_filename, _metadata in self._list_available_configs():
            config_name = Path(config_filename).stem
            if exclude_config_name and config_name == exclude_config_name:
                continue

            config_path = self.app.configs_dir / config_filename
            metadata_path = self._metadata_path_for_config(self.app.configs_dir, config_name)
            try:
                with open(config_path) as handle:
                    inventory = yaml.safe_load(handle) or {}
            except Exception:
                continue

            metadata: Dict[str, Any] = {}
            if metadata_path.exists():
                try:
                    with open(metadata_path) as handle:
                        loaded = json.load(handle)
                except Exception:
                    loaded = None
                # A sidecar holding valid JSON that is not an object (list,
                # string, null) is treated like a missing one instead of
                # crashing on ``.get`` below.
                if isinstance(loaded, dict):
                    metadata = loaded

            fleet_state = metadata.get('fleet_state') or self.build_fleet_state(inventory)
            normalized_fleet = self._normalize_fleet_state(fleet_state)
            machines = normalized_fleet.get('fleet', {}).get('machines', {})
            instances = normalized_fleet.get('fleet', {}).get('instances', {})
            for instance_name, instance_data in instances.items():
                assigned_machine_id = str(instance_data.get('assigned_machine_id') or '').strip()
                machine_record = dict(machines.get(assigned_machine_id, {}))
                if self._machine_endpoint_from_record(machine_record) != endpoint:
                    continue
                runtime = dict(instance_data.get('runtime') or {})
                if str(runtime.get('service_name') or '').strip() != normalized_service_name:
                    continue
                claims.append({
                    'config_name': config_name,
                    'instance_name': instance_name,
                    'machine_id': assigned_machine_id,
                })

        return claims

    @staticmethod
    def _default_fleet_state() -> Dict[str, Any]:
        """Build an empty fleet state: current schema version, no machines, no instances."""
        state: Dict[str, Any] = {'config_schema_version': CONFIG_SCHEMA_VERSION}
        state['fleet'] = {'machines': {}, 'instances': {}}
        return state

    def _normalize_fleet_state(self, fleet_state: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Normalize fleet-state metadata into the current schema shape."""
        normalized = copy.deepcopy(fleet_state) if isinstance(fleet_state, dict) else {}
        normalized['config_schema_version'] = int(normalized.get('config_schema_version') or CONFIG_SCHEMA_VERSION)
        fleet = normalized.get('fleet')
        if not isinstance(fleet, dict):
            fleet = {}
        machines = fleet.get('machines')
        instances = fleet.get('instances')
        fleet['machines'] = machines if isinstance(machines, dict) else {}
        fleet['instances'] = instances if isinstance(instances, dict) else {}
        normalized['fleet'] = fleet
        normalized = self._canonicalize_fleet_machine_records(normalized)
        return normalized

    @staticmethod
    def _normalize_machine_endpoint(ansible_host: Any, ansible_user: Any = 'root', ansible_port: Any = 22) -> str:
        """Return a stable endpoint key for a machine connection identity."""
        host = str(ansible_host or '').strip()
        if not host:
            return ''
        user = str(ansible_user or 'root').strip() or 'root'
        port = str(ansible_port or 22).strip() or '22'
        return f"{user}@{host}:{port}"

    @classmethod
    def _machine_endpoint_from_record(
        cls,
        record: Dict[str, Any],
        *,
        fallback_host_name: Optional[str] = None,
    ) -> str:
        """Return the normalized endpoint for a machine or host record."""
        return cls._normalize_machine_endpoint(
            record.get('ansible_host') or fallback_host_name,
            record.get('ansible_user') or 'root',
            record.get('ansible_port') or 22,
        )

    @classmethod
    def _is_derived_machine_id(cls, machine_id: str, machine_record: Dict[str, Any]) -> bool:
        """Return True when the machine id is just the derived endpoint form."""
        return str(machine_id or '').strip() == cls._machine_endpoint_from_record(machine_record)

    @classmethod
    def _select_canonical_machine_id(cls, machine_ids: List[str], machines: Dict[str, Dict[str, Any]]) -> str:
        """Choose the preferred machine id for a duplicate-endpoint machine group."""
        candidates = sorted(set(str(machine_id) for machine_id in machine_ids if str(machine_id).strip()))
        if not candidates:
            return ''

        explicit_candidates = [
            machine_id
            for machine_id in candidates
            if not cls._is_derived_machine_id(machine_id, machines.get(machine_id, {}))
        ]
        if explicit_candidates:
            return sorted(explicit_candidates)[0]
        return candidates[0]

    @classmethod
    def _canonicalize_fleet_machine_records(cls, fleet_state: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Collapse duplicate machine records that describe the same endpoint.

        Works on a deep copy of *fleet_state* (or on the default fleet state
        when the input is not a dict) and returns that copy.

        Steps:
          1. Group machine ids by normalized endpoint; machines without an
             endpoint keep their own id.
          2. Pick one canonical id per group and merge the group's records
             into it.
          3. Re-point every instance's ``assigned_machine_id`` at the
             canonical id.
          4. Rebuild each machine's ``instance_names`` from the instances,
             creating a default machine record for any assigned id that has
             no record yet.
        """
        normalized = copy.deepcopy(fleet_state) if isinstance(fleet_state, dict) else cls._default_fleet_state()
        fleet = normalized.setdefault('fleet', {})
        # Shallow copies of the two sections; a missing or falsy section becomes {}.
        machines = dict(fleet.get('machines') or {})
        instances = dict(fleet.get('instances') or {})

        # Step 1: endpoint -> all machine ids that resolve to it.
        endpoint_to_machine_ids: Dict[str, List[str]] = {}
        for machine_id, machine_record in machines.items():
            endpoint = cls._machine_endpoint_from_record(machine_record)
            if not endpoint:
                continue
            endpoint_to_machine_ids.setdefault(endpoint, []).append(machine_id)

        # Step 2a: original machine id -> canonical machine id.
        canonical_machine_map: Dict[str, str] = {}
        rebuilt_machines: Dict[str, Dict[str, Any]] = {}
        for machine_id, machine_record in machines.items():
            endpoint = cls._machine_endpoint_from_record(machine_record)
            if endpoint:
                canonical_machine_id = cls._select_canonical_machine_id(endpoint_to_machine_ids.get(endpoint, [machine_id]), machines)
            else:
                # No endpoint: nothing to deduplicate against.
                canonical_machine_id = machine_id
            canonical_machine_map[machine_id] = canonical_machine_id

        # Step 2b: merge every record into the record of its canonical id.
        for machine_id, machine_record in machines.items():
            canonical_machine_id = canonical_machine_map.get(machine_id, machine_id)
            canonical_record = rebuilt_machines.setdefault(canonical_machine_id, {})
            current_record = dict(machine_record)
            current_record['machine_id'] = canonical_machine_id

            if not canonical_record:
                # First record seen for this canonical id: take it wholesale.
                # instance_names is reset here and rebuilt from the instances below.
                canonical_record.update(current_record)
                canonical_record['machine_id'] = canonical_machine_id
                canonical_record['instance_names'] = []
                continue

            # Later duplicates only fill in fields the canonical record lacks.
            for field in (
                'ansible_host',
                'ansible_user',
                'ansible_port',
                'ansible_ssh_common_args',
                'ansible_ssh_pass',
                'ansible_become_password',
                'ansible_ssh_private_key_file',
                'machine_specs',
                'discovery',
            ):
                if canonical_record.get(field) in (None, '', {}):
                    value = current_record.get(field)
                    if value not in (None, '', {}):
                        canonical_record[field] = value

            # 'expert' topology on any duplicate promotes the merged record.
            if canonical_record.get('topology_mode') != 'expert' and current_record.get('topology_mode') == 'expert':
                canonical_record['topology_mode'] = 'expert'

            # A missing, blank or default deployment state is replaced by the
            # duplicate's state when the duplicate has one.
            if canonical_record.get('deployment_state') in (None, '', DEFAULT_MACHINE_DEPLOYMENT_STATE):
                deployment_state = current_record.get('deployment_state')
                if deployment_state not in (None, ''):
                    canonical_record['deployment_state'] = deployment_state

            # Specs are merged key by key; non-empty values from the duplicate
            # overwrite the values already on the canonical record.
            current_specs = dict(current_record.get('machine_specs') or {})
            if current_specs:
                merged_specs = dict(canonical_record.get('machine_specs') or {})
                merged_specs.update({key: value for key, value in current_specs.items() if value not in (None, '')})
                canonical_record['machine_specs'] = merged_specs

        # Step 3: re-point instances at canonical machine ids (unknown ids are kept as-is).
        for instance_name, instance_record in instances.items():
            assigned_machine_id = str(instance_record.get('assigned_machine_id') or '').strip()
            if not assigned_machine_id:
                continue
            instance_record['assigned_machine_id'] = canonical_machine_map.get(assigned_machine_id, assigned_machine_id)

        # Step 4: instance_names is derived purely from the instances section.
        for machine_record in rebuilt_machines.values():
            machine_record['instance_names'] = []

        for instance_name, instance_record in instances.items():
            assigned_machine_id = str(instance_record.get('assigned_machine_id') or '').strip()
            if not assigned_machine_id:
                continue
            # An instance may reference a machine that has no record; give it a default one.
            machine_record = rebuilt_machines.setdefault(assigned_machine_id, {
                'machine_id': assigned_machine_id,
                'topology_mode': DEFAULT_MACHINE_TOPOLOGY_MODE,
                'deployment_state': DEFAULT_MACHINE_DEPLOYMENT_STATE,
                'instance_names': [],
            })
            instance_names = machine_record.setdefault('instance_names', [])
            if instance_name not in instance_names:
                instance_names.append(instance_name)

        # Store the names sorted and without duplicates.
        for machine_record in rebuilt_machines.values():
            machine_record['instance_names'] = sorted(set(machine_record.get('instance_names') or []))

        fleet['machines'] = rebuilt_machines
        fleet['instances'] = instances
        normalized['fleet'] = fleet
        return normalized

    def find_machine_record_by_endpoint(
        self,
        host_config: Dict[str, Any],
        *,
        fleet_state: Optional[Dict[str, Any]] = None,
    ) -> Tuple[str, Dict[str, Any]]:
        """Look up the fleet machine whose endpoint equals that of ``host_config``.

        Args:
            host_config: Host settings; its endpoint is computed with
                ``_machine_endpoint_from_record``.
            fleet_state: Fleet state to search. When ``None`` a copy of the
                current fleet state is used. Either way the state is passed
                through ``_normalize_fleet_state`` before searching.

        Returns:
            ``(machine_id, record_copy)`` for the first machine with an equal
            endpoint, or ``('', {})`` when the host has no endpoint or nothing
            matches. The record is a shallow copy.
        """
        if fleet_state is None:
            fleet_state = self.get_fleet_state_copy()
        normalized_fleet = self._normalize_fleet_state(fleet_state)

        wanted_endpoint = self._machine_endpoint_from_record(host_config)
        if not wanted_endpoint:
            return '', {}

        known_machines = normalized_fleet.get('fleet', {}).get('machines', {})
        first_match = next(
            (
                (candidate_id, dict(candidate))
                for candidate_id, candidate in known_machines.items()
                if self._machine_endpoint_from_record(candidate) == wanted_endpoint
            ),
            None,
        )
        if first_match is None:
            return '', {}
        return first_match

    def bind_host_to_existing_machine(
        self,
        host_name: str,
        host_config: Dict[str, Any],
        *,
        fleet_state: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Return a copy of ``host_config`` tied to the machine sharing its endpoint.

        Args:
            host_name: Name used as the logical instance name when the host
                does not already carry one.
            host_config: Host settings; never mutated (``None`` is treated as
                an empty mapping).
            fleet_state: Optional fleet state forwarded to
                ``find_machine_record_by_endpoint``.

        Returns:
            A shallow copy of the host settings. When a machine with the same
            endpoint is found, the copy additionally carries the machine id
            and, if set on the machine, its topology mode and deployment state.
        """
        result = dict(host_config or {})
        matched_id, matched_record = self.find_machine_record_by_endpoint(result, fleet_state=fleet_state)

        if matched_id:
            result['r1setup_machine_id'] = matched_id
            # An explicit logical name on the host wins over the host name.
            if 'r1setup_instance_logical_name' not in result:
                result['r1setup_instance_logical_name'] = host_name

            topology_mode = matched_record.get('topology_mode')
            if topology_mode:
                result['r1setup_topology_mode'] = topology_mode

            deployment_state = matched_record.get('deployment_state')
            if deployment_state:
                result['r1setup_machine_deployment_state'] = deployment_state

        return result

    def promote_machine_to_expert(
        self,
        machine_id: str,
        inventory: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Switch a known machine, and the hosts living on it, to expert topology.

        Nothing is written to disk here: the machine record is updated in a
        normalized copy of the fleet state (stored on ``self.fleet_state``)
        and matching host entries of the inventory are modified in place.

        Args:
            machine_id: Id of the machine to promote.
            inventory: Inventory whose hosts are updated; defaults to
                ``self.app.inventory``.

        Returns:
            The promoted machine record, or ``{}`` when the machine is not in
            the fleet state (in which case nothing is changed).
        """
        if inventory is None:
            inventory = self.app.inventory
        gpu_hosts = _get_gpu_hosts(inventory)

        state = self._normalize_fleet_state(self.get_fleet_state_copy())
        promoted = dict(state.get('fleet', {}).get('machines', {}).get(machine_id, {}))
        if not promoted:
            return {}

        promoted['topology_mode'] = 'expert'
        state['fleet']['machines'][machine_id] = promoted

        runtime_fields = (
            'edge_node_service_name',
            'mnl_docker_container_name',
            'mnl_docker_volume_path',
            'mnl_r1setup_metadata_host_path',
            'r1setup_runtime_exit_status_path',
        )
        target_endpoint = self._machine_endpoint_from_record(promoted)

        for name, config in gpu_hosts.items():
            declared_id = str(config.get('r1setup_machine_id') or '').strip()
            endpoint = self._machine_endpoint_from_record(config, fallback_host_name=name)
            # A host belongs to the machine by explicit id or by equal endpoint.
            on_this_machine = declared_id == machine_id or (target_endpoint and endpoint == target_endpoint)
            if not on_this_machine:
                continue

            if not config.get('edge_node_service_name'):
                # Resolve runtime names on a scratch copy that is still marked
                # with the default topology, then copy the resolved values
                # back — presumably so the host keeps its pre-promotion
                # runtime names (NOTE(review): confirm against
                # apply_runtime_snapshot_to_host_config).
                scratch = dict(config)
                scratch.setdefault('r1setup_runtime_name_policy', 'preserve')
                scratch['r1setup_topology_mode'] = DEFAULT_MACHINE_TOPOLOGY_MODE
                self.apply_runtime_snapshot_to_host_config(name, scratch)
                for runtime_field in runtime_fields:
                    if scratch.get(runtime_field):
                        config[runtime_field] = scratch.get(runtime_field)

            config['r1setup_machine_id'] = machine_id
            config['r1setup_topology_mode'] = 'expert'
            config.setdefault('r1setup_runtime_name_policy', 'preserve')

        # Rewrite matching assignments with the exact id (drops stray whitespace).
        for record in state.get('fleet', {}).get('instances', {}).values():
            if str(record.get('assigned_machine_id') or '').strip() == machine_id:
                record['assigned_machine_id'] = machine_id

        self.fleet_state = state
        return promoted

    def _merge_fleet_state(self, fleet_state: Optional[Dict[str, Any]], inventory: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Combine stored fleet-state metadata with the view built from the inventory.

        Machines start from the normalized persisted records and are overlaid
        with the connection fields of every inventory-derived machine.
        Instances are limited to the inventory-derived ones, each layered on
        top of its persisted record. ``instance_names`` of every machine is
        then rebuilt from the instances' ``assigned_machine_id``.

        Args:
            fleet_state: Persisted fleet state (normalized before use).
            inventory: Inventory handed to ``build_fleet_state``.

        Returns:
            The merged state after ``_canonicalize_fleet_machine_records``.
        """
        persisted = self._normalize_fleet_state(fleet_state)
        derived = self.build_fleet_state(inventory)

        merged = self._default_fleet_state()
        merged_fleet = merged['fleet']
        merged_fleet['machines'] = copy.deepcopy(persisted['fleet']['machines'])
        merged_fleet['instances'] = {}
        machines = merged_fleet['machines']
        instances = merged_fleet['instances']

        optional_connection_fields = (
            'ansible_ssh_common_args',
            'ansible_ssh_pass',
            'ansible_become_password',
            'ansible_ssh_private_key_file',
        )
        for machine_id, derived_machine in derived['fleet']['machines'].items():
            record = dict(machines.get(machine_id, {}))
            # Endpoint fields always follow the inventory.
            record['machine_id'] = machine_id
            record['ansible_host'] = derived_machine.get('ansible_host')
            record['ansible_user'] = derived_machine.get('ansible_user', 'root')
            record['ansible_port'] = derived_machine.get('ansible_port', 22)
            # Optional credentials only override when the inventory has a value.
            for field in optional_connection_fields:
                derived_value = derived_machine.get(field)
                if derived_value not in (None, ''):
                    record[field] = derived_value
            # Persisted topology/deployment state wins over the derived one.
            record.setdefault('topology_mode', derived_machine.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE))
            record.setdefault(
                'deployment_state',
                derived_machine.get('deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE),
            )
            record['instance_names'] = []
            machines[machine_id] = record

        for instance_name, derived_instance in derived['fleet']['instances'].items():
            stored_instance = persisted['fleet']['instances'].get(instance_name, {})
            combined = dict(stored_instance)
            combined.update(derived_instance)
            instances[instance_name] = combined

        # Membership lists are recomputed from scratch, including for machines
        # that exist only in the persisted state.
        for record in machines.values():
            record['instance_names'] = []

        for instance_name, instance in instances.items():
            owner_id = instance.get('assigned_machine_id')
            if not owner_id:
                continue
            owner = machines.setdefault(owner_id, {
                'machine_id': owner_id,
                'topology_mode': DEFAULT_MACHINE_TOPOLOGY_MODE,
                'deployment_state': DEFAULT_MACHINE_DEPLOYMENT_STATE,
                'instance_names': [],
            })
            members = owner.setdefault('instance_names', [])
            if instance_name not in members:
                members.append(instance_name)

        for record in machines.values():
            record['instance_names'] = sorted(record.get('instance_names', []))

        return self._canonicalize_fleet_machine_records(merged)

    def prepare_host_for_persistence(
        self,
        host_name: str,
        host_config: Dict[str, Any],
        *,
        previous_host_config: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Build the host settings that should be saved after an add/edit flow.

        Steps, in order: carry over r1setup/runtime fields from the previous
        settings when the new ones omit them, bind the host to an existing
        machine by endpoint, fall back to the previous machine binding when
        the host still has no machine id, force the logical instance name to
        ``host_name`` and finally apply the runtime snapshot.

        Args:
            host_name: Inventory name of the host.
            host_config: New host settings (not mutated).
            previous_host_config: Settings the host had before the edit.

        Returns:
            A new dict with the prepared settings.
        """
        prepared = dict(host_config or {})
        previous = dict(previous_host_config or {})

        carried_fields = (
            'r1setup_machine_id',
            'r1setup_topology_mode',
            'r1setup_machine_deployment_state',
            'r1setup_runtime_name_policy',
            'edge_node_service_name',
            'mnl_docker_container_name',
            'mnl_docker_volume_path',
            'mnl_r1setup_metadata_host_path',
            'r1setup_runtime_exit_status_path',
            'r1setup_cpu_limit_cores',
            'r1setup_memory_limit_gb',
        )
        prepared.update({
            name: previous.get(name)
            for name in carried_fields
            if name in previous and name not in prepared
        })

        prepared = self.bind_host_to_existing_machine(host_name, prepared)

        previous_machine_id = previous.get('r1setup_machine_id')
        if previous_machine_id and not prepared.get('r1setup_machine_id'):
            # The new settings carry an empty machine id: restore the old binding.
            prepared['r1setup_machine_id'] = previous_machine_id
            prepared['r1setup_topology_mode'] = previous.get(
                'r1setup_topology_mode',
                DEFAULT_MACHINE_TOPOLOGY_MODE,
            )
            for optional_field in ('r1setup_machine_deployment_state', 'r1setup_runtime_name_policy'):
                if previous.get(optional_field):
                    prepared[optional_field] = previous.get(optional_field)

        prepared['r1setup_instance_logical_name'] = host_name
        self.apply_runtime_snapshot_to_host_config(host_name, prepared)
        return prepared

    def remove_instance_from_fleet_state(self, instance_name: str) -> Dict[str, Any]:
        """Drop one instance from the in-memory fleet state, keeping its machine.

        The machine the instance was assigned to stays in the fleet state; it
        only loses the instance from ``instance_names`` and, when it was
        ``'active'`` and has no instances left, falls back to ``'prepared'``.

        Args:
            instance_name: Name of the instance to remove.

        Returns:
            A dict with ``machine_id`` (``''`` when the instance was unknown
            or unassigned), ``remaining_instance_names`` (sorted) and
            ``machine_retained`` (whether a machine record was found).
        """
        state = self._normalize_fleet_state(self.get_fleet_state_copy())
        machines = state['fleet']['machines']
        instances = state['fleet']['instances']

        removed = dict(instances.pop(instance_name, {}))
        owner_id = str(removed.get('assigned_machine_id') or '').strip()
        owner = dict(machines.get(owner_id, {}))

        survivors: List[str] = []
        if owner:
            survivors = [member for member in owner.get('instance_names', []) if member != instance_name]
            owner['instance_names'] = sorted(survivors)
            if owner.get('deployment_state') == 'active' and not survivors:
                owner['deployment_state'] = 'prepared'
            machines[owner_id] = owner

        self.fleet_state = state

        summary = {
            'machine_id': owner_id,
            'remaining_instance_names': sorted(survivors),
            'machine_retained': bool(owner),
        }
        return summary

    def record_imported_discovery_instance(
        self,
        instance_name: str,
        *,
        machine_id: str,
        runtime: Dict[str, Any],
        status: str,
        service_file_version: str,
        environment: str,
        environment_source: str,
    ) -> None:
        """Record discovery-import metadata for one instance in the in-memory fleet state.

        The instance record is created or updated, assigned to ``machine_id``
        and flagged as imported; the machine gains the instance in its
        ``instance_names`` and becomes ``'active'``. The result is stored on
        ``self.fleet_state`` only — this method does not write any file.

        Args:
            instance_name: Logical name of the imported instance.
            machine_id: Machine the instance runs on.
            runtime: Runtime description; a shallow copy is stored.
            status: Node status stored under ``status.node_status``.
            service_file_version: Stored service file version; a falsy value
                falls back to ``DEFAULT_SERVICE_FILE_VERSION``.
            environment: Environment of the import (``'unknown'`` if falsy).
            environment_source: Where the environment value came from
                (``'unknown'`` if falsy).
        """
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        machines = fleet_state['fleet']['machines']
        instances = fleet_state['fleet']['instances']

        instance_record = dict(instances.get(instance_name, {}))
        instance_record.update({
            'logical_name': instance_name,
            'assigned_machine_id': machine_id,
            'runtime_name_policy': 'preserve',
            'runtime': dict(runtime),
            'status': {
                'node_status': status,
                'service_file_version': service_file_version or DEFAULT_SERVICE_FILE_VERSION,
            },
            'imported_from_discovery': True,
            'discovery_import': {
                'imported_at': datetime.now().isoformat(),
                'environment': environment or 'unknown',
                'environment_source': environment_source or 'unknown',
            },
        })
        instances[instance_name] = instance_record

        machine_record = dict(machines.get(machine_id, {}))
        # A record created here for a previously unknown machine must carry
        # its own id, like every other place that creates machine records.
        machine_record.setdefault('machine_id', machine_id)
        member_names = set(machine_record.get('instance_names') or [])
        member_names.add(instance_name)
        machine_record['instance_names'] = sorted(member_names)
        machine_record['deployment_state'] = 'active'
        machines[machine_id] = machine_record

        self.fleet_state = fleet_state

    def get_fleet_state_copy(self) -> Dict[str, Any]:
        """Return an independent deep copy of ``self.fleet_state``.

        Callers can modify the result freely without touching the state held
        on this object.
        """
        snapshot = copy.deepcopy(self.fleet_state)
        return snapshot

    def ensure_configuration_shell(self, config_name: str, environment: str) -> None:
        """Save the current inventory (possibly without hosts) as the active configuration.

        Args:
            config_name: Name under which the configuration is saved.
            environment: Network environment recorded in the metadata.
        """
        node_total = len(_get_gpu_hosts(self.app.inventory))
        self._save_config_with_metadata(config_name, environment, node_total, update_symlink=True)

    def upsert_machine_record(self, machine_id: str, machine_data: Dict[str, Any]) -> None:
        """Create or update one machine record and save the active configuration.

        The record is merged into a normalized copy of the fleet state and
        stored on ``self.fleet_state`` before the active configuration name is
        checked, so the in-memory update happens even when saving is refused.

        Args:
            machine_id: Id of the machine; also written into the record.
            machine_data: Fields laid over the existing record, if any.

        Raises:
            ValueError: If no active configuration name is set.
        """
        state = self._normalize_fleet_state(self.get_fleet_state_copy())
        machines = state['fleet']['machines']

        record = dict(machines.get(machine_id, {}))
        record.update(machine_data)
        record['machine_id'] = machine_id
        # De-duplicate and order the membership list (empty when absent).
        record['instance_names'] = sorted(set(record.get('instance_names', [])))
        machines[machine_id] = record
        self.fleet_state = state

        active_name = self.active_config.get('config_name')
        if not active_name:
            raise ValueError("No active configuration shell is available")

        environment = self.get_mnl_app_env() or self.active_config.get('environment') or 'mainnet'
        node_total = len(_get_gpu_hosts(self.app.inventory))
        self._save_config_with_metadata(active_name, environment, node_total, update_symlink=False)

    def record_machine_discovery_scan(
        self,
        machine_id: str,
        candidates: List[Dict[str, Any]],
    ) -> None:
        """Store the newest discovery scan of a machine and save the configuration.

        The machine record receives a ``discovery`` entry holding the scan
        time and a deep copy of ``candidates``. The updated fleet state is put
        on ``self.fleet_state`` before the active configuration name is
        checked.

        Args:
            machine_id: Id of a machine already present in the fleet state.
            candidates: Scan results to store.

        Raises:
            ValueError: If the machine is unknown, or if no active
                configuration name is set.
        """
        state = self._normalize_fleet_state(self.get_fleet_state_copy())
        machines = state['fleet']['machines']

        record = dict(machines.get(machine_id, {}))
        if not record:
            raise ValueError(f"Machine '{machine_id}' is not present in the active fleet state")

        scan_result = {
            'last_scanned_at': datetime.now().isoformat(),
            'candidates': copy.deepcopy(candidates),
        }
        record['discovery'] = scan_result
        machines[machine_id] = record
        self.fleet_state = state

        active_name = self.active_config.get('config_name')
        if not active_name:
            raise ValueError("No active configuration shell is available")

        environment = self.get_mnl_app_env() or self.active_config.get('environment') or 'mainnet'
        node_total = len(_get_gpu_hosts(self.app.inventory))
        self._save_config_with_metadata(active_name, environment, node_total, update_symlink=False)

    def set_migration_plan_state(self, plan_state: Optional[Dict[str, Any]]) -> None:
        """Persist or clear the locally saved migration-plan state.

        The metadata file of the active configuration is read (an unreadable,
        corrupt or non-object file is treated as empty), the plan is stored or
        removed, the schema version and merged fleet state are refreshed, and
        the file is written back. The same change is mirrored in
        ``self.active_config``, which is then saved.

        The metadata embeds fleet state, which can hold SSH and become
        passwords, so it is written the same way ``_save_config_with_metadata``
        writes it: atomically and with owner-only (0600) permissions.

        Args:
            plan_state: Plan to store; a falsy value clears any stored plan.

        Raises:
            ValueError: If no active configuration name is set.
        """
        config_name = self.active_config.get('config_name')
        if not config_name:
            raise ValueError("No active configuration shell is available")

        metadata_path = self._metadata_path_for_config(self.app.configs_dir, config_name)
        metadata = {}
        if metadata_path.exists():
            try:
                with open(metadata_path) as f:
                    metadata = json.load(f)
            except (json.JSONDecodeError, IOError):
                metadata = {}
            # Valid JSON that is not an object cannot be updated key by key.
            if not isinstance(metadata, dict):
                metadata = {}

        if plan_state:
            metadata['migration_plan_state'] = copy.deepcopy(plan_state)
            self.active_config['migration_plan_state'] = copy.deepcopy(plan_state)
        else:
            metadata.pop('migration_plan_state', None)
            self.active_config.pop('migration_plan_state', None)

        metadata['config_schema_version'] = CONFIG_SCHEMA_VERSION
        metadata['fleet_state'] = self._merge_fleet_state(self.fleet_state or metadata.get('fleet_state'), self.app.inventory)

        # Atomic replace: an interrupted write must not leave a truncated
        # metadata file behind, and a newly created one must not be readable
        # by other users.
        self._atomic_write_text(metadata_path, json.dumps(metadata, indent=2), mode=0o600)

        self._save_active_config()

    def reconcile_legacy_migration_plan_state(
        self,
        inventory: Optional[Dict[str, Any]] = None,
        *,
        persist: bool = False,
    ) -> Dict[str, Any]:
        """Turn a stale ``rollback_failed`` plan into ``rolled_back`` when that is provably safe.

        A plan is repaired only if it is in status ``rollback_failed``, names
        both an instance and a source machine, the instance is currently
        assigned to that source machine and its node status is ``running``.
        Anything else is left untouched.

        Args:
            inventory: Inventory to inspect; defaults to ``self.app.inventory``.
            persist: When true, the repaired plan is also saved through
                ``set_migration_plan_state``.

        Returns:
            ``{'changed': bool, 'plan': dict}``; unchanged results past the
            status check additionally carry a ``'reason'`` string.
        """
        plan = copy.deepcopy(self.active_config.get('migration_plan_state') or {})

        def unchanged(reason: Optional[str] = None) -> Dict[str, Any]:
            outcome: Dict[str, Any] = {'changed': False, 'plan': plan}
            if reason is not None:
                outcome['reason'] = reason
            return outcome

        if not plan:
            return unchanged()
        if str(plan.get('status') or '').strip() != 'rollback_failed':
            return unchanged()

        instance_name = str(plan.get('instance_name') or '').strip()
        source_machine_id = str(plan.get('source_machine_id') or '').strip()
        if not (instance_name and source_machine_id):
            return unchanged('plan_missing_identity')

        if inventory is None:
            inventory = self.app.inventory
        host_config = dict(_get_gpu_hosts(inventory).get(instance_name, {}))
        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        instance_record = dict(fleet_state.get('fleet', {}).get('instances', {}).get(instance_name, {}))

        # The inventory entry takes precedence over the fleet-state record.
        current_machine_id = (
            host_config.get('r1setup_machine_id')
            or instance_record.get('assigned_machine_id')
            or ''
        )
        if str(current_machine_id).strip() != source_machine_id:
            return unchanged('instance_not_on_source_machine')

        current_status = (
            host_config.get('node_status')
            or (instance_record.get('status') or {}).get('node_status')
            or ''
        )
        if str(current_status).strip().lower() != 'running':
            return unchanged('source_runtime_not_running')

        repaired_plan = copy.deepcopy(plan)
        repaired_plan['status'] = 'rolled_back'
        repaired_plan['last_step'] = 'rollback_completed'
        repaired_plan['legacy_repair'] = {
            'repaired_at': datetime.now().isoformat(),
            'previous_status': str(plan.get('status') or 'rollback_failed'),
            'reason': 'source_runtime_already_running',
        }
        recovery_info = dict(repaired_plan.get('rollback_recovery') or {})
        recovery_info.setdefault('reconciled_after_error', True)
        repaired_plan['rollback_recovery'] = recovery_info

        self.active_config['migration_plan_state'] = copy.deepcopy(repaired_plan)
        if persist:
            self.set_migration_plan_state(repaired_plan)
        return {'changed': True, 'plan': repaired_plan}

    def finalize_instance_migration(
        self,
        instance_name: str,
        target_machine_id: str,
        target_runtime: Dict[str, Any],
        *,
        runtime_name_policy: str,
        migration_plan_state: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Commit a finished migration to the inventory host and the fleet state.

        The inventory host takes over the target machine's connection fields
        and the target runtime names; the instance record is re-assigned; the
        source machine loses the instance and the target machine gains it.
        The configuration is then saved and, if given, the migration plan
        state is stored.

        Args:
            instance_name: Inventory host / instance being migrated.
            target_machine_id: Machine the instance now runs on.
            target_runtime: Runtime description on the target; missing names
                fall back to the ``DEFAULT_RUNTIME_*`` values.
            runtime_name_policy: Policy recorded on host and instance.
            migration_plan_state: Optional plan state to persist afterwards.

        Raises:
            ValueError: If the host is missing from the inventory, or the
                instance or the target machine is missing from the fleet state.
        """
        hosts = _get_gpu_hosts(self.app.inventory)
        host_config = hosts.get(instance_name)
        if host_config is None:
            raise ValueError(f"Instance host '{instance_name}' is not present in the active inventory")

        fleet_state = self._normalize_fleet_state(self.get_fleet_state_copy())
        machines = fleet_state['fleet']['machines']
        instances = fleet_state['fleet']['instances']

        instance_record = dict(instances.get(instance_name, {}))
        if not instance_record:
            raise ValueError(f"Instance '{instance_name}' is not present in the active fleet state")

        source_machine_id = str(instance_record.get('assigned_machine_id') or '').strip()
        source_machine = dict(machines.get(source_machine_id, {}))
        target_machine = dict(machines.get(target_machine_id, {}))
        if not target_machine:
            raise ValueError(f"Target machine '{target_machine_id}' is not present in the active fleet state")

        # Connection settings mirror the target machine exactly: fields the
        # target does not define are removed from the host.
        connection_fields = (
            'ansible_host',
            'ansible_user',
            'ansible_port',
            'ansible_ssh_common_args',
            'ansible_ssh_pass',
            'ansible_become_password',
            'ansible_ssh_private_key_file',
        )
        for field in connection_fields:
            if field not in target_machine:
                host_config.pop(field, None)
                continue
            host_config[field] = target_machine[field]

        container_name = target_runtime.get('container_name', DEFAULT_RUNTIME_CONTAINER_NAME)
        volume_path = target_runtime.get('volume_path', DEFAULT_RUNTIME_VOLUME_PATH)
        host_config.update({
            'r1setup_machine_id': target_machine_id,
            'r1setup_topology_mode': target_machine.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE),
            'r1setup_machine_deployment_state': 'active',
            'r1setup_runtime_name_policy': runtime_name_policy,
            'edge_node_service_name': target_runtime.get('service_name', DEFAULT_RUNTIME_SERVICE_NAME),
            'mnl_docker_container_name': container_name,
            'mnl_docker_volume_path': volume_path,
            # Paths default to locations derived from the volume/container names.
            'mnl_r1setup_metadata_host_path': target_runtime.get(
                'metadata_path',
                f"{volume_path}/_data/r1setup/metadata.json",
            ),
            'r1setup_runtime_exit_status_path': target_runtime.get(
                'exit_status_path',
                f"/tmp/{container_name}.exit",
            ),
        })

        node_state = dict(instance_record.get('status', {}))
        node_state['node_status'] = host_config.get('node_status', 'running')
        instance_record['assigned_machine_id'] = target_machine_id
        instance_record['runtime_name_policy'] = runtime_name_policy
        instance_record['runtime'] = dict(target_runtime)
        instance_record['status'] = node_state
        instances[instance_name] = instance_record

        if source_machine:
            left_behind = [name for name in source_machine.get('instance_names', []) if name != instance_name]
            source_machine['instance_names'] = sorted(left_behind)
            source_machine['deployment_state'] = 'active' if left_behind else 'prepared'
            machines[source_machine_id] = source_machine

        # Written after the source so that a same-machine migration ends with
        # the instance still listed on the machine.
        target_machine['instance_names'] = sorted({*target_machine.get('instance_names', []), instance_name})
        target_machine['deployment_state'] = 'active'
        machines[target_machine_id] = target_machine

        self.fleet_state = fleet_state
        environment = self.get_mnl_app_env() or self.active_config.get('environment') or 'mainnet'
        self._save_config_with_metadata(
            self.active_config['config_name'],
            environment,
            len(hosts),
            update_symlink=False,
        )
        if migration_plan_state is not None:
            self.set_migration_plan_state(migration_plan_state)

    @staticmethod
    def _metadata_path_for_config(configs_dir: Path, config_name: str) -> Path:
        """Return the metadata path for a named configuration."""
        return configs_dir / f"{config_name}.json"

    def _save_config_with_metadata(self, config_name: str, environment: str, nodes_count: int, update_symlink: bool = True) -> None:
        """Write the inventory YAML and its JSON metadata for one configuration.

        Existing metadata is loaded and kept (a corrupt or unreadable file is
        replaced by fresh metadata), missing fields receive defaults, and the
        environment, counts, schema version, version stamps and merged fleet
        state are always refreshed. Both files are written atomically with
        mode 0600; afterwards the active configuration and
        ``self.fleet_state`` are updated.

        NOTE(review): ``dict(self.app.inventory)`` is a shallow copy, so
        removing ``mnl_app_env`` (and an emptied ``vars``) below also removes
        it from the live ``self.app.inventory['all']``. Confirm whether
        callers rely on that before changing it.

        Args:
            config_name: Base name of the ``.yml`` / ``.json`` files.
            environment: Network environment recorded in the metadata.
            nodes_count: Number of nodes recorded in the metadata.
            update_symlink: Whether to repoint the active hosts symlink.
        """
        configs_dir = self.app.configs_dir
        config_path = configs_dir / f"{config_name}.yml"
        metadata_path = self._metadata_path_for_config(configs_dir, config_name)

        # The environment lives in the metadata, not in the saved inventory.
        inventory_to_save = dict(self.app.inventory)
        all_group = inventory_to_save['all']
        if 'vars' in all_group:
            group_vars = all_group['vars']
            group_vars.pop('mnl_app_env', None)
            if not group_vars:
                all_group.pop('vars', None)

        inventory_yaml = yaml.safe_dump(inventory_to_save, default_flow_style=False)

        metadata = {}
        if metadata_path.exists():
            try:
                with open(metadata_path) as handle:
                    metadata = json.load(handle)

                metadata['environment'] = environment
                metadata['nodes_count'] = nodes_count
            except (json.JSONDecodeError, IOError):
                # Corrupt or unreadable metadata: start over.
                metadata = {}

        # Defaults for fields that are absent (existing values are kept).
        metadata.setdefault('created_at', datetime.now().isoformat())
        metadata.setdefault('config_schema_version', CONFIG_SCHEMA_VERSION)
        metadata.setdefault('config_name', config_name)
        if 'description' not in metadata:
            fleet_machines = (self.fleet_state or {}).get('fleet', {}).get('machines', {})
            # A machine-only configuration is described by its machine count.
            if fleet_machines and nodes_count == 0:
                metadata['description'] = f"Configuration with {len(fleet_machines)} machine(s) for {environment} network"
            else:
                metadata['description'] = f"Configuration with {nodes_count} node(s) for {environment} network"
        for key, fallback in (
            ('last_deployed_date', None),
            ('last_deployed_network', None),
            ('deployment_status', 'never_deployed'),
            ('last_deleted_date', None),
        ):
            metadata.setdefault(key, fallback)
        # last_deployment_type was fleet-level and misleading on mixed fleets;
        # drop it so stale values do not linger in the config file.
        metadata.pop('last_deployment_type', None)

        # Fields refreshed on every save.
        metadata['environment'] = environment
        metadata['nodes_count'] = nodes_count
        metadata['config_schema_version'] = CONFIG_SCHEMA_VERSION
        # Version stamping: which CLI and collection versions wrote this
        # config, and when, so post-mortems can tell what tooling produced it.
        # Added in 1.8.0; older configs gain these keys on their first save.
        metadata['last_written_cli_version'] = CLI_VERSION
        metadata['last_written_collection_version'] = self.get_collection_version()
        metadata['last_written_at'] = datetime.now().isoformat()

        merged_fleet = self._merge_fleet_state(self.fleet_state or metadata.get('fleet_state'), self.app.inventory)
        metadata['machines_count'] = len(merged_fleet.get('fleet', {}).get('machines', {}))
        metadata['fleet_state'] = merged_fleet

        metadata_json = json.dumps(metadata, indent=2)

        self._atomic_write_text(config_path, inventory_yaml, mode=0o600)
        self._atomic_write_text(metadata_path, metadata_json, mode=0o600)

        # Mirror what was written into the in-memory active configuration.
        self.active_config.update(metadata)
        self.fleet_state = copy.deepcopy(metadata['fleet_state'])
        self._save_active_config()

        if update_symlink:
            self._update_hosts_symlink(config_path)

    @staticmethod
    def _atomic_write_text(path: Path, content: str, *, mode: Optional[int] = None) -> None:
        """Atomically replace a text file after fully writing it to a temp file."""
        path.parent.mkdir(parents=True, exist_ok=True)
        temp_path: Optional[Path] = None
        try:
            with tempfile.NamedTemporaryFile(
                'w',
                encoding='utf-8',
                dir=path.parent,
                delete=False,
            ) as temp_file:
                temp_file.write(content)
                temp_file.flush()
                os.fsync(temp_file.fileno())
                temp_path = Path(temp_file.name)

            if mode is not None:
                os.chmod(temp_path, mode)
            os.replace(temp_path, path)
        finally:
            if temp_path and temp_path.exists():
                temp_path.unlink(missing_ok=True)

    def _update_hosts_symlink(self, config_path: Path) -> None:
        """Update the hosts.yml symlink to point to the active configuration.

        The new link is first created under a temporary name next to
        ``self.app.config_file`` and then moved over it with ``os.replace``.
        An existing ``hosts.yml`` (link or regular file) is therefore only
        replaced once the new link exists; if creating the link fails, the
        previous one stays in place. Failures are reported through
        ``print_colored`` and not raised.

        Args:
            config_path: Configuration file the link should point to.
        """
        # Ensure the config directory exists
        self.app.config_dir.mkdir(parents=True, exist_ok=True)

        link_path = self.app.config_file
        staging_path = link_path.with_name(f".{link_path.name}.{os.getpid()}.tmp")

        try:
            # Clear leftovers of an earlier interrupted attempt.
            staging_path.unlink(missing_ok=True)
            staging_path.symlink_to(config_path)
            os.replace(staging_path, link_path)
            self.app.print_colored(f"Active configuration linked to: {config_path.name}", 'green')
        except Exception as e:
            try:
                staging_path.unlink(missing_ok=True)
            except OSError:
                # Best effort only; the original error is what gets reported.
                pass
            self.app.print_colored(f"Error creating symlink: {e}", 'red')

    def _load_config_by_name(self, config_name: str) -> bool:
        """Make the named configuration the loaded one.

        Reads the inventory YAML and, when present, the JSON metadata (whose
        fleet state is merged with the inventory and copied to
        ``self.fleet_state``); without metadata the fleet state is built from
        the inventory alone. The hosts symlink is then repointed and the
        environment of the active configuration is applied.

        Args:
            config_name: Base name of the configuration files.

        Returns:
            ``True`` on success; ``False`` when the YAML file does not exist
            or any step fails (the error is printed, not raised).
        """
        configs_dir = self.app.configs_dir
        config_path = configs_dir / f"{config_name}.yml"
        metadata_path = self._metadata_path_for_config(configs_dir, config_name)

        if not config_path.exists():
            return False

        try:
            with open(config_path) as handle:
                loaded_inventory = yaml.safe_load(handle)
            # An empty YAML file keeps the inventory that is already in memory.
            self.app.inventory = loaded_inventory or self.app.inventory

            if not metadata_path.exists():
                self.fleet_state = self.build_fleet_state(self.app.inventory)
            else:
                with open(metadata_path) as handle:
                    metadata = json.load(handle)
                merged_fleet = self._merge_fleet_state(metadata.get('fleet_state'), self.app.inventory)
                metadata['fleet_state'] = merged_fleet
                self.active_config.update(metadata)
                self.fleet_state = copy.deepcopy(merged_fleet)
                self.reconcile_legacy_migration_plan_state(self.app.inventory, persist=True)
                self._save_active_config()

            self._update_hosts_symlink(config_path)

            environment = self.active_config.get('environment')
            if environment:
                self.set_mnl_app_env(environment)
        except Exception as e:
            self.app.print_colored(f"Error loading configuration: {e}", 'red')
            return False

        return True

    def load_configuration(self) -> bool:
        """Load the existing configuration from ``self.app.config_file``.

        Besides reading the inventory, this re-applies the active
        configuration's environment, rebuilds ``self.fleet_state`` (from the
        active configuration's metadata file when it can be read, otherwise
        from the inventory), normalizes the inventory and saves it again when
        normalization changed anything.

        Returns:
            True when the configuration was loaded; False when the file does
            not exist or an unexpected error occurred (printed in red).
        """
        if not self.app.config_file.exists():
            return False

        try:
            # An empty YAML document leaves the in-memory inventory untouched
            with open(self.app.config_file) as f:
                self.app.inventory = yaml.safe_load(f) or self.app.inventory

            # Ensure network environment is synchronized with active configuration
            env = self.active_config.get('environment')
            if env:
                self.set_mnl_app_env(env)
                self.app.print_debug(f"Network environment synchronized to: {env}")

            config_name = self.active_config.get('config_name')
            if config_name:
                metadata_path = self._metadata_path_for_config(self.app.configs_dir, config_name)
                if metadata_path.exists():
                    try:
                        with open(metadata_path) as f:
                            metadata = json.load(f)
                        metadata['fleet_state'] = self._merge_fleet_state(metadata.get('fleet_state'), self.app.inventory)
                        self.active_config.update(metadata)
                        # Deep copy keeps self.fleet_state independent of the dict now held by active_config
                        self.fleet_state = copy.deepcopy(metadata['fleet_state'])
                        self.reconcile_legacy_migration_plan_state(self.app.inventory, persist=True)
                    except Exception as e:
                        # Metadata problems are non-fatal: fall back to a fleet state built from the inventory.
                        # NOTE(review): active_config may already have been updated when a later step raised.
                        self.app.print_debug(f"Unable to load config metadata for {config_name}: {e}")
                        self.fleet_state = self.build_fleet_state(self.app.inventory)
                else:
                    # No metadata file for the active configuration
                    self.fleet_state = self.build_fleet_state(self.app.inventory)
            else:
                # No active configuration name recorded
                self.fleet_state = self.build_fleet_state(self.app.inventory)

            updated = self._normalize_inventory(self.app.inventory)

            # Save configuration if we updated any status fields
            if updated:
                self.app.print_debug("Saving configuration after normalizing inventory state")
                self._save_configuration()
            else:
                self.app.print_debug("All nodes already have status fields, no updates needed")

            return True
        except Exception as e:
            self.app.print_colored(f"Error loading configuration: {e}", 'red')
            return False

    def get_mnl_app_env(self) -> Optional[str]:
        """Get the current network environment setting"""
        if self.app.vars_file.exists():
            try:
                with open(self.app.vars_file) as f:
                    data = yaml.safe_load(f) or {}
                    return data.get('mnl_app_env')
            except Exception:
                pass
        return self.app.inventory.get('all', {}).get('vars', {}).get('mnl_app_env')

    def set_mnl_app_env(self, env_value: str) -> None:
        """Set the network environment.

        Writes ``mnl_app_env`` into the vars file (keeping any other keys the
        file already holds) and mirrors the value into
        ``self.app.inventory['all']['vars']``.

        Args:
            env_value: Environment name to persist.

        Raises:
            OSError: If the vars file or its parent directory cannot be written.
        """
        data: Dict[str, Any] = {}
        if self.app.vars_file.exists():
            try:
                with open(self.app.vars_file) as f:
                    loaded = yaml.safe_load(f)
            except Exception as e:
                # An unreadable vars file is rebuilt from scratch rather than blocking the update.
                self.app.print_debug(f"Unable to read {self.app.vars_file}, rewriting it: {e}")
                loaded = None
            # Only a top-level mapping can carry the key; anything else is discarded.
            if isinstance(loaded, dict):
                data = loaded

        data['mnl_app_env'] = env_value
        self.app.vars_file.parent.mkdir(parents=True, exist_ok=True)

        with open(self.app.vars_file, 'w') as f:
            yaml.safe_dump(data, f, default_flow_style=False)

        # Mirror into the inventory, creating the 'all' group / 'vars' mapping
        # when they are absent or were loaded from YAML as None.
        all_group = self.app.inventory.get('all')
        if not isinstance(all_group, dict):
            all_group = {}
            self.app.inventory['all'] = all_group
        if not isinstance(all_group.get('vars'), dict):
            all_group['vars'] = {}
        all_group['vars']['mnl_app_env'] = env_value

    def get_mnl_service_version(self) -> str:
        """Return ``mnl_service_version`` from ``<config_dir>/group_vars/mnl.yml``.

        Falls back to ``DEFAULT_SERVICE_FILE_VERSION`` when the file is
        missing, cannot be read, or holds no non-blank value.
        """
        vars_path = self.app.config_dir / 'group_vars' / 'mnl.yml'
        if not vars_path.exists():
            return DEFAULT_SERVICE_FILE_VERSION

        try:
            with open(vars_path) as handle:
                parsed = yaml.safe_load(handle) or {}
            declared = str(parsed.get('mnl_service_version') or '').strip()
        except Exception as e:
            self.app.print_debug(f"Unable to read service version from {vars_path}: {e}")
            return DEFAULT_SERVICE_FILE_VERSION

        return declared if declared else DEFAULT_SERVICE_FILE_VERSION

    def get_collection_version(self) -> str:
        """Get the current Ansible collection version from galaxy.yml."""
        galaxy_file = self.app.config_dir / 'galaxy.yml'
        if galaxy_file.exists():
            try:
                with open(galaxy_file) as f:
                    data = yaml.safe_load(f) or {}
                version = str(data.get('version') or '').strip()
                if version:
                    return version
            except Exception as e:
                self.app.print_debug(f"Unable to read collection version from {galaxy_file}: {e}")
        return 'unknown'

    @staticmethod
    def get_host_service_file_version(host_config: Dict[str, Any]) -> str:
        """Return the per-node service version stored on ``host_config``.

        A missing, falsy, or whitespace-only value yields
        ``DEFAULT_SERVICE_FILE_VERSION``.
        """
        stored = host_config.get(SERVICE_FILE_VERSION_FIELD)
        cleaned = str(stored or '').strip()
        if cleaned:
            return cleaned
        return DEFAULT_SERVICE_FILE_VERSION

    def record_service_file_version(self, host_names: List[str], service_version: Optional[str] = None) -> None:
        """Record one applied service-template version for every listed host.

        Args:
            host_names: Hosts that received the service template.
            service_version: Version that was applied; when falsy, the value
                from ``get_mnl_service_version()`` is used.
        """
        chosen = service_version or self.get_mnl_service_version()
        applied = str(chosen).strip()
        if not applied:
            applied = DEFAULT_SERVICE_FILE_VERSION
        self.record_service_file_versions(dict.fromkeys(host_names, applied))

    def record_service_file_versions(self, host_versions: Dict[str, str]) -> None:
        """Store discovered service-template versions on the matching hosts.

        Entries whose version (lower-cased) is one of
        ``UNKNOWN_SERVICE_FILE_VERSION_MARKERS`` and entries for hosts that
        are not in the inventory are ignored. The configuration is saved only
        when at least one stored value actually changed.

        Args:
            host_versions: Mapping of host name to discovered version.
        """
        inventory_hosts = _get_gpu_hosts(self.app.inventory)
        dirty = False

        for name, raw_version in host_versions.items():
            cleaned = str(raw_version or '').strip()
            if cleaned.lower() in UNKNOWN_SERVICE_FILE_VERSION_MARKERS:
                continue

            entry = inventory_hosts.get(name)
            if entry is not None and entry.get(SERVICE_FILE_VERSION_FIELD) != cleaned:
                entry[SERVICE_FILE_VERSION_FIELD] = cleaned
                dirty = True

        if dirty:
            self._save_configuration()

    @staticmethod
    def _derive_driver_owner(variant: str, manage_drivers: bool) -> str:
        """Map (variant, manage_drivers) → the driver_owner label persisted on each host."""
        if variant != 'gpu':
            return 'n/a'
        return 'r1setup' if manage_drivers else 'user'

    def record_install_attempt(
        self,
        host_names: List[str],
        variant: str,
        driver_owner: str,
        result: str,
    ) -> None:
        """Persist the last-attempted install metadata for each host.

        Attempts are recorded regardless of success, so the selection menu
        can show e.g. 'GPU (user) • 2026-04-17 ✗' next to a host whose last
        attempt failed but which still has a prior successful CPU install.
        """
        if result not in ('success', 'failed'):
            raise ValueError(f"Invalid attempt result {result!r}; expected 'success' or 'failed'")
        hosts = _get_gpu_hosts(self.app.inventory)
        now = datetime.now().isoformat()
        changed = False
        for name in host_names:
            host_config = hosts.get(name)
            if host_config is None:
                continue
            host_config[INSTALL_ATTEMPTED_VARIANT_FIELD] = variant
            host_config[INSTALL_ATTEMPTED_DRIVER_OWNER_FIELD] = driver_owner
            host_config[INSTALL_ATTEMPTED_AT_FIELD] = now
            host_config[INSTALL_ATTEMPTED_RESULT_FIELD] = result
            changed = True
        if changed:
            self._save_configuration()

    def record_install_success(
        self,
        host_names: List[str],
        variant: str,
        driver_owner: str,
    ) -> None:
        """Stamp each known host with the metadata of its latest successful install.

        Besides variant, driver owner and timestamp, the current collection
        version (``get_collection_version()``) is stored. Hosts that are not
        in the inventory are skipped; the configuration is saved when at
        least one host was stamped.

        Args:
            host_names: Hosts the install succeeded on.
            variant: Install variant that was deployed.
            driver_owner: Driver-owner label for the install.
        """
        inventory_hosts = _get_gpu_hosts(self.app.inventory)
        # One timestamp / collection version for the whole batch.
        success_fields = {
            INSTALL_LAST_VARIANT_FIELD: variant,
            INSTALL_LAST_DRIVER_OWNER_FIELD: driver_owner,
            INSTALL_LAST_AT_FIELD: datetime.now().isoformat(),
            INSTALL_LAST_COLLECTION_VERSION_FIELD: self.get_collection_version(),
        }

        stamped = False
        for host_name in host_names:
            entry = inventory_hosts.get(host_name)
            if entry is None:
                continue
            entry.update(success_fields)
            stamped = True

        if stamped:
            self._save_configuration()

    @staticmethod
    def install_variant_summary(inventory: Dict[str, Any]) -> str:
        """Summarize the last-installed variant across all hosts for display.

        Each host's ``INSTALL_LAST_VARIANT_FIELD`` value is tallied. The
        result looks like ``'GPU: 2, CPU: 3'``, with a trailing
        ``'Never: N'`` for hosts whose value is neither ``'gpu'`` nor
        ``'cpu'``. When no host has a recorded variant the result is
        ``'no installs yet'``.

        This per-host rollup replaces the fleet-level last_deployment_type
        field, which was misleading on mixed fleets.
        """
        tally: Dict[str, int] = {'gpu': 0, 'cpu': 0}
        without_install = 0
        for host_cfg in _get_gpu_hosts(inventory).values():
            variant = host_cfg.get(INSTALL_LAST_VARIANT_FIELD)
            if variant in tally:
                tally[variant] += 1
            else:
                without_install += 1

        parts = []
        gpu_total = tally['gpu']
        cpu_total = tally['cpu']
        if gpu_total:
            parts.append(f"GPU: {gpu_total}")
        if cpu_total:
            parts.append(f"CPU: {cpu_total}")

        if not parts:
            return 'no installs yet'
        if without_install:
            parts.append(f"Never: {without_install}")
        return ", ".join(parts)

    @staticmethod
    def _read_fetched_metadata(
        host_names: List[str],
        fetched_dir: Optional[Path] = None,
    ) -> Dict[str, Dict[str, Any]]:
        """Read per-host metadata JSON files pulled from each host by the
        `fetch` task in render_edge_node_definition.yml. Returns a mapping
        {hostname: metadata_dict}. Missing or malformed files yield an
        empty entry for that host so callers can uniformly iterate.
        """
        base = fetched_dir if fetched_dir is not None else Path('/tmp/r1setup-fetched')
        out: Dict[str, Dict[str, Any]] = {}
        for name in host_names:
            fpath = base / f"{name}.json"
            if not fpath.exists():
                out[name] = {}
                continue
            try:
                with fpath.open('r') as fh:
                    out[name] = json.load(fh)
            except (OSError, ValueError):
                out[name] = {}
        return out

    def _save_configuration(self) -> None:
        """Save the configuration (legacy entry point over the config-management layer).

        The environment defaults to ``'mainnet'`` when none is set. When the
        active configuration has no name yet, one is generated and the hosts
        symlink is updated; otherwise the existing configuration is rewritten
        in place and the symlink is left alone.
        """
        environment = self.get_mnl_app_env() or 'mainnet'
        host_total = len(_get_gpu_hosts(self.app.inventory))

        current_name = self.active_config.get('config_name')
        if current_name:
            # Same configuration stays active, so the symlink already points at it.
            self._save_config_with_metadata(current_name, environment, host_total, update_symlink=False)
            return

        generated_name = self._generate_config_name(host_total)
        self._save_config_with_metadata(generated_name, environment, host_total, update_symlink=True)

    @staticmethod
    def _normalize_host_config(host_config: Dict[str, Any], timestamp: Optional[str] = None) -> bool:
        """Bring one host's persisted fields into the expected shape, in place.

        Adds default ``node_status`` / ``last_status_update`` values, stores
        the normalized service-file version, drops the obsolete
        ``last_status_check`` key and backfills every missing
        ``INSTALL_TRACKING_FIELDS`` entry with None.

        Args:
            host_config: Host entry to normalize (mutated).
            timestamp: Value for a newly added ``last_status_update``;
                defaults to the current time in ISO format.

        Returns:
            True when anything on ``host_config`` was changed.
        """
        stamp = timestamp or datetime.now().isoformat()
        dirty = False

        status_defaults = (
            ('node_status', 'unknown'),
            ('last_status_update', stamp),
        )
        for key, fallback in status_defaults:
            if key not in host_config:
                host_config[key] = fallback
                dirty = True

        wanted_version = ConfigurationManager.get_host_service_file_version(host_config)
        if host_config.get(SERVICE_FILE_VERSION_FIELD) != wanted_version:
            host_config[SERVICE_FILE_VERSION_FIELD] = wanted_version
            dirty = True

        if 'last_status_check' in host_config:
            del host_config['last_status_check']
            dirty = True

        # Old inventories lack the install-tracking keys; add them as None
        # ("never") so the menu renderer can rely on their presence.
        absent = [name for name in INSTALL_TRACKING_FIELDS if name not in host_config]
        if absent:
            host_config.update(dict.fromkeys(absent))
            dirty = True

        return dirty

    def _normalize_inventory(self, inventory: Optional[Dict[str, Any]] = None) -> bool:
        """Normalize persisted inventory state in place.

        For every host this normalizes the host-level fields, copies machine
        identity / topology / deployment state from the fleet-state machine
        record that shares the host's endpoint, and writes the resolved
        runtime snapshot onto the host entry.

        Args:
            inventory: Inventory to normalize; defaults to ``self.app.inventory``.

        Returns:
            True when any host-level data was updated.
        """
        inventory = inventory if inventory is not None else self.app.inventory
        hosts = _get_gpu_hosts(inventory)
        updated = False
        # Single timestamp shared by every host normalized in this pass
        timestamp = datetime.now().isoformat()
        self.app.print_debug(f"Checking status field initialization for {len(hosts)} nodes")
        normalized_fleet = self._normalize_fleet_state(self.get_fleet_state_copy())
        # Index fleet machines by endpoint so hosts can be matched to their machine record below
        endpoint_to_machine: Dict[str, Dict[str, Any]] = {}
        for machine_id, machine_record in normalized_fleet.get('fleet', {}).get('machines', {}).items():
            endpoint = self._machine_endpoint_from_record(machine_record)
            if endpoint:
                endpoint_to_machine[endpoint] = {
                    'machine_id': machine_id,
                    'machine_record': dict(machine_record),
                }

        for host_name, host_config in hosts.items():
            needs_update = self._normalize_host_config(host_config, timestamp)
            endpoint = self._machine_endpoint_from_record(host_config, fallback_host_name=host_name)
            if endpoint and endpoint in endpoint_to_machine:
                machine_id = endpoint_to_machine[endpoint]['machine_id']
                machine_record = endpoint_to_machine[endpoint]['machine_record']
                # Align the host's machine id with the fleet record for this endpoint
                if str(host_config.get('r1setup_machine_id') or '').strip() != machine_id:
                    host_config['r1setup_machine_id'] = machine_id
                    needs_update = True
                # Topology / deployment state are copied only when the machine record has a truthy value
                if machine_record.get('topology_mode') and host_config.get('r1setup_topology_mode') != machine_record.get('topology_mode'):
                    host_config['r1setup_topology_mode'] = machine_record.get('topology_mode')
                    needs_update = True
                if machine_record.get('deployment_state') and host_config.get('r1setup_machine_deployment_state') != machine_record.get('deployment_state'):
                    host_config['r1setup_machine_deployment_state'] = machine_record.get('deployment_state')
                    needs_update = True
            if self.apply_runtime_snapshot_to_host_config(host_name, host_config):
                needs_update = True
            if needs_update:
                updated = True
                # NOTE: this message is emitted for any change above, not only for missing status fields
                self.app.print_debug(f"Initialized missing status fields for node: {host_name}")
                self.app.print_debug(
                    f"  status={host_config.get('node_status')}, update={host_config.get('last_status_update')}"
                )
            else:
                current_status = host_config.get('node_status', 'unknown')
                current_update = host_config.get('last_status_update', 'none')
                self.app.print_debug(
                    f"Node {host_name} has all status fields: status={current_status}, update={current_update}"
                )

        return updated

    @staticmethod
    def _derive_machine_id(host_name: str, host_config: Dict[str, Any]) -> str:
        """Return a stable machine identity for fleet grouping."""
        explicit_id = str(host_config.get('r1setup_machine_id') or '').strip()
        if explicit_id:
            return explicit_id

        ansible_host = str(host_config.get('ansible_host') or host_name).strip()
        ansible_user = str(host_config.get('ansible_user') or 'root').strip()
        ansible_port = str(host_config.get('ansible_port') or 22).strip()
        return f"{ansible_user}@{ansible_host}:{ansible_port}"

    @staticmethod
    def _sanitize_runtime_suffix(value: str) -> str:
        """Convert a logical instance name into a stable runtime-safe suffix."""
        sanitized = re.sub(r'[^a-zA-Z0-9]+', '_', str(value or '').strip().lower()).strip('_')
        return sanitized or 'instance'

    @staticmethod
    def resolve_helper_mode(topology_mode: str = DEFAULT_MACHINE_TOPOLOGY_MODE) -> str:
        """Map a topology mode to the helper-installation mode.

        Only the exact (whitespace-trimmed) value ``'expert'`` selects
        ``HELPER_MODE_EXPERT``; everything else selects
        ``HELPER_MODE_STANDARD``.
        """
        normalized = str(topology_mode or '').strip()
        if normalized == 'expert':
            return HELPER_MODE_EXPERT
        return HELPER_MODE_STANDARD

    @classmethod
    def resolve_runtime_names(
        cls,
        logical_name: str,
        *,
        topology_mode: str = DEFAULT_MACHINE_TOPOLOGY_MODE,
        runtime_name_policy: str = 'normalize_to_target',
        existing_runtime: Optional[Dict[str, Any]] = None,
        custom_runtime: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Resolve the runtime names of a logical instance under the given policy.

        Resolution order:

        1. ``runtime_name_policy == 'preserve'`` with a non-empty
           ``existing_runtime``: keep its values, filling gaps with defaults.
        2. ``runtime_name_policy == 'custom'``: take ``custom_runtime``,
           filling gaps with defaults (the container name falls back to the
           custom service name first).
        3. ``topology_mode == 'standard'``: the default runtime names.
        4. Otherwise: names derived from the sanitized ``logical_name``
           (``edge_node_<suffix>``).

        Returns:
            Dict with ``service_name``, ``container_name``, ``volume_path``,
            ``metadata_path`` and ``exit_status_path``.
        """
        preserved = dict(existing_runtime or {})
        requested = dict(custom_runtime or {})

        def _pack(service: str, container: str, volume: str, metadata: Optional[str] = None,
                  exit_status: Optional[str] = None) -> Dict[str, Any]:
            # Metadata and exit-status paths are derived from the volume /
            # container name when not given explicitly.
            volume_text = str(volume).strip()
            metadata_text = str(
                metadata or f"{volume_text}/_data/r1setup/metadata.json"
            ).strip()
            exit_text = str(
                exit_status or f"/tmp/{container}.exit"
            ).strip()
            return {
                'service_name': str(service).strip(),
                'container_name': str(container).strip(),
                'volume_path': volume_text,
                'metadata_path': metadata_text,
                'exit_status_path': exit_text,
            }

        if runtime_name_policy == 'preserve' and preserved:
            return _pack(
                preserved.get('service_name') or DEFAULT_RUNTIME_SERVICE_NAME,
                preserved.get('container_name') or DEFAULT_RUNTIME_CONTAINER_NAME,
                preserved.get('volume_path') or DEFAULT_RUNTIME_VOLUME_PATH,
                preserved.get('metadata_path'),
                preserved.get('exit_status_path'),
            )

        if runtime_name_policy == 'custom':
            custom_service = requested.get('service_name')
            return _pack(
                custom_service or DEFAULT_RUNTIME_SERVICE_NAME,
                requested.get('container_name') or custom_service or DEFAULT_RUNTIME_CONTAINER_NAME,
                requested.get('volume_path') or DEFAULT_RUNTIME_VOLUME_PATH,
                requested.get('metadata_path'),
                requested.get('exit_status_path'),
            )

        if topology_mode == 'standard':
            return _pack(
                DEFAULT_RUNTIME_SERVICE_NAME,
                DEFAULT_RUNTIME_CONTAINER_NAME,
                DEFAULT_RUNTIME_VOLUME_PATH,
                exit_status=DEFAULT_RUNTIME_EXIT_STATUS_PATH,
            )

        # Any other topology: service and container share one derived name.
        derived_name = f"edge_node_{cls._sanitize_runtime_suffix(logical_name)}"
        return _pack(
            derived_name,
            derived_name,
            f"/var/cache/{derived_name}/_local_cache",
        )

    @classmethod
    def build_helper_runtime(
        cls,
        host_name: str,
        host_config: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Describe how helper commands are invoked for a host.

        Returns:
            Dict with ``helper_mode``, ``helper_registry_path``
            (``<DEFAULT_HELPER_REGISTRY_DIR>/<service>.env``) and
            ``remote_commands`` keyed by ``logs``, ``info``, ``restart``,
            ``history`` and ``get_e2_pem``. In expert mode the commands are
            ``r1service <service> <subcommand>``; otherwise they are the
            fixed standard helper names.
        """
        snapshot = cls._build_runtime_snapshot(host_name, host_config)
        topology = str(host_config.get('r1setup_topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
        helper_mode = cls.resolve_helper_mode(topology)
        service_name = snapshot.get('service_name') or DEFAULT_RUNTIME_SERVICE_NAME

        if helper_mode != HELPER_MODE_EXPERT:
            remote_commands = {
                'logs': 'get_logs',
                'info': 'get_node_info',
                'restart': 'restart_service',
                'history': 'get_node_history',
                'get_e2_pem': 'get_e2_pem_file',
            }
        else:
            # The service name is shell-quoted because it ends up in a remote command line.
            command_prefix = f"r1service {shlex.quote(service_name)}"
            remote_commands = {
                'logs': f"{command_prefix} logs",
                'info': f"{command_prefix} info",
                'restart': f"{command_prefix} restart",
                'history': f"{command_prefix} history",
                'get_e2_pem': f"{command_prefix} get-e2-pem",
            }

        return {
            'helper_mode': helper_mode,
            'helper_registry_path': f"{DEFAULT_HELPER_REGISTRY_DIR}/{service_name}.env",
            'remote_commands': remote_commands,
        }

    @staticmethod
    def detect_runtime_collisions(
        machine_id: str,
        runtime: Dict[str, Any],
        fleet_state: Dict[str, Any],
        *,
        exclude_instance: Optional[str] = None,
    ) -> Dict[str, List[str]]:
        """Return colliding instances keyed by runtime field for a machine."""
        collisions: Dict[str, List[str]] = {}
        instances = fleet_state.get('fleet', {}).get('instances', {})

        for instance_name, instance_data in instances.items():
            if exclude_instance and instance_name == exclude_instance:
                continue
            if instance_data.get('assigned_machine_id') != machine_id:
                continue
            other_runtime = instance_data.get('runtime') or {}
            for field in ('service_name', 'container_name', 'volume_path', 'metadata_path', 'exit_status_path'):
                runtime_value = runtime.get(field)
                if runtime_value and runtime_value == other_runtime.get(field):
                    collisions.setdefault(field, []).append(instance_name)

        return collisions

    @classmethod
    def _build_runtime_snapshot(cls, host_name: str, host_config: Dict[str, Any]) -> Dict[str, Any]:
        """Resolve a host's runtime names from its legacy inventory fields.

        The logical name defaults to ``host_name``, the topology mode to
        ``DEFAULT_MACHINE_TOPOLOGY_MODE`` and the naming policy to
        ``'preserve'``; runtime values already recorded on the host are
        passed on as the existing runtime.
        """
        recorded_runtime = {
            'service_name': host_config.get('edge_node_service_name'),
            'container_name': host_config.get('mnl_docker_container_name'),
            'volume_path': host_config.get('mnl_docker_volume_path'),
            'metadata_path': host_config.get('mnl_r1setup_metadata_host_path'),
            'exit_status_path': host_config.get('r1setup_runtime_exit_status_path'),
        }
        logical_name = str(host_config.get('r1setup_instance_logical_name') or host_name)
        topology = str(host_config.get('r1setup_topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
        policy = str(host_config.get('r1setup_runtime_name_policy') or 'preserve')

        return cls.resolve_runtime_names(
            logical_name,
            topology_mode=topology,
            runtime_name_policy=policy,
            existing_runtime=recorded_runtime,
        )

    @classmethod
    def apply_runtime_snapshot_to_host_config(cls, host_name: str, host_config: Dict[str, Any]) -> bool:
        """Write the resolved runtime snapshot back onto a host config.

        The logical name, topology mode and naming policy keys are filled in
        with defaults when absent; adding those defaults does not by itself
        count as a change.

        Returns:
            True when at least one runtime field on ``host_config`` was
            rewritten.
        """
        identity_defaults = (
            ('r1setup_instance_logical_name', host_name),
            ('r1setup_topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE),
            ('r1setup_runtime_name_policy', 'preserve'),
        )
        for key, fallback in identity_defaults:
            host_config.setdefault(key, fallback)

        snapshot = cls._build_runtime_snapshot(host_name, host_config)
        # Resolve every target value before touching the host config.
        resolved_fields = [
            ('edge_node_service_name', snapshot['service_name']),
            ('mnl_docker_container_name', snapshot['container_name']),
            ('mnl_docker_volume_path', snapshot['volume_path']),
            ('mnl_r1setup_metadata_host_path', snapshot['metadata_path']),
            ('r1setup_runtime_exit_status_path', snapshot['exit_status_path']),
        ]

        dirty = False
        for host_key, resolved in resolved_fields:
            if host_config.get(host_key) != resolved:
                host_config[host_key] = resolved
                dirty = True
        return dirty

    def detect_helper_mode_conflicts(
        self,
        inventory: Optional[Dict[str, Any]] = None,
        *,
        selected_host_names: Optional[List[str]] = None,
    ) -> Dict[str, Dict[str, Any]]:
        """Find machines whose instances resolve to different helper modes.

        Args:
            inventory: Inventory to inspect; defaults to ``self.app.inventory``.
            selected_host_names: Hosts of interest. None or an empty list
                means every host.

        Returns:
            Mapping of machine id to a detail dict (``all_hosts``,
            ``selected_hosts``, ``helper_modes``) for each machine that has
            at least one selected host and more than one helper mode.
        """
        source = self.app.inventory if inventory is None else inventory
        hosts = _get_gpu_hosts(source)
        chosen = set(selected_host_names or hosts.keys())

        per_machine: Dict[str, Dict[str, Any]] = {}
        for host_name, host_config in hosts.items():
            machine_id = self._derive_machine_id(host_name, host_config)
            if machine_id not in per_machine:
                per_machine[machine_id] = {
                    'all_hosts': [],
                    'selected_hosts': [],
                    'helper_modes': {},
                }
            entry = per_machine[machine_id]

            topology = str(host_config.get('r1setup_topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
            mode = self.resolve_helper_mode(topology)

            entry['all_hosts'].append(host_name)
            entry['helper_modes'].setdefault(mode, []).append(host_name)
            if host_name in chosen:
                entry['selected_hosts'].append(host_name)

        return {
            machine_id: entry
            for machine_id, entry in per_machine.items()
            if entry['selected_hosts'] and len(entry['helper_modes']) > 1
        }

    def build_fleet_state(self, inventory: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Derive the schema-versioned fleet view from a legacy inventory.

        Hosts are processed in name order. Each host becomes an instance, and
        hosts sharing a machine id are grouped under one machine record. The
        machine record takes its connection settings from the first host seen
        for it; blank credential fields are filled from later hosts.

        Args:
            inventory: Inventory to read; defaults to ``self.app.inventory``.

        Returns:
            ``{'config_schema_version': ..., 'fleet': {'machines': ..., 'instances': ...}}``.
        """
        source = self.app.inventory if inventory is None else inventory
        hosts = _get_gpu_hosts(source)

        machines: Dict[str, Dict[str, Any]] = {}
        instances: Dict[str, Dict[str, Any]] = {}
        credential_fields = (
            'ansible_ssh_common_args',
            'ansible_ssh_pass',
            'ansible_become_password',
            'ansible_ssh_private_key_file',
        )
        blank = (None, '')

        for host_name, host_config in sorted(hosts.items()):
            machine_id = self._derive_machine_id(host_name, host_config)
            runtime = self._build_runtime_snapshot(host_name, host_config)

            machine = machines.get(machine_id)
            if machine is None:
                machine = {
                    'machine_id': machine_id,
                    'ansible_host': host_config.get('ansible_host'),
                    'ansible_user': host_config.get('ansible_user', 'root'),
                    'ansible_port': host_config.get('ansible_port', 22),
                    'ansible_ssh_common_args': host_config.get('ansible_ssh_common_args'),
                    'ansible_ssh_pass': host_config.get('ansible_ssh_pass'),
                    'ansible_become_password': host_config.get('ansible_become_password'),
                    'ansible_ssh_private_key_file': host_config.get('ansible_ssh_private_key_file'),
                    'topology_mode': host_config.get('r1setup_topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE),
                    'deployment_state': host_config.get(
                        'r1setup_machine_deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE
                    ),
                    'instance_names': [],
                }
                machines[machine_id] = machine

            # Fill credentials the machine record still lacks from this host.
            for field in credential_fields:
                host_value = host_config.get(field)
                if machine.get(field) in blank and host_value not in blank:
                    machine[field] = host_value
            machine['instance_names'].append(host_name)

            instances[host_name] = {
                'logical_name': str(host_config.get('r1setup_instance_logical_name') or host_name),
                'assigned_machine_id': machine_id,
                'runtime_name_policy': host_config.get('r1setup_runtime_name_policy', 'preserve'),
                'runtime': runtime,
                'resources': {
                    'cpu_limit_cores': host_config.get('r1setup_cpu_limit_cores'),
                    'memory_limit_gb': host_config.get('r1setup_memory_limit_gb'),
                },
                'status': {
                    'node_status': host_config.get('node_status', 'unknown'),
                    'service_file_version': self.get_host_service_file_version(host_config),
                },
            }

        return {
            'config_schema_version': CONFIG_SCHEMA_VERSION,
            'fleet': {
                'machines': machines,
                'instances': instances,
            },
        }

    @staticmethod
    def _get_group_view_status_info(status: str) -> Dict[str, str]:
        """Return compact display information for instance/machine group status."""
        status_map = {
            'running': {'emoji': '🟢', 'color': 'green', 'label': 'Running'},
            'stopped': {'emoji': '🔴', 'color': 'red', 'label': 'Stopped'},
            'pending_restart': {'emoji': '🟡', 'color': 'yellow', 'label': 'Pending Restart'},
            'unknown': {'emoji': '❓', 'color': 'white', 'label': 'Unknown'},
            'deploying': {'emoji': '🔄', 'color': 'cyan', 'label': 'Deploying'},
            'error': {'emoji': '❌', 'color': 'red', 'label': 'Error'},
            'never_deployed': {'emoji': '⚪', 'color': 'white', 'label': 'Never Deployed'},
            'deleted': {'emoji': '🗑️', 'color': 'red', 'label': 'Deleted'},
            'unreachable': {'emoji': '🔌', 'color': 'red', 'label': 'Unreachable'},
            'not_deployed': {'emoji': '📦', 'color': 'yellow', 'label': 'Not Deployed'},
            'empty': {'emoji': '📭', 'color': 'yellow', 'label': 'No Instances'},
            'mixed': {'emoji': '🟡', 'color': 'yellow', 'label': 'Mixed States'},
        }
        return dict(status_map.get(status, status_map['unknown']))

    @classmethod
    def _summarize_machine_group_status(cls, status_counts: Dict[str, int], instance_count: int) -> Dict[str, str]:
        """Summarize a machine group into one operator-facing status."""
        if instance_count <= 0:
            return cls._get_group_view_status_info('empty')

        non_zero_statuses = [status for status, count in status_counts.items() if count > 0]
        if len(non_zero_statuses) == 1:
            return cls._get_group_view_status_info(non_zero_statuses[0])

        return cls._get_group_view_status_info('mixed')

    @staticmethod
    def _format_machine_connection_display(machine_data: Dict[str, Any]) -> str:
        """Return a concise machine connection string for CLI display."""
        host = str(machine_data.get('ansible_host') or 'unknown').strip()
        user = str(machine_data.get('ansible_user') or 'root').strip()
        port = str(machine_data.get('ansible_port') or 22).strip()
        if port == '22':
            return f"{user}@{host}"
        return f"{user}@{host}:{port}"

    @classmethod
    def _format_machine_display_label(cls, machine_id: str, machine_data: Dict[str, Any]) -> str:
        """Pick a readable label for a machine in grouped CLI views.

        When ``_is_derived_machine_id`` reports the id as derived, the label
        is the hostname from ``machine_specs`` if one is recorded, otherwise
        the connection string. In every other case the machine id itself is
        the label.
        """
        specs = machine_data.get('machine_specs') or {}
        reported_hostname = str(specs.get('hostname') or '').strip()

        if not cls._is_derived_machine_id(machine_id, machine_data):
            return machine_id
        if reported_hostname:
            return reported_hostname
        return cls._format_machine_connection_display(machine_data)

    @staticmethod
    def _format_machine_memory_gib(memory_total: Any) -> str:
        """Return a stable GiB display string for machine memory totals."""
        if memory_total in (None, ''):
            return '?'
        try:
            numeric = float(memory_total)
        except (TypeError, ValueError):
            return str(memory_total)
        if numeric.is_integer():
            return str(int(numeric))
        return f"{numeric:.1f}"

    @classmethod
    def _format_machine_specs_summary(cls, machine_specs: Optional[Dict[str, Any]]) -> str:
        """Return a concise machine-spec string or an empty string."""
        specs = machine_specs or {}
        cpu_total = specs.get('cpu_total')
        memory_total = specs.get('memory_gb_total')
        if cpu_total is None and memory_total is None:
            return ''
        return (
            f"{cpu_total if cpu_total is not None else '?'} CPU / "
            f"{cls._format_machine_memory_gib(memory_total)} GiB RAM"
        )

    @staticmethod
    def assess_machine_resource_recommendation(
        machine_specs: Optional[Dict[str, Any]],
        *,
        planned_instances: int = 1,
    ) -> Dict[str, Any]:
        """Classify machine capacity against the recommended per-instance minimum."""
        specs = machine_specs or {}
        cpu_total = specs.get('cpu_total')
        memory_total = specs.get('memory_gb_total')
        if cpu_total in (None, '') or memory_total in (None, ''):
            return {
                'status': 'unknown',
                'color': 'white',
                'message': 'Machine specs are unavailable; recommended capacity could not be evaluated.',
            }

        try:
            cpu_total_value = int(cpu_total)
            memory_total_value = float(memory_total)
        except (TypeError, ValueError):
            return {
                'status': 'unknown',
                'color': 'white',
                'message': 'Machine specs are unavailable; recommended capacity could not be evaluated.',
            }

        planned_instances = max(1, int(planned_instances))
        required_cpu = MIN_RECOMMENDED_NODE_CPU_CORES * planned_instances
        required_memory_nominal = MIN_RECOMMENDED_NODE_MEMORY_GIB * planned_instances
        required_memory_tolerated = MIN_TOLERATED_NODE_MEMORY_GIB * planned_instances

        if cpu_total_value >= required_cpu and memory_total_value >= required_memory_nominal:
            return {
                'status': 'meets_recommendation',
                'color': 'green',
                'message': (
                    f"Observed capacity meets the recommended minimum for {planned_instances} node(s): "
                    f"{required_cpu} CPU / {required_memory_nominal:.0f} GiB RAM."
                ),
            }

        if cpu_total_value >= required_cpu and memory_total_value >= required_memory_tolerated:
            return {
                'status': 'tolerated_near_boundary',
                'color': 'yellow',
                'message': (
                    f"Observed memory is slightly below the nominal recommendation for {planned_instances} node(s), "
                    f"but within the tolerated near-16 GiB boundary "
                    f"({memory_total_value:.1f} GiB observed vs {required_memory_nominal:.0f} GiB nominal)."
                ),
            }

        return {
            'status': 'below_recommendation',
            'color': 'yellow',
            'message': (
                f"Observed capacity is below the recommended minimum for {planned_instances} node(s): "
                f"{cpu_total_value} CPU / {memory_total_value:.1f} GiB RAM observed, "
                f"{required_cpu} CPU / {required_memory_nominal:.0f} GiB RAM recommended."
            ),
        }

    def build_machine_group_views(
        self,
        inventory: Optional[Dict[str, Any]] = None,
        fleet_state: Optional[Dict[str, Any]] = None,
        node_status_data: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> List[Dict[str, Any]]:
        """Build grouped machine/instance views for fleet-aware CLI rendering.

        Args:
            inventory: Inventory to read hosts from; defaults to
                ``self.app.inventory`` when ``None``.
            fleet_state: Fleet state to normalize; defaults to
                ``self.get_fleet_state_copy()`` when ``None``.
            node_status_data: Optional per-instance live data, keyed by
                instance name. Each entry is read for ``status`` and
                ``service_file_version`` and takes precedence over stored
                values.

        Returns:
            One dict per machine, ordered by sorted machine id. Each carries
            connection fields, topology/deployment state, specs, discovery
            candidates, a list of per-instance view dicts under ``instances``,
            and a rolled-up group status (label, color, emoji).
        """
        inventory = inventory if inventory is not None else self.app.inventory
        hosts = _get_gpu_hosts(inventory)
        normalized_fleet = self._normalize_fleet_state(fleet_state if fleet_state is not None else self.get_fleet_state_copy())
        # Deep copies: the loops below add machine records and instance names,
        # which must not leak back into the normalized fleet state.
        machines = copy.deepcopy(normalized_fleet['fleet']['machines'])
        instances = copy.deepcopy(normalized_fleet['fleet']['instances'])
        live_status = dict(node_status_data or {})

        # Pass 1: make sure every inventory host is attached to a machine
        # record. An existing fleet record wins; setdefault only fills in a
        # record synthesized from the host config when none exists.
        for host_name, host_config in hosts.items():
            machine_id = self._derive_machine_id(host_name, host_config)
            machines.setdefault(machine_id, {
                'machine_id': machine_id,
                'ansible_host': host_config.get('ansible_host'),
                'ansible_user': host_config.get('ansible_user', 'root'),
                'ansible_port': host_config.get('ansible_port', 22),
                'topology_mode': host_config.get('r1setup_topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE),
                'deployment_state': host_config.get('r1setup_machine_deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE),
                'instance_names': [],
            })
            # NOTE(review): setdefault only covers a missing key; a record whose
            # 'instance_names' is explicitly None would fail the membership test.
            if host_name not in machines[machine_id].setdefault('instance_names', []):
                machines[machine_id]['instance_names'].append(host_name)

        # Pass 2: seed the machine -> instances index from the machine records.
        machine_to_instances: Dict[str, List[str]] = {}
        for machine_id, machine_data in machines.items():
            machine_to_instances[machine_id] = list(machine_data.get('instance_names') or [])

        # Pass 3: add instances that name a machine via 'assigned_machine_id',
        # including machines that have no record of their own.
        for instance_name, instance_data in instances.items():
            machine_id = instance_data.get('assigned_machine_id')
            if not machine_id:
                continue
            machine_to_instances.setdefault(machine_id, [])
            if instance_name not in machine_to_instances[machine_id]:
                machine_to_instances[machine_id].append(instance_name)

        machine_views = []
        # Sorted union so the output order is stable and machines known only
        # through instance assignments are still rendered.
        for machine_id in sorted(set(list(machines.keys()) + list(machine_to_instances.keys()))):
            machine_data = dict(machines.get(machine_id, {}))
            instance_names = sorted(set(machine_to_instances.get(machine_id, [])))
            topology_mode = str(machine_data.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
            deployment_state = str(machine_data.get('deployment_state') or DEFAULT_MACHINE_DEPLOYMENT_STATE)
            machine_specs = dict(machine_data.get('machine_specs') or {})
            discovery_info = dict(machine_data.get('discovery') or {})
            # Non-dict candidate entries are dropped; the rest are shallow-copied.
            discovered_candidates = [
                dict(candidate)
                for candidate in (discovery_info.get('candidates') or [])
                if isinstance(candidate, dict)
            ]

            instance_views = []
            status_counts: Dict[str, int] = {}

            for instance_name in instance_names:
                instance_data = dict(instances.get(instance_name, {}))
                host_config = dict(hosts.get(instance_name, {}))
                # Stored runtime wins; otherwise a snapshot is built from the host config.
                runtime = dict(instance_data.get('runtime') or self._build_runtime_snapshot(instance_name, host_config))
                live_entry = dict(live_status.get(instance_name, {}))
                host_service_file_version = self.get_host_service_file_version(host_config)
                # If the app object exposes its own resolver, its result
                # replaces the value computed on the line above.
                app_get_host_service_file_version = getattr(self.app, 'get_host_service_file_version', None)
                if callable(app_get_host_service_file_version):
                    host_service_file_version = str(app_get_host_service_file_version(host_config))

                # Status precedence: live data, stored fleet status, inventory
                # host field, then 'unknown'.
                # NOTE(review): .get('status', {}) only guards a missing key; a
                # stored 'status' of None would raise AttributeError here.
                status = str(
                    live_entry.get('status')
                    or instance_data.get('status', {}).get('node_status')
                    or host_config.get('node_status')
                    or 'unknown'
                )
                status_info = self._get_group_view_status_info(status)
                # Same precedence for the service file version, ending at the host-level value.
                service_file_version = str(
                    live_entry.get('service_file_version')
                    or instance_data.get('status', {}).get('service_file_version')
                    or host_service_file_version
                )
                status_counts[status] = status_counts.get(status, 0) + 1

                instance_views.append({
                    'instance_name': instance_name,
                    'logical_name': str(instance_data.get('logical_name') or host_config.get('r1setup_instance_logical_name') or instance_name),
                    'assigned_machine_id': machine_id,
                    'runtime_name_policy': str(instance_data.get('runtime_name_policy') or host_config.get('r1setup_runtime_name_policy') or 'preserve'),
                    'runtime': runtime,
                    'status': status,
                    'status_emoji': status_info['emoji'],
                    'status_color': status_info['color'],
                    'status_label': status_info['label'],
                    'service_file_version': service_file_version,
                    'last_update': host_config.get('last_status_update', ''),
                    'ssh_auth_mode': str(host_config.get('r1setup_ssh_auth_mode') or 'unknown'),
                    'ansible_host': host_config.get('ansible_host'),
                    'ansible_user': host_config.get('ansible_user'),
                    'ansible_port': host_config.get('ansible_port'),
                })

            # Service names already covered by a tracked instance (blank names excluded).
            tracked_service_names = {
                str((instance_view.get('runtime') or {}).get('service_name') or '').strip()
                for instance_view in instance_views
                if str((instance_view.get('runtime') or {}).get('service_name') or '').strip()
            }
            # Candidates whose service name matches no tracked instance; a
            # candidate with a blank service name always lands here.
            untracked_discovered_candidates = [
                candidate
                for candidate in discovered_candidates
                if str(candidate.get('service_name') or '').strip() not in tracked_service_names
            ]

            group_status_info = self._summarize_machine_group_status(status_counts, len(instance_views))
            machine_views.append({
                'machine_id': machine_id,
                'display_label': self._format_machine_display_label(machine_id, machine_data),
                'ansible_host': machine_data.get('ansible_host'),
                'ansible_user': machine_data.get('ansible_user', 'root'),
                'ansible_port': machine_data.get('ansible_port', 22),
                'connection_display': self._format_machine_connection_display(machine_data),
                'topology_mode': topology_mode,
                'deployment_state': deployment_state,
                'instance_count': len(instance_views),
                'machine_specs': machine_specs,
                'machine_specs_summary': self._format_machine_specs_summary(machine_specs),
                'instances': instance_views,
                'discovered_candidates': discovered_candidates,
                'untracked_discovered_candidates': untracked_discovered_candidates,
                'last_discovery_scan_at': discovery_info.get('last_scanned_at'),
                'status_counts': status_counts,
                'group_status': group_status_info['label'],
                'group_status_color': group_status_info['color'],
                'group_status_emoji': group_status_info['emoji'],
            })

        return machine_views

    @classmethod
    def build_execution_host_config(cls, host_name: str, host_config: Dict[str, Any]) -> Dict[str, Any]:
        """Return a deep copy of ``host_config`` with runtime/helper fields resolved.

        The copy is extended with machine identity, topology and naming policy,
        the service/container/volume/metadata values taken from the runtime
        snapshot, and the helper mode, registry path and remote commands taken
        from the helper runtime. Each path-like value is written twice: once
        under its ``mnl_*``/legacy key and once under ``r1setup_effective_*``.
        The input dict is not modified.
        """
        resolved = copy.deepcopy(host_config)
        runtime = cls._build_runtime_snapshot(host_name, resolved)
        helper_runtime = cls.build_helper_runtime(host_name, resolved)

        # Path values are derived once and reused for both key families below.
        volume_path = runtime.get('volume_path') or DEFAULT_RUNTIME_VOLUME_PATH
        metadata_path = runtime.get('metadata_path') or f"{volume_path}/_data/r1setup/metadata.json"
        metadata_dir = str(Path(metadata_path).parent)

        machine_id = cls._derive_machine_id(host_name, resolved)
        logical_name = str(resolved.get('r1setup_instance_logical_name') or host_name)
        topology_mode = str(resolved.get('r1setup_topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
        name_policy = str(resolved.get('r1setup_runtime_name_policy') or 'preserve')

        service_name = runtime['service_name']
        container_name = runtime['container_name']
        base_folder = str(Path(volume_path).parent)
        config_startup_path = f"{volume_path}/config_startup.json"
        exit_status_path = runtime['exit_status_path']

        helper_mode = helper_runtime['helper_mode']
        registry_path = helper_runtime['helper_registry_path']
        remote_commands = helper_runtime['remote_commands']

        resolved.update({
            'r1setup_machine_id': machine_id,
            'r1setup_instance_logical_name': logical_name,
            'r1setup_topology_mode': topology_mode,
            'r1setup_runtime_name_policy': name_policy,
            'edge_node_service_name': service_name,
            'mnl_docker_container_name': container_name,
            'mnl_docker_volume_path': volume_path,
            'mnl_base_folder': base_folder,
            'mnl_local_cache_folder': volume_path,
            'mnl_config_startup_path': config_startup_path,
            'mnl_r1setup_metadata_host_dir': metadata_dir,
            'mnl_r1setup_metadata_host_path': metadata_path,
            'r1setup_runtime_exit_status_path': exit_status_path,
            'r1setup_helper_mode': helper_mode,
            'r1setup_effective_helper_mode': helper_mode,
            'r1setup_helper_registry_dir': DEFAULT_HELPER_REGISTRY_DIR,
            'r1setup_helper_registry_path': registry_path,
            'r1setup_effective_helper_registry_path': registry_path,
            'r1setup_remote_get_logs_command': remote_commands['logs'],
            'r1setup_remote_get_node_info_command': remote_commands['info'],
            'r1setup_remote_restart_service_command': remote_commands['restart'],
            'r1setup_remote_get_node_history_command': remote_commands['history'],
            'r1setup_remote_get_e2_pem_command': remote_commands['get_e2_pem'],
            'r1setup_effective_service_name': service_name,
            'r1setup_effective_container_name': container_name,
            'r1setup_effective_volume_path': volume_path,
            'r1setup_effective_base_folder': base_folder,
            'r1setup_effective_local_cache_folder': volume_path,
            'r1setup_effective_config_startup_path': config_startup_path,
            'r1setup_effective_metadata_host_dir': metadata_dir,
            'r1setup_effective_metadata_host_path': metadata_path,
            'r1setup_effective_exit_status_path': exit_status_path,
        })
        return resolved

    @classmethod
    def group_host_names_by_machine(
        cls,
        inventory: Dict[str, Any],
        selected_host_names: List[str],
    ) -> Dict[str, Dict[str, Any]]:
        """Bucket the selected inventory hosts by the machine id derived for each.

        Host names that are absent from the inventory (or whose config is
        empty) are ignored.

        Returns:
            Mapping of machine id to ``{'representative_host': <first host seen
            for that machine>, 'host_names': [<hosts in selection order>]}``.
        """
        known_hosts = _get_gpu_hosts(inventory)
        buckets: Dict[str, Dict[str, Any]] = {}

        for name in selected_host_names:
            config = known_hosts.get(name)
            if config:
                machine_id = cls._derive_machine_id(name, config)
                if machine_id not in buckets:
                    # The first host encountered becomes the machine's representative.
                    buckets[machine_id] = {
                        'representative_host': name,
                        'host_names': [],
                    }
                buckets[machine_id]['host_names'].append(name)

        return buckets

    def build_execution_inventory(
        self,
        host_names: List[str],
        *,
        inventory: Optional[Dict[str, Any]] = None,
        dedupe_by_machine: bool = False,
    ) -> Dict[str, Any]:
        """Assemble a minimal ``all -> children -> gpu_nodes -> hosts`` inventory.

        Args:
            host_names: Requested host names; names missing from the inventory
                are dropped.
            inventory: Source inventory; defaults to ``self.app.inventory``.
            dedupe_by_machine: When true, emit one entry per machine (keyed by
                its representative host) that lists every selected instance on
                that machine. When false, emit one entry per selected host.

        Returns:
            The inventory dict whose hosts are enriched execution configs
            tagged with ``r1setup_execution_scope`` (``machine``/``instance``).
        """
        source_inventory = self.app.inventory if inventory is None else inventory
        known_hosts = _get_gpu_hosts(source_inventory)
        requested = [name for name in host_names if name in known_hosts]
        execution_hosts: Dict[str, Dict[str, Any]] = {}

        if not dedupe_by_machine:
            for name in requested:
                entry = self.build_execution_host_config(name, known_hosts[name])
                entry['r1setup_execution_scope'] = 'instance'
                execution_hosts[name] = entry
        else:
            by_machine = self.group_host_names_by_machine(source_inventory, requested)
            for machine_id, detail in by_machine.items():
                representative = detail['representative_host']
                members = detail['host_names']
                entry = self.build_execution_host_config(representative, known_hosts[representative])
                entry['r1setup_execution_scope'] = 'machine'
                entry['r1setup_machine_id'] = machine_id
                entry['r1setup_selected_instance_hosts'] = list(members)
                entry['r1setup_selected_instance_logical_names'] = [
                    str(known_hosts[member].get('r1setup_instance_logical_name') or member)
                    for member in members
                ]
                execution_hosts[representative] = entry

        gpu_group = {'hosts': execution_hosts}
        return {'all': {'children': {'gpu_nodes': gpu_group}}}

    @staticmethod
    def _build_machine_execution_host_alias(machine_id: str, used_aliases: Optional[set] = None) -> str:
        """Return a stable inventory-safe alias for a registered machine."""
        alias_base = re.sub(r'[^a-zA-Z0-9_]+', '_', str(machine_id or '').strip()).strip('_').lower()
        alias_base = alias_base or 'machine'
        alias = alias_base
        used_aliases = used_aliases if used_aliases is not None else set()
        counter = 2
        while alias in used_aliases:
            alias = f"{alias_base}_{counter}"
            counter += 1
        return alias

    def build_registered_machine_execution_inventory(
        self,
        machine_ids: List[str],
        *,
        fleet_state: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Build a machine-scope inventory from registered fleet machine records."""
        normalized_fleet = self._normalize_fleet_state(fleet_state if fleet_state is not None else self.get_fleet_state_copy())
        machines = normalized_fleet.get('fleet', {}).get('machines', {})
        instances = normalized_fleet.get('fleet', {}).get('instances', {})
        execution_hosts: Dict[str, Dict[str, Any]] = {}
        used_aliases: set = set()

        for machine_id in machine_ids:
            machine_record = dict(machines.get(machine_id, {}))
            if not machine_record:
                continue

            alias = self._build_machine_execution_host_alias(machine_id, used_aliases)
            used_aliases.add(alias)

            host_config = {
                'ansible_host': machine_record.get('ansible_host'),
                'ansible_user': machine_record.get('ansible_user', 'root'),
                'ansible_port': machine_record.get('ansible_port', 22),
                'r1setup_execution_scope': 'machine',
                'r1setup_machine_id': machine_id,
                'r1setup_machine_inventory_alias': alias,
                'r1setup_machine_label': machine_record.get('machine_id', machine_id),
                'r1setup_topology_mode': machine_record.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE),
                'r1setup_machine_deployment_state': machine_record.get(
                    'deployment_state',
                    DEFAULT_MACHINE_DEPLOYMENT_STATE,
                ),
                'r1setup_selected_instance_hosts': list(machine_record.get('instance_names') or []),
                'r1setup_selected_instance_logical_names': [
                    str(instances.get(instance_name, {}).get('logical_name') or instance_name)
                    for instance_name in machine_record.get('instance_names') or []
                ],
                'r1setup_helper_mode': self.resolve_helper_mode(
                    str(machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
                ),
            }

            for key in (
                'ansible_ssh_common_args',
                'ansible_ssh_pass',
                'ansible_become_password',
                'ansible_ssh_private_key_file',
            ):
                if key in machine_record:
                    host_config[key] = machine_record[key]

            execution_hosts[alias] = host_config

        return {
            'all': {
                'children': {
                    'gpu_nodes': {
                        'hosts': execution_hosts,
                    }
                }
            }
        }

    def manage_configurations_menu(self) -> None:
        """Run the interactive configuration-management submenu.

        Each pass of the loop re-reads the saved configurations, prints one
        summary block per configuration (name, environment/description,
        creation date, deployment state), prints the menu and dispatches the
        chosen action. Options 2-6 are only accepted while at least one saved
        configuration exists. The loop ends when the user selects ``0``.
        """
        while True:
            self.app.print_header("Configuration Management")

            # List available configurations
            configs = self._list_available_configs()
            active_config_name = self.active_config.get('config_name')

            if configs:
                self.app.print_section(f"Available Configurations ({len(configs)})")
                for i, (config_name, metadata) in enumerate(configs, 1):
                    # Remove .yml extension and format display
                    display_name = config_name.replace('.yml', '')
                    env = metadata.get('environment', 'unknown')
                    nodes = metadata.get('nodes_count', 0)
                    created = metadata.get('created_at', '')
                    description = metadata.get('description', '')

                    # Extract custom name from the config name (everything before the first timestamp)
                    # Format: customname_YYYYMMDD_HHMM_Nnodes_environment
                    custom_name = display_name
                    if '_' in display_name:
                        parts = display_name.split('_')
                        # The timestamp starts at the first 8-digit segment.
                        for idx, part in enumerate(parts):
                            if len(part) == 8 and part.isdigit():
                                custom_name = '_'.join(parts[:idx])
                                break

                    # Format creation date
                    created_str = _parse_iso_datetime(created) or "Unknown"

                    # Mark active configuration
                    active_marker = " ← ACTIVE" if display_name == active_config_name else ""

                    self.app.print_colored(f"  {i}. {custom_name}", 'cyan' if active_marker else 'white', bold=bool(active_marker))
                    # A description, when present, replaces the env/node-count summary.
                    info_line = f"     {env} | {nodes} node(s) | {created_str}{active_marker}"
                    if description:
                        info_line = f"     {description} | {created_str}{active_marker}"

                    # Add deployment status info
                    deployment_status = metadata.get('deployment_status', 'never_deployed')
                    last_deployed_date = metadata.get('last_deployed_date')
                    last_deployed_network = metadata.get('last_deployed_network')
                    last_deleted_date = metadata.get('last_deleted_date')
                    # Per-host variant rollup replaces the retired
                    # fleet-level last_deployment_type field.
                    variant_summary = self.install_variant_summary(
                        metadata.get('inventory') or self.app.inventory
                    )

                    if deployment_status == 'deployed' and last_deployed_date:
                        deployed_str = _parse_iso_datetime(last_deployed_date)
                        if deployed_str:
                            deployment_info = f"🚀 Last deployed: {deployed_str}"
                            # Gather the optional details first so the
                            # parentheses stay balanced whichever of them is
                            # present. Previously a variant summary without a
                            # network produced ", summary)" with no opening
                            # parenthesis.
                            deployment_details = []
                            if last_deployed_network:
                                deployment_details.append(str(last_deployed_network))
                            if variant_summary and variant_summary != 'no installs yet':
                                deployment_details.append(str(variant_summary))
                            if deployment_details:
                                deployment_info += f" ({', '.join(deployment_details)})"
                        else:
                            deployment_info = "🚀 Deployed"
                    elif deployment_status == 'deleted' and last_deleted_date:
                        deleted_str = _parse_iso_datetime(last_deleted_date)
                        if deleted_str:
                            deployment_info = f"🗑️ Last deleted: {deleted_str}"
                        else:
                            deployment_info = "🗑️ Deleted"
                    else:
                        deployment_info = "📋 Never deployed"

                    self.app.print_colored(info_line, 'green' if active_marker else 'white')
                    deployment_color = 'cyan' if deployment_status == 'deployed' else 'red' if deployment_status == 'deleted' else 'yellow'
                    self.app.print_colored(f"     {deployment_info}", deployment_color)
                print()

            self.app.print_colored("Manage Configurations Menu", 'cyan', bold=True)
            print()
            self.app.print_colored("  1) Create New Configuration       - Set up new configuration")
            if configs:
                self.app.print_colored("  2) Switch Configuration          - Activate different configuration")
                self.app.print_colored("  3) Delete Configuration          - Remove saved configuration")
                self.app.print_colored("  4) Rename Configuration          - Change configuration name")
                self.app.print_colored("")
                self.app.print_colored("  5) Backup Configuration          - Save configuration backup")
                self.app.print_colored("  6) Restore Configuration         - Restore from backup")
                self.app.print_colored("  7) Export Configuration          - Create portable config file")
                self.app.print_colored("  8) Quick Export Current Config   - Export active config")
            else:
                self.app.print_colored("  7) Export Configuration          - Create portable config file (no configs)")
                self.app.print_colored("  8) Quick Export Current          - Export active config")
            self.app.print_colored("  9) Import Configuration          - Import from portable file")
            print()
            self.app.print_colored("  0) Back to Main Menu")
            print()

            choice = self.app.get_input("Select option", "0")

            if choice == '0':
                break
            elif choice == '1':
                self._create_machine_first_configuration()
            elif choice == '2' and configs:
                self._switch_configuration(configs)
            elif choice == '3' and configs:
                self._delete_configuration(configs)
            elif choice == '4' and configs:
                self._rename_configuration(configs)
            elif choice == '5' and configs:
                self._backup_configuration(configs)
            elif choice == '6' and configs:
                self._restore_configuration()
            elif choice == '7':
                if configs:
                    self._export_configuration(configs)
                else:
                    self.app.print_colored("No configurations available to export.", 'yellow')
                    self.app.wait_for_enter()
            elif choice == '8':
                self._quick_export_current()
            elif choice == '9':
                self._import_configuration()
            else:
                # Also reached for options 2-6 when no configurations exist.
                self.app.print_colored("Invalid option. Valid choices are 0-9.", 'red')
                self.app.wait_for_enter()

    def _switch_configuration(self, configs: List[Tuple[str, Dict]]) -> None:
        """Switch to a different configuration"""
        self.app.print_section("Switch Configuration")

        for i, (config_name, metadata) in enumerate(configs, 1):
            display_name = config_name.replace('.yml', '')
            env = metadata.get('environment', 'unknown')
            nodes = metadata.get('nodes_count', 0)
            self.app.print_colored(f"  {i}) {display_name} ({env}, {nodes} nodes)")

        while True:
            try:
                choice = int(self.app.get_input("Select configuration number", "1")) - 1
                if 0 <= choice < len(configs):
                    selected_config = configs[choice][0].replace('.yml', '')
                    break
                self.app.print_colored("Invalid selection", 'red')
            except ValueError:
                self.app.print_colored("Please enter a number", 'red')

        if self._load_config_by_name(selected_config):
            self.app.print_colored(f"Switched to configuration: {selected_config}", 'green')
        else:
            self.app.print_colored("Failed to switch configuration", 'red')

        self.app.wait_for_enter()

    def _delete_configuration(self, configs: List[Tuple[str, Dict]]) -> None:
        """Delete a configuration"""
        self.app.print_section("Delete Configuration")

        for i, (config_name, metadata) in enumerate(configs, 1):
            display_name = config_name.replace('.yml', '')
            env = metadata.get('environment', 'unknown')
            nodes = metadata.get('nodes_count', 0)
            self.app.print_colored(f"  {i}) {display_name} ({env}, {nodes} nodes)")

        while True:
            try:
                choice = int(self.app.get_input("Select configuration number to delete", "1")) - 1
                if 0 <= choice < len(configs):
                    selected_config = configs[choice][0].replace('.yml', '')
                    break
                self.app.print_colored("Invalid selection", 'red')
            except ValueError:
                self.app.print_colored("Please enter a number", 'red')

        if self.app.get_input(f"Delete configuration '{selected_config}'? (y/n)", "n").lower() == 'y':
            config_path = self.app.configs_dir / f"{selected_config}.yml"
            metadata_path = self.app.configs_dir / f"{selected_config}.json"

            try:
                # Delete files
                if config_path.exists():
                    config_path.unlink()
                if metadata_path.exists():
                    metadata_path.unlink()

                # If this was the active config, clear it
                if self.active_config.get('config_name') == selected_config:
                    self.active_config = {
                        'config_name': None,
                        'environment': None,
                        'created_at': None,
                        'nodes_count': 0,
                        'last_deployed_date': None,
                        'last_deployed_network': None,
                        'deployment_status': 'never_deployed',
                        'last_deleted_date': None,
                    }
                    self._save_active_config()

                    # Remove symlink
                    if self.app.config_file.is_symlink():
                        self.app.config_file.unlink()

                self.app.print_colored(f"Configuration '{selected_config}' deleted successfully!", 'green')
            except Exception as e:
                self.app.print_colored(f"Error deleting configuration: {e}", 'red')

        self.app.wait_for_enter()

    def _rename_configuration(self, configs: List[Tuple[str, Dict]]) -> None:
        """Interactively rename a stored configuration.

        Lists ``configs``, asks which entry to rename and for a new base name,
        then moves the ``.yml`` inventory and its ``.json`` metadata sidecar
        (when present) to the name returned by
        ``self._generate_config_name(nodes_count, new_name)``. When the renamed
        configuration is the active one, the active-config record is saved
        under the new name and the hosts symlink is refreshed.

        Args:
            configs: ``(file_name, metadata)`` pairs; ``file_name`` carries the
                ``.yml`` suffix, ``metadata`` is the dict shown in the listing.
        """
        self.app.print_section("Rename Configuration")

        # With nothing to choose from, the selection loop below could never
        # be satisfied, so bail out early.
        if not configs:
            self.app.print_colored("No configurations available to rename.", 'yellow')
            self.app.wait_for_enter()
            return

        for i, (config_name, metadata) in enumerate(configs, 1):
            display_name = config_name.replace('.yml', '')
            env = metadata.get('environment', 'unknown')
            nodes = metadata.get('nodes_count', 0)
            self.app.print_colored(f"  {i}) {display_name} ({env}, {nodes} nodes)")

        while True:
            try:
                choice = int(self.app.get_input("Select configuration number to rename", "1")) - 1
            except ValueError:
                self.app.print_colored("Please enter a number", 'red')
                continue
            if 0 <= choice < len(configs):
                old_config_name = configs[choice][0].replace('.yml', '')
                break
            self.app.print_colored("Invalid selection", 'red')

        new_name = self.app.get_input(f"Enter new name for '{old_config_name}'", required=True)

        # Validate new name
        if not re.match(r'^[a-zA-Z0-9_-]+$', new_name):
            self.app.print_colored("Invalid name. Use only letters, numbers, underscore, and hyphen.", 'red')
            self.app.wait_for_enter()
            return

        old_config_path = self.app.configs_dir / f"{old_config_name}.yml"
        old_metadata_path = self.app.configs_dir / f"{old_config_name}.json"

        # The generated name embeds the node count, so take it from the
        # existing metadata when that is readable; otherwise keep the default.
        nodes_count = 1
        if old_metadata_path.exists():
            try:
                with open(old_metadata_path) as f:
                    stored = json.load(f)
                if isinstance(stored, dict):
                    nodes_count = stored.get('nodes_count', 1)
            except (OSError, ValueError):
                # Unreadable or corrupt sidecar: best effort, use the default.
                pass

        new_config_name = self._generate_config_name(nodes_count, new_name)
        new_config_path = self.app.configs_dir / f"{new_config_name}.yml"
        new_metadata_path = self.app.configs_dir / f"{new_config_name}.json"

        # Check if new name already exists
        if new_config_path.exists():
            self.app.print_colored("A configuration with this name already exists!", 'red')
            self.app.wait_for_enter()
            return

        try:
            old_config_path.rename(new_config_path)
            if old_metadata_path.exists():
                old_metadata_path.rename(new_metadata_path)

                # The files are already moved at this point. A failure while
                # rewriting the sidecar must not prevent the active-config
                # record below from following the rename.
                try:
                    with open(new_metadata_path) as f:
                        metadata = json.load(f)
                    metadata['config_name'] = new_config_name

                    with open(new_metadata_path, 'w') as f:
                        json.dump(metadata, f, indent=2)
                except (OSError, ValueError, TypeError) as e:
                    self.app.print_colored(
                        f"Warning: could not update metadata for '{new_config_name}': {e}", 'yellow')

            # Update active config if this was the active one
            if self.active_config.get('config_name') == old_config_name:
                self.active_config['config_name'] = new_config_name
                self._save_active_config()
                self._update_hosts_symlink(new_config_path)

            self.app.print_colored(f"Configuration renamed from '{old_config_name}' to '{new_config_name}'!", 'green')
        except Exception as e:
            self.app.print_colored(f"Error renaming configuration: {e}", 'red')

        self.app.wait_for_enter()

    def _backup_configuration(self, configs: List[Tuple[str, Dict]]) -> None:
        """Interactively copy a configuration into the ``backups`` directory.

        Lists ``configs``, asks which entry to back up, then copies its
        ``.yml`` inventory (and ``.json`` metadata sidecar, when present) to
        ``<configs_dir>/backups/<name>_backup_<YYYYmmdd_HHMMSS>.*``.

        Args:
            configs: ``(file_name, metadata)`` pairs; ``file_name`` carries the
                ``.yml`` suffix.
        """
        self.app.print_section("Backup Configuration")

        # With nothing to choose from, the selection loop below could never
        # be satisfied, so bail out early.
        if not configs:
            self.app.print_colored("No configurations available to backup.", 'yellow')
            self.app.wait_for_enter()
            return

        for i, (config_name, metadata) in enumerate(configs, 1):
            display_name = config_name.replace('.yml', '')
            env = metadata.get('environment', 'unknown')
            nodes = metadata.get('nodes_count', 0)
            self.app.print_colored(f"  {i}) {display_name} ({env}, {nodes} nodes)")

        while True:
            try:
                choice = int(self.app.get_input("Select configuration number to backup", "1")) - 1
            except ValueError:
                self.app.print_colored("Please enter a number", 'red')
                continue
            if 0 <= choice < len(configs):
                config_name = configs[choice][0].replace('.yml', '')
                break
            self.app.print_colored("Invalid selection", 'red')

        config_path = self.app.configs_dir / f"{config_name}.yml"
        metadata_path = self.app.configs_dir / f"{config_name}.json"

        # Do not claim success when there is no inventory file to copy.
        if not config_path.exists():
            self.app.print_colored(f"❌ Error creating backup: configuration file not found: {config_path}", 'red')
            self.app.wait_for_enter()
            return

        try:
            # Create backup directory if it doesn't exist
            backup_dir = self.app.configs_dir / 'backups'
            backup_dir.mkdir(parents=True, exist_ok=True)

            # Generate backup filename with timestamp
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            backup_name = f"{config_name}_backup_{timestamp}"

            shutil.copy2(config_path, backup_dir / f"{backup_name}.yml")
            # The metadata sidecar is optional.
            if metadata_path.exists():
                shutil.copy2(metadata_path, backup_dir / f"{backup_name}.json")

            self.app.print_colored(f"✅ Configuration '{config_name}' backed up as '{backup_name}'", 'green')
            self.app.print_colored(f"📁 Backup location: {backup_dir}", 'cyan')

        except Exception as e:
            self.app.print_colored(f"❌ Error creating backup: {e}", 'red')

        self.app.wait_for_enter()

    def _restore_configuration(self) -> None:
        """Interactively restore a configuration from ``<configs_dir>/backups``.

        Scans the backup directory for ``*_backup_<YYYYmmdd_HHMMSS>.yml``
        files, lists them newest first, and copies the selected backup (and
        its ``.json`` sidecar, when present) back into ``configs_dir`` under
        the name returned by ``self._generate_config_name``. The restored
        metadata is stamped with ``restored_from`` and ``restored_at``. The
        restored configuration is not activated.
        """
        self.app.print_section("Restore Configuration")

        # Check if backup directory exists
        backup_dir = self.app.configs_dir / 'backups'
        if not backup_dir.exists():
            self.app.print_colored("No backups found. Backup directory doesn't exist.", 'yellow')
            self.app.wait_for_enter()
            return

        # Get list of backup files
        backup_files = list(backup_dir.glob('*_backup_*.yml'))
        if not backup_files:
            self.app.print_colored("No backup files found.", 'yellow')
            self.app.wait_for_enter()
            return

        # Parse backup information.
        # Format: originalname_backup_YYYYMMDD_HHMMSS
        backups = []
        for backup_file in backup_files:
            backup_name = backup_file.stem
            # Split on the LAST marker: the original name may itself contain
            # '_backup_', while the timestamp suffix never does.
            original_name, marker, timestamp_str = backup_name.rpartition('_backup_')
            if not marker:
                continue
            try:
                timestamp = datetime.strptime(timestamp_str, '%Y%m%d_%H%M%S')
            except ValueError:
                # Suffix is not a timestamp written by the backup routine.
                continue
            backups.append((backup_name, original_name, timestamp))

        if not backups:
            self.app.print_colored("No valid backup files found.", 'yellow')
            self.app.wait_for_enter()
            return

        # Sort backups by timestamp (newest first)
        backups.sort(key=lambda x: x[2], reverse=True)

        # Show available backups
        self.app.print_colored("Available Backups:")
        for i, (backup_name, original_name, timestamp) in enumerate(backups, 1):
            timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
            self.app.print_colored(f"  {i}) {original_name} (backed up on {timestamp_str})")

        while True:
            try:
                choice = int(self.app.get_input("Select backup number to restore", "1")) - 1
            except ValueError:
                self.app.print_colored("Please enter a number", 'red')
                continue
            if 0 <= choice < len(backups):
                selected_backup = backups[choice]
                break
            self.app.print_colored("Invalid selection", 'red')

        backup_name, original_name, timestamp = selected_backup

        # Get new name for restored configuration
        self.app.print_colored(f"\nRestoring backup of '{original_name}' from {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
        new_name = self.app.get_input(f"Enter name for restored configuration (default: {original_name}_restored)",
                                  f"{original_name}_restored")

        # Validate new name
        if not re.match(r'^[a-zA-Z0-9_-]+$', new_name):
            self.app.print_colored("Invalid name. Use only letters, numbers, underscore, and hyphen.", 'red')
            self.app.wait_for_enter()
            return

        try:
            backup_config_path = backup_dir / f"{backup_name}.yml"
            backup_metadata_path = backup_dir / f"{backup_name}.json"

            # The generated name embeds the node count, so take it from the
            # backup metadata when that is readable; otherwise use the default.
            nodes_count = 1
            if backup_metadata_path.exists():
                try:
                    with open(backup_metadata_path) as f:
                        stored = json.load(f)
                    if isinstance(stored, dict):
                        nodes_count = stored.get('nodes_count', 1)
                except (OSError, ValueError):
                    # Unreadable or corrupt sidecar: best effort.
                    pass

            # Generate proper configuration name
            restored_config_name = self._generate_config_name(nodes_count, new_name)

            restored_config_path = self.app.configs_dir / f"{restored_config_name}.yml"
            restored_metadata_path = self.app.configs_dir / f"{restored_config_name}.json"

            # Check if restored name already exists
            if restored_config_path.exists():
                self.app.print_colored("A configuration with this name already exists!", 'red')
                self.app.wait_for_enter()
                return

            # Copy backup files to main config directory
            if backup_config_path.exists():
                shutil.copy2(backup_config_path, restored_config_path)
            if backup_metadata_path.exists():
                shutil.copy2(backup_metadata_path, restored_metadata_path)

                # Update metadata with new name
                with open(restored_metadata_path) as f:
                    metadata = json.load(f)
                metadata['config_name'] = restored_config_name
                metadata['restored_from'] = backup_name
                metadata['restored_at'] = datetime.now().isoformat()

                with open(restored_metadata_path, 'w') as f:
                    json.dump(metadata, f, indent=2)

            self.app.print_colored(f"✅ Configuration restored as '{restored_config_name}'", 'green')
            self.app.print_colored("💡 Use 'Switch Configuration' to activate the restored configuration.", 'cyan')

        except Exception as e:
            self.app.print_colored(f"❌ Error restoring configuration: {e}", 'red')

        self.app.wait_for_enter()

    def _export_configuration(self, configs: List[Tuple[str, Dict]]) -> None:
        """Export configuration to a portable file for transfer between machines.

        Lists ``configs``, asks which entry to export and for a destination
        file name (``.r1config`` is appended when missing; relative names are
        resolved against the current working directory), then writes a single
        JSON document containing the inventory, its metadata, the contents of
        ``self.app.vars_file`` and the ``group_vars/all.yml`` and
        ``group_vars/mnl.yml`` files when they exist.

        Args:
            configs: ``(file_name, metadata)`` pairs; ``file_name`` carries the
                ``.yml`` suffix.
        """
        self.app.print_section("Export Configuration")

        if not configs:
            self.app.print_colored("No configurations available to export.", 'yellow')
            self.app.wait_for_enter()
            return

        # Show available configurations
        self.app.print_colored("Available configurations to export:")
        for i, (config_name, metadata) in enumerate(configs, 1):
            display_name = config_name.replace('.yml', '')
            env = metadata.get('environment', 'unknown')
            nodes = metadata.get('nodes_count', 0)
            created = metadata.get('created_at', 'unknown')
            created = _parse_iso_datetime(created) or created
            self.app.print_colored(f"  {i}) {display_name} ({env}, {nodes} nodes, created: {created})")

        # Select configuration to export
        while True:
            try:
                choice = int(self.app.get_input("Select configuration number to export", "1")) - 1
            except ValueError:
                self.app.print_colored("Please enter a number", 'red')
                continue
            if 0 <= choice < len(configs):
                config_name = configs[choice][0].replace('.yml', '')
                config_metadata = configs[choice][1]
                break
            self.app.print_colored("Invalid selection", 'red')

        # Get export destination
        default_filename = f"{config_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.r1config"
        filename = self.app.get_input(f"Export filename [{default_filename}]", default_filename)

        # Ensure .r1config extension
        if not filename.endswith('.r1config'):
            filename += '.r1config'

        export_path = Path(filename)
        if not export_path.is_absolute():
            export_path = Path.cwd() / filename

        try:
            # Load configuration files
            config_path = self.app.configs_dir / f"{config_name}.yml"
            metadata_path = self.app.configs_dir / f"{config_name}.json"

            # An empty inventory file parses to None; normalise it to a dict.
            with open(config_path) as f:
                config_data = yaml.safe_load(f) or {}

            # The metadata sidecar is optional elsewhere in this class, so a
            # missing file falls back to the metadata supplied by the caller.
            if metadata_path.exists():
                with open(metadata_path) as f:
                    metadata = json.load(f)
            else:
                metadata = dict(config_metadata)

            # Read current environment variables
            env_data = {}
            if self.app.vars_file.exists():
                with open(self.app.vars_file) as f:
                    env_data = yaml.safe_load(f) or {}

            # Create unified export structure
            export_data = {
                'format_version': '1.0',
                'exported_at': datetime.now().isoformat(),
                'exported_by': self.app.real_user,
                'export_source': 'r1setup',
                'configuration': {
                    'name': config_name,
                    'metadata': metadata,
                    'inventory': config_data,
                    'environment_vars': env_data,
                    'ansible_vars': {}
                }
            }

            # Include relevant ansible variables if they exist
            ansible_vars_files = [
                self.app.config_dir / 'group_vars/all.yml',
                self.app.config_dir / 'group_vars/mnl.yml'
            ]

            for var_file in ansible_vars_files:
                if var_file.exists():
                    with open(var_file) as f:
                        var_data = yaml.safe_load(f) or {}
                    export_data['configuration']['ansible_vars'][var_file.name] = var_data

            # Write export file; default=str stringifies values JSON cannot
            # represent natively.
            with open(export_path, 'w') as f:
                json.dump(export_data, f, indent=2, default=str)

            self.app.print_colored(f"✅ Configuration '{config_name}' exported successfully!", 'green')
            self.app.print_colored(f"📁 Export file: {export_path}", 'cyan')
            self.app.print_colored(f"📝 File size: {export_path.stat().st_size} bytes", 'white')

            # Show what was exported
            hosts_count = len(_get_gpu_hosts(config_data))
            self.app.print_colored(f"🔧 Exported: {hosts_count} nodes, {metadata.get('environment', 'unknown')} environment", 'cyan')

            self.app.print_colored("\n💡 To import on another machine:", 'yellow')
            self.app.print_colored(f"   1. Copy {filename} to the target machine", 'white')
            self.app.print_colored("   2. Run 'r1setup' and select Configuration Management → Import Configuration", 'white')

        except Exception as e:
            self.app.print_colored(f"❌ Error exporting configuration: {e}", 'red')

        self.app.wait_for_enter()

    def _import_configuration(self) -> None:
        """Import configuration from a portable ``.r1config`` file.

        Prompts for the file path, shows a summary of its contents, asks for
        confirmation and a configuration name, then writes the inventory
        (``.yml``, mode 0o600) and metadata (``.json``) into ``configs_dir``.
        Environment variables from the file are merged into
        ``self.app.vars_file`` (imported values win) and non-empty
        ``ansible_vars`` entries are written under ``group_vars``. Optionally
        activates the imported configuration.

        The import file is treated as untrusted: the configuration name must
        match the same pattern used by rename/restore, and ``ansible_vars``
        keys must be plain file names so nothing is written outside
        ``group_vars``.
        """
        self.app.print_section("Import Configuration")

        self.app.print_colored("This will import a configuration from an .r1config file.", 'cyan')
        self.app.print_colored("The file should have been created using the Export Configuration option.", 'yellow')

        # Get import file path
        import_file = self.app.get_input("Enter path to .r1config file")
        import_path = Path(import_file)

        if not import_path.is_absolute():
            import_path = Path.cwd() / import_file

        if not import_path.exists():
            self.app.print_colored(f"❌ File not found: {import_path}", 'red')
            self.app.wait_for_enter()
            return

        try:
            # Read and validate import file
            with open(import_path) as f:
                import_data = json.load(f)

            # Validate format: the top level and its 'configuration' entry
            # must both be JSON objects.
            config_info = import_data.get('configuration') if isinstance(import_data, dict) else None
            if not isinstance(config_info, dict):
                self.app.print_colored("❌ Invalid configuration file format", 'red')
                self.app.wait_for_enter()
                return

            # A section stored as null is treated like a missing one.
            metadata = config_info.get('metadata') or {}
            inventory = config_info.get('inventory') or {}
            env_vars = config_info.get('environment_vars') or {}
            ansible_vars = config_info.get('ansible_vars') or {}

            # Show import details
            original_name = config_info.get('name', 'unknown')
            hosts = _get_gpu_hosts(inventory)
            nodes_count = len(hosts)
            environment = metadata.get('environment', 'unknown')
            exported_at = import_data.get('exported_at', 'unknown')
            exported_by = import_data.get('exported_by', 'unknown')

            self.app.print_colored(f"\n📋 Configuration Details:", 'cyan', bold=True)
            self.app.print_colored(f"   Original name: {original_name}")
            self.app.print_colored(f"   Environment: {environment}")
            self.app.print_colored(f"   Nodes: {nodes_count}")
            self.app.print_colored(f"   Exported: {exported_at}")
            self.app.print_colored(f"   Exported by: {exported_by}")

            # Show nodes that will be imported
            if hosts:
                self.app.print_colored(f"\n🖥️  Nodes to import:", 'yellow', bold=True)
                for hostname, config in hosts.items():
                    ip = config.get('ansible_host', 'unknown')
                    user = config.get('ansible_user', 'unknown')
                    self.app.print_colored(f"   • {hostname} ({ip}) - user: {user}")

            # Confirm import
            if self.app.get_input(f"\nProceed with import? (y/n)", "y").lower() != 'y':
                self.app.print_colored("Import cancelled.", 'yellow')
                self.app.wait_for_enter()
                return

            # Get new configuration name
            suggested_name = self.app.get_input(f"Configuration name [{original_name}]", original_name)

            # The default comes from the import file, so validate the name
            # exactly like rename/restore do before it becomes a file path.
            if not isinstance(suggested_name, str) or not re.match(r'^[a-zA-Z0-9_-]+$', suggested_name):
                self.app.print_colored("Invalid name. Use only letters, numbers, underscore, and hyphen.", 'red')
                self.app.wait_for_enter()
                return

            # Generate proper config filename
            final_config_name = self._generate_config_name(nodes_count, suggested_name)

            config_path = self.app.configs_dir / f"{final_config_name}.yml"
            metadata_path = self.app.configs_dir / f"{final_config_name}.json"

            # Check if config already exists
            if config_path.exists():
                if self.app.get_input(f"Configuration '{final_config_name}' already exists. Overwrite? (y/n)", "n").lower() != 'y':
                    self.app.print_colored("Import cancelled.", 'yellow')
                    self.app.wait_for_enter()
                    return

            # Update metadata
            updated_metadata = dict(metadata)
            updated_metadata['config_name'] = final_config_name
            updated_metadata['imported_at'] = datetime.now().isoformat()
            updated_metadata['imported_from'] = str(import_path)
            updated_metadata['original_name'] = original_name

            # Save configuration files
            with open(config_path, 'w') as f:
                yaml.safe_dump(inventory, f, default_flow_style=False)

            with open(metadata_path, 'w') as f:
                json.dump(updated_metadata, f, indent=2)

            os.chmod(config_path, 0o600)

            # Update environment variables if included
            if env_vars:
                current_env_vars = {}
                if self.app.vars_file.exists():
                    try:
                        with open(self.app.vars_file) as f:
                            current_env_vars = yaml.safe_load(f) or {}
                    except (yaml.YAMLError, IOError, OSError):
                        # Unreadable existing vars file: start from empty.
                        pass

                # Merge environment variables (imported ones take precedence)
                current_env_vars.update(env_vars)

                # Ensure directory exists
                self.app.vars_file.parent.mkdir(parents=True, exist_ok=True)

                with open(self.app.vars_file, 'w') as f:
                    yaml.safe_dump(current_env_vars, f, default_flow_style=False)

            # Apply ansible variables if included
            group_vars_dir = self.app.config_dir / 'group_vars'
            for var_filename, var_content in ansible_vars.items():
                # Keys come from the import file; accept only plain file
                # names so the write cannot escape group_vars.
                if (not var_filename
                        or var_filename in ('.', '..')
                        or Path(var_filename).name != var_filename):
                    self.app.print_colored(f"Skipping ansible vars entry with unsafe name: {var_filename!r}", 'yellow')
                    continue

                var_file_path = group_vars_dir / var_filename
                var_file_path.parent.mkdir(parents=True, exist_ok=True)

                # Only update if the imported file has content
                if var_content:
                    with open(var_file_path, 'w') as f:
                        yaml.safe_dump(var_content, f, default_flow_style=False)

            self.app.print_colored(f"✅ Configuration imported successfully as '{final_config_name}'!", 'green')
            self.app.print_colored(f"📁 Saved to: {config_path}", 'cyan')

            # Ask if user wants to activate the imported configuration
            if self.app.get_input("Activate this configuration now? (y/n)", "y").lower() == 'y':
                self.app.inventory = inventory
                self.active_config.update(updated_metadata)
                self._save_active_config()
                self._update_hosts_symlink(config_path)

                # Set environment
                env = updated_metadata.get('environment')
                if env:
                    self.set_mnl_app_env(env)

                self.app.print_colored("✅ Configuration activated!", 'green')
            else:
                self.app.print_colored("💡 Use 'Switch Configuration' to activate it later.", 'cyan')

        except json.JSONDecodeError:
            self.app.print_colored("❌ Invalid JSON in configuration file", 'red')
        except yaml.YAMLError as e:
            self.app.print_colored(f"❌ Invalid YAML in configuration: {e}", 'red')
        except Exception as e:
            self.app.print_colored(f"❌ Error importing configuration: {e}", 'red')

        self.app.wait_for_enter()

    def _quick_export_current(self) -> None:
        """Quick export of the current active configuration.

        Writes ``<config_name>_<YYYYmmdd_HHMMSS>.r1config`` into the current
        working directory without prompting. The file has the same JSON
        layout as the interactive export: inventory read from
        ``self.app.config_file``, a copy of ``self.active_config`` as
        metadata, the contents of ``self.app.vars_file``, and the
        ``group_vars/all.yml`` and ``group_vars/mnl.yml`` files when present.
        """
        if not self.app.has_active_config_shell():
            self.app.print_colored("No active configuration to export.", 'yellow')
            self.app.wait_for_enter()
            return

        # ``or`` rather than a .get() default: the key can be present with a
        # None value, which would otherwise end up in the file name.
        config_name = self.active_config.get('config_name') or 'current_config'
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{config_name}_{timestamp}.r1config"

        try:
            # Create export in current directory
            export_path = Path.cwd() / filename

            # An empty inventory file parses to None; normalise it to a dict.
            with open(self.app.config_file) as f:
                config_data = yaml.safe_load(f) or {}

            # Read environment variables
            env_data = {}
            if self.app.vars_file.exists():
                with open(self.app.vars_file) as f:
                    env_data = yaml.safe_load(f) or {}

            # Create export structure
            export_data = {
                'format_version': '1.0',
                'exported_at': datetime.now().isoformat(),
                'exported_by': self.app.real_user,
                'export_source': 'r1setup_quick',
                'configuration': {
                    'name': config_name,
                    'metadata': dict(self.active_config),
                    'inventory': config_data,
                    'environment_vars': env_data,
                    'ansible_vars': {}
                }
            }

            # Include ansible variables
            ansible_vars_files = [
                self.app.config_dir / 'group_vars/all.yml',
                self.app.config_dir / 'group_vars/mnl.yml'
            ]

            for var_file in ansible_vars_files:
                if var_file.exists():
                    with open(var_file) as f:
                        var_data = yaml.safe_load(f) or {}
                    export_data['configuration']['ansible_vars'][var_file.name] = var_data

            # Write export file; default=str stringifies values JSON cannot
            # represent natively.
            with open(export_path, 'w') as f:
                json.dump(export_data, f, indent=2, default=str)

            self.app.print_colored(f"✅ Current configuration exported to: {filename}", 'green')
            nodes = len(_get_gpu_hosts(config_data))
            env = self.active_config.get('environment', 'unknown')
            self.app.print_colored(f"🔧 Exported: {nodes} nodes, {env} environment", 'cyan')

        except Exception as e:
            self.app.print_colored(f"❌ Error exporting configuration: {e}", 'red')

        self.app.wait_for_enter()




class NodeStatusTracker:
    """Tracks and displays node deployment/service status.

    The tracked status of a node lives in its host entry (as returned by
    ``_get_gpu_hosts(self.app.inventory)``) under the ``node_status`` and
    ``last_status_update`` keys.

    Accesses from self.app (R1Setup):
        - inventory (read; host entries obtained from it are updated in
          place by _update_node_status)
        - config_dir, config_file (paths)
        - connection_timeout (used as the playbook/command timeout)
        - print_colored(), print_debug(), print_header(), print_section()
        - get_input(), wait_for_enter()
        - run_command(), run_generated_playbook()
        - record_service_file_versions()
        - _save_configuration()
        - _format_timestamp_ago()
        - load_configuration()
        - check_hosts_config()
    """

    def __init__(self, app) -> None:
        """Keep a reference to the owning application object."""
        self.app = app

    def _should_preserve_node_status(self, node_name: str) -> bool:
        """Check if a node's status should be preserved (e.g., deleted nodes)"""
        current_status = self._get_node_status_info(node_name)['status']
        return current_status == 'deleted'

    def _update_node_status(self, node_name: str, status: str) -> None:
        """Store a new tracked status for one node and persist the configuration.

        Nothing happens when the node is not among the configured hosts.
        A protected status (see _should_preserve_node_status) is only
        replaced by 'deploying' or 'error'; any other request is logged and
        ignored. On a successful update the node's 'last_status_update' is
        set to the current local time (ISO format).
        """
        hosts = _get_gpu_hosts(self.app.inventory)
        if node_name not in hosts:
            return

        host_entry = hosts[node_name]
        previous_status = host_entry.get('node_status', 'unknown')

        # 'deploying' and 'error' are the only statuses allowed to replace a protected one
        overrides_protection = status in ['deploying', 'error']
        if self._should_preserve_node_status(node_name) and not overrides_protection:
            self.app.print_debug(f"Preserving {node_name} status: {previous_status} (requested: {status})")
            return

        previous_stamp = host_entry.get('last_status_update', 'never')
        current_stamp = datetime.now().isoformat()

        host_entry['node_status'] = status
        host_entry['last_status_update'] = current_stamp
        self.app._save_configuration()

        self.app.print_debug(f"Updated {node_name} status: {previous_status} → {status}")
        self.app.print_debug(f"  Previous update: {previous_stamp}")
        self.app.print_debug(f"  New update: {current_stamp}")

    def _get_node_status_info(self, node_name: str) -> Dict[str, str]:
        """Return the tracked status record of a node.

        The result always has the keys 'status' and 'last_update'. Unknown
        nodes, and known nodes without stored values, yield 'unknown' and an
        empty string respectively.
        """
        hosts = _get_gpu_hosts(self.app.inventory)
        if node_name not in hosts:
            self.app.print_debug(f"Node {node_name} not found in hosts configuration")
            return {'status': 'unknown', 'last_update': ''}

        host_entry = hosts[node_name]
        status = host_entry.get('node_status', 'unknown')
        last_update = host_entry.get('last_status_update', '')
        self.app.print_debug(f"Retrieved {node_name} status: {status} (updated: {last_update})")
        return {'status': status, 'last_update': last_update}

    @staticmethod
    def _resolve_node_status(service_status, container_status):
        """Map service/container status to overall (status, result, overwrite) tuple.

        overwrite=True means the result should overwrite existing data
        (container info takes priority over service-only info).
        Returns None if no resolution is possible.
        """
        if container_status == 'RUNNING':
            return ('running', 'Container is running', True)
        elif container_status == 'NOT_RUNNING':
            if service_status == 'ACTIVE':
                return ('stopped', 'Service active but container not running', True)
            elif service_status == 'NOT_FOUND':
                return ('not_deployed', 'Service not found - not deployed', True)
            else:
                return ('stopped', 'Container not running', True)
        elif service_status == 'ACTIVE':
            return ('running', 'Service is active', False)
        elif service_status in ('INACTIVE', 'FAILED'):
            return ('stopped', 'Service is inactive/failed', False)
        elif service_status == 'NOT_FOUND':
            return ('not_deployed', 'Service not found - not deployed', False)
        return None

    def _apply_resolved_status(self, node_status_data, current_host, service_status, container_status):
        """Apply resolved status to node_status_data, respecting priority rules."""
        resolved = self._resolve_node_status(service_status, container_status)
        if resolved:
            status, result, overwrite = resolved
            if overwrite or current_host not in node_status_data:
                existing = node_status_data.get(current_host, {})
                node_status_data[current_host] = {'status': status, 'result': result}
                if existing.get('service_file_version'):
                    node_status_data[current_host]['service_file_version'] = existing['service_file_version']
                self.app.print_debug(f"Set {current_host} to {status} ({result})")

    @staticmethod
    def _parse_service_file_version(text):
        """Extract a service file version from status output text."""
        import re

        patterns = (
            r'Service File Version:\s*([^\s"\\]+)',
            r'R1SETUP_SERVICE_FILE_VERSION=([^\s"\\]+)',
        )

        for pattern in patterns:
            match = re.search(pattern, text)
            if not match:
                continue
            version = match.group(1).strip()
            if version.lower() not in UNKNOWN_SERVICE_FILE_VERSION_MARKERS:
                return version
        return None

    def _apply_service_file_version(self, node_status_data, current_host, service_file_version):
        """Attach discovered service version metadata to the current host result."""
        if not current_host or not service_file_version:
            return
        node_status_data.setdefault(current_host, {})
        node_status_data[current_host]['service_file_version'] = service_file_version
        self.app.print_debug(f"Set {current_host} service file version to {service_file_version}")

    def _run_status_playbook(self):
        """Run the ansible service status playbook.

        Returns (success, output) tuple.
        """
        playbook_path = self.app.config_dir / 'playbooks/service_status.yml'
        if not playbook_path.exists():
            self.app.print_colored(f"❌ Service status playbook not found: {playbook_path}", 'red')
            return False, ''

        host_names = list(_get_gpu_hosts(self.app.inventory).keys())
        success, output, _, _ = self.app.run_generated_playbook(
            playbook_path,
            host_names,
            machine_scope=False,
            last_applied_action='status_check',
            show_output=False,
            timeout=self.app.connection_timeout,
        )
        return success, output

    def _parse_status_fields(self, text):
        """Parse service and container status from a text block.

        Returns (service_status, container_status) tuple.
        """
        service_status = None
        container_status = None

        if 'Service Status: ACTIVE' in text:
            service_status = 'ACTIVE'
        elif 'Service Status: INACTIVE' in text or 'Service Status: INACTIVE/FAILED' in text:
            service_status = 'INACTIVE'
        elif 'Service Status: FAILED' in text:
            service_status = 'FAILED'
        elif 'Service Status: NOT FOUND' in text:
            service_status = 'NOT_FOUND'

        if 'Container Status: RUNNING' in text:
            container_status = 'RUNNING'
        elif 'Container Status: NOT RUNNING' in text:
            container_status = 'NOT_RUNNING'

        return service_status, container_status

    def _parse_ansible_status_lines(self, lines):
        """Parse ansible playbook output lines for node status information.

        Tries multiple parsing strategies:
        1. Structured message output (msg-based)
        2. Direct status lines
        3. Aggressive fallback (broad search)

        Returns dict of {hostname: {status, result}}.
        """
        import re
        node_status_data = {}
        current_host = None
        task_pattern = re.compile(r'ok: \[([^\]]+)\] => \{')

        for i, line in enumerate(lines):
            line = line.strip()

            # Track current host from task results
            task_match = task_pattern.match(line)
            if task_match:
                current_host = task_match.group(1)
                self.app.print_debug(f"Found task result for host: {current_host}")
                continue

            # Handle unreachable nodes
            if 'unreachable:' in line and '[' in line and ']' in line:
                unreachable_match = re.search(r'unreachable: \[([^\]]+)\]', line)
                if unreachable_match:
                    hostname = unreachable_match.group(1)
                    node_status_data[hostname] = {
                        'status': 'unreachable',
                        'result': 'Node unreachable'
                    }
                    self.app.print_debug(f"Set {hostname} to unreachable")
                    continue

            # Handle fatal unreachable nodes
            if 'fatal:' in line and 'UNREACHABLE!' in line and '[' in line and ']' in line:
                fatal_match = re.search(r'fatal: \[([^\]]+)\]: UNREACHABLE!', line)
                if fatal_match:
                    hostname = fatal_match.group(1)
                    node_status_data[hostname] = {
                        'status': 'unreachable',
                        'result': 'Node unreachable - connection failed'
                    }
                    self.app.print_debug(f"Set {hostname} to unreachable (fatal)")
                    continue

            # Look for structured message output from the playbook
            if current_host and '"msg":' in line:
                self.app.print_debug(f"Processing message line for {current_host}: {line}")
                msg_match = re.search(r'"msg":\s*"([^"]*(?:\\.[^"]*)*)"', line)
                if msg_match:
                    msg_content = msg_match.group(1)
                    msg_content = msg_content.replace('\\n', '\n').replace('\\"', '"')
                    self.app.print_debug(f"Extracted message for {current_host}: {msg_content}")

                    # Handle truncated messages
                    if msg_content.endswith('\\n') or not msg_content.endswith('"'):
                        self.app.print_debug(f"Message appears truncated for {current_host}, looking for continuation")
                        for j in range(i + 1, min(i + 10, len(lines))):
                            next_line = lines[j].strip()
                            if next_line.startswith('"') and next_line.endswith('"'):
                                msg_content += next_line[1:-1].replace('\\n', '\n').replace('\\"', '"')
                                break
                            elif next_line.startswith('"'):
                                msg_content += next_line[1:].replace('\\n', '\n').replace('\\"', '"')
                            else:
                                break
                        self.app.print_debug(f"Complete message for {current_host}: {msg_content}")

                    service_file_version = self._parse_service_file_version(msg_content)
                    if service_file_version:
                        self._apply_service_file_version(node_status_data, current_host, service_file_version)

                    if 'Service Status:' in msg_content or 'Container Status:' in msg_content:
                        self.app.print_debug(f"Found status information in message for {current_host}")
                    else:
                        self.app.print_debug(f"No status information found in message for {current_host}")
                        continue

                    service_status, container_status = self._parse_status_fields(msg_content)
                    self.app.print_debug(f"Parsed {current_host}: service={service_status}, container={container_status}")
                    self._apply_resolved_status(node_status_data, current_host, service_status, container_status)

            # Direct status lines
            elif current_host and ('Service Status:' in line or 'Container Status:' in line):
                self.app.print_debug(f"Processing status line for {current_host}: {line}")
                service_status, container_status = self._parse_status_fields(line)
                self.app.print_debug(f"Parsed {current_host}: service={service_status}, container={container_status}")
                self._apply_resolved_status(node_status_data, current_host, service_status, container_status)

            elif current_host and ('Service File Version:' in line or 'R1SETUP_SERVICE_FILE_VERSION=' in line):
                service_file_version = self._parse_service_file_version(line)
                self._apply_service_file_version(node_status_data, current_host, service_file_version)

        self.app.print_debug(f"Initial parse node status data: {node_status_data}")

        # Aggressive fallback if no data found
        if not node_status_data:
            self.app.print_debug("No status data found, trying aggressive parsing approach")
            current_host = None
            for i, line in enumerate(lines):
                line = line.strip()
                host_match = re.search(r'ok:\s*\[([^\]]+)\]', line)
                if host_match:
                    current_host = host_match.group(1)
                    self.app.print_debug(f"Found host mention: {current_host}")

                if current_host and ('Service Status:' in line or 'Container Status:' in line):
                    self.app.print_debug(f"Found status line for {current_host}: {line}")
                    service_status, container_status = self._parse_status_fields(line)
                    self.app.print_debug(f"Aggressive parsing for {current_host}: service={service_status}, container={container_status}")
                    self._apply_resolved_status(node_status_data, current_host, service_status, container_status)
                elif current_host and ('Service File Version:' in line or 'R1SETUP_SERVICE_FILE_VERSION=' in line):
                    service_file_version = self._parse_service_file_version(line)
                    self._apply_service_file_version(node_status_data, current_host, service_file_version)

        self.app.print_debug(f"Final node status data: {node_status_data}")
        return node_status_data

    def _record_discovered_service_file_versions(self, node_status_data) -> None:
        """Persist service versions discovered during a status check."""
        host_versions = {
            host_name: data.get('service_file_version')
            for host_name, data in node_status_data.items()
            if data.get('service_file_version')
        }
        if host_versions:
            self.app.record_service_file_versions(host_versions)

    @staticmethod
    def _fill_missing_host_statuses(data, hosts):
        """Add 'unknown' status for any hosts not in the status data."""
        for hostname in hosts.keys():
            if hostname not in data:
                data[hostname] = {
                    'status': 'unknown',
                    'result': 'Unable to determine status'
                }
        return data

    def _get_real_time_node_status(self) -> Dict[str, Dict[str, str]]:
        """Get real-time status for all nodes by running the service status playbook"""
        success, output = self._run_status_playbook()
        if not output and not success:
            return {}

        if success:
            self.app.print_debug(f"Playbook output length: {len(output)} characters")
            self.app.print_debug("=== PLAYBOOK OUTPUT ===")
            self.app.print_debug(output)
            self.app.print_debug("=== END OUTPUT ===")
        else:
            self.app.print_debug(f"Status check failed with output length: {len(output) if output else 0}")
            if output and "timed out" in output.lower():
                self.app.print_debug("Status check timed out - some hosts may be offline")
            elif output:
                self.app.print_debug(f"Status check failed with error: {output}")
                self.app.print_debug("=== PARTIAL PLAYBOOK OUTPUT ===")
                self.app.print_debug(output)
                self.app.print_debug("=== END PARTIAL OUTPUT ===")

        lines = output.split('\n') if output else []
        data = self._parse_ansible_status_lines(lines)
        self._record_discovered_service_file_versions(data)
        hosts = _get_gpu_hosts(self.app.inventory)
        return self._fill_missing_host_statuses(data, hosts)

    def _get_status_display_info(self, status: str) -> Tuple[str, str, str]:
        """Get display information for a status (emoji, color, description)"""
        status_info = {
            'running': ('🟢', 'green', 'Running'),
            'stopped': ('🔴', 'red', 'Stopped'),
            'pending_restart': ('🟡', 'yellow', 'Pending Restart'),
            'unknown': ('❓', 'white', 'Unknown'),
            'deploying': ('🔄', 'cyan', 'Deploying'),
            'error': ('❌', 'red', 'Error'),
            'never_deployed': ('⚪', 'white', 'Never Deployed'),
            'deleted': ('🗑️', 'red', 'Deleted'),
            'unreachable': ('🔌', 'red', 'Unreachable'),
            'not_deployed': ('📦', 'yellow', 'Not Deployed')
        }
        return status_info.get(status, ('❓', 'white', 'Unknown'))

    def _display_node_status(self, node_name: str, compact: bool = False) -> None:
        """Display the status of a node"""
        status_info = self._get_node_status_info(node_name)
        status = status_info['status']
        emoji, color, description = self._get_status_display_info(status)

        if compact:
            self.app.print_colored(f"{emoji} {description}", color, end='')
        else:
            self.app.print_colored(f"Status: {emoji} {description}", color)

    def check_and_update_node_status(self) -> None:
        """Interactively check the real service state and reconcile tracked node statuses.

        Flow:
            1. Show the currently tracked status of every configured node.
            2. Ask for confirmation, then run playbooks/service_status.yml
               against the active inventory file.
            3. Record any service file versions found in the output.
            4. Reconcile each node's tracked status with the observed state,
               print what changed, then print the final statuses.

        The tracked status is read before anything is written, so rules that
        depend on it (keeping 'pending_restart' while the service is still
        up) are honoured, and only changes that were actually stored are
        reported.

        Raises:
            KeyError: if ANSIBLE_CONFIG, ANSIBLE_COLLECTIONS_PATH or
                ANSIBLE_HOME is missing from the environment.
        """
        if not self.app.check_hosts_config():
            self.app.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.app.wait_for_enter()
            return

        self.app.print_header("Check & Update Node Status")

        # Load configuration to show current statuses
        self.app.load_configuration()
        hosts = _get_gpu_hosts(self.app.inventory)

        # Show current statuses
        self.app.print_colored(f"🔍 Current Node Statuses:", 'cyan', bold=True)
        self.app.print_debug(f"Debug: Checking status for {len(hosts)} nodes")
        self._print_tracked_statuses(hosts, with_debug=True)

        self.app.print_colored(f"\n📋 This will:", 'yellow', bold=True)
        self.app.print_colored("   • Check the actual service status on all nodes", 'yellow')
        self.app.print_colored("   • Update node statuses based on real service state", 'yellow')
        self.app.print_colored("   • Preserve 'Pending Restart' status (requires manual restart)", 'yellow')
        self.app.print_colored("   • Show differences between tracked and actual status", 'yellow')

        if self.app.get_input(f"\n🔍 Continue with status check on {len(hosts)} node(s)? (y/n)", "y").lower() != 'y':
            self.app.print_colored("Status check cancelled.", 'yellow')
            return

        playbook_path = self.app.config_dir / 'playbooks/service_status.yml'
        if not playbook_path.exists():
            self.app.print_colored(f"Service status playbook not found: {playbook_path}", 'red')
            self.app.wait_for_enter()
            return

        # Every interpolated value is shell-quoted so that paths containing
        # spaces or shell metacharacters cannot break (or alter) the command.
        cmd = (f"ANSIBLE_CONFIG={shlex.quote(os.environ['ANSIBLE_CONFIG'])} "
               f"ANSIBLE_COLLECTIONS_PATH={shlex.quote(os.environ['ANSIBLE_COLLECTIONS_PATH'])} "
               f"ANSIBLE_HOME={shlex.quote(os.environ['ANSIBLE_HOME'])} "
               f"ansible-playbook -i {shlex.quote(str(self.app.config_file))} "
               f"{shlex.quote(str(playbook_path))}")

        self.app.print_colored("\n🔍 Checking actual service status on all nodes...", 'cyan')
        self.app.print_debug(f"Running ansible command: {cmd}")
        success, output = self.app.run_command(cmd, show_output=False, timeout=self.app.connection_timeout)

        if success:
            self.app.print_colored("✅ Service status check completed!", 'green')
            self.app.print_debug(f"Ansible output length: {len(output)} characters")
            if DEBUG and output:
                self.app.print_debug("Raw ansible output (first 1000 chars):")
                self.app.print_debug(output[:1000] + ("..." if len(output) > 1000 else ""))

            parsed_status_data = self._parse_ansible_status_lines(output.split('\n'))
            self._record_discovered_service_file_versions(parsed_status_data)

            # Parse the output to determine actual service states
            actual_statuses = self._parse_service_status_output(output)

            self.app.print_debug(f"Processing status updates for {len(actual_statuses)} nodes")
            status_changes = self._reconcile_tracked_statuses(actual_statuses)

            # Show results
            self.app.print_colored(f"\n📊 Status Update Results:", 'cyan', bold=True)

            if status_changes:
                self.app.print_colored(f"Updated {len(status_changes)} node(s):", 'green')
                for change in status_changes:
                    old_emoji, old_color, old_desc = self._get_status_display_info(change['old'])
                    new_emoji, new_color, new_desc = self._get_status_display_info(change['new'])
                    self.app.print_colored(f"   • {change['node']}: ", 'white', end='')
                    self.app.print_colored(f"{old_emoji} {old_desc}", old_color, end='')
                    self.app.print_colored(" → ", 'white', end='')
                    self.app.print_colored(f"{new_emoji} {new_desc}", new_color)
                    if change['actual'] == 'service_missing':
                        self.app.print_colored(f"     (Service not found - node may not be deployed)", 'yellow')
                    elif change['actual'] == 'connection_failed':
                        self.app.print_colored(f"     (Connection failed - node may be unreachable)", 'yellow')
            else:
                self.app.print_colored("No status changes needed - all nodes are correctly tracked.", 'green')

            # Show final status summary
            self.app.print_colored(f"\n🖥️  Final Node Statuses:", 'cyan', bold=True)
            self._print_tracked_statuses(hosts)

        else:
            self.app.print_colored("❌ Service status check failed. Please check network connectivity and node access.", 'red')
            self.app.print_colored("Output for debugging:", 'yellow')
            if output:
                print((output[:500] + "...") if len(output) > 500 else output)

        self.app.wait_for_enter()

    def _print_tracked_statuses(self, hosts, with_debug: bool = False) -> None:
        """Print one line per host with its tracked status and last update age."""
        for name in hosts.keys():
            status_info = self._get_node_status_info(name)
            status = status_info['status']
            last_update = status_info['last_update']
            emoji, color, description = self._get_status_display_info(status)
            last_update_str = self.app._format_timestamp_ago(last_update)
            self.app.print_colored(f"   • {name}: {emoji} {description}", color, end='')
            self.app.print_colored(f" (Last updated: {last_update_str})", 'white')
            if with_debug:
                self.app.print_debug(f"  {name}: status={status}, last_update={last_update}")

    def _reconcile_tracked_statuses(self, actual_statuses):
        """Bring tracked statuses in line with observed ones; return the list of real changes.

        Each change is a dict with the keys 'node', 'old', 'new' and 'actual'.
        """
        status_changes = []
        for node_name, actual_status in actual_statuses.items():
            # Read the tracked value first: the decision below depends on it
            tracked_status = self._get_node_status_info(node_name)['status']
            new_status = self._determine_updated_status(tracked_status, actual_status)
            self.app.print_debug(f"Node {node_name}: current={tracked_status}, actual={actual_status}, new={new_status}")

            # Written even when unchanged so that last_status_update is
            # refreshed for every node that was checked.
            self._update_node_status(node_name, new_status)

            # _update_node_status may refuse the write (protected status,
            # node not in the inventory), so report what is actually stored.
            stored_status = self._get_node_status_info(node_name)['status']
            if stored_status != tracked_status:
                status_changes.append({
                    'node': node_name,
                    'old': tracked_status,
                    'new': stored_status,
                    'actual': actual_status
                })
                self.app.print_debug(f"  Status change recorded for {node_name}")
            else:
                self.app.print_debug(f"  No status change needed for {node_name}")
        return status_changes

    def _parse_service_status_output(self, output: str) -> Dict[str, str]:
        """Parse ansible service status output to determine actual service states"""
        node_statuses = {}

        try:
            lines = output.split('\n')
            current_node = None
            self.app.print_debug(f"Parsing ansible output with {len(lines)} lines")

            for i, line in enumerate(lines):
                line = line.strip()

                # Look for node task execution patterns
                if 'TASK [' in line and 'Check service status' in line:
                    self.app.print_debug(f"Line {i}: Found task header: {line}")
                    continue

                # Look for node results
                if line.startswith(('ok: [', 'changed: [', 'fatal: [', 'unreachable: [')):
                    start = line.find('[') + 1
                    end = line.find(']')
                    if start > 0 and end > start:
                        current_node = line[start:end]
                        self.app.print_debug(f"Line {i}: Found node result for {current_node}: {line}")

                        if line.startswith('unreachable:'):
                            node_statuses[current_node] = 'connection_failed'
                            self.app.print_debug(f"  Set {current_node} to connection_failed")
                        elif line.startswith('fatal:'):
                            if 'could not be found' in line.lower() or 'not found' in line.lower():
                                node_statuses[current_node] = 'service_missing'
                                self.app.print_debug(f"  Set {current_node} to service_missing")
                            else:
                                node_statuses[current_node] = 'error'
                                self.app.print_debug(f"  Set {current_node} to error")
                        continue

                # Look for systemctl status output patterns
                # Avoid Ansible summary lines (e.g., "my-node : ok=10 changed=3 failed=0")
                if current_node and not (':' in line and 'ok=' in line and 'changed=' in line):
                    if any(keyword in line.lower() for keyword in ['active', 'inactive', 'failed', 'running']):
                        self.app.print_debug(f"Line {i}: Status line for {current_node}: {line}")
                        if 'active (running)' in line.lower():
                            node_statuses[current_node] = 'running'
                            self.app.print_debug(f"  Set {current_node} to running")
                        elif 'inactive' in line.lower() or 'stopped' in line.lower():
                            node_statuses[current_node] = 'stopped'
                            self.app.print_debug(f"  Set {current_node} to stopped")
                        elif 'failed' in line.lower() and 'failed=' not in line.lower():
                            node_statuses[current_node] = 'error'
                            self.app.print_debug(f"  Set {current_node} to error")

                # Look for service status in structured output
                elif current_node and 'service status:' in line.lower():
                    self.app.print_debug(f"Line {i}: Service status line for {current_node}: {line}")
                    if 'active' in line.lower():
                        node_statuses[current_node] = 'running'
                        self.app.print_debug(f"  Set {current_node} to running (from service status)")
                    elif 'inactive' in line.lower() or 'inactive/failed' in line.lower():
                        node_statuses[current_node] = 'stopped'
                        self.app.print_debug(f"  Set {current_node} to stopped (from service status)")

            # Broader fallback approach
            if not node_statuses:
                self.app.print_debug("No statuses parsed, trying broader approach")
                import re
                node_pattern = r'(?:ok|changed|fatal|unreachable): \[([^\]]+)\]'
                matches = re.findall(node_pattern, output)
                self.app.print_debug(f"Found node mentions: {matches}")
                for node_name in set(matches):
                    if node_name not in node_statuses:
                        node_statuses[node_name] = 'unknown'
                        self.app.print_debug(f"  Set {node_name} to unknown (fallback)")

        except Exception as e:
            self.app.print_debug(f"Error parsing service status output: {e}")

        self.app.print_debug(f"Final parsed node statuses: {node_statuses}")
        return node_statuses

    def _determine_updated_status(self, current_status: str, actual_status: str) -> str:
        """Determine the new status based on current tracked status and actual service state"""

        self.app.print_debug(f"Determining status: current='{current_status}', actual='{actual_status}'")

        # Special handling for pending_restart
        if current_status == 'pending_restart':
            if actual_status == 'stopped' or actual_status == 'service_missing':
                result = 'stopped' if actual_status == 'stopped' else 'never_deployed'
                self.app.print_debug(f"  Pending restart cleared: {current_status} → {result}")
                return result
            else:
                self.app.print_debug(f"  Pending restart preserved (service state: {actual_status})")
                return 'pending_restart'

        # For other statuses, update based on actual service state
        status_mapping = {
            'running': 'running',
            'stopped': 'stopped',
            'error': 'error',
            'service_missing': 'never_deployed',
            'connection_failed': 'error',
            'unknown': 'unknown'
        }

        result = status_mapping.get(actual_status, 'unknown')
        self.app.print_debug(f"  Status mapping: {actual_status} → {result}")
        return result





class DeploymentService:
    """Handles deployment, deletion, and deployment status operations.

    Accesses from self.app (R1Setup):
        - inventory (read)
        - config_dir, config_file, configs_dir (paths)
        - active_config (read/write)
        - print_colored(), print_debug(), print_header(), print_section()
        - get_input()
        - run_command()
        - check_hosts_config()
        - load_configuration()
        - _save_active_config(), _save_configuration()
        - get_mnl_app_env()
        - select_hosts()
        - _get_node_status_info(), _get_status_display_info()
        - _update_node_status(), _display_node_status()
        - _get_real_time_node_status()
        - _display_copy_friendly_addresses()
        - _load_active_config()
    """

    def __init__(self, app) -> None:
        """Keep a reference to the owning application object (R1Setup).

        Every method of this service reaches inventory, prompts, printing and
        playbook execution through ``self.app``.
        """
        self.app = app

    @staticmethod
    def _phase_timeout(base_timeout: int, minimum_timeout: int) -> int:
        """Return a safe timeout floor for longer-running operations."""
        return max(int(base_timeout), int(minimum_timeout))

    def _machine_preparation_timeout(self) -> int:
        """Timeout for fresh-machine preparation phases."""
        return self._phase_timeout(self.app.connection_timeout, 600)

    def _instance_apply_timeout(self) -> int:
        """Timeout for instance runtime apply/start phases."""
        return self._phase_timeout(self.app.connection_timeout, 180)

    def _extract_successful_hosts_from_output(
        self,
        output: str,
        expected_hosts: List[str],
    ) -> List[str]:
        """Return hosts that completed successfully according to PLAY RECAP."""
        recap = self.app._parse_ansible_play_recap(output or "")
        successful_hosts = []
        for host_name in expected_hosts:
            result = recap.get(host_name, {})
            if result.get('status') == 'connected':
                successful_hosts.append(host_name)
        return successful_hosts

    def _get_empty_registered_machine_records(self) -> Dict[str, Dict[str, Any]]:
        """Return registered machines that currently have no assigned instances."""
        fleet_state = self.app.get_fleet_state_copy()
        machines = fleet_state.get('fleet', {}).get('machines', {})
        return {
            machine_id: dict(machine_data)
            for machine_id, machine_data in machines.items()
            if not machine_data.get('instance_names')
        }

    def _update_machine_deployment_state(self, machine_id: str, deployment_state: str) -> None:
        """Persist a machine deployment-state change."""
        fleet_state = self.app.get_fleet_state_copy()
        machine_record = dict(fleet_state.get('fleet', {}).get('machines', {}).get(machine_id, {}))
        if not machine_record:
            return
        machine_record['deployment_state'] = deployment_state
        self.app.upsert_machine_record(machine_id, machine_record)

    @staticmethod
    def _extract_successful_machine_ids(
        execution_inventory: Dict[str, Any],
        successful_execution_hosts: List[str],
    ) -> List[str]:
        """Translate successful inventory host aliases into fleet machine ids.

        Each alias is looked up in the hosts returned by ``_get_gpu_hosts`` for
        ``execution_inventory``; its ``r1setup_machine_id`` is stringified and
        stripped. Aliases that are unknown or carry no machine id are dropped.
        """
        hosts_by_alias = _get_gpu_hosts(execution_inventory)
        candidate_ids = (
            str(hosts_by_alias.get(alias, {}).get('r1setup_machine_id') or '').strip()
            for alias in successful_execution_hosts
        )
        return [machine_id for machine_id in candidate_ids if machine_id]

    def prepare_registered_machines(self, *, skip_gpu: bool = False) -> None:
        """Prepare registered machines that do not yet have assigned instances.

        Interactive flow: lets the user pick among registered machines without
        instances, prints a summary, asks for confirmation, runs
        ``playbooks/prepare_machine.yml`` against the selection, then stores a
        per-machine ``deployment_state`` of ``'prepared'`` or ``'error'`` and
        prints a result table.

        Args:
            skip_gpu: When True, ``{'skip_gpu': True}`` is passed to the
                playbook as extra vars and the NVIDIA step is left out of the
                on-screen summary.
        """
        # Nothing can be prepared without an active configuration.
        if not self.app.active_config.get('config_name'):
            self.app.print_colored("No active configuration. Create or load one first.", 'red')
            self.app.wait_for_enter()
            return

        self.app.print_header("Prepare Registered Machines")
        self.app.load_configuration()
        machine_records = self._get_empty_registered_machine_records()

        if not machine_records:
            self.app.print_colored("No registered empty machines are available.", 'yellow')
            self.app.print_colored(
                "Register a machine first or assign instances before using this preparation flow.",
                'white',
            )
            self.app.wait_for_enter()
            return

        # Let the user choose the machines; all candidates start preselected.
        operation_name = "machine preparation"
        selected_machine_ids = self.app.select_registered_machines(
            machine_records,
            operation_name,
            preselect_mode='all',
        )
        if not selected_machine_ids:
            self.app.print_colored("Machine preparation cancelled - no machines selected.", 'yellow')
            self.app.wait_for_enter()
            return

        # Summary of what is about to run and on which machines.
        env = self.app.get_mnl_app_env()
        self.app.print_colored("📋 Machine Preparation Details:", 'cyan', bold=True)
        self.app.print_colored(
            f"   • Action: {'Install Docker + NVIDIA prerequisites' if not skip_gpu else 'Install Docker prerequisites only'}",
            'white',
        )
        self.app.print_colored(f"   • Network context: {env if env else 'Not set'}", 'white')
        self.app.print_colored(f"   • Selected Machines: {len(selected_machine_ids)}/{len(machine_records)}", 'white')
        print()
        self.app.print_colored("🖥️  Selected Registered Machines:", 'cyan', bold=True)
        for machine_id in selected_machine_ids:
            machine_record = machine_records[machine_id]
            connection_display = self.app.config_manager._format_machine_connection_display(machine_record)
            specs_summary = self.app.config_manager._format_machine_specs_summary(machine_record.get('machine_specs'))
            topology_mode = machine_record.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE)
            deployment_state = machine_record.get('deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE)
            self.app.print_colored(
                f"   • {machine_id}: {connection_display} | mode={topology_mode} | state={deployment_state}",
                'white',
            )
            if specs_summary:
                self.app.print_colored(f"     specs: {specs_summary}", 'cyan')

        self.app.print_colored("\n⚠️  This will:", 'yellow', bold=True)
        self.app.print_colored("   • Install machine prerequisites", 'yellow')
        self.app.print_colored("   • Install and configure Docker", 'yellow')
        if not skip_gpu:
            self.app.print_colored("   • Install NVIDIA drivers and CUDA prerequisites when applicable", 'yellow')
        self.app.print_colored("   • Not deploy or start any Edge Node instance", 'yellow')
        # Preparation uses a raised timeout floor (see _machine_preparation_timeout).
        prep_timeout = self._machine_preparation_timeout()
        self.app.print_colored(
            f"\nℹ️  First-time machine preparation can take several minutes. Timeout for this phase: {prep_timeout}s (base setting: {self.app.connection_timeout}s).",
            'cyan',
        )

        # Only a literal "y" (case-insensitive) proceeds; the prompt default is "y".
        if self.app.get_input(
            f"\n🚀 Continue with preparation for {len(selected_machine_ids)} registered machine(s)? (y/n)",
            "y",
        ).lower() != 'y':
            self.app.print_colored("Machine preparation cancelled.", 'yellow')
            return

        playbook_path = self.app.config_dir / 'playbooks/prepare_machine.yml'
        if not playbook_path.exists():
            self.app.print_colored(f"Playbook not found: {playbook_path}", 'red')
            self.app.wait_for_enter()
            return

        extra_vars = {'skip_gpu': True} if skip_gpu else None
        success, output, executed_machine_hosts, execution_inventory = self.app.run_registered_machine_playbook(
            playbook_path,
            selected_machine_ids,
            extra_vars=extra_vars,
            show_output=True,
            timeout=prep_timeout,
            fleet_state=self.app.get_fleet_state_copy(),
        )

        # Per-machine outcome comes from the PLAY RECAP, mapped from inventory
        # aliases back to machine ids. Every selected machine that is not
        # reported successful is treated as failed.
        successful_execution_hosts = self._extract_successful_hosts_from_output(output, executed_machine_hosts)
        successful_machine_ids = self._extract_successful_machine_ids(execution_inventory, successful_execution_hosts)
        failed_machine_ids = [
            machine_id
            for machine_id in selected_machine_ids
            if machine_id not in successful_machine_ids
        ]

        # Persist the outcome on each machine record.
        for machine_id in successful_machine_ids:
            self._update_machine_deployment_state(machine_id, 'prepared')
        for machine_id in failed_machine_ids:
            self._update_machine_deployment_state(machine_id, 'error')

        # Headline: full success, partial success, or total failure.
        if success and successful_machine_ids:
            self.app.print_colored(
                f"\n✅ Machine preparation succeeded on {len(successful_machine_ids)} machine(s)",
                'green',
            )
        elif successful_machine_ids:
            self.app.print_colored(
                f"\n⚠️  Machine preparation partially succeeded on {len(successful_machine_ids)} machine(s)",
                'yellow',
            )
        else:
            self.app.print_colored("\n❌ Machine preparation failed on all selected machines.", 'red')

        self.app.print_colored("\n📊 Machine Preparation Results:", 'cyan', bold=True)
        for machine_id in selected_machine_ids:
            machine_record = machine_records.get(machine_id, {})
            connection_display = self.app.config_manager._format_machine_connection_display(machine_record)
            if machine_id in successful_machine_ids:
                result_text = "✅ PREPARED"
                color = 'green'
            else:
                result_text = "❌ ERROR"
                color = 'red'
            self.app.print_colored(f"   • {machine_id}: {connection_display} [{result_text}]", color)

        self.app.print_colored(
            "\nℹ️  Preparation finished at machine scope only. No Edge Node instances were deployed or started.",
            'cyan',
        )

        self.app.wait_for_enter()

    def deploy_full(self) -> None:
        """Install GPU Nodes — deploys the GPU edge_node image.

        Sub-prompts whether r1setup should install/manage the NVIDIA drivers
        (Mode 2) or whether the user has already set them up themselves
        (Mode 3). Mode 3 skips the nvidia_gpu role entirely and runs a
        preflight nvidia-smi check before the Docker pull.
        """
        self.app.print_header("Install GPU Nodes")
        self.app.print_colored(
            "\nThis deploys the GPU image (ratio1/edge_node_gpu:{env}) on the hosts you select.\n"
            "GPU installs never silently fall back to CPU — if anything goes wrong, the affected\n"
            "hosts fail loudly and are listed for retry.",
            'cyan',
        )
        self.app.print_colored(
            "\nShould r1setup install and manage the NVIDIA drivers on the selected hosts?",
            'yellow',
            bold=True,
        )
        self.app.print_colored(
            "   Y = r1setup installs drivers + NVIDIA Container Toolkit (default)",
            'white',
        )
        self.app.print_colored(
            "   N = drivers are already installed by you; r1setup only verifies nvidia-smi and deploys the image",
            'white',
        )
        answer = self.app.get_input(
            "Install/manage NVIDIA drivers? (Y/n)", "Y"
        ).strip().lower()
        manage_drivers = answer not in ('n', 'no')

        if manage_drivers:
            description = "Docker + NVIDIA drivers + GPU image deploy"
            last_applied_action = "deploy_gpu_managed"
        else:
            description = "GPU image deploy (user-managed NVIDIA drivers)"
            last_applied_action = "deploy_gpu_user_drivers"

        self._deploy_setup(
            "site.yml",
            "Install GPU Nodes",
            description,
            variant="gpu",
            manage_drivers=manage_drivers,
            last_applied_action=last_applied_action,
        )

    @staticmethod
    def _derive_variant_from_probe(probe: Dict[str, Any]) -> Dict[str, Any]:
        """Map a probe-summary dict (from probe_install_state.yml) to
        (variant, driver_owner, applied_at). Returns None values when there's
        no concrete evidence, letting the caller skip the host rather than
        guess. Prefers, in order:
          1. image_variant recorded by a prior 1.8.0+ deploy (authoritative).
          2. The current docker image name (suffix-based).
          3. The systemd unit having --gpus (last-resort for deploys where
             the container is gone and no metadata JSON exists).
        """
        prior = probe.get('prior_metadata') or {}
        image = (probe.get('docker_image') or '').strip()
        variant: Optional[str] = None
        driver_owner: Optional[str] = None
        applied_at = prior.get('last_applied_at') or prior.get('applied_at')

        # 1. Authoritative: prior metadata written by 1.8.0+.
        if prior.get('image_variant') in ('cpu', 'gpu'):
            variant = prior['image_variant']
            if prior.get('driver_owner') in ('r1setup', 'user', 'n/a'):
                driver_owner = prior['driver_owner']

        # 2. Derive from current docker image suffix.
        if variant is None and image:
            variant = 'gpu' if image.endswith('_gpu') or '_gpu:' in image else 'cpu'

        # 3. Last-resort: systemd unit has --gpus and nothing else survived.
        if variant is None and probe.get('systemd_has_gpu_flag'):
            variant = 'gpu'

        if variant is None:
            # No concrete evidence — skip rather than guess. Prior metadata
            # with only a timestamp is not enough to pick CPU vs GPU.
            return {'variant': None, 'driver_owner': None, 'applied_at': None}

        # Derive driver_owner only if prior metadata didn't already carry it.
        if driver_owner is None:
            if variant == 'gpu':
                # nvidia_gpu role installs the NVIDIA Container Toolkit (Mode 2).
                # Toolkit + working nvidia-smi => r1setup-managed; working
                # nvidia-smi without toolkit => user-managed; no working
                # nvidia-smi at all but GPU image was deployed => the drivers
                # are currently broken. Classify those as 'user' so we don't
                # incorrectly attribute them to r1setup.
                if probe.get('nvidia_container_toolkit') and probe.get('nvidia_smi_works'):
                    driver_owner = 'r1setup'
                else:
                    driver_owner = 'user'
            else:
                driver_owner = 'n/a'

        return {'variant': variant, 'driver_owner': driver_owner, 'applied_at': applied_at}

    def migrate_install_tracking(self) -> None:
        """Probe hosts and back-populate the 1.8.0 install-tracking fields.

        Interactive flow: selects hosts (preselecting those whose
        ``node_status`` is running/stopped/error but which have no
        install-tracking variant yet), runs ``playbooks/probe_install_state.yml``
        on them, reads each reachable host's fetched probe JSON, derives the
        image variant and driver owner, records them through the app, and
        prints a summary of migrated, skipped and unreachable hosts.

        The probe is intended to be read-only on the target hosts (no package
        or config changes); the playbook itself is not part of this method.

        A probe file that is unreadable, is not valid JSON, or is valid JSON
        but not an object is reported as skipped for that host instead of
        aborting the whole migration.
        """
        if not self.app.check_hosts_config():
            self.app.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.app.wait_for_enter()
            return

        self.app.print_header("Migrate Install State")
        self.app.print_colored(
            "\nProbes each selected host for its currently-deployed image, NVIDIA\n"
            "driver state, and prior r1setup metadata, then back-populates the\n"
            "per-host install-tracking fields introduced in 1.8.0.\n\n"
            "Read-only: no packages are installed, no configs are changed on\n"
            "the target hosts. Safe to run on live fleets.",
            'cyan',
        )

        self.app.load_configuration()
        all_hosts = _get_gpu_hosts(self.app.inventory)

        # Default to hosts that look like they were deployed before 1.8.0:
        # node_status indicates a deploy happened, but install-tracking is
        # still empty.
        candidates = {
            name for name, cfg in all_hosts.items()
            if cfg.get('node_status') in ('running', 'stopped', 'error')
            and not cfg.get(INSTALL_LAST_VARIANT_FIELD)
        }
        preselect_label = None
        if candidates:
            preselect_label = "hosts deployed before 1.8.0 (running, no install tracking yet)"

        selected = self.app.select_hosts(
            all_hosts, "install-state migration",
            preselect_mode='none' if not candidates else 'all',
            initial_selection=candidates if candidates else None,
            preselection_label=preselect_label,
        )
        if not selected:
            self.app.print_colored("Migration cancelled - no hosts selected.", 'yellow')
            self.app.wait_for_enter()
            return

        playbook = self.app.config_dir / 'playbooks/probe_install_state.yml'
        if not playbook.exists():
            self.app.print_colored(f"Probe playbook not found: {playbook}", 'red')
            self.app.wait_for_enter()
            return

        self.app.print_colored(f"\nProbing {len(selected)} host(s)...", 'cyan')
        success, output, executed_hosts, _ = self.app.run_generated_playbook(
            playbook,
            selected,
            machine_scope=False,
            last_applied_action='migrate_install_tracking',
            show_output=True,
            timeout=self.app.connection_timeout,
        )
        reachable = self._extract_successful_hosts_from_output(output, executed_hosts)
        unreachable = [h for h in selected if h not in reachable]

        # Fetched probes land at /tmp/r1setup-fetched/probe-<host>.json
        # NOTE(review): a file left over from an earlier run would be read as
        # if it were fresh — confirm the playbook always overwrites it.
        fetched_dir = Path('/tmp/r1setup-fetched')
        migrated: List[str] = []
        skipped: List[Tuple[str, str]] = []
        for host in reachable:
            probe_path = fetched_dir / f"probe-{host}.json"
            if not probe_path.exists():
                skipped.append((host, "probe result not fetched"))
                continue
            try:
                # JSON interchange is UTF-8; don't depend on the locale encoding.
                # UnicodeDecodeError and JSONDecodeError are both ValueErrors.
                probe = json.loads(probe_path.read_text(encoding='utf-8'))
            except (OSError, ValueError) as exc:
                skipped.append((host, f"probe parse error: {exc}"))
                continue
            if not isinstance(probe, dict):
                # Valid JSON but not an object (e.g. null or a list): there is
                # nothing to derive from, and the derivation expects a dict.
                skipped.append((host, "probe result is not a JSON object"))
                continue
            derived = self._derive_variant_from_probe(probe)
            if not derived['variant']:
                skipped.append((host, "no deploy evidence on host"))
                continue
            variant = derived['variant']
            owner = derived['driver_owner']
            self.app.record_install_attempt([host], variant, owner, "success")
            self.app.record_install_success([host], variant, owner)
            migrated.append(f"{host}: {variant.upper()} ({owner})")

        # Report.
        self.app.print_colored("\n📊 Migration Summary:", 'cyan', bold=True)
        if migrated:
            self.app.print_colored(f"   ✅ Back-populated {len(migrated)} host(s):", 'green')
            for line in migrated:
                self.app.print_colored(f"      • {line}", 'white')
        if skipped:
            self.app.print_colored(f"   ⚠️  Skipped {len(skipped)} host(s):", 'yellow')
            for host, reason in skipped:
                self.app.print_colored(f"      • {host}: {reason}", 'yellow')
        if unreachable:
            self.app.print_colored(f"   ❌ Unreachable {len(unreachable)} host(s):", 'red')
            for host in unreachable:
                self.app.print_colored(f"      • {host}", 'red')
            self.app.print_colored(
                "   Retry the migration after these hosts come back online.",
                'yellow',
            )
        if not success and not migrated:
            self.app.print_colored("\n❌ Probe playbook failed. See output above.", 'red')

        self.app.wait_for_enter()

    def deploy_docker_only(self) -> None:
        """Install CPU Nodes — deploys the CPU edge_node image, skips all GPU setup."""
        self._deploy_setup(
            "site.yml",
            "Install CPU Nodes",
            "Docker only (CPU image deploy)",
            variant="cpu",
            manage_drivers=False,
            last_applied_action="deploy_cpu",
        )

    def delete_edge_node(self) -> None:
        """Delete deployed Edge Node with host selection.

        Interactive flow: the user selects hosts (all preselected), reviews
        what will be removed, confirms twice (a ``y`` prompt, then typing
        ``DELETE``), after which ``playbooks/delete_edge_node.yml`` is run on
        the selection. Selected nodes are marked ``'deploying'`` before the
        run and ``'deleted'`` or ``'error'`` afterwards.
        """
        if not self.app.check_hosts_config():
            self.app.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.app.wait_for_enter()
            return

        self.app.print_header("Delete Edge Node Deployment")

        # Load configuration to show target hosts
        self.app.load_configuration()
        all_hosts = _get_gpu_hosts(self.app.inventory)
        env = self.app.get_mnl_app_env()

        # Interactive host selection for deletion
        selected_host_names = self.app.select_hosts(all_hosts, "Edge Node deletion", preselect_mode='all')

        if not selected_host_names:
            self.app.print_colored("Deletion cancelled - no hosts selected.", 'yellow')
            self.app.wait_for_enter()
            return

        # Abort when the app reports the selection as unsupported for this action.
        if not self.app._ensure_helper_mode_supported_for_hosts(
            selected_host_names,
            action_label="delete selected nodes",
        ):
            self.app.wait_for_enter()
            return

        # Filter hosts to only include selected ones
        # (used below only for its size in the summary and prompts)
        selected_hosts = {name: config for name, config in all_hosts.items() if name in selected_host_names}

        # Show deletion details
        self.app.print_colored(f"🗑️  Deletion Details:", 'cyan', bold=True)
        self.app.print_colored(f"   • Network: {env if env else 'Not set'}", 'white')
        self.app.print_colored(f"   • Selected Nodes: {len(selected_hosts)}/{len(all_hosts)}", 'white')

        self.app.print_colored(f"\n🖥️  Edge Nodes will be deleted from these selected machines:", 'cyan', bold=True)
        for name in selected_host_names:
            config = all_hosts[name]
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            # Show current status
            status_info = self.app._get_node_status_info(name)
            status_emoji, _, status_desc = self.app._get_status_display_info(status_info['status'])
            self.app.print_colored(f"   • {name}: {user}@{ip} [{status_emoji} {status_desc}]", 'white')

        self.app.print_colored("\n⚠️  WARNING: This will completely remove the Edge Node deployment including:", 'red', bold=True)
        self.app.print_colored("   • Systemd service and Docker containers", 'yellow')
        self.app.print_colored("   • Docker images and application data", 'yellow')
        self.app.print_colored("   • Created command scripts", 'yellow')
        self.app.print_colored("   • Docker daemon configuration", 'yellow')

        # First confirmation: the prompt default is "n", so an empty answer cancels.
        if self.app.get_input(f"\n⚠️  Are you sure you want to delete Edge Node deployment from {len(selected_hosts)} selected machine(s) (y/n)", "n").lower() != 'y':
            self.app.print_colored("Deletion cancelled.", 'yellow')
            return

        # Final confirmation
        # (compared after upper-casing, so "delete" is accepted as well)
        if self.app.get_input(f"⚠️  Type 'DELETE' to confirm deletion from {len(selected_hosts)} selected node(s)", "").upper() != 'DELETE':
            self.app.print_colored("Deletion cancelled - confirmation not received.", 'yellow')
            return

        playbook_path = self.app.config_dir / 'playbooks/delete_edge_node.yml'
        if not playbook_path.exists():
            self.app.print_colored(f"Delete Edge Node playbook not found: {playbook_path}", 'red')
            self.app.wait_for_enter()
            return

        # Update selected node statuses to deploying (deletion is a deployment operation)
        for host_name in selected_host_names:
            self.app._update_node_status(host_name, 'deploying')

        self.app.print_colored("\nStarting Edge Node deletion on selected nodes...", 'cyan')
        # NOTE(review): only the overall success flag is used; the playbook
        # output is discarded, so every selected node gets the same final
        # status even if hosts ended differently.
        success, _, _, _ = self.app.run_generated_playbook(
            playbook_path,
            selected_host_names,
            machine_scope=False,
            last_applied_action='delete_edge_node',
            show_output=True,
            timeout=self.app.connection_timeout,
        )

        if success:
            self.app.print_colored(f"\n✅ Deletion succeeded on {len(selected_host_names)} node(s)", 'green')
            # Update deployment metadata after successful deletion
            self._update_deletion_metadata()

            # Update selected node statuses to deleted after successful deletion
            for host_name in selected_host_names:
                self.app._update_node_status(host_name, 'deleted')

            # Show updated statuses
            self.app.print_colored(f"\n📊 Node Deletion Status:", 'cyan', bold=True)
            for host_name in selected_host_names:
                self.app.print_colored(f"   • {host_name}: ", 'white', end='')
                self.app._display_node_status(host_name, compact=True)
                print()  # New line after each status

            self.app.print_colored(f"\n💡 Note: Deleted nodes will maintain their 'deleted' status.", 'cyan')
            self.app.print_colored("   This status will not change automatically to preserve the deletion history.", 'white')
        else:
            self.app.print_colored(f"\n❌ Deletion failed. Check output above for details.", 'red')
            # Update selected node statuses to error after failed deletion
            for host_name in selected_host_names:
                self.app._update_node_status(host_name, 'error')
            self.app.print_colored(f"\n📊 Selected node statuses updated to Error due to deletion failure.", 'yellow')

        self.app.wait_for_enter()

    @staticmethod
    def _build_install_extra_vars(
        variant: str,
        manage_drivers: bool,
    ) -> Dict[str, Any]:
        """Validate the (variant, manage_drivers) pair and return the
        extra-vars dict that the Ansible playbook expects.

        Legal combinations:
          - ('cpu', False) -> Mode 1 (CPU install)
          - ('gpu', True)  -> Mode 2 (GPU install, r1setup manages drivers)
          - ('gpu', False) -> Mode 3 (GPU image only, user-managed drivers)

        Rejects 'cpu' + manage_drivers=True: installing drivers for a CPU
        deploy makes no sense and almost always signals a refactor bug.
        """
        if variant not in ('cpu', 'gpu'):
            raise ValueError(f"Invalid variant {variant!r}; expected 'cpu' or 'gpu'")
        if variant == 'cpu' and manage_drivers:
            raise ValueError(
                "Illegal install mode: variant='cpu' with manage_drivers=True. "
                "CPU install does not install NVIDIA drivers."
            )

        extra_vars: Dict[str, Any] = {"mnl_image_variant_cli": variant}
        # skip_gpu is true for Mode 1 and Mode 3 — in both, we don't run
        # the nvidia_gpu role. Only Mode 2 (manage_drivers=True) runs it.
        if not manage_drivers:
            extra_vars["skip_gpu"] = True
        return extra_vars

    def _deploy_setup(
        self,
        playbook: str,
        title: str,
        description: str,
        variant: str,
        manage_drivers: bool,
        last_applied_action: str = "deploy",
    ) -> None:
        """Common deployment logic with host selection.

        variant: 'cpu' or 'gpu' — selects the image variant.
        manage_drivers: when True (Mode 2), r1setup runs the nvidia_gpu role
            to install/manage drivers. When False for variant='gpu' (Mode 3),
            the nvidia_gpu role is skipped and a preflight nvidia-smi check
            runs instead. For variant='cpu', always False.
        """
        extra_vars = self._build_install_extra_vars(variant, manage_drivers)

        if not self.app.check_hosts_config():
            self.app.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.app.wait_for_enter()
            return

        self.app.print_header(title)

        # Load configuration to show deployment details
        self.app.load_configuration()
        all_hosts = _get_gpu_hosts(self.app.inventory)
        env = self.app.get_mnl_app_env()

        if not env:
            self.app.print_colored("\n⚠️  WARNING: Network environment is not set!", 'red', bold=True)
            self.app.print_colored("   Please set the network environment before deploying.", 'red')
            self.app.wait_for_enter()
            return

        # Interactive host selection with preselection of never-deployed nodes
        selected_host_names = self.app.select_hosts(all_hosts, title.lower(), preselect_mode='undeployed')

        if not selected_host_names:
            self.app.print_colored("Deployment cancelled - no hosts selected.", 'yellow')
            self.app.wait_for_enter()
            return

        if not self.app._ensure_helper_mode_supported_for_hosts(
            selected_host_names,
            action_label="deploy selected nodes",
        ):
            self.app.wait_for_enter()
            return

        # Filter hosts to only include selected ones
        selected_hosts = {name: config for name, config in all_hosts.items() if name in selected_host_names}

        # Show deployment details
        self.app.print_colored(f"📋 Deployment Details:", 'cyan', bold=True)
        self.app.print_colored(f"   • Action: {description}", 'white')
        self.app.print_colored(f"   • Network: {env}", 'green')
        self.app.print_colored(f"   • Selected Nodes: {len(selected_hosts)}/{len(all_hosts)}", 'white')

        self.app.print_colored(f"\n🖥️  Selected Target Machines:", 'cyan', bold=True)
        for name in selected_host_names:
            config = all_hosts[name]
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            # Show status
            status_info = self.app._get_node_status_info(name)
            status_emoji, _, status_desc = self.app._get_status_display_info(status_info['status'])
            self.app.print_colored(f"   • {name}: {user}@{ip} [{status_emoji} {status_desc}]", 'white')

        # Mode-specific confirmation copy, keyed on (variant, manage_drivers).
        self.app.print_colored(f"\n⚠️  This will:", 'yellow', bold=True)
        if variant == "gpu" and manage_drivers:
            # Mode 2 — r1setup installs/manages NVIDIA drivers.
            self.app.print_colored("   • Install Docker and Docker Compose", 'yellow')
            self.app.print_colored("   • Install NVIDIA drivers + NVIDIA Container Toolkit", 'yellow')
            self.app.print_colored(f"   • Pull the GPU image (ratio1/edge_node_gpu:{env})", 'yellow')
            self.app.print_colored("   • Deploy and start the Edge Node with GPU access", 'yellow')
            self.app.print_colored("   • Abort per host if GPU setup fails — no silent CPU fallback", 'yellow')
        elif variant == "gpu" and not manage_drivers:
            # Mode 3 — user-managed drivers.
            self.app.print_colored("   • Install Docker and Docker Compose", 'yellow')
            self.app.print_colored("   • Preflight nvidia-smi (r1setup will NOT touch your NVIDIA drivers)", 'yellow')
            self.app.print_colored(f"   • Pull the GPU image (ratio1/edge_node_gpu:{env})", 'yellow')
            self.app.print_colored("   • Deploy and start the Edge Node with GPU access", 'yellow')
            self.app.print_colored("   • Abort per host if nvidia-smi fails — no silent CPU fallback, no driver changes", 'yellow')
        else:
            # Mode 1 — CPU install.
            self.app.print_colored("   • Install Docker and Docker Compose", 'yellow')
            self.app.print_colored(f"   • Pull the CPU image (ratio1/edge_node:{env})", 'yellow')
            self.app.print_colored("   • Deploy and start the Edge Node in CPU mode", 'yellow')

        # Check for network change warning for selected hosts
        deployment_status = self.app.active_config.get('deployment_status', 'never_deployed')
        last_deployed_network = self.app.active_config.get('last_deployed_network')

        if deployment_status == 'deployed' and last_deployed_network and last_deployed_network != env:
            self.app.print_colored(f"\n🚨 NETWORK CHANGE DETECTED!", 'red', bold=True)
            self.app.print_colored(f"   This configuration was previously deployed on: {last_deployed_network}", 'yellow')
            self.app.print_colored(f"   You are now deploying to: {env}", 'cyan')
            self.app.print_colored(f"\n⚠️  Important Information:", 'yellow', bold=True)
            self.app.print_colored(f"   • The Edge Node will run on the NEW network: {env}", 'white')
            self.app.print_colored(f"   • Selected node addresses will be used:", 'white')
            for name in selected_host_names:
                config = all_hosts[name]
                ip = config.get('ansible_host', 'Unknown')
                self.app.print_colored(f"     - {name}: {ip}", 'white')
            self.app.print_colored(f"   • Node configuration and credentials remain the same", 'white')
            self.app.print_colored(f"   • Only the blockchain network environment changes", 'white')

            self.app.print_colored(f"\n❓ Type 'yes' to confirm deployment to {env} (different from previous {last_deployed_network})", 'yellow')
            if self.app.get_input("Confirm network change", "").lower() != 'yes':
                self.app.print_colored("Deployment cancelled.", 'yellow')
                return

        if self.app.get_input(f"\n🚀 Continue with deployment to {len(selected_hosts)} selected node(s) on {env}? (y/n)", "y").lower() != 'y':
            self.app.print_colored("Deployment cancelled.", 'yellow')
            return

        machine_prepare_path = self.app.config_dir / 'playbooks/prepare_machine.yml'
        instance_apply_path = self.app.config_dir / 'playbooks/apply_instance.yml'
        if not machine_prepare_path.exists():
            self.app.print_colored(f"Playbook not found: {machine_prepare_path}", 'red')
            self.app.wait_for_enter()
            return
        if not instance_apply_path.exists():
            self.app.print_colored(f"Playbook not found: {instance_apply_path}", 'red')
            self.app.wait_for_enter()
            return

        # Update selected node statuses to deploying
        for host_name in selected_host_names:
            self.app._update_node_status(host_name, 'deploying')

        machine_groups = self.app.group_host_names_by_machine(selected_host_names)
        machine_host_names = [detail['representative_host'] for detail in machine_groups.values()]
        machine_prepare_timeout = self._machine_preparation_timeout()
        instance_apply_timeout = self._instance_apply_timeout()
        self.app.print_colored(
            f"\nPreparing {len(machine_host_names)} machine(s) for {len(selected_host_names)} selected instance(s)...",
            'cyan',
        )
        self.app.print_colored(
            f"Fresh-machine preparation can take several minutes. Timeout for machine preparation: {machine_prepare_timeout}s (base setting: {self.app.connection_timeout}s).",
            'cyan',
        )

        machine_success, machine_output, executed_machine_hosts, _ = self.app.run_generated_playbook(
            machine_prepare_path,
            selected_host_names,
            machine_scope=True,
            extra_vars=extra_vars,
            last_applied_action=f"{last_applied_action}_machine_prepare",
            show_output=True,
            timeout=machine_prepare_timeout,
        )
        successful_machine_hosts = self._extract_successful_hosts_from_output(machine_output, executed_machine_hosts)
        successful_machine_ids = {
            self.app.config_manager._derive_machine_id(host_name, all_hosts[host_name])
            for host_name in successful_machine_hosts
            if host_name in all_hosts
        }
        deployable_host_names = [
            host_name
            for host_name in selected_host_names
            if self.app.config_manager._derive_machine_id(host_name, all_hosts[host_name]) in successful_machine_ids
        ]
        skipped_host_names = [host_name for host_name in selected_host_names if host_name not in deployable_host_names]

        if skipped_host_names:
            self.app.print_colored(
                f"\n⚠️  Machine preparation failed or was incomplete for: {', '.join(skipped_host_names)}",
                'yellow',
            )
            for host_name in skipped_host_names:
                self.app._update_node_status(host_name, 'error')

        if not deployable_host_names:
            self.app.print_colored(
                "\n❌ No machines completed preparation successfully. Instance apply phase was skipped.",
                'red',
            )
            self.app.wait_for_enter()
            return

        self.app.print_colored(
            f"\nApplying instance runtime on {len(deployable_host_names)} selected node(s)...",
            'cyan',
        )
        self.app.print_colored(
            f"Instance apply and startup can take longer than a status check. Timeout for this phase: {instance_apply_timeout}s (base setting: {self.app.connection_timeout}s).",
            'cyan',
        )
        success, instance_output, _, _ = self.app.run_generated_playbook(
            instance_apply_path,
            deployable_host_names,
            machine_scope=False,
            extra_vars=extra_vars,
            last_applied_action=last_applied_action,
            show_output=True,
            timeout=instance_apply_timeout,
        )
        successful_instance_hosts = self._extract_successful_hosts_from_output(instance_output, deployable_host_names)
        failed_instance_hosts = [host_name for host_name in deployable_host_names if host_name not in successful_instance_hosts]

        # Record install attempt + success on every run (not only on the
        # overall-success branch below) so per-host history reflects
        # reality for mixed-fleet partial failures.
        driver_owner = ConfigurationManager._derive_driver_owner(variant, manage_drivers)
        if successful_instance_hosts:
            self.app.record_install_attempt(successful_instance_hosts, variant, driver_owner, "success")
            self.app.record_install_success(successful_instance_hosts, variant, driver_owner)
        failed_for_tracking = list(failed_instance_hosts) + list(skipped_host_names)
        if failed_for_tracking:
            self.app.record_install_attempt(failed_for_tracking, variant, driver_owner, "failed")

        if success:
            self.app.print_colored(f"\n✅ Deployment succeeded on {len(successful_instance_hosts)} node(s)", 'green')
            # Update deployment metadata after successful deployment.
            # deployment_type derives from variant (not a description string sniff).
            deployment_type = "full" if variant == "gpu" else "docker_only"
            self._update_deployment_metadata(deployment_type)
            self.app.record_service_file_version(successful_instance_hosts)

            # Update selected node statuses to running after successful deployment
            for host_name in successful_instance_hosts:
                self.app._update_node_status(host_name, 'running')
            for host_name in failed_instance_hosts:
                self.app._update_node_status(host_name, 'error')

            # Show updated statuses
            self.app.print_colored(f"\n📊 Node Deployment Status:", 'cyan', bold=True)
            for host_name in successful_instance_hosts + failed_instance_hosts + skipped_host_names:
                self.app.print_colored(f"   • {host_name}: ", 'white', end='')
                self.app._display_node_status(host_name, compact=True)
                print()  # New line after each status

            # Image Summary — per-host variant / URL pulled from fetched metadata.
            self._render_image_summary(successful_instance_hosts)

            # Display copy-friendly node addresses after successful deployment
            if successful_instance_hosts:
                self.app._display_copy_friendly_addresses(successful_instance_hosts)
        else:
            self.app.print_colored(f"\n❌ Instance apply phase failed. Check output above for details.", 'red')
            # Update selected node statuses to error after failed deployment
            for host_name in deployable_host_names:
                self.app._update_node_status(host_name, 'error')
            for host_name in skipped_host_names:
                self.app._update_node_status(host_name, 'error')
            self.app.print_colored(f"\n📊 Selected node statuses updated to Error due to deployment failure.", 'yellow')

        # Mode-specific failure hint: only for GPU runs with failed hosts.
        all_failed = list(failed_instance_hosts) + list(skipped_host_names)
        if variant == "gpu" and all_failed:
            self._render_gpu_failure_hint(all_failed, manage_drivers)

        self.app.wait_for_enter()

    def _render_image_summary(self, host_names: List[str]) -> None:
        """Print a per-host "Image Summary" block from the fetched metadata."""
        if not host_names:
            return
        metadata = self.app.read_fetched_metadata(host_names)
        self.app.print_colored("\n🖼️  Image Summary:", 'cyan', bold=True)
        for host in host_names:
            meta = metadata.get(host) or {}
            variant_label = (meta.get('image_variant') or '?').upper()
            image_url = meta.get('image_url') or 'unknown'
            owner = meta.get('driver_owner') or 'n/a'
            owner_suffix = f" ({owner})" if owner in ('r1setup', 'user') else ''
            self.app.print_colored(
                f"   • {host}: {variant_label}{owner_suffix} → {image_url}",
                'white',
            )

    def _render_gpu_failure_hint(self, failed_hosts: List[str], manage_drivers: bool) -> None:
        """Print a mode-specific hint after a GPU run that had failed hosts."""
        joined = ", ".join(failed_hosts)
        if manage_drivers:
            # Mode 2
            self.app.print_colored("\n⚠️  GPU install failed on: " + joined, 'yellow', bold=True)
            self.app.print_colored(
                "   Partial NVIDIA driver state has been cleaned up on these hosts.",
                'yellow',
            )
            self.app.print_colored(
                "   Re-run them via 'Install CPU Nodes', or fix the underlying driver",
                'yellow',
            )
            self.app.print_colored(
                "   issue and retry GPU install. No CPU image was deployed.",
                'yellow',
            )
        else:
            # Mode 3
            self.app.print_colored("\n⚠️  GPU image deploy failed on: " + joined, 'yellow', bold=True)
            self.app.print_colored(
                "   nvidia-smi did not work on these hosts. r1setup did NOT touch the",
                'yellow',
            )
            self.app.print_colored(
                "   drivers (you marked them as user-managed). Either install/repair",
                'yellow',
            )
            self.app.print_colored(
                "   them yourself, or retry via 'Install GPU Nodes' with driver",
                'yellow',
            )
            self.app.print_colored(
                "   management enabled. No image was deployed.",
                'yellow',
            )

    def _update_deployment_metadata(self, deployment_type: str) -> None:
        """Update deployment metadata after successful deployment"""
        self.app.print_debug(f"Updating deployment metadata with type: {deployment_type}")
        self.app.print_debug(f"Active config: {self.app.active_config}")

        if not self.app.active_config.get('config_name'):
            self.app.print_debug("No config_name in active_config, cannot update deployment metadata")
            self.app.print_colored("Warning: No configuration name found, cannot update deployment tracking", 'yellow')
            return

        config_name = self.app.active_config['config_name']
        metadata_path = self.app.configs_dir / f"{config_name}.json"
        self.app.print_debug(f"Looking for metadata file: {metadata_path}")

        if not metadata_path.exists():
            self.app.print_debug(f"Metadata file does not exist: {metadata_path}")
            self.app.print_colored(f"Warning: Metadata file not found: {metadata_path}", 'yellow')
            return

        try:
            # Load existing metadata
            with open(metadata_path) as f:
                metadata = json.load(f)

            self.app.print_debug(f"Loaded existing metadata: {metadata}")

            # Update deployment info
            current_network = self.app.get_mnl_app_env()
            metadata['last_deployed_date'] = datetime.now().isoformat()
            metadata['last_deployed_network'] = current_network
            metadata['deployment_status'] = 'deployed'
            # last_deployment_type retired: per-host variant is the truth now.
            metadata.pop('last_deployment_type', None)

            self.app.print_debug(f"Updated metadata: {metadata}")

            # Save updated metadata
            with open(metadata_path, 'w') as f:
                json.dump(metadata, f, indent=2)

            # Update active config
            self.app.active_config.update(metadata)
            self.app._save_active_config()

            self.app.print_colored(f"✅ Deployment tracking updated for configuration: {config_name}", 'green')
            self.app.print_debug(f"Successfully updated deployment metadata for {config_name}")
        except Exception as e:
            self.app.print_colored(f"Warning: Could not update deployment metadata: {e}", 'yellow')
            self.app.print_debug(f"Error updating deployment metadata: {e}")

    def _update_deletion_metadata(self) -> None:
        """Update deployment metadata after successful deletion"""
        if not self.app.active_config.get('config_name'):
            return

        config_name = self.app.active_config['config_name']
        metadata_path = self.app.configs_dir / f"{config_name}.json"

        if not metadata_path.exists():
            return

        try:
            # Load existing metadata
            with open(metadata_path) as f:
                metadata = json.load(f)

            # Update deletion info
            metadata['last_deleted_date'] = datetime.now().isoformat()
            metadata['deployment_status'] = 'deleted'
            # Keep the last deployment info for reference but mark as deleted

            # Save updated metadata
            with open(metadata_path, 'w') as f:
                json.dump(metadata, f, indent=2)

            # Update active config
            self.app.active_config.update(metadata)
            self.app._save_active_config()

            self.app.print_colored(f"✅ Deletion tracking updated for configuration: {config_name}", 'green')
        except Exception as e:
            self.app.print_colored(f"Warning: Could not update deletion metadata: {e}", 'yellow')

    def deployment_status(self) -> None:
        """Print the overall deployment state followed by a live machine overview.

        Bails out early (after a message and an Enter prompt) when no hosts
        are configured. Otherwise the active config and inventory are
        reloaded, the overall status block is printed, and a real-time status
        check feeds the per-machine display.
        """
        app = self.app

        if not app.check_hosts_config():
            app.print_colored("No nodes configured! Please configure nodes first.", 'red')
            app.wait_for_enter()
            return

        # Refresh the active config first so the status shown is current.
        app._load_active_config()
        app.print_header("Deployment Status")

        app.load_configuration()
        configured_hosts = _get_gpu_hosts(app.inventory)
        if not configured_hosts:
            app.print_colored("No nodes configured.", 'yellow')
            app.wait_for_enter()
            return

        # --- Overall deployment status -------------------------------------
        overall_state = app.active_config.get('deployment_status', 'never_deployed')
        display = app._get_deployment_display_state(
            metadata=app.active_config,
            inventory=app.inventory,
        )

        app.print_section("Overall Deployment Status")
        app.print_colored(display['status_line'], display['color'])
        if display.get('status_note'):
            app.print_colored(f"ℹ️  {display['status_note']}", 'white')

        if overall_state == 'deployed':
            network = app.active_config.get('last_deployed_network')
            variants = app.config_manager.install_variant_summary(app.inventory)
            if network:
                app.print_colored(f"🌐 Network: {network}", 'cyan')
            if variants and variants != 'no installs yet':
                app.print_colored(f"🔧 Per-host variants: {variants}", 'cyan')

        # --- Per-machine status, checked in real time ----------------------
        app.print_section("Machine Status Overview")
        app.print_colored(f"🔍 Checking current status (max {app.connection_timeout}s timeout)...", 'cyan')

        live_status = app._get_real_time_node_status()
        machine_views = app.build_machine_group_views(node_status_data=live_status)

        print()  # blank line after the "checking" message

        app.print_colored(
            f"Machines: {len(machine_views)} | Instances: {len(configured_hosts)}",
            'white',
        )
        display_lines, _ = app._build_machine_group_display_lines(machine_views)
        app._print_machine_group_display_lines(display_lines)

        app.wait_for_enter()



class MigrationPlanner:
    """Build and persist migration plans without executing data transfer."""

    def __init__(self, app) -> None:
        """Keep a reference to the application object.

        Every planner method reaches its collaborators (fleet state, config
        manager, prompts, probes) through ``self.app``.
        """
        self.app = app

    def _get_plannable_instances(self) -> Dict[str, Dict[str, Any]]:
        """Return logical instances that have a valid current machine assignment."""
        fleet_state = self.app.get_fleet_state_copy()
        instances = fleet_state.get('fleet', {}).get('instances', {})
        machines = fleet_state.get('fleet', {}).get('machines', {})
        return {
            instance_name: dict(instance_data)
            for instance_name, instance_data in instances.items()
            if instance_data.get('assigned_machine_id') in machines
        }

    def _select_migration_source_instance(self, plannable_instances: Dict[str, Dict[str, Any]]) -> Optional[str]:
        """Select one logical instance to migrate."""
        instance_names = sorted(plannable_instances.keys())
        if not instance_names:
            return None

        while True:
            self.app.print_header("Select Instance For Migration")
            self.app.print_colored("Choose the logical instance you want to move.", 'cyan')
            print()
            for index, instance_name in enumerate(instance_names, start=1):
                instance_data = plannable_instances[instance_name]
                source_machine_id = instance_data.get('assigned_machine_id', 'unknown')
                runtime = instance_data.get('runtime') or {}
                self.app.print_colored(
                    f"  {index}) {instance_name} | source={source_machine_id} | service={runtime.get('service_name', '?')}",
                    'white',
                )
            print()
            self.app.print_colored("  0) Cancel", 'white')
            print()

            choice = self.app.get_input("Select instance", "0").strip()
            if choice == '0':
                return None
            try:
                selection = int(choice)
            except ValueError:
                self.app.print_colored("Invalid choice. Please enter a number.", 'red')
                self.app.wait_for_enter()
                continue
            if 1 <= selection <= len(instance_names):
                return instance_names[selection - 1]
            self.app.print_colored(f"Invalid option. Valid choices are 0-{len(instance_names)}.", 'red')
            self.app.wait_for_enter()

    def _confirm_source_machine(self, source_machine_id: str, source_machine_record: Dict[str, Any]) -> bool:
        """Confirm the resolved current source machine for the selected instance."""
        connection_display = self.app.config_manager._format_machine_connection_display(source_machine_record)
        self.app.print_colored(
            f"Resolved source machine: {source_machine_id} ({connection_display})",
            'cyan',
        )
        return self.app.get_input("Use this as the source machine? (Y/n)", "Y").lower() == 'y'

    def _select_migration_target_machine(
        self,
        fleet_state: Dict[str, Any],
        source_machine_id: str,
    ) -> Optional[str]:
        """Select a target machine for a migration plan."""
        machines = fleet_state.get('fleet', {}).get('machines', {})
        target_machine_ids = [machine_id for machine_id in sorted(machines.keys()) if machine_id != source_machine_id]
        if not target_machine_ids:
            return None

        while True:
            self.app.print_header("Select Migration Target Machine")
            self.app.print_colored("Choose the target machine for the migration plan.", 'cyan')
            print()
            for index, machine_id in enumerate(target_machine_ids, start=1):
                machine_record = machines[machine_id]
                connection_display = self.app.config_manager._format_machine_connection_display(machine_record)
                specs_summary = self.app.config_manager._format_machine_specs_summary(machine_record.get('machine_specs'))
                topology_mode = machine_record.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE)
                deployment_state = machine_record.get('deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE)
                instance_count = len(machine_record.get('instance_names') or [])
                self.app.print_colored(
                    f"  {index}) {machine_id} | {connection_display} | mode={topology_mode} | state={deployment_state} | instances={instance_count}",
                    'white',
                )
                if specs_summary:
                    self.app.print_colored(f"      specs: {specs_summary}", 'cyan')
            print()
            self.app.print_colored("  0) Cancel", 'white')
            print()

            choice = self.app.get_input("Select target machine", "0").strip()
            if choice == '0':
                return None
            try:
                selection = int(choice)
            except ValueError:
                self.app.print_colored("Invalid choice. Please enter a number.", 'red')
                self.app.wait_for_enter()
                continue
            if 1 <= selection <= len(target_machine_ids):
                return target_machine_ids[selection - 1]
            self.app.print_colored(f"Invalid option. Valid choices are 0-{len(target_machine_ids)}.", 'red')
            self.app.wait_for_enter()

    def _select_runtime_name_policy(self) -> Optional[str]:
        """Choose how runtime names should be resolved on the target machine."""
        self.app.print_header("Select Target Runtime Naming")
        self.app.print_colored("Choose how runtime names should be resolved on the target machine.", 'cyan')
        print()
        self.app.print_colored("  1) preserve             - Keep the current service/container/volume names")
        self.app.print_colored("  2) normalize_to_target  - Resolve names according to the target topology")
        self.app.print_colored("  3) custom               - Enter custom target runtime names")
        print()
        self.app.print_colored("  0) Cancel", 'white')
        print()

        while True:
            choice = self.app.get_input("Select naming policy", "1").strip()
            if choice == '0':
                return None
            if choice == '1':
                return 'preserve'
            if choice == '2':
                return 'normalize_to_target'
            if choice == '3':
                return 'custom'
            self.app.print_colored("Invalid option. Valid choices are 0-3.", 'red')

    def _collect_custom_runtime(self, source_runtime: Dict[str, Any]) -> Dict[str, Any]:
        """Prompt for custom target runtime names."""
        self.app.print_header("Custom Target Runtime Names")
        self.app.print_colored("Enter the target runtime names to use after migration.", 'cyan')
        print()
        return {
            'service_name': self.app.get_input(
                "Target service name",
                str(source_runtime.get('service_name') or DEFAULT_RUNTIME_SERVICE_NAME),
            ),
            'container_name': self.app.get_input(
                "Target container name",
                str(source_runtime.get('container_name') or DEFAULT_RUNTIME_CONTAINER_NAME),
            ),
            'volume_path': self.app.get_input(
                "Target volume path",
                str(source_runtime.get('volume_path') or DEFAULT_RUNTIME_VOLUME_PATH),
            ),
        }

    @staticmethod
    def _build_remote_shell_probe(command: str) -> str:
        """Wrap a shell command for remote execution."""
        return f"sh -lc {shlex.quote(command)}"

    def _probe_machine_reachability(self, machine_record: Dict[str, Any]) -> Dict[str, Any]:
        """Check whether a machine responds to a simple non-mutating SSH probe."""
        return self.app._run_machine_probe(machine_record, self._build_remote_shell_probe("printf ok"))

    def _probe_remote_path_size(self, machine_record: Dict[str, Any], remote_path: str) -> Dict[str, Any]:
        """Return the size in bytes of a remote path."""
        if not remote_path:
            return {'status': 'error', 'message': 'Missing remote path'}
        inner = f"test -d {shlex.quote(remote_path)} && du -sb {shlex.quote(remote_path)} | awk '{{print $1}}'"
        result = self.app._run_machine_probe(machine_record, self._build_remote_shell_probe(inner))
        if result.get('status') != 'success':
            return result
        try:
            result['bytes'] = int((result.get('stdout') or '').strip().splitlines()[0])
            return result
        except (IndexError, ValueError):
            return {'status': 'error', 'message': 'Unable to parse remote size probe output'}

    @staticmethod
    def _get_free_space_probe_path(volume_path: str) -> str:
        """Return a stable ancestor path to use for free-space probes."""
        path = Path(str(volume_path or '/').strip() or '/')
        parts = path.parts
        if len(parts) >= 3:
            return str(Path(*parts[:3]))
        if len(parts) >= 2:
            return str(Path(*parts[:2]))
        return '/'

    def _probe_remote_free_space(self, machine_record: Dict[str, Any], remote_path: str) -> Dict[str, Any]:
        """Return free space in bytes for a remote path's filesystem."""
        probe_path = self._get_free_space_probe_path(remote_path)
        inner = f"df -PB1 {shlex.quote(probe_path)} | tail -1 | awk '{{print $4}}'"
        result = self.app._run_machine_probe(machine_record, self._build_remote_shell_probe(inner))
        if result.get('status') != 'success':
            return result
        try:
            result['bytes'] = int((result.get('stdout') or '').strip().splitlines()[0])
            result['probe_path'] = probe_path
            return result
        except (IndexError, ValueError):
            return {'status': 'error', 'message': 'Unable to parse remote free-space output'}

    def _collect_preflight(
        self,
        source_machine_record: Dict[str, Any],
        target_machine_record: Dict[str, Any],
        source_runtime: Dict[str, Any],
        target_runtime: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Collect non-mutating preflight data for a migration plan."""
        local_temp_dir = self.app._default_migration_temp_dir()
        source_reachability = self._probe_machine_reachability(source_machine_record)
        target_reachability = self._probe_machine_reachability(target_machine_record)
        source_volume_probe = self._probe_remote_path_size(source_machine_record, source_runtime.get('volume_path', ''))
        target_free_probe = self._probe_remote_free_space(target_machine_record, target_runtime.get('volume_path', ''))

        try:
            local_free_bytes = self.app._probe_local_free_space(local_temp_dir)
            local_free_probe = {'status': 'success', 'bytes': local_free_bytes}
        except Exception as e:
            local_free_probe = {'status': 'error', 'message': str(e)}

        return {
            'local_temp_dir': str(local_temp_dir),
            'source_reachability': source_reachability,
            'target_reachability': target_reachability,
            'source_volume_probe': source_volume_probe,
            'local_free_probe': local_free_probe,
            'target_free_probe': target_free_probe,
        }

    def build_migration_plan(
        self,
        instance_name: str,
        target_machine_id: str,
        *,
        runtime_name_policy: str = 'preserve',
        custom_runtime: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Build a migration plan without mutating remote state."""
        fleet_state = self.app.get_fleet_state_copy()
        normalized_fleet = self.app.config_manager._normalize_fleet_state(fleet_state)
        instances = normalized_fleet.get('fleet', {}).get('instances', {})
        machines = normalized_fleet.get('fleet', {}).get('machines', {})
        errors: List[str] = []
        warnings: List[str] = []

        instance_data = dict(instances.get(instance_name, {}))
        source_machine_id = str(instance_data.get('assigned_machine_id') or '').strip()
        source_machine_record = dict(machines.get(source_machine_id, {}))
        target_machine_record = dict(machines.get(target_machine_id, {}))
        source_runtime = dict(instance_data.get('runtime') or {})

        if not instance_data:
            errors.append(f"Instance '{instance_name}' is not present in the active fleet state.")
        if not source_machine_id or not source_machine_record:
            errors.append("The selected instance does not have a valid source machine assignment.")
        if not target_machine_record:
            errors.append(f"Target machine '{target_machine_id}' is not registered in the active fleet state.")
        if source_machine_id and target_machine_id == source_machine_id:
            errors.append("The target machine must be different from the current source machine.")
        if not source_runtime.get('volume_path'):
            errors.append("The selected instance is missing a source volume path.")

        target_topology_mode = str(target_machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
        target_runtime = self.app.config_manager.resolve_runtime_names(
            str(instance_data.get('logical_name') or instance_name),
            topology_mode=target_topology_mode,
            runtime_name_policy=runtime_name_policy,
            existing_runtime=source_runtime,
            custom_runtime=custom_runtime,
        )
        collisions = self.app.config_manager.detect_runtime_collisions(
            target_machine_id,
            target_runtime,
            normalized_fleet,
            exclude_instance=instance_name,
        )
        if collisions:
            for field, conflicting_instances in sorted(collisions.items()):
                errors.append(
                    f"Target runtime {field} collides with: {', '.join(sorted(conflicting_instances))}"
                )

        target_instance_names = list(target_machine_record.get('instance_names') or [])
        if target_topology_mode == 'standard' and target_instance_names:
            errors.append(
                f"Target machine '{target_machine_id}' is standard mode and already has assigned instances."
            )

        target_deployment_state = str(target_machine_record.get('deployment_state') or DEFAULT_MACHINE_DEPLOYMENT_STATE)
        requires_target_preparation = target_deployment_state != 'prepared'
        if requires_target_preparation:
            warnings.append(
                f"Target machine '{target_machine_id}' is not marked prepared and will need machine preparation before execution."
            )

        if errors:
            preflight = {
                'local_temp_dir': str(self.app._default_migration_temp_dir()),
                'source_reachability': {'status': 'error', 'message': 'Skipped because plan has structural errors'},
                'target_reachability': {'status': 'error', 'message': 'Skipped because plan has structural errors'},
                'source_volume_probe': {'status': 'error', 'message': 'Skipped because plan has structural errors'},
                'local_free_probe': {'status': 'error', 'message': 'Skipped because plan has structural errors'},
                'target_free_probe': {'status': 'error', 'message': 'Skipped because plan has structural errors'},
            }
        else:
            preflight = self._collect_preflight(
                source_machine_record,
                target_machine_record,
                source_runtime,
                target_runtime,
            )

        source_reachable = preflight['source_reachability'].get('status') == 'success'
        target_reachable = preflight['target_reachability'].get('status') == 'success'
        if not source_reachable:
            errors.append(
                f"Source machine reachability probe failed: {preflight['source_reachability'].get('message', 'unknown error')}"
            )
        if not target_reachable:
            errors.append(
                f"Target machine reachability probe failed: {preflight['target_reachability'].get('message', 'unknown error')}"
            )

        source_volume_bytes = preflight['source_volume_probe'].get('bytes')
        if preflight['source_volume_probe'].get('status') != 'success':
            errors.append(
                f"Source volume size probe failed: {preflight['source_volume_probe'].get('message', 'unknown error')}"
            )

        local_free_bytes = preflight['local_free_probe'].get('bytes')
        if preflight['local_free_probe'].get('status') != 'success':
            errors.append(
                f"Controller temp-space probe failed: {preflight['local_free_probe'].get('message', 'unknown error')}"
            )

        target_free_bytes = preflight['target_free_probe'].get('bytes')
        if preflight['target_free_probe'].get('status') != 'success':
            errors.append(
                f"Target free-space probe failed: {preflight['target_free_probe'].get('message', 'unknown error')}"
            )

        if source_volume_bytes is not None and local_free_bytes is not None and local_free_bytes < source_volume_bytes:
            errors.append("Controller temp directory does not have enough free space for the migration archive.")
        if source_volume_bytes is not None and target_free_bytes is not None and target_free_bytes < source_volume_bytes:
            errors.append("Target machine does not have enough free space for the migrated volume.")

        safe_archive_name = f"{self.app.config_manager._sanitize_runtime_suffix(instance_name)}.tar.gz"
        source_archive_path = f"/tmp/r1setup_migration_{safe_archive_name}"
        local_archive_path = str(Path(preflight['local_temp_dir']) / safe_archive_name)
        target_archive_path = f"/tmp/r1setup_migration_{safe_archive_name}"
        status = 'planned' if not errors else 'blocked'

        return {
            'plan_id': f"migration-{self.app.config_manager._sanitize_runtime_suffix(instance_name)}-{datetime.now().strftime('%Y%m%d%H%M%S')}",
            'status': status,
            'created_at': datetime.now().isoformat(),
            'instance_name': instance_name,
            'logical_name': str(instance_data.get('logical_name') or instance_name),
            'source_machine_id': source_machine_id,
            'target_machine_id': target_machine_id,
            'runtime_name_policy': runtime_name_policy,
            'source_runtime': source_runtime,
            'target_runtime': target_runtime,
            'transfer': {
                'route': f"{source_machine_id} -> local temp -> {target_machine_id}",
                'source_archive_path': source_archive_path,
                'local_temp_dir': preflight['local_temp_dir'],
                'local_archive_path': local_archive_path,
                'target_archive_path': target_archive_path,
                'checksum_algorithm': 'sha256',
            },
            'preflight': {
                'source_reachable': source_reachable,
                'target_reachable': target_reachable,
                'source_volume_bytes': source_volume_bytes,
                'local_free_bytes': local_free_bytes,
                'target_free_bytes': target_free_bytes,
                'target_deployment_state': target_deployment_state,
                'requires_target_preparation': requires_target_preparation,
            },
            'validation': {
                'errors': errors,
                'warnings': warnings,
                'collisions': collisions,
            },
        }

    @staticmethod
    def _format_bytes(value: Optional[int]) -> str:
        """Return a compact operator-facing byte count."""
        if value is None:
            return 'unknown'
        if value >= 1024 ** 3:
            return f"{value / (1024 ** 3):.2f} GB"
        if value >= 1024 ** 2:
            return f"{value / (1024 ** 2):.2f} MB"
        if value >= 1024:
            return f"{value / 1024:.2f} KB"
        return f"{value} B"

    def _display_migration_plan(self, plan: Dict[str, Any], *, context: str = 'planning') -> None:
        """Print an operator-facing review of a migration plan.

        Sections are emitted in a fixed order: header and identity lines,
        resolved target runtime, transfer route, preflight figures, blocking
        errors (if any), warnings (if any), and finally a one-line note that
        depends on ``context``.

        Args:
            plan: Migration plan dictionary. ``plan_id``, ``instance_name``,
                ``source_machine_id``, ``target_machine_id`` and
                ``runtime_name_policy`` are read with direct indexing and must
                be present; the nested sections are optional.
            context: One of ``'planning'``, ``'execution'``, ``'rollback'`` or
                ``'finalization'``. Any other value suppresses the closing
                note.
        """
        app = self.app
        app.print_header("Migration Plan Review")

        status = str(plan.get('status') or 'unknown')
        if status in ('planned', 'executed', 'finalized', 'rolled_back'):
            status_color = 'green'
        elif status in ('executing', 'rolling_back', 'finalizing'):
            status_color = 'yellow'
        else:
            # Failure states and any unrecognised status are rendered in red.
            status_color = 'red'

        app.print_colored(f"Plan ID: {plan['plan_id']}", 'cyan')
        app.print_colored(f"Status: {status}", status_color)
        app.print_colored(
            f"Instance: {plan['instance_name']} | source={plan['source_machine_id']} | target={plan['target_machine_id']}",
            'white',
        )
        app.print_colored(f"Naming Policy: {plan['runtime_name_policy']}", 'white')
        print()

        app.print_section("Resolved Target Runtime")
        target_runtime = plan.get('target_runtime') or {}
        runtime_rows = (
            ('service', 'service_name'),
            ('container', 'container_name'),
            ('volume', 'volume_path'),
        )
        for label, key in runtime_rows:
            app.print_colored(f"{label}={target_runtime.get(key, '?')}", 'white')
        print()

        app.print_section("Transfer Route")
        transfer = plan.get('transfer') or {}
        app.print_colored(f"Route: {transfer.get('route', 'unknown')}", 'cyan')
        archive_rows = (
            ('Source archive', 'source_archive_path'),
            ('Local archive', 'local_archive_path'),
            ('Target archive', 'target_archive_path'),
        )
        for label, key in archive_rows:
            app.print_colored(f"{label}: {transfer.get(key, '?')}", 'white')
        app.print_colored(f"Checksum: {transfer.get('checksum_algorithm', 'unknown')}", 'white')
        print()

        app.print_section("Preflight")
        preflight = plan.get('preflight') or {}
        size_rows = (
            ('Source volume size', 'source_volume_bytes'),
            ('Controller free space', 'local_free_bytes'),
            ('Target free space', 'target_free_bytes'),
        )
        for label, key in size_rows:
            app.print_colored(f"{label}: {self._format_bytes(preflight.get(key))}", 'white')
        app.print_colored(
            f"Target deployment state: {preflight.get('target_deployment_state', 'unknown')}",
            'white',
        )
        # The preparation reminder is only shown for the planning/execution reviews.
        if context in ('planning', 'execution') and preflight.get('requires_target_preparation'):
            app.print_colored("Target preparation required before execution.", 'yellow')
        print()

        validation = plan.get('validation') or {}
        finding_groups = (
            ("Blocking Errors", validation.get('errors') or [], 'red'),
            ("Warnings", validation.get('warnings') or [], 'yellow'),
        )
        for heading, findings, color in finding_groups:
            if not findings:
                continue
            app.print_section(heading)
            for finding in findings:
                app.print_colored(f"  • {finding}", color)
            print()

        notes_by_context = {
            'planning': "Planning only: no source shutdown, assignment change, or data transfer has occurred.",
            'execution': "Execution review: running this plan will stop the source service, transfer the volume, and update assignment on success.",
            'rollback': "Rollback review: this will clean partial target artifacts, remove transfer archives, and restore the source runtime.",
            'finalization': "Finalization review: this will clean deferred source-side artifacts after a verified migration.",
        }
        closing_note = notes_by_context.get(context)
        if closing_note:
            app.print_colored(closing_note, 'cyan')

    def _revalidate_saved_migration_plan(self, plan: Dict[str, Any]) -> Dict[str, Any]:
        """Refresh a saved plan against current fleet and probe state before execution."""
        runtime_name_policy = str(plan.get('runtime_name_policy') or 'preserve')
        custom_runtime = None
        if runtime_name_policy == 'custom':
            custom_runtime = dict(plan.get('target_runtime') or {})

        refreshed_plan = self.build_migration_plan(
            str(plan.get('instance_name') or ''),
            str(plan.get('target_machine_id') or ''),
            runtime_name_policy=runtime_name_policy,
            custom_runtime=custom_runtime,
        )
        refreshed_plan['plan_id'] = plan.get('plan_id') or refreshed_plan.get('plan_id')
        refreshed_plan['created_at'] = plan.get('created_at') or refreshed_plan.get('created_at')
        refreshed_plan['revalidated_at'] = datetime.now().isoformat()
        return refreshed_plan

    def _migration_runtime_timeout(self) -> int:
        """Return the timeout used for migration lifecycle steps.

        Delegates to ``DeploymentService._phase_timeout`` with the app's
        connection timeout and 180 (presumably a minimum in seconds; confirm
        against ``_phase_timeout``).
        """
        base_timeout = self.app.connection_timeout
        return DeploymentService._phase_timeout(base_timeout, 180)

    def _migration_probe_timeout(self) -> int:
        """Timeout for shorter migration verification probes."""
        return self.app.connection_timeout * 2

    def _migration_transfer_timeout(self) -> int:
        """Return the timeout for data-transfer phases (archive, SCP, extract).

        Large volumes can keep these phases running for a long time, so 1800
        (30 minutes) is handed to ``DeploymentService._phase_timeout`` together
        with the app's connection timeout (presumably as the minimum; confirm
        against ``_phase_timeout``).
        """
        base_timeout = self.app.connection_timeout
        return DeploymentService._phase_timeout(base_timeout, 1800)

    def plan_instance_migration(self) -> None:
        """Walk the operator through building a migration plan and offer to save it.

        Flow: require an active configuration, reload it, pick a source
        instance, confirm its machine, pick a target machine and a runtime
        naming policy (collecting custom names when the policy is
        ``'custom'``), build and display the plan, then ask whether to persist
        it. Every exit path ends by waiting for Enter.
        """
        app = self.app

        def notify_and_pause(message: str, color: str) -> None:
            # Shared exit path: show one line, then wait for the operator.
            app.print_colored(message, color)
            app.wait_for_enter()

        if not app.active_config.get('config_name'):
            notify_and_pause("No active configuration. Create or load one first.", 'red')
            return

        app.load_configuration()
        candidates = self._get_plannable_instances()
        if not candidates:
            notify_and_pause("No assigned instances are available for migration planning.", 'yellow')
            return

        instance_name = self._select_migration_source_instance(candidates)
        if not instance_name:
            notify_and_pause("Migration planning cancelled.", 'yellow')
            return

        fleet_state = app.get_fleet_state_copy()
        source_machine_id = candidates[instance_name]['assigned_machine_id']
        registered_machines = fleet_state.get('fleet', {}).get('machines', {})
        source_machine_record = registered_machines.get(source_machine_id, {})
        if not self._confirm_source_machine(source_machine_id, source_machine_record):
            notify_and_pause("Migration planning cancelled.", 'yellow')
            return

        target_machine_id = self._select_migration_target_machine(fleet_state, source_machine_id)
        if not target_machine_id:
            notify_and_pause("Migration planning cancelled.", 'yellow')
            return

        runtime_name_policy = self._select_runtime_name_policy()
        if not runtime_name_policy:
            notify_and_pause("Migration planning cancelled.", 'yellow')
            return

        if runtime_name_policy == 'custom':
            current_runtime = candidates[instance_name].get('runtime') or {}
            custom_runtime = self._collect_custom_runtime(current_runtime)
        else:
            custom_runtime = None

        plan = self.build_migration_plan(
            instance_name,
            target_machine_id,
            runtime_name_policy=runtime_name_policy,
            custom_runtime=custom_runtime,
        )
        self._display_migration_plan(plan, context='planning')

        answer = app.get_input("\nSave this migration plan locally? (Y/n)", "Y")
        if answer.lower() != 'y':
            app.print_colored("Migration plan was not saved.", 'yellow')
        else:
            app.set_migration_plan_state(plan)
            if plan.get('status') == 'blocked':
                # Blocked plans are still saved so the operator can review them later.
                app.print_colored(
                    "Blocked migration plan saved locally for review. Resolve the errors above; execution will revalidate before proceeding.",
                    'yellow',
                )
            else:
                app.print_colored("Migration plan saved locally.", 'green')

        app.wait_for_enter()

    @staticmethod
    def _update_plan_fields(plan: Dict[str, Any], **updates: Any) -> Dict[str, Any]:
        """Return a shallow-updated copy of a migration plan."""
        updated = copy.deepcopy(plan)
        updated.update(updates)
        return updated

    def _build_remote_root_shell_command(self, machine_record: Dict[str, Any], command: str) -> str:
        """Build a root-capable remote shell command string."""
        wrapped = self._build_remote_shell_probe(command)
        user = str(machine_record.get('ansible_user') or 'root').strip()
        if user == 'root':
            return wrapped
        become_password = machine_record.get('ansible_become_password')
        if become_password:
            return f"printf %s {shlex.quote(str(become_password))} | sudo -S -p '' {wrapped}"
        return f"sudo -n {wrapped}"

    def _run_machine_command(
        self,
        machine_record: Dict[str, Any],
        remote_command: str,
        *,
        timeout: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Run a remote machine command over SSH."""
        try:
            ssh_cmd = self.app._build_machine_ssh_command(machine_record, remote_command)
        except ValueError as e:
            return {'status': 'error', 'message': str(e)}

        try:
            result = subprocess.run(
                ssh_cmd,
                capture_output=True,
                text=True,
                timeout=timeout or self.app.connection_timeout,
                check=False,
            )
        except FileNotFoundError as e:
            return {'status': 'error', 'message': str(e)}
        except subprocess.TimeoutExpired:
            return {'status': 'error', 'message': f'Command timed out after {timeout or self.app.connection_timeout} seconds'}

        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip() or 'Remote command failed'
            return {'status': 'error', 'message': message, 'stdout': result.stdout, 'stderr': result.stderr}
        return {'status': 'success', 'stdout': result.stdout, 'stderr': result.stderr}

    def _build_scp_command(
        self,
        machine_record: Dict[str, Any],
        source: str,
        destination: str,
        *,
        download: bool,
    ) -> List[str]:
        """Build an scp command for upload or download."""
        host = machine_record.get('ansible_host')
        user = machine_record.get('ansible_user')
        if not host or not user:
            raise ValueError("Machine record is missing ansible_host or ansible_user")

        scp_cmd = ['scp']
        ssh_port = machine_record.get('ansible_port')
        if ssh_port and str(ssh_port) != '22':
            scp_cmd.extend(['-P', str(ssh_port)])

        if 'ansible_ssh_private_key_file' in machine_record:
            key_file = machine_record['ansible_ssh_private_key_file']
            if str(key_file).startswith('~'):
                key_file = os.path.expanduser(key_file)
            scp_cmd.extend(['-i', str(key_file)])

        scp_cmd.extend(['-o', 'StrictHostKeyChecking=no', '-o', 'UserKnownHostsFile=/dev/null'])
        remote_spec = f"{user}@{host}:{source if download else destination}"
        if download:
            scp_cmd.extend([remote_spec, destination])
        else:
            scp_cmd.extend([source, remote_spec])

        if 'ansible_ssh_pass' in machine_record:
            return ['sshpass', '-p', machine_record['ansible_ssh_pass']] + scp_cmd
        return scp_cmd

    def _copy_from_machine(self, machine_record: Dict[str, Any], remote_path: str, local_path: str, *, timeout: Optional[int] = None) -> Dict[str, Any]:
        """Copy a file from a machine to the controller."""
        effective_timeout = timeout or self.app.connection_timeout * 4
        try:
            scp_cmd = self._build_scp_command(machine_record, remote_path, local_path, download=True)
        except ValueError as e:
            return {'status': 'error', 'message': str(e)}
        try:
            result = subprocess.run(
                scp_cmd,
                capture_output=True,
                text=True,
                timeout=effective_timeout,
                check=False,
            )
        except FileNotFoundError as e:
            return {'status': 'error', 'message': str(e)}
        except subprocess.TimeoutExpired:
            return {'status': 'error', 'message': f'SCP download timed out after {effective_timeout} seconds'}

        if result.returncode != 0:
            return {'status': 'error', 'message': result.stderr.strip() or result.stdout.strip() or 'SCP download failed'}
        return {'status': 'success'}

    def _copy_to_machine(self, machine_record: Dict[str, Any], local_path: str, remote_path: str, *, timeout: Optional[int] = None) -> Dict[str, Any]:
        """Copy a file from the controller to a machine."""
        effective_timeout = timeout or self.app.connection_timeout * 4
        try:
            scp_cmd = self._build_scp_command(machine_record, local_path, remote_path, download=False)
        except ValueError as e:
            return {'status': 'error', 'message': str(e)}
        try:
            result = subprocess.run(
                scp_cmd,
                capture_output=True,
                text=True,
                timeout=effective_timeout,
                check=False,
            )
        except FileNotFoundError as e:
            return {'status': 'error', 'message': str(e)}
        except subprocess.TimeoutExpired:
            return {'status': 'error', 'message': f'SCP upload timed out after {effective_timeout} seconds'}

        if result.returncode != 0:
            return {'status': 'error', 'message': result.stderr.strip() or result.stdout.strip() or 'SCP upload failed'}
        return {'status': 'success'}

    def _compute_local_checksum(self, path: Path) -> str:
        """Compute a SHA-256 checksum for a local file."""
        import hashlib

        hasher = hashlib.sha256()
        with open(path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(1024 * 1024), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    def _compute_remote_checksum(self, machine_record: Dict[str, Any], remote_path: str) -> Dict[str, Any]:
        """Compute a SHA-256 checksum for a remote file."""
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            f"sha256sum {shlex.quote(remote_path)} | awk '{{print $1}}'",
        )
        result = self._run_machine_command(machine_record, remote_command, timeout=self.app.connection_timeout * 2)
        if result.get('status') != 'success':
            return result
        checksum = (result.get('stdout') or '').strip().splitlines()
        if not checksum:
            return {'status': 'error', 'message': 'Unable to parse remote checksum output'}
        return {'status': 'success', 'checksum': checksum[0]}

    def _probe_remote_path_stat(self, machine_record: Dict[str, Any], remote_path: str) -> Dict[str, Any]:
        """Probe uid/gid/mode for a remote path."""
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            f"stat -c '%u:%g:%a' {shlex.quote(remote_path)}",
        )
        result = self._run_machine_command(machine_record, remote_command)
        if result.get('status') != 'success':
            return result
        stat_lines = (result.get('stdout') or '').strip().splitlines()
        if not stat_lines:
            return {'status': 'error', 'message': 'Unable to parse remote stat output'}
        try:
            uid, gid, mode = stat_lines[0].split(':', 2)
            return {'status': 'success', 'uid': uid, 'gid': gid, 'mode': mode}
        except ValueError:
            return {'status': 'error', 'message': 'Unexpected remote stat format'}

    def _probe_remote_file_size(self, machine_record: Dict[str, Any], remote_path: str, *, timeout: int = 10) -> Dict[str, Any]:
        """Return the size (bytes) of a remote file, or 0 if it does not exist yet."""
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            f"stat -c '%s' {shlex.quote(remote_path)} 2>/dev/null || echo 0",
        )
        result = self._run_machine_command(machine_record, remote_command, timeout=timeout)
        if result.get('status') != 'success':
            return result
        try:
            size = int((result.get('stdout') or '').strip().splitlines()[0])
            return {'status': 'success', 'size': size}
        except (ValueError, IndexError):
            return {'status': 'error', 'message': 'Unable to parse remote file size'}

    def _probe_remote_dir_size(self, machine_record: Dict[str, Any], remote_path: str, *, timeout: int = 15) -> Dict[str, Any]:
        """Return the total size (bytes) of a remote directory."""
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            f"du -sb {shlex.quote(remote_path)} 2>/dev/null | awk '{{print $1}}'",
        )
        result = self._run_machine_command(machine_record, remote_command, timeout=timeout)
        if result.get('status') != 'success':
            return result
        try:
            size = int((result.get('stdout') or '').strip().splitlines()[0])
            return {'status': 'success', 'size': size}
        except (ValueError, IndexError):
            return {'status': 'error', 'message': 'Unable to parse remote directory size'}

    @staticmethod
    def _format_bytes(size) -> str:
        """Format a byte count into a human-readable string."""
        if size is None:
            return 'unknown'
        size = float(size)
        for unit in ('B', 'KB', 'MB', 'GB'):
            if abs(size) < 1024:
                return f"{size:.1f} {unit}"
            size /= 1024
        return f"{size:.1f} TB"

    def _ensure_local_temp_dir(self, path: Path) -> None:
        """Ensure controller temp directory exists with restricted permissions."""
        path.mkdir(parents=True, exist_ok=True)
        try:
            os.chmod(path, 0o700)
        except OSError:
            pass

    def _prepare_target_machine_for_migration(self, plan: Dict[str, Any]) -> Dict[str, Any]:
        """Prepare the target machine if the saved plan requires it."""
        if not plan.get('preflight', {}).get('requires_target_preparation'):
            return {'status': 'success', 'prepared': False}
        playbook_path = self.app.config_dir / 'playbooks/prepare_machine.yml'
        prep_timeout = DeploymentService._phase_timeout(self.app.connection_timeout, 600)
        self.app.print_colored(
            f"Target machine preparation can take several minutes. Timeout for this phase: {prep_timeout}s (base setting: {self.app.connection_timeout}s).",
            'cyan',
        )
        success, output, executed_hosts, execution_inventory = self.app.run_registered_machine_playbook(
            playbook_path,
            [plan['target_machine_id']],
            show_output=True,
            timeout=prep_timeout,
            fleet_state=self.app.get_fleet_state_copy(),
        )
        successful_hosts = DeploymentService(self.app)._extract_successful_hosts_from_output(output, executed_hosts)
        successful_machine_ids = DeploymentService._extract_successful_machine_ids(execution_inventory, successful_hosts)
        if success and plan['target_machine_id'] in successful_machine_ids:
            return {'status': 'success', 'prepared': True}
        return {'status': 'error', 'message': 'Target machine preparation failed'}

    def _stop_source_instance_for_migration(self, instance_name: str) -> Dict[str, Any]:
        """Stop the source instance before archiving its data."""
        playbook_path = self.app.config_dir / 'playbooks/service_stop.yml'
        lifecycle_timeout = self._migration_runtime_timeout()
        success, output, _, _ = self.app.run_generated_playbook(
            playbook_path,
            [instance_name],
            machine_scope=False,
            last_applied_action='migration_source_stop',
            show_output=False,
            timeout=lifecycle_timeout,
        )
        if success:
            return {'status': 'success', 'output': output}
        return {'status': 'error', 'message': output or 'Source stop failed'}

    def _create_source_archive(self, machine_record: Dict[str, Any], source_volume_path: str, archive_path: str) -> Dict[str, Any]:
        """Create a source archive on the source machine."""
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            (
                f"rm -f {shlex.quote(archive_path)} && "
                f"test -d {shlex.quote(source_volume_path)} && "
                f"tar -C {shlex.quote(source_volume_path)} -czf {shlex.quote(archive_path)} ."
            ),
        )
        return self._run_machine_command(machine_record, remote_command, timeout=self._migration_transfer_timeout())

    def _prepare_target_volume_root(
        self,
        source_machine_record: Dict[str, Any],
        target_machine_record: Dict[str, Any],
        source_volume_path: str,
        target_volume_path: str,
    ) -> Dict[str, Any]:
        """Create the target volume root with source ownership and permissions."""
        source_stat = self._probe_remote_path_stat(source_machine_record, source_volume_path)
        if source_stat.get('status') != 'success':
            return source_stat
        remote_command = self._build_remote_root_shell_command(
            target_machine_record,
            (
                f"mkdir -p {shlex.quote(target_volume_path)} && "
                f"chown {source_stat['uid']}:{source_stat['gid']} {shlex.quote(target_volume_path)} && "
                f"chmod {source_stat['mode']} {shlex.quote(target_volume_path)}"
            ),
        )
        result = self._run_machine_command(target_machine_record, remote_command)
        if result.get('status') != 'success':
            return result

        target_stat = self._probe_remote_path_stat(target_machine_record, target_volume_path)
        if target_stat.get('status') != 'success':
            return target_stat
        if (
            target_stat.get('uid') != source_stat.get('uid')
            or target_stat.get('gid') != source_stat.get('gid')
            or target_stat.get('mode') != source_stat.get('mode')
        ):
            return {'status': 'error', 'message': 'Target ownership or permissions do not match the source volume root'}
        return {'status': 'success'}

    def _extract_archive_on_target(self, machine_record: Dict[str, Any], archive_path: str, target_volume_path: str) -> Dict[str, Any]:
        """Extract the uploaded archive onto the target machine."""
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            f"mkdir -p {shlex.quote(target_volume_path)} && tar -xzf {shlex.quote(archive_path)} -C {shlex.quote(target_volume_path)}",
        )
        return self._run_machine_command(machine_record, remote_command, timeout=self._migration_transfer_timeout())

    def _build_target_execution_inventory(self, plan: Dict[str, Any]) -> Dict[str, Any]:
        """Build a single-host inventory that points the instance at the target machine.

        Starts from a deep copy of the instance's current inventory entry,
        replaces the SSH connection fields with those of the target machine
        (dropping any the target does not define), then overlays machine
        metadata and the plan's target runtime names. The result is passed
        through ``config_manager.build_execution_host_config`` and wrapped in
        the ``all -> children -> gpu_nodes -> hosts`` structure.
        """
        inventory_hosts = _get_gpu_hosts(self.app.inventory)
        instance_name = plan['instance_name']
        host_config = copy.deepcopy(inventory_hosts.get(instance_name, {}))

        fleet_state = self.app.get_fleet_state_copy()
        target_machine_id = plan['target_machine_id']
        registered_machines = fleet_state.get('fleet', {}).get('machines', {})
        target_machine = dict(registered_machines.get(target_machine_id, {}))
        target_runtime = dict(plan.get('target_runtime') or {})

        connection_keys = (
            'ansible_host',
            'ansible_user',
            'ansible_port',
            'ansible_ssh_common_args',
            'ansible_ssh_pass',
            'ansible_become_password',
            'ansible_ssh_private_key_file',
        )
        for key in connection_keys:
            if key not in target_machine:
                # Never carry a source-machine credential over to the target.
                host_config.pop(key, None)
                continue
            host_config[key] = target_machine[key]

        volume_path = target_runtime.get('volume_path', DEFAULT_RUNTIME_VOLUME_PATH)
        container_name = target_runtime.get('container_name', DEFAULT_RUNTIME_CONTAINER_NAME)
        runtime_overrides = {
            'r1setup_machine_id': target_machine_id,
            'r1setup_topology_mode': target_machine.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE),
            'r1setup_machine_deployment_state': target_machine.get('deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE),
            'r1setup_runtime_name_policy': plan.get('runtime_name_policy', 'preserve'),
            'edge_node_service_name': target_runtime.get('service_name', DEFAULT_RUNTIME_SERVICE_NAME),
            'mnl_docker_container_name': container_name,
            'mnl_docker_volume_path': volume_path,
            # Metadata and exit-status paths default to locations derived
            # from the resolved volume path and container name.
            'mnl_r1setup_metadata_host_path': target_runtime.get(
                'metadata_path',
                f"{volume_path}/_data/r1setup/metadata.json",
            ),
            'r1setup_runtime_exit_status_path': target_runtime.get(
                'exit_status_path',
                f"/tmp/{container_name}.exit",
            ),
        }
        host_config.update(runtime_overrides)

        execution_host = self.app.config_manager.build_execution_host_config(instance_name, host_config)
        execution_host['r1setup_execution_scope'] = 'instance'

        gpu_group = {'hosts': {instance_name: execution_host}}
        return {'all': {'children': {'gpu_nodes': gpu_group}}}

    def _run_target_instance_playbook(
        self,
        plan: Dict[str, Any],
        playbook_name: str,
        *,
        last_applied_action: str,
        show_output: bool = False,
    ) -> Tuple[bool, str]:
        """Run an instance-scoped playbook against the target inventory override."""
        playbook_path = self.app.config_dir / f'playbooks/{playbook_name}'
        execution_inventory = self._build_target_execution_inventory(plan)
        timeout = self._migration_probe_timeout()
        if playbook_name in {'apply_instance.yml', 'service_start.yml'}:
            timeout = self._migration_runtime_timeout()
        success, output, _, _ = self.app.run_custom_inventory_playbook(
            playbook_path,
            execution_inventory,
            machine_scope=False,
            last_applied_action=last_applied_action,
            show_output=show_output,
            timeout=timeout,
        )
        return success, output

    def _apply_target_runtime_definition(self, plan: Dict[str, Any]) -> Dict[str, Any]:
        """Apply the target runtime definition to the migrated instance."""
        success, output = self._run_target_instance_playbook(
            plan,
            'apply_instance.yml',
            last_applied_action='migration_apply_target',
            show_output=True,
        )
        if success:
            return {'status': 'success', 'output': output}
        return {'status': 'error', 'message': output or 'Target runtime apply failed'}

    def _start_target_instance(self, plan: Dict[str, Any]) -> Dict[str, Any]:
        """Start the migrated instance on the target machine."""
        success, output = self._run_target_instance_playbook(
            plan,
            'service_start.yml',
            last_applied_action='migration_start_target',
            show_output=True,
        )
        if success:
            return {'status': 'success', 'output': output}
        return {'status': 'error', 'message': output or 'Target start failed'}

    def _verify_target_migration_health(self, plan: Dict[str, Any]) -> Dict[str, Any]:
        """Verify target runtime health after startup."""
        success, output = self._run_target_instance_playbook(
            plan,
            'service_status.yml',
            last_applied_action='migration_verify_target',
            show_output=False,
        )
        if not success:
            return {'status': 'error', 'message': output or 'Target status check failed'}

        lines = output.splitlines()
        status_data = self.app.status_tracker._parse_ansible_status_lines(lines)
        host_status = status_data.get(plan['instance_name'], {})
        if host_status.get('status') != 'running':
            return {'status': 'error', 'message': f"Target status is {host_status.get('status', 'unknown')} instead of running"}

        node_info_success, node_info_output = self._run_target_instance_playbook(
            plan,
            'get_node_info.yml',
            last_applied_action='migration_get_node_info',
            show_output=False,
        )
        app_health = None
        app_health_status = 'unknown'
        if node_info_success:
            node_results = self.app._parse_node_info_output(node_info_output)
            node_entry = node_results.get(plan['instance_name'])
            if node_entry:
                node_status = str(node_entry.get('status') or 'unknown')
                if node_status == 'success':
                    app_health = True
                    app_health_status = 'verified'
                elif node_status in ('unreachable', 'error', 'failed'):
                    return {'status': 'error', 'message': f"Application health check returned status '{node_status}'"}

        return {
            'status': 'success',
            'runtime_health': 'verified',
            'app_health': app_health,
            'app_health_status': app_health_status,
        }

    def _mark_migration_failure(self, plan: Dict[str, Any], message: str) -> None:
        """Persist and log a migration execution failure."""
        failed_plan = self._update_plan_fields(
            plan,
            status='failed',
            last_error=message,
            last_updated_at=datetime.now().isoformat(),
        )
        self.app.set_migration_plan_state(failed_plan)
        self.app.log_operation_event(
            'migration_execution',
            'failed',
            {
                'plan_id': failed_plan.get('plan_id'),
                'instance_name': failed_plan.get('instance_name'),
                'source_machine_id': failed_plan.get('source_machine_id'),
                'target_machine_id': failed_plan.get('target_machine_id'),
                'message': message,
            },
        )

    def _persist_plan_step(self, plan: Dict[str, Any], step: str, **extra_updates: Any) -> Dict[str, Any]:
        """Persist a migration-plan step transition and return the updated plan."""
        updated_plan = self._update_plan_fields(
            plan,
            last_step=step,
            last_updated_at=datetime.now().isoformat(),
            **extra_updates,
        )
        self.app.set_migration_plan_state(updated_plan)
        return updated_plan

    def _announce_migration_phase(self, step_number: int, total_steps: int, title: str, detail: str) -> None:
        """Print a visible progress marker for long-running migration phases."""
        self.app.print_colored(f"[{step_number}/{total_steps}] {title}", 'cyan', bold=True)
        self.app.print_colored(detail, 'white')

    def _run_with_spinner(self, label: str, func, *args, progress_fn=None, progress_poll_secs: float = 5.0, **kwargs):
        """Run *func* while displaying a CLI spinner (with optional progress) on stdout.

        Parameters
        ----------
        progress_fn : callable, optional
            Called periodically; should return ``(current_bytes, total_bytes)``
            or ``(current_bytes, None)`` when the total is unknown.  Return
            ``None`` to skip the update.
        progress_poll_secs : float
            How often (in seconds) to call *progress_fn*.  Remote probes
            should use ~5 s; local file-size checks can use ~1 s.
        """
        result_container: list = []
        done_event = threading.Event()

        def _worker():
            result_container.append(func(*args, **kwargs))
            done_event.set()

        worker = threading.Thread(target=_worker, daemon=True)
        worker.start()

        frames = ['|', '/', '-', '\\']
        idx = 0
        poll_every = max(1, int(progress_poll_secs / 0.25))
        last_progress = ''
        while not done_event.wait(timeout=0.25):
            if progress_fn and idx % poll_every == 0 and idx > 0:
                try:
                    info = progress_fn()
                    if info is not None:
                        current, total = info
                        if total and total > 0:
                            pct = min(100, int(current * 100 / total))
                            last_progress = f" {self._format_bytes(current)} / {self._format_bytes(total)} ({pct}%)"
                        elif current and current > 0:
                            last_progress = f" {self._format_bytes(current)}"
                except Exception:
                    pass
            line = f"\r  {frames[idx % len(frames)]} {label}{last_progress}"
            sys.stdout.write(line)
            sys.stdout.flush()
            idx += 1

        # Clear the spinner line
        sys.stdout.write(f"\r{' ' * (len(label) + len(last_progress) + 6)}\r")
        sys.stdout.flush()

        worker.join()
        return result_container[0]

    def _cleanup_local_archive(self, archive_path: Path) -> Dict[str, Any]:
        """Remove a local archive path if it exists."""
        if not archive_path.exists():
            return {'status': 'success', 'removed': False}
        try:
            archive_path.unlink()
        except OSError as e:
            return {'status': 'error', 'message': str(e)}
        return {'status': 'success', 'removed': True}

    def _cleanup_remote_archive(self, machine_record: Dict[str, Any], archive_path: str) -> Dict[str, Any]:
        """Remove a remote archive path if present."""
        if not archive_path:
            return {'status': 'success', 'removed': False}
        remote_command = self._build_remote_root_shell_command(
            machine_record,
            f"rm -f {shlex.quote(archive_path)}",
        )
        result = self._run_machine_command(machine_record, remote_command)
        if result.get('status') != 'success':
            return result
        return {'status': 'success', 'removed': True}

    def _cleanup_runtime_artifacts(
        self,
        machine_record: Dict[str, Any],
        runtime: Dict[str, Any],
        *,
        remove_volume: bool = False,
    ) -> Dict[str, Any]:
        """Remove one instance's runtime artifacts on a machine in a single root shell command.

        The command stops and disables the systemd service, deletes its unit
        file, reloads systemd, force-removes the container and deletes the
        metadata file (plus its directory when empty), the exit-status file and
        the helper registry entry. Missing runtime fields fall back to the
        module-level defaults.

        Args:
            machine_record: Machine the command is executed on.
            runtime: Runtime description (service/container names and paths).
            remove_volume: When true and a volume path is known, the volume
                directory is deleted recursively as the final step.

        Returns:
            The result of ``_run_machine_command`` for the combined command.
        """
        def _resolved(field: str, fallback: str) -> str:
            # Empty/missing values use the fallback; surrounding whitespace is trimmed.
            return str(runtime.get(field) or fallback).strip()

        service_name = _resolved('service_name', DEFAULT_RUNTIME_SERVICE_NAME)
        container_name = _resolved('container_name', DEFAULT_RUNTIME_CONTAINER_NAME)
        volume_path = _resolved('volume_path', DEFAULT_RUNTIME_VOLUME_PATH)
        metadata_path = _resolved('metadata_path', f"{volume_path}/_data/r1setup/metadata.json")
        exit_status_path = _resolved('exit_status_path', f"/tmp/{container_name}.exit")
        metadata_dir = str(Path(metadata_path).parent)
        helper_registry_path = f"{DEFAULT_HELPER_REGISTRY_DIR}/{service_name}.env"

        quoted_service = shlex.quote(service_name)
        steps = [
            f"systemctl stop {quoted_service} >/dev/null 2>&1 || true",
            f"systemctl disable {quoted_service} >/dev/null 2>&1 || true",
            f"rm -f /etc/systemd/system/{quoted_service}.service",
            "systemctl daemon-reload >/dev/null 2>&1 || true",
            f"docker rm -f {shlex.quote(container_name)} >/dev/null 2>&1 || true",
            f"rm -f {shlex.quote(metadata_path)}",
            f"rmdir {shlex.quote(metadata_dir)} >/dev/null 2>&1 || true",
            f"rm -f {shlex.quote(exit_status_path)}",
            f"rm -f {shlex.quote(helper_registry_path)}",
        ]
        if remove_volume and volume_path:
            steps.append(f"rm -rf {shlex.quote(volume_path)}")

        script = " && ".join(steps)
        remote_command = self._build_remote_root_shell_command(machine_record, script)
        return self._run_machine_command(machine_record, remote_command, timeout=self.app.connection_timeout * 3)

    def _start_source_instance_after_rollback(self, instance_name: str) -> Dict[str, Any]:
        """Restart the source instance after rollback cleanup."""
        playbook_path = self.app.config_dir / 'playbooks/service_start.yml'
        lifecycle_timeout = self._migration_runtime_timeout()
        success, output, _, _ = self.app.run_generated_playbook(
            playbook_path,
            [instance_name],
            machine_scope=False,
            last_applied_action='migration_source_restart',
            show_output=False,
            timeout=lifecycle_timeout,
        )
        if success:
            return {'status': 'success', 'output': output}
        return {'status': 'error', 'message': output or 'Source restart failed'}

    def _verify_source_instance_after_rollback(self, instance_name: str) -> Dict[str, Any]:
        """Verify that the source instance is actually running after rollback restart."""
        playbook_path = self.app.config_dir / 'playbooks/service_status.yml'
        success, output, _, _ = self.app.run_generated_playbook(
            playbook_path,
            [instance_name],
            machine_scope=False,
            last_applied_action='migration_verify_source_rollback',
            show_output=False,
            timeout=self._migration_probe_timeout(),
        )
        if not success:
            return {'status': 'error', 'message': output or 'Source rollback verification failed'}

        status_tracker = getattr(self.app, 'status_tracker', None)
        if status_tracker is None:
            return {'status': 'error', 'message': 'Status tracker unavailable for rollback verification'}

        status_data = status_tracker._parse_ansible_status_lines((output or '').splitlines())
        host_status = status_data.get(instance_name, {})
        resolved_status = str(host_status.get('status') or 'unknown')
        if resolved_status != 'running':
            return {
                'status': 'error',
                'message': f"Source rollback verification returned status '{resolved_status}'",
            }
        return {'status': 'success', 'runtime_health': 'verified'}

    def rollback_saved_migration_plan(self) -> None:
        """Rollback a failed or interrupted migration plan and restore the source runtime.

        Only plans in status ``failed`` or ``executing`` are eligible, and the
        operator must confirm interactively. The rollback then proceeds in
        order, persisting the plan after each phase:

        1. remove the target runtime artifacts, including the target volume;
        2. delete the transfer archives on the target, on the source and locally;
        3. restart the source instance. If the restart reports an error, a live
           status probe decides whether the source actually recovered.

        Any failing phase stores the plan as ``rollback_failed`` with
        ``last_error`` set, logs a ``migration_rollback`` / ``failed`` event,
        shows the message and returns. On success the plan becomes
        ``rolled_back`` and the node status is set to ``running``.
        """
        plan = copy.deepcopy(self.app.active_config.get('migration_plan_state') or {})
        if not plan:
            self.app.print_colored("No saved migration plan found. Nothing to roll back.", 'yellow')
            self.app.wait_for_enter()
            return

        plan_status = str(plan.get('status') or 'unknown')
        if plan_status not in ('failed', 'executing'):
            self.app.print_colored(
                f"Saved migration plan is not rollback-eligible in status '{plan_status}'.",
                'red',
            )
            self.app.wait_for_enter()
            return

        self.app.print_header("Rollback Saved Migration Plan")
        self._display_migration_plan(plan, context='rollback')
        if self.app.get_input("\n↩️  Roll back this migration plan now? (y/N)", "N").lower() != 'y':
            self.app.print_colored("Migration rollback cancelled.", 'yellow')
            self.app.wait_for_enter()
            return

        fleet_state = self.app.get_fleet_state_copy()
        machines = fleet_state.get('fleet', {}).get('machines', {})
        source_machine = dict(machines.get(plan.get('source_machine_id'), {}))
        target_machine = dict(machines.get(plan.get('target_machine_id'), {}))
        if not source_machine or not target_machine:
            self.app.print_colored("Saved migration plan no longer matches the registered source/target machines.", 'red')
            self.app.wait_for_enter()
            return

        transfer = plan.get('transfer') or {}
        target_runtime = plan.get('target_runtime') or {}
        instance_name = str(plan.get('instance_name') or '')
        local_archive_path = Path(transfer.get('local_archive_path') or (self.app._default_migration_temp_dir() / f"{plan.get('instance_name', 'migration')}.tar.gz"))

        def _abort_rollback(current_plan: Dict[str, Any], step: str, result: Dict[str, Any], default_message: str) -> None:
            # Shared failure path of the cleanup phases: persist, log, show, wait.
            message = result.get('message', default_message)
            failed_plan = self._persist_plan_step(
                current_plan,
                step,
                status='rollback_failed',
                last_error=message,
            )
            self.app.log_operation_event(
                'migration_rollback',
                'failed',
                {
                    'plan_id': failed_plan.get('plan_id'),
                    'instance_name': failed_plan.get('instance_name'),
                    'message': message,
                },
            )
            self.app.print_colored(message, 'red')
            self.app.wait_for_enter()

        rolling_back_plan = self._persist_plan_step(
            plan,
            'rollback_started',
            status='rolling_back',
            rollback_started_at=datetime.now().isoformat(),
        )
        self.app.log_operation_event(
            'migration_rollback',
            'started',
            {
                'plan_id': rolling_back_plan.get('plan_id'),
                'instance_name': rolling_back_plan.get('instance_name'),
                'source_machine_id': rolling_back_plan.get('source_machine_id'),
                'target_machine_id': rolling_back_plan.get('target_machine_id'),
            },
        )

        # Phase 1: remove everything the migration created on the target.
        target_cleanup = self._cleanup_runtime_artifacts(target_machine, target_runtime, remove_volume=True)
        if target_cleanup.get('status') != 'success':
            _abort_rollback(rolling_back_plan, 'rollback_failed_target_cleanup', target_cleanup, 'Target cleanup failed')
            return
        rolling_back_plan = self._persist_plan_step(rolling_back_plan, 'target_cleaned')

        # Phase 2: delete transfer archives (target, then source, then local).
        for machine_record, archive_path in (
            (target_machine, transfer.get('target_archive_path', '')),
            (source_machine, transfer.get('source_archive_path', '')),
        ):
            cleanup_result = self._cleanup_remote_archive(machine_record, archive_path)
            if cleanup_result.get('status') != 'success':
                _abort_rollback(
                    rolling_back_plan,
                    'rollback_failed_archive_cleanup',
                    cleanup_result,
                    'Remote archive cleanup failed',
                )
                return
        local_cleanup = self._cleanup_local_archive(local_archive_path)
        if local_cleanup.get('status') != 'success':
            _abort_rollback(
                rolling_back_plan,
                'rollback_failed_local_cleanup',
                local_cleanup,
                'Local archive cleanup failed',
            )
            return
        rolling_back_plan = self._persist_plan_step(rolling_back_plan, 'archives_cleaned')

        # Phase 3: bring the source instance back.
        source_restart = self._start_source_instance_after_rollback(instance_name)
        if source_restart.get('status') != 'success':
            restart_error = source_restart.get('message', 'Source restart failed')
            # The restart failure is logged before probing, so it is recorded
            # even when the probe later shows the node is running.
            self.app.log_operation_event(
                'migration_rollback',
                'failed',
                {
                    'plan_id': rolling_back_plan.get('plan_id'),
                    'instance_name': rolling_back_plan.get('instance_name'),
                    'message': restart_error,
                },
            )
            recovery_probe = self._verify_source_instance_after_rollback(instance_name)
            if recovery_probe.get('status') != 'success':
                self._persist_plan_step(
                    rolling_back_plan,
                    'rollback_failed_source_restart',
                    status='rollback_failed',
                    last_error=restart_error,
                )
                self.app._update_node_status(instance_name, 'stopped')
                self.app.print_colored(restart_error, 'red')
                self.app.wait_for_enter()
                return
            rolling_back_plan = self._persist_plan_step(
                rolling_back_plan,
                'rollback_reconciled_source_recovered',
                rollback_recovery={
                    'reconciled_after_error': True,
                    'recovered_at': datetime.now().isoformat(),
                    'original_error': restart_error,
                    'runtime_health': recovery_probe.get('runtime_health', 'verified'),
                },
                last_error=None,
            )
            self.app.print_colored(
                "Source restart reported an error, but live verification confirmed the source node is running again.",
                'yellow',
            )

        rolled_back_plan = self._persist_plan_step(
            rolling_back_plan,
            'rollback_completed',
            status='rolled_back',
            rolled_back_at=datetime.now().isoformat(),
            last_error=None,
            rollback_cleanup={
                'target_volume_removed': True,
                'local_archive_removed': True,
                'source_archive_removed': True,
                'target_archive_removed': True,
            },
        )
        self.app._update_node_status(instance_name, 'running')
        self.app.log_operation_event(
            'migration_rollback',
            'success',
            {
                'plan_id': rolled_back_plan.get('plan_id'),
                'instance_name': rolled_back_plan.get('instance_name'),
                'source_machine_id': rolled_back_plan.get('source_machine_id'),
                'target_machine_id': rolled_back_plan.get('target_machine_id'),
                'reconciled_after_error': bool((rolled_back_plan.get('rollback_recovery') or {}).get('reconciled_after_error')),
            },
        )
        self.app.print_colored("Migration rollback completed successfully.", 'green')
        self.app.wait_for_enter()

    def finalize_saved_migration_plan(self) -> None:
        """Finalize an executed migration by removing what is left on the source side.

        Only plans in status ``executed`` are eligible. The operator is asked
        whether the source volume should be deleted as well and must then
        confirm the finalization. The source runtime artifacts are removed
        first, followed by the transfer archives on the source, on the target
        and locally.

        Any failing phase stores the plan as ``finalization_failed`` with
        ``last_error`` set, logs a ``migration_finalization`` / ``failed``
        event, shows the message and returns. On success the plan becomes
        ``finalized``.
        """
        plan = copy.deepcopy(self.app.active_config.get('migration_plan_state') or {})
        if not plan:
            self.app.print_colored("No saved migration plan found. Nothing to finalize.", 'yellow')
            self.app.wait_for_enter()
            return

        current_status = str(plan.get('status') or 'unknown')
        if current_status != 'executed':
            self.app.print_colored(
                f"Saved migration plan is not finalization-eligible in status '{plan.get('status', 'unknown')}'.",
                'red',
            )
            self.app.wait_for_enter()
            return

        self.app.print_header("Finalize Saved Migration Plan")
        self._display_migration_plan(plan, context='finalization')
        volume_answer = self.app.get_input(
            "Remove source volume data after finalization? (y/N)",
            "N",
        )
        remove_source_volume = volume_answer.lower() == 'y'
        confirmation = self.app.get_input("\n🧹 Finalize this migration now? (y/N)", "N")
        if confirmation.lower() != 'y':
            self.app.print_colored("Migration finalization cancelled.", 'yellow')
            self.app.wait_for_enter()
            return

        registered = self.app.get_fleet_state_copy().get('fleet', {}).get('machines', {})
        source_machine = dict(registered.get(plan.get('source_machine_id'), {}))
        target_machine = dict(registered.get(plan.get('target_machine_id'), {}))
        if not (source_machine and target_machine):
            self.app.print_colored("Saved migration plan no longer matches the registered source/target machines.", 'red')
            self.app.wait_for_enter()
            return

        transfer = plan.get('transfer') or {}
        source_runtime = plan.get('source_runtime') or {}
        local_archive_path = Path(transfer.get('local_archive_path') or (self.app._default_migration_temp_dir() / f"{plan.get('instance_name', 'migration')}.tar.gz"))

        def _abort_finalization(current_plan: Dict[str, Any], step: str, result: Dict[str, Any], default_message: str) -> None:
            # Shared failure path of every phase: persist, log, show, wait.
            message = result.get('message', default_message)
            failed_plan = self._persist_plan_step(
                current_plan,
                step,
                status='finalization_failed',
                last_error=message,
            )
            self.app.log_operation_event(
                'migration_finalization',
                'failed',
                {
                    'plan_id': failed_plan.get('plan_id'),
                    'instance_name': failed_plan.get('instance_name'),
                    'message': message,
                },
            )
            self.app.print_colored(message, 'red')
            self.app.wait_for_enter()

        finalizing_plan = self._persist_plan_step(
            plan,
            'finalization_started',
            status='finalizing',
            finalization_started_at=datetime.now().isoformat(),
        )
        started_details = {
            field: finalizing_plan.get(field)
            for field in ('plan_id', 'instance_name', 'source_machine_id', 'target_machine_id')
        }
        started_details['remove_source_volume'] = remove_source_volume
        self.app.log_operation_event('migration_finalization', 'started', started_details)

        # Phase 1: remove the source runtime (and optionally its volume).
        source_cleanup = self._cleanup_runtime_artifacts(
            source_machine,
            source_runtime,
            remove_volume=remove_source_volume,
        )
        if source_cleanup.get('status') != 'success':
            _abort_finalization(
                finalizing_plan,
                'finalization_failed_source_cleanup',
                source_cleanup,
                'Source cleanup failed',
            )
            return
        finalizing_plan = self._persist_plan_step(finalizing_plan, 'source_cleaned')

        # Phase 2: delete transfer archives (source, then target, then local).
        remote_archives = (
            (source_machine, transfer.get('source_archive_path', '')),
            (target_machine, transfer.get('target_archive_path', '')),
        )
        for machine_record, archive_path in remote_archives:
            remote_result = self._cleanup_remote_archive(machine_record, archive_path)
            if remote_result.get('status') != 'success':
                _abort_finalization(
                    finalizing_plan,
                    'finalization_failed_archive_cleanup',
                    remote_result,
                    'Remote archive cleanup failed',
                )
                return

        local_result = self._cleanup_local_archive(local_archive_path)
        if local_result.get('status') != 'success':
            _abort_finalization(
                finalizing_plan,
                'finalization_failed_local_cleanup',
                local_result,
                'Local archive cleanup failed',
            )
            return

        finalized_plan = self._persist_plan_step(
            finalizing_plan,
            'finalization_completed',
            status='finalized',
            finalized_at=datetime.now().isoformat(),
            finalization_cleanup={
                'source_volume_removed': remove_source_volume,
                'local_archive_removed': True,
                'source_archive_removed': True,
                'target_archive_removed': True,
            },
        )
        success_details = {
            field: finalized_plan.get(field)
            for field in ('plan_id', 'instance_name', 'source_machine_id', 'target_machine_id')
        }
        success_details['source_volume_removed'] = remove_source_volume
        self.app.log_operation_event('migration_finalization', 'success', success_details)
        self.app.print_colored("Migration finalization completed successfully.", 'green')
        self.app.wait_for_enter()

    def execute_saved_migration_plan(self) -> None:
        """Execute the saved migration plan using controller-temp archive transfer."""
        plan = copy.deepcopy(self.app.active_config.get('migration_plan_state') or {})
        if not plan:
            self.app.print_colored("No saved migration plan found. Build a plan first.", 'yellow')
            self.app.wait_for_enter()
            return

        plan_status = str(plan.get('status') or 'unknown')
        if plan_status in {'planned', 'blocked'}:
            self.app.print_colored(
                f"Revalidating saved migration plan from status '{plan_status}' against current fleet state.",
                'cyan',
            )
            plan = self._revalidate_saved_migration_plan(plan)
            self.app.set_migration_plan_state(plan)
            if plan.get('status') != 'planned':
                self.app.print_colored("Saved migration plan remains blocked after revalidation.", 'red')
                self._display_migration_plan(plan, context='execution')
                self.app.print_colored(
                    "Resolve the blocking errors above or rebuild the migration plan after fleet/runtime changes.",
                    'yellow',
                )
                self.app.wait_for_enter()
                return
            self.app.print_colored("Saved migration plan passed revalidation and is ready to execute.", 'green')
        else:
            self.app.print_colored(
                f"Saved migration plan is not executable in status '{plan.get('status', 'unknown')}'.",
                'red',
            )
            self.app.wait_for_enter()
            return

        self.app.print_header("Execute Saved Migration Plan")
        self._display_migration_plan(plan, context='execution')
        if self.app.get_input("\n🚚 Execute this migration plan now? (y/N)", "N").lower() != 'y':
            self.app.print_colored("Migration execution cancelled.", 'yellow')
            self.app.wait_for_enter()
            return

        fleet_state = self.app.get_fleet_state_copy()
        machines = fleet_state.get('fleet', {}).get('machines', {})
        source_machine = dict(machines.get(plan['source_machine_id'], {}))
        target_machine = dict(machines.get(plan['target_machine_id'], {}))
        if not source_machine or not target_machine:
            self._mark_migration_failure(plan, 'Saved migration plan no longer matches registered source/target machines')
            self.app.print_colored(
                "Saved migration plan no longer matches the registered source/target machines.",
                'red',
            )
            self.app.wait_for_enter()
            return
        transfer = plan.get('transfer') or {}
        source_runtime = plan.get('source_runtime') or {}
        target_runtime = plan.get('target_runtime') or {}

        local_temp_dir = Path(transfer.get('local_temp_dir') or self.app._default_migration_temp_dir())
        local_archive_path = Path(transfer.get('local_archive_path') or (local_temp_dir / f"{plan['instance_name']}.tar.gz"))
        self._ensure_local_temp_dir(local_temp_dir)
        if local_archive_path.exists():
            try:
                local_archive_path.unlink()
            except OSError as e:
                self._mark_migration_failure(plan, f'Unable to reset local archive path: {e}')
                self.app.print_colored(f"Unable to reset local archive path: {e}", 'red')
                self.app.wait_for_enter()
                return

        executing_plan = self._update_plan_fields(
            plan,
            status='executing',
            started_at=datetime.now().isoformat(),
            last_updated_at=datetime.now().isoformat(),
        )
        self.app.set_migration_plan_state(executing_plan)
        self.app.log_operation_event(
            'migration_execution',
            'started',
            {
                'plan_id': executing_plan.get('plan_id'),
                'instance_name': executing_plan.get('instance_name'),
                'source_machine_id': executing_plan.get('source_machine_id'),
                'target_machine_id': executing_plan.get('target_machine_id'),
            },
        )

        total_steps = 9
        self._announce_migration_phase(1, total_steps, "Prepare Target Machine", "Ensuring the destination host is ready to accept the migrated node.")
        self.app.print_colored(
            "This step may stay quiet between Ansible tasks while packages install or repositories refresh.",
            'cyan',
        )
        prep_result = self._prepare_target_machine_for_migration(executing_plan)
        if prep_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, prep_result.get('message', 'Target preparation failed'))
            self.app.print_colored(prep_result.get('message', 'Target preparation failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'target_prepared')

        self._announce_migration_phase(2, total_steps, "Stop Source Instance", "Stopping the source service before archiving its persistent volume.")
        stop_result = self._stop_source_instance_for_migration(executing_plan['instance_name'])
        if stop_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, stop_result.get('message', 'Source stop failed'))
            self.app.print_colored(stop_result.get('message', 'Source stop failed'), 'red')
            self.app.wait_for_enter()
            return
        self.app._update_node_status(executing_plan['instance_name'], 'stopped')
        executing_plan = self._persist_plan_step(executing_plan, 'source_stopped')

        transfer_timeout = self._migration_transfer_timeout()
        source_vol_path = source_runtime.get('volume_path', '')
        source_archive_path = transfer.get('source_archive_path', '')
        target_archive_path = transfer.get('target_archive_path', '')
        target_vol_path = target_runtime.get('volume_path', '')

        self._announce_migration_phase(3, total_steps, "Archive Source Volume", "Creating the transfer archive on the source machine and verifying its checksum.")
        source_vol_size_result = self._probe_remote_dir_size(source_machine, source_vol_path)
        source_vol_size = source_vol_size_result.get('size', 0) if source_vol_size_result.get('status') == 'success' else 0
        if source_vol_size > 0:
            self.app.print_colored(f"Source volume size: {self._format_bytes(source_vol_size)}.  Timeout: {transfer_timeout}s.", 'cyan')
        else:
            self.app.print_colored(f"This may take a while for large volumes. Timeout: {transfer_timeout}s.", 'cyan')

        def _archive_progress():
            r = self._probe_remote_file_size(source_machine, source_archive_path)
            if r.get('status') == 'success':
                return (r['size'], None)
            return None

        archive_result = self._run_with_spinner(
            "Archiving source volume...",
            self._create_source_archive,
            source_machine,
            source_vol_path,
            source_archive_path,
            progress_fn=_archive_progress,
        )
        if archive_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, archive_result.get('message', 'Source archive failed'))
            self.app.print_colored(archive_result.get('message', 'Source archive failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'source_archived')

        archive_size_result = self._probe_remote_file_size(source_machine, source_archive_path)
        archive_size = archive_size_result.get('size', 0) if archive_size_result.get('status') == 'success' else 0
        if archive_size > 0:
            self.app.print_colored(f"Archive size: {self._format_bytes(archive_size)}", 'cyan')

        source_checksum_result = self._compute_remote_checksum(source_machine, source_archive_path)
        if source_checksum_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, source_checksum_result.get('message', 'Source checksum failed'))
            self.app.print_colored(source_checksum_result.get('message', 'Source checksum failed'), 'red')
            self.app.wait_for_enter()
            return
        source_checksum = source_checksum_result['checksum']

        self._announce_migration_phase(4, total_steps, "Download To Controller", "Copying the archive to the local controller temp folder and validating integrity.")

        def _download_progress():
            try:
                return (local_archive_path.stat().st_size, archive_size or None)
            except OSError:
                return None

        download_result = self._run_with_spinner(
            "Downloading archive from source...",
            self._copy_from_machine,
            source_machine,
            source_archive_path,
            str(local_archive_path),
            timeout=transfer_timeout,
            progress_fn=_download_progress if archive_size else None,
            progress_poll_secs=1.0,
        )
        if download_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, download_result.get('message', 'Archive download failed'))
            self.app.print_colored(download_result.get('message', 'Archive download failed'), 'red')
            self.app.wait_for_enter()
            return

        local_checksum = self._compute_local_checksum(local_archive_path)
        if local_checksum != source_checksum:
            self._mark_migration_failure(executing_plan, 'Local checksum does not match source checksum')
            self.app.print_colored("Local checksum does not match source checksum.", 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'archive_downloaded')

        self._announce_migration_phase(5, total_steps, "Upload To Target", "Transferring the verified archive from the controller to the target machine.")

        def _upload_progress():
            r = self._probe_remote_file_size(target_machine, target_archive_path)
            if r.get('status') == 'success':
                return (r['size'], archive_size or None)
            return None

        upload_result = self._run_with_spinner(
            "Uploading archive to target...",
            self._copy_to_machine,
            target_machine,
            str(local_archive_path),
            target_archive_path,
            timeout=transfer_timeout,
            progress_fn=_upload_progress,
            progress_poll_secs=10.0,
        )
        if upload_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, upload_result.get('message', 'Archive upload failed'))
            self.app.print_colored(upload_result.get('message', 'Archive upload failed'), 'red')
            self.app.wait_for_enter()
            return

        target_checksum_result = self._compute_remote_checksum(target_machine, target_archive_path)
        if target_checksum_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, target_checksum_result.get('message', 'Target checksum failed'))
            self.app.print_colored(target_checksum_result.get('message', 'Target checksum failed'), 'red')
            self.app.wait_for_enter()
            return
        if target_checksum_result['checksum'] != source_checksum:
            self._mark_migration_failure(executing_plan, 'Target checksum does not match source checksum')
            self.app.print_colored("Target checksum does not match source checksum.", 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'archive_uploaded')

        self._announce_migration_phase(6, total_steps, "Prepare Target Volume", "Creating the target volume root with source ownership and permissions.")
        prepare_volume_result = self._prepare_target_volume_root(
            source_machine,
            target_machine,
            source_vol_path,
            target_vol_path,
        )
        if prepare_volume_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, prepare_volume_result.get('message', 'Target volume preparation failed'))
            self.app.print_colored(prepare_volume_result.get('message', 'Target volume preparation failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'target_volume_prepared')

        self._announce_migration_phase(7, total_steps, "Extract On Target", "Restoring the transferred archive into the target volume path.")

        def _extract_progress():
            r = self._probe_remote_dir_size(target_machine, target_vol_path)
            if r.get('status') == 'success':
                return (r['size'], source_vol_size or None)
            return None

        extract_result = self._run_with_spinner(
            "Extracting archive on target...",
            self._extract_archive_on_target,
            target_machine,
            target_archive_path,
            target_vol_path,
            progress_fn=_extract_progress if source_vol_size else None,
            progress_poll_secs=10.0,
        )
        if extract_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, extract_result.get('message', 'Target extract failed'))
            self.app.print_colored(extract_result.get('message', 'Target extract failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'target_extracted')

        self._announce_migration_phase(8, total_steps, "Apply And Start Target Runtime", "Rendering the service definition on the target machine and starting the node.")
        apply_timeout = self._migration_runtime_timeout()
        self.app.print_colored(
            f"Target apply/start can take several minutes. Timeout for this phase: {apply_timeout}s (base setting: {self.app.connection_timeout}s).",
            'cyan',
        )
        self.app.print_colored(
            "Ansible task output will be shown while the service definition is rendered and the runtime starts.",
            'cyan',
        )
        apply_result = self._apply_target_runtime_definition(executing_plan)
        if apply_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, apply_result.get('message', 'Target apply failed'))
            self.app.print_colored(apply_result.get('message', 'Target apply failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'target_applied')

        start_result = self._start_target_instance(executing_plan)
        if start_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, start_result.get('message', 'Target start failed'))
            self.app.print_colored(start_result.get('message', 'Target start failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(executing_plan, 'target_started')

        self._announce_migration_phase(9, total_steps, "Verify Target Health", "Checking runtime status and best-effort application health on the target machine.")
        verify_result = self._verify_target_migration_health(executing_plan)
        if verify_result.get('status') != 'success':
            self._mark_migration_failure(executing_plan, verify_result.get('message', 'Target verification failed'))
            self.app.print_colored(verify_result.get('message', 'Target verification failed'), 'red')
            self.app.wait_for_enter()
            return
        executing_plan = self._persist_plan_step(
            executing_plan,
            'target_verified',
            verified_target=True,
            runtime_health=verify_result.get('runtime_health'),
            app_health=verify_result.get('app_health'),
            app_health_status=verify_result.get('app_health_status'),
        )

        finalized_plan = self._update_plan_fields(
            executing_plan,
            status='executed',
            finalized_assignment=True,
            verified_target=True,
            runtime_health=verify_result.get('runtime_health'),
            app_health=verify_result.get('app_health'),
            app_health_status=verify_result.get('app_health_status'),
            finished_at=datetime.now().isoformat(),
            last_updated_at=datetime.now().isoformat(),
        )
        self.app.finalize_instance_migration(
            executing_plan['instance_name'],
            executing_plan['target_machine_id'],
            target_runtime,
            runtime_name_policy=executing_plan.get('runtime_name_policy', 'preserve'),
            migration_plan_state=finalized_plan,
        )
        self.app.record_service_file_version([executing_plan['instance_name']])
        self.app._update_node_status(executing_plan['instance_name'], 'running')
        self.app.log_operation_event(
            'migration_execution',
            'success',
            {
                'plan_id': finalized_plan.get('plan_id'),
                'instance_name': finalized_plan.get('instance_name'),
                'source_machine_id': finalized_plan.get('source_machine_id'),
                'target_machine_id': finalized_plan.get('target_machine_id'),
                'local_archive_path': transfer.get('local_archive_path'),
            },
        )
        self.app.print_colored("Migration execution completed successfully.", 'green')
        self.app.print_colored("  • Source service stopped: yes", 'white')
        self.app.print_colored("  • Archive verified across source, controller, and target: yes", 'white')
        self.app.print_colored("  • Target runtime verified running: yes", 'white')
        app_health_status = str(finalized_plan.get('app_health_status') or 'unknown')
        if app_health_status == 'verified':
            self.app.print_colored("  • Application health: verified", 'green')
        else:
            self.app.print_colored("  • Application health: unknown (runtime verified, app probe not confirmed)", 'yellow')
        self.app.print_colored("  • Assignment updated to target machine: yes", 'white')
        self.app.print_colored(
            "Assignment has been updated to the target machine. Source cleanup remains deferred for rollback/finalization.",
            'yellow',
        )
        self.app.wait_for_enter()


class SSHKeyManager:
    """Handles SSH key management workflows and metadata migration."""

    def __init__(self, app):
        self.app = app

    def _get_default_host_metadata(self, host_config: Dict[str, Any]) -> Dict[str, Any]:
        """Build the baseline SSH metadata record for a single host entry.

        The auth mode is always computed from the Ansible credential keys
        present on the host: a stored password wins, then a private key file,
        and a host with neither is reported as verification-failed. Every
        other field echoes what the host already stores, falling back to a
        neutral default when the key is absent.

        Args:
            host_config: The host's inventory variables.

        Returns:
            A new dict holding the ``r1setup_*`` SSH metadata keys.
        """
        lookup = host_config.get

        # Password credentials take precedence over a configured key file.
        if 'ansible_ssh_pass' in host_config:
            auth_mode, revalidate_default = SSH_AUTH_MODE_PASSWORD_ONLY, False
        elif 'ansible_ssh_private_key_file' in host_config:
            auth_mode, revalidate_default = SSH_AUTH_MODE_KEY_CONFIGURED_LEGACY, True
        else:
            auth_mode, revalidate_default = SSH_AUTH_MODE_VERIFICATION_FAILED, True

        metadata: Dict[str, Any] = {'r1setup_ssh_auth_mode': auth_mode}
        metadata['r1setup_ssh_primary_key_fingerprint'] = lookup('r1setup_ssh_primary_key_fingerprint')
        # The primary key path mirrors the Ansible key file setting.
        metadata['r1setup_ssh_primary_key_path'] = lookup('ansible_ssh_private_key_file')
        metadata['r1setup_ssh_key_auth_verified_at'] = lookup('r1setup_ssh_key_auth_verified_at')
        metadata['r1setup_ssh_last_verified_fingerprint'] = lookup('r1setup_ssh_last_verified_fingerprint')
        metadata['r1setup_ssh_last_verification_status'] = lookup(
            'r1setup_ssh_last_verification_status', 'not_checked'
        )
        metadata['r1setup_password_auth_disabled'] = lookup('r1setup_password_auth_disabled', False)
        metadata['r1setup_ssh_hardening_applied_at'] = lookup('r1setup_ssh_hardening_applied_at')
        metadata['r1setup_ssh_requires_revalidation'] = lookup(
            'r1setup_ssh_requires_revalidation', revalidate_default
        )
        metadata['r1setup_managed_public_keys'] = lookup('r1setup_managed_public_keys', [])
        return metadata

    def _migrate_host_metadata(self, host_config: Dict[str, Any]) -> bool:
        """Populate missing SSH metadata for a host."""
        defaults = self._get_default_host_metadata(host_config)
        changed = False
        for key, value in defaults.items():
            if key not in host_config:
                host_config[key] = value
                changed = True
        return changed

    def migrate_legacy_ssh_metadata(self) -> None:
        """Add SSH metadata to saved configurations without changing active auth.

        Every ``*.yml`` file in ``self.app.configs_dir`` is loaded, stamped
        with ``SSH_SCHEMA_VERSION`` under ``all.vars``, and each host under
        ``all.children.gpu_nodes.hosts`` receives the SSH metadata keys it is
        missing. A file is rewritten (and chmod-ed to ``0o600``) only when
        something changed.

        Files that cannot be parsed, or whose content is not shaped like an
        inventory mapping, are skipped with a debug message so that one
        malformed file does not abort the migration of the others. Host
        entries that are not mappings are skipped the same way.
        """
        migrated_configs = 0
        migrated_hosts = 0

        def _mapping_section(parent, key):
            """Return ``parent[key]`` as a dict, creating it when absent or null."""
            section = parent.get(key)
            if section is None:
                # An empty YAML key such as ``vars:`` loads as None; treat it as missing.
                section = {}
                parent[key] = section
            if not isinstance(section, dict):
                raise TypeError(f"'{key}' section is not a mapping")
            return section

        for config_path in self.app.configs_dir.glob("*.yml"):
            try:
                with open(config_path) as f:
                    config_data = yaml.safe_load(f) or {}
            except Exception as e:
                self.app.print_debug(f"Skipping SSH metadata migration for {config_path}: {e}")
                continue

            # A YAML document may load as a list or scalar; only a mapping
            # with an 'all' root is an inventory this migration understands.
            if not isinstance(config_data, dict) or 'all' not in config_data:
                continue

            try:
                all_section = _mapping_section(config_data, 'all')
                all_vars = _mapping_section(all_section, 'vars')
                children = _mapping_section(all_section, 'children')
                gpu_nodes = _mapping_section(children, 'gpu_nodes')
                hosts = _mapping_section(gpu_nodes, 'hosts')
            except TypeError as e:
                self.app.print_debug(f"Skipping SSH metadata migration for {config_path}: {e}")
                continue

            changed = False
            if all_vars.get('r1setup_schema_version') != SSH_SCHEMA_VERSION:
                all_vars['r1setup_schema_version'] = SSH_SCHEMA_VERSION
                changed = True

            for host_name, host_config in hosts.items():
                if not isinstance(host_config, dict):
                    # A host declared without variables loads as None; there is
                    # no auth information to derive metadata from.
                    self.app.print_debug(
                        f"Skipping SSH metadata migration for host {host_name} in {config_path}: "
                        "host entry is not a mapping"
                    )
                    continue
                if self._migrate_host_metadata(host_config):
                    migrated_hosts += 1
                    changed = True

            if changed:
                with open(config_path, 'w') as f:
                    yaml.safe_dump(config_data, f, default_flow_style=False)
                # Inventories can hold credentials, so keep them owner-only.
                os.chmod(config_path, 0o600)
                migrated_configs += 1

        if migrated_configs:
            self.app.print_colored("SSH key management metadata initialized for existing configurations.", 'cyan')
            self.app.print_colored(f"Updated {migrated_hosts} host definition(s) across {migrated_configs} configuration(s).", 'cyan')

    def check_ssh_key_tooling(self) -> Tuple[bool, List[str]]:
        """Report which required local SSH commands cannot be found on PATH.

        Returns:
            A ``(ok, missing)`` pair: ``missing`` lists each absent command as
            ``"<command> (<description>)"`` and ``ok`` is ``True`` only when
            that list is empty.
        """
        unavailable = [
            f"{tool} ({purpose})"
            for tool, purpose in SSH_KEY_MANAGEMENT_REQUIRED_TOOLS.items()
            if shutil.which(tool) is None
        ]
        return not unavailable, unavailable

    def check_feature_capabilities(self) -> Tuple[bool, List[str]]:
        """Check local tools and required playbooks for SSH key management.

        Returns:
            A ``(available, issues)`` pair. ``issues`` holds the missing-tool
            entries followed by one ``"missing playbook: <path>"`` entry per
            absent playbook under ``self.app.config_dir``; ``available`` is
            ``True`` only when the tooling check passed and no playbook is
            missing.
        """
        tooling_ok, tooling_gaps = self.check_ssh_key_tooling()
        problems = list(tooling_gaps)

        for rel_path in SSH_KEY_MANAGEMENT_REQUIRED_PLAYBOOKS:
            candidate = self.app.config_dir / rel_path
            if candidate.exists():
                continue
            problems.append(f"missing playbook: {candidate}")

        available = tooling_ok and not any(
            entry.startswith('missing playbook:') for entry in problems
        )
        return available, problems

    def _show_feature_unavailable(self, issues: List[str]) -> None:
        self.app.print_colored("SSH Key Management is unavailable.", 'red')
        for issue in issues:
            self.app.print_colored(f"  • {issue}", 'yellow')
        self.app.print_colored("Ensure the latest collection is installed and required local tools are available.", 'yellow')
        self.app.wait_for_enter()

    @staticmethod
    def _resolve_abs_path(path_value: str) -> Path:
        """Resolve a local path to an absolute path."""
        return Path(os.path.expanduser(path_value)).resolve()

    def _validate_public_key_content(self, public_key: str) -> Dict[str, Any]:
        """Validate public key material and extract a fingerprint."""
        tmp_path = None
        if not public_key or not public_key.strip():
            return {'valid': False, 'error': 'Public key is empty'}

        try:
            with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp:
                tmp.write(public_key.strip() + '\n')
                tmp_path = tmp.name

            result = subprocess.run(
                ['ssh-keygen', '-l', '-f', tmp_path],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode != 0:
                return {'valid': False, 'error': result.stderr.strip() or 'Invalid public key'}

            fingerprint = result.stdout.strip().split()[1] if result.stdout.strip() else None
            return {'valid': True, 'fingerprint': fingerprint}
        except Exception as e:
            return {'valid': False, 'error': str(e)}
        finally:
            if tmp_path:
                try:
                    os.unlink(tmp_path)
                except Exception:
                    pass

    def _validate_public_key_file(self, public_key_path: str) -> Dict[str, Any]:
        """Validate a public key file on disk."""
        resolved = self._resolve_abs_path(public_key_path)
        if not resolved.exists():
            return {'valid': False, 'error': f"Public key file does not exist: {resolved}"}
        if not resolved.is_file():
            return {'valid': False, 'error': f"Public key path is not a file: {resolved}"}

        try:
            with open(resolved) as f:
                content = f.read().strip()
        except Exception as e:
            return {'valid': False, 'error': str(e)}

        result = self._validate_public_key_content(content)
        result.update({'path': str(resolved), 'content': content})
        return result

    def _derive_public_key(self, private_key_path: str) -> Dict[str, Any]:
        """Derive public key material from a private key using ssh-keygen."""
        resolved = self._resolve_abs_path(private_key_path)
        if not resolved.exists():
            return {'valid': False, 'error': f"Private key file does not exist: {resolved}"}
        if not resolved.is_file():
            return {'valid': False, 'error': f"Private key path is not a file: {resolved}"}

        try:
            result = subprocess.run(
                ['ssh-keygen', '-y', '-f', str(resolved)],
                capture_output=True,
                text=True,
                timeout=10
            )
        except Exception as e:
            return {'valid': False, 'error': str(e)}

        if result.returncode != 0:
            return {'valid': False, 'error': result.stderr.strip() or 'Unable to derive public key from private key'}

        public_key = result.stdout.strip()
        validation = self._validate_public_key_content(public_key)
        if not validation['valid']:
            return validation

        validation.update({'content': public_key, 'path': str(resolved)})
        return validation

    def _validate_keypair(self, private_key_path: str, public_key_path: str = '') -> Dict[str, Any]:
        """Validate a migration keypair and ensure the public key matches when provided."""
        private_path = self._resolve_abs_path(private_key_path)
        if not private_path.exists():
            return {'valid': False, 'error': f"Private key file does not exist: {private_path}"}
        if not private_path.is_file():
            return {'valid': False, 'error': f"Private key path is not a file: {private_path}"}
        if not os.access(private_path, os.R_OK):
            return {'valid': False, 'error': f"Private key file is not readable: {private_path}"}

        derived = self._derive_public_key(str(private_path))
        if not derived['valid']:
            return {'valid': False, 'error': derived['error']}

        resolved_public_path = ''
        chosen_public = public_key_path.strip()
        if chosen_public:
            public_result = self._validate_public_key_file(chosen_public)
            if not public_result['valid']:
                return {'valid': False, 'error': public_result['error']}
            resolved_public_path = public_result['path']
            public_content = public_result['content']
            fingerprint = public_result['fingerprint']
        else:
            sibling_public = Path(f"{private_path}.pub")
            if sibling_public.exists():
                public_result = self._validate_public_key_file(str(sibling_public))
                if not public_result['valid']:
                    return {'valid': False, 'error': public_result['error']}
                resolved_public_path = public_result['path']
                public_content = public_result['content']
                fingerprint = public_result['fingerprint']
            else:
                public_content = derived['content']
                fingerprint = derived['fingerprint']

        if derived['content'].strip() != public_content.strip():
            return {'valid': False, 'error': "Public key does not match the selected private key"}

        return {
            'valid': True,
            'private_key_path': str(private_path),
            'public_key_path': resolved_public_path,
            'public_key': public_content,
            'fingerprint': fingerprint,
        }

    def _generate_keypair(self) -> Optional[Dict[str, Any]]:
        """Generate a new local ed25519 keypair."""
        self.app.print_header("Generate SSH Keypair")
        default_path = self._resolve_abs_path("~/.ssh/r1setup_ed25519")
        path_value = self.app.get_input("Private key path", str(default_path))
        private_key_path = self._resolve_abs_path(path_value)
        public_key_path = Path(f"{private_key_path}.pub")

        if private_key_path.exists() or public_key_path.exists():
            if self.app.get_input("Key files already exist. Overwrite? (y/n)", "n").lower() != 'y':
                self.app.print_colored("Key generation cancelled.", 'yellow')
                return None

        private_key_path.parent.mkdir(parents=True, exist_ok=True)
        comment = self.app.get_input("Key comment", f"r1setup@{os.uname().nodename}")

        try:
            result = subprocess.run(
                ['ssh-keygen', '-t', 'ed25519', '-f', str(private_key_path), '-N', '', '-C', comment],
                capture_output=True,
                text=True,
                timeout=30
            )
        except Exception as e:
            self.app.print_colored(f"Key generation failed: {e}", 'red')
            return None

        if result.returncode != 0:
            self.app.print_colored(result.stderr.strip() or "Key generation failed.", 'red')
            return None

        validation = self._validate_keypair(str(private_key_path), str(public_key_path))
        if not validation['valid']:
            self.app.print_colored(f"Generated key validation failed: {validation['error']}", 'red')
            return None

        self.app.print_colored("New SSH keypair generated.", 'green')
        self.app.print_colored(f"  • Private key: {validation['private_key_path']}", 'cyan')
        self.app.print_colored(f"  • Public key:  {validation['public_key_path']}", 'cyan')
        self.app.print_colored("Keep the private key secure. r1setup will use this exact file for future authentication.", 'yellow')
        return validation

    def _select_migration_keypair(self) -> Optional[Dict[str, Any]]:
        """Select or generate the keypair used for SSH migration."""
        self.app.print_header("SSH Key Source")
        self.app.print_colored("  1) Use existing SSH keypair", 'white')
        self.app.print_colored("  2) Generate new SSH keypair", 'white')
        self.app.print_colored("  0) Cancel", 'white')
        print()

        choice = self.app.get_input("Select option", "1")
        if choice == '0':
            return None
        if choice == '2':
            return self._generate_keypair()
        if choice != '1':
            self.app.print_colored("Invalid option.", 'red')
            self.app.wait_for_enter()
            return None

        private_key_path = self.app.get_input("Private key path", "~/.ssh/id_ed25519")
        public_key_path = self.app.get_input("Public key path (Enter for matching .pub)", "")
        validation = self._validate_keypair(private_key_path, public_key_path)
        if not validation['valid']:
            self.app.print_colored(f"Key validation failed: {validation['error']}", 'red')
            self.app.wait_for_enter()
            return None

        self.app.print_colored("SSH keypair validated.", 'green')
        self.app.print_colored(f"  • Private key: {validation['private_key_path']}", 'cyan')
        if validation['public_key_path']:
            self.app.print_colored(f"  • Public key:  {validation['public_key_path']}", 'cyan')
        else:
            self.app.print_colored("  • Public key:  derived from selected private key", 'cyan')
        self.app.print_colored(f"  • Fingerprint: {validation['fingerprint']}", 'cyan')
        return validation

    def _verify_ssh_login(self, host_config: Dict[str, Any], private_key_path: str) -> Tuple[bool, str]:
        """Verify controller-side SSH access using a selected private key."""
        host = host_config.get('ansible_host')
        user = host_config.get('ansible_user')
        port = host_config.get('ansible_port', 22)

        if not host or not user:
            return False, "Missing ansible_host or ansible_user"

        cmd = [
            'ssh',
            '-i', private_key_path,
            '-o', 'BatchMode=yes',
            '-o', 'StrictHostKeyChecking=no',
            '-o', 'UserKnownHostsFile=/dev/null',
            '-o', f'ConnectTimeout={self.app.ssh_connect_timeout}',
        ]
        if port != 22:
            cmd.extend(['-p', str(port)])
        cmd.extend([f"{user}@{host}", 'true'])

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=self.app.connection_timeout)
        except subprocess.TimeoutExpired:
            return False, f"SSH verification timed out after {self.app.connection_timeout}s"
        except Exception as e:
            return False, str(e)

        if result.returncode == 0:
            return True, "SSH verification succeeded"

        stderr = result.stderr.strip() or result.stdout.strip() or "SSH verification failed"
        return False, stderr

    def _get_hosts_by_auth_mode(self, modes: List[str]) -> Dict[str, Dict[str, Any]]:
        """Collect the inventory hosts whose SSH auth mode is listed in *modes*.

        A host with no (or an empty) stored ``r1setup_ssh_auth_mode`` is
        classified by the mode reported by ``_get_default_host_metadata``.
        """
        def effective_mode(entry: Dict[str, Any]) -> Any:
            stored = entry.get('r1setup_ssh_auth_mode')
            if stored:
                return stored
            return self._get_default_host_metadata(entry)['r1setup_ssh_auth_mode']

        return {
            name: entry
            for name, entry in _get_gpu_hosts(self.app.inventory).items()
            if effective_mode(entry) in modes
        }

    def _get_hosts_ready_for_password_disable(self) -> Dict[str, Dict[str, Any]]:
        """Select the key-verified hosts on which password auth may be disabled.

        A host qualifies when it is not flagged for revalidation and its
        primary key fingerprint is set and equals the last verified one.
        """
        def is_ready(entry: Dict[str, Any]) -> bool:
            if entry.get('r1setup_ssh_requires_revalidation', False):
                return False
            primary = entry.get('r1setup_ssh_primary_key_fingerprint')
            last_verified = entry.get('r1setup_ssh_last_verified_fingerprint')
            return bool(primary) and primary == last_verified

        verified_hosts = self._get_hosts_by_auth_mode([SSH_AUTH_MODE_KEY_VERIFIED])
        return {name: entry for name, entry in verified_hosts.items() if is_ready(entry)}

    def _set_host_ssh_metadata(self, host_name: str, updates: Dict[str, Any]) -> None:
        """Merge *updates* into the in-memory inventory entry of *host_name*.

        Hosts that are not present in the inventory are ignored.
        """
        inventory_hosts = _get_gpu_hosts(self.app.inventory)
        if host_name not in inventory_hosts:
            return
        inventory_hosts[host_name].update(updates)

    def _run_playbook_for_hosts(
        self,
        playbook_relative_path: str,
        selected_hosts: List[str],
        extra_vars: Optional[Dict[str, Any]] = None,
        show_output: bool = True,
    ) -> Tuple[bool, str]:
        """Run an SSH management playbook for the selected hosts."""
        playbook_path = self.app.config_dir / playbook_relative_path
        host_limit = ','.join(selected_hosts)
        extra_vars_path = None
        try:
            cmd = (f"ANSIBLE_CONFIG={os.environ['ANSIBLE_CONFIG']} "
                   f"ANSIBLE_COLLECTIONS_PATH={os.environ['ANSIBLE_COLLECTIONS_PATH']} "
                   f"ANSIBLE_HOME={os.environ['ANSIBLE_HOME']} "
                   f"ansible-playbook -i {self.app.config_file} --limit {host_limit} {playbook_path}")

            if extra_vars:
                with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as tmp:
                    json.dump(extra_vars, tmp)
                    extra_vars_path = tmp.name
                cmd = f"{cmd} --extra-vars @{extra_vars_path}"

            return self.app.run_command(cmd, show_output=show_output)
        finally:
            if extra_vars_path:
                try:
                    os.unlink(extra_vars_path)
                except OSError:
                    pass

    def _apply_successful_key_migration(self, host_name: str, private_key_path: str, fingerprint: str) -> None:
        """Switch a host's in-memory inventory entry to verified key authentication.

        Sets the Ansible private key file, drops any stored SSH password and
        records the verified key metadata. Unknown hosts are left untouched.
        """
        inventory_hosts = _get_gpu_hosts(self.app.inventory)
        if host_name in inventory_hosts:
            entry = inventory_hosts[host_name]
            entry['ansible_ssh_private_key_file'] = private_key_path
            # Password auth is no longer the active inventory credential.
            entry.pop('ansible_ssh_pass', None)
            entry.update({
                'r1setup_ssh_auth_mode': SSH_AUTH_MODE_KEY_VERIFIED,
                'r1setup_ssh_primary_key_path': private_key_path,
                'r1setup_ssh_primary_key_fingerprint': fingerprint,
                'r1setup_ssh_last_verified_fingerprint': fingerprint,
                'r1setup_ssh_key_auth_verified_at': datetime.now().isoformat(),
                'r1setup_ssh_last_verification_status': 'success',
                'r1setup_ssh_requires_revalidation': False,
            })

    def _apply_failed_key_verification(self, host_name: str, private_key_path: str, fingerprint: str = None) -> None:
        """Mark a host's key verification as failed; active inventory auth is not touched.

        The fingerprint is only recorded when one is supplied.
        """
        fingerprint_update = {'r1setup_ssh_primary_key_fingerprint': fingerprint} if fingerprint else {}
        self._set_host_ssh_metadata(host_name, {
            'r1setup_ssh_auth_mode': SSH_AUTH_MODE_VERIFICATION_FAILED,
            'r1setup_ssh_primary_key_path': private_key_path,
            'r1setup_ssh_last_verification_status': 'failed',
            'r1setup_ssh_requires_revalidation': True,
            **fingerprint_update,
        })

    def _apply_successful_password_hardening(self, host_name: str) -> None:
        """Store the metadata of a host whose password authentication was disabled."""
        applied_at = datetime.now().isoformat()
        hardened_state = {
            'r1setup_ssh_auth_mode': SSH_AUTH_MODE_PASSWORD_DISABLED,
            'r1setup_password_auth_disabled': True,
            'r1setup_ssh_hardening_applied_at': applied_at,
            'r1setup_ssh_last_verification_status': 'success',
            'r1setup_ssh_requires_revalidation': False,
        }
        self._set_host_ssh_metadata(host_name, hardened_state)

    def _apply_failed_password_hardening(self, host_name: str) -> None:
        """Flag a host as failed and in need of revalidation after an SSH hardening error."""
        failed_state = {
            'r1setup_ssh_auth_mode': SSH_AUTH_MODE_VERIFICATION_FAILED,
            'r1setup_password_auth_disabled': False,
            'r1setup_ssh_last_verification_status': 'failed',
            'r1setup_ssh_requires_revalidation': True,
        }
        self._set_host_ssh_metadata(host_name, failed_state)

    def _get_ssh_state_guidance(self, host_config: Dict[str, Any]) -> str:
        """Map a host's SSH auth state to the operator's suggested next step.

        The mode comes from ``r1setup_ssh_auth_mode`` when that key is present,
        otherwise from ``_get_default_host_metadata``. For key-verified hosts
        the advice depends on ``r1setup_ssh_requires_revalidation``.
        """
        fallback_mode = self._get_default_host_metadata(host_config)['r1setup_ssh_auth_mode']
        mode = host_config.get('r1setup_ssh_auth_mode', fallback_mode)
        needs_revalidation = host_config.get('r1setup_ssh_requires_revalidation', False)

        # Modes whose advice does not depend on the revalidation flag.
        fixed_guidance = (
            (SSH_AUTH_MODE_PASSWORD_ONLY, "Next: Install Key / Migrate Password Hosts."),
            (SSH_AUTH_MODE_KEY_CONFIGURED_LEGACY, "Next: Validate Key Authentication before hardening."),
            (SSH_AUTH_MODE_KEY_INSTALLED_UNVERIFIED, "Next: Validate Key Authentication to confirm the installed key."),
            (SSH_AUTH_MODE_VERIFICATION_FAILED, "Next: Fix key access, then rerun Validate Key Authentication."),
            (SSH_AUTH_MODE_PASSWORD_DISABLED, "State is hardened. Keep at least one recovery key secured outside this host."),
        )
        for known_mode, advice in fixed_guidance:
            if mode == known_mode:
                return advice

        if mode == SSH_AUTH_MODE_KEY_VERIFIED:
            if needs_revalidation:
                return "Next: Revalidate key authentication before hardening."
            return "Ready: Disable Password Authentication is allowed."

        return "Next: Review host SSH settings before changing authentication."

    def ssh_key_management_menu(self) -> None:
        """Run the interactive SSH key management menu until the user leaves it.

        The menu returns early when nothing is configured or when
        ``check_feature_capabilities`` reports the feature as unavailable.
        """
        menu_lines = (
            "  1) Install Key / Migrate Password Hosts",
            "  2) Add Extra Public Key",
            "  3) Validate Key Authentication",
            "  4) Disable Password Authentication",
            "  5) Show SSH Auth Status",
            "  0) Back",
        )
        actions = {
            '1': self.install_key_and_migrate_hosts,
            '2': self.add_extra_public_keys,
            '3': self.validate_key_authentication,
            '4': self.disable_password_authentication,
            '5': self.show_ssh_auth_status,
        }

        while True:
            self.app.print_header("SSH Key Management")

            if not (self.app.check_hosts_config() or self.app.has_fleet_machines()):
                self.app.print_colored("No nodes or machines configured. Configure nodes first.", 'red')
                self.app.wait_for_enter()
                return

            capable, capability_issues = self.check_feature_capabilities()
            if not capable:
                self._show_feature_unavailable(capability_issues)
                return

            self.app.load_configuration()
            # Inventory hosts first; fleet-state machines only when there are none.
            hosts = _get_gpu_hosts(self.app.inventory) or self.app.get_fleet_machines_as_hosts()
            self.app.print_section(f"Available for {len(hosts)} configured node(s)/machine(s)")
            for line in menu_lines:
                self.app.print_colored(line, 'white')
            print()

            choice = self.app.get_input("Select option", "0")
            if choice == '0':
                return

            handler = actions.get(choice)
            if handler is None:
                self.app.print_colored("Invalid option.", 'red')
                self.app.wait_for_enter()
            else:
                handler()

    def install_key_and_migrate_hosts(self) -> None:
        """Install a public key on password-auth hosts and migrate the ones that verify.

        Steps: pick hosts and a keypair, run the key installation playbook,
        test key login per host, update the in-memory inventory accordingly,
        save the configuration and print a summary.
        """
        app = self.app
        app.load_configuration()
        password_hosts = self._get_hosts_by_auth_mode([SSH_AUTH_MODE_PASSWORD_ONLY])
        if not password_hosts:
            app.print_colored("No password-auth hosts available for SSH key migration.", 'yellow')
            app.wait_for_enter()
            return

        app.print_header("Install Key / Migrate Password Hosts")
        app.print_colored("⚠️  WARNING", 'red', bold=True)
        for notice in (
            "This installs a public key on selected hosts and then tests key-based login.",
            "Inventory will switch to SSH key auth only after verification succeeds.",
            "Some cloud providers also require the public key in the provider dashboard or instance metadata.",
        ):
            app.print_colored(notice, 'yellow')
        print()

        selected_hosts = app.select_hosts(password_hosts, "ssh key migration", preselect_mode='all')
        if not selected_hosts:
            app.print_colored("No hosts selected. Migration cancelled.", 'yellow')
            app.wait_for_enter()
            return

        keypair = self._select_migration_keypair()
        if not keypair:
            return

        app.print_colored("\nInstalling public key on selected hosts...", 'cyan')
        installed, _ = self._run_playbook_for_hosts(
            'playbooks/ssh_install_key.yml',
            selected_hosts,
            {'ssh_target_public_key': keypair['public_key']},
            show_output=True,
        )
        if not installed:
            app.print_colored("Public key installation failed. Inventory was not changed.", 'red')
            app.wait_for_enter()
            return

        key_path = keypair['private_key_path']
        key_fingerprint = keypair['fingerprint']
        verified_hosts = []
        failed_hosts = []
        for host_name in selected_hosts:
            # Interim state until the login test below decides the outcome.
            self._set_host_ssh_metadata(host_name, {
                'r1setup_ssh_auth_mode': SSH_AUTH_MODE_KEY_INSTALLED_UNVERIFIED,
                'r1setup_ssh_primary_key_path': key_path,
                'r1setup_ssh_primary_key_fingerprint': key_fingerprint,
                'r1setup_ssh_requires_revalidation': True,
            })
            login_ok, login_message = self._verify_ssh_login(password_hosts[host_name], key_path)
            if not login_ok:
                failed_hosts.append((host_name, login_message))
                self._apply_failed_key_verification(host_name, key_path, key_fingerprint)
                continue
            verified_hosts.append(host_name)
            self._apply_successful_key_migration(host_name, key_path, key_fingerprint)

        app._save_configuration()

        app.print_section("Migration Summary")
        if verified_hosts:
            app.print_colored(f"✅ Verified and migrated: {', '.join(verified_hosts)}", 'green')
        if failed_hosts:
            app.print_colored("❌ Verification failed for:", 'red')
            for failed_name, reason in failed_hosts:
                app.print_colored(f"   • {failed_name}: {reason}", 'yellow')

        if verified_hosts:
            app.print_colored("Successful hosts now use SSH key authentication in r1setup.", 'cyan')
        if failed_hosts:
            app.print_colored("Failed hosts kept their previous inventory auth and were marked for revalidation.", 'yellow')

        app.wait_for_enter()

    def add_extra_public_keys(self) -> None:
        """Install one additional public key on chosen hosts; primary auth stays as is.

        After the playbook succeeds, the key is recorded under
        ``r1setup_managed_public_keys`` of each selected host unless an entry
        with the same fingerprint already exists, and the configuration is saved.
        """
        app = self.app
        app.load_configuration()
        hosts = _get_gpu_hosts(app.inventory)
        if not hosts:
            app.print_colored("No hosts configured.", 'yellow')
            app.wait_for_enter()
            return

        app.print_header("Add Extra Public Key")
        selected_hosts = app.select_hosts(hosts, "add extra public key", preselect_mode='all')
        if not selected_hosts:
            app.print_colored("No hosts selected.", 'yellow')
            app.wait_for_enter()
            return

        public_key_path = app.get_input("Public key path", "~/.ssh/id_ed25519.pub")
        validation = self._validate_public_key_file(public_key_path)
        if not validation['valid']:
            app.print_colored(f"Public key validation failed: {validation['error']}", 'red')
            app.wait_for_enter()
            return

        label = app.get_input("Label for this key", Path(validation['path']).name)
        app.print_colored("Installing extra public key on selected hosts...", 'cyan')
        installed, _ = self._run_playbook_for_hosts(
            'playbooks/ssh_add_extra_keys.yml',
            selected_hosts,
            {'ssh_extra_public_keys': [validation['content']]},
            show_output=True,
        )
        if not installed:
            app.print_colored("Extra public key installation failed.", 'red')
            app.wait_for_enter()
            return

        new_fingerprint = validation['fingerprint']
        for host_name in selected_hosts:
            host_entry = hosts[host_name]
            known_keys = host_entry.get('r1setup_managed_public_keys', [])
            record = {
                'fingerprint': new_fingerprint,
                'label': label,
                'added_at': datetime.now().isoformat(),
                'source': validation['path'],
            }
            already_tracked = any(item.get('fingerprint') == new_fingerprint for item in known_keys)
            if not already_tracked:
                known_keys.append(record)
            host_entry['r1setup_managed_public_keys'] = known_keys

        app._save_configuration()
        app.print_colored("Extra public key installed successfully.", 'green')
        app.wait_for_enter()

    def validate_key_authentication(self) -> None:
        """Re-validate SSH key authentication for configured key-auth hosts.

        Candidates are hosts in the legacy, installed-unverified, verified or
        verification-failed state. For each selected host the key is taken
        from ``ansible_ssh_private_key_file`` or, failing that,
        ``r1setup_ssh_primary_key_path``, and a controller-side login is
        attempted. Results are written to the in-memory inventory, the
        configuration is saved and a summary is printed.
        """
        self.app.load_configuration()
        candidate_hosts = self._get_hosts_by_auth_mode([
            SSH_AUTH_MODE_KEY_CONFIGURED_LEGACY,
            SSH_AUTH_MODE_KEY_INSTALLED_UNVERIFIED,
            SSH_AUTH_MODE_KEY_VERIFIED,
            SSH_AUTH_MODE_VERIFICATION_FAILED,
        ])

        if not candidate_hosts:
            self.app.print_colored("No key-auth hosts available for validation.", 'yellow')
            self.app.wait_for_enter()
            return

        self.app.print_header("Validate Key Authentication")
        selected_hosts = self.app.select_hosts(candidate_hosts, "validate key auth", preselect_mode='all')
        if not selected_hosts:
            self.app.print_colored("No hosts selected.", 'yellow')
            self.app.wait_for_enter()
            return

        successes = []
        failures = []
        for host_name in selected_hosts:
            host_config = candidate_hosts[host_name]
            key_path = host_config.get('ansible_ssh_private_key_file') or host_config.get('r1setup_ssh_primary_key_path')
            if not key_path:
                failures.append((host_name, "No SSH private key path configured"))
                self._apply_failed_key_verification(host_name, '')
                continue

            # Prefer the fingerprint derived from the key file; fall back to
            # the stored one when derivation is not possible.
            fingerprint = host_config.get('r1setup_ssh_primary_key_fingerprint')
            derived = self._derive_public_key(key_path)
            if derived['valid']:
                fingerprint = derived['fingerprint']

            ok, message = self._verify_ssh_login(host_config, key_path)
            if ok:
                # A host whose earlier migration attempt failed still
                # authenticates by password in the inventory and only carries
                # the key in r1setup_ssh_primary_key_path. Marking it verified
                # without switching the inventory to the key would make it
                # eligible for password disabling while Ansible still logs in
                # with the password, so complete the migration here. For hosts
                # already using this key the helper writes the same metadata
                # as before.
                self._apply_successful_key_migration(host_name, key_path, fingerprint)
                successes.append(host_name)
            else:
                self._apply_failed_key_verification(host_name, key_path, fingerprint)
                failures.append((host_name, message))

        self.app._save_configuration()

        self.app.print_section("Validation Summary")
        if successes:
            self.app.print_colored(f"✅ Verified: {', '.join(successes)}", 'green')
        if failures:
            self.app.print_colored("❌ Verification failed for:", 'red')
            for host_name, message in failures:
                self.app.print_colored(f"   • {host_name}: {message}", 'yellow')
        self.app.wait_for_enter()

    def disable_password_authentication(self) -> None:
        """Turn off SSH password authentication on hosts with verified key access.

        After an explicit typed confirmation the hardening playbook is run
        with the ``apply`` action for all selected hosts. Each host is then
        re-tested with its key: on success the playbook is run again with
        ``confirm``, otherwise with ``rollback``. Host metadata is updated,
        the configuration saved and a summary printed.
        """
        app = self.app
        app.load_configuration()
        ready_hosts = self._get_hosts_ready_for_password_disable()
        if not ready_hosts:
            app.print_colored("No hosts are ready for password-auth disable. Validate key authentication first.", 'yellow')
            app.wait_for_enter()
            return

        app.print_header("Disable Password Authentication")
        app.print_colored("⚠️  WARNING", 'red', bold=True)
        for notice in (
            "This changes the remote SSH daemon policy for the selected machine(s), not just r1setup.",
            "Only proceed if key-based SSH login is already verified and you understand the lockout risk.",
            "A timed remote rollback will be scheduled automatically in case controller-side verification fails.",
        ):
            app.print_colored(notice, 'yellow')
        print()

        selected_hosts = app.select_hosts(ready_hosts, "disable password authentication", preselect_mode='none')
        if not selected_hosts:
            app.print_colored("No hosts selected.", 'yellow')
            app.wait_for_enter()
            return

        typed = app.get_input("Type 'disable' to continue", "")
        if typed.strip().lower() != 'disable':
            app.print_colored("SSH hardening cancelled.", 'yellow')
            app.wait_for_enter()
            return

        app.print_colored("Applying SSH hardening on selected hosts...", 'cyan')
        apply_vars = {
            'ssh_password_auth_action': 'apply',
            'ssh_hardening_rollback_delay': max(app.connection_timeout, 90),
        }
        applied, _ = self._run_playbook_for_hosts(
            'playbooks/ssh_disable_password_auth.yml',
            selected_hosts,
            apply_vars,
            show_output=True,
        )
        if not applied:
            app.print_colored("SSH hardening playbook failed. Existing host metadata was not promoted.", 'red')
            app.wait_for_enter()
            return

        confirmed_hosts = []
        rollback_pending = []
        confirm_failures = []

        for host_name in selected_hosts:
            entry = ready_hosts[host_name]
            key_path = entry.get('ansible_ssh_private_key_file') or entry.get('r1setup_ssh_primary_key_path')
            if key_path:
                login_ok, login_message = self._verify_ssh_login(entry, key_path)
            else:
                login_ok, login_message = False, "No SSH private key path configured"

            if not login_ok:
                # Key login no longer works: record the failure and try to undo.
                self._apply_failed_password_hardening(host_name)
                rollback_ok, rollback_output = self._run_playbook_for_hosts(
                    'playbooks/ssh_disable_password_auth.yml',
                    [host_name],
                    {'ssh_password_auth_action': 'rollback'},
                    show_output=True,
                )
                if rollback_ok:
                    note = login_message
                else:
                    note = f"{login_message} | rollback attempt failed: {rollback_output or 'host unreachable'}"
                rollback_pending.append((host_name, note))
                continue

            confirm_ok, confirm_output = self._run_playbook_for_hosts(
                'playbooks/ssh_disable_password_auth.yml',
                [host_name],
                {'ssh_password_auth_action': 'confirm'},
                show_output=True,
            )
            if confirm_ok:
                self._apply_successful_password_hardening(host_name)
                confirmed_hosts.append(host_name)
            else:
                self._apply_failed_password_hardening(host_name)
                confirm_failures.append((host_name, confirm_output or "Unable to confirm SSH hardening"))

        app._save_configuration()

        app.print_section("Hardening Summary")
        if confirmed_hosts:
            app.print_colored(f"✅ Password authentication disabled: {', '.join(confirmed_hosts)}", 'green')
        if confirm_failures:
            app.print_colored("⚠️  Hardening applied, but confirmation failed for:", 'yellow')
            for failed_name, reason in confirm_failures:
                app.print_colored(f"   • {failed_name}: {reason}", 'yellow')
        if rollback_pending:
            app.print_colored("❌ Verification failed; rollback was attempted or left scheduled for:", 'red')
            for failed_name, reason in rollback_pending:
                app.print_colored(f"   • {failed_name}: {reason}", 'yellow')
            app.print_colored("If a host is temporarily unreachable, the remote rollback timer should restore SSH access automatically.", 'yellow')

        app.wait_for_enter()

    def show_ssh_auth_status(self) -> None:
        """Print the stored SSH authentication metadata of every inventory host."""
        self.app.load_configuration()
        hosts = _get_gpu_hosts(self.app.inventory)
        if not hosts:
            self.app.print_colored("No hosts configured.", 'yellow')
            self.app.wait_for_enter()
            return

        self.app.print_header("SSH Auth Status")
        for host_name, host_config in hosts.items():
            fallback_mode = self._get_default_host_metadata(host_config)['r1setup_ssh_auth_mode']
            mode = host_config.get('r1setup_ssh_auth_mode', fallback_mode)
            key_path = host_config.get('r1setup_ssh_primary_key_path') or host_config.get('ansible_ssh_private_key_file') or 'N/A'
            verified_at = host_config.get('r1setup_ssh_key_auth_verified_at') or 'never'
            requires_revalidation = host_config.get('r1setup_ssh_requires_revalidation', False)
            fingerprint = host_config.get('r1setup_ssh_primary_key_fingerprint') or 'unknown'
            password_auth_disabled = host_config.get('r1setup_password_auth_disabled', False)
            hardening_applied_at = host_config.get('r1setup_ssh_hardening_applied_at') or 'never'
            # A stored SSH password means the inventory still logs in with it.
            auth_type = "Password" if 'ansible_ssh_pass' in host_config else "SSH Key"

            self.app.print_colored(f"  • {host_name}", 'cyan', bold=True)
            detail_lines = (
                f"     Current inventory auth: {auth_type}",
                f"     SSH state: {mode}",
                f"     Key path: {key_path}",
                f"     Key fingerprint: {fingerprint}",
                f"     Last verified: {verified_at}",
                f"     Requires revalidation: {'yes' if requires_revalidation else 'no'}",
                f"     Password auth disabled: {'yes' if password_auth_disabled else 'no'}",
                f"     Hardening applied at: {hardening_applied_at}",
            )
            for line in detail_lines:
                self.app.print_colored(line, 'white')
            self.app.print_colored(f"     Guidance: {self._get_ssh_state_guidance(host_config)}", 'yellow')

            managed_keys = host_config.get('r1setup_managed_public_keys', [])
            if managed_keys:
                self.app.print_colored(f"     Extra managed keys: {len(managed_keys)}", 'white')
            print()

        self.app.wait_for_enter()


class R1Setup:
    def __init__(self):
        """Prepare colours, paths, the Ansible environment and the component managers."""
        # ANSI escape sequences used by print_colored.
        self.colors = {
            'red': '\033[91m',
            'green': '\033[92m',
            'yellow': '\033[93m',
            'blue': '\033[94m',
            'cyan': '\033[96m',
            'magenta': '\033[95m',
            'white': '\033[97m',
            'end': '\033[0m'
        }

        # Under sudo, work against the invoking user's home directory.
        sudo_user = os.environ.get('SUDO_USER')
        if sudo_user is None:
            self.real_home = Path.home()
            self.real_user = os.environ.get('USER', 'unknown')
        else:
            import pwd
            self.real_home = Path(pwd.getpwnam(sudo_user).pw_dir)
            self.real_user = sudo_user

        # Operating system detection
        self.os_type = self._detect_os()

        # Base directories
        self.ratio1_base_dir = self.real_home / '.ratio1'
        self.r1_setup_dir = self.ratio1_base_dir / 'r1_setup'
        self.ansible_config_root = self.ratio1_base_dir / 'ansible_config'
        self.config_dir = self.ansible_config_root / 'collections/ansible_collections/ratio1/multi_node_launcher'

        # Files and directories used for configuration management
        self.configs_dir = self.r1_setup_dir / 'configs'
        self.active_config_file = self.r1_setup_dir / 'active_config.json'
        self.config_file = self.config_dir / 'hosts.yml'
        self.vars_file = self.config_dir / 'group_vars/variables.yml'

        # Make sure the configs directory is present
        self.configs_dir.mkdir(parents=True, exist_ok=True)

        # Installation directory depends on the OS
        self.install_dir = (self.real_home / "r1setup") if self.os_type == "macos" else Path("/opt/r1setup")

        # Export the Ansible environment variables
        self._setup_ansible_env()

        # Empty inventory skeleton
        gpu_group = {'hosts': {}}
        self.inventory = {
            'all': {
                'vars': {},
                'children': {'gpu_nodes': gpu_group},
            }
        }

        # Component instances (order matters)
        self.version_manager = VersionManager(self)
        self.config_manager = ConfigurationManager(self)
        self.status_tracker = NodeStatusTracker(self)
        self.deployment_service = DeploymentService(self)
        self.migration_planner = MigrationPlanner(self)
        self.settings_manager = SettingsManager(self)
        self.ssh_key_manager = SSHKeyManager(self)
        self.settings_manager.load_settings()

        # Load or initialize the active configuration
        self.config_manager._load_active_config()

    # -- Backward-compat property for active_config --
    @property
    def active_config(self):
        return self.config_manager.active_config

    @active_config.setter
    def active_config(self, value):
        self.config_manager.active_config = value

    def _detect_os(self) -> str:
        """Detect the operating system"""
        os_name = os.uname().sysname
        if os_name == "Darwin":
            return "macos"
        elif os_name == "Linux":
            return "linux"
        else:
            self.print_colored(f"Unsupported OS: {os_name}", 'red')
            sys.exit(1)

    def _setup_ansible_env(self):
        """Set up Ansible environment variables"""
        os.environ['ANSIBLE_CONFIG'] = str(self.ansible_config_root / 'ansible.cfg')
        os.environ['ANSIBLE_COLLECTIONS_PATH'] = str(self.ansible_config_root / 'collections')
        os.environ['ANSIBLE_HOME'] = str(self.ansible_config_root)

    def wait_for_enter(self, message: str = "Press Enter to continue...") -> None:
        try:
            sys.stdout.flush()
            sys.stderr.flush()
        except Exception:
            pass
        input(f"\n{message}")

    def print_colored(self, text: str, color: str = 'white', bold: bool = False, end: str = '\n') -> None:
        """Print colored text"""
        color_code = self.colors.get(color, self.colors['white'])
        if bold:
            color_code = '\033[1m' + color_code
        print(f"{color_code}{text}{self.colors['end']}", end=end)

    def print_debug(self, text: str, color: str = 'cyan') -> None:
        """Emit a ``[DEBUG]``-prefixed line, but only while DEBUG is enabled."""
        if not DEBUG:
            return
        self.print_colored(f"[DEBUG] {text}", color)

    def clear_screen(self) -> None:
        """Clear the terminal unless explicit no-clear dev mode is enabled."""
        if os.environ.get('R1SETUP_NO_CLEAR'):
            return
        if os.name == 'nt':
            os.system('cls')
        else:
            print("\033[2J\033[H", end="")

    def print_header(self, title: str) -> None:
        """Print a formatted header"""
        self.clear_screen()
        self.print_colored("=" * 60, 'cyan')
        self.print_colored(f" {title.center(58)} ", 'cyan', bold=True)
        self.print_colored("=" * 60, 'cyan')

    def print_section(self, title: str) -> None:
        """Print a section header"""
        self.print_colored(f"\n{title}", 'yellow', bold=True)
        self.print_colored("-" * len(title), 'yellow')

    def _print_cancellation_guidance(self) -> None:
        """Show recovery guidance after a keyboard interrupt when saved migration state exists."""
        plan = copy.deepcopy(self.active_config.get('migration_plan_state') or {})
        if not plan:
            return
        status = str(plan.get('status') or 'unknown')
        if status == 'executing':
            instance_name = str(plan.get('instance_name') or '?')
            last_step = str(plan.get('last_step') or 'started')
            self.print_colored(
                f"Saved migration plan for '{instance_name}' remains in 'executing' state (last step: {last_step}).",
                'yellow',
            )
            self.print_colored(
                "Reopen Deployment Menu -> Rollback Migration to recover source ownership, or review the saved plan before retrying execution.",
                'yellow',
            )
        elif status in {'rolling_back', 'finalizing'}:
            self.print_colored(
                f"Saved migration plan remains in '{status}' state. Reopen Deployment Menu to review the saved plan before continuing.",
                'yellow',
            )

    def get_input(self, prompt: str, default: str = '', required: bool = False) -> str:
        """Prompt for a line of text and return it stripped of whitespace.

        Args:
            prompt: Text shown before the colon.
            default: Value returned when the user enters nothing; also shown
                in brackets after the prompt when non-empty.
            required: When true, keep asking until a non-empty value results.

        Returns:
            The stripped user input, or *default* when the input is empty.

        Exits the process with status 0 (after printing cancellation guidance)
        when the user presses Ctrl-C or stdin reaches EOF.
        """
        # The suffix does not change between retries, so build it once.
        default_str = f" [{default}]" if default else ""
        while True:
            self.print_colored(f"{prompt}{default_str}: ", 'blue', end='')
            try:
                value = input().strip() or default
                if required and not value:
                    self.print_colored("This field cannot be empty. Please try again.", 'red')
                    continue
                return value
            except (EOFError, KeyboardInterrupt):
                # EOF (Ctrl-D / closed stdin) is treated like Ctrl-C, matching
                # get_secure_input; otherwise it would surface as a traceback.
                self.print_colored("\nOperation cancelled by user.", 'yellow')
                self._print_cancellation_guidance()
                sys.exit(0)

    def get_secure_input(self, prompt: str) -> str:
        """Read a secret without echo via ``getpass``; exit 0 on Ctrl-C or EOF."""
        # getpass receives the full prompt itself to avoid display issues.
        full_prompt = f"{prompt}: "
        try:
            secret = getpass.getpass(full_prompt)
        except (EOFError, KeyboardInterrupt):
            self.print_colored("\nOperation cancelled by user.", 'yellow')
            self._print_cancellation_guidance()
            sys.exit(0)
        return secret

    def validate_ip(self, ip: str) -> bool:
        """Return True when *ip* is a dotted-quad IPv4 address.

        The whole string must consist of exactly four groups of one to three
        ASCII digits separated by dots, each group in the range 0-255.
        ``fullmatch`` is used because ``$`` would also match just before a
        trailing newline, and ``[0-9]`` because ``\\d`` accepts non-ASCII digits.
        """
        if not re.fullmatch(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', ip):
            return False
        return all(0 <= int(octet) <= 255 for octet in ip.split('.'))

    def _validate_ssh_key_file(self, key_path: str) -> Dict[str, Any]:
        """Validate SSH private key file exists and is readable"""
        try:
            # Check if file exists
            if not os.path.exists(key_path):
                return {
                    'valid': False,
                    'error': f"File does not exist"
                }
            
            # Check if it's a file (not a directory)
            if not os.path.isfile(key_path):
                return {
                    'valid': False,
                    'error': f"Path is not a file"
                }
            
            # Check if file is readable
            if not os.access(key_path, os.R_OK):
                return {
                    'valid': False,
                    'error': f"File is not readable (check permissions)"
                }
            
            # Check file size (empty files are invalid)
            if os.path.getsize(key_path) == 0:
                return {
                    'valid': False,
                    'error': f"File is empty"
                }
            
            # Optional: Basic content validation for SSH key format
            try:
                with open(key_path, 'r') as f:
                    first_line = f.readline().strip()
                    # Check for common SSH private key headers
                    valid_headers = [
                        '-----BEGIN RSA PRIVATE KEY-----',
                        '-----BEGIN DSA PRIVATE KEY-----', 
                        '-----BEGIN EC PRIVATE KEY-----',
                        '-----BEGIN OPENSSH PRIVATE KEY-----',
                        '-----BEGIN PRIVATE KEY-----'
                    ]
                    
                    if not any(first_line.startswith(header) for header in valid_headers):
                        self.print_colored(f"Warning: File may not be a valid SSH private key format", 'yellow')
                        self.print_colored(f"Expected headers: RSA, DSA, EC, or OpenSSH format", 'yellow')
                        # Don't fail validation, just warn
            except (UnicodeDecodeError, IOError):
                # If we can't read as text, it might be a binary key format - that's okay
                pass
            
            return {
                'valid': True,
                'error': None
            }
            
        except Exception as e:
            return {
                'valid': False,
                'error': f"Unexpected error: {str(e)}"
            }

    def _get_valid_hostname(self, prompt: str, default: str = "") -> str:
        """Get a valid hostname with length and character restrictions"""
        self.print_colored("\n📝 Hostname Requirements:", 'cyan')
        self.print_colored("   • Maximum 15 characters", 'white')
        self.print_colored("   • Only letters (a-z, A-Z), numbers (0-9), hyphens (-), underscores (_)", 'white')
        self.print_colored("   • Cannot be empty", 'white')

        while True:
            hostname = self.get_input(prompt, default, required=True).strip()

            # Check if empty
            if not hostname:
                self.print_colored("Hostname cannot be empty. Please try again.", 'red')
                continue

            # Check character restrictions
            if not re.match(r'^[a-zA-Z0-9_-]+$', hostname):
                self.print_colored("Invalid characters in hostname.", 'red')
                self.print_colored("Only letters, numbers, hyphens (-), and underscores (_) are allowed.", 'red')
                continue

            # Check length
            if len(hostname) <= 15:
                return hostname

            # Hostname is too long, suggest shortened version
            shortened = hostname[:15]
            self.print_colored(f"Hostname '{hostname}' is too long ({len(hostname)} characters, max 15).", 'red')
            self.print_colored(f"Suggested shortened version: '{shortened}'", 'yellow')

            choice = self.get_input("Options:\n  1) Use shortened version\n  2) Enter a different name\nSelect option (1/2)", "1")

            if choice == '1':
                return shortened
            elif choice == '2':
                continue  # Ask for hostname again
            else:
                self.print_colored("Invalid choice. Please select 1 or 2.", 'red')
                continue

    def run_command(self, cmd: str, show_output: bool = True, shell: bool = True, timeout: int = None) -> tuple:
        """Run *cmd* and return ``(succeeded, output)``.

        Three execution modes, selected by the arguments:

        * ``show_output`` true: announce the command, run it with stderr merged
          into stdout, echo the captured output and return it.
        * ``show_output`` false with a truthy ``timeout``: same merged capture,
          nothing echoed.
        * otherwise: ``subprocess.run`` with separate capture; only stdout is
          returned.

        On timeout in the first two modes the child is killed and whatever
        output can be collected within five more seconds is returned with a
        False status. Any other exception is reported and returned as
        ``(False, str(error))``.
        """
        timeout_notice = f"Command timed out after {timeout} seconds"
        try:
            if show_output or timeout:
                if show_output:
                    self.print_colored(f"Running: {cmd}", 'cyan')
                child = subprocess.Popen(
                    cmd,
                    shell=shell,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True
                )
                try:
                    captured, _ = child.communicate(timeout=timeout)
                except subprocess.TimeoutExpired:
                    child.kill()
                    try:
                        # Give the killed child a short window to flush its output.
                        captured, _ = child.communicate(timeout=5)
                    except subprocess.TimeoutExpired:
                        self.print_colored(f"{timeout_notice}: {cmd}", 'red')
                        return False, timeout_notice
                    if show_output and captured:
                        print(captured, end='')
                    self.print_colored(f"{timeout_notice} but captured partial output", 'yellow')
                    partial = (captured or '') if show_output else captured
                    return False, partial

                if show_output:
                    if captured:
                        print(captured, end='')
                    return child.returncode == 0, captured or ''
                return child.returncode == 0, captured

            # Quiet, untimed case: plain subprocess.run.
            completed = subprocess.run(
                cmd,
                shell=shell,
                capture_output=True,
                text=True,
                check=False,
                timeout=timeout
            )
            return completed.returncode == 0, completed.stdout
        except subprocess.TimeoutExpired:
            self.print_colored(f"{timeout_notice}: {cmd}", 'red')
            return False, timeout_notice
        except Exception as e:
            self.print_colored(f"Error running command: {e}", 'red')
            return False, str(e)

    def check_ansible_installation(self) -> bool:
        """Return True when Ansible and the required collection are available.

        Runs ``ansible --version`` and then ``ansible-galaxy collection list``,
        looking for ``ratio1.multi_node_launcher`` in the output. Failures are
        reported via ``print_colored`` and yield False.

        ANSIBLE_CONFIG, ANSIBLE_COLLECTIONS_PATH and ANSIBLE_HOME are passed
        explicitly on the command line when they are set. Values are
        shell-quoted so paths containing spaces survive, and unset variables
        are skipped instead of raising ``KeyError``.
        """
        success, _ = self.run_command("ansible --version", show_output=False)
        if not success:
            self.print_colored("Ansible is not installed or not accessible!", 'red')
            return False

        # Check if collection is installed
        env_prefix = ' '.join(
            f"{name}={shlex.quote(os.environ[name])}"
            for name in ('ANSIBLE_CONFIG', 'ANSIBLE_COLLECTIONS_PATH', 'ANSIBLE_HOME')
            if name in os.environ
        )
        galaxy_cmd = "ansible-galaxy collection list"
        if env_prefix:
            galaxy_cmd = f"{env_prefix} {galaxy_cmd}"
        success, output = self.run_command(galaxy_cmd, show_output=False)

        if not success or "ratio1.multi_node_launcher" not in output:
            self.print_colored("Required Ansible collection is not installed!", 'red')
            return False

        return True

    def check_hosts_config(self) -> bool:
        """Return True when the hosts file parses and yields at least one host.

        False is returned when the file is missing, empty, unparseable, lacks a
        top-level ``all`` key, or when ``_get_gpu_hosts`` finds no hosts in it.
        """
        if not self.config_file.exists():
            return False
        if self.config_file.stat().st_size == 0:
            return False

        try:
            with open(self.config_file) as stream:
                parsed = yaml.safe_load(stream)
                if parsed and 'all' in parsed:
                    return len(_get_gpu_hosts(parsed)) > 0
                return False
        except Exception:
            # Any read/parse problem counts as "no usable configuration".
            return False

    def has_active_config_shell(self) -> bool:
        """Tell whether a configuration shell exists, even one with zero hosts.

        True when ``check_hosts_config()`` succeeds, or when the config manager
        has an active ``config_name`` whose ``<name>.yml`` file exists in the
        configs directory. ``check_hosts_config()`` semantics are untouched.
        """
        if self.check_hosts_config():
            return True

        active_name = self.config_manager.active_config.get('config_name')
        if active_name:
            shell_file = self.config_manager.app.configs_dir / f"{active_name}.yml"
            return shell_file.exists()
        return False

    def has_fleet_machines(self) -> bool:
        """Return True when fleet state lists at least one machine."""
        state = self.config_manager.fleet_state or {}
        registered = state.get('fleet', {}).get('machines', {})
        return bool(registered)

    def get_fleet_machines_as_hosts(self) -> dict:
        """Return a shallow copy of the fleet-state ``machines`` mapping.

        The result is intended to stand in for inventory hosts in
        machine-level operations (SSH, logs, key management); an empty dict is
        returned when no fleet state or machines are recorded.
        """
        state = self.config_manager.fleet_state or {}
        fleet_section = state.get('fleet', {})
        machines = fleet_section.get('machines', {})
        return dict(machines)

    def _get_operation_log_dir(self) -> Path:
        """Return the local operation-log directory."""
        return self.r1_setup_dir / 'logs'

    def _get_operation_log_path(self) -> Path:
        """Return the active local operation-log path."""
        return self._get_operation_log_dir() / 'operations.log'

    def _rotate_operation_log_if_needed(self, max_bytes: int = 1_000_000, retention_days: int = 30) -> None:
        """Rotate and clean up the local operation log when it grows too large."""
        log_dir = self._get_operation_log_dir()
        log_dir.mkdir(parents=True, exist_ok=True)
        try:
            os.chmod(log_dir, 0o700)
        except OSError:
            pass

        log_path = self._get_operation_log_path()
        if log_path.exists() and log_path.stat().st_size >= max_bytes:
            rotated = log_dir / f"operations_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
            log_path.rename(rotated)
            try:
                os.chmod(rotated, 0o600)
            except OSError:
                pass

        cutoff = datetime.now().timestamp() - (retention_days * 86400)
        for rotated_log in log_dir.glob("operations_*.log"):
            try:
                if rotated_log.stat().st_mtime < cutoff:
                    rotated_log.unlink()
            except OSError:
                continue

    def log_operation_event(
        self,
        operation_type: str,
        status: str,
        details: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Append one JSON line describing an operation to the local log.

        The log is rotated first if needed. Each record carries ``logged_at``
        (local ISO timestamp), ``operation_type``, ``status`` and ``details``
        (an empty dict when none are given). The file mode is set to 0600 on a
        best-effort basis after writing.
        """
        self._rotate_operation_log_if_needed()
        destination = self._get_operation_log_path()
        record = dict(
            logged_at=datetime.now().isoformat(),
            operation_type=operation_type,
            status=status,
            details=details or {},
        )
        serialized = json.dumps(record, sort_keys=True)
        with open(destination, 'a') as log_file:
            log_file.write(serialized + "\n")
        try:
            os.chmod(destination, 0o600)
        except OSError:
            pass

    # -- ConfigurationManager delegation stubs --
    def _load_active_config(self):
        """Delegate to ``self.config_manager._load_active_config()`` and return its result."""
        return self.config_manager._load_active_config()

    def _save_active_config(self):
        """Delegate to ``self.config_manager._save_active_config()`` and return its result."""
        return self.config_manager._save_active_config()

    def _prompt_new_config_name(self):
        """Delegate to ``self.config_manager._prompt_new_config_name()`` and return its result."""
        return self.config_manager._prompt_new_config_name()

    def _prompt_new_config_environment(self):
        """Delegate to ``self.config_manager._prompt_new_config_environment()`` and return its result."""
        return self.config_manager._prompt_new_config_environment()

    def _reset_inventory_for_new_config(self):
        """Delegate to ``self.config_manager._reset_inventory_for_new_config()`` and return its result."""
        return self.config_manager._reset_inventory_for_new_config()

    def _create_machine_first_configuration(self):
        """Delegate to ``self.config_manager._create_machine_first_configuration()`` and return its result."""
        return self.config_manager._create_machine_first_configuration()

    def _generate_config_name(self, nodes_count, custom_name=None, *, unit='n'):
        """Delegate to ``self.config_manager._generate_config_name``.

        ``nodes_count`` and ``custom_name`` are forwarded positionally and
        ``unit`` as a keyword; the manager's result is returned unchanged.
        """
        return self.config_manager._generate_config_name(nodes_count, custom_name, unit=unit)

    def _list_available_configs(self):
        """Delegate to ``self.config_manager._list_available_configs()`` and return its result."""
        return self.config_manager._list_available_configs()

    def _save_config_with_metadata(self, config_name, environment, nodes_count, update_symlink=True):
        """Delegate to ``self.config_manager._save_config_with_metadata``.

        All four arguments are forwarded positionally in the same order; the
        manager's result is returned unchanged.
        """
        return self.config_manager._save_config_with_metadata(config_name, environment, nodes_count, update_symlink)

    def _update_hosts_symlink(self, config_path):
        """Delegate to ``self.config_manager._update_hosts_symlink(config_path)`` and return its result."""
        return self.config_manager._update_hosts_symlink(config_path)

    def _load_config_by_name(self, config_name):
        """Delegate to ``self.config_manager._load_config_by_name(config_name)`` and return its result."""
        return self.config_manager._load_config_by_name(config_name)

    def load_configuration(self):
        """Delegate to ``self.config_manager.load_configuration()`` and return its result."""
        return self.config_manager.load_configuration()

    def get_mnl_app_env(self):
        """Delegate to ``self.config_manager.get_mnl_app_env()`` and return its result."""
        return self.config_manager.get_mnl_app_env()

    def set_mnl_app_env(self, env_value):
        """Delegate to ``self.config_manager.set_mnl_app_env(env_value)`` and return its result."""
        return self.config_manager.set_mnl_app_env(env_value)

    def _save_configuration(self):
        """Delegate to ``self.config_manager._save_configuration()`` and return its result."""
        return self.config_manager._save_configuration()

    def build_fleet_state(self, inventory=None):
        """Delegate to ``self.config_manager.build_fleet_state(inventory)`` and return its result."""
        return self.config_manager.build_fleet_state(inventory)

    def build_machine_group_views(self, inventory=None, fleet_state=None, node_status_data=None):
        """Build machine group views, preferring the attached config manager.

        The attached ``config_manager`` is used when it exists, exposes
        ``build_machine_group_views`` and returns a list. In every other case a
        fresh ``ConfigurationManager(self)`` produces the result.
        """
        view_kwargs = dict(
            inventory=inventory,
            fleet_state=fleet_state,
            node_status_data=node_status_data,
        )

        attached = getattr(self, 'config_manager', None)
        if attached and hasattr(attached, 'build_machine_group_views'):
            views = attached.build_machine_group_views(**view_kwargs)
            if isinstance(views, list):
                return views

        return ConfigurationManager(self).build_machine_group_views(**view_kwargs)

    def build_execution_inventory(self, host_names, dedupe_by_machine=False, inventory=None):
        """Delegate to ``self.config_manager.build_execution_inventory``.

        ``host_names`` is forwarded positionally; ``inventory`` and
        ``dedupe_by_machine`` are forwarded as keywords.
        """
        return self.config_manager.build_execution_inventory(
            host_names,
            inventory=inventory,
            dedupe_by_machine=dedupe_by_machine,
        )

    def build_registered_machine_execution_inventory(self, machine_ids, fleet_state=None):
        """Delegate to ``self.config_manager.build_registered_machine_execution_inventory``.

        ``machine_ids`` is forwarded positionally and ``fleet_state`` as a keyword.
        """
        return self.config_manager.build_registered_machine_execution_inventory(
            machine_ids,
            fleet_state=fleet_state,
        )

    def group_host_names_by_machine(self, selected_host_names, inventory=None):
        """Group host names by machine via the config manager.

        ``self.inventory`` is used only when *inventory* is ``None``; an
        explicitly passed empty inventory is forwarded as-is.
        """
        if inventory is None:
            inventory = self.inventory
        return self.config_manager.group_host_names_by_machine(inventory, selected_host_names)

    def get_fleet_state_copy(self):
        """Delegate to ``self.config_manager.get_fleet_state_copy()`` and return its result."""
        return self.config_manager.get_fleet_state_copy()

    def ensure_configuration_shell(self, config_name, environment):
        """Delegate to ``self.config_manager.ensure_configuration_shell(config_name, environment)``."""
        return self.config_manager.ensure_configuration_shell(config_name, environment)

    def upsert_machine_record(self, machine_id, machine_data):
        """Delegate to ``self.config_manager.upsert_machine_record(machine_id, machine_data)``."""
        return self.config_manager.upsert_machine_record(machine_id, machine_data)

    def set_migration_plan_state(self, plan_state):
        """Delegate to ``self.config_manager.set_migration_plan_state(plan_state)``."""
        return self.config_manager.set_migration_plan_state(plan_state)

    def finalize_instance_migration(
        self,
        instance_name,
        target_machine_id,
        target_runtime,
        *,
        runtime_name_policy,
        migration_plan_state=None,
    ):
        """Delegate to ``self.config_manager.finalize_instance_migration``.

        The three positional arguments are forwarded in order;
        ``runtime_name_policy`` (required) and ``migration_plan_state`` are
        forwarded as keywords. The manager's result is returned unchanged.
        """
        return self.config_manager.finalize_instance_migration(
            instance_name,
            target_machine_id,
            target_runtime,
            runtime_name_policy=runtime_name_policy,
            migration_plan_state=migration_plan_state,
        )

    @property
    def fleet_state(self):
        """Fleet state stored on ``self.config_manager`` (read and write pass through)."""
        return self.config_manager.fleet_state

    @fleet_state.setter
    def fleet_state(self, value):
        # Writes go straight to the config manager; no copy is kept here.
        self.config_manager.fleet_state = value

    def manage_configurations_menu(self):
        """Delegate to ``self.config_manager.manage_configurations_menu()`` and return its result."""
        return self.config_manager.manage_configurations_menu()

    def _switch_configuration(self, configs):
        """Delegate to ``self.config_manager._switch_configuration(configs)`` and return its result."""
        return self.config_manager._switch_configuration(configs)

    def _delete_configuration(self, configs):
        """Delegate to ``self.config_manager._delete_configuration(configs)`` and return its result."""
        return self.config_manager._delete_configuration(configs)

    def _rename_configuration(self, configs):
        """Delegate to ``self.config_manager._rename_configuration(configs)`` and return its result."""
        return self.config_manager._rename_configuration(configs)

    def _backup_configuration(self, configs):
        """Delegate to ``self.config_manager._backup_configuration(configs)`` and return its result."""
        return self.config_manager._backup_configuration(configs)

    def _restore_configuration(self):
        """Delegate to ``self.config_manager._restore_configuration()`` and return its result."""
        return self.config_manager._restore_configuration()

    def _export_configuration(self, configs):
        """Delegate to ``self.config_manager._export_configuration(configs)`` and return its result."""
        return self.config_manager._export_configuration(configs)

    def _import_configuration(self):
        """Delegate to ``self.config_manager._import_configuration()`` and return its result."""
        return self.config_manager._import_configuration()

    def _quick_export_current(self):
        """Delegate to ``self.config_manager._quick_export_current()`` and return its result."""
        return self.config_manager._quick_export_current()

    # -- NodeStatusTracker delegation stubs --
    def _should_preserve_node_status(self, node_name):
        """Delegate to ``self.status_tracker._should_preserve_node_status(node_name)``."""
        return self.status_tracker._should_preserve_node_status(node_name)

    def _update_node_status(self, node_name, status):
        """Delegate to ``self.status_tracker._update_node_status(node_name, status)``."""
        return self.status_tracker._update_node_status(node_name, status)

    def _get_node_status_info(self, node_name):
        """Delegate to ``self.status_tracker._get_node_status_info(node_name)``."""
        return self.status_tracker._get_node_status_info(node_name)

    def _get_real_time_node_status(self):
        """Delegate to ``self.status_tracker._get_real_time_node_status()`` and return its result."""
        return self.status_tracker._get_real_time_node_status()

    # -- SSHKeyManager delegation stubs --
    def migrate_legacy_ssh_metadata(self):
        """Delegate to ``self.ssh_key_manager.migrate_legacy_ssh_metadata()`` and return its result."""
        return self.ssh_key_manager.migrate_legacy_ssh_metadata()

    def check_ssh_key_tooling(self):
        """Delegate to ``self.ssh_key_manager.check_ssh_key_tooling()`` and return its result."""
        return self.ssh_key_manager.check_ssh_key_tooling()

    def ssh_key_management_menu(self):
        """Delegate to ``self.ssh_key_manager.ssh_key_management_menu()`` and return its result."""
        return self.ssh_key_manager.ssh_key_management_menu()

    def install_key_and_migrate_hosts(self):
        """Delegate to ``self.ssh_key_manager.install_key_and_migrate_hosts()`` and return its result."""
        return self.ssh_key_manager.install_key_and_migrate_hosts()

    def add_extra_public_keys(self):
        """Delegate to ``self.ssh_key_manager.add_extra_public_keys()`` and return its result."""
        return self.ssh_key_manager.add_extra_public_keys()

    def validate_key_authentication(self):
        """Delegate to ``self.ssh_key_manager.validate_key_authentication()`` and return its result."""
        return self.ssh_key_manager.validate_key_authentication()

    def disable_password_authentication(self):
        """Delegate to ``self.ssh_key_manager.disable_password_authentication()`` and return its result."""
        return self.ssh_key_manager.disable_password_authentication()

    def show_ssh_auth_status(self):
        """Delegate to ``self.ssh_key_manager.show_ssh_auth_status()`` and return its result."""
        return self.ssh_key_manager.show_ssh_auth_status()

    def _get_status_display_info(self, status):
        """Delegate to ``self.status_tracker._get_status_display_info(status)``."""
        return self.status_tracker._get_status_display_info(status)

    def _display_node_status(self, node_name, compact=False):
        """Delegate to ``self.status_tracker._display_node_status(node_name, compact)``."""
        return self.status_tracker._display_node_status(node_name, compact)

    def check_and_update_node_status(self):
        """Delegate to ``self.status_tracker.check_and_update_node_status()`` and return its result."""
        return self.status_tracker.check_and_update_node_status()

    def _parse_service_status_output(self, output):
        """Delegate to ``self.status_tracker._parse_service_status_output(output)``."""
        return self.status_tracker._parse_service_status_output(output)

    def _determine_updated_status(self, current_status, actual_status):
        """Delegate to ``self.status_tracker._determine_updated_status(current_status, actual_status)``."""
        return self.status_tracker._determine_updated_status(current_status, actual_status)

    # -- DeploymentService and migration planner delegation stubs --
    def deploy_full(self):
        """Delegate to ``self.deployment_service.deploy_full()`` and return its result."""
        return self.deployment_service.deploy_full()

    def deploy_docker_only(self):
        """Delegate to ``self.deployment_service.deploy_docker_only()`` and return its result."""
        return self.deployment_service.deploy_docker_only()

    def migrate_install_tracking(self):
        """Delegate to ``self.deployment_service.migrate_install_tracking()`` and return its result."""
        return self.deployment_service.migrate_install_tracking()

    def prepare_registered_machines(self, skip_gpu=False):
        """Delegate to ``self.deployment_service.prepare_registered_machines``, forwarding ``skip_gpu`` as a keyword."""
        return self.deployment_service.prepare_registered_machines(skip_gpu=skip_gpu)

    def plan_instance_migration(self):
        """Delegate to ``self.migration_planner.plan_instance_migration()`` and return its result."""
        return self.migration_planner.plan_instance_migration()

    def execute_saved_migration_plan(self):
        """Delegate to ``self.migration_planner.execute_saved_migration_plan()`` and return its result."""
        return self.migration_planner.execute_saved_migration_plan()

    def rollback_saved_migration_plan(self):
        """Delegate to ``self.migration_planner.rollback_saved_migration_plan()`` and return its result."""
        return self.migration_planner.rollback_saved_migration_plan()

    def finalize_saved_migration_plan(self):
        """Delegate to ``self.migration_planner.finalize_saved_migration_plan()`` and return its result."""
        return self.migration_planner.finalize_saved_migration_plan()

    def delete_edge_node(self):
        """Delegate to ``self.deployment_service.delete_edge_node()`` and return its result."""
        return self.deployment_service.delete_edge_node()

    def deployment_status(self):
        """Delegate to ``self.deployment_service.deployment_status()`` and return its result."""
        return self.deployment_service.deployment_status()

    # -- SettingsManager delegation stub --
    def settings_menu(self):
        """Delegate to ``self.settings_manager.settings_menu()`` and return its result."""
        return self.settings_manager.settings_menu()

    @property
    def connection_timeout(self) -> int:
        """Read-only view of ``self.settings_manager.connection_timeout``."""
        return self.settings_manager.connection_timeout

    @property
    def ssh_connect_timeout(self) -> int:
        """Read-only view of ``self.settings_manager.ssh_connect_timeout``."""
        return self.settings_manager.ssh_connect_timeout

    # -- Service override helpers --

    def _get_service_overrides(self) -> dict:
        """Return the current service_overrides dict from active_config (may be empty)."""
        return dict(self.active_config.get('service_overrides', {}))

    def _save_service_overrides(self, overrides: dict) -> None:
        """Persist *overrides* into the active config metadata JSON and in-memory dict."""
        if overrides:
            self.config_manager.active_config['service_overrides'] = overrides
        else:
            self.config_manager.active_config.pop('service_overrides', None)
        self.config_manager._save_active_config()

    def get_mnl_service_version(self) -> str:
        """Delegate to ``self.config_manager.get_mnl_service_version()`` and return its result."""
        return self.config_manager.get_mnl_service_version()

    def get_collection_version(self) -> str:
        """Return the collection version, preferring the version manager.

        The version manager's answer is used (stripped) when it is truthy and
        not ``unknown`` (case-insensitive). If it is unusable or the lookup
        raises, the config manager's value is returned instead.
        """
        try:
            reported = self.version_manager._get_current_collection_version()
            if reported:
                cleaned = str(reported).strip()
                if cleaned.lower() != 'unknown':
                    return cleaned
        except Exception as e:
            self.print_debug(f"Unable to get collection version from version manager: {e}")
        return self.config_manager.get_collection_version()

    def get_host_service_file_version(self, host_config: Dict[str, Any]) -> str:
        """Delegate to ``self.config_manager.get_host_service_file_version(host_config)``."""
        return self.config_manager.get_host_service_file_version(host_config)

    def record_service_file_version(self, host_names: List[str], service_version: Optional[str] = None) -> None:
        """Delegate to ``self.config_manager.record_service_file_version(host_names, service_version)``."""
        return self.config_manager.record_service_file_version(host_names, service_version)

    def record_service_file_versions(self, host_versions: Dict[str, str]) -> None:
        """Delegate to ``self.config_manager.record_service_file_versions(host_versions)``."""
        return self.config_manager.record_service_file_versions(host_versions)

    def record_install_attempt(
        self,
        host_names: List[str],
        variant: str,
        driver_owner: str,
        result: str,
    ) -> None:
        """Delegate to ``self.config_manager.record_install_attempt`` with the same four arguments."""
        return self.config_manager.record_install_attempt(host_names, variant, driver_owner, result)

    def record_install_success(
        self,
        host_names: List[str],
        variant: str,
        driver_owner: str,
    ) -> None:
        """Delegate to ``self.config_manager.record_install_success`` with the same three arguments."""
        return self.config_manager.record_install_success(host_names, variant, driver_owner)

    @staticmethod
    def read_fetched_metadata(
        host_names: List[str],
        fetched_dir: Optional[Path] = None,
    ) -> Dict[str, Dict[str, Any]]:
        """Delegate to ``ConfigurationManager._read_fetched_metadata(host_names, fetched_dir)``."""
        return ConfigurationManager._read_fetched_metadata(host_names, fetched_dir)

    def detect_helper_mode_conflicts(
        self,
        selected_host_names: Optional[List[str]] = None,
    ) -> Dict[str, Dict[str, Any]]:
        """Delegate to ``self.config_manager.detect_helper_mode_conflicts``.

        ``self.inventory`` is passed as the first argument and
        ``selected_host_names`` is forwarded as a keyword.
        """
        return self.config_manager.detect_helper_mode_conflicts(
            self.inventory,
            selected_host_names=selected_host_names,
        )

    def _build_host_helper_runtime(self, host_name: str, host_config: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate to ``self.config_manager.build_helper_runtime(host_name, host_config)``."""
        return self.config_manager.build_helper_runtime(host_name, host_config)

    def _build_remote_helper_command(
        self,
        host_name: str,
        host_config: Dict[str, Any],
        action: str,
        *args: str,
    ) -> str:
        """Resolve a host-aware remote helper command."""
        helper_runtime = self._build_host_helper_runtime(host_name, host_config)
        remote_commands = helper_runtime.get('remote_commands') or {}
        base_command = remote_commands.get(action)
        if not base_command:
            raise ValueError(f"Unsupported helper action: {action}")

        extra_args = [str(arg) for arg in args if str(arg).strip()]
        if not extra_args:
            return base_command
        return f"{base_command} {' '.join(shlex.quote(arg) for arg in extra_args)}"

    def _build_node_ssh_command(
        self,
        node_name: str,
        node_config: Dict[str, Any],
        remote_command: str,
    ) -> List[str]:
        """Build an SSH command list for a node-aware remote command."""
        host = node_config.get('ansible_host', '')
        user = node_config.get('ansible_user', '')
        if not host or not user:
            raise ValueError(f"Node '{node_name}' is missing ansible_host or ansible_user")

        ssh_cmd = ['ssh']
        ssh_port = node_config.get('ansible_port')
        if ssh_port and str(ssh_port) != '22':
            ssh_cmd.extend(['-p', str(ssh_port)])

        if 'ansible_ssh_private_key_file' in node_config:
            key_file = node_config['ansible_ssh_private_key_file']
            if str(key_file).startswith('~'):
                key_file = os.path.expanduser(key_file)
            ssh_cmd.extend(['-i', str(key_file)])

        ssh_cmd.extend([f"{user}@{host}", remote_command])
        if 'ansible_ssh_pass' in node_config:
            return ['sshpass', '-p', node_config['ansible_ssh_pass']] + ssh_cmd
        return ssh_cmd

    def _ensure_helper_mode_supported_for_hosts(
        self,
        selected_host_names: List[str],
        *,
        action_label: str,
    ) -> bool:
        """Reject unsupported mixed helper semantics on the same machine."""
        conflicts = self.detect_helper_mode_conflicts(selected_host_names)
        if not conflicts:
            return True

        self.print_colored(
            f"Unsupported helper-mode mix detected while preparing to {action_label}.",
            'red',
            bold=True,
        )
        for machine_id in sorted(conflicts.keys()):
            detail = conflicts[machine_id]
            mode_parts = []
            for helper_mode, host_names in sorted(detail['helper_modes'].items()):
                mode_parts.append(f"{helper_mode}: {', '.join(sorted(host_names))}")
            self.print_colored(
                f"  • {machine_id}: {' | '.join(mode_parts)}",
                'yellow',
            )

        self.print_colored(
            "A single machine cannot mix standard global helpers and expert dispatcher helpers.",
            'yellow',
        )
        self.print_colored(
            "Keep the machine in one helper mode or migrate to expert semantics explicitly before proceeding.",
            'yellow',
        )
        return False

    def _build_runtime_metadata_extra_vars(
        self,
        last_applied_action: str,
        extra_vars: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Return a copy of *extra_vars* extended with runtime metadata.

        Adds ``r1setup_cli_version``, ``r1setup_collection_version`` and
        ``r1setup_last_applied_action``, overriding same-named keys in the
        input. The caller's dict is not modified.
        """
        runtime_metadata = {
            'r1setup_cli_version': CLI_VERSION,
            'r1setup_collection_version': self.get_collection_version(),
            'r1setup_last_applied_action': last_applied_action,
        }
        merged = dict(extra_vars or {})
        for key, value in runtime_metadata.items():
            merged[key] = value
        return merged

    @staticmethod
    def build_machine_extra_vars(extra_vars: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Attach machine-scope execution metadata to an extra-vars payload."""
        payload = dict(extra_vars or {})
        payload['r1setup_execution_scope'] = 'machine'
        return payload

    def build_instance_extra_vars(
        self,
        last_applied_action: str,
        extra_vars: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Return runtime-metadata extra-vars with ``r1setup_execution_scope`` set to ``'instance'``."""
        scoped = self._build_runtime_metadata_extra_vars(last_applied_action, extra_vars)
        scoped.update(r1setup_execution_scope='instance')
        return scoped

    @staticmethod
    def _append_ansible_extra_vars(cmd: str, extra_vars: Optional[Dict[str, Any]] = None) -> str:
        """Append JSON-encoded extra-vars to an ansible-playbook command."""
        if not extra_vars:
            return cmd
        return f"{cmd} --extra-vars '{json.dumps(extra_vars)}'"

    @staticmethod
    def _get_execution_inventory_host_names(execution_inventory: Dict[str, Any]) -> List[str]:
        """Return the keys of ``_get_gpu_hosts(execution_inventory)`` as a list."""
        return list(_get_gpu_hosts(execution_inventory).keys())

    def _write_execution_inventory(
        self,
        host_names: List[str],
        *,
        dedupe_by_machine: bool = False,
    ) -> Tuple[Path, Dict[str, Any]]:
        """Build an execution inventory for *host_names* and dump it to a temp YAML file.

        Returns ``(path, inventory)``. The file is created with
        ``delete=False``, so removing it is left to the caller.
        """
        generated = self.build_execution_inventory(host_names, dedupe_by_machine=dedupe_by_machine)
        tmp_options = dict(
            mode='w',
            suffix='.yml',
            prefix='r1setup_exec_inventory_',
            delete=False,
        )
        with tempfile.NamedTemporaryFile(**tmp_options) as tmp_file:
            yaml.safe_dump(generated, tmp_file, sort_keys=False)
            inventory_path = Path(tmp_file.name)
        return inventory_path, generated

    def _write_registered_machine_execution_inventory(
        self,
        machine_ids: List[str],
        *,
        fleet_state: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Path, Dict[str, Any]]:
        """Write a generated machine-only execution inventory to a temp file.

        Args:
            machine_ids: Machines passed through to
                ``build_registered_machine_execution_inventory``.
            fleet_state: Forwarded to the same builder.

        Returns:
            ``(temp_path, execution_inventory)``. The file is created with
            ``delete=False``, so the caller is responsible for removing it.

        Raises:
            Whatever ``yaml.safe_dump`` or the file write raises; in that case
            the partially written temp file is removed before re-raising.
        """
        execution_inventory = self.build_registered_machine_execution_inventory(
            machine_ids,
            fleet_state=fleet_state,
        )
        handle = tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.yml',
            prefix='r1setup_exec_machine_inventory_',
            delete=False,
        )
        temp_path = Path(handle.name)
        try:
            with handle:
                yaml.safe_dump(execution_inventory, handle, sort_keys=False)
        except BaseException:
            # delete=False means nobody else will clean up a half-written file.
            try:
                temp_path.unlink()
            except OSError:
                pass
            raise
        return temp_path, execution_inventory

    @staticmethod
    def _write_provided_execution_inventory(execution_inventory: Dict[str, Any]) -> Tuple[Path, Dict[str, Any]]:
        """Write a caller-provided execution inventory to a temp file.

        Returns:
            ``(temp_path, execution_inventory)`` where ``execution_inventory`` is
            the same object that was passed in. The file is created with
            ``delete=False``, so the caller is responsible for removing it.

        Raises:
            Whatever ``yaml.safe_dump`` or the file write raises; in that case
            the partially written temp file is removed before re-raising.
        """
        handle = tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.yml',
            prefix='r1setup_exec_custom_inventory_',
            delete=False,
        )
        temp_path = Path(handle.name)
        try:
            with handle:
                yaml.safe_dump(execution_inventory, handle, sort_keys=False)
        except BaseException:
            # delete=False means nobody else will clean up a half-written file.
            try:
                temp_path.unlink()
            except OSError:
                pass
            raise
        return temp_path, execution_inventory

    def run_generated_playbook(
        self,
        playbook_path: Path,
        host_names: List[str],
        *,
        machine_scope: bool = False,
        extra_vars: Optional[Dict[str, Any]] = None,
        last_applied_action: str = 'operation',
        show_output: bool = False,
        timeout: Optional[int] = None,
    ) -> Tuple[bool, str, List[str], Dict[str, Any]]:
        """Run a playbook with a generated per-operation inventory.

        Args:
            playbook_path: Playbook handed to ``ansible-playbook``.
            host_names: Hosts used to generate the temporary inventory.
            machine_scope: When true the inventory is de-duplicated by machine
                and machine-scope extra-vars are attached; otherwise
                instance-scope extra-vars (with ``last_applied_action``) are used.
            extra_vars: Additional extra-vars merged into the payload.
            last_applied_action: Action label for instance-scope metadata.
            show_output: Forwarded to ``run_command``.
            timeout: Forwarded to ``run_command``.

        Returns:
            ``(success, output, execution_host_names, execution_inventory)``.

        Raises:
            KeyError: If ``ANSIBLE_CONFIG``, ``ANSIBLE_COLLECTIONS_PATH`` or
                ``ANSIBLE_HOME`` is missing from the environment.
        """
        inventory_path, execution_inventory = self._write_execution_inventory(
            host_names,
            dedupe_by_machine=machine_scope,
        )
        # Everything after the temp inventory exists runs under try/finally so
        # the file is removed even when command construction fails.
        try:
            execution_host_names = self._get_execution_inventory_host_names(execution_inventory)

            # Environment values are shell-quoted so paths containing spaces or
            # shell metacharacters survive the command string intact.
            env_prefix = ' '.join(
                f"{env_name}={shlex.quote(os.environ[env_name])}"
                for env_name in ('ANSIBLE_CONFIG', 'ANSIBLE_COLLECTIONS_PATH', 'ANSIBLE_HOME')
            )
            cmd = (
                f"{env_prefix} "
                f"ansible-playbook -i {shlex.quote(str(inventory_path))} {shlex.quote(str(playbook_path))}"
            )

            if machine_scope:
                payload = self.build_machine_extra_vars(extra_vars)
            else:
                payload = self.build_instance_extra_vars(last_applied_action, extra_vars)
            cmd = self._append_ansible_extra_vars(cmd, payload)

            success, output = self.run_command(
                cmd,
                show_output=show_output,
                timeout=timeout,
            )
        finally:
            try:
                inventory_path.unlink()
            except FileNotFoundError:
                pass

        return success, output, execution_host_names, execution_inventory

    def run_custom_inventory_playbook(
        self,
        playbook_path: Path,
        execution_inventory: Dict[str, Any],
        *,
        machine_scope: bool = False,
        extra_vars: Optional[Dict[str, Any]] = None,
        last_applied_action: str = 'operation',
        show_output: bool = False,
        timeout: Optional[int] = None,
    ) -> Tuple[bool, str, List[str], Dict[str, Any]]:
        """Run a playbook against an explicit execution inventory.

        Args:
            playbook_path: Playbook handed to ``ansible-playbook``.
            execution_inventory: Inventory written verbatim to a temp file.
            machine_scope: When true machine-scope extra-vars are attached;
                otherwise instance-scope extra-vars (with
                ``last_applied_action``) are used.
            extra_vars: Additional extra-vars merged into the payload.
            last_applied_action: Action label for instance-scope metadata.
            show_output: Forwarded to ``run_command``.
            timeout: Forwarded to ``run_command``.

        Returns:
            ``(success, output, execution_host_names, execution_inventory)``.

        Raises:
            KeyError: If ``ANSIBLE_CONFIG``, ``ANSIBLE_COLLECTIONS_PATH`` or
                ``ANSIBLE_HOME`` is missing from the environment.
        """
        inventory_path, execution_inventory = self._write_provided_execution_inventory(execution_inventory)
        # Everything after the temp inventory exists runs under try/finally so
        # the file is removed even when command construction fails.
        try:
            execution_host_names = self._get_execution_inventory_host_names(execution_inventory)

            # Environment values are shell-quoted so paths containing spaces or
            # shell metacharacters survive the command string intact.
            env_prefix = ' '.join(
                f"{env_name}={shlex.quote(os.environ[env_name])}"
                for env_name in ('ANSIBLE_CONFIG', 'ANSIBLE_COLLECTIONS_PATH', 'ANSIBLE_HOME')
            )
            cmd = (
                f"{env_prefix} "
                f"ansible-playbook -i {shlex.quote(str(inventory_path))} {shlex.quote(str(playbook_path))}"
            )
            if machine_scope:
                payload = self.build_machine_extra_vars(extra_vars)
            else:
                payload = self.build_instance_extra_vars(last_applied_action, extra_vars)
            cmd = self._append_ansible_extra_vars(cmd, payload)

            success, output = self.run_command(
                cmd,
                show_output=show_output,
                timeout=timeout,
            )
        finally:
            try:
                inventory_path.unlink()
            except FileNotFoundError:
                pass

        return success, output, execution_host_names, execution_inventory

    def run_registered_machine_playbook(
        self,
        playbook_path: Path,
        machine_ids: List[str],
        *,
        extra_vars: Optional[Dict[str, Any]] = None,
        show_output: bool = False,
        timeout: Optional[int] = None,
        fleet_state: Optional[Dict[str, Any]] = None,
    ) -> Tuple[bool, str, List[str], Dict[str, Any]]:
        """Run a machine-scope playbook against registered fleet machines.

        Args:
            playbook_path: Playbook handed to ``ansible-playbook``.
            machine_ids: Machines used to generate the temporary inventory.
            extra_vars: Additional extra-vars merged into the machine-scope payload.
            show_output: Forwarded to ``run_command``.
            timeout: Forwarded to ``run_command``.
            fleet_state: Forwarded to the inventory writer.

        Returns:
            ``(success, output, execution_host_names, execution_inventory)``.

        Raises:
            KeyError: If ``ANSIBLE_CONFIG``, ``ANSIBLE_COLLECTIONS_PATH`` or
                ``ANSIBLE_HOME`` is missing from the environment.
        """
        inventory_path, execution_inventory = self._write_registered_machine_execution_inventory(
            machine_ids,
            fleet_state=fleet_state,
        )
        # Everything after the temp inventory exists runs under try/finally so
        # the file is removed even when command construction fails.
        try:
            execution_host_names = self._get_execution_inventory_host_names(execution_inventory)

            # Environment values are shell-quoted so paths containing spaces or
            # shell metacharacters survive the command string intact.
            env_prefix = ' '.join(
                f"{env_name}={shlex.quote(os.environ[env_name])}"
                for env_name in ('ANSIBLE_CONFIG', 'ANSIBLE_COLLECTIONS_PATH', 'ANSIBLE_HOME')
            )
            cmd = (
                f"{env_prefix} "
                f"ansible-playbook -i {shlex.quote(str(inventory_path))} {shlex.quote(str(playbook_path))}"
            )
            cmd = self._append_ansible_extra_vars(cmd, self.build_machine_extra_vars(extra_vars))

            success, output = self.run_command(
                cmd,
                show_output=show_output,
                timeout=timeout,
            )
        finally:
            try:
                inventory_path.unlink()
            except FileNotFoundError:
                pass

        return success, output, execution_host_names, execution_inventory

    def _build_machine_group_display_lines(
        self,
        machine_views: List[Dict[str, Any]],
        *,
        target_service_version: Optional[str] = None,
        include_last_update: bool = False,
    ) -> Tuple[List[Tuple[str, str]], List[str]]:
        """Render grouped machine views as ``(text, color)`` rows for the CLI.

        Args:
            machine_views: One dict per machine, each carrying its instances.
            target_service_version: When set, every instance row is annotated
                with its applied vs. target service-file version.
            include_last_update: When true, a last-update / SSH-auth row is
                emitted under every instance.

        Returns:
            The rows, plus the sorted, de-duplicated names of instances flagged
            ``UPDATE`` (applied version differs from the target).
        """
        rows: List[Tuple[str, str]] = []
        needs_service_update = set()

        def add_untracked_candidates(view: Dict[str, Any]) -> None:
            # Cached discovery hits that are not part of this config.
            if not view.get('untracked_discovered_candidates'):
                return
            rows.append((self._format_discovery_cache_line(view), 'cyan'))
            for found in view.get('untracked_discovered_candidates') or []:
                rows.append((
                    f"        ~ {found.get('service_name', '?')} "
                    f"[DISCOVERED] state={found.get('service_state', 'unknown')} "
                    f"env={found.get('environment', 'unknown')} "
                    f"({found.get('environment_source', 'unknown')})",
                    'white',
                ))

        for view in machine_views:
            title = view.get('display_label', view['machine_id'])
            rows.append((
                f"  • {title}: {view['connection_display']} | "
                f"mode={view['topology_mode']} | state={view['deployment_state']} | "
                f"{view['group_status_emoji']} {view['group_status']}",
                view['group_status_color'],
            ))

            specs = view.get('machine_specs_summary')
            if specs:
                rows.append((f"      specs: {specs}", 'cyan'))

            instances = view['instances']
            if not instances:
                rows.append(("      no assigned instances in this config", 'yellow'))
                add_untracked_candidates(view)
                continue

            if view.get('topology_mode') == 'expert' and len(instances) == 1:
                rows.append((
                    "      expert mode retained with 1 instance; normalize back to standard only via an explicit future action",
                    'yellow',
                ))

            for inst in instances:
                runtime = inst.get('runtime') or {}
                text = (
                    f"      - {inst['status_emoji']} {inst['instance_name']} "
                    f"[{inst['status_label'].upper()}] "
                    f"service={runtime.get('service_name', '?')} "
                    f"container={runtime.get('container_name', '?')}"
                )

                if target_service_version:
                    applied = inst.get('service_file_version') or DEFAULT_SERVICE_FILE_VERSION
                    if not self._is_service_update_candidate_instance(inst):
                        marker = 'N/A'
                    elif applied == target_service_version:
                        marker = 'OK'
                    else:
                        needs_service_update.add(inst['instance_name'])
                        marker = 'UPDATE'
                    text += f" | service {applied} / target {target_service_version} [{marker}]"

                rows.append((text, inst['status_color']))

                if include_last_update:
                    ago = self._format_timestamp_ago(inst.get('last_update', ''))
                    rows.append((
                        f"          last update {ago} | ssh auth {inst.get('ssh_auth_mode', 'unknown')}",
                        'white',
                    ))

            add_untracked_candidates(view)

        return rows, sorted(needs_service_update)

    def _format_discovery_cache_line(self, machine_view: Dict[str, Any]) -> str:
        """Build the heading shown above cached-but-unimported discovery results.

        Includes the age of the last scan when ``last_discovery_scan_at`` is set.
        """
        prefix = "      cached discovery results not imported into this config "
        scanned_at = machine_view.get('last_discovery_scan_at')
        if not scanned_at:
            return prefix + "(refresh via Configuration -> Discover Services):"
        scan_age = self._format_timestamp_ago(scanned_at)
        return f"{prefix}(last scan {scan_age}; refresh via Configuration -> Discover Services):"

    def _print_machine_group_display_lines(self, lines: List[Tuple[str, str]]) -> None:
        """Emit each ``(text, color)`` row through ``print_colored``, in order."""
        for row_text, row_color in lines:
            self.print_colored(row_text, row_color)

    @staticmethod
    def _extract_machine_access_config(host_config: Dict[str, Any]) -> Dict[str, Any]:
        """Pick the machine-level SSH access fields out of a node-style config.

        Only keys actually present in ``host_config`` are copied; all other
        entries are ignored.
        """
        access_fields = (
            'ansible_host',
            'ansible_user',
            'ansible_port',
            'ansible_ssh_common_args',
            'ansible_ssh_pass',
            'ansible_become_password',
            'ansible_ssh_private_key_file',
        )
        return {field: host_config[field] for field in access_fields if field in host_config}

    def _probe_machine_specs(self, machine_config: Dict[str, Any]) -> Dict[str, Any]:
        """Best-effort probe for remote machine CPU and memory totals.

        Runs a short ``python3 -c`` snippet over SSH that prints the remote
        hostname, CPU count and memory size (GiB), one value per line.

        Args:
            machine_config: Node-style config; ``ansible_host`` and
                ``ansible_user`` are required, while port, password and private
                key are optional.

        Returns:
            ``{'status': 'success', 'hostname', 'cpu_total', 'memory_gb_total',
            'last_checked_at'}`` on success, otherwise
            ``{'status': 'error', 'message': ...}``. Expected failures (missing
            ssh/sshpass binary, timeout, non-zero exit, unparsable output) are
            reported through the error dict rather than raised.
        """
        host = machine_config.get('ansible_host')
        user = machine_config.get('ansible_user')
        if not host or not user:
            return {'status': 'error', 'message': 'Missing SSH connection details'}

        ssh_cmd = ['ssh']
        # Same port rule as _build_machine_ssh_command: an unset/None port or
        # any spelling of 22 means "use the default", so no -p flag is emitted
        # (previously a None port produced "ssh -p None").
        ssh_port = machine_config.get('ansible_port')
        if ssh_port and str(ssh_port) != '22':
            ssh_cmd.extend(['-p', str(ssh_port)])

        # Password auth takes precedence over a private key when both are set.
        if 'ansible_ssh_pass' in machine_config:
            ssh_cmd = ['sshpass', '-p', machine_config['ansible_ssh_pass']] + ssh_cmd
        elif 'ansible_ssh_private_key_file' in machine_config:
            key_file = machine_config['ansible_ssh_private_key_file']
            if str(key_file).startswith('~'):
                key_file = os.path.expanduser(key_file)
            ssh_cmd.extend(['-i', str(key_file)])

        ssh_cmd.extend([
            '-o', 'StrictHostKeyChecking=no',
            '-o', 'UserKnownHostsFile=/dev/null',
            '-o', f'ConnectTimeout={self.ssh_connect_timeout}',
            f"{user}@{host}",
            (
                "python3 -c 'import os, socket; "
                "cpu = os.cpu_count() or 0; "
                "mem_gib = round((os.sysconf(\"SC_PAGE_SIZE\") * os.sysconf(\"SC_PHYS_PAGES\")) / (1024 ** 3), 1); "
                "print(socket.gethostname()); "
                "print(cpu); "
                "print(mem_gib)'"
            ),
        ])

        try:
            result = subprocess.run(
                ssh_cmd,
                capture_output=True,
                text=True,
                timeout=self.connection_timeout,
                check=False,
            )
        except FileNotFoundError as e:
            # ssh (or sshpass) is not installed on the controller.
            return {'status': 'error', 'message': str(e)}
        except subprocess.TimeoutExpired:
            return {'status': 'error', 'message': f'Probe timed out after {self.connection_timeout} seconds'}

        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip() or 'Probe failed'
            return {'status': 'error', 'message': message}

        # Expected output: hostname, cpu count, memory GiB (blank lines ignored).
        lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        if len(lines) < 3:
            return {'status': 'error', 'message': 'Incomplete machine-spec probe output'}

        try:
            cpu_total = int(lines[1])
            memory_gb_total = float(lines[2])
            if cpu_total <= 0:
                raise ValueError("invalid cpu_total")
            # Sanity bounds: reject non-positive or implausibly large memory.
            if memory_gb_total <= 0 or memory_gb_total > 16384:
                raise ValueError("invalid memory_gb_total")
            return {
                'status': 'success',
                'hostname': lines[0],
                'cpu_total': cpu_total,
                'memory_gb_total': memory_gb_total,
                'last_checked_at': datetime.now().isoformat(),
            }
        except ValueError:
            return {'status': 'error', 'message': 'Unable to parse machine-spec probe output'}

    @staticmethod
    def _default_migration_temp_dir() -> Path:
        """Default controller-side temp directory for migration artifacts (``~/.ratio1/migration_tmp``)."""
        return Path.home().joinpath('.ratio1', 'migration_tmp')

    def _probe_local_free_space(self, path: Path) -> int:
        """Return controller-side free space in bytes for the given path.

        When ``path`` exists (file or directory) it is measured directly.
        Otherwise its parent directory is created, including missing
        ancestors, and measured instead.
        """
        target = Path(path)
        if target.exists():
            # Measuring an existing path needs no mkdir; calling mkdir here
            # would raise FileExistsError when the path is a regular file.
            probe_path = target
        else:
            probe_path = target.parent
            probe_path.mkdir(parents=True, exist_ok=True)
        return shutil.disk_usage(probe_path).free

    @staticmethod
    def _build_machine_ssh_command(machine_config: Dict[str, Any], remote_command: str) -> List[str]:
        """Assemble the argv list that runs ``remote_command`` on a machine over SSH.

        The command is prefixed with ``sshpass -p <password>`` when
        ``ansible_ssh_pass`` is configured. ``-p`` is added only for a
        non-default port and ``-i`` only when a private key file is configured.

        Raises:
            ValueError: If ``ansible_host`` or ``ansible_user`` is missing/empty.
        """
        host = machine_config.get('ansible_host', '')
        user = machine_config.get('ansible_user', '')
        if not (host and user):
            raise ValueError("Machine record is missing ansible_host or ansible_user")

        argv: List[str] = []
        if 'ansible_ssh_pass' in machine_config:
            argv += ['sshpass', '-p', machine_config['ansible_ssh_pass']]
        argv.append('ssh')

        port = machine_config.get('ansible_port')
        if port and str(port) != '22':
            argv += ['-p', str(port)]

        if 'ansible_ssh_private_key_file' in machine_config:
            identity = machine_config['ansible_ssh_private_key_file']
            if str(identity).startswith('~'):
                identity = os.path.expanduser(identity)
            argv += ['-i', str(identity)]

        argv += [f"{user}@{host}", remote_command]
        return argv

    def _run_machine_probe(self, machine_config: Dict[str, Any], remote_command: str) -> Dict[str, Any]:
        """Execute a remote probe command over SSH and report the outcome as a dict.

        Returns ``{'status': 'success', 'stdout', 'stderr'}`` when the command
        exits with 0. An invalid machine record, a missing binary, a timeout or
        a non-zero exit yields ``{'status': 'error', 'message': ...}`` instead.
        """
        def failed(reason: str) -> Dict[str, Any]:
            return {'status': 'error', 'message': reason}

        try:
            argv = self._build_machine_ssh_command(machine_config, remote_command)
        except ValueError as exc:
            return failed(str(exc))

        try:
            completed = subprocess.run(
                argv,
                capture_output=True,
                text=True,
                timeout=self.connection_timeout,
                check=False,
            )
        except FileNotFoundError as exc:
            return failed(str(exc))
        except subprocess.TimeoutExpired:
            return failed(f'Probe timed out after {self.connection_timeout} seconds')

        if completed.returncode == 0:
            return {'status': 'success', 'stdout': completed.stdout, 'stderr': completed.stderr}
        return failed(completed.stderr.strip() or completed.stdout.strip() or 'Probe failed')

    @staticmethod
    def _extract_environment_from_image_tag(image: Any) -> str:
        """Map a Docker image reference to a known environment via its tag.

        The text after the last ``:`` (lower-cased) is returned when it is one
        of ``DISCOVERY_KNOWN_ENVIRONMENTS``; otherwise, or when the reference
        has no ``:`` at all, an empty string is returned.
        """
        normalized = str(image or '').strip().lower()
        _, separator, tag = normalized.rpartition(':')
        if not separator:
            return ''
        return tag if tag in DISCOVERY_KNOWN_ENVIRONMENTS else ''

    @classmethod
    def _infer_discovery_environment(cls, raw_candidate: Dict[str, Any]) -> Dict[str, str]:
        """Work out a candidate's environment, trying sources in fixed priority order.

        Order: ``metadata_app_env`` (high confidence), then the service
        environment variables ``MNL_APP_ENV`` / ``EE_ENV`` / ``APP_ENV`` (high),
        then the image tag (medium). Falls back to ``unknown`` (low).
        Only values in ``DISCOVERY_KNOWN_ENVIRONMENTS`` are accepted.
        """
        def resolved(value: str, source: str, confidence: str) -> Dict[str, str]:
            return {'value': value, 'source': source, 'confidence': confidence}

        from_metadata = str(raw_candidate.get('metadata_app_env') or '').strip().lower()
        if from_metadata in DISCOVERY_KNOWN_ENVIRONMENTS:
            return resolved(from_metadata, 'metadata', 'high')

        service_env = raw_candidate.get('environment_map') or {}
        if isinstance(service_env, dict):
            for var_name in ('MNL_APP_ENV', 'EE_ENV', 'APP_ENV'):
                declared = str(service_env.get(var_name) or '').strip().lower()
                if declared in DISCOVERY_KNOWN_ENVIRONMENTS:
                    return resolved(declared, 'service_environment', 'high')

        from_image = cls._extract_environment_from_image_tag(raw_candidate.get('image'))
        if from_image:
            return resolved(from_image, 'image_tag', 'medium')

        return resolved('unknown', 'unknown', 'low')

    @staticmethod
    def _normalize_discovery_mounts(mounts: Any) -> List[Dict[str, str]]:
        """Clean up discovered mount entries into a uniform, de-duplicated list.

        Non-list input yields an empty list. Entries that are not dicts, or
        that lack a source or destination after stripping, are dropped. The
        mount type defaults to ``'bind'``. The first occurrence of each
        ``(source, destination, type)`` triple wins and order is preserved.
        """
        if not isinstance(mounts, list):
            return []

        # Insertion-ordered dict doubles as the "seen" set and the result list.
        unique: Dict[Tuple[str, str, str], Dict[str, str]] = {}
        for entry in mounts:
            if not isinstance(entry, dict):
                continue
            src = str(entry.get('source') or '').strip()
            dst = str(entry.get('destination') or '').strip()
            kind = str(entry.get('type') or 'bind').strip() or 'bind'
            if src and dst:
                unique.setdefault(
                    (src, dst, kind),
                    {'source': src, 'destination': dst, 'type': kind},
                )
        return list(unique.values())

    @classmethod
    def _normalize_discovery_candidate(
        cls,
        machine_config: Dict[str, Any],
        raw_candidate: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Convert one raw discovery record into the stable internal candidate shape.

        String fields are stripped, states are lower-cased, and mounts are
        normalized. Live mounts take precedence over configured mounts for
        ``effective_mounts``. The environment comes from
        ``_infer_discovery_environment``. The candidate id is
        ``"<endpoint>::<service_name>"`` when both parts are available,
        otherwise just the service name.
        """
        def text(field: str) -> str:
            # Missing/None values collapse to an empty string.
            return str(raw_candidate.get(field) or '').strip()

        service_name = text('service_name')
        container_name = text('container_name')
        container_present = bool(raw_candidate.get('container_present'))

        service_state = str(raw_candidate.get('service_state') or 'unknown').strip().lower() or 'unknown'
        container_state = text('container_state').lower()
        if not container_state:
            container_state = 'running' if raw_candidate.get('container_present') else 'absent'

        endpoint = ConfigurationManager._normalize_machine_endpoint(
            machine_config.get('ansible_host'),
            machine_config.get('ansible_user', 'root'),
            machine_config.get('ansible_port', 22),
        )
        candidate_id = f"{endpoint}::{service_name}" if endpoint and service_name else service_name

        env_info = cls._infer_discovery_environment(raw_candidate)

        configured_mounts = cls._normalize_discovery_mounts(raw_candidate.get('configured_mounts'))
        live_mounts = cls._normalize_discovery_mounts(raw_candidate.get('live_mounts'))

        # Record which kinds of evidence contributed to this candidate.
        source_flags = (
            ('service_file', raw_candidate.get('service_file_path')),
            ('metadata', raw_candidate.get('metadata_app_env') or raw_candidate.get('metadata_host_path')),
            ('image', raw_candidate.get('image')),
            ('container', container_name),
        )
        discovery_sources: List[str] = [label for label, present in source_flags if present]

        return {
            'candidate_id': candidate_id,
            'machine_endpoint': endpoint,
            'service_name': service_name,
            'service_file_path': text('service_file_path'),
            'service_state': service_state,
            'container_name': container_name,
            'container_state': container_state,
            'container_present': container_present,
            'configured_mounts': configured_mounts,
            'live_mounts': live_mounts,
            'effective_mounts': live_mounts or configured_mounts,
            'metadata_host_path': text('metadata_host_path'),
            'service_file_version': text('service_file_version') or DEFAULT_SERVICE_FILE_VERSION,
            'image': text('image'),
            'environment': env_info['value'],
            'environment_source': env_info['source'],
            'environment_confidence': env_info['confidence'],
            'managed_by_r1setup': bool(raw_candidate.get('managed_by_r1setup')),
            'logical_topology_hint': 'multiple_candidates_possible',
            'discovery_sources': discovery_sources,
        }

    @staticmethod
    def _build_discovery_probe_command() -> str:
        """Return the remote command used to scan a machine for existing edge-node services.

        The returned text is a shell command that feeds an inline Python
        script to ``python3`` through a quoted heredoc. The script:

        * collects unit names containing ``edge_node`` from the
          ``/etc/systemd/system`` and ``/lib/systemd/system`` unit files and
          from ``systemctl list-unit-files``;
        * for each unit, reads ``systemctl show`` / ``systemctl cat`` /
          ``systemctl is-active`` output, parses container name, image,
          metadata path, service-file version and ``-v``/``--volume`` mounts
          out of the unit text, and loads the metadata JSON file when the
          referenced path exists;
        * when ``docker`` is found on the remote PATH, runs ``docker inspect``
          on the parsed container name to collect its state and live mounts;
        * prints a single JSON object shaped like ``{"services": [...]}``.

        The literal below is not a raw string, so every backslash intended for
        the embedded script (mostly regex escapes) is written doubled here.

        Returns:
            The complete command line, including the embedded script.
        """
        return """python3 - <<'PY'
import glob
import json
import os
import re
import shlex
import subprocess


def run(args):
    return subprocess.run(args, capture_output=True, text=True, check=False)


def parse_environment_map(raw_environment):
    env_map = {}
    try:
        for token in shlex.split(raw_environment or ''):
            if '=' not in token:
                continue
            key, value = token.split('=', 1)
            env_map[key] = value
    except Exception:
        return {}
    return env_map


def parse_configured_mounts(service_text):
    mounts = []
    seen = set()
    pattern = re.compile(r'(?:^|[\\s\\\\])(?:-v|--volume)\\s+([^\\s:]+):([^\\s\\\\;]+)')
    for match in pattern.finditer(service_text or ''):
        source = match.group(1).strip()
        destination = match.group(2).strip()
        key = (source, destination, 'bind')
        if not source or not destination or key in seen:
            continue
        seen.add(key)
        mounts.append({'source': source, 'destination': destination, 'type': 'bind'})
    return mounts


def list_service_names():
    names = set()
    for pattern in ('/etc/systemd/system/*.service', '/lib/systemd/system/*.service'):
        for path in glob.glob(pattern):
            name = os.path.basename(path)
            if 'edge_node' in name and name.endswith('.service'):
                names.add(name[:-8])

    listed = run(['systemctl', 'list-unit-files', '--type=service', '--all', '--no-legend', '--no-pager'])
    if listed.returncode == 0:
        for line in listed.stdout.splitlines():
            parts = line.split()
            if not parts:
                continue
            unit_name = parts[0].strip()
            if unit_name.endswith('.service') and 'edge_node' in unit_name:
                names.add(unit_name[:-8])

    return sorted(names)


def parse_candidate(service_name):
    service_show = run(['systemctl', 'show', service_name, '-p', 'FragmentPath', '-p', 'Environment'])
    fragment_path = ''
    environment_raw = ''
    if service_show.returncode == 0:
        for line in service_show.stdout.splitlines():
            if line.startswith('FragmentPath='):
                fragment_path = line.split('=', 1)[1].strip()
            elif line.startswith('Environment='):
                environment_raw = line.split('=', 1)[1]

    candidate_paths = [
        fragment_path,
        f'/etc/systemd/system/{service_name}.service',
        f'/lib/systemd/system/{service_name}.service',
    ]
    service_file_path = next((path for path in candidate_paths if path and os.path.exists(path)), '')

    service_cat = run(['systemctl', 'cat', service_name])
    service_text = service_cat.stdout if service_cat.returncode == 0 else ''
    service_state = run(['systemctl', 'is-active', service_name]).stdout.strip() or 'unknown'

    container_name = ''
    image = ''
    metadata_host_path = ''
    service_file_version = ''

    container_match = re.search(r'--name\\s+([^\\s\\\\]+)', service_text or '')
    if container_match:
        container_name = container_match.group(1).strip().strip("'\\\"")

    image_match = re.search(r'([A-Za-z0-9./_-]*edge_node:[A-Za-z0-9._-]+)', service_text or '')
    if image_match:
        image = image_match.group(1).strip().strip("'\\\"")

    metadata_match = re.search(r'R1SETUP_METADATA_PATH=([^\\s\\\\\\'\\"]+)', service_text or '')
    if metadata_match:
        metadata_host_path = metadata_match.group(1).strip()

    version_match = re.search(r'R1SETUP_SERVICE_FILE_VERSION=([^\\s\\n\\r\\'\\"]+)', service_text or '')
    if version_match:
        service_file_version = version_match.group(1).strip()

    metadata_app_env = ''
    managed_by_r1setup = 'R1SETUP_SERVICE_FILE_VERSION=' in service_text or 'Managed by ratio1.multi_node_launcher' in service_text
    if metadata_host_path and os.path.exists(metadata_host_path):
        try:
            with open(metadata_host_path, 'r', encoding='utf-8') as handle:
                metadata = json.load(handle)
            metadata_app_env = str(metadata.get('app_env') or '').strip()
            managed_by_r1setup = managed_by_r1setup or str(metadata.get('managed_by') or '').strip() == 'r1setup'
        except Exception:
            pass

    docker_available = run(['sh', '-lc', 'command -v docker >/dev/null 2>&1']).returncode == 0
    container_present = False
    container_state = ''
    live_mounts = []
    if docker_available and container_name:
        inspect_result = run(['docker', 'inspect', container_name])
        if inspect_result.returncode == 0:
            try:
                inspect_payload = json.loads(inspect_result.stdout or '[]')
                if inspect_payload:
                    container_present = True
                    container_state = str(((inspect_payload[0].get('State') or {}).get('Status')) or '').strip().lower()
                    for mount in inspect_payload[0].get('Mounts') or []:
                        source = str(mount.get('Source') or '').strip()
                        destination = str(mount.get('Destination') or '').strip()
                        mount_type = str(mount.get('Type') or 'bind').strip() or 'bind'
                        if source and destination:
                            live_mounts.append({'source': source, 'destination': destination, 'type': mount_type})
            except Exception:
                pass

    return {
        'service_name': service_name,
        'service_file_path': service_file_path,
        'service_state': service_state,
        'container_name': container_name,
        'container_present': container_present,
        'container_state': container_state,
        'configured_mounts': parse_configured_mounts(service_text),
        'live_mounts': live_mounts,
        'metadata_host_path': metadata_host_path,
        'metadata_app_env': metadata_app_env,
        'service_file_version': service_file_version,
        'environment_map': parse_environment_map(environment_raw),
        'image': image,
        'managed_by_r1setup': managed_by_r1setup,
    }


print(json.dumps({'services': [parse_candidate(service_name) for service_name in list_service_names()]}))
PY"""

    def discover_existing_edge_node_services(self, machine_config: Dict[str, Any]) -> Dict[str, Any]:
        """Scan a machine for existing edge-node services without mutating remote state.

        Runs the discovery probe command on the machine, parses its stdout as
        JSON and normalizes every usable entry of the ``services`` list.

        Args:
            machine_config: Machine access configuration handed to
                ``_run_machine_probe`` and ``_normalize_discovery_candidate``.

        Returns:
            The probe result unchanged when its status is not ``'success'``;
            ``{'status': 'error', 'message': ...}`` when the probe output is
            not a JSON object carrying a list of services; otherwise
            ``{'status': 'success', 'candidates': [...], 'candidate_count': n}``
            with candidates sorted by service name.
        """
        probe_result = self._run_machine_probe(machine_config, self._build_discovery_probe_command())
        if probe_result.get('status') != 'success':
            return probe_result

        try:
            payload = json.loads(probe_result.get('stdout') or '{}')
        except json.JSONDecodeError:
            return {'status': 'error', 'message': 'Discovery probe returned invalid JSON'}

        # Well-formed JSON may still be a list, number or null; none of those
        # support ``.get`` and would otherwise raise AttributeError below.
        if not isinstance(payload, dict):
            return {'status': 'error', 'message': 'Discovery probe returned an invalid payload'}

        raw_candidates = payload.get('services') or []
        if not isinstance(raw_candidates, list):
            return {'status': 'error', 'message': 'Discovery probe returned an invalid candidate list'}

        # Entries that are not dicts or have a blank service name are dropped.
        normalized_candidates = [
            self._normalize_discovery_candidate(machine_config, raw_candidate)
            for raw_candidate in raw_candidates
            if isinstance(raw_candidate, dict) and str(raw_candidate.get('service_name') or '').strip()
        ]
        normalized_candidates.sort(key=lambda item: item.get('service_name') or '')

        return {
            'status': 'success',
            'candidates': normalized_candidates,
            'candidate_count': len(normalized_candidates),
        }

    @staticmethod
    def _derive_discovery_candidate_node_status(candidate: Dict[str, Any]) -> str:
        """Translate a discovered service/container state into the CLI node-status model."""
        service_state = str(candidate.get('service_state') or 'unknown').strip().lower()
        container_state = str(candidate.get('container_state') or '').strip().lower()
        container_present = bool(candidate.get('container_present'))

        if service_state == 'active' or container_state == 'running':
            return 'running'
        if service_state in {'inactive', 'failed'} or (container_present and container_state in {'created', 'exited', 'dead'}):
            return 'stopped'
        if service_state in {'activating', 'deactivating'}:
            return 'pending_restart'
        return 'unknown'

    def _select_single_registered_machine(
        self,
        machines: Dict[str, Dict[str, Any]],
        *,
        title: str,
    ) -> Optional[str]:
        """Select one registered machine by number."""
        if not machines:
            return None

        machine_ids = list(machines.keys())
        while True:
            self.print_header(title)
            for index, machine_id in enumerate(machine_ids, start=1):
                machine_record = machines[machine_id]
                connection_display = self.config_manager._format_machine_connection_display(machine_record)
                topology_mode = str(machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
                deployment_state = str(machine_record.get('deployment_state') or DEFAULT_MACHINE_DEPLOYMENT_STATE)
                specs_summary = self.config_manager._format_machine_specs_summary(machine_record.get('machine_specs'))
                line = f"  {index}) {machine_id} | {connection_display} | mode={topology_mode} | state={deployment_state}"
                self.print_colored(line, 'white')
                if specs_summary:
                    self.print_colored(f"      specs: {specs_summary}", 'cyan')
            print()
            self.print_colored("  0) Cancel", 'white')
            print()

            choice = self.get_input("Select machine", "0").strip()
            if choice == '0':
                return None
            try:
                selection = int(choice) - 1
            except ValueError:
                self.print_colored("Please enter a number.", 'red')
                continue
            if 0 <= selection < len(machine_ids):
                return machine_ids[selection]
            self.print_colored("Invalid selection.", 'red')

    def _select_discovery_candidates(
        self,
        machine_record: Dict[str, Any],
        candidates: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Interactively select which discovered candidates to import."""
        if not candidates:
            return []

        while True:
            self.print_header("Discover Existing Services")
            self.print_colored(
                f"Machine: {self.config_manager._format_machine_display_label(machine_record.get('machine_id', ''), machine_record)}",
                'cyan',
                bold=True,
            )
            self.print_colored(
                self.config_manager._format_machine_connection_display(machine_record),
                'white',
            )
            print()
            self.print_colored("Discovered candidates:", 'cyan', bold=True)
            for index, candidate in enumerate(candidates, start=1):
                mount_summary = ''
                effective_mounts = candidate.get('effective_mounts') or []
                if effective_mounts:
                    mount_summary = f" | mount={effective_mounts[0].get('source')} -> {effective_mounts[0].get('destination')}"
                self.print_colored(
                    (
                        f"  {index}) {candidate['service_name']} | state={candidate['service_state']} | "
                        f"env={candidate['environment']} ({candidate['environment_source']}) | "
                        f"container={candidate.get('container_name') or '-'}{mount_summary}"
                    ),
                    'white',
                )
            print()
            self.print_colored("Enter numbers separated by commas, 'all', or press Enter to cancel.", 'white')
            raw_choice = self.get_input("Import which services", "").strip().lower()
            if not raw_choice:
                return []
            if raw_choice == 'all':
                return list(candidates)

            selected_indexes: List[int] = []
            for token in raw_choice.replace(',', ' ').split():
                try:
                    selected_indexes.append(int(token))
                except ValueError:
                    selected_indexes = []
                    break

            if not selected_indexes:
                self.print_colored("Invalid selection.", 'red')
                continue

            selected_candidates: List[Dict[str, Any]] = []
            valid = True
            for index in selected_indexes:
                if not 1 <= index <= len(candidates):
                    valid = False
                    break
                candidate = candidates[index - 1]
                if candidate not in selected_candidates:
                    selected_candidates.append(candidate)

            if valid and selected_candidates:
                return selected_candidates
            self.print_colored("Invalid selection.", 'red')

    def _prompt_discovery_import_name(
        self,
        candidate: Dict[str, Any],
        existing_hosts: Dict[str, Dict[str, Any]],
        machine_id: str = '',
    ) -> Optional[str]:
        """Prompt for the config-local logical name of one imported discovered candidate."""
        # Default to machine name; fall back to service name if machine_id conflicts
        suggested_name = machine_id or self.config_manager._sanitize_runtime_suffix(
            candidate.get('service_name') or 'imported_node',
        )
        if suggested_name in existing_hosts:
            suffix = 2
            base_name = suggested_name
            while f"{base_name}_{suffix}" in existing_hosts:
                suffix += 1
            suggested_name = f"{base_name}_{suffix}"

        self.print_colored(
            f"  Node alias for service '{candidate['service_name']}' (this is only a local alias in r1setup):",
            'cyan',
        )
        while True:
            chosen_name = self._get_valid_hostname(
                f"Node alias",
                suggested_name,
            )
            if chosen_name in existing_hosts:
                self.print_colored(f"Node '{chosen_name}' already exists in the current config.", 'red')
                continue
            return chosen_name

    def import_discovery_candidates(
        self,
        machine_id: str,
        selected_candidates: List[Dict[str, Any]],
        logical_name_map: Dict[str, str],
    ) -> Dict[str, Any]:
        """Import selected discovery candidates into the current config with preserved runtime identities.

        Args:
            machine_id: Id of a machine registered in the current fleet state.
            selected_candidates: Discovery candidates to import; each is read
                for ``service_name``, ``container_name``, mounts, metadata
                path, service-file version and environment fields.
            logical_name_map: Maps each candidate's ``service_name`` to the
                host name it should receive in the inventory.

        Returns:
            ``{'status': 'success', 'imported_names': [...], 'topology_mode': ...}``
            after the configuration has been saved, or
            ``{'status': 'error', 'message': ...}`` when nothing was selected,
            the machine is unknown, a logical name is missing or already
            used, or a runtime collision is detected.
        """
        if not selected_candidates:
            return {'status': 'error', 'message': 'No discovery candidates were selected.'}

        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        fleet_state = self.config_manager._normalize_fleet_state(self.get_fleet_state_copy())
        machines = fleet_state.get('fleet', {}).get('machines', {})
        machine_record = dict(machines.get(machine_id, {}))
        if not machine_record:
            return {'status': 'error', 'message': f"Machine '{machine_id}' is not registered in the current config."}

        existing_instances = list(machine_record.get('instance_names') or [])
        final_topology_mode = str(machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
        # A machine that would end up tracking more than one instance is switched to expert mode.
        if final_topology_mode != 'expert' and (len(existing_instances) + len(selected_candidates)) > 1:
            final_topology_mode = 'expert'
            if existing_instances:
                self.config_manager.promote_machine_to_expert(machine_id, self.inventory)
            else:
                # No instances yet: only the machine record's topology flag is flipped in fleet state.
                fleet_state = self.config_manager._normalize_fleet_state(self.get_fleet_state_copy())
                machines = fleet_state.get('fleet', {}).get('machines', {})
                machine_record = dict(machines.get(machine_id, {}))
                machine_record['topology_mode'] = 'expert'
                machines[machine_id] = machine_record
                self.config_manager.fleet_state = fleet_state

        imported_names: List[str] = []
        # Snapshot used for collision checks; refreshed after each recorded import below.
        working_fleet_state = self.config_manager._normalize_fleet_state(self.get_fleet_state_copy())
        # NOTE(review): the error returns inside this loop happen after earlier
        # candidates were already added to `hosts` and recorded via
        # config_manager, and before _save_configuration() runs — confirm that
        # callers discard or reload in-memory state after a failed import.
        for candidate in selected_candidates:
            service_name = str(candidate.get('service_name') or '').strip()
            host_name = str(logical_name_map.get(service_name) or '').strip()
            if not host_name:
                return {'status': 'error', 'message': f"Missing logical name for discovered service '{service_name}'."}
            if host_name in hosts:
                return {'status': 'error', 'message': f"Node '{host_name}' already exists in the current config."}

            # Runtime identity as found on the machine; the volume path is the
            # source of the first effective (else configured) mount, with the
            # default used only when that mount has no 'source' key.
            runtime = {
                'service_name': service_name,
                'container_name': candidate.get('container_name') or service_name,
                'volume_path': (candidate.get('effective_mounts') or candidate.get('configured_mounts') or [{}])[0].get(
                    'source',
                    DEFAULT_RUNTIME_VOLUME_PATH,
                ),
                'metadata_path': candidate.get('metadata_host_path') or '',
                'exit_status_path': f"/tmp/{candidate.get('container_name') or service_name}.exit",
            }
            collisions = self.config_manager.detect_runtime_collisions(machine_id, runtime, working_fleet_state)
            if collisions:
                collision_details = ', '.join(
                    f"{field} (used by {', '.join(owners)})"
                    for field, owners in collisions.items()
                )
                return {
                    'status': 'error',
                    'message': (
                        f"Discovered service '{service_name}' collides with already-tracked instance(s) on machine "
                        f"'{machine_id}': {collision_details}. "
                        f"If this service is already managed under a different name, skip importing it."
                    ),
                }

            # Start from the machine's access settings, then layer the preserved runtime identity on top.
            host_config = self._extract_machine_access_config(machine_record)
            host_config.update({
                'r1setup_machine_id': machine_id,
                'r1setup_topology_mode': final_topology_mode,
                'r1setup_machine_deployment_state': 'active',
                'r1setup_runtime_name_policy': 'preserve',
                'r1setup_instance_logical_name': host_name,
                'edge_node_service_name': runtime['service_name'],
                'mnl_docker_container_name': runtime['container_name'],
                'mnl_docker_volume_path': runtime['volume_path'],
                'mnl_r1setup_metadata_host_path': runtime['metadata_path'] or f"{runtime['volume_path']}/_data/r1setup/metadata.json",
                'r1setup_runtime_exit_status_path': runtime['exit_status_path'],
                'node_status': self._derive_discovery_candidate_node_status(candidate),
                'last_status_update': datetime.now().isoformat(),
                SERVICE_FILE_VERSION_FIELD: candidate.get('service_file_version') or DEFAULT_SERVICE_FILE_VERSION,
                'imported_from_discovery': True,
                'r1setup_discovery_environment': candidate.get('environment') or 'unknown',
                'r1setup_discovery_environment_source': candidate.get('environment_source') or 'unknown',
            })
            self.config_manager.apply_runtime_snapshot_to_host_config(host_name, host_config)
            hosts[host_name] = host_config
            imported_names.append(host_name)

            # The recorded runtime is read back from host_config, so it reflects
            # anything apply_runtime_snapshot_to_host_config changed in place.
            self.config_manager.record_imported_discovery_instance(
                host_name,
                machine_id=machine_id,
                runtime={
                    'service_name': host_config['edge_node_service_name'],
                    'container_name': host_config['mnl_docker_container_name'],
                    'volume_path': host_config['mnl_docker_volume_path'],
                    'metadata_path': host_config['mnl_r1setup_metadata_host_path'],
                    'exit_status_path': host_config['r1setup_runtime_exit_status_path'],
                },
                status=host_config['node_status'],
                service_file_version=host_config[SERVICE_FILE_VERSION_FIELD],
                environment=str(candidate.get('environment') or 'unknown'),
                environment_source=str(candidate.get('environment_source') or 'unknown'),
            )
            # Refresh the snapshot so the next candidate is checked against this import too.
            working_fleet_state = self.config_manager._normalize_fleet_state(self.get_fleet_state_copy())

        # Stamp the final topology/deployment state on the machine record and persist everything.
        updated_fleet_state = self.config_manager._normalize_fleet_state(self.get_fleet_state_copy())
        updated_machine_record = dict(updated_fleet_state.get('fleet', {}).get('machines', {}).get(machine_id, {}))
        updated_machine_record['topology_mode'] = final_topology_mode
        updated_machine_record['deployment_state'] = 'active'
        updated_fleet_state['fleet']['machines'][machine_id] = updated_machine_record
        self.config_manager.fleet_state = updated_fleet_state
        self._save_configuration()

        return {
            'status': 'success',
            'imported_names': imported_names,
            'topology_mode': final_topology_mode,
        }

    def discover_and_import_existing_services(self, preselected_machine_id: Optional[str] = None) -> None:
        """Scan registered machines for edge-node services and import the ones the operator picks.

        When ``preselected_machine_id`` names a registered machine, only that
        machine is scanned; otherwise the operator chooses the machines. Scan
        results are persisted, summarized, and then each machine with
        discovered services goes through the interactive import flow.
        """
        if not self._ensure_configuration_shell_for_machine_registration():
            return

        self.load_configuration()
        registered = dict(self.get_fleet_state_copy().get('fleet', {}).get('machines', {}))
        if not registered:
            self.print_colored("No registered machines available. Register a machine first.", 'yellow')
            self.wait_for_enter()
            return

        # Resolve the scan targets: honour a valid preselection, else ask.
        targets: List[str] = []
        if preselected_machine_id:
            requested = str(preselected_machine_id).strip()
            if requested in registered:
                targets = [requested]
        if not targets:
            targets = self._select_machines_for_discovery(registered)
            if not targets:
                return

        # Scan every target and store the outcome before reporting it.
        self.print_section("Scanning Machines")
        scan_results = self.config_manager._batch_discover_machines(targets)
        self.config_manager._persist_batch_discovery_results(scan_results)

        outcome = self.config_manager._classify_scan_results(scan_results)
        self.print_section("Discovery Summary")
        if outcome['clean']:
            self.print_colored(f"  \u2713 Clean: {', '.join(outcome['clean'])}", 'green')
        if outcome['discovered']:
            for found_id in outcome['discovered']:
                found_count = len(scan_results[found_id]['candidates'])
                self.print_colored(f"  \U0001f4e1 Found services: {found_id} ({found_count} service(s))", 'cyan')
        if outcome['failed']:
            self.print_colored(f"  \u2717 Failed: {', '.join(outcome['failed'])}", 'red')

        if not outcome['discovered']:
            self.print_colored("No existing edge-node services were discovered.", 'yellow')
            self.wait_for_enter()
            return

        # Walk the operator through each machine that reported services.
        for machine_id in outcome['discovered']:
            found_candidates = list(scan_results[machine_id].get('candidates') or [])
            machine_record = dict(registered.get(machine_id, {}))
            if found_candidates:
                self.print_section(f"Import Services from {machine_id}")
                self._import_discovered_candidates_for_machine(machine_id, machine_record, found_candidates)

        self.wait_for_enter()

    def _select_machines_for_discovery(self, machines: Dict[str, Dict[str, Any]]) -> List[str]:
        """Show machine list and let user select one, multiple, or all for discovery."""
        machine_ids = list(machines.keys())
        self.print_header("Select Machines For Discovery")

        for index, machine_id in enumerate(machine_ids, start=1):
            machine_record = machines[machine_id]
            connection_display = self.config_manager._format_machine_connection_display(machine_record)
            topology_mode = str(machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
            deployment_state = str(machine_record.get('deployment_state') or DEFAULT_MACHINE_DEPLOYMENT_STATE)
            specs_summary = self.config_manager._format_machine_specs_summary(machine_record.get('machine_specs'))
            line = f"  {index}) {machine_id} | {connection_display} | mode={topology_mode} | state={deployment_state}"
            self.print_colored(line, 'white')
            if specs_summary:
                self.print_colored(f"      specs: {specs_summary}", 'cyan')

        print()
        self.print_colored("  a) All machines", 'cyan')
        self.print_colored("  0) Cancel", 'white')
        print()

        while True:
            choice = self.get_input(
                f"Select machines (e.g. 1,3,5 or 'a' for all)", "a",
            ).strip().lower()

            if choice == '0':
                return []
            if choice == 'a':
                return list(machine_ids)

            # Parse comma-separated numbers
            try:
                indices = [int(x.strip()) for x in choice.split(',')]
                selected = []
                for idx in indices:
                    if 1 <= idx <= len(machine_ids):
                        mid = machine_ids[idx - 1]
                        if mid not in selected:
                            selected.append(mid)
                    else:
                        self.print_colored(f"  Invalid number: {idx}. Valid range is 1-{len(machine_ids)}.", 'red')
                        selected = []
                        break
                if selected:
                    return selected
            except ValueError:
                self.print_colored("  Enter comma-separated numbers, 'a' for all, or '0' to cancel.", 'red')

    def _import_discovered_candidates_for_machine(
        self,
        machine_id: str,
        machine_record: Dict[str, Any],
        candidates: List[Dict[str, Any]],
    ) -> None:
        """Run the interactive import flow for one machine's discovered candidates.

        Steps, each of which can end the flow with a "Skipped import" notice:

        1. Let the operator choose which candidates to import.
        2. Ask for confirmation when a chosen candidate's environment differs
           from the current config environment.
        3. Ask for confirmation when a chosen service is already claimed by
           another config.
        4. Ask for confirmation when the import would require promoting the
           machine to expert mode.
        5. Prompt for a local alias per candidate, then import.

        Args:
            machine_id: Id of the machine the candidates were discovered on.
            machine_record: Fleet record of that machine.
            candidates: Normalized discovery candidates for the machine.
        """
        selected_candidates = self._select_discovery_candidates(machine_record, candidates)
        if not selected_candidates:
            self.print_colored(f"  Skipped import for {machine_id}.", 'yellow')
            return

        current_env = str(self.get_mnl_app_env() or self.active_config.get('environment') or '').strip().lower()
        # Blank and 'unknown' environments are never treated as a mismatch.
        mismatch_candidates = [
            candidate for candidate in selected_candidates
            if candidate.get('environment') not in ('', 'unknown', current_env)
        ]
        if mismatch_candidates:
            mismatch_services = ", ".join(sorted(candidate['service_name'] for candidate in mismatch_candidates))
            proceed = self.get_input(
                (
                    f"Selected services ({mismatch_services}) do not match the current config environment "
                    f"'{current_env or 'unknown'}'. Continue importing them? (y/n)"
                ),
                "n",
            ).lower() == 'y'
            if not proceed:
                self.print_colored(f"  Skipped import for {machine_id}.", 'yellow')
                return

        # Warn about services that another config already tracks.
        duplicate_claims: List[str] = []
        current_config_name = self.active_config.get('config_name')
        for candidate in selected_candidates:
            claims = self.config_manager.find_runtime_identity_claims(
                machine_record,
                candidate['service_name'],
                exclude_config_name=current_config_name,
            )
            for claim in claims:
                duplicate_claims.append(
                    f"{candidate['service_name']} already tracked by config '{claim['config_name']}' as '{claim['instance_name']}'"
                )
        if duplicate_claims:
            self.print_colored("Cross-config tracking warnings:", 'yellow', bold=True)
            for line in duplicate_claims:
                self.print_colored(f"  - {line}", 'yellow')
            proceed = self.get_input("Continue importing into the current config anyway? (y/n)", "n").lower() == 'y'
            if not proceed:
                self.print_colored(f"  Skipped import for {machine_id}.", 'yellow')
                return

        # A non-expert machine that would track more than one instance needs promotion.
        existing_instances = list(machine_record.get('instance_names') or [])
        current_topology_mode = str(machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)
        resulting_instance_count = len(existing_instances) + len(selected_candidates)
        if current_topology_mode != 'expert' and resulting_instance_count > 1:
            self.print_colored(
                (
                    f"Importing {len(selected_candidates)} service(s) onto machine '{machine_id}' will require expert mode "
                    f"because the machine would track {resulting_instance_count} instances."
                ),
                'yellow',
            )
            proceed = self.get_input("Promote this machine to expert mode and continue? (y/n)", "n").lower() == 'y'
            if not proceed:
                self.print_colored(f"  Skipped import for {machine_id}.", 'yellow')
                return

        hosts = _get_gpu_hosts(self.inventory)
        logical_name_map: Dict[str, str] = {}
        try:
            for candidate in selected_candidates:
                logical_name = self._prompt_discovery_import_name(candidate, hosts, machine_id=machine_id)
                if not logical_name:
                    self.print_colored(f"  Skipped import for {machine_id}.", 'yellow')
                    return
                logical_name_map[candidate['service_name']] = logical_name
                hosts[logical_name] = {}  # temporary reservation against duplicate prompts
        finally:
            # Always drop the empty placeholders again - also on the early
            # return above or when a prompt is interrupted - so they do not
            # stay behind in `hosts` (assigned to, so presumably the inventory's
            # own mapping).
            for logical_name in logical_name_map.values():
                if logical_name in hosts and not hosts[logical_name]:
                    del hosts[logical_name]

        result = self.import_discovery_candidates(machine_id, selected_candidates, logical_name_map)
        if result.get('status') != 'success':
            self.print_colored(f"  Import failed for {machine_id}: {result.get('message', 'unknown error')}", 'red')
            return

        self.print_colored(
            f"  Imported {len(result.get('imported_names') or [])} service(s) from {machine_id}.",
            'green',
        )
        self.print_colored(
            f"  Machine '{machine_id}' now operates in {result.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE)} mode.",
            'cyan',
        )

    def _ensure_configuration_shell_for_machine_registration(self) -> bool:
        """Ensure an active configuration exists before persisting machine-only fleet data."""
        if self.active_config.get('config_name'):
            return True

        self.print_header("Create Fleet Configuration")
        self.print_colored("A configuration is required before you can register machines.", 'yellow')
        self.print_colored("This can be an empty fleet configuration with no deployed nodes yet.", 'white')
        print()

        while True:
            custom_name = self.get_input("Enter configuration name (letters, numbers, -, _)", required=True)
            if re.match(r'^[a-zA-Z0-9_-]+$', custom_name):
                break
            self.print_colored("Invalid name. Use only letters, numbers, hyphens (-), and underscores (_)", 'red')

        env = self._select_network_environment()
        self.set_mnl_app_env(env)
        config_name = self._generate_config_name(0, custom_name)
        self.ensure_configuration_shell(config_name, env)
        self.print_colored(f"Configuration '{config_name}' created and activated.", 'green')
        return True

    def register_machine_without_deployment(self) -> None:
        """Register one machine in fleet metadata without creating an Edge Node instance.

        After a successful registration the operator is offered an immediate
        discovery scan of the new machine; declining simply waits for Enter.
        """
        if not self._ensure_configuration_shell_for_machine_registration():
            return

        self.print_header("Register Machine")
        self.load_configuration()

        self.print_colored("This flow registers a machine in the fleet without deploying an Edge Node.", 'cyan')
        self.print_colored("You can prepare it now and assign or migrate an instance later.", 'white')
        print()

        new_machine_ids = self.config_manager._collect_machine_registration_entries(1)
        if not new_machine_ids:
            self.wait_for_enter()
            return

        machine_id = new_machine_ids[0]
        self.print_colored(f"Machine '{machine_id}' registered successfully.", 'green')

        answer = self.get_input("Discover existing edge-node services on this machine now? (y/n)", "n")
        if answer.lower() == 'y':
            # The discovery flow does its own final wait, so none is added here.
            self.discover_and_import_existing_services(machine_id)
        else:
            self.wait_for_enter()

    def fleet_summary(self) -> None:
        """Print the machines known to the active configuration together with their grouping.

        Shows an error when no configuration is active and a notice when no
        machines are registered; every path ends by waiting for Enter.
        """
        has_active_config = bool(self.active_config.get('config_name'))
        if not has_active_config:
            self.print_colored("No active configuration. Create or load one first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("Fleet Summary")
        self.load_configuration()
        fleet_state = self.get_fleet_state_copy()

        if fleet_state['fleet']['machines']:
            machine_views = self.build_machine_group_views(fleet_state=fleet_state)
            self.print_section(f"Machines ({len(machine_views)})")
            # The second element returned by the line builder is not needed here.
            display_lines, _ = self._build_machine_group_display_lines(machine_views)
            self._print_machine_group_display_lines(display_lines)
        else:
            self.print_colored("No machines registered yet.", 'yellow')

        self.wait_for_enter()

    def _get_tracked_live_node_entries(
        self,
        inventory: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, str]]:
        """List configured hosts whose stored ``node_status`` indicates a tracked runtime.

        Args:
            inventory: Inventory to inspect; ``self.inventory`` is used when None.

        Returns:
            One ``{'host_name', 'status'}`` dict per host whose status is one of
            running, stopped, pending_restart, error or unreachable, in
            inventory order. A missing or empty status counts as 'unknown' and
            is therefore excluded.
        """
        live_statuses = frozenset(
            ('running', 'stopped', 'pending_restart', 'error', 'unreachable')
        )
        source_inventory = self.inventory if inventory is None else inventory

        entries = []
        for name, settings in _get_gpu_hosts(source_inventory).items():
            node_status = str(settings.get('node_status') or 'unknown')
            if node_status in live_statuses:
                entries.append({'host_name': name, 'status': node_status})
        return entries

    def _get_deployment_display_state(
        self,
        metadata: Optional[Dict[str, Any]] = None,
        inventory: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Build the operator-facing deployment summary for a configuration.

        Args:
            metadata: Config metadata to summarise; ``self.active_config`` when None.
            inventory: Inventory used for the per-host variant rollup and the
                live-node lookup; ``self.inventory`` when None.

        Returns:
            A dict with the keys state_key, color, config_summary, status_line,
            status_note, main_menu_text, deployment_line, suggested_action and
            deployment_menu_default. ``state_key`` is one of 'deployed',
            'deleted', 'tracking_live_nodes' or 'never_deployed'; any other
            stored status falls through to 'never_deployed'.
        """
        def build_state(state_key, color, summary, status_line, menu_text,
                        deployment_line, menu_default,
                        status_note=None, suggested_action=None):
            # Every state exposes the same keys, emitted in the same order.
            return {
                'state_key': state_key,
                'color': color,
                'config_summary': summary,
                'status_line': status_line,
                'status_note': status_note,
                'main_menu_text': menu_text,
                'deployment_line': deployment_line,
                'suggested_action': suggested_action,
                'deployment_menu_default': menu_default,
            }

        if metadata is None:
            metadata = self.active_config

        deployment_status = str(metadata.get('deployment_status') or 'never_deployed')
        last_deployed_date = metadata.get('last_deployed_date')
        last_deployed_network = metadata.get('last_deployed_network')
        last_deleted_date = metadata.get('last_deleted_date')
        # The per-host variant rollup stands in for the retired fleet-level
        # last_deployment_type field.
        variant_summary = self.config_manager.install_variant_summary(
            self.inventory if inventory is None else inventory
        )

        if deployment_status == 'deployed':
            deployed_on = _parse_iso_datetime(last_deployed_date) if last_deployed_date else None
            if deployed_on:
                summary = f"🚀 Last deployed: {deployed_on}"
                menu_text = f"🚀 deployed {deployed_on}"
                status_line = f"🚀 Status: Deployed on {deployed_on}"
                # Network (and variant) details are appended only when a
                # deploy date is available.
                if last_deployed_network:
                    show_variants = variant_summary and variant_summary != 'no installs yet'
                    closing = f", {variant_summary})" if show_variants else ")"
                    summary += f" ({last_deployed_network}"
                    summary += closing
                    menu_text += f" ({last_deployed_network})"
            else:
                summary = "🚀 Deployed"
                menu_text = "✓ deployed"
                status_line = "🚀 Status: Deployed"
            return build_state(
                'deployed', 'green', summary, status_line, menu_text,
                f"Deployment: {summary}", '0',
            )

        if deployment_status == 'deleted':
            deleted_on = _parse_iso_datetime(last_deleted_date) if last_deleted_date else None
            if deleted_on:
                summary = f"🗑️ Last deleted: {deleted_on}"
                status_line = f"🗑️ Status: Deleted on {deleted_on}"
            else:
                summary = "🗑️ Deleted"
                status_line = "🗑️ Status: Deleted"
            return build_state(
                'deleted', 'red', summary, status_line, "🗑️ deleted",
                f"Deployment: {summary}", '2',
            )

        live_entries = self._get_tracked_live_node_entries(inventory=inventory)
        if deployment_status == 'never_deployed' and live_entries:
            live_label = f"{len(live_entries)} live node(s)"
            return build_state(
                'tracking_live_nodes',
                'yellow',
                f"📡 Tracking {live_label}",
                f"📡 Status: Tracking {live_label}",
                f"📡 tracking {live_label}",
                f"Deployment: 📡 Tracking {live_label}",
                '0',
                status_note=(
                    "This configuration is actively tracking live runtimes. They may have been "
                    "imported from discovery or moved via migration, so a local deploy record may "
                    "not exist yet."
                ),
                suggested_action=(
                    '4',
                    '\U0001f4a1 Suggested: Review fleet status before deploying changes',
                ),
            )

        return build_state(
            'never_deployed',
            'yellow',
            "📋 Never deployed",
            "📋 Status: Never deployed",
            "✗ not deployed",
            "Deployment: ✗ Never deployed",
            '2',
        )

    def _get_suggested_action(self):
        """Choose the menu option to suggest next, as ``(default_option, hint_text)``.

        Checks are made in this order: missing host configuration, a suggestion
        carried by the deployment display state, a never-deployed or deleted
        configuration, outdated service files, nodes in error. Returns
        ``('', '')`` when nothing applies.
        """
        if not self.check_hosts_config():
            registered = self.get_fleet_state_copy().get('fleet', {}).get('machines', {})
            if registered:
                return '2', '\U0001f4a1 Suggested: Prepare or review your registered machines'
            return '1', '\U0001f4a1 Suggested: Create or load a configuration first'

        display_suggestion = self._get_deployment_display_state().get('suggested_action')
        if display_suggestion:
            return display_suggestion

        stored_status = self.active_config.get('deployment_status', 'never_deployed')
        if stored_status in ('never_deployed', 'deleted'):
            return '2', '\U0001f4a1 Suggested: Deploy your configured instances'

        hosts = _get_gpu_hosts(self.inventory)
        _target, eligible, outdated, _current, _skipped = self._get_service_update_inventory(hosts)
        if eligible and outdated:
            return '3', f'\U0001f4a1 Suggested: Update service file on {len(outdated)} node(s)'

        # This count reads the status stored on the host entry directly.
        failing = len([cfg for cfg in hosts.values() if cfg.get('node_status') == 'error'])
        if failing > 0:
            return '4', f'\U0001f4a1 Suggested: Check {failing} node(s) with errors'
        return '', ''

    def show_main_menu(self) -> None:
        """Render the main menu: compact status banner, option list, contextual hint."""
        # Refresh the active config first so the deployment status shown is current.
        self._load_active_config()

        self.print_header("Ratio1 Multi-Node Launcher Setup")

        has_config = self.check_hosts_config()
        has_shell = self.has_active_config_shell()
        if has_config or has_shell:
            self.load_configuration()
        current_env = self.get_mnl_app_env() or 'not set'
        active_config_name = self.active_config.get('config_name', 'None')

        # Banner line 1: config | network | deployment
        deploy_str = self._get_deployment_display_state()['main_menu_text']
        config_label = 'none' if active_config_name == 'None' else active_config_name
        self.print_colored(f"  {config_label} | {current_env} | {deploy_str}", 'cyan')

        # Banner line 2: per-status node counts, shown only when hosts exist.
        if has_config:
            hosts = _get_gpu_hosts(self.inventory)
            if hosts:
                tally = {}
                update_stamps = []
                for node_name in hosts:
                    info = self._get_node_status_info(node_name)
                    node_status = info['status']
                    tally[node_status] = tally.get(node_status, 0) + 1
                    stamp = info.get('last_update', '')
                    if stamp:
                        update_stamps.append(stamp)

                # The age shown is that of the least recently refreshed node.
                oldest = min(update_stamps) if update_stamps else None

                fragments = []
                for node_status, count in tally.items():
                    emoji, _color, description = self._get_status_display_info(node_status)
                    fragments.append(f"{emoji}{count} {description.lower()}")

                age_str = self._format_timestamp_ago(oldest) if oldest else "never checked"
                self.print_colored(
                    f"  Nodes ({len(hosts)}): {', '.join(fragments)} ({age_str})",
                    'cyan'
                )
        elif has_shell:
            # Configuration shell without hosts: report registered machines only.
            machines = self.get_fleet_state_copy().get('fleet', {}).get('machines', {})
            if machines:
                self.print_colored(
                    f"  Machines: {len(machines)} registered, 0 instances", 'cyan',
                )

        self.print_section("R1Setup Main Menu")

        # None -> blank line, ('heading', text) -> bold cyan title,
        # ('item', text) -> plain menu row.
        layout = (
            None,
            ('heading', "\U0001f4cb CONFIGURATION"),
            ('item', "  1) Configuration Menu      - Node setup, environments, and management"),
            None,
            ('heading', "\U0001f680 DEPLOYMENT"),
            ('item', "  2) Deployment Menu         - Deploy, delete, and manage deployments"),
            None,
            ('heading', "\U0001f527 OPERATIONS"),
            ('item', "  3) Operations Menu         - Start, stop, restart service and test connectivity"),
            None,
            ('heading', "\U0001f4ca MONITORING & INFO"),
            ('item', "  4) Node Status & Info      - Get latest info and show detailed status"),
            ('item', "  5) Node Addresses & Export - Display addresses and export to CSV"),
            None,
            ('heading', "\u2699\ufe0f  SETTINGS & TOOLS"),
            ('item', "  6) Settings                - Toggle live status display and preferences"),
            ('item', "  7) Advanced Menu           - SSH, logs, and security tools"),
            None,
            ('item', "  0) Exit"),
            None,
        )
        for entry in layout:
            if entry is None:
                print()
                continue
            kind, text = entry
            if kind == 'heading':
                self.print_colored(text, 'cyan', bold=True)
            else:
                self.print_colored(text)

        # Contextual suggestion; only the hint text is used here.
        _default, hint = self._get_suggested_action()
        if hint:
            self.print_colored(hint, 'cyan')
            print()

    def configuration_menu(self) -> None:
        """Run the configuration submenu loop until the operator selects 0."""
        # Choice -> handler method name; the method is looked up only when chosen.
        handlers = {
            '1': 'configure_nodes_menu',
            '2': 'manage_configurations_menu',
            '3': 'view_configuration',
            '4': 'switch_environment',
            '5': 'register_machine_without_deployment',
            '6': 'fleet_summary',
            '7': 'discover_and_import_existing_services',
        }
        menu_rows = (
            "  1) Configure Nodes        - Instance and node management",
            "  2) Manage Configurations  - Switch, backup, restore configurations",
            "  3) View Configuration     - Display current configuration",
            "  4) Switch Environment     - Change network environment (mainnet/testnet/devnet)",
            "  5) Register Machine       - Add a fleet machine without deploying a node",
            "  6) Fleet Summary          - Show machines and current assignments",
            "  7) Discover Services      - Scan a machine and import selected existing services",
        )

        while True:
            self.print_header("Configuration Menu")

            # Gather the facts for the status panel.
            has_config = self.check_hosts_config()
            has_shell = self.has_active_config_shell()
            current_env = self.get_mnl_app_env()
            active_config_name = self.active_config.get('config_name', 'None')
            is_configured = has_config or has_shell

            self.print_section("Current Status")
            if active_config_name != 'None':
                self.print_colored(f"Active Config: {active_config_name}", 'green')
            else:
                self.print_colored("Active Config: ✗ No active config", 'red')

            if is_configured:
                self.print_colored("Configuration: ✓ Configured", 'green')
            else:
                self.print_colored("Configuration: ✗ Not configured", 'red')

            if current_env:
                self.print_colored(f"Network: {current_env}", 'green')
            else:
                self.print_colored("Network: ✗ Not set", 'red')

            # Node and machine counts are shown only once something is configured.
            if is_configured:
                self.load_configuration()
                hosts = _get_gpu_hosts(self.inventory)
                if hosts:
                    self.print_colored(f"Nodes: {len(hosts)} configured", 'cyan')
                machines = self.get_fleet_state_copy().get('fleet', {}).get('machines', {})
                if machines:
                    self.print_colored(f"Machines: {len(machines)} registered", 'cyan')

            self.print_colored("Configuration Menu", 'cyan', bold=True)
            print()
            for row in menu_rows:
                self.print_colored(row)
            print()
            self.print_colored("  0) Back to Main Menu")
            print()

            choice = self.get_input("Select option", "0")
            if choice == '0':
                break

            handler_name = handlers.get(choice)
            if handler_name is None:
                self.print_colored("Invalid option. Valid choices are 0-7.", 'red')
                self.wait_for_enter()
            else:
                getattr(self, handler_name)()

    def configure_nodes_menu(self) -> None:
        """Show the node configuration submenu and dispatch the selected action.

        The configuration is reloaded on every pass. With no hosts configured
        only "Create New Configuration" (1) and "Back" (0) are offered; once
        hosts exist the add/edit/remove/create options (1-4) are shown. The
        invalid-option message names the range that was actually displayed.
        """
        while True:
            self.print_header("Node Configuration")

            # Reload so the listing reflects changes made by the previous action.
            self.load_configuration()
            hosts = _get_gpu_hosts(self.inventory)

            if hosts:
                self.print_section(f"Current Nodes ({len(hosts)})")
                for i, (name, config) in enumerate(hosts.items(), 1):
                    ip = config.get('ansible_host', 'Unknown')
                    user = config.get('ansible_user', 'Unknown')

                    # Stored status and when it was last refreshed.
                    status_info = self._get_node_status_info(name)
                    status = status_info['status']
                    status_emoji, status_color, status_desc = self._get_status_display_info(status)

                    last_update = status_info.get('last_update', '')
                    age_str = self._format_timestamp_ago(last_update) if last_update else "Never"

                    # One output line per node, printed in three coloured segments.
                    self.print_colored(f"  {i}. {name} ({user}@{ip}) ", 'white', end='')
                    self.print_colored(f"[{status_emoji} {status_desc}]", status_color, end='')
                    self.print_colored(f" ({age_str})", 'white')
                print()

            self.print_colored("Configure Nodes Menu", 'cyan', bold=True)
            print()
            if not hosts:
                self.print_colored("  1) Create New Configuration        - Register machines and set up instances")
            else:
                self.print_colored("  1) Add New Node                   - Add node to existing config")
                self.print_colored("  2) Edit Existing Node             - Modify node settings")
                self.print_colored("  3) Remove Node                    - Delete node from config")
                self.print_colored("  4) Create New Configuration       - Start fresh configuration")
            print()
            self.print_colored("  0) Back to Main Menu")
            print()

            choice = self.get_input("Select option", "0")

            if choice == '0':
                break
            elif choice == '1':
                # Option 1 changes meaning depending on whether any node exists.
                if not hosts:
                    self._create_machine_first_configuration()
                else:
                    self._add_node()
            elif choice == '2' and hosts:
                self._update_node()
            elif choice == '3' and hosts:
                self._delete_node()
            elif choice == '4' and hosts:
                self._create_new_configuration()
            else:
                # Options 2-4 are not offered while no nodes exist, so report
                # the range that was actually shown.
                valid_choices = "0-4" if hosts else "0-1"
                self.print_colored(f"Invalid option. Valid choices are {valid_choices}.", 'red')
                self.wait_for_enter()

    def deployment_menu(self) -> None:
        """Run the deployment submenu loop until the operator selects 0."""
        # Numbered choice -> (handler method name, keyword arguments). The
        # method is looked up only when the option is chosen.
        numbered_actions = {
            '1': ('deploy_full', {}),
            '2': ('deploy_docker_only', {}),
            '3': ('prepare_registered_machines', {'skip_gpu': False}),
            '4': ('plan_instance_migration', {}),
            '5': ('execute_saved_migration_plan', {}),
            '6': ('rollback_saved_migration_plan', {}),
            '7': ('finalize_saved_migration_plan', {}),
            '8': ('delete_edge_node', {}),
            '9': ('deployment_status', {}),
        }
        menu_rows = (
            "  1) Install GPU Nodes      - Deploy Edge Node with GPU image (ratio1/edge_node_gpu)",
            "  2) Install CPU Nodes      - Deploy Edge Node with CPU image (ratio1/edge_node)",
            "  3) Prepare Machines       - Prepare registered machines without deploying nodes",
            "  4) Plan Migration        - Build and save a migration plan without executing it",
            "  5) Execute Migration    - Run the saved migration plan",
            "  6) Rollback Migration   - Recover a failed or interrupted migration",
            "  7) Finalize Migration   - Clean up source artifacts after verified migration",
            "  8) Delete Deployment      - Remove deployed Edge Node",
            "  9) Deployment Status      - Check detailed deployment status",
            "  m) Migrate Install State  - Back-populate 1.8.0 tracking from running hosts",
        )

        while True:
            # Refresh the active config so the status panel is current.
            self._load_active_config()

            self.print_header("Deployment Menu")

            deployment_status = self.active_config.get('deployment_status', 'never_deployed')
            deployment_display = self._get_deployment_display_state()

            self.print_section("Current Deployment Status")
            self.print_colored(deployment_display['status_line'], deployment_display['color'])
            status_note = deployment_display.get('status_note')
            if status_note:
                self.print_colored(f"ℹ️  {deployment_display['status_note']}", 'white')

            if deployment_status == 'deployed':
                last_deployed_network = self.active_config.get('last_deployed_network')
                variant_summary = self.config_manager.install_variant_summary(self.inventory)
                if last_deployed_network:
                    self.print_colored(f"🌐 Network: {last_deployed_network}", 'cyan')
                if variant_summary and variant_summary != 'no installs yet':
                    self.print_colored(f"🔧 Per-host variants: {variant_summary}", 'cyan')

            saved_plan = self.active_config.get('migration_plan_state')
            if saved_plan:
                plan_instance = saved_plan.get('instance_name', '?')
                plan_target = saved_plan.get('target_machine_id', '?')
                plan_status = saved_plan.get('status', 'unknown')
                self.print_colored(
                    f"🧭 Saved Migration Plan: {plan_instance} -> {plan_target} [{plan_status}]",
                    'cyan',
                )

            self.print_colored("Deployment Menu", 'cyan', bold=True)
            print()
            for row in menu_rows:
                self.print_colored(row)
            print()
            self.print_colored("  0) Back to Main Menu")
            print()

            # Prefer the default carried by the display state; otherwise
            # derive one from the stored deployment status.
            deploy_default = deployment_display.get('deployment_menu_default')
            if not deploy_default:
                deploy_default = '2' if deployment_status in ('never_deployed', 'deleted') else '0'
            choice = self.get_input("Select option", deploy_default)

            if choice == '0':
                break

            if choice in numbered_actions:
                method_name, call_kwargs = numbered_actions[choice]
                getattr(self, method_name)(**call_kwargs)
            elif choice.lower() == 'm':
                self.migrate_install_tracking()
            else:
                self.print_colored("Invalid option. Valid choices are 0-9 or m.", 'red')
                self.wait_for_enter()

    def operations_menu(self) -> None:
        """Run the operations submenu (service control, connectivity, service updates).

        Options 1-5 are refused with guidance while no node instances are
        configured, whether or not fleet machines are registered.
        """
        # Choice -> handler method name; resolved only when the option runs.
        service_actions = {
            '1': 'start_edge_node_service',
            '2': 'stop_edge_node_service',
            '3': 'restart_edge_node_service',
            '4': 'test_connectivity',
            '5': 'update_service_file',
        }
        # None -> blank line, ('heading', text) -> bold cyan title,
        # ('item', text) -> plain menu row.
        layout = (
            None,
            ('heading', "\U0001f527 SERVICE MANAGEMENT"),
            ('item', "  1) Start Service          - Start the Edge Node service"),
            ('item', "  2) Stop Service           - Stop the Edge Node service"),
            ('item', "  3) Restart Service        - Restart the Edge Node service"),
            None,
            ('heading', "\U0001f4e1 CONNECTIVITY"),
            ('item', "  4) Test Connectivity      - Verify connection to configured nodes"),
            None,
            ('heading', "\U0001f9e9 SERVICE UPDATES"),
            ('item', "  5) Update Service File    - Re-apply the versioned service template"),
            None,
            ('item', "  0) Back to Main Menu"),
            None,
        )

        while True:
            self.print_header("Operations Menu")

            has_config = self.check_hosts_config()
            has_machines = self.has_fleet_machines()
            if has_config:
                self.load_configuration()
                node_total = len(_get_gpu_hosts(self.inventory))
                self.print_section(f"Available for {node_total} configured node(s)")
            elif has_machines:
                self.load_configuration()
                machine_total = len(self.get_fleet_machines_as_hosts())
                self.print_section(f"{machine_total} registered machine(s), no node instances yet")
                self.print_colored(
                    "\U0001f4a1 Use 'Discover Services' (Main Menu \u2192 1 \u2192 7) to import existing services,\n"
                    "   or 'Add New Node' (Main Menu \u2192 1 \u2192 1) to create new instances.",
                    'yellow',
                )
            else:
                self.print_section("No nodes configured")
                self.print_colored("\u26a0\ufe0f  Configure nodes first to use operations tools", 'yellow')

            self.print_colored("Operations Menu", 'cyan', bold=True)
            for entry in layout:
                if entry is None:
                    print()
                    continue
                kind, text = entry
                if kind == 'heading':
                    self.print_colored(text, 'cyan', bold=True)
                else:
                    self.print_colored(text)

            choice = self.get_input("Select option", "0")
            if choice == '0':
                break

            if choice not in service_actions:
                self.print_colored("Invalid option. Valid choices are 0-5.", 'red')
                self.wait_for_enter()
                continue

            if has_config:
                getattr(self, service_actions[choice])()
                continue

            # A valid option was chosen but there are no node instances to act on.
            if has_machines:
                self.print_colored(
                    "Machines registered but no node instances configured.\n"
                    "Use 'Discover Services' (Main Menu \u2192 1 \u2192 7) to import existing services,\n"
                    "or 'Add New Node' (Main Menu \u2192 1 \u2192 1) to create new instances.",
                    'red',
                )
            else:
                self.print_colored("No nodes configured. Please configure nodes first (Main Menu \u2192 1).", 'red')
            self.wait_for_enter()

    def _get_service_version_drift(self, hosts: Dict[str, Dict[str, Any]]) -> Tuple[str, List[str], List[str]]:
        """Split hosts by whether their recorded service-file version matches the target.

        Args:
            hosts: Mapping of host name to host configuration.

        Returns:
            ``(target_version, outdated_hosts, current_hosts)``; both lists keep
            the iteration order of ``hosts``.
        """
        target = self.get_mnl_service_version()
        up_to_date = []
        stale = []

        for name, settings in hosts.items():
            matches_target = self.get_host_service_file_version(settings) == target
            bucket = up_to_date if matches_target else stale
            bucket.append(name)

        return target, stale, up_to_date

    @staticmethod
    def _is_service_update_candidate(node_status: str) -> bool:
        """Tell whether a node with this status takes part in service-file update checks.

        Nodes whose status is never_deployed, deleted or not_deployed are
        excluded; every other status qualifies.
        """
        excluded_statuses = frozenset(('never_deployed', 'deleted', 'not_deployed'))
        if node_status in excluded_statuses:
            return False
        return True

    @classmethod
    def _is_service_update_candidate_instance(cls, instance_view: Dict[str, Any]) -> bool:
        """Apply the service-update candidate check to a grouped instance view.

        The view's ``status`` entry is converted to a string first; a missing
        or empty status is treated as 'unknown'.
        """
        raw_status = instance_view.get('status')
        status_text = str(raw_status or 'unknown')
        return cls._is_service_update_candidate(status_text)

    def _get_service_update_inventory(
        self,
        hosts: Dict[str, Dict[str, Any]],
    ) -> Tuple[str, Dict[str, Dict[str, Any]], List[str], List[str], List[str]]:
        """Filter hosts to update candidates and report their service-version drift.

        Args:
            hosts: Mapping of host name to host configuration.

        Returns:
            ``(target_version, eligible_hosts, outdated_hosts, current_hosts,
            skipped_hosts)``. Drift is computed over the eligible hosts only;
            ``skipped_hosts`` names the hosts whose status excludes them.
        """
        # One pass over the hosts: look up each status once and record the verdict.
        verdicts = [
            (
                name,
                settings,
                self._is_service_update_candidate(
                    self._get_node_status_info(name).get('status', 'unknown')
                ),
            )
            for name, settings in hosts.items()
        ]
        eligible = {name: settings for name, settings, qualifies in verdicts if qualifies}
        skipped = [name for name, _settings, qualifies in verdicts if not qualifies]

        target, outdated, current = self._get_service_version_drift(eligible)
        return target, eligible, outdated, current, skipped

    def _offer_startup_service_update(self) -> None:
        """Prompt for a service-file update when eligible hosts are out of date.

        Does nothing when there is no host configuration, when the stored
        deployment status is never_deployed or deleted, when the inventory has
        no hosts, or when no eligible host is outdated. Declining the first
        prompt leads to a second confirmation; the update is skipped only when
        the operator confirms the skip there.
        """
        if not self.check_hosts_config():
            return

        stored_status = self.active_config.get('deployment_status', 'never_deployed')
        if stored_status in ('never_deployed', 'deleted'):
            return

        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        if not hosts:
            return

        target_version, eligible, outdated, _current, _skipped = self._get_service_update_inventory(hosts)
        if not (eligible and outdated):
            return

        overrides = self._get_service_overrides()
        print()
        self.print_colored("⚠️  Service file update available", 'yellow', bold=True)
        self.print_colored(f"  • Target version: {target_version}", 'yellow')
        self.print_colored(f"  • Needs update: {', '.join(outdated)}", 'red')
        self.print_colored(
            "  • Apply now to re-template and restart the service on those nodes.",
            'white'
        )
        if overrides:
            self.print_colored(
                f"  • Active service overrides: {len(overrides)}. They will be re-applied too.",
                'yellow'
            )

        accepted = self.get_input("Update outdated service files now? (Y/n)", "Y").lower() == 'y'
        if not accepted:
            # Second chance: only an explicit 'y' here actually skips the update.
            skip_confirmed = self.get_input(
                "Service updates are recommended to keep nodes aligned. Skip for now anyway? (y/N)",
                "N",
            ).lower() == 'y'
            if skip_confirmed:
                self.print_colored(
                    "You can update later via Main Menu -> 3 -> Update Service File.",
                    'yellow'
                )
                print()
                return
            self.print_colored("Proceeding with the recommended service update.", 'yellow')

        self._apply_service_template_to_hosts(
            outdated,
            overrides=overrides or None,
            last_applied_action="update_service_file",
            progress_message="Applying service file update...",
            success_message=f"Service file update applied on {len(outdated)} node(s).",
            failure_message="Service file update encountered errors. Check the output above.",
        )
        self.wait_for_enter()
        print()

    def _refresh_service_update_status(self, selected_host_names: List[str]) -> None:
        """Re-check node status after a service template update, on a best-effort basis.

        Hosts with live status data get the reported status (defaulting to
        'running' when the entry has no ``status``). Hosts without live data,
        or all hosts when the live lookup fails or returns nothing, are marked
        'running'.

        Args:
            selected_host_names: Hosts that just received the service update.
        """
        self.print_colored("\nVerifying service update status...", 'cyan')

        try:
            live_status = self._get_real_time_node_status()
        except Exception as exc:
            # Verification is optional; a failed lookup is treated as "no data".
            self.print_debug(f"Unable to refresh status after service update: {exc}")
            live_status = {}

        if not live_status:
            self.print_colored(
                "⚠️  Service update completed, but live verification was unavailable. Stored version was updated locally.",
                'yellow'
            )
            for name in selected_host_names:
                self._update_node_status(name, 'running')
            return

        # Apply the verified statuses first and remember who had no data.
        unverified = []
        for name in selected_host_names:
            entry = live_status.get(name)
            if entry:
                self._update_node_status(name, entry.get('status', 'running'))
            else:
                unverified.append(name)

        if not unverified:
            return

        unverified_names = ', '.join(sorted(unverified))
        self.print_colored(
            f"⚠️  No live verification returned for: {unverified_names}. Keeping them marked as running.",
            'yellow'
        )
        for name in unverified:
            self._update_node_status(name, 'running')

    def _apply_service_template_to_hosts(
        self,
        selected_host_names: List[str],
        *,
        overrides: Optional[Dict[str, Any]] = None,
        last_applied_action: str,
        progress_message: str,
        success_message: str,
        failure_message: str,
    ) -> bool:
        """Run the ``customize_service.yml`` playbook against the selected nodes.

        Args:
            selected_host_names: Hosts to re-template.
            overrides: Passed to the playbook run as extra vars.
            last_applied_action: Action label forwarded to the playbook run.
            progress_message: Printed before the playbook starts.
            success_message: Printed (after a check mark) on success.
            failure_message: Printed (after a cross mark) on failure.

        Returns:
            True when the playbook run reports success. False when the
            helper-mode check fails, the playbook file is missing, or the run
            fails; in the last case every selected host is marked 'error'.
        """
        helper_ok = self._ensure_helper_mode_supported_for_hosts(
            selected_host_names,
            action_label="apply service-helper changes",
        )
        if not helper_ok:
            return False

        playbook_path = self.config_dir / 'playbooks/customize_service.yml'
        if not playbook_path.exists():
            self.print_colored(f"Playbook not found: {playbook_path}", 'red')
            self.wait_for_enter()
            return False

        # Flag the hosts as in progress before the playbook starts.
        for name in selected_host_names:
            self._update_node_status(name, 'deploying')

        self.print_colored(f"\n{progress_message}", 'cyan')
        playbook_ok, _second, _third, _fourth = self.run_generated_playbook(
            playbook_path,
            selected_host_names,
            machine_scope=False,
            extra_vars=overrides,
            last_applied_action=last_applied_action,
            show_output=True,
            timeout=self.connection_timeout,
        )

        if not playbook_ok:
            self.print_colored(f"\n❌ {failure_message}", 'red')
            for name in selected_host_names:
                self._update_node_status(name, 'error')
            return False

        self.print_colored(f"\n✅ {success_message}", 'green')
        # Record the applied version, then verify the resulting node status.
        self.record_service_file_version(selected_host_names)
        self._refresh_service_update_status(selected_host_names)
        return True

    def update_service_file(self) -> None:
        """Interactively roll the current service template version out to nodes.

        Flow: make sure nodes are configured and a deployment exists, report
        which nodes are outdated, current or skipped, let the operator choose
        targets (outdated nodes form the initial selection; when nothing is
        outdated the operator may opt to re-apply anyway), ask for a final
        confirmation and then delegate to ``_apply_service_template_to_hosts``.
        Active service overrides are passed along with the update.
        """
        if not self.check_hosts_config():
            self.print_colored("No nodes configured!", 'red')
            self.wait_for_enter()
            return

        self.load_configuration()
        inactive_states = ('never_deployed', 'deleted')
        if self.active_config.get('deployment_status', 'never_deployed') in inactive_states:
            self.print_colored("No active deployment found. Deploy nodes before updating the service file.", 'yellow')
            self.wait_for_enter()
            return

        inventory_hosts = _get_gpu_hosts(self.inventory)
        if not inventory_hosts:
            self.print_colored("No hosts found in inventory.", 'red')
            self.wait_for_enter()
            return

        active_overrides = self._get_service_overrides()
        (
            target_version,
            eligible,
            outdated,
            up_to_date,
            skipped,
        ) = self._get_service_update_inventory(inventory_hosts)

        # --- Overview of the fleet relative to the target version ---
        self.print_header("Update Service File")
        self.print_colored(f"Target service template version: {target_version}", 'cyan', bold=True)

        if skipped:
            self.print_colored(
                f"Skipping nodes without a deployed service: {', '.join(skipped)}",
                'yellow'
            )

        if outdated:
            self.print_colored(f"Needs update ({len(outdated)}): {', '.join(outdated)}", 'red')
        else:
            self.print_colored("All nodes already have the current service file version.", 'green')

        if up_to_date:
            self.print_colored(f"Already current ({len(up_to_date)}): {', '.join(up_to_date)}", 'green')

        if active_overrides:
            self.print_colored(
                f"Active service overrides: {len(active_overrides)}. They will be included in this update.",
                'yellow'
            )
        print()

        # --- Decide what the host picker starts with ---
        if outdated:
            starting_selection = set(outdated)
            starting_label = "nodes that need a service update"
        else:
            reapply = self.get_input("Re-apply the service file anyway? (y/N)", "N")
            if reapply.lower() != 'y':
                self.print_colored("Service update cancelled.", 'yellow')
                self.wait_for_enter()
                return
            starting_selection = set(inventory_hosts.keys())
            starting_label = "all configured nodes"

        selected_host_names = self.select_hosts(
            eligible,
            "update service file",
            preselect_mode='none',
            initial_selection=starting_selection,
            preselection_label=starting_label,
        )
        if not selected_host_names:
            self.print_colored("Cancelled — no hosts selected.", 'yellow')
            self.wait_for_enter()
            return

        # --- Show the version transition per target node ---
        self.print_colored(f"\n🖥️  Target nodes ({len(selected_host_names)}):", 'cyan', bold=True)
        for node_name in selected_host_names:
            node_config = inventory_hosts[node_name]
            address = node_config.get('ansible_host', '?')
            installed_version = self.get_host_service_file_version(node_config)
            if installed_version != target_version:
                line_color = 'red'
            else:
                line_color = 'green'
            self.print_colored(
                f"   • {node_name}: {address} | service {installed_version} -> {target_version}",
                line_color
            )

        if active_overrides:
            self.print_colored("   • active overrides will also be re-applied", 'yellow')

        go_ahead = self.get_input("\nProceed with service file update? (y/n)", "y")
        if go_ahead.lower() != 'y':
            self.print_colored("Service update cancelled.", 'yellow')
            self.wait_for_enter()
            return

        self._apply_service_template_to_hosts(
            selected_host_names,
            overrides=active_overrides or None,
            last_applied_action="update_service_file",
            progress_message="Applying service file update...",
            success_message=f"Service file update applied on {len(selected_host_names)} node(s).",
            failure_message="Service file update encountered errors. Check the output above.",
        )

        self.wait_for_enter()

    # -- Customize Service --

    def customize_service(self) -> None:
        """Show the customization disclaimer and, once accepted, open the sub-menu.

        The operator must type ``yes`` (case-insensitive) to continue; any
        other answer cancels and returns after a pause.
        """
        self.print_header("Customize Service")
        self.print_colored("⚠️  WARNING", 'red', bold=True)

        what_it_does = (
            "This feature lets you override service template variables",
            "(e.g. Docker image, GPU flags) and re-deploy the service",
            "file WITHOUT a full site.yml run.",
        )
        for text in what_it_does:
            self.print_colored(text, 'yellow')
        print()

        caveats = (
            "Overrides take precedence over group_vars defaults.",
            "A collection update will auto-clear all overrides.",
        )
        for text in caveats:
            self.print_colored(text, 'yellow')
        print()

        answer = self.get_input("Type 'yes' to continue", "")
        if answer.lower() == 'yes':
            self._customize_service_menu()
            return

        self.print_colored("Cancelled.", 'yellow')
        self.wait_for_enter()

    def _customize_service_menu(self) -> None:
        """Sub-menu loop for managing service overrides."""
        while True:
            self.print_header("Customize Service")

            overrides = self._get_service_overrides()
            if overrides:
                self.print_colored("📋 Active Overrides:", 'cyan', bold=True)
                for var, val in overrides.items():
                    desc = CUSTOMIZABLE_VARS.get(var, {}).get('description', var)
                    self.print_colored(f"   {desc} ({var})", 'white')
                    self.print_colored(f"     = {val}", 'green')
            else:
                self.print_colored("No active overrides — defaults from group_vars are used.", 'white')
            print()

            self.print_colored("Options:", 'cyan', bold=True)
            self.print_colored("  1) Set/Edit Override    - Pick a variable and set a new value")
            self.print_colored("  2) Remove Override      - Remove a single override")
            self.print_colored("  3) Clear All Overrides  - Reset all to defaults")
            self.print_colored("  4) Apply Overrides      - Re-template & restart on selected nodes")
            self.print_colored("  0) Back")
            print()

            choice = self.get_input("Select option", "0")

            if choice == '0':
                break
            elif choice == '1':
                self._set_service_override()
            elif choice == '2':
                self._remove_service_override()
            elif choice == '3':
                self._clear_all_overrides()
            elif choice == '4':
                self._apply_service_overrides()
            else:
                self.print_colored("Invalid option.", 'red')
                self.wait_for_enter()

    def _set_service_override(self) -> None:
        """Let the operator choose a whitelisted variable and store a new value.

        Lists every entry of ``CUSTOMIZABLE_VARS`` with its current override
        or default, reads a 1-based selection (``0`` cancels), shows details
        for the chosen variable and saves the entered value. An empty value
        cancels without saving.
        """
        self.print_header("Set/Edit Override")

        var_names = list(CUSTOMIZABLE_VARS.keys())
        active = self._get_service_overrides()

        for number, name in enumerate(var_names, 1):
            meta = CUSTOMIZABLE_VARS[name]
            active_value = active.get(name)
            entry = f"  {number}) {meta['description']} ({name})"
            entry += (
                f"  [current: {active_value}]"
                if active_value is not None
                else f"  [default: {meta['default']}]"
            )
            self.print_colored(entry)
        print()

        pick = self.get_input(f"Select variable (1-{len(var_names)}, 0=cancel)", "0")
        if pick == '0':
            return

        # Non-numeric input and out-of-range numbers are rejected alike.
        try:
            position = int(pick)
        except ValueError:
            position = None
        if position is None or not (1 <= position <= len(var_names)):
            self.print_colored("Invalid selection.", 'red')
            self.wait_for_enter()
            return

        chosen = var_names[position - 1]
        meta = CUSTOMIZABLE_VARS[chosen]

        self.print_colored(f"\nVariable : {chosen}", 'cyan')
        self.print_colored(f"Desc     : {meta['description']}", 'white')
        self.print_colored(f"Example  : {meta['example']}", 'white')
        existing = active.get(chosen)
        if existing is not None:
            self.print_colored(f"Current  : {existing}", 'green')
        print()

        new_value = self.get_input("Enter new value (empty=cancel)")
        if new_value:
            active[chosen] = new_value
            self._save_service_overrides(active)
            self.print_colored(f"✅ Override saved: {chosen} = {new_value}", 'green')
        else:
            self.print_colored("Cancelled.", 'yellow')
        self.wait_for_enter()

    def _remove_service_override(self) -> None:
        """Remove a single override."""
        overrides = self._get_service_overrides()
        if not overrides:
            self.print_colored("No overrides to remove.", 'yellow')
            self.wait_for_enter()
            return

        self.print_header("Remove Override")

        keys = list(overrides.keys())
        for idx, var in enumerate(keys, 1):
            desc = CUSTOMIZABLE_VARS.get(var, {}).get('description', var)
            self.print_colored(f"  {idx}) {desc} ({var}) = {overrides[var]}")
        print()

        pick = self.get_input(f"Select override to remove (1-{len(keys)}, 0=cancel)", "0")
        if pick == '0':
            return
        try:
            idx = int(pick) - 1
            if idx < 0 or idx >= len(keys):
                raise ValueError
        except ValueError:
            self.print_colored("Invalid selection.", 'red')
            self.wait_for_enter()
            return

        var = keys[idx]
        del overrides[var]
        self._save_service_overrides(overrides)
        self.print_colored(f"✅ Override removed: {var}", 'green')
        self.wait_for_enter()

    def _clear_all_overrides(self) -> None:
        """Clear all overrides after confirmation."""
        overrides = self._get_service_overrides()
        if not overrides:
            self.print_colored("No overrides to clear.", 'yellow')
            self.wait_for_enter()
            return

        self.print_colored(f"This will remove {len(overrides)} override(s):", 'yellow')
        for var, val in overrides.items():
            self.print_colored(f"   {var} = {val}", 'white')
        print()

        confirm = self.get_input("Type 'yes' to confirm", "")
        if confirm.lower() != 'yes':
            self.print_colored("Cancelled.", 'yellow')
            self.wait_for_enter()
            return

        self._save_service_overrides({})
        self.print_colored("✅ All overrides cleared.", 'green')
        self.wait_for_enter()

    def _apply_service_overrides(self) -> None:
        """Select nodes, build ansible command with --extra-vars, run customize_service.yml."""
        if not self.check_hosts_config():
            self.print_colored("No nodes configured!", 'red')
            self.wait_for_enter()
            return

        self.load_configuration()
        all_hosts = _get_gpu_hosts(self.inventory)
        if not all_hosts:
            self.print_colored("No hosts found in inventory.", 'red')
            self.wait_for_enter()
            return

        overrides = self._get_service_overrides()

        self.print_header("Apply Service Overrides")
        if overrides:
            self.print_colored("📋 Overrides to apply:", 'cyan', bold=True)
            for var, val in overrides.items():
                desc = CUSTOMIZABLE_VARS.get(var, {}).get('description', var)
                self.print_colored(f"   {desc}: {val}", 'green')
        else:
            self.print_colored("No overrides set — defaults from group_vars will be applied.", 'white')
        print()

        selected_host_names = self.select_hosts(all_hosts, "apply service overrides", preselect_mode='all')
        if not selected_host_names:
            self.print_colored("Cancelled — no hosts selected.", 'yellow')
            self.wait_for_enter()
            return

        self.print_colored(f"\n🖥️  Target nodes ({len(selected_host_names)}):", 'cyan', bold=True)
        for name in selected_host_names:
            config = all_hosts[name]
            ip = config.get('ansible_host', '?')
            self.print_colored(f"   • {name}: {ip}", 'white')

        if self.get_input("\nProceed? (y/n)", "y").lower() != 'y':
            self.print_colored("Cancelled.", 'yellow')
            self.wait_for_enter()
            return

        self._apply_service_template_to_hosts(
            selected_host_names,
            overrides=overrides or None,
            last_applied_action="customize_service",
            progress_message="Applying service customization...",
            success_message=f"Service customization applied on {len(selected_host_names)} node(s).",
            failure_message="Customization encountered errors. Check the output above.",
        )

        self.wait_for_enter()

    def advanced_menu(self) -> None:
        """Run the Advanced Menu loop until the operator chooses 0.

        Options 1-5 (SSH, logs, keys) are available as soon as either
        inventory nodes or fleet machines exist. Option 6 (service
        customization) requires configured node instances.
        """
        # Menu choice -> name of the method that implements it.
        handlers = {
            '1': 'ssh_into_node_machine',
            '2': 'get_logs',
            '3': 'write_logs_to_file',
            '4': 'import_nodes_private_keys',
            '5': 'ssh_key_management_menu',
            '6': 'customize_service',
        }

        while True:
            self.print_header("Advanced Menu")

            # Status banner: inventory nodes first, fleet machines as fallback.
            has_config = self.check_hosts_config()
            has_machines = self.has_fleet_machines()
            if has_config:
                self.load_configuration()
                node_count = len(_get_gpu_hosts(self.inventory))
                self.print_section(f"Available for {node_count} configured node(s)")
            elif has_machines:
                self.load_configuration()
                machine_count = len(self.get_fleet_machines_as_hosts())
                self.print_section(f"{machine_count} registered machine(s), no node instances yet")
                self.print_colored(
                    "\U0001f4a1 Use 'Discover Services' (Main Menu \u2192 1 \u2192 7) to import existing services,\n"
                    "   or 'Add New Node' (Main Menu \u2192 1 \u2192 1) to create new instances.",
                    'yellow',
                )
            else:
                self.print_section("No nodes configured")
                self.print_colored("\u26a0\ufe0f  Configure nodes first to use advanced tools", 'yellow')

            self.print_colored("Advanced Menu", 'cyan', bold=True)
            print()
            self.print_colored("\U0001f527 OPERATIONAL TOOLS", 'cyan', bold=True)
            self.print_colored("  1) SSH to Machine        - Connect to machine via SSH")
            self.print_colored("  2) Get Logs              - Stream logs from nodes")
            self.print_colored("  3) Write Logs to File    - Save node logs to local file")
            print()
            self.print_colored("\U0001f510 SECURITY TOOLS", 'red', bold=True)
            self.print_colored("  4) Import Private Keys   - Collect private keys from all nodes")
            self.print_colored("  5) SSH Key Management    - Install, validate, and track SSH keys")
            print()
            self.print_colored("\u2699\ufe0f  CUSTOMIZATION", 'cyan', bold=True)
            self.print_colored("  6) Customize Service     - Override service template variables")
            print()
            self.print_colored("  0) Back to Main Menu")
            print()

            selection = self.get_input("Select option", "0")
            if selection == '0':
                return

            handler_name = handlers.get(selection)
            if handler_name is None:
                self.print_colored("Invalid option. Valid choices are 0-6.", 'red')
                self.wait_for_enter()
                continue

            if selection == '6':
                # Service customization requires deployed node instances.
                if not has_config:
                    if has_machines:
                        self.print_colored(
                            "Machines registered but no node instances configured.\n"
                            "Use 'Discover Services' (Main Menu \u2192 1 \u2192 7) to import existing services first.",
                            'red',
                        )
                    else:
                        self.print_colored("No nodes configured. Please configure nodes first (Main Menu \u2192 1).", 'red')
                    self.wait_for_enter()
                    continue
            elif not (has_config or has_machines):
                # Machine-level operations (SSH, logs, keys) work with fleet machines.
                self.print_colored("No nodes or machines configured. Please configure nodes first (Main Menu \u2192 1).", 'red')
                self.wait_for_enter()
                continue

            getattr(self, handler_name)()



    def import_nodes_private_keys(self) -> None:
        """Collect each node's private key over SSH into a local ``node_keys`` folder.

        After an explicit ``yes`` confirmation, every inventory host (or, when
        the inventory has none, every fleet machine) is contacted over SSH and
        ``/var/cache/edge_node/_local_cache/_data/e2.pem`` is saved as
        ``node_keys/<host>_e2.pem`` under the current working directory.

        Security handling:
          * key files are opened with mode 0600 and re-tightened before any
            key material is written, so the content never sits in a more
            permissive file;
          * a newly created ``node_keys`` directory is owner-only (0700);
          * SSH passwords are handed to ``sshpass`` through the ``SSHPASS``
            environment variable (``-e``) instead of ``-p`` on the command
            line, where they would be visible in the process list.

        Per-host failures are reported and counted; they do not abort the run.
        """
        if not self.check_hosts_config() and not self.has_fleet_machines():
            self.print_colored("No nodes or machines configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("Import Nodes Private Keys")

        # WARNING MESSAGE
        self.print_section("\u26a0\ufe0f  CRITICAL SECURITY WARNING")
        self.print_colored("This operation will collect private keys from all configured nodes.", 'red', bold=True)
        self.print_colored("Private keys provide full access to node wallets and should be handled with extreme care.", 'red')
        self.print_colored("Only proceed if you understand the security implications.", 'red')
        print()

        # Load configuration
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        # Fall back to fleet-state machines when no inventory hosts exist
        if not hosts:
            hosts = self.get_fleet_machines_as_hosts()

        if not hosts:
            self.print_colored("No nodes or machines configured.", 'yellow')
            self.wait_for_enter()
            return

        # Show nodes that will be processed
        self.print_section(f"Target Nodes ({len(hosts)})")
        for host_name, config in hosts.items():
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            self.print_colored(f"  • {host_name}: {user}@{ip}", 'white')

        # Confirm operation
        print()
        self.print_colored("This will:", 'yellow', bold=True)
        self.print_colored("  1. Connect to each node via SSH", 'yellow')
        self.print_colored("  2. Retrieve the private key file: /var/cache/edge_node/_local_cache/_data/e2.pem", 'yellow')
        self.print_colored("  3. Create a local 'node_keys' folder", 'yellow')
        self.print_colored("  4. Save each key as: node_keys/{node_name}_e2.pem", 'yellow')
        print()

        # Final confirmation
        confirm = self.get_input("Type 'yes' to proceed", "")
        if confirm.lower() != 'yes':
            self.print_colored("Operation cancelled.", 'yellow')
            self.wait_for_enter()
            return

        # Create keys directory; a new directory is owner-only because it
        # will hold wallet private keys.
        keys_dir = os.path.join(os.getcwd(), 'node_keys')
        try:
            os.makedirs(keys_dir, mode=0o700, exist_ok=True)
            self.print_colored(f"Created keys directory: {keys_dir}", 'green')
        except Exception as e:
            self.print_colored(f"Failed to create keys directory: {e}", 'red')
            self.wait_for_enter()
            return

        # Process each node
        self.print_section("Collecting Private Keys")
        successful_imports = 0
        failed_imports = 0

        for host_name, config in hosts.items():
            self.print_colored(f"\n📡 Processing {host_name}...", 'cyan')

            # Get connection details
            ip = config.get('ansible_host')
            user = config.get('ansible_user')

            if not ip or not user:
                self.print_colored(f"❌ Missing connection details for {host_name}", 'red')
                failed_imports += 1
                continue

            # Build SSH command to retrieve the private key
            ssh_cmd = ['ssh']

            # Add port if specified
            ssh_port = config.get('ansible_port', 22)
            if ssh_port != 22:
                ssh_cmd.extend(['-p', str(ssh_port)])

            # Handle authentication. ``run_env`` stays None (inherit the
            # current environment) unless a password must be passed along.
            run_env = None
            if 'ansible_ssh_pass' in config:
                # Password authentication goes through sshpass. The password
                # is supplied via the SSHPASS environment variable (-e) so it
                # never appears in the argument list / process table.
                ssh_cmd = ['sshpass', '-e'] + ssh_cmd
                run_env = dict(os.environ, SSHPASS=str(config['ansible_ssh_pass']))
            elif 'ansible_ssh_private_key_file' in config:
                # expanduser only touches a leading "~", other paths pass through.
                key_file = os.path.expanduser(str(config['ansible_ssh_private_key_file']))
                ssh_cmd.extend(['-i', key_file])

            # Add SSH options
            ssh_cmd.extend([
                '-o', 'StrictHostKeyChecking=no',
                '-o', 'UserKnownHostsFile=/dev/null',
                '-o', f'ConnectTimeout={self.ssh_connect_timeout}',
                f"{user}@{ip}",
                'cat /var/cache/edge_node/_local_cache/_data/e2.pem'
            ])

            try:
                # Execute SSH command to get the private key
                result = subprocess.run(
                    ssh_cmd,
                    capture_output=True,
                    text=True,
                    timeout=self.connection_timeout,
                    env=run_env,
                )

                if result.returncode == 0 and result.stdout.strip():
                    # Save the private key to file
                    key_filename = f"{host_name}_e2.pem"
                    key_filepath = os.path.join(keys_dir, key_filename)

                    # Create the file owner-only from the start instead of
                    # writing with default permissions and fixing them later.
                    key_fd = os.open(key_filepath, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
                    with os.fdopen(key_fd, 'w') as f:
                        # The mode passed to os.open only applies to new
                        # files; tighten a pre-existing one before writing.
                        os.chmod(key_filepath, 0o600)
                        f.write(result.stdout)

                    self.print_colored(f"✅ Successfully imported key: {key_filename}", 'green')
                    successful_imports += 1
                else:
                    error_msg = result.stderr.strip() if result.stderr else "No key content received"
                    self.print_colored(f"❌ Failed to retrieve key: {error_msg}", 'red')
                    failed_imports += 1

            except subprocess.TimeoutExpired:
                self.print_colored(f"❌ Connection timeout to {host_name}", 'red')
                failed_imports += 1
            except FileNotFoundError as e:
                if 'sshpass' in str(e):
                    self.print_colored("❌ sshpass not found - install with: sudo apt-get install sshpass", 'red')
                else:
                    self.print_colored(f"❌ SSH command failed: {e}", 'red')
                failed_imports += 1
            except Exception as e:
                # Best-effort per host: report and continue with the next node.
                self.print_colored(f"❌ Error processing {host_name}: {e}", 'red')
                failed_imports += 1

        # Summary
        self.print_section("Import Summary")
        self.print_colored(f"✅ Successful imports: {successful_imports}", 'green')
        self.print_colored(f"❌ Failed imports: {failed_imports}", 'red')
        self.print_colored(f"📁 Keys saved to: {keys_dir}", 'cyan')

        if successful_imports > 0:
            self.print_colored("\n🔐 SECURITY REMINDER:", 'red', bold=True)
            self.print_colored("  • Private keys have been saved locally", 'red')
            self.print_colored("  • Keep these files secure and delete when no longer needed", 'red')
            self.print_colored("  • These keys provide full access to node wallets", 'red')

        self.wait_for_enter()

    def _select_network_environment(self) -> str:
        """Select network environment"""
        self.print_colored("\nNetwork Environment Options:")
        self.print_colored("  1) mainnet")
        self.print_colored("  2) testnet")
        self.print_colored("  3) devnet")

        current_env = self.get_mnl_app_env()
        if current_env:
            self.print_colored(f"Current: {current_env}", 'yellow')

        while True:
            choice = self.get_input("Select network environment (1-3)", "1")
            if choice == '1':
                return 'mainnet'
            elif choice == '2':
                return 'testnet'
            elif choice == '3':
                return 'devnet'
            else:
                self.print_colored("Invalid choice. Please enter 1, 2, or 3", 'red')

    def _configure_single_node(self, existing_config: Dict[str, Any] = None, previous_config: Dict[str, Any] = None) -> Dict[str, Any]:
        """Configure a single node"""
        while True:
            host = {}

            # "Use same credentials" shortcut for batch configuration
            if previous_config and not existing_config:
                prev_user = previous_config.get('ansible_user', '')
                prev_auth = 'SSH Key' if 'ansible_ssh_private_key_file' in previous_config else 'Password'
                self.print_colored(f"\nPrevious node: {prev_user}, {prev_auth} auth", 'cyan')
                use_same = self.get_input("Reuse these credentials for this node? (y/n)", "y")
                if use_same.lower() == 'y':
                    # Ask for IP and username (with previous as default)
                    host_address = self.get_input("Enter host (IP address or URL)", "", required=True)
                    username = self.get_input("Enter SSH username", prev_user)
                    # Copy credentials from previous config
                    host['ansible_host'] = host_address
                    host['ansible_user'] = username or prev_user
                    host['ansible_ssh_common_args'] = previous_config.get('ansible_ssh_common_args', '-o StrictHostKeyChecking=no')
                    for key in ('ansible_ssh_pass', 'ansible_become_password', 'ansible_ssh_private_key_file'):
                        if key in previous_config:
                            host[key] = previous_config[key]
                    # Set new node status
                    timestamp = datetime.now().isoformat()
                    host['node_status'] = 'never_deployed'
                    host['last_status_update'] = timestamp
                    if 'ansible_ssh_private_key_file' in host:
                        host['r1setup_ssh_auth_mode'] = SSH_AUTH_MODE_KEY_INSTALLED_UNVERIFIED
                        host['r1setup_ssh_primary_key_path'] = host['ansible_ssh_private_key_file']
                        host['r1setup_ssh_requires_revalidation'] = True
                    else:
                        host['r1setup_ssh_auth_mode'] = SSH_AUTH_MODE_PASSWORD_ONLY
                        host['r1setup_ssh_primary_key_path'] = None
                        host['r1setup_ssh_requires_revalidation'] = False
                    host['r1setup_ssh_last_verification_status'] = 'not_checked'
                    host['r1setup_managed_public_keys'] = previous_config.get('r1setup_managed_public_keys', [])
                    host[SERVICE_FILE_VERSION_FIELD] = DEFAULT_SERVICE_FILE_VERSION
                    # Show summary and confirm
                    auth_type = 'SSH Key' if 'ansible_ssh_private_key_file' in host else 'Password'
                    self.print_colored("\nConfiguration Summary:", 'yellow')
                    self.print_colored(f"Host: {host['ansible_host']}")
                    self.print_colored(f"User: {host['ansible_user']}")
                    self.print_colored(f"Auth: {auth_type}")
                    if self.get_input("\nConfirm this configuration? (y/n)", "y").lower() == 'y':
                        return host
                    self.print_colored("OK, entering full configuration...", 'yellow')
                    previous_config = None  # Don't offer shortcut again
                    continue
                # If "no", fall through to normal flow with previous values as defaults

            # Get existing values for defaults
            existing_ip = existing_config.get('ansible_host', '') if existing_config else ''
            existing_user = existing_config.get('ansible_user', '') if existing_config else 'root'
            existing_auth_type = 'password' if existing_config and 'ansible_ssh_pass' in existing_config else 'key' if existing_config else None

            # Use previous config values as defaults when available
            if previous_config and not existing_config:
                existing_user = previous_config.get('ansible_user', existing_user)
                existing_auth_type = 'password' if 'ansible_ssh_pass' in previous_config else 'key'

            # Show current values if updating
            if existing_config:
                self.print_colored("\nCurrent configuration:", 'yellow')
                self.print_colored(f"Host: {existing_ip}")
                self.print_colored(f"User: {existing_user}")
                self.print_colored(f"Auth: {'Password' if existing_auth_type == 'password' else 'SSH Key'}")
                self.print_colored("\nPress Enter to keep current values, or enter new values:", 'cyan')

            # Get host (IP address or URL)
            host_prompt = f"Enter host (IP address or URL)"
            if existing_ip:
                host_prompt += f" (current: {existing_ip})"
            host_address = self.get_input(host_prompt, existing_ip, required=True)
            host['ansible_host'] = host_address

            # Get username (default: root for new nodes, previous value for subsequent)
            default_user = existing_user if existing_user else 'root'
            user_prompt = "Enter SSH username"
            if existing_config:
                user_prompt += f" (current: {default_user})"
            username = self.get_input(user_prompt, default_user)
            if not username.strip():
                self.print_colored("Username cannot be empty", 'red')
                continue
            host['ansible_user'] = username

            # Authentication method selection (with retry loop)
            auth_configured = False
            while not auth_configured:
                # Authentication method
                self.print_colored("\nAuthentication method:")
                self.print_colored("  1) Password")
                self.print_colored("  2) SSH Key")

                # Set default based on existing config
                default_auth = "1" if existing_auth_type == 'password' else "2"
                auth_prompt = "Select authentication (1/2)"
                if existing_auth_type:
                    auth_prompt += f" (current: {existing_auth_type})"

                while True:
                    auth_choice = self.get_input(auth_prompt, default_auth)
                    if auth_choice in ['1', '2']:
                        break
                    self.print_colored("Invalid choice. Please enter 1 or 2", 'red')

                if auth_choice == '1':
                    # Password authentication
                    host.pop('ansible_ssh_private_key_file', None)
                    if existing_config and 'ansible_ssh_pass' in existing_config:
                        self.print_colored("\nPassword authentication - Press Enter to keep existing passwords", 'cyan')
                        ssh_pass = self.get_secure_input("Enter SSH password (Enter to keep current)")
                        if not ssh_pass.strip():
                            # Keep existing password
                            host['ansible_ssh_pass'] = existing_config['ansible_ssh_pass']
                            host['ansible_become_password'] = existing_config.get('ansible_become_password', existing_config['ansible_ssh_pass'])
                        else:
                            # New password provided
                            host['ansible_ssh_pass'] = ssh_pass
                            self.print_colored("\nFor sudo password:", 'yellow')
                            self.print_colored("  - Enter a different password if sudo requires it", 'yellow')
                            self.print_colored("  - Press Enter to use the same SSH password", 'yellow')
                            sudo_pass = self.get_secure_input("Enter sudo password")
                            host['ansible_become_password'] = sudo_pass.strip() or ssh_pass
                    else:
                        # New password authentication
                        ssh_pass = self.get_secure_input("Enter SSH password")
                        if not ssh_pass.strip():
                            self.print_colored("SSH password cannot be empty!", 'red')
                            continue  # This will go back to authentication method selection
                        host['ansible_ssh_pass'] = ssh_pass

                        self.print_colored("\nFor sudo password:", 'yellow')
                        self.print_colored("  - Enter a different password if sudo requires it", 'yellow')
                        self.print_colored("  - Press Enter to use the same SSH password", 'yellow')
                        sudo_pass = self.get_secure_input("Enter sudo password")
                        host['ansible_become_password'] = sudo_pass.strip() or ssh_pass
                    
                    auth_configured = True
                else:
                    # Key authentication
                    if existing_config:
                        existing_key = existing_config.get('ansible_ssh_private_key_file', '~/.ssh/id_rsa')
                    elif previous_config and 'ansible_ssh_private_key_file' in previous_config:
                        existing_key = previous_config['ansible_ssh_private_key_file']
                    else:
                        existing_key = '~/.ssh/id_rsa'

                    key_auth_success = False
                    while not key_auth_success:
                        key_prompt = "Enter path to SSH private key"
                        if existing_config and 'ansible_ssh_private_key_file' in existing_config:
                            key_prompt += f" (current: {existing_key})"

                        key_path = self.get_input(key_prompt, existing_key)
                        expanded_path = os.path.expanduser(key_path)
                        
                        # Validate SSH key file
                        validation_result = self._validate_ssh_key_file(expanded_path)
                        if validation_result['valid']:
                            host['ansible_ssh_private_key_file'] = key_path
                            host.pop('ansible_ssh_pass', None)
                            
                            # Prompt for sudo password
                            if existing_config and 'ansible_become_password' in existing_config:
                                self.print_colored("\nFor sudo password:", 'yellow')
                                self.print_colored("  - Enter a new sudo password to change it", 'yellow')
                                self.print_colored("  - Press Enter to keep existing password", 'yellow')
                                sudo_pass = self.get_secure_input("Enter sudo password")
                                if sudo_pass.strip():
                                    host['ansible_become_password'] = sudo_pass.strip()
                                else:
                                    host['ansible_become_password'] = existing_config['ansible_become_password']
                            else:
                                # New configuration
                                self.print_colored("\nFor sudo password:", 'yellow')
                                self.print_colored("  - Enter a sudo password if required", 'yellow')
                                self.print_colored("  - Press Enter if sudo doesn't require a password", 'yellow')
                                sudo_pass = self.get_secure_input("Enter sudo password")
                                if sudo_pass.strip():
                                    host['ansible_become_password'] = sudo_pass.strip()
                            
                            key_auth_success = True
                            auth_configured = True
                            break
                        
                        # Show specific error message
                        self.print_colored(f"SSH key validation failed: {validation_result['error']}", 'red')
                        self.print_colored(f"Path checked: {expanded_path}", 'yellow')
                        
                        if existing_config and 'ansible_ssh_private_key_file' in existing_config:
                            retry_choice = self.get_input("Choose an option:\n  1) Try another SSH key path\n  2) Switch to password authentication\n  3) Keep existing key\nSelect option (1/2/3)", "1")
                        else:
                            retry_choice = self.get_input("Choose an option:\n  1) Try another SSH key path\n  2) Switch to password authentication\nSelect option (1/2)", "1")

                        if retry_choice == '1':
                            continue  # Try another path
                        elif retry_choice == '2':
                            self.print_colored("Switching to password authentication...", 'cyan')
                            break  # Exit SSH key loop, will go back to auth method selection
                        elif retry_choice == '3' and existing_config and 'ansible_ssh_private_key_file' in existing_config:
                            # Keep existing key configuration
                            host['ansible_ssh_private_key_file'] = existing_config['ansible_ssh_private_key_file']
                            # Also keep existing sudo password if present
                            if 'ansible_become_password' in existing_config:
                                host['ansible_become_password'] = existing_config['ansible_become_password']
                            self.print_colored("Keeping existing SSH key configuration", 'yellow')
                            key_auth_success = True
                            auth_configured = True
                            break
                        else:
                            self.print_colored("Invalid choice.", 'red')
                    
                    # If user chose to switch to password auth, continue the auth loop
                    if not key_auth_success and not auth_configured:
                        continue

            # Preserve or set SSH common args
            host['ansible_ssh_common_args'] = existing_config.get('ansible_ssh_common_args', '-o StrictHostKeyChecking=no') if existing_config else '-o StrictHostKeyChecking=no'

            # Initialize or preserve node status fields
            if existing_config:
                # Preserve existing status unless it's being updated
                host['node_status'] = existing_config.get('node_status', 'unknown')
                host['last_status_update'] = existing_config.get('last_status_update', datetime.now().isoformat())
            else:
                # New node - set initial status
                timestamp = datetime.now().isoformat()
                host['node_status'] = 'never_deployed'
                host['last_status_update'] = timestamp

            existing_managed_keys = existing_config.get('r1setup_managed_public_keys', []) if existing_config else previous_config.get('r1setup_managed_public_keys', []) if previous_config else []
            existing_key_path = existing_config.get('ansible_ssh_private_key_file') if existing_config else None
            existing_verified_fingerprint = existing_config.get('r1setup_ssh_last_verified_fingerprint') if existing_config else None
            if 'ansible_ssh_private_key_file' in host:
                same_verified_key = (
                    existing_config
                    and existing_key_path == host['ansible_ssh_private_key_file']
                    and existing_config.get('r1setup_ssh_auth_mode') == SSH_AUTH_MODE_KEY_VERIFIED
                )
                host['r1setup_ssh_auth_mode'] = (
                    SSH_AUTH_MODE_KEY_VERIFIED if same_verified_key else SSH_AUTH_MODE_KEY_INSTALLED_UNVERIFIED
                )
                host['r1setup_ssh_primary_key_path'] = host['ansible_ssh_private_key_file']
                host['r1setup_ssh_requires_revalidation'] = not same_verified_key
            else:
                host['r1setup_ssh_auth_mode'] = SSH_AUTH_MODE_PASSWORD_ONLY
                host['r1setup_ssh_primary_key_path'] = None
                host['r1setup_ssh_requires_revalidation'] = False
            host['r1setup_ssh_last_verification_status'] = (
                existing_config.get('r1setup_ssh_last_verification_status', 'not_checked')
                if existing_config else 'not_checked'
            )
            host['r1setup_ssh_key_auth_verified_at'] = (
                existing_config.get('r1setup_ssh_key_auth_verified_at')
                if existing_config and host['r1setup_ssh_auth_mode'] == SSH_AUTH_MODE_KEY_VERIFIED else None
            )
            host['r1setup_ssh_primary_key_fingerprint'] = (
                existing_config.get('r1setup_ssh_primary_key_fingerprint')
                if existing_config and 'ansible_ssh_private_key_file' in host else None
            )
            host['r1setup_ssh_last_verified_fingerprint'] = (
                existing_verified_fingerprint
                if existing_config and host['r1setup_ssh_auth_mode'] == SSH_AUTH_MODE_KEY_VERIFIED else None
            )
            host['r1setup_managed_public_keys'] = existing_managed_keys
            host[SERVICE_FILE_VERSION_FIELD] = (
                self.get_host_service_file_version(existing_config) if existing_config else DEFAULT_SERVICE_FILE_VERSION
            )

            # Show summary and confirm
            self.print_colored("\nConfiguration Summary:", 'yellow')
            self.print_colored(f"Host: {host['ansible_host']}")
            self.print_colored(f"User: {host['ansible_user']}")
            self.print_colored(f"Auth: {'Password' if auth_choice == '1' else 'SSH Key'}")

            if self.get_input("\nConfirm this configuration? (y/n)", "y").lower() == 'y':
                return host

            self.print_colored("Let's reconfigure this node...", 'yellow')

    def _add_node(self) -> None:
        """Interactively add one node to the current inventory.

        Asks for a node name that is not already used, collects the connection
        settings (seeded with the last node listed in the inventory), binds the
        host through the config manager and finally stores and saves it.

        When the bound machine already has instances assigned and is not in
        expert mode, the user has to confirm its promotion to expert mode;
        declining returns before the host is stored in the inventory.
        """
        self.print_section("Add New Node")
        inventory_hosts = self.inventory['all']['children']['gpu_nodes']['hosts']

        # Re-prompt until the chosen name does not collide with an existing node.
        name = self._get_valid_hostname("Enter name for the new node", "")
        while name in inventory_hosts:
            self.print_colored(f"You already have a node named '{name}'! Please choose a different name.", 'red')
            name = self._get_valid_hostname("Enter name for the new node", "")

        # The last node in the inventory supplies the defaults for the prompts.
        defaults_source = inventory_hosts[list(inventory_hosts.keys())[-1]] if inventory_hosts else None
        new_host = self._configure_single_node(previous_config=defaults_source)
        new_host = self.config_manager.bind_host_to_existing_machine(name, new_host)

        machine_id = str(new_host.get('r1setup_machine_id') or '').strip()
        if machine_id:
            raw_fleet = self.config_manager.get_fleet_state_copy()
            fleet = self.config_manager._normalize_fleet_state(raw_fleet)
            record = dict(fleet.get('fleet', {}).get('machines', {}).get(machine_id, {}))
            sibling_instances = list(record.get('instance_names') or [])
            mode = str(record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE)

            if sibling_instances:
                connection_display = self.config_manager._format_machine_connection_display(record)
                if mode == 'expert':
                    self.print_colored(
                        f"Detected existing expert-mode machine '{machine_id}'. The new node will be added as another instance.",
                        'cyan',
                    )
                else:
                    # Sharing a machine between nodes needs explicit consent.
                    sibling_list = ', '.join(sorted(sibling_instances))
                    self.print_section("Expert Mode Required")
                    self.print_colored(
                        f"This machine already has assigned instances in the current config: {sibling_list}",
                        'yellow',
                    )
                    self.print_colored(f"Machine: {machine_id} | {connection_display}", 'white')
                    self.print_colored("Adding another node on the same machine is an expert-mode operation.", 'yellow')
                    self.print_colored(
                        f"Recommended minimum resources per node: {MIN_RECOMMENDED_NODE_CPU_CORES} CPU / {MIN_RECOMMENDED_NODE_MEMORY_GIB:.0f} GiB RAM.",
                        'yellow',
                    )
                    self.print_colored("Existing runtime names will be preserved for current instances.", 'cyan')
                    self.print_colored("New expert-mode instances will use unique runtime names.", 'cyan')

                    specs = record.get('machine_specs') or {}
                    if specs:
                        recommendation = self.config_manager.assess_machine_resource_recommendation(
                            specs,
                            planned_instances=len(sibling_instances) + 1,
                        )
                        specs_summary = self.config_manager._format_machine_specs_summary(specs)
                        self.print_colored(f"Observed machine capacity: {specs_summary}", 'cyan')
                        self.print_colored(recommendation['message'], recommendation['color'])

                    answer = self.get_input("Continue and convert this machine to expert mode? (y/n)", "n")
                    if answer.lower() != 'y':
                        self.print_colored("Node addition cancelled. No changes were saved.", 'yellow')
                        return

                    self.config_manager.promote_machine_to_expert(machine_id, self.inventory)

                # Either way the new instance lives on an expert-mode machine.
                new_host['r1setup_topology_mode'] = 'expert'
                new_host['r1setup_runtime_name_policy'] = 'normalize_to_target'

        self.config_manager.apply_runtime_snapshot_to_host_config(name, new_host)
        inventory_hosts[name] = new_host
        self._save_configuration()
        self.print_colored(f"Node '{name}' added successfully!", 'green')
        self.print_colored("\n💡 Next: Deploy your updated configuration from Main Menu → Deployment Menu", 'cyan')

    def _update_node(self) -> None:
        """Interactively edit, and optionally rename, an existing node.

        Lists the configured nodes, lets the user pick one by number and asks
        whether it should be renamed. The single-node prompts are then re-run
        with the node's current settings as defaults. A renamed node is
        re-inserted under its new name and marked 'pending_restart' before the
        inventory is saved.
        """
        hosts = self.inventory['all']['children']['gpu_nodes']['hosts']
        if not hosts:
            self.print_colored("No nodes configured!", 'red')
            return

        self.print_section("Select Node to Update")
        node_names = list(hosts.keys())
        for position, listed_name in enumerate(node_names, 1):
            address = hosts[listed_name].get('ansible_host', 'Unknown')
            self.print_colored(f"  {position}) {listed_name} ({address})")

        # Loop until a number inside the listed range is entered.
        while True:
            try:
                index = int(self.get_input("Select node number", "1")) - 1
            except ValueError:
                self.print_colored("Please enter a number", 'red')
                continue
            if 0 <= index < len(node_names):
                break
            self.print_colored("Invalid selection", 'red')
        original_name = node_names[index]

        self.print_colored(f"Updating node: {original_name}", 'yellow')

        # Optional rename step
        wants_rename = self.get_input(
            f"\nDo you want to rename this node? Current name: '{original_name}' (y/n)", "n"
        ).lower() == 'y'

        new_name = original_name
        if wants_rename:
            rename_prompt = f"Enter new name for node (current: {original_name})"
            new_name = self._get_valid_hostname(rename_prompt, original_name)
            # Any name other than the current one must not be taken already.
            while new_name != original_name and new_name in hosts:
                self.print_colored(f"Node name '{new_name}' already exists! Please choose a different name.", 'red')
                new_name = self._get_valid_hostname(rename_prompt, original_name)
            if new_name == original_name:
                self.print_colored("New name is the same as current name.", 'yellow')

        # Re-run the prompts on a copy so the stored entry stays untouched
        # until the new configuration is complete.
        existing_config = hosts[original_name].copy()
        updated_config = self._configure_single_node(existing_config)
        updated_config = self.config_manager.prepare_host_for_persistence(
            new_name,
            updated_config,
            previous_host_config=existing_config,
        )

        if new_name == original_name:
            # Plain in-place update
            hosts[original_name] = updated_config
            self._save_configuration()
            self.print_colored(f"Node '{original_name}' updated successfully!", 'green')
            return

        # Rename: swap the inventory key and flag the node for a restart.
        del hosts[original_name]
        hosts[new_name] = updated_config
        self._update_node_status(new_name, 'pending_restart')
        self.print_colored(f"Node renamed from '{original_name}' to '{new_name}'", 'green')

        self._save_configuration()

        self.print_colored(f"Node '{original_name}' updated and renamed to '{new_name}' successfully!", 'green')
        self.print_colored("Status: ", 'cyan', end='')
        self._display_node_status(new_name, compact=True)
        print()  # New line after status
        self.print_colored("\n💡 Recommendation:", 'cyan', bold=True)
        guidance = (
            ("Since you changed the node name, the node status is now 'Pending Restart'.", 'yellow'),
            ("Use Operations Menu (Main Menu \u2192 3) \u2192 Restart Service to update the status.", 'yellow'),
            (f"When prompted, select ONLY the renamed node '{new_name}' to avoid", 'yellow'),
            ("disturbing other running nodes. This will ensure the renamed node", 'yellow'),
            ("starts with its updated configuration.", 'white'),
        )
        for text, color in guidance:
            self.print_colored(text, color)

    def _delete_node(self) -> None:
        """Interactively remove a node from the inventory and the fleet state.

        Lists the configured nodes, lets the user pick one by number and asks
        for confirmation, mentioning which other instances remain on the same
        machine. On confirmation the node is removed from the inventory and
        from the fleet state, and the configuration is saved. If exactly one
        instance remains on an expert-mode machine, a notice explains that the
        machine is not downgraded automatically.
        """
        hosts = self.inventory['all']['children']['gpu_nodes']['hosts']
        if not hosts:
            self.print_colored("No nodes configured!", 'red')
            return

        self.print_section("Select Node to Delete")
        candidates = list(hosts.keys())
        for number, candidate in enumerate(candidates, 1):
            address = hosts[candidate].get('ansible_host', 'Unknown')
            self.print_colored(f"  {number}) {candidate} ({address})")

        # Loop until a number inside the listed range is entered.
        while True:
            try:
                index = int(self.get_input("Select node number", "1")) - 1
            except ValueError:
                self.print_colored("Please enter a number", 'red')
                continue
            if 0 <= index < len(candidates):
                break
            self.print_colored("Invalid selection", 'red')
        name = candidates[index]

        node_config = hosts[name]
        ip = node_config.get('ansible_host', 'Unknown')
        user = node_config.get('ansible_user', 'Unknown')

        # Work out which other instances share the node's machine.
        fleet_state = self.config_manager._normalize_fleet_state(self.config_manager.get_fleet_state_copy())
        machine_id = str(
            node_config.get('r1setup_machine_id') or self.config_manager._derive_machine_id(name, node_config)
        ).strip()
        machine_record = dict(fleet_state.get('fleet', {}).get('machines', {}).get(machine_id, {}))
        remaining_instances = [other for other in machine_record.get('instance_names', []) if other != name]

        prompt_head = f"Delete node '{name}' ({user}@{ip})? "
        if remaining_instances:
            remaining_list = ', '.join(sorted(remaining_instances))
            confirmation_prompt = prompt_head + f"Remaining on this machine: {remaining_list} (y/n)"
        else:
            confirmation_prompt = (
                prompt_head
                + f"This is the last instance on machine '{machine_id}'. "
                + "The machine will remain registered as prepared. (y/n)"
            )

        if self.get_input(confirmation_prompt, "n").lower() != 'y':
            return

        del hosts[name]
        self.config_manager.remove_instance_from_fleet_state(name)
        self._save_configuration()
        self.print_colored(f"Node '{name}' deleted successfully!", 'green')

        # The expert-mode notice only applies when a single instance is left.
        if len(remaining_instances) != 1:
            return
        if str(machine_record.get('topology_mode') or DEFAULT_MACHINE_TOPOLOGY_MODE) != 'expert':
            return

        survivor = remaining_instances[0]
        self.print_colored(
            f"Machine '{machine_id}' remains in expert mode with the remaining instance "
            f"'{survivor}'. No automatic downgrade to standard was performed.",
            'yellow',
        )
        self.print_colored(
            "Runtime names stay stable in this mode. Normalize back to standard only via an explicit future action.",
            'yellow',
        )

    def _create_new_configuration(self) -> None:
        """Start a brand-new configuration after the user confirms.

        If a configuration file exists it is moved into the 'hosts-history'
        directory under a timestamped name. The inventory is then reset and
        the machine-first configuration flow is started. Answering anything
        other than 'y' leaves everything untouched.
        """
        answer = self.get_input("This will overwrite your current configuration. Continue? (y/n)", "n")
        if answer.lower() != 'y':
            return

        # Move the current file aside so it can be recovered later.
        if self.config_file.exists():
            history_dir = self.config_dir / 'hosts-history'
            history_dir.mkdir(exist_ok=True)
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            archived_file = history_dir / f'hosts-{stamp}.yml'
            self.config_file.rename(archived_file)
            self.print_colored(f"Configuration backed up to: {archived_file}", 'green')

        self.config_manager._reset_inventory_for_new_config()
        self._create_machine_first_configuration()

    def view_configuration(self) -> None:
        """Display the active configuration and every configured node.

        Prints the active configuration metadata, the current network
        environment and the service template version, then reloads the
        inventory and lists each node with its status and settings. The value
        of any setting whose name looks like a credential is replaced by
        asterisks. Waits for Enter before returning.
        """
        self.print_header("Current Configuration")

        # Show active configuration info
        active_config_name = self.active_config.get('config_name')
        if active_config_name:
            self.print_section("Active Configuration")
            self.print_colored(f"Configuration Name: {active_config_name}", 'green')
            self.print_colored(f"Environment: {self.active_config.get('environment', 'Unknown')}", 'green')
            self.print_colored(f"Nodes Count: {self.active_config.get('nodes_count', 0)}", 'green')
            created_at = self.active_config.get('created_at')
            if created_at:
                # Fall back to the raw value when it cannot be parsed.
                created_str = _parse_iso_datetime(created_at) or str(created_at)
                self.print_colored(f"Created: {created_str}", 'green')
        else:
            self.print_colored("No active configuration", 'red')

        # Show network environment
        env = self.get_mnl_app_env()
        self.print_colored(f"\nCurrent Network Environment: {env if env else 'Not set'}",
                           'green' if env else 'red')
        self.print_colored(f"Current Service Template Version: {self.get_mnl_service_version()}", 'green')

        # Load and show hosts
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)

        if not hosts:
            self.print_colored("\nNo nodes configured!", 'red')
        else:
            # Name fragments that mark a setting as sensitive. "pass" (not
            # "password") is needed so that Ansible's `ansible_ssh_pass`, which
            # holds the SSH password, is masked as well.
            sensitive_markers = ("pass", "key")
            # Status fields are rendered in the node header, not in the details.
            header_fields = ('node_status', 'last_status_update', SERVICE_FILE_VERSION_FIELD)

            self.print_section(f"Configured Nodes ({len(hosts)})")
            for name, config in hosts.items():
                # Get status information
                status_info = self._get_node_status_info(name)
                status = status_info['status']
                status_emoji, status_color, status_desc = self._get_status_display_info(status)

                self.print_colored(f"\nNode: {name} ", 'yellow', end='')
                self.print_colored(f"[{status_emoji} {status_desc}]", status_color)
                self.print_colored(f"  service_file_version: {self.get_host_service_file_version(config)}")

                for key, value in config.items():
                    if key in header_fields:
                        continue
                    lowered_key = key.lower()
                    if any(marker in lowered_key for marker in sensitive_markers):
                        value = "********"
                    self.print_colored(f"  {key}: {value}")

        self.wait_for_enter()

    def test_connectivity(self) -> None:
        """Test connectivity to configured nodes"""
        if not self.check_hosts_config():
            self.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("Testing Node Connectivity")

        # Load configuration to show nodes being tested
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        env = self.get_mnl_app_env()

        # Show pre-test information
        self.print_colored(f"🔧 Connectivity Test Details:", 'cyan', bold=True)
        self.print_colored(f"   • Network: {env if env else 'Not set'}", 'green' if env else 'red')
        self.print_colored(f"   • Nodes to test: {len(hosts)}", 'white')

        self.print_colored(f"\n🖥️  Testing connectivity to:", 'cyan', bold=True)
        for name, config in hosts.items():
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            auth_type = "Password" if 'ansible_ssh_pass' in config else "SSH Key"
            self.print_colored(f"   • {name}: {user}@{ip} ({auth_type})", 'white')

        playbook_path = self.config_dir / 'playbooks/test_connection.yml'
        if not playbook_path.exists():
            self.print_colored(f"❌ Test playbook not found: {playbook_path}", 'red')
            self.wait_for_enter()
            return

        cmd = (f"ANSIBLE_CONFIG={os.environ['ANSIBLE_CONFIG']} "
               f"ANSIBLE_COLLECTIONS_PATH={os.environ['ANSIBLE_COLLECTIONS_PATH']} "
               f"ANSIBLE_HOME={os.environ['ANSIBLE_HOME']} "
               f"ansible-playbook -i {self.config_file} {playbook_path}")

        self.print_colored(f"\n🔍 Running connectivity test...", 'yellow')
        success, output = self.run_command(cmd, show_output=False, timeout=self.connection_timeout)

        # Parse the results
        connectivity_results = self._parse_connectivity_output(output)
        
        # Display formatted results
        self._display_connectivity_results(connectivity_results, hosts)

        self.wait_for_enter()

    def _display_copy_friendly_addresses(self, host_names: List[str]) -> None:
        """Print the addresses of freshly deployed nodes in a copy-friendly form.

        Runs `playbooks/get_node_info.yml` for the given hosts and prints one
        numbered entry per host whose parsed result has status 'success'. The
        address itself is written with plain `print` so it carries no colour
        codes. If the playbook is missing, fails, or yields no successful
        host, a short notice is printed instead.

        Args:
            host_names: Names of the hosts that were just deployed; the output
                follows this order.
        """
        self.print_colored(f"\n🎉 Deployment Complete! Getting node addresses...", 'green', bold=True)

        # Get node addresses for the deployed hosts
        playbook_path = self.config_dir / 'playbooks/get_node_info.yml'
        if not playbook_path.exists():
            self.print_colored(f"Node info playbook not found: {playbook_path}", 'red')
            return

        success, output, _, _ = self.run_generated_playbook(
            playbook_path,
            host_names,
            machine_scope=False,
            last_applied_action='get_node_info',
            show_output=False,
            timeout=self.connection_timeout,
        )

        if not success:
            self.print_colored(f"\n\u26a0\ufe0f  Could not retrieve node addresses at this time. Use menu option 5 to check later.", 'yellow')
            return

        node_results = self._parse_node_info_output(output)

        # Keep only the requested hosts that reported successfully.
        successful_deployed_nodes = [
            (host_name, node_results[host_name])
            for host_name in host_names
            if host_name in node_results and node_results[host_name]['status'] == 'success'
        ]

        if not successful_deployed_nodes:
            self.print_colored(f"\n\u26a0\ufe0f  Node addresses not ready yet. Use menu option 5 to check again later.", 'yellow')
            return

        self.print_colored(f"\n📋 Your Node Addresses (Ready to Copy!):", 'green', bold=True)
        self.print_colored("\n Copy addresses below and link them to your licenses in ratio1.ai dashboard.", 'blue')

        self.print_colored("=" * 55, 'cyan')

        for i, (node_name, result) in enumerate(successful_deployed_nodes, 1):
            # Tolerate a successful entry without a data payload.
            eth_address = (result.get('data') or {}).get('eth_address', 'N/A')
            self.print_colored(f"{i}. {node_name}", 'yellow')
            print(f"   {eth_address}")
            print()

        self.print_colored("\n💡 Tip: Double-click to select an address, then Ctrl+Shift+C (Command+Shift+C on Mac) to copy", 'cyan')
        self.print_colored("\U0001f4a1 These addresses will also be available via menu option 5", 'cyan')

    def _fetch_node_info_results(
        self,
        progress_message: str = "Retrieving node info...",
        host_names: Optional[List[str]] = None,
    ) -> Optional[Dict[str, Dict[str, Any]]]:
        """Run the node-info playbook and return its parsed per-node results.

        Args:
            progress_message: Text printed before the playbook is started.
            host_names: Hosts to query; when empty or None, every host in the
                current inventory is queried.

        Returns:
            The parsed results keyed by node name, or None when the playbook
            file is missing, the run timed out, or nothing could be parsed.
        """
        playbook_path = self.config_dir / 'playbooks/get_node_info.yml'
        if not playbook_path.exists():
            self.print_colored(f"Node info playbook not found: {playbook_path}", 'red')
            return None

        self.print_colored(progress_message, 'yellow')

        if host_names:
            target_hosts = host_names
        else:
            target_hosts = list(_get_gpu_hosts(self.inventory).keys())

        # Node info gets twice the regular connection timeout.
        extended_timeout = self.connection_timeout * 2
        playbook_ok, raw_output, _, _ = self.run_generated_playbook(
            playbook_path,
            target_hosts,
            machine_scope=False,
            last_applied_action='get_node_info',
            show_output=False,
            timeout=extended_timeout,
        )

        timed_out = (not playbook_ok) and "timed out" in raw_output.lower()
        if timed_out:
            self.print_colored(f"Node info retrieval timed out after {extended_timeout} seconds", 'red')
            self.print_colored("Some nodes may be offline or not responding", 'yellow')
            return None

        # A failed run is still parsed: its output may describe individual nodes.
        parsed_results = self._parse_node_info_output(raw_output)
        if parsed_results:
            return parsed_results

        self.print_colored("No node information could be parsed from the playbook output.", 'yellow')
        self.print_colored("This might be because the nodes are not running or not accessible.", 'yellow')
        return None

    def _parse_node_info_output(self, output: str) -> Dict[str, Dict[str, Any]]:
        """Parse node info output to extract both successful and failed nodes"""
        node_results = {}
        import re
        
        # First, detect unreachable nodes
        unreachable_pattern = r'fatal: \[([^\]]+)\]: UNREACHABLE!'
        unreachable_matches = re.findall(unreachable_pattern, output)
        
        for node_name in unreachable_matches:
            node_results[node_name] = {
                'status': 'unreachable',
                'data': None
            }
            self.print_debug(f"Found unreachable node: {node_name}")
        
        # Also check for other unreachable patterns
        unreachable_pattern2 = r'unreachable: \[([^\]]+)\]'
        unreachable_matches2 = re.findall(unreachable_pattern2, output)
        
        for node_name in unreachable_matches2:
            if node_name not in node_results:
                node_results[node_name] = {
                    'status': 'unreachable',
                    'data': None
                }
                self.print_debug(f"Found unreachable node (pattern2): {node_name}")
        
        # Now parse successful nodes (existing logic from _get_node_info_data)
        result_blocks = re.split(r'(?=ok: \[[^\]]+\] => \{)', output)
        
        for block in result_blocks:
            if not block.strip():
                continue
                
            # Extract node name from the block
            node_match = re.search(r'ok: \[([^\]]+)\] => \{', block)
            if not node_match:
                continue
                
            node_name = node_match.group(1)
            self.print_debug(f"Processing successful node: {node_name}")
            
            # Check if this block contains node_info.stdout_lines
            if '"node_info.stdout_lines":' not in block:
                self.print_debug(f"No node_info.stdout_lines found for {node_name}")
                continue
            
            try:
                # Extract the JSON lines from stdout_lines array
                stdout_start = block.find('"node_info.stdout_lines":')
                if stdout_start == -1:
                    continue
                
                # Find the opening bracket for the array
                array_start = block.find('[', stdout_start)
                if array_start == -1:
                    continue
                
                # Count brackets to find the matching closing bracket
                bracket_count = 0
                array_end = array_start
                for i in range(array_start, len(block)):
                    if block[i] == '[':
                        bracket_count += 1
                    elif block[i] == ']':
                        bracket_count -= 1
                        if bracket_count == 0:
                            array_end = i
                            break
                
                if bracket_count != 0:
                    continue
                
                # Extract the array content
                array_content = block[array_start+1:array_end]
                
                # Extract all quoted strings from the array content using regex
                # Pattern to match quoted strings, handling escaped quotes
                quoted_pattern = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
                quoted_matches = re.findall(quoted_pattern, array_content)
                
                # Reconstruct the JSON string
                json_str = ''
                for line in quoted_matches:
                    # Unescape the quotes and add to JSON string
                    unescaped_line = line.replace('\\"', '"')
                    json_str += unescaped_line + '\n'
                
                # Parse the JSON
                if json_str.strip():
                    node_data = json.loads(json_str.strip())
                    node_results[node_name] = {
                        'status': 'success',
                        'data': node_data
                    }
                    self.print_debug(f"Successfully parsed JSON for {node_name}")
                    
            except (json.JSONDecodeError, Exception) as e:
                self.print_debug(f"Failed to parse JSON for {node_name}: {e}")
                # If parsing fails, mark as error but don't overwrite unreachable status
                if node_name not in node_results:
                    node_results[node_name] = {
                        'status': 'error',
                        'data': None
                    }
        
        return node_results


    def _parse_node_info_line_by_line(self, output: str) -> Dict[str, Dict[str, Any]]:
        """Fallback method to parse node info line by line"""
        node_info = {}
        lines = output.split('\n')
        
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            
            # Look for node output blocks
            if line.startswith('ok: [') and '] => {' in line:
                # Extract node name
                start = line.find('[') + 1
                end = line.find(']')
                if start > 0 and end > start:
                    node_name = line[start:end]
                    self.print_debug(f"Line-by-line parsing for node: {node_name}")
                    
                    # Find the JSON block for this node
                    json_lines = []
                    i += 1
                    
                    # Look for the start of node_info.stdout_lines
                    while i < len(lines) and 'node_info.stdout_lines' not in lines[i]:
                        i += 1
                    
                    if i < len(lines):
                        self.print_debug(f"Found stdout_lines at line {i}")
                        i += 1  # Skip the stdout_lines line
                        
                        # Collect JSON lines until we hit the end
                        json_started = False
                        while i < len(lines):
                            current_line = lines[i].strip()
                            
                            # Stop if we hit the end of the array or next node
                            if current_line == ']' or current_line == '}' or current_line.startswith('ok: ['):
                                break
                            
                            # Skip empty lines and array markers
                            if not current_line or current_line == '[':
                                i += 1
                                continue
                            
                            # Clean up the JSON line - handle various formats
                            if current_line.startswith('"') and (current_line.endswith('",') or current_line.endswith('"')):
                                # Remove quotes and trailing comma
                                clean_line = current_line[1:]
                                if clean_line.endswith('",'):
                                    clean_line = clean_line[:-2]
                                elif clean_line.endswith('"'):
                                    clean_line = clean_line[:-1]
                                
                                # Unescape quotes
                                clean_line = clean_line.replace('\\"', '"')
                                json_lines.append(clean_line)
                                json_started = True
                            
                            i += 1
                        
                        # Try to parse the collected JSON
                        if json_lines:
                            try:
                                json_str = '\n'.join(json_lines)
                                self.print_debug(f"Line-by-line JSON for {node_name}: {json_str[:200]}...")
                                node_data = json.loads(json_str)
                                node_info[node_name] = node_data
                                self.print_debug(f"Line-by-line parsing successful for {node_name}")
                            except json.JSONDecodeError as e:
                                self.print_debug(f"Line-by-line JSON parsing failed for {node_name}: {e}")
                        else:
                            self.print_debug(f"No JSON lines collected for {node_name}")
            else:
                i += 1
        
        self.print_debug(f"Line-by-line parsing result: {len(node_info)} nodes found")
        return node_info

    def node_addresses_and_export(self) -> None:
        """Fetch node addresses, print them as a table and optionally export them to CSV.

        Returns early (after waiting for Enter) when no hosts are configured or
        when ``_fetch_node_info_results`` returns nothing. Every node whose status
        is not ``'success'`` is shown and counted as UNREACHABLE.
        """
        if not self.check_hosts_config():
            self.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("Node Addresses & Export")

        node_results = self._fetch_node_info_results("Retrieving node addresses...")
        if not node_results:
            self.wait_for_enter()
            return

        # Display the addresses in a formatted table
        self.print_colored(f"\nNode Addresses Summary:", 'green')
        self.print_colored(f"{'Node Name':<20} {'ETH Address':<48} {'Status':<15}", 'cyan')
        self.print_colored("-" * 83, 'cyan')

        success_count = 0
        unreachable_count = 0

        for node_name, result in node_results.items():
            if result['status'] == 'success':
                # A node may report a null/empty eth_address; the width format spec
                # below only works on strings, so fall back to 'N/A' and coerce.
                eth_address = str((result.get('data') or {}).get('eth_address') or 'N/A')
                self.print_colored(f"{node_name:<20} {eth_address:<48} {'SUCCESS':<15}", 'green')
                success_count += 1
            else:
                self.print_colored(f"{node_name:<20} {'N/A':<48} {'UNREACHABLE':<15}", 'red')
                unreachable_count += 1

        # Show summary
        if success_count > 0 and unreachable_count > 0:
            self.print_colored(f"\n\u2705 {success_count} node(s) retrieved successfully, \u274c {unreachable_count} node(s) unreachable", 'yellow')
        elif success_count > 0:
            self.print_colored(f"\n\u2705 All {success_count} node(s) retrieved successfully", 'green')
        else:
            self.print_colored(f"\n\u274c All {unreachable_count} node(s) are unreachable", 'red')

        # Offer CSV export
        print()
        export_choice = self.get_input("Export to CSV? (y/N)", "N")
        if export_choice.lower() == 'y':
            self._write_addresses_csv(node_results)

        self.wait_for_enter()

    def _display_node_info_details(self, node_results: Dict[str, Dict[str, Any]]) -> None:
        """Print one detail section per node, mixing fetched data with local metadata.

        Nodes are listed in sorted name order. Each section shows the locally
        tracked node status, the lookup result, the SSH host/user from the
        inventory and the applied service file version compared with the target
        version. Nodes that ``_is_service_update_candidate`` accepts and whose
        applied version differs from the target are listed under
        "Recommended Actions" at the end.

        Args:
            node_results: Mapping of node name to a result dict; its ``'status'``
                and ``'data'`` keys are read.
        """
        self.load_configuration()
        inventory_hosts = _get_gpu_hosts(self.inventory)
        target_version = self.get_mnl_service_version()
        needs_service_update = []

        self.print_header("Node Details")
        self.print_colored(f"Target Service Template Version: {target_version}", 'cyan')

        # Fields shown first (in this order) when the node reported them.
        priority_fields = ('alias', 'eth_address', 'address')

        for node_name in sorted(node_results):
            lookup = node_results[node_name]
            host_entry = inventory_hosts.get(node_name, {})
            tracked = self._get_node_status_info(node_name)
            tracked_status = tracked.get('status', 'unknown')
            icon, tint, label = self._get_status_display_info(tracked_status)
            applied_version = self.get_host_service_file_version(host_entry)

            # Classify the service file version for this node.
            is_candidate = self._is_service_update_candidate(tracked_status)
            is_outdated = bool(is_candidate) and not (applied_version == target_version)
            if is_outdated:
                version_state = "update recommended"
                needs_service_update.append(node_name)
            elif is_candidate:
                version_state = "current"
            else:
                version_state = "not applicable"
            version_color = 'red' if is_outdated else 'green'

            self.print_section(node_name)
            self.print_colored(f"Node Status: {icon} {label}", tint)
            self.print_colored(f"Lookup Result: {lookup.get('status', 'unknown')}", 'white')
            self.print_colored(f"SSH Host: {host_entry.get('ansible_host', 'N/A')}", 'white')
            self.print_colored(f"SSH User: {host_entry.get('ansible_user', 'N/A')}", 'white')
            self.print_colored(
                f"Service File Version: {applied_version} ({version_state})",
                version_color
            )

            live_data = lookup.get('data') or {}
            if lookup.get('status') != 'success' or not live_data:
                self.print_colored("No live node info returned for this node.", 'yellow')
            else:
                leading = [field for field in priority_fields if field in live_data]
                trailing = [field for field in sorted(live_data.keys()) if field not in leading]
                for field in leading + trailing:
                    self.print_colored(f"{field}: {live_data[field]}", 'white')

            print()

        if not needs_service_update:
            return

        self.print_colored("Recommended Actions:", 'red', bold=True)
        self.print_colored(
            f"  • Update service for: {', '.join(sorted(needs_service_update))}",
            'red'
        )
        self.print_colored(
            "  • Use Operations Menu -> Update Service File, then run Node Status & Info again to verify.",
            'yellow'
        )
        print()

    def _write_addresses_csv(self, node_results: dict) -> None:
        """Write pre-fetched node results to a CSV file"""
        success_count = sum(1 for result in node_results.values() if result['status'] == 'success')
        unreachable_count = sum(1 for result in node_results.values() if result['status'] == 'unreachable')

        self.print_colored(f"Found information for {len(node_results)} node(s) (\u2705 {success_count} successful, \u274c {unreachable_count} unreachable)", 'green')

        default_dir = os.getcwd()
        self.print_colored(f"Default export directory: {default_dir}", 'cyan')
        export_dir = self.get_input("Export directory (Enter to use default)", default_dir)
        export_dir = os.path.expanduser(export_dir.strip())
        if not os.path.isdir(export_dir):
            self.print_colored(f"Directory does not exist: {export_dir}", 'red')
            return

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_file = os.path.join(export_dir, f"node_addresses_{timestamp}.csv")

        try:
            with open(csv_file, 'w') as f:
                f.write("Node_Name,Status,ETH_Address,SSH_Host,SSH_User,Address\n")

                # Load configuration to get SSH details
                self.load_configuration()
                hosts = _get_gpu_hosts(self.inventory)

                for node_name, result in node_results.items():
                    # Get SSH info from configuration
                    ssh_host = 'N/A'
                    ssh_user = 'N/A'
                    if node_name in hosts:
                        ssh_host = hosts[node_name].get('ansible_host', 'N/A')
                        ssh_user = hosts[node_name].get('ansible_user', 'N/A')

                    if result['status'] == 'success':
                        info = result['data']
                        alias = info.get('alias', node_name)
                        address = info.get('address', 'N/A')
                        eth_address = info.get('eth_address', 'N/A')
                        f.write(f'"{node_name}","SUCCESS","{eth_address}","{ssh_host}","{ssh_user}","{address}"\n')
                    else:
                        f.write(f'"{node_name}","UNREACHABLE","N/A","{ssh_host}","{ssh_user}","N/A"\n')

            self.print_colored(f"Addresses exported to: {csv_file}", 'green')
            self.print_colored(f"CSV contains: Node Name, Status, ETH Address, SSH Host, SSH User, Address", 'cyan')
            if success_count > 0 and unreachable_count > 0:
                self.print_colored(f"Note: {success_count} nodes have valid addresses, {unreachable_count} nodes are marked as unreachable", 'yellow')
        except Exception as e:
            self.print_colored(f"Error exporting to CSV: {e}", 'red')

    def change_network_environment(self) -> None:
        """Let the user pick a network environment and store the selection.

        Shows the currently stored environment (when one is set), asks for a new
        one via ``_select_network_environment``, saves it with
        ``set_mnl_app_env`` and waits for Enter before returning.
        """
        self.print_header("Change Network Environment")

        previous_env = self.get_mnl_app_env()
        if previous_env:
            self.print_colored(f"Current environment: {previous_env}", 'yellow')

        chosen_env = self._select_network_environment()
        self.set_mnl_app_env(chosen_env)

        confirmation = f"Network environment changed to: {chosen_env}"
        self.print_colored(confirmation, 'green')

        self.wait_for_enter()

    def _parse_connectivity_output(self, output: str) -> Dict[str, Dict[str, Any]]:
        """Parse ansible connectivity test output to extract connection results"""
        node_results = {}
        
        try:
            lines = output.split('\n')
            self.print_debug(f"Parsing connectivity output with {len(lines)} lines")
            
            # Look for PLAY RECAP section which contains the summary
            recap_started = False
            for line in lines:
                line = line.strip()
                
                if 'PLAY RECAP' in line:
                    recap_started = True
                    continue
                
                if recap_started and line:
                    # Parse lines like: "node-name : ok=5 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0"
                    if ':' in line and 'ok=' in line:
                        parts = line.split(':', 1)
                        if len(parts) == 2:
                            node_name = parts[0].strip()
                            stats = parts[1].strip()
                            
                            # Parse the statistics
                            stats_dict = {}
                            for stat_pair in stats.split():
                                if '=' in stat_pair:
                                    key, value = stat_pair.split('=', 1)
                                    try:
                                        stats_dict[key] = int(value)
                                    except ValueError:
                                        stats_dict[key] = value
                            
                            # Determine connection status
                            if stats_dict.get('unreachable', 0) > 0:
                                status = 'unreachable'
                                message = 'Connection failed - node is unreachable'
                            elif stats_dict.get('failed', 0) > 0:
                                status = 'failed'
                                message = 'Connection failed - authentication or other error'
                            elif stats_dict.get('ok', 0) > 0:
                                status = 'connected'
                                message = 'Connection successful'
                            else:
                                status = 'unknown'
                                message = 'Unknown connection status'
                            
                            node_results[node_name] = {
                                'status': status,
                                'message': message,
                                'stats': stats_dict
                            }
                            
                            self.print_debug(f"Parsed {node_name}: {status} - {message}")
                            
        except Exception as e:
            self.print_debug(f"Error parsing connectivity output: {e}")
            
        self.print_debug(f"Final connectivity results: {node_results}")
        return node_results

    def _parse_ansible_play_recap(self, output: str) -> Dict[str, Dict[str, Any]]:
        """Parse generic Ansible PLAY RECAP output into host result summaries."""
        return self._parse_connectivity_output(output)

    def _display_connectivity_results(self, connectivity_results: Dict[str, Dict[str, Any]], hosts: Dict[str, Dict[str, Any]]) -> None:
        """Display formatted connectivity test results"""
        
        if not connectivity_results:
            self.print_colored("\n❌ No connectivity results could be parsed from the test output.", 'red')
            self.print_colored("   This might indicate a configuration or network issue.", 'yellow')
            return
        
        # Count results by status
        connected_count = sum(1 for result in connectivity_results.values() if result['status'] == 'connected')
        unreachable_count = sum(1 for result in connectivity_results.values() if result['status'] == 'unreachable')
        failed_count = sum(1 for result in connectivity_results.values() if result['status'] == 'failed')
        total_count = len(connectivity_results)
        
        # Display overall summary
        self.print_colored(f"\n📊 Connectivity Test Results", 'cyan', bold=True)
        self.print_colored(f"   • Total nodes tested: {total_count}", 'white')
        self.print_colored(f"   • Connected: {connected_count}", 'green' if connected_count > 0 else 'white')
        self.print_colored(f"   • Unreachable: {unreachable_count}", 'red' if unreachable_count > 0 else 'white')
        self.print_colored(f"   • Failed: {failed_count}", 'red' if failed_count > 0 else 'white')
        
        # Display detailed results for each node
        self.print_colored(f"\n🔍 Detailed Node Results:", 'cyan', bold=True)
        
        for node_name, result in connectivity_results.items():
            status = result['status']
            message = result['message']
            stats = result.get('stats', {})
            
            # Get node configuration details
            node_config = hosts.get(node_name, {})
            ip = node_config.get('ansible_host', 'Unknown')
            user = node_config.get('ansible_user', 'Unknown')
            
            # Choose appropriate emoji and color
            if status == 'connected':
                emoji = '✅'
                color = 'green'
            elif status == 'unreachable':
                emoji = '🔴'
                color = 'red'
            elif status == 'failed':
                emoji = '❌'
                color = 'red'
            else:
                emoji = '❓'
                color = 'yellow'
            
            # Display node result
            self.print_colored(f"\n   {emoji} {node_name} ({user}@{ip})", color, bold=True)
            self.print_colored(f"      Status: {message}", color)
            
            # Show statistics if available
            if stats:
                ok_count = stats.get('ok', 0)
                changed_count = stats.get('changed', 0)
                unreachable_count = stats.get('unreachable', 0)
                failed_count = stats.get('failed', 0)
                
                self.print_colored(f"      Tasks: {ok_count} successful, {changed_count} changed, {unreachable_count} unreachable, {failed_count} failed", 'white')
        
        # Overall result and recommendations
        if connected_count == total_count:
            self.print_colored(f"\n✅ All {total_count} node(s) are reachable and ready for deployment!", 'green', bold=True)
        elif connected_count > 0:
            self.print_colored(f"\n⚠️  {connected_count} of {total_count} node(s) are reachable", 'yellow', bold=True)
            self.print_colored("   Some nodes have connectivity issues that need to be resolved.", 'yellow')
        else:
            self.print_colored(f"\n❌ All {total_count} node(s) are unreachable", 'red', bold=True)
            self.print_colored("   Please check your network configuration and node settings.", 'red')
        
        # Show troubleshooting tips if there are issues
        if unreachable_count > 0 or failed_count > 0:
            self.print_colored(f"\n💡 Troubleshooting Tips:", 'cyan', bold=True)
            self.print_colored("   • Verify network connectivity (ping the IP addresses)", 'white')
            self.print_colored("   • Check SSH credentials and authentication method", 'white')
            self.print_colored("   • Ensure SSH service is running on target nodes", 'white')
            self.print_colored("   • Verify firewall settings allow SSH connections", 'white')
            self.print_colored("   • Use option 1 → 3 to view and verify your configuration", 'white')
            self.print_colored("   \u2022 Use option 7 \u2192 1 to test SSH connection manually", 'white')

    def ssh_into_node_machine(self) -> None:
        """Open an interactive SSH session to a machine chosen by the user.

        Hosts come from ``_get_gpu_hosts(self.inventory)``, falling back to
        ``get_fleet_machines_as_hosts()`` when the inventory has none. After the
        user picks one entry, an ``ssh`` command is built from its
        ``ansible_host``, ``ansible_user``, optional ``ansible_port`` and optional
        ``ansible_ssh_private_key_file`` and run in the foreground. Control returns
        here when the ssh process exits; an ssh exit status of 255 is reported as
        a connection failure.
        """
        if not self.check_hosts_config() and not self.has_fleet_machines():
            self.print_colored("No nodes or machines configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("SSH Into Node's Machine")

        # Load configuration
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        # Fall back to fleet-state machines when no inventory hosts exist
        if not hosts:
            hosts = self.get_fleet_machines_as_hosts()
        if not hosts:
            # Nothing to select from; avoid showing an unusable "1-0" prompt.
            self.print_colored("No nodes or machines configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return
        env = self.get_mnl_app_env()

        # Show SSH connection details
        self.print_colored(f"🔧 SSH Connection Details:", 'cyan', bold=True)
        self.print_colored(f"   • Network: {env if env else 'Not set'}", 'green' if env else 'red')
        self.print_colored(f"   • Available Nodes: {len(hosts)}", 'white')

        self.print_colored(f"\n🖥️  Available Machines:", 'cyan', bold=True)
        for name, config in hosts.items():
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            auth_type = "Password" if 'ansible_ssh_pass' in config else "SSH Key"
            status_info = self._get_node_status_info(name)
            status = status_info['status']
            last_update = status_info['last_update']
            emoji, color, description = self._get_status_display_info(status)
            last_update_str = self._format_timestamp_ago(last_update)
            self.print_colored(f"   • {name}: {user}@{ip} ", 'white', end='')
            self.print_colored(f"[{emoji} {description}]", color, end='')
            self.print_colored(f" ({auth_type}) (Last updated: {last_update_str})", 'white')

        self.print_colored(f"\n📋 This will:", 'yellow', bold=True)
        self.print_colored("   • Connect you directly to the selected node via SSH", 'yellow')
        self.print_colored("   • Use the same connection details as configured for deployment", 'yellow')
        self.print_colored("   • Return you to r1setup when you exit the SSH session", 'yellow')
        self.print_colored("   • Type 'exit' in the SSH session to return here", 'yellow')

        # Node selection for SSH - only allow single selection
        print(f"\n🔍 Select a node's machine to SSH into:")
        host_list = list(hosts.keys())

        for i, host_name in enumerate(host_list, 1):
            config = hosts[host_name]
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            status_info = self._get_node_status_info(host_name)
            status = status_info['status']
            last_update = status_info['last_update']
            emoji, color, description = self._get_status_display_info(status)
            last_update_str = self._format_timestamp_ago(last_update)

            self.print_colored(f"  {i}) {host_name}", 'cyan')
            self.print_colored(f"     └─ {user}@{ip} [{emoji} {description}] (Last updated: {last_update_str})", 'white')

        self.print_colored(f"  0) Cancel")

        while True:
            try:
                choice = input(f"\nSelect node (1-{len(host_list)}, 0 to cancel): ").strip()

                if choice == '0':
                    self.print_colored("SSH connection cancelled.", 'yellow')
                    return

                choice_num = int(choice)
                if 1 <= choice_num <= len(host_list):
                    selected_host = host_list[choice_num - 1]
                    break
                else:
                    self.print_colored(f"Please enter a number between 1 and {len(host_list)}, or 0 to cancel.", 'red')
            except ValueError:
                self.print_colored("Please enter a valid number.", 'red')

        # Get connection details for selected host
        config = hosts[selected_host]
        ip = config.get('ansible_host')
        user = config.get('ansible_user')
        ssh_port = config.get('ansible_port', 22)

        if not ip or not user:
            self.print_colored(f"❌ Missing connection details for {selected_host}. Please reconfigure the node.", 'red')
            self.wait_for_enter()
            return

        # Build SSH command
        ssh_cmd = ['ssh']

        # Add port if not default
        if ssh_port != 22:
            ssh_cmd.extend(['-p', str(ssh_port)])

        # Handle SSH key vs password authentication
        if 'ansible_ssh_pass' in config:
            # Password authentication - the stored password is intentionally not read
            # here: ssh prompts for it itself, so the secret never has to be copied.
            self.print_colored(f"\n⚠️  Password Authentication Required:", 'yellow', bold=True)
            self.print_colored(f"   • This node uses password authentication", 'yellow')
            self.print_colored(f"   • You'll be prompted for the password when connecting", 'yellow')
            self.print_colored(f"   • Use the same password configured for this node", 'white')
        else:
            # SSH key authentication
            if 'ansible_ssh_private_key_file' in config:
                key_file = config['ansible_ssh_private_key_file']
                # Expand user path if needed
                if key_file.startswith('~'):
                    key_file = os.path.expanduser(key_file)
                ssh_cmd.extend(['-i', key_file])
                self.print_colored(f"\n🔑 Using SSH key: {key_file}", 'green')
            else:
                self.print_colored(f"\n🔑 Using default SSH key authentication", 'green')

        # Add any additional SSH options for better connectivity
        # NOTE(review): host key verification is disabled here, which trades
        # protection against man-in-the-middle attacks for convenience - confirm
        # this is intended.
        ssh_cmd.extend([
            '-o', 'StrictHostKeyChecking=no',  # Don't prompt for host key verification
            '-o', 'UserKnownHostsFile=/dev/null',  # Don't save host keys
            '-o', f'ConnectTimeout={self.ssh_connect_timeout}',
        ])

        # Add user@host
        ssh_cmd.append(f"{user}@{ip}")

        self.print_colored(f"\n🚀 Connecting to {selected_host} ({user}@{ip})...", 'cyan', bold=True)
        self.print_colored("   Type 'exit' to return to r1setup", 'white')
        self.wait_for_enter()

        common_issues = (
            "   • Network connectivity problems",
            "   • Incorrect SSH credentials",
            "   • Firewall blocking SSH port",
            "   • Node is not reachable",
        )

        try:
            # Execute SSH command
            self.print_colored(f"Executing: {' '.join(ssh_cmd[:4])} ... {ssh_cmd[-1]}", 'cyan')

            # Use subprocess.run to execute SSH interactively (argument list, no shell)
            result = subprocess.run(ssh_cmd)

            # When SSH exits, we return here. ssh itself exits with 255 when an error
            # occurred (e.g. it could not connect or authenticate); other values are
            # the exit status of the remote session.
            if result.returncode == 255:
                self.print_colored(f"\n❌ SSH connection to {selected_host} failed or was lost (ssh exit status 255).", 'red')
                self.print_colored("Common issues:", 'yellow')
                for issue in common_issues:
                    self.print_colored(issue, 'white')
            else:
                self.print_colored(f"\n✅ SSH session to {selected_host} ended.", 'green')
                self.print_colored("Returning to r1setup...", 'cyan')

        except KeyboardInterrupt:
            self.print_colored(f"\n🛑 SSH connection interrupted.", 'yellow')
        except Exception as e:
            # Reached when ssh could not be started at all (e.g. binary missing).
            self.print_colored(f"\n❌ SSH connection failed: {e}", 'red')
            self.print_colored("Common issues:", 'yellow')
            for issue in common_issues:
                self.print_colored(issue, 'white')

        self.wait_for_enter()

    def start_edge_node_service(self) -> None:
        """Trigger the Edge Node start action by delegating to ``_manage_service``."""
        yml_name = "service_start.yml"
        action_title = "Start Edge Node"
        progress_message = "🚀 Starting Edge Node"
        # The helper's return value is intentionally not propagated.
        self._manage_service(yml_name, action_title, progress_message)

    def stop_edge_node_service(self) -> None:
        """Trigger the Edge Node stop action by delegating to ``_manage_service``."""
        yml_name = "service_stop.yml"
        action_title = "Stop Edge Node"
        progress_message = "🛑 Stopping Edge Node"
        # The helper's return value is intentionally not propagated.
        self._manage_service(yml_name, action_title, progress_message)

    def restart_edge_node_service(self) -> None:
        """Trigger the Edge Node restart action by delegating to ``_manage_service``."""
        yml_name = "service_restart.yml"
        action_title = "Restart Edge Node"
        progress_message = "🔄 Restarting Edge Node"
        # The helper's return value is intentionally not propagated.
        self._manage_service(yml_name, action_title, progress_message)


    def _auto_update_check(self):
        """Forward the auto-update check to ``self.version_manager`` and return its result."""
        manager = self.version_manager
        return manager._auto_update_check()

    def _restore_active_configuration_if_possible(self) -> bool:
        """Try to re-activate the configuration named in ``self.active_config``.

        After refreshing ``self.active_config`` via ``_load_active_config()``,
        the recorded ``config_name`` is looked up as ``<name>.yml`` inside
        ``self.configs_dir`` and, when that file exists, loaded through
        ``_load_config_by_name``.

        Returns:
            The result of ``_load_config_by_name`` when a load was attempted;
            ``False`` when no name is recorded or the file is missing.
        """
        self._load_active_config()
        name = self.active_config.get('config_name')

        if name:
            candidate = self.configs_dir / f"{name}.yml"
            if candidate.exists():
                outcome = self._load_config_by_name(name)
                if not outcome:
                    self.print_debug(f"Failed to restore active config: {name}")
                else:
                    self.print_colored(f"Restored active configuration: {name}", 'green')
                return outcome

            # A name is recorded but its saved file is gone.
            self.print_debug(f"Cannot restore active config; missing file: {candidate}")

        return False

    def ensure_active_configuration(self) -> bool:
        """Ensure a configuration is active before the main menu is shown.

        Returns immediately when ``has_active_config_shell()`` is already true,
        or becomes true after ``_restore_active_configuration_if_possible()``.
        Otherwise the user is guided through one of three flows, depending on
        what ``_list_available_configs()`` returns:

        * no saved configuration: choose between creating one, importing one,
          or exiting;
        * exactly one saved configuration: it is activated automatically; if
          loading fails, the create-configuration flow is started instead;
        * several saved configurations: they are listed and the user chooses
          between activating one, creating a new one, importing, or exiting.

        Returns:
            ``False`` only when the user picks "Exit". Every other terminating
            path returns ``True`` (the outcome of the create flow is not
            inspected here).
        """

        def _custom_name(display_name: str) -> str:
            """Return the saved name without its trailing timestamp part(s)."""
            parts = display_name.split('_')
            for idx, part in enumerate(parts):
                # The timestamp is recognised as the first 8-digit part.
                if len(part) == 8 and part.isdigit():
                    # idx == 0 means the name *starts* with the timestamp;
                    # keep the full name instead of showing an empty string.
                    return '_'.join(parts[:idx]) if idx else display_name
            return display_name

        def _count_label(metadata) -> str:
            """Describe how many nodes (or instance-less machines) a config holds."""
            nodes = metadata.get('nodes_count', 0)
            machines = metadata.get('machines_count', 0)
            if machines > 0 and nodes == 0:
                return f"{machines} machine(s), 0 instances"
            return f"{nodes} node(s)"

        def _import_succeeded(success_messages) -> bool:
            """Run the import flow and report whether a config is now active."""
            self._import_configuration()
            if self.has_active_config_shell():
                for message in success_messages:
                    self.print_colored(message, 'green')
                self.wait_for_enter()
                return True
            self.print_colored("Import was cancelled or failed. Please try again.", 'yellow')
            self.wait_for_enter()
            return False

        # Check if we have a valid active configuration (hosts OR zero-host shell)
        if self.has_active_config_shell():
            return True

        if self._restore_active_configuration_if_possible() and self.has_active_config_shell():
            return True

        self.print_header("Configuration Required")
        self.print_colored("⚠️  No active configuration detected!", 'red', bold=True)

        configs = self._list_available_configs()

        # --- No saved configurations at all -------------------------------
        if not configs:
            self.print_colored("\n📝 No configurations found!", 'yellow')
            self.print_colored("You need to create your first configuration to use r1setup.", 'white')
            self.print_colored("A configuration contains your GPU node connection details and network settings.", 'white')

            while True:
                self.print_colored("\n🔧 What would you like to do?", 'cyan', bold=True)
                self.print_colored("  1) Create your first configuration")
                self.print_colored("  2) Import configuration from .r1config file")
                print()
                self.print_colored("  0) Exit")

                choice = self.get_input("\nSelect option (0-2)", "1")

                if choice == '0':
                    self.print_colored("Exiting r1setup.", 'yellow')
                    return False
                if choice == '1':
                    self._create_machine_first_configuration()
                    return True
                if choice == '2':
                    if _import_succeeded((
                        "Welcome to r1setup! Your configuration has been imported successfully.",
                        "You can now access the main menu.",
                    )):
                        return True
                    continue
                self.print_colored("Invalid option. Please enter 0, 1, or 2.", 'red')

        # --- Exactly one saved configuration: activate it automatically ----
        if len(configs) == 1:
            config_name, metadata = configs[0]
            display_name = config_name.replace('.yml', '')
            custom_name = _custom_name(display_name)
            env = metadata.get('environment', 'unknown')
            count_label = _count_label(metadata)

            self.print_colored(f"\n📁 Found 1 saved configuration: {custom_name} ({env}, {count_label})", 'cyan')
            self.print_colored("Automatically activating the only available configuration...", 'yellow')

            if self._load_config_by_name(display_name):
                self.print_colored(f"✅ Successfully activated configuration: {custom_name}", 'green')
                self.print_colored("Proceeding to main menu...", 'green')
                self.wait_for_enter()
                return True

            self.print_colored(f"❌ Failed to activate configuration: {custom_name}", 'red')
            self.print_colored("The configuration file may be corrupted. Please create a new one.", 'red')
            self._create_machine_first_configuration()
            return True

        # --- Several saved configurations: list them and ask ---------------
        self.print_colored(f"\n📁 Found {len(configs)} saved configuration(s):", 'cyan')
        self.print_colored("It looks like you have existing configurations but none are currently active.", 'yellow')
        self.print_colored("This can happen after reinstalling or updating r1setup.", 'yellow')

        for i, (config_name, metadata) in enumerate(configs, 1):
            display_name = config_name.replace('.yml', '')
            env = metadata.get('environment', 'unknown')
            created_at = metadata.get('created_at')
            last_deployed_date = metadata.get('last_deployed_date')
            deployment_status = metadata.get('deployment_status', 'never_deployed')

            created_str = _parse_iso_datetime(created_at) or "Unknown"

            if deployment_status == 'deployed' and last_deployed_date:
                deployed_str = _parse_iso_datetime(last_deployed_date)
                if deployed_str:
                    deployment_str = f" | 🚀 Last deployed: {deployed_str}"
                else:
                    deployment_str = " | 🚀 Deployed"
            elif deployment_status == 'deleted':
                deployment_str = " | 🗑️ Deleted"
            else:
                deployment_str = " | 📋 Never deployed"

            self.print_colored(f"  {i}. {_custom_name(display_name)}", 'cyan', bold=True)
            self.print_colored(
                f"     {env} | {_count_label(metadata)} | Created: {created_str}{deployment_str}", 'white'
            )

        while True:
            self.print_colored("\n🔧 What would you like to do?", 'cyan', bold=True)
            self.print_colored("  1) Select an existing configuration to activate")
            self.print_colored("  2) Create a new configuration")
            self.print_colored("  3) Import configuration from .r1config file")
            print()
            self.print_colored("  0) Exit")

            choice = self.get_input("\nSelect option (0-3)", "1")

            if choice == '0':
                self.print_colored("Exiting r1setup.", 'yellow')
                return False

            if choice == '1':
                # Keep asking until a number inside the listed range is given.
                while True:
                    try:
                        selection = int(self.get_input(f"Select configuration number (1-{len(configs)})", "1")) - 1
                    except ValueError:
                        self.print_colored("Please enter a number", 'red')
                        continue
                    if 0 <= selection < len(configs):
                        break
                    self.print_colored("Invalid selection", 'red')

                selected_config = configs[selection][0].replace('.yml', '')
                if self._load_config_by_name(selected_config):
                    self.print_colored(f"✅ Successfully activated configuration: {selected_config}", 'green')
                    self.print_colored("You can now access the main menu.", 'green')
                    self.wait_for_enter()
                    return True

                self.print_colored(f"❌ Failed to activate configuration: {selected_config}", 'red')
                self.print_colored("Please try another configuration or create a new one.", 'red')
                self.wait_for_enter()
                continue

            if choice == '2':
                self._create_machine_first_configuration()
                return True

            if choice == '3':
                if _import_succeeded(("You can now access the main menu.",)):
                    return True
                continue

            self.print_colored("Invalid option. Please enter 0, 1, 2, or 3.", 'red')

    def run(self) -> None:
        """Run the startup sequence and then the interactive main-menu loop.

        Startup, in order: handle ``--version`` / ``--debug`` (only when given
        as the first argument), verify the Ansible installation, run the
        auto-update check, migrate legacy SSH metadata, make sure a
        configuration is active, optionally refresh node statuses once, and
        offer the startup service update.

        The menu loop ends when the user selects ``0`` or presses Ctrl+C. Any
        other exception raised by a menu action is reported and the loop
        continues.

        Raises:
            SystemExit: for ``--version`` (code 0), a failed Ansible check
                (code 1), or when the user exits during configuration
                selection (code 0).
        """
        # Handle command line arguments
        global DEBUG
        if len(sys.argv) > 1:
            first_arg = sys.argv[1]
            if first_arg == '--version':
                print(f"r1setup version {CLI_VERSION}")
                sys.exit(0)
            elif first_arg == '--debug':
                DEBUG = True
                self.print_colored("Debug mode enabled", 'yellow')

        # Check prerequisites
        if not self.check_ansible_installation():
            self.print_colored("Please ensure Ansible and the required collection are installed.", 'red')
            sys.exit(1)

        # Auto-update check - this runs first before everything else
        self._auto_update_check()

        # Add SSH metadata for legacy configurations without changing active auth.
        self.migrate_legacy_ssh_metadata()

        # Ensure we have an active configuration before proceeding
        if not self.ensure_active_configuration():
            sys.exit(0)

        # One-time initial node status refresh on startup
        if self.check_hosts_config() and self.settings_manager.should_refresh_status():
            self.load_configuration()
            # The text ends with '\r' instead of a newline, so a line-buffered
            # stdout would hold it back; flush so it is visible while the
            # status query below is running.
            print("  Refreshing node statuses...", end='\r', flush=True)
            node_status_data = self._get_real_time_node_status()
            for node_name, status_data in node_status_data.items():
                self._update_node_status(node_name, status_data['status'])
            self.settings_manager.mark_status_refreshed()
            # Overwrite the progress text with blanks.
            print("                              ", end='\r', flush=True)

        self._offer_startup_service_update()

        # Menu choice -> name of the method that handles it. Names are resolved
        # lazily so only the selected handler is looked up.
        menu_handlers = {
            '1': 'configuration_menu',
            '2': 'deployment_menu',
            '3': 'operations_menu',
            '4': 'combined_node_status_and_info',
            '5': 'node_addresses_and_export',
            '6': 'settings_menu',
            '7': 'advanced_menu',
        }

        while True:
            try:
                self.show_main_menu()
                choice = self.get_input("Select option (0-7)", "0")

                if choice == '0':
                    self.print_colored("Thank you for using Ratio1 Multi-Node Launcher Setup!", 'green')
                    break

                handler_name = menu_handlers.get(choice)
                if handler_name is None:
                    self.print_colored("Invalid option. Valid choices are 0-7.", 'red')
                    self.wait_for_enter()
                else:
                    getattr(self, handler_name)()

            except KeyboardInterrupt:
                self.print_colored("\n\nOperation cancelled by user.", 'yellow')
                self._print_cancellation_guidance()
                break
            except Exception as e:
                # Keep the menu alive when a single action fails.
                self.print_colored(f"An error occurred: {e}", 'red')
                self.wait_for_enter()

    def switch_environment(self) -> None:
        """Alias for ``change_network_environment``; its result is not returned."""
        change_environment = self.change_network_environment
        change_environment()

    def get_logs(self) -> None:
        """Let the user pick a node and stream its logs.

        Lists the inventory hosts (or, when there are none, the fleet-state
        machines) together with their stored status, then hands the chosen
        name to ``_stream_node_logs``. Entering ``0`` returns without doing
        anything.
        """
        nothing_configured = not (self.check_hosts_config() or self.has_fleet_machines())
        if nothing_configured:
            self.print_colored("No nodes or machines configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("Get Node Logs")

        self.load_configuration()
        # Inventory hosts take precedence; fleet-state machines are the fallback.
        available = _get_gpu_hosts(self.inventory) or self.get_fleet_machines_as_hosts()

        if not available:
            self.print_colored("No nodes or machines configured.", 'yellow')
            self.wait_for_enter()
            return

        self.print_section(f"Available Nodes ({len(available)})")
        names = list(available)

        for position, name in enumerate(names, start=1):
            entry = available[name]
            address = entry.get('ansible_host', 'Unknown')
            login = entry.get('ansible_user', 'Unknown')

            current_status = self._get_node_status_info(name)['status']
            emoji, color, description = self._get_status_display_info(current_status)

            # Name and connection on one line, followed by the coloured status tag.
            self.print_colored(f"  {position}. {name} ({login}@{address}) ", 'white', end='')
            self.print_colored(f"[{emoji} {description}]", color)

        print()
        self.print_colored("  0) Return to main menu")
        print()

        while True:
            answer = self.get_input("Select a node to view logs (0 to return)", "0")
            if answer == '0':
                return

            try:
                picked = int(answer) - 1
                if 0 <= picked < len(names):
                    self._stream_node_logs(names[picked])
                    return
                self.print_colored("Invalid selection. Please try again.", 'red')
            except ValueError:
                self.print_colored("Invalid input. Please enter a number.", 'red')

    def _stream_node_logs(self, node_name: str) -> None:
        """Stream logs from a specific node until the stream ends or Ctrl+C.

        The node's connection details are resolved first so that an incomplete
        configuration is reported before the user is asked to proceed. The
        remote command comes from ``_build_remote_helper_command`` (with
        ``'logs', '-f'``) and is wrapped by ``_build_node_ssh_command``; its
        combined stdout/stderr is echoed line by line.

        Args:
            node_name: Key of the node in the inventory hosts, or in the
                fleet-state machines when the inventory has no hosts.
        """
        self.print_header(f"Streaming Logs - {node_name}")

        # Get node connection details first to validate before asking user to proceed
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        # Mirror get_logs(): when the inventory has no hosts the selectable
        # entries came from fleet-state machines, so resolve the name there too.
        if not hosts:
            hosts = self.get_fleet_machines_as_hosts() or {}
        node_config = hosts.get(node_name, {})

        host = node_config.get('ansible_host', '')
        user = node_config.get('ansible_user', '')

        if not host or not user:
            self.print_colored("Error: Node configuration incomplete.", 'red')
            self.wait_for_enter()
            return

        remote_log_command = self._build_remote_helper_command(node_name, node_config, 'logs', '-f')

        self.print_colored(f"📡 Ready to stream logs from: {user}@{host}", 'cyan')
        self.print_colored(f"🔍 This will run '{remote_log_command}' on the target machine", 'white')
        self.print_colored("⚠️  Use Ctrl+C to stop streaming and return to menu", 'yellow', bold=True)
        print()

        # Ask user to confirm before starting
        self.wait_for_enter("Press Enter to start streaming logs...")
        print()

        process = None
        try:
            ssh_cmd = self._build_node_ssh_command(node_name, node_config, remote_log_command)

            self.print_colored(f"Connecting to {user}@{host}...", 'yellow')
            print("=" * 80)

            process = subprocess.Popen(
                ssh_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # interleave remote errors with the log lines
                text=True,
            )

            try:
                # Iteration stops at EOF, i.e. when the remote side closes the stream.
                for line in process.stdout:
                    print(line.rstrip())
                # Reap the child so it does not linger as a zombie.
                process.wait()
            except KeyboardInterrupt:
                print("\n" + "=" * 80)
                self.print_colored("\n🛑 Log streaming stopped by user.", 'yellow')

        except Exception as e:
            self.print_colored(f"Error streaming logs: {e}", 'red')
        finally:
            # Whatever happened above, do not leave the ssh child or its pipe behind.
            if process is not None:
                if process.poll() is None:
                    process.terminate()
                    try:
                        process.wait(timeout=5)
                    except subprocess.TimeoutExpired:
                        process.kill()
                        process.wait()
                if process.stdout is not None:
                    process.stdout.close()

        self.wait_for_enter()

    def write_logs_to_file(self) -> None:
        """Let the user pick a node and save its recent logs to a local file.

        Lists the inventory hosts (or, when there are none, the fleet-state
        machines) together with their stored status, then hands the chosen
        name to ``_save_node_logs_to_file``. Entering ``0`` returns without
        doing anything.
        """
        nothing_configured = not (self.check_hosts_config() or self.has_fleet_machines())
        if nothing_configured:
            self.print_colored("No nodes or machines configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        self.print_header("Write Logs to File")

        self.load_configuration()
        # Inventory hosts take precedence; fleet-state machines are the fallback.
        available = _get_gpu_hosts(self.inventory) or self.get_fleet_machines_as_hosts()

        if not available:
            self.print_colored("No nodes or machines configured.", 'yellow')
            self.wait_for_enter()
            return

        self.print_section(f"Available Nodes ({len(available)})")
        names = list(available)

        for position, name in enumerate(names, start=1):
            entry = available[name]
            address = entry.get('ansible_host', 'Unknown')
            login = entry.get('ansible_user', 'Unknown')

            current_status = self._get_node_status_info(name)['status']
            emoji, color, description = self._get_status_display_info(current_status)

            # Name and connection on one line, followed by the coloured status tag.
            self.print_colored(f"  {position}. {name} ({login}@{address}) ", 'white', end='')
            self.print_colored(f"[{emoji} {description}]", color)

        print()
        self.print_colored("  0) Return to main menu")
        print()

        while True:
            answer = self.get_input("Select a node to save logs from (0 to return)", "0")
            if answer == '0':
                return

            try:
                picked = int(answer) - 1
                if 0 <= picked < len(names):
                    self._save_node_logs_to_file(names[picked])
                    return
                self.print_colored("Invalid selection. Please try again.", 'red')
            except ValueError:
                self.print_colored("Invalid input. Please enter a number.", 'red')

    def _save_node_logs_to_file(self, node_name: str) -> None:
        """Fetch recent log lines from a node over SSH and write them to a file.

        The user is asked for the number of lines (positive integer, default
        1000) and an output filename (default
        ``<node_name>_logs_<timestamp>.txt``; ``.txt`` is appended when
        missing). The remote command comes from
        ``_build_remote_helper_command`` (with ``'logs', '-n', <count>``) and
        runs with ``self.connection_timeout`` as its time limit. On success
        the output is written after a short ``#``-prefixed header.

        Args:
            node_name: Key of the node in the inventory hosts, or in the
                fleet-state machines when the inventory has no hosts.
        """
        self.print_header(f"Save Logs to File - {node_name}")

        # Get log lines count
        while True:
            lines_input = self.get_input("How many recent log lines to save (default: 1000)", "1000")
            try:
                lines_count = int(lines_input)
            except ValueError:
                self.print_colored("Invalid input. Please enter a number.", 'red')
                continue
            if lines_count > 0:
                break
            self.print_colored("Please enter a positive number.", 'red')

        # Generate default filename
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        default_filename = f"{node_name}_logs_{timestamp}.txt"

        # Surrounding whitespace is dropped so it cannot end up in the file
        # name; a blank answer falls back to the default.
        filename = self.get_input(f"Output filename (default: {default_filename})", default_filename)
        filename = filename.strip() or default_filename

        # Add .txt extension if not present
        if not filename.endswith('.txt'):
            filename += '.txt'

        try:
            # Get node connection details
            self.load_configuration()
            hosts = _get_gpu_hosts(self.inventory)
            # Mirror write_logs_to_file(): when the inventory has no hosts the
            # selectable entries came from fleet-state machines.
            if not hosts:
                hosts = self.get_fleet_machines_as_hosts() or {}
            node_config = hosts.get(node_name, {})

            host = node_config.get('ansible_host', '')
            user = node_config.get('ansible_user', '')

            if not host or not user:
                self.print_colored("Error: Node configuration incomplete.", 'red')
                self.wait_for_enter()
                return

            remote_log_command = self._build_remote_helper_command(node_name, node_config, 'logs', '-n', str(lines_count))
            ssh_cmd = self._build_node_ssh_command(node_name, node_config, remote_log_command)

            self.print_colored(f"Connecting to {user}@{host} and retrieving {lines_count} log lines...", 'yellow')

            result = subprocess.run(
                ssh_cmd,
                capture_output=True,
                text=True,
                # Log output may contain bytes that are invalid in the local
                # encoding; substitute them instead of aborting the save.
                errors='replace',
                timeout=self.connection_timeout,
            )

            if result.returncode == 0:
                log_text = result.stdout
                # Explicit UTF-8 so non-ASCII log content can always be written.
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"# Edge Node Logs from {node_name} ({user}@{host})\n")
                    f.write(f"# Retrieved on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write(f"# Last {lines_count} log lines\n")
                    f.write("# " + "="*60 + "\n\n")
                    f.write(log_text)

                # Report what was actually retrieved, which can be fewer lines
                # than requested.
                saved_lines = len(log_text.splitlines())
                self.print_colored(f"✅ Logs saved to: {filename}", 'green')
                self.print_colored(f"📝 File size: {os.path.getsize(filename)} bytes", 'cyan')
                self.print_colored(f"📅 Lines saved: {saved_lines}", 'cyan')

            else:
                self.print_colored(f"❌ Error retrieving logs: {result.stderr}", 'red')

        except Exception as e:
            # Covers SSH timeouts and file-system errors alike.
            self.print_colored(f"Error saving logs: {e}", 'red')

        self.wait_for_enter()

    def select_hosts(
        self,
        hosts: Dict[str, Dict[str, Any]],
        operation_name: str,
        preselect_mode: str = 'all',
        initial_selection: Optional[set] = None,
        preselection_label: Optional[str] = None,
    ) -> List[str]:
        """Show the host picker and return the names the user selected.

        The starting selection is ``initial_selection`` when given (copied into
        a new set); otherwise it is derived from ``preselect_mode``:

        * ``'undeployed'`` - hosts whose stored status is ``never_deployed`` or
          ``deleted``; a default ``preselection_label`` is filled in if unset;
        * ``'none'`` - nothing selected;
        * anything else (``'all'``) - every host selected.

        The interactive picker is used when ``tty``/``termios`` can be
        imported; an ``ImportError`` switches to the fallback picker.

        Returns:
            Whatever the chosen picker returns; ``[]`` when ``hosts`` is empty.
        """
        if not hosts:
            return []

        if initial_selection is not None:
            # Copy so the caller's collection is never handed to the picker.
            starting = set(initial_selection)
        elif preselect_mode == 'undeployed':
            starting = {
                name
                for name in hosts
                if self._get_node_status_info(name)['status'] in ['never_deployed', 'deleted']
            }
            if not preselection_label:
                preselection_label = "nodes that were never deployed or deleted"
        elif preselect_mode == 'none':
            starting = set()
        else:  # 'all'
            starting = set(hosts.keys())

        picker_args = (hosts, operation_name, starting, preselection_label)
        try:
            # Imported only to find out whether raw terminal handling is available.
            import tty
            import termios
            return self._interactive_select_hosts(*picker_args)
        except ImportError:
            return self._fallback_select_hosts(*picker_args)

    @staticmethod
    def _format_install_history(host_config: Dict[str, Any], include_attempt: bool = True) -> str:
        """Summarise a host's install history on a single line.

        Layout: ``<last success>`` optionally followed by four spaces and
        ``<latest attempt> <mark>``. Each part is the upper-cased variant
        (plus ``(r1)`` / ``(user)`` for a known driver owner) left-aligned to
        12 columns, `` • ``, and the date portion of the timestamp
        left-aligned to 10 columns. A missing variant is shown as a dash and a
        missing timestamp as ``never``.

        The attempt part is appended only when ``include_attempt`` is true, an
        attempted variant is recorded, and the attempt either failed or
        differs from the last success in variant or driver owner. Its mark is
        ``✗`` for a failed attempt and ``✓`` otherwise.

        Examples:
            ``GPU (r1) • 2026-03-12``
            ``CPU      • 2026-02-01    GPU (user) • 2026-04-17 ✗``
            ``—        • never``
        """
        owner_tags = {'r1setup': 'r1', 'user': 'user'}

        def _column(variant: Optional[str], owner: Optional[str], stamp: Optional[str]) -> str:
            """Render one ``variant • date`` column."""
            if variant:
                tag = owner_tags.get(owner or '', '')
                label = variant.upper() + (f" ({tag})" if tag else '')
            else:
                label = '—  '
            # Keep only the part of the timestamp before the 'T' separator.
            day = stamp.split('T')[0] if stamp else 'never'
            return f"{label:<12} • {day:<10}"

        last_variant = host_config.get(INSTALL_LAST_VARIANT_FIELD)
        last_owner = host_config.get(INSTALL_LAST_DRIVER_OWNER_FIELD)
        last_at = host_config.get(INSTALL_LAST_AT_FIELD)
        installed = _column(last_variant, last_owner, last_at)

        if not include_attempt:
            return installed

        tried_variant = host_config.get(INSTALL_ATTEMPTED_VARIANT_FIELD)
        tried_owner = host_config.get(INSTALL_ATTEMPTED_DRIVER_OWNER_FIELD)
        tried_at = host_config.get(INSTALL_ATTEMPTED_AT_FIELD)
        tried_result = host_config.get(INSTALL_ATTEMPTED_RESULT_FIELD)

        # Without a recorded attempt there is nothing extra to show.
        if not tried_variant:
            return installed

        failed = tried_result == 'failed'
        same_as_last = tried_variant == last_variant and tried_owner == last_owner
        if same_as_last and not failed:
            # The attempt merely repeats the last success.
            return installed

        mark = '✗' if failed else '✓'
        return f"{installed}    {_column(tried_variant, tried_owner, tried_at)} {mark}"

    def _render_host_menu(self, hosts, host_list, selected_hosts, current_index, operation_name,
                          initial_selection=None, interactive=True, preselection_label=None):
        """Render the host selection menu (shared by interactive and fallback modes)."""
        # Clear screen and move cursor to top
        self.clear_screen()

        # Header
        self.print_header(f"Select Hosts for {operation_name.title()}")

        # Instructions
        if interactive:
            self.print_colored("🎮 Navigation Controls:", 'cyan', bold=True)
            self.print_colored("   ↑/↓ Arrow keys    - Navigate up/down", 'white')
            self.print_colored("   Space bar        - Toggle selection", 'white')
            self.print_colored("   Enter           - Confirm selection", 'white')
            self.print_colored("   q/Esc           - Cancel operation", 'white')
        else:
            self.print_colored("📋 Instructions:", 'cyan', bold=True)
            self.print_colored("   • Enter numbers to toggle selection (e.g., 1, 2, 3)", 'white')
            self.print_colored("   • Use 'a' to select all hosts", 'white')
            self.print_colored("   • Use 'n' to deselect all hosts", 'white')
            self.print_colored("   • Use 'c' to cancel operation", 'white')
            self.print_colored("   • Press Enter when ready to proceed", 'white')
        print()

        # Show preselection info when a subset was intentionally pre-selected.
        if initial_selection is not None and initial_selection != set(hosts.keys()):
            preselected = initial_selection
            if preselected:
                label = preselection_label or "pre-selected nodes"
                self.print_colored(f"💡 Pre-selected {label}:", 'cyan', bold=True)
                for host_name in sorted(preselected):
                    status_info = self._get_node_status_info(host_name)
                    status_emoji, _, status_desc = self._get_status_display_info(status_info['status'])
                    self.print_colored(f"   • {host_name} [{status_emoji} {status_desc}]", 'green')
                print()

        # Status
        all_selected = len(selected_hosts) == len(hosts)
        if len(selected_hosts) == 0:
            self.print_colored("⚠️  No hosts selected!", 'red', bold=True)
        elif all_selected:
            self.print_colored(f"✅ All {len(hosts)} hosts selected", 'green', bold=True)
        else:
            self.print_colored(f"📊 Selected: {len(selected_hosts)}/{len(hosts)} hosts", 'cyan', bold=True)
            if not interactive:
                selected_names = ', '.join(sorted(selected_hosts))
                self.print_colored(f"   Selected hosts: {selected_names}", 'cyan')
        print()

        # Menu items
        for i, item in enumerate(host_list):
            is_current = interactive and (i == current_index)

            if item == "All hosts":
                all_sel = len(selected_hosts) == len(hosts)
                marker = "✓" if all_sel else " "
                prefix = "→ " if is_current else "  "
                idx_str = f"{prefix}" if interactive else f"  0) "
                color = 'yellow' if is_current else ('green' if all_sel else 'white')
                style = 'bold' if is_current or all_sel else False
                self.print_colored(f"{idx_str}[{marker}] {item} ({len(hosts)} total)", color, bold=style)
            else:
                host_name = item
                is_selected = host_name in selected_hosts
                is_preselected = initial_selection is not None and host_name in initial_selection and initial_selection != set(hosts.keys())
                marker = "✓" if is_selected else " "

                # Install history column (per-host variant + date + attempt,
                # degrades to just the success column on narrow terminals).
                host_cfg = hosts.get(host_name, {})
                try:
                    import shutil as _sh
                    term_width = _sh.get_terminal_size((100, 24)).columns
                except Exception:
                    term_width = 100
                include_attempt = term_width >= 110
                history = self._format_install_history(host_cfg, include_attempt=include_attempt)

                if interactive:
                    prefix = "→ " if is_current else "  "
                    color = 'yellow' if is_current else ('green' if is_selected else 'white')
                    style = 'bold' if is_current else False
                    preselect_indicator = " (pre-selected)" if is_preselected else ""
                    self.print_colored(
                        f"{prefix}[{marker}] {host_name:<24}  {history}{preselect_indicator}",
                        color, bold=style,
                    )
                else:
                    idx = host_list.index(item)  # 0 is "All hosts", so real index starts at 1
                    color = 'green' if is_selected else 'white'
                    preselect_indicator = " (pre-selected)" if is_preselected else ""
                    self.print_colored(
                        f"  {idx}) [{marker}] {host_name:<24}  {history}{preselect_indicator}",
                        color, bold=is_selected,
                    )

        print()
        self.print_colored("─" * 60, 'blue')

    def _interactive_select_hosts(self, hosts, operation_name, initial_selection, preselection_label=None):
        """Interactive host selection with keyboard navigation (arrow keys + space).

        Args:
            hosts: Mapping of host name to that host's configuration dict.
            operation_name: Operation label passed through to the menu header.
            initial_selection: Collection of host names selected on entry.
            preselection_label: Optional wording for the pre-selection summary.

        Returns:
            The selected host names (in no particular order), or an empty list
            when the user cancels with q, Esc or Ctrl-C, or when stdin hits EOF.
        """
        import select
        import tty
        import termios

        host_list = ["All hosts"] + list(hosts.keys())
        selected_hosts = set(initial_selection)
        current_index = 0

        def get_key():
            """Read one key press in raw mode; return None on end of input.

            Bytes are read straight from the file descriptor so that
            ``select`` reflects what is really pending. A lone Esc is returned
            as ``'\\x1b'`` instead of blocking while waiting for the rest of an
            escape sequence that will never arrive.
            """
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)
            try:
                tty.setraw(fd)
                data = os.read(fd, 1)
                if not data:
                    return None
                if data == b'\x1b':
                    # Arrow keys send ESC [ A/B as one burst; only keep
                    # reading while more bytes are already waiting.
                    while len(data) < 3 and select.select([fd], [], [], 0.05)[0]:
                        chunk = os.read(fd, 3 - len(data))
                        if not chunk:
                            break
                        data += chunk
                return data.decode('utf-8', errors='ignore')
            finally:
                # Always restore the terminal, even if the read fails.
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

        while True:
            self._render_host_menu(hosts, host_list, selected_hosts, current_index,
                                   operation_name, initial_selection, interactive=True,
                                   preselection_label=preselection_label)

            try:
                key = get_key()

                if key is None:
                    # stdin is closed: nothing more can be read, so treat it
                    # as a cancel instead of redrawing forever.
                    return []
                if key == '\x1b[A':  # Up arrow
                    current_index = (current_index - 1) % len(host_list)
                elif key == '\x1b[B':  # Down arrow
                    current_index = (current_index + 1) % len(host_list)
                elif key == ' ':  # Space bar - toggle selection
                    current_item = host_list[current_index]
                    if current_item == "All hosts":
                        if len(selected_hosts) == len(hosts):
                            selected_hosts = set()
                        else:
                            selected_hosts = set(hosts.keys())
                    elif current_item in selected_hosts:
                        selected_hosts.remove(current_item)
                    else:
                        selected_hosts.add(current_item)
                elif key in ('\r', '\n'):  # Enter - confirm
                    if len(selected_hosts) == 0:
                        self.clear_screen()
                        self.print_colored("❌ Cannot proceed without selecting any hosts!", 'red', bold=True)
                        self.print_colored("Press any key to continue...", 'yellow')
                        get_key()
                        continue
                    break
                elif key in ('q', '\x1b', '\x03'):
                    # q, Esc or Ctrl-C. Raw mode disables signal generation, so
                    # Ctrl-C shows up as the byte 0x03, not KeyboardInterrupt.
                    return []

            except KeyboardInterrupt:
                return []

        return list(selected_hosts)

    def _fallback_select_hosts(self, hosts, operation_name, initial_selection, preselection_label=None):
        """Fallback host selection for systems without termios."""
        host_list = ["All hosts"] + list(hosts.keys())
        selected_hosts = initial_selection.copy()

        while True:
            self._render_host_menu(hosts, host_list, selected_hosts, -1,
                                   operation_name, initial_selection, interactive=False,
                                   preselection_label=preselection_label)

            choice = self.get_input("Enter choice (number/a/n/c/Enter to proceed)", "").strip().lower()

            if choice == "":
                if len(selected_hosts) == 0:
                    self.print_colored("❌ Cannot proceed without selecting any hosts!", 'red')
                    self.wait_for_enter("Press Enter to continue selection...")
                    continue
                break
            elif choice == "c":
                return []
            elif choice == "a":
                selected_hosts = set(hosts.keys())
                self.print_colored("✅ All hosts selected", 'green')
            elif choice == "n":
                selected_hosts = set()
                self.print_colored("⚠️  All hosts deselected", 'yellow')
            else:
                try:
                    choices = []
                    for part in choice.replace(',', ' ').split():
                        try:
                            choices.append(int(part))
                        except ValueError:
                            continue

                    if not choices:
                        choices = [int(choice)]

                    real_host_list = list(hosts.keys())
                    for choice_num in choices:
                        if choice_num == 0:
                            all_selected = len(selected_hosts) == len(hosts)
                            if all_selected:
                                selected_hosts = set()
                                self.print_colored("⚠️  All hosts deselected", 'yellow')
                            else:
                                selected_hosts = set(hosts.keys())
                                self.print_colored("✅ All hosts selected", 'green')
                        elif 1 <= choice_num <= len(real_host_list):
                            host_name = real_host_list[choice_num - 1]
                            if host_name in selected_hosts:
                                selected_hosts.remove(host_name)
                                self.print_colored(f"➖ Deselected: {host_name}", 'yellow')
                            else:
                                selected_hosts.add(host_name)
                                self.print_colored(f"➕ Selected: {host_name}", 'green')
                        else:
                            self.print_colored(f"❌ Invalid choice: {choice_num} (valid range: 0-{len(real_host_list)})", 'red')

                except ValueError:
                    self.print_colored("❌ Invalid input. Please enter numbers, 'a', 'n', 'c', or press Enter.", 'red')

            import time
            time.sleep(0.8)

        return list(selected_hosts)

    @staticmethod
    def _get_machine_deployment_state_info(state: str) -> Tuple[str, str, str]:
        """Get display info for machine deployment state."""
        state_info = {
            'empty': ('📭', 'yellow', 'Empty'),
            'registered': ('📝', 'white', 'Registered'),
            'prepared': ('🛠️', 'green', 'Prepared'),
            'active': ('🚀', 'green', 'Active'),
            'unreachable': ('🔌', 'red', 'Unreachable'),
            'error': ('❌', 'red', 'Error'),
            'unknown': ('❓', 'white', 'Unknown'),
        }
        return state_info.get(state, state_info['unknown'])

    def select_registered_machines(
        self,
        machines: Dict[str, Dict[str, Any]],
        operation_name: str,
        preselect_mode: str = 'all',
    ) -> List[str]:
        """Prompt for the registered machines a machine-scope operation targets.

        Args:
            machines: Mapping of machine id to its record dict.
            operation_name: Operation label; shown title-cased in the header.
            preselect_mode: ``'none'`` starts with nothing selected; any other
                value starts with every machine selected.

        Returns:
            The chosen machine ids, or an empty list when ``machines`` is
            empty or the user cancels with 'c'.
        """
        if not machines:
            return []

        def int_or_none(token):
            """Return ``int(token)``, or None when it is not a number."""
            try:
                return int(token)
            except ValueError:
                return None

        ordered_ids = list(machines.keys())
        total = len(ordered_ids)
        chosen = set() if preselect_mode == 'none' else set(machines.keys())

        instruction_lines = (
            "   • Enter numbers to toggle selection (e.g., 1, 2)",
            "   • Use 'a' to select all machines",
            "   • Use 'n' to deselect all machines",
            "   • Use 'c' to cancel operation",
            "   • Press Enter when ready to proceed",
        )

        while True:
            self.print_header(f"Select Machines for {operation_name.title()}")
            self.print_colored("📋 Instructions:", 'cyan', bold=True)
            for instruction in instruction_lines:
                self.print_colored(instruction, 'white')
            print()

            # Selection summary line.
            if not chosen:
                self.print_colored("⚠️  No machines selected!", 'red', bold=True)
            elif len(chosen) == total:
                self.print_colored(f"✅ All {total} machines selected", 'green', bold=True)
            else:
                self.print_colored(
                    f"📊 Selected: {len(chosen)}/{total} machines",
                    'cyan',
                    bold=True,
                )
            print()

            # One numbered row per machine, plus an optional specs line.
            for position, machine_id in enumerate(ordered_ids, start=1):
                record = machines[machine_id]
                ticked = machine_id in chosen
                tick = "✓" if ticked else " "
                topology_mode = record.get('topology_mode', DEFAULT_MACHINE_TOPOLOGY_MODE)
                deployment_state = record.get('deployment_state', DEFAULT_MACHINE_DEPLOYMENT_STATE)
                state_emoji, _, state_label = self._get_machine_deployment_state_info(deployment_state)
                connection_display = self.config_manager._format_machine_connection_display(record)
                specs_summary = self.config_manager._format_machine_specs_summary(record.get('machine_specs'))
                self.print_colored(
                    f"  {position}) [{tick}] {machine_id} | {connection_display} | mode={topology_mode} | {state_emoji} {state_label}",
                    'green' if ticked else 'white',
                    bold=ticked,
                )
                if specs_summary:
                    self.print_colored(f"      specs: {specs_summary}", 'cyan')

            print()
            answer = self.get_input("Enter choice (number/a/n/c/Enter to proceed)", "").strip().lower()

            if answer == "":
                if chosen:
                    return list(chosen)
                self.print_colored("❌ Cannot proceed without selecting any machines!", 'red')
                self.wait_for_enter("Press Enter to continue selection...")
                continue
            if answer == "c":
                return []
            if answer == "a":
                chosen = set(ordered_ids)
                continue
            if answer == "n":
                chosen = set()
                continue

            tokens = answer.replace(',', ' ').split()
            try:
                # Non-numeric tokens are skipped; if nothing numeric remains,
                # int() raises and the generic error below is shown.
                numbers = [value for value in map(int_or_none, tokens) if value is not None] or [int(answer)]

                for number in numbers:
                    if 1 <= number <= total:
                        # Toggle membership of the addressed machine.
                        chosen.symmetric_difference_update({ordered_ids[number - 1]})
                    else:
                        self.print_colored(
                            f"❌ Invalid choice: {number} (valid range: 1-{total})",
                            'red',
                        )
            except ValueError:
                self.print_colored("❌ Invalid input. Please enter numbers, 'a', 'n', 'c', or press Enter.", 'red')
                self.wait_for_enter("Press Enter to continue selection...")

    def _format_timestamp_ago(self, timestamp: str) -> str:
        """Helper method to format timestamp as 'X time ago' string"""
        if not timestamp:
            return "Never"
            
        timestamp_dt = _parse_iso_to_datetime(timestamp)
        if not timestamp_dt:
            self.print_debug(f"Error parsing timestamp '{timestamp}'")
            return "Unknown"

        # Calculate time difference
        now = datetime.now(timestamp_dt.tzinfo) if timestamp_dt.tzinfo else datetime.now()
        time_diff = now - timestamp_dt

        if time_diff.days > 0:
            return f"{time_diff.days} day(s) ago"
        elif time_diff.seconds > 3600:
            hours = time_diff.seconds // 3600
            return f"{hours} hour(s) ago"
        elif time_diff.seconds > 60:
            minutes = time_diff.seconds // 60
            return f"{minutes} minute(s) ago"
        else:
            return "Just now"

    def combined_node_status_and_info(self) -> None:
        """Display a live container status overview for the configured nodes.

        Fetches real-time node status, stores each node's status via
        ``_update_node_status``, then prints the deployment line, a status
        summary box and the machine-group listing. If any node's service is
        reported as outdated, recommended actions are printed. Finally the
        user may ask for detailed per-node info.
        """
        import unicodedata

        def display_width(text: str) -> int:
            """Terminal columns used by ``text`` (wide/fullwidth chars count as 2)."""
            return sum(
                2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1
                for ch in text
            )

        if not self.check_hosts_config():
            self.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        # Load configuration
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        env = self.get_mnl_app_env()

        # Clear screen and show loading
        self.clear_screen()
        self.print_header("Container Status")
        self.print_colored(f"🔍 Checking container status (max {self.connection_timeout}s timeout)...", 'cyan')

        # Use the same deployment status workflow - get real-time status
        node_status_data = self._get_real_time_node_status()

        # Update persistent status information for each node
        for node_name, status_data in node_status_data.items():
            self._update_node_status(node_name, status_data['status'])
        self.settings_manager.mark_status_refreshed()

        # Clear and display beautiful status
        self.clear_screen()
        self.print_header("Container Status")

        # Show deployment status overview (like deployment menu does)
        self._load_active_config()
        deployment_display = self._get_deployment_display_state(
            metadata=self.active_config,
            inventory=self.inventory,
        )
        self.print_colored(deployment_display['deployment_line'], deployment_display['color'])
        if deployment_display.get('status_note'):
            self.print_colored(f"ℹ️  {deployment_display['status_note']}", 'white')
        print()

        # Network info
        env_color = 'green' if env else 'red'
        env_text = env if env else 'Not Set'
        self.print_colored(f"🌐 {env_text} │ 🐳 {len(hosts)} containers", env_color, bold=True)
        print()
        target_service_version = self.get_mnl_service_version()
        machine_views = self.build_machine_group_views(node_status_data=node_status_data)

        # Count nodes per status in a single pass.
        status_counts: Dict[str, int] = {}
        for data in node_status_data.values():
            status_counts[data['status']] = status_counts.get(data['status'], 0) + 1
        running = status_counts.get('running', 0)
        stopped = status_counts.get('stopped', 0)

        # Status overview box. All three rows are laid out to the same
        # on-screen width: inner_width columns between the two borders.
        inner_width = 68
        box_title = "┌─ Container Status "
        self.print_colored(
            box_title + "─" * (inner_width + 1 - display_width(box_title)) + "┐",
            'cyan',
        )

        status_badges = (
            ('🟢', running, 'Running'),
            ('🔴', stopped, 'Stopped'),
            ('📦', status_counts.get('not_deployed', 0), 'Not Deployed'),
            ('🔌', status_counts.get('unreachable', 0), 'Unreachable'),
            ('❓', status_counts.get('unknown', 0), 'Unknown'),
        )
        status_line = "│" + "".join(
            f" {emoji} {count} {label}"
            for emoji, count, label in status_badges
            if count > 0
        )

        # Pad by display width, not len(): each emoji occupies two columns, so
        # len()-based padding pushes the right border out by one per badge.
        padding = max(0, inner_width + 1 - display_width(status_line))
        status_line += " " * padding + "│"

        if running > 0:
            self.print_colored(status_line, 'green')
        elif stopped > 0:
            self.print_colored(status_line, 'red')
        else:
            self.print_colored(status_line, 'yellow')

        self.print_colored("└" + "─" * inner_width + "┘", 'cyan')
        print()

        self.print_section(f"Machine Groups ({len(machine_views)} machines / {len(hosts)} instances)")
        grouped_lines, outdated_service_nodes = self._build_machine_group_display_lines(
            machine_views,
            target_service_version=target_service_version,
            include_last_update=True,
        )
        self._print_machine_group_display_lines(grouped_lines)
        print()

        if outdated_service_nodes:
            self.print_colored("Recommended Actions:", 'red', bold=True)
            self.print_colored(
                f"  • Update service for: {', '.join(sorted(outdated_service_nodes))}",
                'red'
            )
            self.print_colored(
                "  • Use Operations Menu -> Update Service File, then run Node Status & Info again to verify.",
                'yellow'
            )
            print()

        detail_choice = self.get_input("Show detailed per-node info? (y/N)", "N")
        if detail_choice.lower() == 'y':
            node_results = self._fetch_node_info_results("Retrieving detailed per-node info...")
            if node_results:
                self._display_node_info_details(node_results)

        # Quick actions
        self.print_colored("\U0001f4a1 Quick Actions: Main Menu \u2192 3 (Operations Menu)", 'white')

        self.wait_for_enter()

    def _manage_service(self, playbook_name: str, title: str, action_text: str) -> None:
        """Run a service-management playbook on user-selected Edge Nodes.

        The kind of operation (start / stop / restart / status) is inferred
        from substrings of ``playbook_name`` and drives the description shown,
        the confirmation prompt, and the node statuses recorded before and
        after the run.

        Args:
            playbook_name: Playbook file name, resolved under
                ``<config_dir>/playbooks/``.
            title: Header text; also used in the final success/failure line.
            action_text: Progress label (``"<action_text> on N node(s)..."``);
                its lower-cased form names the operation in prompts.
        """
        if not self.check_hosts_config():
            self.print_colored("No nodes configured! Please configure nodes first.", 'red')
            self.wait_for_enter()
            return

        # Classify the playbook once. "restart" must be tested before "start"
        # because every restart playbook name also contains "start".
        if "restart" in playbook_name:
            action = 'restart'
        elif "start" in playbook_name:
            action = 'start'
        elif "stop" in playbook_name:
            action = 'stop'
        elif "status" in playbook_name:
            action = 'status'
        else:
            action = None
        changes_state = action in ('start', 'stop', 'restart')

        self.print_header(title)

        # Load configuration to show service management details
        self.load_configuration()
        hosts = _get_gpu_hosts(self.inventory)
        env = self.get_mnl_app_env()

        # Show service management details
        self.print_colored("🔧 Service Management Details:", 'cyan', bold=True)
        self.print_colored(f"   • Action: {action_text}", 'white')
        self.print_colored(f"   • Network: {env if env else 'Not set'}", 'green' if env else 'red')
        self.print_colored(f"   • Available Nodes: {len(hosts)}", 'white')

        self.print_colored("\n🖥️  Available Machines:", 'cyan', bold=True)
        for name, config in hosts.items():
            ip = config.get('ansible_host', 'Unknown')
            user = config.get('ansible_user', 'Unknown')
            status_info = self._get_node_status_info(name)
            status = status_info['status']
            emoji, color, description = self._get_status_display_info(status)
            self.print_colored(f"   • {name}: {user}@{ip} ", 'white', end='')
            # The status badge ends the line so each machine gets its own row.
            self.print_colored(f"[{emoji} {description}]", color)

        # Service-specific descriptions
        action_descriptions = {
            'start': (
                "   • Start the Edge Node systemd service on selected nodes",
                "   • Enable the service to start automatically on boot",
                "   • Verify service startup status",
            ),
            'stop': (
                "   • Stop the Edge Node systemd service on selected nodes",
                "   • Keep the service enabled for future startups",
                "   • Verify service shutdown status",
            ),
            'restart': (
                "   • Stop the current Edge Node",
                "   • Start the Edge Node with fresh state",
                "   • Verify service restart status",
            ),
            'status': (
                "   • Check the current status of Edge Node",
                "   • Show service logs and runtime information",
                "   • Display resource usage if available",
            ),
        }
        if action in action_descriptions:
            self.print_colored("\n📋 This will:", 'yellow', bold=True)
            for description_line in action_descriptions[action]:
                self.print_colored(description_line, 'yellow')

        if not env:
            self.print_colored("\n⚠️  WARNING: Network environment is not set!", 'red', bold=True)
            self.print_colored("   Service operations will proceed, but network environment should be configured.", 'red')

        # Interactive host selection
        operation_name = action_text.lower()
        selected_hosts = self.select_hosts(hosts, operation_name, preselect_mode='all')

        if not selected_hosts:
            self.print_colored("No hosts selected. Operation cancelled.", 'yellow')
            return

        if not self._ensure_helper_mode_supported_for_hosts(
            selected_hosts,
            action_label=operation_name,
        ):
            self.wait_for_enter()
            return

        # Show final confirmation with selected hosts
        self.print_colored(f"\n📋 Selected hosts for {operation_name}:", 'cyan', bold=True)
        for host_name in selected_hosts:
            host_config = hosts[host_name]
            ip = host_config.get('ansible_host', 'Unknown')
            user = host_config.get('ansible_user', 'Unknown')
            self.print_colored(f"   ✓ {host_name}: {user}@{ip}", 'green')

        # Confirmation; potentially disruptive operations get a warning icon.
        if action in ('stop', 'restart'):
            confirm_prompt = f"\n⚠️  Continue with {operation_name} on {len(selected_hosts)} selected node(s)? (y/n)"
        else:
            confirm_prompt = f"\n🔧 Continue with {operation_name} on {len(selected_hosts)} selected node(s)? (y/n)"
        if self.get_input(confirm_prompt, "y").lower() != 'y':
            self.print_colored("Service operation cancelled.", 'yellow')
            return

        playbook_path = self.config_dir / f'playbooks/{playbook_name}'
        if not playbook_path.exists():
            self.print_colored(f"Service management playbook not found: {playbook_path}", 'red')
            self.wait_for_enter()
            return

        # Update node statuses to reflect the operation being performed
        if changes_state:
            for host_name in selected_hosts:
                self._update_node_status(host_name, 'deploying')

        self.print_colored(f"\n{action_text} on {len(selected_hosts)} node(s)...", 'cyan')
        success, _, _, _ = self.run_generated_playbook(
            playbook_path,
            selected_hosts,
            machine_scope=False,
            last_applied_action=playbook_name.replace('.yml', ''),
            show_output=True,
            timeout=self.connection_timeout,
        )

        if success:
            self.print_colored(f"\n✅ {title} completed successfully!", 'green')

            # Update node statuses based on successful operation
            if action == 'start':
                for host_name in selected_hosts:
                    self._update_node_status(host_name, 'running')
                self.print_colored(f"Edge Nodes have been started on {len(selected_hosts)} node(s).", 'green')
            elif action == 'stop':
                for host_name in selected_hosts:
                    self._update_node_status(host_name, 'stopped')
                self.print_colored(f"Edge Nodes have been stopped on {len(selected_hosts)} node(s).", 'green')
            elif action == 'restart':
                for host_name in selected_hosts:
                    self._update_node_status(host_name, 'running')
                self.print_colored(f"Edge Nodes have been restarted on {len(selected_hosts)} node(s).", 'green')
            elif action == 'status':
                # Status check doesn't change the actual status, just reports it
                self.print_colored(f"Service status information retrieved for {len(selected_hosts)} node(s).", 'green')

            # Show updated statuses
            if changes_state:
                self.print_colored("\n📊 Updated Node Statuses:", 'cyan', bold=True)
                for host_name in selected_hosts:
                    self.print_colored(f"   • {host_name}: ", 'white', end='')
                    self._display_node_status(host_name, compact=True)
                    print()  # New line after each status
        else:
            self.print_colored(f"\n❌ {title} encountered issues. Please check the output above.", 'red')

            # Update node statuses to reflect potential error state
            if changes_state:
                for host_name in selected_hosts:
                    self._update_node_status(host_name, 'error')
                self.print_colored("\n📊 Node statuses updated to Error due to operation failure.", 'yellow')

            # Additional error guidance based on operation
            if action == 'start':
                self.print_colored("Common issues: Service not deployed, network connectivity, or configuration errors.", 'yellow')
            elif action in ('stop', 'restart'):
                self.print_colored("This might be normal if the service was not running on some nodes.", 'yellow')

        self.wait_for_enter()


# Script entry point: only when executed directly (not when imported),
# build the R1Setup application object and call its run() method.
if __name__ == "__main__":
    r1setup = R1Setup()
    r1setup.run()
