"""
Can-Do-Steps unified expansion and hierarchy creation component for the Lumabit lesson-generation pipeline.
Combined Phase: Expand statements and create full hierarchy in one step.
"""
import os
import json
import traceback
import glob
from typing import Dict, Any, List, Optional

from utils.io import (
    save_output,
    load_latest_output,
    create_clean_copy,
    load_optional_input_text,
    build_input_path,
)
from chains.base import build_chain, default_json_parser, parse_output
from validation.json_schemas import (
    ROADMAP_SCHEMA_VERSION,
    validate_roadmap,
    JsonSchemaValidationError,
)

# Sample can-do statements (45 total).
# These seed the LLM expansion prompt and are also used by
# parse_expanded_hierarchy to distinguish "new" statements from originals.
SAMPLE_STATEMENTS = [
    # Typing and passwords
    "I can touch type.",
    "I can touch type faster than 20wpm.",
    "I use secure and safe passwords.",
    # Spreadsheets
    "I know how to create a spreadsheet.",
    "I know formulas in spreadsheets work.",
    "I know how to use Pivot tables in spreadsheets.",
    # Email basics and safety
    "I have an email account.",
    "I know what To, Subject, CC and BCC are for",
    "I know the difference between 'Reply' and 'Reply all'",
    "I know the difference between Webmail, POP and IMAP",
    "I use folders to store my emails",
    "I know not to put passwords in my email",
    "I know what a phishing email is",
    "I know what an e-mail address looks like",
    "I know how to attach a file to an email",
    "I know how to download attachments safely",
    "I know how to find a past email I have sent",
    "I know how to choose a good subject",
    "I know how to write a good email",
    "I know how to be suspicious of links",
    "I know how and when to log out of my email",
    "I can search for and find emails efficiently",
    "I know how to identify a fake email.",
    # Design and file formats
    "I know how to create a poster in Canva.",
    "I know how to export a file from Canva.",
    "I know the difference between different image file formats.",
    # Diagramming and browser tooling
    "I have used Draw.io to plan something.",
    "I have used the developer tools in a web browser.",
    "I know what the network tab shows in the browser developer tools.",
    # General computer literacy, backups and files
    "I know how to take a screenshot on my computer.",
    "I have backups of all my important files.",
    "I understand the 'Rule of 3s' related to backups.",
    "I know how to get directions using a maps application.",
    "I know where to find files downloaded to my computer.",
    "I use a file syncing service.",
    "I know how to check my settings.",
    # Hardware
    "I know all the parts of a desktop computer.",
    "I know all the ports on my laptop.",
    # Text formatting
    "I know how to format text.",
    "I know how to use the markdown format.",
    # Machine learning / LLMs
    "I know what a machine learning model is.",
    "I know what context is for machine learning models.",
    "I know how to write a good prompt.",
    "I know the difference between LLMs and other machine learning models.",
    "I know the weaknesses for LLM."
]

def parse_expanded_hierarchy(output: str) -> Dict[str, Any]:
    """
    Parse and validate the expanded statements and hierarchy from the LLM output.

    Hard structural constraints (missing fields, duplicate track/path IDs,
    id/slug mismatches, invalid levels/orders) raise ValueError. Soft
    constraints (statement counts, title/description word counts, statement
    coverage) only print WARNING lines so a slightly off-spec roadmap can
    still be processed downstream.

    Args:
        output: Raw output from the LLM

    Returns:
        Dict: Parsed data with expanded statements and full hierarchy,
        augmented with 'original_count', 'new_count' and 'total_count'.

    Raises:
        ValueError: If the output fails a hard structural validation.
    """
    try:
        # Parse the JSON from the output
        parsed = parse_output(output, default_json_parser)

        # Validate the expected structure
        required_fields = ["tracks", "hierarchy_summary"]
        for field in required_fields:
            if field not in parsed:
                raise ValueError(f"Expected '{field}' key in parsed output")

        # Validate expanded statements (optional for some roadmaps)
        expanded_statements = parsed.get("expanded_statements")
        has_expanded_statements = isinstance(expanded_statements, list)

        if expanded_statements is None:
            print("WARNING: No 'expanded_statements' provided; skipping expanded-statement validations.")
        elif not has_expanded_statements:
            print("WARNING: 'expanded_statements' present but not a list; skipping expanded-statement validations.")
            has_expanded_statements = False

        # Count original statements (only when expanded statements are available)
        original_count = len(SAMPLE_STATEMENTS) if has_expanded_statements else None

        if has_expanded_statements:
            # Count new statements (those not in the original set).
            # Use a set for O(1) membership tests instead of scanning the list.
            sample_set = set(SAMPLE_STATEMENTS)
            new_statements = [s for s in expanded_statements if s not in sample_set]
            new_count = len(new_statements)

            # Validate new statement count (should have 45-65 new statements)
            if new_count < 45 or new_count > 65:
                print(f"WARNING: Expected 45-65 new statements, got {new_count}")

            # Validate total statement count (should be 80-120 statements total)
            total_count = len(expanded_statements)
            if total_count < 80 or total_count > 120:
                print(f"WARNING: Expected 80-120 total statements, got {total_count}")
        else:
            new_count = None
            total_count = None

        # Add count details to the parsed data
        parsed["original_count"] = original_count
        parsed["new_count"] = new_count
        parsed["total_count"] = total_count

        # Validate tracks structure
        tracks = parsed["tracks"]
        if not isinstance(tracks, list):
            raise ValueError("Expected 'tracks' to be a list")

        if len(tracks) != 8:
            print(f"WARNING: Expected exactly 8 tracks, got {len(tracks)}")

        # Validate each track structure
        track_required_fields = ["id", "slug", "title", "description", "order", "paths"]
        track_ids = set()
        total_paths = 0
        total_steps = 0
        all_used_statements = []

        for i, track in enumerate(tracks):
            if not isinstance(track, dict):
                raise ValueError(f"Track at index {i} is not a dictionary")

            for field in track_required_fields:
                if field not in track:
                    raise ValueError(f"Track at index {i} is missing required field '{field}'")

            # Validate unique IDs
            if track["id"] in track_ids:
                raise ValueError(f"Duplicate track ID: {track['id']}")
            track_ids.add(track["id"])

            if track["id"] != track["slug"]:
                raise ValueError(f"Track ID and slug must match: {track['id']} != {track['slug']}")

            # Validate title and description lengths
            title_words = track["title"].split()
            if len(title_words) < 1 or len(title_words) > 4:
                print(f"WARNING: Track title must be 1-4 words: '{track['title']}'")

            desc_words = track["description"].split()
            if len(desc_words) < 4 or len(desc_words) > 15:
                print(f"WARNING: Track description must be 4-15 words: '{track['description']}'")

            # Validate order
            if not isinstance(track["order"], int) or track["order"] < 1:
                raise ValueError(f"Track order must be positive integer: {track['order']}")

            # Validate paths
            paths = track["paths"]
            if not isinstance(paths, list):
                raise ValueError(f"Track '{track['id']}' paths must be a list")

            if len(paths) < 2 or len(paths) > 4:
                print(f"WARNING: Track '{track['id']}' must have 2-4 paths, got {len(paths)}")

            total_paths += len(paths)

            # Validate each path
            path_required_fields = ["id", "slug", "title", "description", "order", "steps"]
            path_ids = set()

            for j, path in enumerate(paths):
                if not isinstance(path, dict):
                    raise ValueError(f"Path at index {j} in track '{track['id']}' is not a dictionary")

                for field in path_required_fields:
                    if field not in path:
                        raise ValueError(f"Path at index {j} in track '{track['id']}' is missing required field '{field}'")

                # Validate unique path IDs (scoped to this track)
                if path["id"] in path_ids:
                    raise ValueError(f"Duplicate path ID: {path['id']}")
                path_ids.add(path["id"])

                if path["id"] != path["slug"]:
                    raise ValueError(f"Path ID and slug must match: {path['id']} != {path['slug']}")

                # Validate path title and description
                path_title_words = path["title"].split()
                if len(path_title_words) < 1 or len(path_title_words) > 4:
                    print(f"WARNING: Path title must be 1-4 words: '{path['title']}'")

                path_desc_words = path["description"].split()
                if len(path_desc_words) < 4 or len(path_desc_words) > 15:
                    print(f"WARNING: Path description must be 4-15 words: '{path['description']}'")

                # Validate path order
                if not isinstance(path["order"], int) or path["order"] < 1:
                    raise ValueError(f"Path order must be positive integer: {path['order']}")

                # Validate steps
                steps = path["steps"]
                if not isinstance(steps, list):
                    raise ValueError(f"Path '{path['id']}' steps must be a list")

                if len(steps) < 1:
                    raise ValueError(f"Path '{path['id']}' must have at least 1 step")

                # Validate each step; duplicates (by ID) are dropped, not fatal.
                step_required_fields = ["id", "slug", "title", "description", "level", "order"]
                step_ids = set()
                unique_steps = []

                for k, step in enumerate(steps):
                    if not isinstance(step, dict):
                        raise ValueError(f"Step at index {k} in path '{path['id']}' is not a dictionary")

                    for field in step_required_fields:
                        if field not in step:
                            raise ValueError(f"Step at index {k} in path '{path['id']}' is missing required field '{field}'")

                    # Duplicate step IDs are tolerated: keep the first, skip the rest.
                    if step["id"] in step_ids:
                        print(f"WARNING: Duplicate step ID '{step['id']}' found in path '{path['id']}'. Removing duplicate at index {k}.")
                        continue
                    step_ids.add(step["id"])

                    if step["id"] != step["slug"]:
                        raise ValueError(f"Step ID and slug must match: {step['id']} != {step['slug']}")

                    # Validate can-do-statement is in expanded statements
                    if has_expanded_statements and step["title"] not in expanded_statements:
                        print(f"WARNING: Can-do statement not found in expanded statements: '{step['title']}'")

                    # Validate step description
                    step_desc_words = step["description"].split()
                    if len(step_desc_words) < 4 or len(step_desc_words) > 20:
                        print(f"WARNING: Step description must be 4-20 words: '{step['description']}'")

                    # Validate level
                    if step["level"] not in ["beginner", "intermediate", "advanced"]:
                        raise ValueError(f"Step level must be beginner/intermediate/advanced: {step['level']}")

                    # Validate step order
                    if not isinstance(step["order"], int) or step["order"] < 1:
                        raise ValueError(f"Step order must be positive integer: {step['order']}")

                    # Collect the can-do statement for coverage validation below
                    all_used_statements.append(step["title"])
                    unique_steps.append(step)

                if not unique_steps:
                    raise ValueError(f"Path '{path['id']}' must have at least 1 unique step after removing duplicates")

                path["steps"] = unique_steps
                total_steps += len(unique_steps)

        # Validate totals
        if total_paths < 16 or total_paths > 32:
            print(f"WARNING: Expected 16-32 total paths, got {total_paths}")

        if total_steps < 40 or total_steps > 150:
            print(f"WARNING: Expected 40-150 total steps, got {total_steps}")

        # Validate all expanded statements were used in the hierarchy
        used_count = len(all_used_statements)
        if has_expanded_statements and used_count != total_count:
            # Find missing statements
            used_set = set(all_used_statements)
            original_set = set(expanded_statements)
            missing_statements = original_set - used_set
            extra_statements = used_set - original_set

            error_msg = f"Statement count mismatch! Expanded: {total_count}, Used: {used_count}\n"
            if missing_statements:
                error_msg += f"Missing statements ({len(missing_statements)}):\n"
                for stmt in list(missing_statements)[:5]:  # Show first 5
                    error_msg += f"  - {stmt}\n"
                if len(missing_statements) > 5:
                    error_msg += f"  ... and {len(missing_statements) - 5} more\n"

            if extra_statements:
                error_msg += f"Extra statements ({len(extra_statements)}):\n"
                for stmt in list(extra_statements)[:5]:  # Show first 5
                    error_msg += f"  - {stmt}\n"
                if len(extra_statements) > 5:
                    error_msg += f"  ... and {len(extra_statements) - 5} more\n"

            # Report the mismatch but do not abort: a slightly incomplete
            # hierarchy is still usable. (Previously this message was built
            # and then silently discarded.)
            print(f"WARNING: {error_msg}")

        # Check for duplicate statements across the whole hierarchy
        if len(set(all_used_statements)) != len(all_used_statements):
            duplicates = []
            seen = set()
            for stmt in all_used_statements:
                if stmt in seen:
                    if stmt not in duplicates:
                        duplicates.append(stmt)
                else:
                    seen.add(stmt)
            # Non-fatal: report the first few duplicates for debugging.
            print(f"WARNING: Duplicate statements found in hierarchy: {duplicates[:3]}...")

        if has_expanded_statements:
            print(f"✅ Successfully created expanded hierarchy: {total_count} statements, {len(tracks)} tracks, {total_paths} paths, {total_steps} steps")
            print(f"✅ Original: {original_count}, New: {new_count}, Total: {total_count}")
        else:
            print(f"✅ Successfully created hierarchy: {len(tracks)} tracks, {total_paths} paths, {total_steps} steps")

        return parsed

    except Exception as e:
        print(f"Error parsing expanded hierarchy: {e}")

        # Report the innermost frame so the failing line is easy to locate.
        tb = traceback.extract_tb(e.__traceback__)
        filename, lineno, funcname, text = tb[-1]  # last frame where it failed
        print(f"Error occurred in {filename}, line {lineno}, in {funcname}")
        print(f"   → {text}")

        # Add more context to help with debugging
        if "Could not parse JSON" in str(e):
            print("This might be due to invalid JSON format in the LLM output.")
            print("Check if the prompt template is properly formatted and the sample statements are being inserted correctly.")
        raise

def sanitize_topic(topic: Optional[str]) -> str:
    """
    Sanitize the topic parameter, supporting international characters.

    The string is Unicode-normalized (NFC), control characters are removed
    (but whitespace — space, tab, newline — is kept and later collapsed to
    single spaces), and the result is capped at 200 characters.

    Args:
        topic: Raw topic string from command line

    Returns:
        str: Sanitized topic string

    Raises:
        ValueError: If no valid topic is provided
    """
    if not topic:
        raise ValueError("Topic is required for expand_and_create_hierarchy step. Please provide a topic using --topic parameter.")

    import re
    import unicodedata

    # Normalize to NFC so composed/decomposed forms compare and render
    # consistently, then trim surrounding whitespace.
    topic = unicodedata.normalize('NFC', topic).strip()

    # Keep printable characters from all languages; drop control characters.
    # Tabs and newlines are NOT printable, so they must be allowed explicitly
    # before the printability check — otherwise "a\tb" would collapse to "ab"
    # instead of becoming "a b" after whitespace normalization below.
    sanitized_chars = []
    for char in topic:
        if char in ' \t\n':
            sanitized_chars.append(char)
        elif char.isprintable() and not unicodedata.category(char).startswith('C'):
            sanitized_chars.append(char)

    sanitized = ''.join(sanitized_chars)

    # Remove excessive whitespace and normalize spaces
    sanitized = re.sub(r'\s+', ' ', sanitized).strip()

    # Limit to reasonable length
    if len(sanitized) > 200:
        sanitized = sanitized[:200].strip()

    # Raise error if sanitization resulted in empty string
    if not sanitized:
        raise ValueError("Topic contains no valid characters. Please provide a meaningful topic using --topic parameter.")

    return sanitized

def load_input_file(run_id: str, file_type: str, input_prefix: Optional[str] = None) -> Optional[str]:
    """
    Load content from a run-scoped input file (tracks-{run_id}.txt or cando-{run_id}.txt).

    Args:
        run_id: Run identifier
        file_type: Either 'tracks' or 'cando'
        input_prefix: Optional prefix/path used to locate the input file

    Returns:
        str: File content if found and non-empty, None otherwise
    """
    name = f"{file_type}-{run_id}.txt"
    path = build_input_path(name, prefix=input_prefix)

    try:
        text = load_optional_input_text(name, prefix=input_prefix)
    except Exception as err:
        print(f"⚠ Error reading {path}: {err}")
        return None

    if text:
        print(f"✓ Found and loaded {path}: {len(text)} characters")
        return text

    print(f"ℹ No usable input found at {path}")
    return None

def expand_and_create_hierarchy(
    run_id: str,
    force_text: bool = False,
    topic: Optional[str] = None,
    audience: Optional[str] = None,
    purpose: Optional[str] = None,
    style: Optional[str] = None,
    notes: Optional[str] = None,
    language: Optional[str] = None,
    prompt_id: Optional[str] = None,
    input_prefix: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Expand statements and create full hierarchy in one step.

    Args:
        run_id: Run identifier
        force_text: If True, use existing raw text output instead of calling API
        topic: Topic for journey-based can-do statements expansion (required)
        audience: (optional) Intended audience for the statements
        purpose: (optional) Purpose / learning objective focus
        style: (optional) Language style to use for statements
        notes: (optional) Additional guidance or context to include in the prompt
        language: (optional) Output language to request from the model
        prompt_id: (optional) Override identifier for selecting a specific prompt file
        input_prefix: (optional) Input prefix/path for run-scoped input files

    Returns:
        Dict: Complete hierarchy data with expanded statements

    Raises:
        ValueError: If no topic is provided (fresh-generation path) or the
            resulting roadmap fails schema validation.
        FileNotFoundError: If no usable prompt file exists.

    Note:
        This function will automatically look for input files:
        - input/cando-{run_id}.txt for required can-do statements
        - input/tracks-{run_id}.txt for required tracks
    """
    # Resolve which prompt file to use. Priority order: explicit prompt_id
    # override, then a roadmap-specific prompt named after run_id, then the
    # default prompt file.
    prompt_dir = os.path.join("prompts", "can-do-steps")
    default_chain_name = "expand_and_create_hierarchy"
    default_prompt_path = os.path.join(prompt_dir, f"{default_chain_name}.txt")
    prompt_path_in_use = default_prompt_path
    chain_name = default_chain_name

    identifier_candidates = []
    if prompt_id:
        identifier_candidates.append(("prompt override", prompt_id))
    if run_id:
        identifier_candidates.append(("roadmap-specific", run_id))

    # for/else: the else clause runs only if no candidate prompt file exists.
    for label, identifier in identifier_candidates:
        candidate_filename = f"{default_chain_name}-{identifier}.txt"
        candidate_prompt_path = os.path.join(prompt_dir, candidate_filename)
        if os.path.exists(candidate_prompt_path):
            chain_name = f"{default_chain_name}-{identifier}"
            prompt_path_in_use = candidate_prompt_path
            print(f"Using {label} prompt: {prompt_path_in_use}")
            break
    else:
        if prompt_id:
            override_candidate = os.path.join(prompt_dir, f"{default_chain_name}-{prompt_id}.txt")
            print(f"Prompt override not found ({override_candidate}); falling back.")
        if run_id:
            run_candidate = os.path.join(prompt_dir, f"{default_chain_name}-{run_id}.txt")
            if not os.path.exists(run_candidate):
                print(f"Roadmap-specific prompt not found ({run_candidate}); using default.")
        if not os.path.exists(default_prompt_path):
            raise FileNotFoundError(f"Default prompt file not found: {default_prompt_path}")
        print(f"Using default prompt: {prompt_path_in_use}")

    # Check if we should use existing raw output (skips the LLM call entirely)
    if force_text:
        existing_output = load_latest_output(
            pipeline="can-do-steps",
            step=chain_name,
            run_id=run_id,
            as_text=True,
            raw=True,
            subfolder="logs"
        )

        if existing_output:
            print(f"Using existing raw output for expand_and_create_hierarchy in can-do-steps/{run_id}")
            parsed_data = parse_expanded_hierarchy(existing_output)

            # Fill in run-level metadata the raw output may not contain;
            # existing keys in parsed_data win over these defaults.
            metadata_defaults = {
                "id": run_id,
                "slug": run_id,
                "title": sanitize_topic(topic) if topic else run_id,
                "description": purpose or "",
                "schema_version": ROADMAP_SCHEMA_VERSION,
            }
            for k, v in metadata_defaults.items():
                if k not in parsed_data:
                    parsed_data[k] = v

            # Validate against the roadmap schema before saving
            try:
                validate_roadmap(parsed_data)
            except JsonSchemaValidationError as exc:
                raise ValueError(
                    f"Roadmap schema validation failed for run '{run_id}'. {exc}"
                ) from exc

            save_output(
                data=parsed_data,
                pipeline="can-do-steps",
                step="expand_and_create_hierarchy",
                run_id=run_id,
                subfolder="archived"
            )

            # NOTE(review): unlike the fresh-generation path below, this
            # branch does not record 'prompt_file' or create a clean roadmap
            # copy — confirm that is intentional.
            return parsed_data

    # Fresh generation path (also reached when force_text found no raw output).
    # Format original statements for the prompt
    # Ensure proper formatting for the template
    sample_statements_text = "\n".join(f"- {stmt}" for stmt in SAMPLE_STATEMENTS)

    # Sanitize and prepare the topic (raises ValueError if topic is missing)
    sanitized_topic = sanitize_topic(topic)

    print(f"Expanding statements and creating hierarchy for run ID: {run_id}")
    print(f"Topic: {sanitized_topic}")

    # Print the first few statements to verify
    print(f"Sample statements (first 3 of {len(SAMPLE_STATEMENTS)}):")
    for stmt in SAMPLE_STATEMENTS[:3]:
        print(f"  - {stmt}")

    try:
        # Load input files for required tracks and can-do statements
        print(f"\nChecking for input files for run ID: {run_id}")
        required_cando = load_input_file(run_id, "cando", input_prefix=input_prefix)
        required_tracks = load_input_file(run_id, "tracks", input_prefix=input_prefix)

        # Prepare input variables for template substitution; every optional
        # argument falls back to a generic default string.
        input_vars = {
            "sample_statements": sample_statements_text,
            "topic": sanitized_topic,
            "audience": audience or "general public with a range of proficiency from basic to advanced",
            "purpose": purpose or "comprehensive skill development",
            "style": style or "clear and actionable",
            "notes": notes or "No additional notes provided.",
            "language": language or "English",
            "required_cando": required_cando or "none specified",
            "required_tracks": required_tracks or "No required tracks provided, generate these.",
            # Inject run-level id/slug so the prompt can use these exact values
            "id": run_id,
            "slug": run_id
        }

        result = build_chain(
            chain_name=chain_name,
            pipeline="can-do-steps",
            run_id=run_id,
            input_variables=input_vars
        )

        # Check if the output contains the placeholder text, indicating template substitution failure
        template_vars = [
            "sample_statements",
            "topic",
            "audience",
            "purpose",
            "style",
            "notes",
            "language",
            "required_cando",
            "required_tracks"
        ]
        for var in template_vars:
            # A literal "{var}" surviving in the output means the template
            # engine never substituted that variable.
            if f"{{{var}}}" in result["output"]:
                print(f"WARNING: Template substitution may have failed. {var} placeholder text found in output.")

    except Exception as e:
        print(f"Error in LLM chain: {e}")
        print("DEBUG INFO:")
        print(f"- Template variable: 'sample_statements'")
        print(f"- Sample statements count: {len(SAMPLE_STATEMENTS)}")
        print(f"- First statement: '{SAMPLE_STATEMENTS[0]}'")
        raise

    # Parse the expanded hierarchy from the result
    try:
        parsed_data = parse_expanded_hierarchy(result["output"])
    except Exception as e:
        print(f"Failed to parse LLM output: {e}")
        print("First 500 characters of output:")
        print(result["output"][:500] + "..." if len(result["output"]) > 500 else result["output"])
        raise

    # Ensure metadata keys are present (the prompt will request them, but add sensible defaults)
    metadata_defaults = {
        "id": run_id,
        "slug": run_id,
        "title": sanitized_topic or run_id,
        "description": purpose or "",
        "schema_version": ROADMAP_SCHEMA_VERSION,
    }
    for k, v in metadata_defaults.items():
        if k not in parsed_data:
            parsed_data[k] = v

    # Record which prompt file was used so downstream tooling can inspect it easily
    parsed_data["prompt_file"] = prompt_path_in_use

    try:
        validate_roadmap(parsed_data)
    except JsonSchemaValidationError as exc:
        raise ValueError(
            f"Roadmap schema validation failed for run '{run_id}'. {exc}"
        ) from exc

    # Save the parsed output (timestamped)
    timestamped_filepath = save_output(
        data=parsed_data,
        pipeline="can-do-steps",
        step="expand_and_create_hierarchy",
        run_id=run_id,
        subfolder="archived"
    )

    # Create a roadmap-specific clean copy named 'roadmap-<run_id>.json'
    # Use the new `filter_keys` option to remove expanded_statements and internal counts
    try:
        create_clean_copy(
            timestamped_filepath=timestamped_filepath,
            pipeline="can-do-steps",
            step="expand_and_create_hierarchy",
            run_id=run_id,
            clean_filename=f"roadmap-{run_id}.json",
            filter_keys=["expanded_statements", "original_count", "new_count", "total_count"]
        )
    except Exception as e:
        # Best-effort: a failed clean copy must not fail the whole run
        print(f"Warning: failed to create roadmap clean copy: {e}")

    return parsed_data

def load_expanded_hierarchy(run_id: str) -> Optional[Dict[str, Any]]:
    """
    Load the most recent expanded hierarchy for a run.

    Args:
        run_id: Run identifier

    Returns:
        Dict: Previously created hierarchy, or None if not found
    """
    hierarchy = load_latest_output(
        pipeline="can-do-steps",
        step="expand_and_create_hierarchy",
        run_id=run_id,
        subfolder="archived"
    )
    if hierarchy is not None:
        return hierarchy

    # Nothing loaded — print diagnostics describing where we looked.
    output_dir = os.path.join("output", "can-do-steps", run_id)
    pattern = os.path.join(output_dir, "expand_and_create_hierarchy_*.json")
    print(f"⚠️ Unable to load expanded hierarchy for run ID '{run_id}'.")
    print(f"   Looked for files matching: {pattern}")

    if not os.path.exists(output_dir):
        print("   Output directory does not exist.")
        return hierarchy

    entries = sorted(os.listdir(output_dir))
    if entries:
        print("   Output directory contains:")
        for entry in entries:
            print(f"     - {entry}")
    else:
        print("   Output directory exists but is empty.")

    # Point at raw text outputs as a recovery hint (last three, newest last).
    raw_pattern = os.path.join(output_dir, "expand_and_create_hierarchy_raw_*.txt")
    raw_files = sorted(glob.glob(raw_pattern))
    if raw_files:
        print("   Found raw outputs:")
        for raw_path in raw_files[-3:]:
            print(f"     - {os.path.basename(raw_path)}")

    return hierarchy

def get_hierarchy_statistics(run_id: str) -> Dict[str, Any]:
    """
    Compute statistics about the expanded hierarchy.

    Args:
        run_id: Run identifier

    Returns:
        Dict: Statistics about tracks, paths, steps

    Raises:
        ValueError: If no hierarchy exists for the given run ID.
    """
    hierarchy = load_expanded_hierarchy(run_id)
    if not hierarchy:
        raise ValueError(f"No hierarchy found for run ID: {run_id}")

    paths_per_track: Dict[str, int] = {}
    steps_per_path: Dict[str, int] = {}
    level_distribution = {"beginner": 0, "intermediate": 0, "advanced": 0}
    total_paths = 0
    total_steps = 0

    for track in hierarchy["tracks"]:
        track_paths = track["paths"]
        paths_per_track[track["id"]] = len(track_paths)
        total_paths += len(track_paths)

        for path in track_paths:
            path_steps = path["steps"]
            steps_per_path[path["id"]] = len(path_steps)
            total_steps += len(path_steps)

            for step in path_steps:
                level_distribution[step["level"]] += 1

    return {
        "original_count": hierarchy.get("original_count", 0),
        "new_count": hierarchy.get("new_count", 0),
        "total_count": hierarchy.get("total_count", 0),
        "tracks": len(hierarchy["tracks"]),
        "total_paths": total_paths,
        "total_steps": total_steps,
        # Each step carries exactly one can-do statement.
        "total_statements": total_steps,
        "paths_per_track": paths_per_track,
        "steps_per_path": steps_per_path,
        "level_distribution": level_distribution,
    }

def extract_expanded_statements(run_id: str) -> List[str]:
    """
    Extract just the expanded statements from the full hierarchy.

    Args:
        run_id: Run identifier

    Returns:
        List[str]: List of expanded statements (empty if the key is absent)

    Raises:
        ValueError: If no hierarchy exists for the given run ID.
    """
    hierarchy = load_expanded_hierarchy(run_id)
    if hierarchy:
        return hierarchy.get("expanded_statements", [])
    raise ValueError(f"No hierarchy found for run ID: {run_id}")
