"""
Utility functions for handling input/output operations.
Supports saving and loading pipeline outputs scoped by run ID.
"""
import os
import json
import glob
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional, Union, List

from utils.storage import (
    use_local_storage,
    write_text,
    write_json,
    read_text,
    read_json,
    path_exists,
    copy_path,
    list_objects,
    uri_from_key,
)

def ensure_dir(directory: str) -> None:
    """
    Ensure that a directory exists, creating it if necessary.

    Args:
        directory: Path to the directory to ensure exists
    """
    if use_local_storage():
        Path(directory).mkdir(parents=True, exist_ok=True)


def get_input_prefix(override: Optional[str] = None) -> str:
    """
    Resolve the input prefix used for run-scoped input files.
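
    Example (illustrative; the env value is hypothetical):
        os.environ["INPUT_PREFIX"] = "runs/abc/input/"
        get_input_prefix()          # -> "runs/abc/input"
        get_input_prefix("custom")  # -> "custom"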
    """
    prefix = (override or os.getenv("INPUT_PREFIX", "input")).strip().strip("/")
    return prefix or "input"


def build_input_path(filename: str, prefix: Optional[str] = None) -> str:
    """
    Build a logical path for input files, compatible with local and remote storage.
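
    Example (illustrative; assumes INPUT_PREFIX is unset):
        build_input_path("topics.txt")              # -> "input/topics.txt"
        build_input_path("/topics.txt", "custom")   # -> "custom/topics.txt"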
    """
    clean_name = filename.strip().lstrip("/")
    return f"{get_input_prefix(prefix)}/{clean_name}"


def load_optional_input_text(
    filename: str,
    prefix: Optional[str] = None,
    strip: bool = True,
) -> Optional[str]:
    """
    Load an optional text input file from local or remote storage.
    Returns None if the file does not exist.
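
    Example (illustrative; "notes.txt" is a hypothetical input file):
        notes = load_optional_input_text("notes.txt")
        if notes is None:
            pass  # no such input was provided for this run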
    """
    path = build_input_path(filename, prefix=prefix)
    if not path_exists(path):
        return None
    content = read_text(path)
    return content.strip() if strip else content


def save_output(
    data: Union[str, Dict[str, Any]],
    pipeline: str,
    step: str,
    run_id: str,
    as_text: bool = False,
    subfolder: Optional[str] = None
) -> str:
    """
    Save output data from a pipeline step with a run ID.

    Args:
        data: Data to save (string or dictionary)
        pipeline: Pipeline name (e.g. 'lessons', 'roadmap')
        step: Step name (e.g. 'extract_topics', 'generate_lessons')
        run_id: Run identifier
        as_text: If True, save as raw text instead of JSON
        subfolder: Optional subfolder within the pipeline/run_id directory

    Returns:
        Path or URI to the saved file
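
    Example (illustrative; pipeline/step/run names are hypothetical):
        path = save_output({"topics": ["intro"]}, "lessons",
                           "extract_topics", "run-123")
        # -> output/lessons/run-123/extract_topics_<timestamp>.json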
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Build the output directory path
    output_dir = f"output/{pipeline}/{run_id}"
    if subfolder:
        output_dir = f"{output_dir}/{subfolder}"

    ensure_dir(output_dir)

    # Determine file extension and name
    extension = "txt" if as_text else "json"
    filename = f"{step}_{timestamp}.{extension}"

    # Create a "raw" version for debugging/retry
    raw_filename = f"{step}_raw_{timestamp}.txt"

    # Join with "/" so the logical paths stay valid for remote storage backends
    filepath = f"{output_dir}/{filename}"
    raw_filepath = f"{output_dir}/{raw_filename}"

    # Save a raw snapshot for debugging/retry (dicts are stringified via str())
    write_text(raw_filepath, str(data) if isinstance(data, dict) else data)

    # Save the actual data
    if as_text:
        if isinstance(data, dict):
            saved_path = write_text(filepath, json.dumps(data, indent=2, ensure_ascii=False))
        else:
            saved_path = write_text(filepath, data)
    else:
        if isinstance(data, dict):
            saved_path = write_json(filepath, data)
        else:
            saved_path = write_json(filepath, {"content": data})

    return saved_path


def create_clean_copy(
    timestamped_filepath: str,
    pipeline: str,
    step: str,
    run_id: str,
    subfolder: Optional[str] = None,
    clean_filename: Optional[str] = None,
    filter_keys: Optional[List[str]] = None
) -> str:
    """
    Create a clean copy of a timestamped file.

    By default the filename will be: step-run_id.json
    You can override the final filename by passing `clean_filename` (e.g. 'roadmap-<run_id>.json').

    New:
      - filter_keys: optional list of top-level JSON keys to remove from the clean copy
                     (applies only when the original file is JSON)

    Args:
        timestamped_filepath: Path or URI to the original timestamped file
        pipeline: Pipeline name (e.g. 'can-do-steps')
        step: Step name (e.g. 'bits', 'tracks', 'paths', 'steps')
        run_id: Run identifier
        subfolder: Optional subfolder within the pipeline/run_id directory
        clean_filename: Optional explicit filename to use for the clean copy
        filter_keys: Optional list of top-level keys to remove from the JSON before saving

    Returns:
        Path or URI to the clean copy file
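
    Example (illustrative; "debug_info" is a hypothetical key):
        ts_path = save_output(data, "roadmap", "steps", run_id)
        create_clean_copy(ts_path, "roadmap", "steps", run_id,
                          filter_keys=["debug_info"])
        # -> output/roadmap/<run_id>/steps-<run_id>.json, minus "debug_info"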
    """
    # Build the output directory path (same as original)
    output_dir = f"output/{pipeline}/{run_id}"
    if subfolder:
        output_dir = f"{output_dir}/{subfolder}"

    # Ensure the output directory exists before copying/writing
    ensure_dir(output_dir)

    # Determine clean filename (allow override)
    if not clean_filename:
        clean_filename = f"{step}-{run_id}.json"

    # Join with "/" so the logical path stays valid for remote storage backends
    clean_filepath = f"{output_dir}/{clean_filename}"

    # If filter_keys provided and the source looks like JSON, attempt to create a filtered JSON copy
    try:
        source_is_json = timestamped_filepath.lower().endswith(".json")
        if filter_keys and source_is_json:
            try:
                data = read_json(timestamped_filepath)
            except Exception:
                # Fall back to a safe copy if we can't parse JSON
                copied = copy_path(timestamped_filepath, clean_filepath)
                print(f"⚠️ Couldn't parse JSON from {os.path.basename(timestamped_filepath)}; created direct copy instead.")
                return copied

            # If the top-level structure is a dict, remove specified keys
            if isinstance(data, dict):
                for key in filter_keys:
                    data.pop(key, None)

                # Write cleaned JSON to the clean filepath
                saved = write_json(clean_filepath, data)
                print(f"✅ Created filtered clean copy: {clean_filename} (from {os.path.basename(timestamped_filepath)})")
                return saved

            # Not a dict (e.g., list), fall back to copying original file
            copied = copy_path(timestamped_filepath, clean_filepath)
            print(f"⚠️ Source JSON is not an object; copied file without filtering: {clean_filename}")
            return copied

        # Default behavior: copy the file directly
        copied = copy_path(timestamped_filepath, clean_filepath)
        print(f"✅ Created clean copy: {clean_filename} (from {os.path.basename(timestamped_filepath)})")
        return copied
    except Exception as e:
        # Re-raise with context while preserving the original traceback
        raise RuntimeError(
            f"Failed to create clean copy from '{timestamped_filepath}' "
            f"to '{clean_filepath}': {e}"
        ) from e


def load_latest_output(
    pipeline: str,
    step: str,
    run_id: str,
    as_text: bool = False,
    raw: bool = False,
    subfolder: Optional[str] = None
) -> Optional[Union[str, Dict[str, Any]]]:
    """
    Load the latest output for a specific pipeline step and run ID.

    Args:
        pipeline: Pipeline name (e.g. 'lessons', 'roadmap')
        step: Step name (e.g. 'extract_topics', 'generate_lessons')
        run_id: Run identifier
        as_text: If True, return raw text instead of parsed JSON
        raw: If True, load from the raw output file instead
        subfolder: Optional subfolder within the pipeline/run_id directory

    Returns:
        The loaded data or None if no matching file was found
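
    Example (illustrative; names are hypothetical):
        topics = load_latest_output("lessons", "extract_topics", run_id)
        raw = load_latest_output("lessons", "extract_topics", run_id, raw=True)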
    """
    # Find the latest file matching the pattern
    prefix = f"{step}_raw_" if raw else f"{step}_"
    extension = "txt" if raw or as_text else "json"
    output_dir = f"output/{pipeline}/{run_id}"
    if subfolder:
        output_dir = f"{output_dir}/{subfolder}"

    if use_local_storage():
        pattern = os.path.join(output_dir, f"{prefix}*.{extension}")
        files = glob.glob(pattern)
        if not raw:
            # Raw debug dumps also match the step prefix; exclude them here
            files = [f for f in files
                     if not os.path.basename(f).startswith(f"{step}_raw_")]
        if not files:
            return None

        # Get the most recent file by sorting by modification time
        latest_file = max(files, key=os.path.getmtime)

        # Load the file
        with open(latest_file, "r", encoding="utf-8") as f:
            if as_text or raw:
                return f.read()
            return json.load(f)

    objects = list_objects(output_dir)
    candidates = []
    suffix = f".{extension}"
    for item in objects:
        key = item.get("key", "")
        basename = os.path.basename(key)
        if not (basename.startswith(prefix) and basename.endswith(suffix)):
            continue
        if not raw and basename.startswith(f"{step}_raw_"):
            # Skip raw debug dumps, which also match the step prefix
            continue
        candidates.append(item)

    if not candidates:
        return None

    latest_obj = max(candidates, key=lambda x: x.get("last_modified", 0))
    latest_uri = uri_from_key(latest_obj["key"])
    if as_text or raw:
        return read_text(latest_uri)
    return read_json(latest_uri)


def save_markdown_lesson(
    content: str,
    topic: str,
    run_id: str,
    index: int
) -> str:
    """
    Save a generated markdown lesson file.

    Args:
        content: Markdown content for the lesson
        topic: The topic name
        run_id: Run identifier
        index: Numeric index for ordering

    Returns:
        Path or URI to the saved file
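
    Example (illustrative; topic and content are hypothetical):
        save_markdown_lesson("# Greetings\n...", "Basic Greetings", run_id, 1)
        # -> output/lessons/<run_id>/lessons-markdown/01-basic-greetings.md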
    """
    # Imported lazily so the slugify dependency is only needed at call time
    from slugify import slugify

    # Create a slugified filename with numeric prefix
    slug = slugify(topic)
    filename = f"{index:02d}-{slug}.md"

    # Ensure the directory exists
    output_dir = f"output/lessons/{run_id}/lessons-markdown"
    ensure_dir(output_dir)

    # Save the file (joined with "/" so the path stays valid for remote storage)
    filepath = f"{output_dir}/{filename}"
    return write_text(filepath, content, content_type="text/markdown; charset=utf-8")
