"""
Extract topics directly from an uploaded PDF via GPT-4o.
"""
import os
import json
from typing import Dict, Any, List, Optional

from utils.openai_file import upload_pdf_for_vision
from utils.io import load_latest_output, save_output
from chains.base import build_chain, default_json_parser, parse_output

def parse_topics(output: str) -> Dict[str, List[Dict[str, str]]]:
    """
    Turn raw LLM output into a validated topics structure.

    The output is handed to parse_output with default_json_parser, and the
    result is checked for this shape: a "root" entry holding a list of
    dictionaries, each containing the keys "topic", "content", "duration"
    and "notes". Only key presence is checked; values are not inspected.

    Args:
        output: Raw output from the LLM

    Returns:
        Dict: The parsed structure, unchanged, once it passes validation

    Raises:
        ValueError: If "root" is missing, is not a list, or an entry is not
            a dictionary or lacks one of the required keys.
        Exception: Anything raised by parse_output is reported via print and
            re-raised as-is.
    """
    required_fields = ("topic", "content", "duration", "notes")

    try:
        parsed = parse_output(output, default_json_parser)

        # Top-level shape: {"root": [...]}
        if "root" not in parsed:
            raise ValueError("Expected 'root' key in parsed output")

        entries = parsed["root"]
        if not isinstance(entries, list):
            raise ValueError("Expected 'root' to be a list")

        # Per-entry shape: a dict carrying every required key
        for i, entry in enumerate(entries):
            if not isinstance(entry, dict):
                raise ValueError(f"Topic at index {i} is not a dictionary")

            # Report the first missing key, in required_fields order
            field = next(
                (name for name in required_fields if name not in entry),
                None,
            )
            if field is not None:
                raise ValueError(f"Topic at index {i} is missing required field '{field}'")
    except Exception as e:
        # Surface the failure on stdout, then let the caller handle it
        print(f"Error parsing topics: {e}")
        raise

    return parsed

def extract_topics_from_pdf(
    pdf_path: str,
    run_id: str,
    force_text: bool = False
) -> Dict[str, List[Dict[str, str]]]:
    """
    Produce the parsed topics for a PDF and save them for the run.

    The raw text comes from one of two places:

    * When force_text is True, the latest raw output saved for the
      "extract_topics" step of the "lessons" pipeline is looked up and
      used if it is non-empty.
    * Otherwise, or when that lookup yields nothing, the PDF is uploaded
      with upload_pdf_for_vision and the "extract_topics" chain is run
      against the resulting file ID.

    Either way the raw text is passed through parse_topics and the result
    is saved under step "topics" / subfolder "topics" before being returned.

    Args:
        pdf_path: Path to the PDF file
        run_id: Run identifier
        force_text: If True, prefer existing raw text output over calling
            the API; the API is still used when no such output is found

    Returns:
        Dict: Extracted topics, as returned by parse_topics
    """
    raw_text = None
    reuse_saved = False

    if force_text:
        print(f"Checking for existing raw output for run ID: {run_id}")
        raw_text = load_latest_output(
            pipeline="lessons",
            step="extract_topics",
            run_id=run_id,
            as_text=True,
            raw=True
        )
        reuse_saved = bool(raw_text)

    if reuse_saved:
        print(f"Using existing raw output for extract_topics in lessons/{run_id}")
    else:
        # Nothing to reuse: send the PDF through the extraction chain
        print(f"Uploading PDF for vision: {pdf_path}")
        file_id = upload_pdf_for_vision(pdf_path)

        print(f"Extracting topics from PDF with file ID: {file_id}")
        chain_result = build_chain(
            chain_name="extract_topics",
            pipeline="lessons",
            run_id=run_id,
            file_id=file_id,
            input_variables={}  # The PDF itself is the only input
        )
        raw_text = chain_result["output"]

    # Both sources share the same parse-and-save tail
    topics = parse_topics(raw_text)
    save_output(
        data=topics,
        pipeline="lessons",
        step="topics",
        run_id=run_id,
        subfolder="topics"
    )
    return topics

def load_topics(run_id: str) -> Optional[Dict[str, List[Dict[str, str]]]]:
    """
    Fetch the topics most recently saved for a run.

    Looks up the latest output of the "topics" step (subfolder "topics")
    in the "lessons" pipeline via load_latest_output.

    Args:
        run_id: Run identifier

    Returns:
        Dict: Whatever load_latest_output yields for that step; presumably
        None when nothing has been saved yet (confirm against utils.io)
    """
    lookup = {
        "pipeline": "lessons",
        "step": "topics",
        "run_id": run_id,
        "subfolder": "topics",
    }
    return load_latest_output(**lookup)