"""Roadmap RAG ingestion step.

Loads a source document, chunks it, and stores it in a Chroma vectorstore
for later retrieval-driven lesson generation.
"""
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

from langchain_core.documents import Document
from utils.io import save_output
from utils.pdf_extractor import extract_and_chunk_pdf, chunk_text_for_rag
from utils.rag import (
    DEFAULT_EMBED_BATCH_SIZE,
    DEFAULT_EMBED_MODEL,
    DEFAULT_VECTORSTORE_BATCH_SIZE,
    add_documents_to_vectorstore,
)
from utils.storage import path_exists, materialize_to_local_file


@dataclass
class IngestionConfig:
    """Configuration for the ingestion step.

    Bundles everything the chunking helpers need so they take a single
    argument instead of a long parameter list.
    """

    # Pipeline run identifier; stamped into every chunk's metadata.
    run_id: str
    # Local, resolved path to the source document (PDF or plain text).
    resource_path: Path
    # Name of the Chroma collection the chunks are stored in.
    collection_name: str
    # Directory where the Chroma vectorstore is persisted.
    persist_directory: Path
    # Maximum characters per chunk.
    chunk_size: int = 1000
    # Characters shared between adjacent chunks.
    chunk_overlap: int = 200
    # Optional overrides; None falls back to the defaults in utils.rag.
    embedding_model: Optional[str] = None
    embedding_batch_size: Optional[int] = None
    vectorstore_batch_size: Optional[int] = None


def _load_text_resource(path: Path) -> str:
    """Load plain-text content from a non-PDF resource."""
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Fallback to latin-1 for legacy files
        return path.read_text(encoding="latin-1")


def _chunk_plain_text(text: str, config: IngestionConfig) -> List[Document]:
    """Split raw text into Documents and stamp provenance metadata on each."""
    chunks = chunk_text_for_rag(
        text=text,
        chunk_size=config.chunk_size,
        chunk_overlap=config.chunk_overlap,
    )

    source_file = str(config.resource_path.resolve())
    for position, chunk in enumerate(chunks):
        meta = chunk.metadata or {}
        # Keep chunker-provided values when present; otherwise fill defaults.
        meta.setdefault("chunk_index", position)
        meta.setdefault("source_type", "text")
        # Provenance fields always reflect this ingestion run.
        meta["source_file"] = source_file
        meta["source_filename"] = config.resource_path.name
        meta["run_id"] = config.run_id
        chunk.metadata = meta

    return chunks


def _prepare_documents(config: IngestionConfig) -> List[Document]:
    """Read the configured resource file and turn it into chunked Documents."""
    if config.resource_path.suffix.lower() != ".pdf":
        # Anything that is not a PDF is treated as plain text.
        raw_text = _load_text_resource(config.resource_path)
        return _chunk_plain_text(raw_text, config)

    docs = extract_and_chunk_pdf(
        str(config.resource_path.resolve()),
        chunk_size=config.chunk_size,
        chunk_overlap=config.chunk_overlap,
    )
    # Same provenance stamp for every chunk of this PDF (values are strings,
    # so sharing the dict for update() is safe).
    provenance = {
        "source_type": "pdf",
        "source_file": str(config.resource_path.resolve()),
        "source_filename": config.resource_path.name,
        "run_id": config.run_id,
    }
    for doc in docs:
        meta = doc.metadata or {}
        meta.update(provenance)
        doc.metadata = meta
    return docs


def ingest_resource(
    run_id: str,
    resource_path: str,
    collection_name: Optional[str] = None,
    persist_directory: Optional[str] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    embedding_model: Optional[str] = None,
    embedding_batch_size: Optional[int] = None,
    vectorstore_batch_size: Optional[int] = None,
) -> Dict[str, object]:
    """Process a resource file and store it in the Chroma vectorstore.

    Args:
        run_id: Identifier stamped into chunk metadata and used to derive the
            default collection name.
        resource_path: Location of the source document (PDF or plain text).
        collection_name: Chroma collection; defaults to ``roadmap_rag_<run_id>``.
        persist_directory: Chroma persistence directory; defaults to
            ``.chroma_<collection>``.
        chunk_size: Maximum characters per chunk; must be positive.
        chunk_overlap: Characters shared between adjacent chunks; must be
            non-negative and smaller than ``chunk_size``.
        embedding_model: Optional embedding model override.
        embedding_batch_size: Optional embedding batch-size override.
        vectorstore_batch_size: Optional vectorstore batch-size override.

    Returns:
        A stats dict describing the ingestion; the same dict is persisted via
        ``save_output``.

    Raises:
        ValueError: If the chunking parameters are invalid, or no content
            could be extracted from the resource.
        FileNotFoundError: If ``resource_path`` does not exist.
    """
    # Fail fast on misconfigured chunking instead of letting a degenerate
    # chunk_size/overlap combination flow into the chunkers downstream.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap < 0 or chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )

    if not path_exists(resource_path):
        raise FileNotFoundError(f"Resource file not found: {resource_path}")

    # Remote resources (e.g. object storage) are copied to a local file first.
    local_resource_path = materialize_to_local_file(resource_path)
    resolved_path = Path(local_resource_path).expanduser().resolve()

    collection = collection_name or f"roadmap_rag_{run_id}"
    persist_dir = Path(persist_directory or f".chroma_{collection}").expanduser().resolve()
    persist_dir.mkdir(parents=True, exist_ok=True)

    config = IngestionConfig(
        run_id=run_id,
        resource_path=resolved_path,
        collection_name=collection,
        persist_directory=persist_dir,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        embedding_model=embedding_model,
        embedding_batch_size=embedding_batch_size,
        vectorstore_batch_size=vectorstore_batch_size,
    )

    documents = _prepare_documents(config)
    if not documents:
        raise ValueError(f"No content extracted from resource: {resolved_path}")

    add_documents_to_vectorstore(
        documents=documents,
        collection_name=config.collection_name,
        persist_directory=str(config.persist_directory),
        embedding_model=config.embedding_model,
        embedding_batch_size=config.embedding_batch_size,
        vectorstore_batch_size=config.vectorstore_batch_size,
    )

    # Record the effective settings (with defaults resolved) alongside chunk
    # counts so downstream steps can reproduce the retrieval configuration.
    stats = {
        "run_id": run_id,
        "resource_path": resource_path,
        "materialized_resource_path": str(resolved_path),
        "collection_name": config.collection_name,
        "persist_directory": str(config.persist_directory),
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "total_chunks": len(documents),
        "file_size_bytes": resolved_path.stat().st_size,
        "embedding_model": embedding_model or DEFAULT_EMBED_MODEL,
        "embedding_batch_size": embedding_batch_size or DEFAULT_EMBED_BATCH_SIZE,
        "vectorstore_batch_size": vectorstore_batch_size or DEFAULT_VECTORSTORE_BATCH_SIZE,
    }

    save_output(
        data=stats,
        pipeline="rag-roadmap",
        step="ingest_resource",
        run_id=run_id,
    )

    print(
        "✅ Ingested resource",
        json.dumps(
            {
                "chunks": stats["total_chunks"],
                "collection": stats["collection_name"],
                "persist_directory": stats["persist_directory"],
            }
        ),
    )

    return stats


# Explicit public API — the "_"-prefixed helpers above are module-internal.
__all__ = ["ingest_resource"]
