"""
Utility for extracting and chunking text from PDF files for RAG processing.
"""
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

try:
    from pypdf import PdfReader
except ImportError:
    # Fallback to older import name
    from PyPDF2 import PdfReader

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text content from a PDF file.

    Each non-empty page is prefixed with a "--- Page N ---" marker so
    downstream chunking retains page provenance.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        str: Extracted text content

    Raises:
        FileNotFoundError: If PDF file doesn't exist
        Exception: If PDF reading fails
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    try:
        print(f"[DEBUG] Opening PDF file: {pdf_path}")
        reader = PdfReader(pdf_path)
        parts: List[str] = []

        print(f"[DEBUG] PDF has {len(reader.pages)} pages")
        for i, page in enumerate(reader.pages):
            print(f"[DEBUG] Extracting text from page {i+1}/{len(reader.pages)}")
            # extract_text() may return None on some pages (especially
            # under the PyPDF2 fallback); normalize to "" before .strip().
            page_text = page.extract_text() or ""
            if page_text.strip():  # Only add non-empty pages
                parts.append(f"\n--- Page {i+1} ---\n{page_text}\n")

        # Single join instead of repeated += (avoids quadratic concatenation).
        text = "".join(parts)
        print(f"[DEBUG] Total extracted text length: {len(text)} characters")
        return text

    except Exception as e:
        print(f"[ERROR] Failed to extract text from PDF: {e}")
        raise


def chunk_text_for_rag(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    separators: Optional[List[str]] = None
) -> List[Document]:
    """
    Chunk text into smaller documents for RAG processing.

    Args:
        text: Text content to chunk
        chunk_size: Maximum size of each chunk
        chunk_overlap: Number of characters to overlap between chunks
        separators: Custom separators for splitting (optional); when None,
            a default list ordered from coarsest (paragraph) to finest
            (character) granularity is used

    Returns:
        List[Document]: List of chunked documents, each carrying
        chunk_index / chunk_size / source_type metadata
    """
    if separators is None:
        # Default separators optimized for educational content,
        # tried in order from coarsest to finest split granularity.
        separators = [
            "\n\n",  # Paragraph breaks
            "\n",    # Line breaks
            ". ",    # Sentence endings
            "! ",    # Exclamation endings
            "? ",    # Question endings
            "; ",    # Semicolon
            ", ",    # Comma
            " ",     # Spaces
            ""       # Character level
        ]

    print(f"[DEBUG] Chunking text with chunk_size={chunk_size}, overlap={chunk_overlap}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
        length_function=len
    )

    # Split the text into chunks
    chunks = text_splitter.split_text(text)

    print(f"[DEBUG] Created {len(chunks)} text chunks")

    # Wrap each chunk in a Document with positional metadata.
    return [
        Document(
            page_content=chunk,
            metadata={
                "chunk_index": i,
                "chunk_size": len(chunk),
                "source_type": "pdf_text"
            }
        )
        for i, chunk in enumerate(chunks)
    ]


def extract_and_chunk_pdf(
    pdf_path: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> List[Document]:
    """
    Extract text from PDF and chunk it for RAG processing.

    Convenience wrapper: runs extract_text_from_pdf, then
    chunk_text_for_rag, and tags each chunk with its source file.

    Args:
        pdf_path: Path to the PDF file
        chunk_size: Maximum size of each chunk
        chunk_overlap: Number of characters to overlap between chunks

    Returns:
        List[Document]: List of chunked documents ready for RAG

    Raises:
        ValueError: If no text could be extracted from the PDF
    """
    print(f"[DEBUG] Starting PDF extraction and chunking for: {pdf_path}")

    raw_text = extract_text_from_pdf(pdf_path)
    if not raw_text.strip():
        raise ValueError(f"No text content extracted from PDF: {pdf_path}")

    chunked_docs = chunk_text_for_rag(raw_text, chunk_size, chunk_overlap)

    # Stamp provenance onto every chunk so retrieval hits can be traced
    # back to the originating file.
    source_name = Path(pdf_path).name
    for chunk_doc in chunked_docs:
        chunk_doc.metadata["source_file"] = pdf_path
        chunk_doc.metadata["source_filename"] = source_name

    print(f"[DEBUG] Successfully processed PDF into {len(chunked_docs)} chunks")
    return chunked_docs
