import json
import re
import string
from typing import List, Optional, Tuple

import pandas as pd
import spacy
from bs4 import BeautifulSoup  # For HTML cleaning
from tqdm.auto import tqdm
from unidecode import unidecode

with open("config.json", "r") as json_file:
    cfg = json.load(json_file)

# Load spaCy's English model.
nlp = spacy.load("en_core_web_sm")

# Precompile regex patterns once for efficiency.
MULTI_SPACE_PATTERN = re.compile(r"\s+")
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

# Vietnamese character inventory used to validate sentences.
vowels_lower = ("aáàảãạ"
                "ăắằẳẵặ"
                "âấầẩẫậ"
                "eéèẻẽẹ"
                "êếềểễệ"
                "iíìỉĩị"
                "oóòỏõọ"
                "ôốồổỗộ"
                "ơớờởỡợ"
                "uúùủũụ"
                "ưứừửữự"
                "yýỳỷỹỵ")
vowels_upper = ("AÁÀẢÃẠ"
                "ĂẮẰẲẴẶ"
                "ÂẤẦẨẪẬ"
                "EÉÈẺẼẸ"
                "ÊẾỀỂỄỆ"
                "IÍÌỈĨỊ"
                "OÓÒỎÕỌ"
                "ÔỐỒỔỖỘ"
                "ƠỚỜỞỠỢ"
                "UÚÙỦŨỤ"
                "ƯỨỪỬỮỰ"
                "YÝỲỶỸỴ")
alphabet_lower = "abcdefghijklmnopqrstuvwxyz"
alphabet_upper = alphabet_lower.upper()
consonants_lower = "bcdđghklmnpqrstvx"
consonants_upper = consonants_lower.upper()
allowed_punctuations = string.punctuation + " "
digits = "0123456789"

# Combine all allowed characters into one string.
allowed_pattern = "".join(
    sorted(
        set(vowels_lower + vowels_upper + alphabet_lower + alphabet_upper +
            consonants_lower + consonants_upper + allowed_punctuations +
            digits)))

# Escape the allowed characters so that regex meta-characters are taken literally.
escaped_allowed = re.escape(allowed_pattern)
regex_pattern = rf"^[{escaped_allowed}]+$"

# Compile the regex.
VIETNAMESE_ALLOWED_PATTERN = re.compile(regex_pattern)


def validate_vietnamese_sentence(sentence: str) -> bool:
    """Return True if the Vietnamese sentence contains only allowed characters; otherwise, False."""
    return VIETNAMESE_ALLOWED_PATTERN.fullmatch(sentence) is not None


def fix_non_ascii_characters(sentence: str) -> str:
    """Replace non-ASCII characters in the sentence with their closest ASCII equivalents."""
    return unidecode(sentence)


def general_processing(sentence: str, max_length: int = 50, filtering: bool = True) -> Optional[str]:
    """
    Clean a sentence by collapsing extra whitespace and removing HTML markup and URLs.
    If filtering is True, return None for sentences longer than max_length words.
    """
    if filtering and len(sentence.split()) > max_length:
        return None
    sentence = MULTI_SPACE_PATTERN.sub(" ", sentence).strip()
    sentence = BeautifulSoup(sentence, "html.parser").get_text(separator=" ")
    sentence = URL_PATTERN.sub("", sentence)
    return sentence


def english_sentence_processing(sentence: str, max_length: int = 50, filtering: bool = True) -> Optional[str]:
    """
    Process an English sentence by converting non-ASCII characters to their ASCII
    equivalents and then applying the general cleaning steps.
    """
    sentence = fix_non_ascii_characters(sentence)
    return general_processing(sentence, max_length=max_length, filtering=filtering)


def vietnamese_sentence_processing(sentence: str, max_length: int = 50, filtering: bool = True) -> Optional[str]:
    """
    Apply the general cleaning steps to a Vietnamese sentence if it contains only
    allowed characters; otherwise return None.
    """
    if validate_vietnamese_sentence(sentence):
        return general_processing(sentence, max_length=max_length, filtering=filtering)
    return None


class TextPreprocessor:

    def __init__(self, tokenizer, max_length, name):
        """
        Initializes the text preprocessor with a tokenizer and maximum sequence length.

        Args:
            tokenizer: The tokenizer used for tokenizing input and target text.
            max_length: The maximum length for tokenized sequences (inputs and targets).
            name: "mt5" or "mbart50".
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.name = name

    def preprocess_function(self, examples):
        """
        Tokenizes and formats a batch of examples for sequence-to-sequence training.

        Args:
            examples: A dictionary with "en" (English source text) and "vi"
                (Vietnamese target text) keys.

        Returns:
            A dictionary containing tokenized inputs and labels with the necessary
            padding/truncation.
        """
        # Get source and target text; models other than mBART-50 get the
        # configured prefix prepended.
        if self.name == "mbart50":
            inputs = examples["en"]
        else:
            PREFIX = cfg[self.name]["args"]["prefix"]
            inputs = [PREFIX + example for example in examples["en"]]
        targets = examples["vi"]

        # Tokenize both inputs and targets with padding and truncation.
        model_inputs = self.tokenizer(
            inputs,
            text_target=targets,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )

        # Replace padding token ids in labels with -100 so they are ignored in
        # the loss computation.
        model_inputs["labels"] = [[
            (t if t != self.tokenizer.pad_token_id else -100) for t in seq
        ] for seq in model_inputs["labels"]]

        # Preserve original texts for reference or debugging.
        if self.name == "mbart50":
            model_inputs["en"] = examples["en"]
        else:
            PREFIX = cfg[self.name]["args"]["prefix"]
            model_inputs["en"] = [
                PREFIX + example for example in examples["en"]
            ]
        model_inputs["vi"] = examples["vi"]
        return model_inputs

    def preprocess_dataset(self, dataset):
        """
        Applies preprocessing to the entire dataset using `preprocess_function`.

        Args:
            dataset: A Hugging Face Dataset object containing examples with
                "en" and "vi" keys.

        Returns:
            A tokenized and formatted dataset ready for training.
        """
        return dataset.map(self.preprocess_function,
                           batched=True,
                           remove_columns=dataset.column_names)


def clean_sentence(sentence: str) -> str:
    """Remove punctuation and convert to lowercase."""
    # Remove punctuation.
    translator = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(translator)
    # Convert to lowercase.
    return cleaned.lower()


def build_corpus(csv_file: str) -> List[Tuple[List[str], List[str]]]:
    """Read a parallel CSV, clean the sentences, and build a tokenized corpus."""
    df = pd.read_csv(csv_file)
    corpus = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Clean and whitespace-tokenize both sides of the pair.
        eng_cleaned = clean_sentence(row['en'].strip())
        vi_cleaned = clean_sentence(row['vi'].strip())
        eng_words = eng_cleaned.split()
        vi_words = vi_cleaned.split()
        if eng_words and vi_words:  # Skip empty sentence pairs.
            corpus.append((eng_words, vi_words))
    return corpus