fisherman611 committed on
Commit fec5dda · verified
1 Parent(s): c6f9ba7

Upload 3 files

Files changed (3)
  1. models/mt5.py +122 -0
  2. models/rule_based_mt.py +470 -0
  3. models/statistical_mt.py +884 -0
models/mt5.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
5
+
6
+ import torch
7
+ from transformers import MT5TokenizerFast, MT5ForConditionalGeneration # type: ignore
8
+ from datasets import load_dataset
9
+ from peft import LoraConfig, get_peft_model, TaskType
10
+ from dotenv import load_dotenv
11
+ import wandb
12
+ import json
13
+ from utils.helper import TextPreprocessor
14
+ from utils.trainer import train_model
15
+
16
+ load_dotenv()
17
+
18
+
19
+ class MT5Finetuner:
20
+ """Class to handle fine-tuning of mT5 model for translation tasks."""
21
+
22
+ def __init__(self, config_path="config.json"):
23
+ """Initialize with configuration file."""
24
+ with open(config_path, "r") as json_file:
25
+ cfg = json.load(json_file)
26
+
27
+ self.args = cfg["mt5"]["args"]
28
+ self.lora_config = cfg["mt5"]["lora_config"]
29
+
30
+ # Constants
31
+ self.max_len = self.args["max_len"]
32
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
+ self.id = self.args["id"]
34
+ self.initial_learning_rate = self.args["initial_learning_rate"]
35
+ self.model_name = self.args["model_name"]
36
+ self.wandb_project = self.args["wandb_project"]
37
+ self.output_dir = self.args["output_dir"]
38
+ self.name = "mt5"
39
+
40
+ self.model = None
41
+ self.tokenizer = None
42
+ self.train_dataset = None
43
+ self.val_dataset = None
44
+ self.test_dataset = None
45
+
46
+ def setup_wandb(self):
47
+ """Initialize Weights & Biases for experiment tracking."""
48
+ wandb.login(key=os.environ.get("WANDB_API"), relogin=True)
49
+ wandb.init(project=self.wandb_project, name="mt5-finetune-lora")
50
+
51
+ def load_model_and_tokenizer(self):
52
+ """Load the mT5 model and tokenizer."""
53
+ self.tokenizer = MT5TokenizerFast.from_pretrained(self.model_name, legacy=False)
54
+ self.model = MT5ForConditionalGeneration.from_pretrained(self.model_name)
55
+ self.model.config.use_cache = False # Disable cache for training
56
+
57
+ def load_datasets(self):
58
+ """Load training, validation, and test datasets."""
59
+ data_files = {
60
+ "train": "data/train_cleaned_dataset.csv",
61
+ "test": "data/test_cleaned_dataset.csv",
62
+ "val": "data/val_cleaned_dataset.csv",
63
+ }
64
+
65
+ if self.id is not None:
66
+ training_parts = [
67
+ f"[{i * 200000 if i > 0 else ''}:{(i + 1) * 200000 if i < 10 else ''}]"  # contiguous 200k-row shards; the last shard runs to the end of the split
68
+ for i in range(11)
69
+ ]
70
+ self.train_dataset = load_dataset(
71
+ "csv", data_files=data_files, split=f"train{training_parts[self.id]}"
72
+ )
73
+ self.test_dataset = load_dataset("csv", data_files=data_files, split="test")
74
+ self.val_dataset = load_dataset(
75
+ "csv", data_files=data_files, split="val[:20000]"
76
+ )
77
+ else:
78
+ self.train_dataset = load_dataset(
79
+ "csv", data_files=data_files, split="train[:1000000]"
80
+ )
81
+ self.test_dataset = load_dataset("csv", data_files=data_files, split="test[:100000]")
82
+ self.val_dataset = load_dataset("csv", data_files=data_files, split="val[:100000]")
83
+
84
+ def configure_lora(self):
85
+ """Apply LoRA configuration to the model."""
86
+ lora_config = LoraConfig(
87
+ task_type=TaskType.SEQ_2_SEQ_LM,
88
+ r=self.lora_config["r"],
89
+ lora_alpha=self.lora_config["lora_alpha"],
90
+ target_modules=self.lora_config["target_modules"],
91
+ lora_dropout=self.lora_config["lora_dropout"],
92
+ )
93
+ self.model = get_peft_model(self.model, lora_config) # type: ignore
94
+
95
+ def finetune(self):
96
+ """Orchestrate the fine-tuning process."""
97
+ self.setup_wandb()
98
+ self.load_model_and_tokenizer()
99
+ self.load_datasets()
100
+
101
+ preprocessor = TextPreprocessor(self.tokenizer, self.max_len, name="mt5")
102
+ tokenized_train_dataset = preprocessor.preprocess_dataset(self.train_dataset)
103
+ tokenized_eval_dataset = preprocessor.preprocess_dataset(self.val_dataset)
104
+
105
+ self.configure_lora()
106
+ self.model.print_trainable_parameters() # type: ignore
107
+
108
+ train_model(
109
+ model=self.model,
110
+ tokenizer=self.tokenizer,
111
+ train_dataset=tokenized_train_dataset,
112
+ eval_dataset=tokenized_eval_dataset,
113
+ output_dir=self.output_dir,
114
+ initial_learning_rate=self.initial_learning_rate,
115
+ name=self.name,
116
+ val_dataset=self.val_dataset,
117
+ )
118
+
119
+
120
+ if __name__ == "__main__":
121
+ finetuner = MT5Finetuner()
122
+ finetuner.finetune()
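For reference, MT5Finetuner reads all of its hyperparameters from config.json under the "mt5" key. Below is a minimal sketch of a compatible file, assuming illustrative placeholder values; the model name, LoRA settings, and paths are assumptions, not the project's actual configuration.

import json

# Hypothetical config.json writer matching the keys MT5Finetuner reads.
# Every value here is a placeholder assumption.
config = {
    "mt5": {
        "args": {
            "max_len": 128,                    # max tokenized sequence length
            "id": None,                        # shard id (0-10) or None for the default split
            "initial_learning_rate": 1e-4,     # passed through to train_model
            "model_name": "google/mt5-small",  # any mT5 checkpoint
            "wandb_project": "mt5-en-vi",      # Weights & Biases project name
            "output_dir": "checkpoints/mt5",   # where checkpoints are written
        },
        "lora_config": {
            "r": 16,
            "lora_alpha": 32,
            "target_modules": ["q", "v"],      # attention projections to adapt
            "lora_dropout": 0.05,
        },
    }
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)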
models/rule_based_mt.py ADDED
@@ -0,0 +1,470 @@
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
5
+
6
+ import re
7
+ import nltk
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.tag import pos_tag
10
+ from nltk.parse import ChartParser, ViterbiParser
11
+ from nltk.grammar import CFG, PCFG, Nonterminal, ProbabilisticProduction
12
+ from nltk.tree import Tree
13
+ import contractions
14
+ import string
15
+ from collections import defaultdict
16
+ import spacy
17
+
18
+ nlp = spacy.load("en_core_web_sm")
19
+
20
+ import json
21
+
22
+ with open("data/en_vi_dictionary.json", "r", encoding='utf-8') as json_file:
23
+ dictionary = json.load(json_file)
24
+
25
+ with open('grammar.txt', 'r', encoding='utf-8') as text_file:
26
+ grammar = text_file.read()
27
+
28
+
29
+ class TransferBasedMT:
30
+
31
+ def __init__(self) -> None:
32
+ # English - Vietnamese dictionary
33
+ self.dictionary = dictionary
34
+
35
+ # Define the CFG grammar for English sentence structure
36
+ self.grammar = grammar
37
+
38
+
39
+ ################################################ STAGE 1: PREPROCESSING SOURCE SENTENCE ###################################################
40
+
41
+ def preprocessing(self, sentence: str) -> str:
42
+ """Preprocess the input sentence: handle named entities, lowercase, expand contractions, and tokenize and regroup."""
43
+ # Handle named entities, e.g. New York -> New_York
44
+ doc = nlp(sentence)
45
+ entities = {ent.text: ent.label_ for ent in doc.ents}
46
+ for ent_text in sorted(entities.keys(), key=len, reverse=True):
47
+ ent_joined = ent_text.replace(" ", "_")
48
+ sentence = sentence.replace(ent_text, ent_joined)
49
+
50
+ # Lowercase and strip redundant space
51
+ sentence = sentence.lower().strip()
52
+
53
+ # Expand contractions, e.g. don't -> do not
54
+ sentence = contractions.fix(sentence)  # type: ignore
55
+
56
+ # Tokenize and regroup tokens
57
+ sentence = " ".join(word_tokenize(sentence))
58
+
59
+ return sentence
60
+
61
+
62
+ def safe_tag(self, tag):
63
+ """Convert tags with special characters to safe nonterminal symbols."""
64
+ return tag.replace("$", "S")
65
+
66
+
67
+ ################################################ STAGE 2: ANALYZE SOURCE SENTENCE #########################################################
68
+
69
+ def analyze_source(self, sentence: str):
70
+ """Analyze the source sentence: tokenize, POS tag, and parse into a syntax tree."""
71
+ doc = nlp(sentence)
72
+ filtered_pos_tagged = []
73
+ punctuation_marks = []
74
+
75
+ for i, token in enumerate(doc):
76
+ word = token.text
77
+ tag = token.tag_
78
+ if all(char in string.punctuation for char in word):
79
+ punctuation_marks.append((i, word, tag))
80
+ else:
81
+ filtered_pos_tagged.append((token.lemma_.lower(), tag))
82
+
83
+ grammar_str = self.grammar
84
+
85
+ # Add terminal rule grammars
86
+ for word, tag in filtered_pos_tagged:
87
+ safe_tag = self.safe_tag(tag)
88
+ escaped_word = word.replace('"', '\\"')
89
+ grammar_str += f'\n{safe_tag} -> "{escaped_word}"'
90
+
91
+ try:
92
+ grammar = CFG.fromstring(grammar_str)
93
+ parser = ChartParser(grammar)
94
+ tagged_tokens_only = [word for word, _ in filtered_pos_tagged]
95
+
96
+ parses = list(parser.parse(tagged_tokens_only)) # Generate parse trees
97
+
98
+ tree = (parses[0] if parses else self._create_fallback_tree(filtered_pos_tagged)) # Use first parse or fallback
99
+ tree = self._add_punctuation_to_tree(tree, punctuation_marks) # Reattach punctuation
100
+
101
+ return tree
102
+
103
+ except Exception as e:
104
+ print(f"Grammar creation error: {e}")
105
+ return self._create_fallback_tree(filtered_pos_tagged) # Fallback on error
106
+
107
+
108
+ def _create_fallback_tree(self, pos_tagged):
109
+ """Create a simple fallback tree when parsing fails."""
110
+ children = [Tree(self.safe_tag(tag), [word]) for word, tag in pos_tagged] # Create leaf nodes for each token
111
+ return Tree("S", children) # Wrap in a sentence node
112
+
113
+
114
+ def _add_punctuation_to_tree(self, tree, punctuation_marks):
115
+ """Add punctuation marks back to the syntax tree."""
116
+ if not punctuation_marks:
117
+ return tree
118
+ if tree.label() == "S": # Only add to root sentence node
119
+ for _, word, tag in sorted(punctuation_marks):
120
+ tree.append(Tree(self.safe_tag(tag), [word]))
121
+ return tree
122
+
123
+
124
+ #################################################### STAGE 3: TRANSFER GRAMMAR ############################################################
125
+
126
+ def transfer_grammar(self, tree):
127
+ """Transfer the English parse tree to Vietnamese structure."""
128
+ if not isinstance(tree, nltk.Tree):
129
+ return tree
130
+
131
+ # Sentence level: recurse through children
132
+ if tree.label() == "S":
133
+ return Tree("S", [self.transfer_grammar(child) for child in tree])
134
+
135
+ # Verb Phrase: adjust word order
136
+ elif tree.label() == "VP":
137
+ children = [self.transfer_grammar(child) for child in tree]
138
+ child_labels = [child.label() if isinstance(child, Tree) else child for child in children]
139
+
140
+ if (len(children) >= 3 and "V" in child_labels and "To" in child_labels and "VP" in child_labels): # Remove TO from V TO VP
141
+ return Tree("VP", [children[0], children[2]])
142
+
143
+ return Tree("VP", children) # Default: preserve order
144
+
145
+ # Noun Phrase: adjust word order
146
+ elif tree.label() == "NP":
147
+ children = [self.transfer_grammar(child) for child in tree]
148
+ child_labels = [child.label() if isinstance(child, Tree) else child for child in children]
149
+
150
+ if (len(children) >= 3 and 'Det' in child_labels and 'AdjP' in child_labels and 'N' in child_labels): # Reorder Det Adj N -> Det N Adj
151
+ return Tree("NP", [children[0], children[2], children[1]])
152
+
153
+ elif (len(children) >= 2 and 'PRPS' in child_labels and 'N' in child_labels): # Reorder PRPS N -> N PRPS
154
+ return Tree("NP", [children[1], children[0]])
155
+
156
+ elif (len(children) >= 2 and 'Det' in child_labels and 'N' in child_labels): # Remove Det from Det N
157
+ return Tree("NP", [children[1]])
158
+
159
+ return Tree("NP", children) # Default: preserve order
160
+
161
+ # Prepositional Phrase: adjust word order
162
+ elif tree.label() == "PP":
163
+ children = [self.transfer_grammar(child) for child in tree]
164
+ return Tree("PP", children) # Default: preserve order
165
+
166
+ # Adverbial Phrase: adjust word order
167
+ elif tree.label() == 'AdvP':
168
+ children = [self.transfer_grammar(child) for child in tree]
169
+ return Tree("AdvP", children) # Default: preserve order
170
+
171
+ # Adjective Phrase: adjust word order
172
+ elif tree.label() == 'AdjP':
173
+ children = [self.transfer_grammar(child) for child in tree]
174
+ return Tree("AdjP", children) # Default: preserve order
175
+
176
+ # Wh-Question: adjust word order
177
+ elif tree.label() == "WhQ":
178
+ children = [self.transfer_grammar(child) for child in tree]
179
+ child_labels = [child.label() if isinstance(child, Tree) else child for child in children]
180
+
181
+ if len(children) >= 4 and "WH_Word" in child_labels and "AUX" in child_labels and "NP" in child_labels and "VP" in child_labels:
182
+ return Tree("WhQ", [children[2], children[3], children[0]]) # Remove AUX from WH_Word AUX NP VP
183
+
184
+ elif len(children) >= 3 and "WH_Word" in child_labels and "NP" in child_labels and "VP" in child_labels and "AUX" not in child_labels:
185
+ return Tree("WhQ", [children[1], children[2], children[0]])
186
+
187
+ elif len(children) >= 2 and "WH_Word" in child_labels and "VP" in child_labels:
188
+ if len(children[1]) >= 2:
189
+ return Tree("WhQ", [children[1][1], children[1][0], children[0]]) # WH_Word VP -> WH_Word V NP
190
+
191
+ else:
192
+ return Tree("WhQ", children) # Default: preserve order
193
+
194
+ # Yes/No-Question: adjust word order
195
+ elif tree.label() == "YNQ":
196
+ children = [self.transfer_grammar(child) for child in tree]
197
+ child_labels = [child.label() if isinstance(child, Tree) else child for child in children]
198
+
199
+ if len(children) >= 3 and "AUX" in child_labels and "NP" in child_labels and "VP" in child_labels:
200
+ return Tree("YNQ", [children[1], children[2]])
201
+
202
+ elif len(children) >= 3 and "DO" in child_labels and "NP" in child_labels and "VP" in child_labels:
203
+ return Tree("YNQ", [children[1], children[2]])
204
+
205
+ elif len(children) >= 3 and "MD" in child_labels and "NP" in child_labels and "VP" in child_labels:
206
+ return Tree("YNQ", [children[1], children[2]])
207
+
208
+ return Tree("YNQ", children)
209
+
210
+
211
+ # Other labels: recurse through children
212
+ else:
213
+ return Tree(tree.label(), [self.transfer_grammar(child) for child in tree])
214
+
215
+
216
+ #################################################### STAGE 4: GENERATION STAGE ############################################################
217
+
218
+ def generate(self, tree):
219
+ """Generate Vietnamese output from the transformed tree."""
220
+ if not isinstance(tree, nltk.Tree):
221
+ return self._lexical_transfer(tree) # Translate leaf nodes
222
+
223
+ words = [w for w in (self.generate(child) for child in tree) if w] # Recurse once per child and keep non-empty results
224
+
225
+ # Handle questions specifically
226
+ if tree.label() == "WhQ":
227
+ words = self._process_wh_question(tree, words)
228
+ elif tree.label() == "YNQ":
229
+ words = self._process_yn_question(tree, words)
230
+ elif tree.label() == "NP": # Add classifiers for nouns
231
+ words = self._add_classifiers(tree, words)
232
+ elif tree.label() == "VP": # Apply tense/aspect/mood markers
233
+ words = self._apply_tam_mapping(tree, words)
234
+
235
+ words = self._apply_agreement(tree, words) # Handle agreement (e.g., plurals)
236
+ result = " ".join(words) # Join words into a string
237
+
238
+ return result
239
+
240
+
241
+ def _process_wh_question(self, tree, words):
242
+ """Process a Wh-question structure for Vietnamese."""
243
+ words = [w for w in words if w]
244
+
245
+ wh_word = None
246
+ for word in words:
247
+ if word in ["cái gì", "ai", "ở đâu", "khi nào", "tại sao", "như thế nào", "cái nào", "của ai"]:
248
+ wh_word = word
249
+ break
250
+
251
+ if wh_word == "tại sao":
252
+ if words and words[0] != "tại sao":
253
+ words.remove("tại sao")
254
+ words.insert(0, "tại sao")
255
+ elif wh_word == "như thế nào":
256
+ if "vậy" not in words:
257
+ words.append("vậy")
258
+
259
+ question_particles = ["vậy", "thế", "à", "hả"]
260
+ has_particle = any(particle in words for particle in question_particles)
261
+
262
+ if not has_particle and wh_word != "tại sao":
263
+ words.append("vậy")
264
+
265
+ return words
266
+
267
+
268
+ def _process_yn_question(self, tree, words):
269
+ """Process a Yes/No question structure for Vietnamese."""
270
+
271
+ words = [w for w in words if w not in ["", "do_vn", "does_vn", "did_vn"]]
272
+
273
+ has_question_particle = any(w in ["không", "à", "hả", "nhỉ", "chứ"] or
274
+ w in ["không_vn", "à_vn", "hả_vn", "nhỉ_vn", "chứ_vn"]
275
+ for w in words)
276
+
277
+ if not has_question_particle:
278
+ if "đã" in words or "đã_vn" in words:
279
+ words.append("phải không")
280
+ else:
281
+ words.append("không")
282
+
283
+ return words
284
+
285
+
286
+ def _lexical_transfer(self, word):
287
+ """Translate English words to Vietnamese using the dictionary."""
288
+ if word in self.dictionary:
289
+ return self.dictionary[word] # Return translation if in dictionary
290
+ return f"{word}_vn" # Mark untranslated words with _vn suffix
291
+
292
+
293
+ def _add_classifiers(self, np_tree, words):
294
+ """Add Vietnamese classifiers based on nouns."""
295
+ # noun_indices = [
296
+ # i for i, child in enumerate(np_tree) if isinstance(child, Tree)
297
+ # and child.label() in ["N", "NN", "NNS", "NNP", "NNPS"]
298
+ # ] # Find noun positions
299
+ # for i in noun_indices:
300
+ # if len(words) > i and not any(words[i].startswith(prefix) for prefix in ["một_vn", "những_vn", "các_vn"]): # Check if classifier is needed
301
+ # if words[i].endswith("_vn"): # Add default classifier for untranslated nouns
302
+ # words.insert(i, "cái_vn")
303
+ return words
304
+
305
+
306
+ def _apply_tam_mapping(self, vp_tree, words):
307
+ """Apply Vietnamese TAM (Tense, Aspect, Mood) markers to the word list.
308
+
309
+ Args:
310
+ vp_tree: A parse tree node representing the verb phrase.
311
+ words: List of words to be modified with TAM markers.
312
+
313
+ Returns:
314
+ List of words with appropriate Vietnamese TAM markers inserted.
315
+ """
316
+ verb_tense = None
317
+ mood = None
318
+
319
+ # Identify verb tense and mood from the verb phrase tree
320
+ for child in vp_tree:
321
+ if isinstance(child, Tree):
322
+ if child.label() in ["V", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
323
+ verb_tense = child.label()
324
+ if child.label() == "MD": # Modal verbs indicating mood
325
+ mood = "indicative"
326
+ elif child.label() == "TO": # Infinitive marker, often subjunctive
327
+ mood = "subjunctive"
328
+
329
+ if not verb_tense:
330
+ print("Warning: No verb tense identified in the verb phrase tree.")
331
+ return words
332
+
333
+ # Apply TAM markers based on verb tense
334
+ if verb_tense == "VBD":
335
+ words.insert(0, "đã_vn")
336
+ elif verb_tense == "VB":
337
+ if "will_vn" in words:
338
+ words = [w for w in words if w != "will_vn"]
339
+ words.insert(0, "sẽ_vn")
340
+ elif "going_to_vn" in words:
341
+ words = [w for w in words if w != "going_to_vn"]
342
+ words.insert(0, "sẽ_vn")
343
+ elif verb_tense == "VBG":
344
+ words.insert(0, "đang_vn")
345
+ if "đã_vn" in words:
346
+ words.insert(0, "đã_vn")
347
+ elif verb_tense == "VBN":
348
+ words.insert(0, "đã_vn")
349
+ elif verb_tense == "VBP" or verb_tense == "VBZ":
350
+ pass
351
+
352
+ # Handle future continuous (e.g., "will be running" -> "sẽ đang")
353
+ if verb_tense == "VBG" and "will_vn" in words:
354
+ words = [w for w in words if w != "will_vn"]
355
+ words.insert(0, "đang_vn") # Continuous marker
356
+ words.insert(0, "sẽ_vn") # Future marker
357
+
358
+ # Apply mood markers if applicable
359
+ if mood == "subjunctive":
360
+ words.insert(0, "nếu_vn") # Subjunctive marker (e.g., "if" clause)
361
+ elif mood == "indicative" and "must_vn" in words:
362
+ words = [w for w in words if w != "must_vn"]
363
+ words.insert(0, "phải_vn") # Necessity marker
364
+
365
+ return words
366
+
367
+
368
+ def _apply_agreement(self, tree, words):
369
+ """Apply agreement rules for Vietnamese (e.g., pluralization)."""
370
+ if tree.label() == "NP":
371
+ for i, word in enumerate(words):
372
+ if "_vn" in word and word.replace("_vn", "").endswith("s"): # Handle English plurals
373
+ base_word = word.replace("_vn", "")[:-1] + "_vn" # Remove 's'
374
+ words[i] = base_word
375
+ words.insert(i, "các_vn") # Add plural marker
376
+ return words
377
+
378
+
379
+ def _post_process_vietnamese(self, text):
380
+ """Post-process the Vietnamese output: remove _vn, fix punctuation, capitalize."""
381
+ text = text.replace("_vn", "") # Remove untranslated markers
382
+
383
+ def fix_entities(word):
384
+ if "_" in word:
385
+ word = " ".join(word.split("_"))
386
+ return word.title()
387
+ return word.lower() # Lowercase non-entity words
388
+
389
+ words = text.split()
390
+ words = [fix_entities(word) for word in words]
391
+
392
+ text = " ".join(words)
393
+ for punct in [".", ",", "!", "?", ":", ";"]: # Attach punctuation directly
394
+ text = text.replace(f" {punct}", punct)
395
+
396
+ if text:
397
+ words = text.split()
398
+ words[0] = words[0].capitalize() # Capitalize first word
399
+ text = ' '.join(words)
400
+ return text
401
+
402
+
403
+ def translate(self, english_sentence):
404
+ """Main translation function that applies all stages of the process."""
405
+ # Step 1: Preprocess input
406
+ preprocessed = self.preprocessing(english_sentence)
407
+
408
+ # Step 2: Parse English sentence
409
+ source_tree = self.analyze_source(preprocessed)
410
+ print("English parse tree:")
411
+ source_tree.pretty_print() # Display English parse tree
412
+
413
+ # Step 3: Transform to Vietnamese structure
414
+ target_tree = self.transfer_grammar(source_tree)
415
+ print("Vietnamese structure tree:")
416
+ target_tree.pretty_print() # Display Vietnamese parse tree
417
+
418
+ # Step 4: Generate final translation
419
+ raw_output = self.generate(target_tree)
420
+ vietnamese_output = self._post_process_vietnamese(raw_output)
421
+ return vietnamese_output
422
+
423
+
424
+ if __name__ == "__main__":
425
+ translator = TransferBasedMT()
426
+ test_sentences = [
427
+ "I read books.", "The student studies at school.",
428
+ "She has a beautiful house.", "They want to buy a new car.",
429
+ "This is a good computer.", "Are you ready to listen?",
430
+ "I want to eat.", "This is my book.","What is your name?",
431
+ "Do you like books?",
432
+ "Is she at school?",
433
+ "Are you ready to listen?",
434
+ "Can they buy a new car?",
435
+ "Did he read the book yesterday?",
436
+ "What is your name?",
437
+ "Where do you live?",
438
+ "Who is your teacher?",
439
+ "When will you go to school?",
440
+ "Why did he leave early?",
441
+ "How do you feel today?",
442
+ "I live in New York"
443
+ ]
444
+
445
+ test_sentences_2 = [
446
+ # YNQ -> BE NP
447
+ "Is the renowned astrophysicist still available for the conference?",
448
+ "Are those adventurous explorers currently in the remote jungle?",
449
+ "Was the mysterious stranger already gone by midnight?",
450
+ # YNQ -> BE NP Adj
451
+ "Is the vibrant annual festival exceptionally spectacular this season?",
452
+ "Are the newly discovered species remarkably resilient to harsh climates?",
453
+ "Were the ancient ruins surprisingly well-preserved after centuries?",
454
+ # YNQ -> BE NP NP
455
+ "Is she the brilliant leader of the innovative research team?",
456
+ "Are they the enthusiastic organizers of the grand charity event?",
457
+ "Was he the sole survivor of the perilous expedition?",
458
+ # YNQ -> BE NP PP
459
+ "Is the priceless artifact still hidden in the ancient underground chamber?",
460
+ "Are the colorful tropical birds nesting high above the lush rainforest canopy?",
461
+ "Was the historic manuscript carefully stored within the fortified library vault?"
462
+ ]
463
+
464
+ print("English to Vietnamese Translation Examples:")
465
+ print("-" * 50)
466
+ for sentence in test_sentences_2:
467
+ print(f"English: {sentence}")
468
+ translation = translator.translate(sentence)
469
+ print(f"Vietnamese: {translation}")
470
+ print()
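TransferBasedMT loads its sentence-level rules from grammar.txt, which is not part of this upload; analyze_source then appends terminal rules (e.g. N -> "book") generated from the POS-tagged input before parsing. A minimal sketch of the kind of CFG fragment the parser expects, using a few of the nonterminals the transfer rules match on; the productions below are illustrative assumptions, not the project's actual grammar.

from nltk.grammar import CFG
from nltk.parse import ChartParser

# Hypothetical grammar fragment in the style grammar.txt is assumed to use:
# nonterminal rules only, with terminal rules appended per sentence at parse time.
grammar_fragment = """
S -> NP VP
NP -> Det N | Det AdjP N | PRPS N | N
VP -> V NP | V To VP | V
"""

# Terminal rules of the kind analyze_source appends from the tagged tokens.
terminals = """
Det -> "the"
N -> "student" | "book"
V -> "read"
"""

parser = ChartParser(CFG.fromstring(grammar_fragment + terminals))
for tree in parser.parse(["the", "student", "read", "the", "book"]):
    tree.pretty_print()
    break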
models/statistical_mt.py ADDED
@@ -0,0 +1,884 @@
1
+ import pandas as pd
2
+ from nltk.translate import AlignedSent
3
+ from nltk.translate.ibm1 import IBMModel1
4
+ from nltk.lm import MLE
5
+ from nltk.lm.preprocessing import padded_everygram_pipeline
6
+ from collections import defaultdict, Counter
7
+ import math
8
+ import os
9
+ from tqdm import tqdm
10
+ import pickle
11
+ import random
12
+ import gc
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ import contractions
16
+ BILINGUAL_DATA_PATH = "bilingual_cleaned_dataset.csv" # Default bilingual dataset path
17
+ VIE_DATA_PATH = "vie_cleaned_dataset.csv" # Default Vietnamese dataset path
18
+ VISUALIZATION_PATH = "visualizations" # Default visualization output path
19
+ BEAM_SIZE = 3
20
+ MAX_PHRASE_LENGTH = 7
21
+ LM_ORDER = 3
22
+ ALPHA = 0.7
23
+ BETA = 0.3
24
+ BATCH_SIZE = 1000 # For processing data in batches
25
+ MIN_PHRASE_COUNT = 3 # Increased threshold to reduce phrase table size
26
+ LIMIT_VOCAB = 100000 # Limit vocabulary to the 100,000 most frequent words
27
+ MODE_VISUALIZATION = False # Set to True to enable visualizations
28
+ from pyvi import ViTokenizer
29
+ from nltk.tokenize import word_tokenize
30
+
31
+
32
+
33
+
34
+ ################################################## 1. Language Model ##################################################
35
+ class LanguageModel:
36
+ """Memory-optimized Language Model"""
37
+ def __init__(self, order=LM_ORDER, MODE_VISUALIZATION=MODE_VISUALIZATION):
38
+ self.order = order
39
+ self.lm = None
40
+ self.vocab_size = 0
41
+ self.MODE_VISUALIZATION = MODE_VISUALIZATION
42
+
43
+ def preprocess(self, text):
44
+ """Tokenize Vietnamese words"""
45
+ # return text.lower().split()
46
+ return ViTokenizer.tokenize(text.lower()).split()
47
+
48
+ def visualize_iterations(self, word_freq, iteration, batch_tokens, output_dir="/kaggle/working/visualizations"):
49
+ if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
50
+ # Running on Kaggle
51
+ output_dir = "/kaggle/working/visualizations"
52
+ else:
53
+ output_dir = VISUALIZATION_PATH
54
+ os.makedirs(output_dir, exist_ok=True)
55
+
56
+ # Visualize word frequency for a given iteration
57
+ if not self.MODE_VISUALIZATION:
58
+ return
59
+
60
+ print(f"\nIteration {iteration} - Word Frequency (Top 5):")
61
+ top_words = word_freq.most_common(5)
62
+ for word, count in top_words:
63
+ print(f" {word}: {count}")
64
+
65
+ if not os.path.exists(output_dir):
66
+ os.makedirs(output_dir)
67
+
68
+ words, counts = zip(*word_freq.most_common(10)) if word_freq else ([], [])
69
+ if words:
70
+ plt.figure(figsize=(8, 6))
71
+ plt.bar(words, counts, color='purple', alpha=0.7)
72
+ plt.title(f'Word Frequency - Iteration {iteration}')
73
+ plt.xlabel('Words')
74
+ plt.ylabel('Frequency')
75
+ plt.xticks(rotation=45)
76
+ plt.grid(True, axis='y')
77
+ plt.savefig(os.path.join(output_dir, f'word_freq_iter_{iteration}.png'))
78
+ plt.close()
79
+
80
+ def get_probability(self, tokens):
81
+ """Calculate probability P(V) for a vietnamese tokens sequence"""
82
+ if not tokens or not self.lm:
83
+ return 0.0
84
+
85
+ start_tokens = ['<s>'] * (self.order - 1)
86
+ tokens = start_tokens + tokens
87
+ log_prob = 0.0
88
+
89
+ for i in range(self.order - 1, len(tokens)):
90
+ context = tokens[max(0, i - self.order + 1):i]
91
+ word = tokens[i]
92
+ prob = self.lm.score(word, context) or 1e-10
93
+ log_prob += math.log(prob)
94
+
95
+ return log_prob
96
+
97
+ def visualize_log_probabilities(self, sentences, max_sentences=100, output_dir="/kaggle/working/visualizations"):
98
+ if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
99
+ # Running on Kaggle
100
+ output_dir = "/kaggle/working/visualizations"
101
+ else:
102
+ # Running locally
103
+ output_dir = VISUALIZATION_PATH
104
+
105
+ os.makedirs(output_dir, exist_ok=True)
106
+ # Visualize the log probabilities of a sample of sentences
107
+ if not self.MODE_VISUALIZATION:
108
+ return
109
+
110
+ if not self.lm:
111
+ print("Cannot visualize log probabilities: Language model not trained.")
112
+ return
113
+
114
+ # Sample sentences to reduce computation
115
+ sample_size = min(len(sentences), max_sentences)
116
+ sample_sentences = random.sample(sentences, sample_size) if len(sentences) > max_sentences else sentences
117
+
118
+ # Compute log probabilities
119
+ log_probs = []
120
+ for sent in sample_sentences:
121
+ tokens = self.preprocess(sent)
122
+ log_prob = self.get_probability(tokens)
123
+ log_probs.append(log_prob)
124
+
125
+ # Print summary statistics
126
+ print(f"\nLog Probabilities for {len(log_probs)} sentences:")
127
+ print(f" Mean Log Probability: {np.mean(log_probs):.2f}")
128
+ print(f" Min Log Probability: {min(log_probs):.2f}")
129
+ print(f" Max Log Probability: {max(log_probs):.2f}")
130
+
131
+ # Plot histogram of log probabilities
132
+ if not os.path.exists(output_dir):
133
+ os.makedirs(output_dir)
134
+
135
+ plt.figure(figsize=(8, 6))
136
+ plt.hist(log_probs, bins=30, color='blue', alpha=0.7)
137
+ plt.title('Distribution of Log Probabilities for Sentences')
138
+ plt.xlabel('Log Probability')
139
+ plt.ylabel('Frequency')
140
+ plt.grid(True)
141
+ plt.savefig(os.path.join(output_dir, 'log_probabilities.png'))
142
+ plt.close()
143
+ print(f"Log probabilities visualization saved to {output_dir}/log_probabilities.png")
144
+
145
+ def train(self, vietnamese_sentences, max_sentences=200000):
146
+ """Training Language Model with memory optimization"""
147
+ print(f"Training Language Model on {min(len(vietnamese_sentences), max_sentences)} sentences...")
148
+
149
+ # Limit training data for LM to reduce memory
150
+ if len(vietnamese_sentences) > max_sentences:
151
+ print(f"Sampling {max_sentences} sentences from {len(vietnamese_sentences)} for LM training")
152
+ vietnamese_sentences = random.sample(vietnamese_sentences, max_sentences)
153
+
154
+ # Process in batches to reduce memory usage
155
+ all_tokens = []
156
+ batch_size = 10000
157
+ word_freq = Counter()
158
+ iteration = 0
159
+
160
+ for i in range(0, len(vietnamese_sentences), batch_size):
161
+ batch = vietnamese_sentences[i:i+batch_size]
162
+ batch_tokens = [self.preprocess(sent) for sent in batch]
163
+ all_tokens.extend(batch_tokens)
164
+
165
+ # Update word frequency for visualization
166
+ if self.MODE_VISUALIZATION and iteration < 2: # Limit to 2 iterations
167
+ for tokens in batch_tokens:
168
+ word_freq.update(tokens)
169
+ self.visualize_iterations(word_freq, iteration + 1, batch_tokens)
170
+ iteration += 1
171
+
172
+ # Force garbage collection
173
+ if i % (batch_size * 5) == 0:
174
+ gc.collect()
175
+
176
+ vocab = set()
177
+ for tokens in all_tokens:
178
+ vocab.update(tokens)
179
+
180
+ # Limit vocabulary size to most frequent words
181
+ if len(vocab) > LIMIT_VOCAB:
182
+ word_freq = Counter()
183
+ for tokens in all_tokens:
184
+ word_freq.update(tokens)
185
+
186
+ # Keep only top words
187
+ most_common = word_freq.most_common(LIMIT_VOCAB)
188
+ vocab = set(word for word, _ in most_common)
189
+ print(f"Limited vocabulary to {len(vocab)} most frequent words")
190
+
191
+ self.vocab_size = len(vocab)
192
+
193
+ # Filter sentences to contain only vocabulary words
194
+ filtered_sentences = []
195
+ for tokens in all_tokens:
196
+ filtered_tokens = [token for token in tokens if token in vocab]
197
+ if filtered_tokens: # Only add non-empty sentences
198
+ filtered_sentences.append(filtered_tokens)
199
+
200
+ # Clear original data
201
+ del all_tokens
202
+ gc.collect()
203
+
204
+ # Train N-gram model
205
+ train_data, padded_sents = padded_everygram_pipeline(self.order, filtered_sentences)
206
+ self.lm = MLE(self.order)
207
+ self.lm.fit(train_data, padded_sents)
208
+
209
+ # Visualize log probabilities after training
210
+ if self.MODE_VISUALIZATION:
211
+ self.visualize_log_probabilities(vietnamese_sentences)
212
+
213
+ # Clear training data
214
+ del filtered_sentences, train_data, padded_sents
215
+ gc.collect()
216
+
217
+ return {"vocab_size": self.vocab_size, "ngram_order": self.order}
218
+
219
+ ############################################# 2. Translation Model #############################################
220
+
221
+ class TranslationModel:
222
+ """Memory-optimized Translation Model"""
223
+ def __init__(self, max_phrase_length=MAX_PHRASE_LENGTH, MODE_VISUALIZATION=MODE_VISUALIZATION):
224
+ self.max_phrase_length = max_phrase_length
225
+ self.phrase_table = {}
226
+ self.word_alignments = []
227
+ self.MODE_VISUALIZATION = MODE_VISUALIZATION
228
+
229
+ def preprocess(self, text, lang):
230
+ """Preprocess text for both languages"""
231
+ text = text.lower()
232
+ if lang == 'eng':
233
+ text = contractions.fix(text)
234
+ return word_tokenize(text)
235
+ elif lang == 'vie':
236
+ return ViTokenizer.tokenize(text).split()
237
+ else:
238
+ return text.split()
239
+
240
+ def load_bilingual_data_batch(self, file_path, batch_size=BATCH_SIZE):
241
+ """Load bilingual data in batches to reduce memory usage"""
242
+ print(f"Loading bilingual data from {file_path} in batches")
243
+ # default = '/kaggle/input/general-data/bilingual_cleaned_dataset.csv'
244
+ try:
245
+ df = pd.read_csv(file_path)
246
+ except FileNotFoundError:
247
+ file_path = os.path.join('datatest', BILINGUAL_DATA_PATH)
248
+ df = pd.read_csv(file_path)
249
+ total_rows = len(df)
250
+ print(f"Total rows: {total_rows}")
251
+
252
+ for start_idx in range(0, total_rows, batch_size):
253
+ end_idx = min(start_idx + batch_size, total_rows)
254
+ batch_df = df.iloc[start_idx:end_idx]
255
+
256
+ aligned_sentences = []
257
+ for _, row in batch_df.iterrows():
258
+ eng_tokens = self.preprocess(row['en'], 'eng')
259
+ vie_tokens = self.preprocess(row['vi'], 'vie')
260
+
261
+ # Filter out very long sentences to save memory
262
+ if len(eng_tokens) <= 50 and len(vie_tokens) <= 50:
263
+ aligned_sentences.append(AlignedSent(eng_tokens, vie_tokens))
264
+
265
+ yield aligned_sentences
266
+
267
+ # Clean up batch
268
+ del batch_df, aligned_sentences
269
+ gc.collect()
270
+
271
+ def visualize_alignments(self, aligned_sentences, max_sentences=2, output_dir="/kaggle/working/visualizations"):
272
+ if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
273
+ # Running on Kaggle
274
+ output_dir = "/kaggle/working/visualizations"
275
+ else:
276
+ # Running locally
277
+ output_dir = VISUALIZATION_PATH
278
+
279
+ os.makedirs(output_dir, exist_ok=True)
280
+ # Visualize word alignments for a sample of sentence pairs
281
+ if not self.MODE_VISUALIZATION:
282
+ return
283
+
284
+ if not getattr(self, "ibm_model", None): # the trained IBM model is not stored on self, so guard against a missing attribute
285
+ print("Cannot visualize alignments: IBM Model 1 not trained.")
286
+ return
287
+
288
+ # Sample sentences to reduce computation
289
+ sample_size = min(len(aligned_sentences), max_sentences)
290
+ sample_sentences = random.sample(aligned_sentences, sample_size) if len(aligned_sentences) > max_sentences else aligned_sentences
291
+
292
+ if not os.path.exists(output_dir):
293
+ os.makedirs(output_dir)
294
+
295
+ for idx, sent in enumerate(sample_sentences):
296
+ src_words = sent.words # English
297
+ tgt_words = sent.mots # Vietnamese
298
+ alignment = sent.alignment
299
+
300
+ # Create alignment matrix
301
+ matrix = np.zeros((len(tgt_words), len(src_words)))
302
+ for src_idx, tgt_idx in alignment:
303
+ if tgt_idx is not None and src_idx < len(src_words) and tgt_idx < len(tgt_words):
304
+ matrix[tgt_idx, src_idx] = 1
305
+
306
+ # Plot alignment matrix
307
+ plt.figure(figsize=(8, 6))
308
+ plt.imshow(matrix, cmap='Blues', interpolation='nearest')
309
+ plt.title(f'Alignment Matrix - Sentence Pair {idx + 1}')
310
+ plt.xlabel('English Words')
311
+ plt.ylabel('Vietnamese Words')
312
+ plt.xticks(range(len(src_words)), src_words, rotation=45, ha='right')
313
+ plt.yticks(range(len(tgt_words)), tgt_words)
314
+ plt.tight_layout()
315
+ plt.savefig(os.path.join(output_dir, f'alignment_matrix_{idx + 1}.png'))
316
+ plt.close()
317
+
318
+ # Print alignment details
319
+ print(f"\nSentence Pair {idx + 1}:")
320
+ print(f" English: {' '.join(src_words)}")
321
+ print(f" Vietnamese: {' '.join(tgt_words)}")
322
+ print(f" Alignments: {[(src_words[src], tgt_words[tgt]) for src, tgt in alignment if tgt is not None]}")
323
+
324
+ print(f"Alignment visualizations saved to {output_dir}/")
325
+
326
+ def _extract_alignments_memory_efficient(self, aligned_sentences, ibm_model):
327
+ """Memory-efficient alignment extraction"""
328
+ alignments = []
329
+
330
+ # Process in smaller batches
331
+ batch_size = 5000
332
+ for i in range(0, len(aligned_sentences), batch_size):
333
+ batch_alignments = []
334
+ batch_sentences = aligned_sentences[i:i+batch_size]
335
+
336
+ for sent_pair in batch_sentences:
337
+ eng_tokens = sent_pair.words
338
+ vie_tokens = sent_pair.mots
339
+
340
+ # Only keep high-probability alignments
341
+ alignment = []
342
+ for eng_i, eng_word in enumerate(eng_tokens):
343
+ best_prob = 0
344
+ best_vie_i = -1
345
+
346
+ for vie_i, vie_word in enumerate(vie_tokens):
347
+ prob = ibm_model.translation_table.get(eng_word, {}).get(vie_word, 0)
348
+ if prob > best_prob:
349
+ best_prob = prob
350
+ best_vie_i = vie_i
351
+
352
+ # Only keep alignments above threshold
353
+ if best_prob > 0.01: # Increased threshold
354
+ alignment.append((eng_i, best_vie_i))
355
+
356
+ batch_alignments.append(alignment)
357
+
358
+ alignments.extend(batch_alignments)
359
+
360
+ # Periodic cleanup
361
+ if i % (batch_size * 10) == 0:
362
+ gc.collect()
363
+
364
+ return alignments
365
+
366
+ def extract_phrases_memory_efficient(self, aligned_sentences):
367
+ """Memory-efficient phrase extraction"""
368
+ print("Extracting phrase pairs with memory optimization...")
369
+
370
+ # Use smaller data structures
371
+ phrase_counts = defaultdict(lambda: defaultdict(int))
372
+
373
+ # Process in batches
374
+ batch_size = 5000
375
+ for i in range(0, len(aligned_sentences), batch_size):
376
+ batch_sentences = aligned_sentences[i:i+batch_size]
377
+ batch_alignments = self.word_alignments[i:i+batch_size]
378
+
379
+ for sent_pair, alignments in zip(batch_sentences, batch_alignments):
380
+ if not alignments: # Skip sentences with no alignments
381
+ continue
382
+
383
+ eng_tokens = sent_pair.words
384
+ vie_tokens = sent_pair.mots
385
+ alignment_set = set(alignments)
386
+
387
+ # Extract word-level translations first
388
+ for eng_i, vie_i in alignments:
389
+ if eng_i < len(eng_tokens) and vie_i < len(vie_tokens):
390
+ eng_word = eng_tokens[eng_i]
391
+ vie_word = vie_tokens[vie_i]
392
+ phrase_counts[eng_word][vie_word] += 1
393
+
394
+ # Extract short phrases only (max length 3 to save memory)
395
+ max_len = min(3, self.max_phrase_length)
396
+ consistent_phrases = self._extract_consistent_phrases(
397
+ eng_tokens, vie_tokens, alignment_set, max_len
398
+ )
399
+
400
+ for eng_phrase, vie_phrase in consistent_phrases:
401
+ phrase_counts[eng_phrase][vie_phrase] += 1
402
+
403
+ # Periodic cleanup
404
+ if i % (batch_size * 5) == 0:
405
+ gc.collect()
406
+ print(f"Processed {i+batch_size} sentences...")
407
+
408
+ # Calculate probabilities with higher threshold
409
+ self.phrase_table = {}
410
+ for eng_phrase, vie_phrases in phrase_counts.items():
411
+ total_count = sum(vie_phrases.values())
412
+ if total_count >= MIN_PHRASE_COUNT: # Higher threshold
413
+ # Keep only top 3 translations per phrase to save memory
414
+ sorted_phrases = sorted(vie_phrases.items(), key=lambda x: x[1], reverse=True)[:3]
415
+
416
+ filtered_phrases = {}
417
+ for vie_phrase, count in sorted_phrases:
418
+ if count >= MIN_PHRASE_COUNT:
419
+ filtered_phrases[vie_phrase] = count / total_count
420
+
421
+ if filtered_phrases:
422
+ self.phrase_table[eng_phrase] = filtered_phrases
423
+
424
+ print(f"Extracted {len(self.phrase_table)} phrase pairs (filtered)")
425
+ # Visualize phrase table if enabled
426
+ if self.MODE_VISUALIZATION:
427
+ self.visualize_phrase_table()
428
+
429
+ return self.phrase_table
430
+
431
+ def _extract_consistent_phrases(self, eng_tokens, vie_tokens, alignments, max_length):
432
+ """Extract consistent phrase pairs with length limit"""
433
+ consistent_phrases = []
434
+ eng_len = len(eng_tokens)
435
+
436
+ # Limit phrase extraction to reduce memory
437
+ for e_start in range(eng_len):
438
+ for e_end in range(e_start, min(eng_len, e_start + max_length)):
439
+ vie_positions = set()
440
+ for e_pos in range(e_start, e_end + 1):
441
+ for (eng_idx, vie_idx) in alignments:
442
+ if eng_idx == e_pos:
443
+ vie_positions.add(vie_idx)
444
+
445
+ if not vie_positions:
446
+ continue
447
+
448
+ v_start, v_end = min(vie_positions), max(vie_positions)
449
+
450
+ if v_end - v_start + 1 <= max_length:
451
+ if self._is_consistent_phrase_pair(e_start, e_end, v_start, v_end, alignments):
452
+ eng_phrase = ' '.join(eng_tokens[e_start:e_end+1])
453
+ vie_phrase = ' '.join(vie_tokens[v_start:v_end+1])
454
+ consistent_phrases.append((eng_phrase, vie_phrase))
455
+
456
+ return consistent_phrases
457
+
458
+ def _is_consistent_phrase_pair(self, e_start, e_end, v_start, v_end, alignments):
459
+ """Check if a phrase pair is consistent"""
460
+ for (eng_idx, vie_idx) in alignments:
461
+ if (e_start <= eng_idx <= e_end) and not (v_start <= vie_idx <= v_end):
462
+ return False
463
+ if (v_start <= vie_idx <= v_end) and not (e_start <= eng_idx <= e_end):
464
+ return False
465
+ return True
466
+
467
+ def train_ibm_model_incremental(self, file_path="/kaggle/input/general-data/bilingual_cleaned_dataset.csv", iterations=5):
468
+ """Train IBM Model 1 incrementally to reduce memory usage"""
469
+ if not os.path.exists(file_path):
470
+ file_path = os.path.join('datatest', BILINGUAL_DATA_PATH)
471
+ print(f"Training IBM Model 1 incrementally with {iterations} iterations...")
472
+
473
+ # First pass: collect vocabulary and create aligned sentences
474
+ all_aligned_sentences = []
475
+ eng_vocab = set()
476
+ vie_vocab = set()
477
+
478
+ for batch in self.load_bilingual_data_batch(file_path):
479
+ for sent_pair in batch:
480
+ eng_vocab.update(sent_pair.words)
481
+ vie_vocab.update(sent_pair.mots)
482
+ all_aligned_sentences.append(sent_pair)
483
+
484
+ # Limit total sentences to prevent memory issues
485
+ if len(all_aligned_sentences) >= 300000: # Reduced from 500k
486
+ print(f"Limited training to {len(all_aligned_sentences)} sentences")
487
+ break
488
+
489
+ print(f"Training on {len(all_aligned_sentences)} aligned sentences")
490
+ print(f"English vocab: {len(eng_vocab)}, Vietnamese vocab: {len(vie_vocab)}")
491
+
492
+ ibm_model = IBMModel1(all_aligned_sentences, iterations)
493
+
494
+ # Extract alignments with memory optimization
495
+ self.word_alignments = self._extract_alignments_memory_efficient(all_aligned_sentences, ibm_model)
496
+
497
+ # Clean up
498
+ del ibm_model
499
+ gc.collect()
500
+
501
+ return all_aligned_sentences
502
+
503
+ def visualize_phrase_table(self, max_phrases=10, output_dir="/kaggle/working/visualizations"):
504
+ if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
505
+ # Running on Kaggle
506
+ output_dir = "/kaggle/working/visualizations"
507
+ else:
508
+ # Running locally
509
+ output_dir = VISUALIZATION_PATH
510
+
511
+ os.makedirs(output_dir, exist_ok=True)
512
+ # Visualize the phrase table as a heatmap with English phrases as columns and Vietnamese phrases as rows
513
+ if not self.MODE_VISUALIZATION:
514
+ return
515
+
516
+ if not self.phrase_table:
517
+ print("Cannot visualize phrase table: Phrase table is empty.")
518
+ return
519
+
520
+ # Select top English phrases and their top Vietnamese translations
521
+ eng_phrases = sorted(self.phrase_table.keys(), key=lambda x: sum(self.phrase_table[x].values()), reverse=True)[:max_phrases]
522
+ vie_phrases = set()
523
+ for eng in eng_phrases:
524
+ vie_phrases.update(self.phrase_table[eng].keys())
525
+ vie_phrases = sorted(list(vie_phrases))[:max_phrases] # Limit Vietnamese phrases
526
+
527
+ # Create matrix for probabilities
528
+ matrix = np.zeros((len(vie_phrases), len(eng_phrases)))
529
+ for i, vie in enumerate(vie_phrases):
530
+ for j, eng in enumerate(eng_phrases):
531
+ matrix[i, j] = self.phrase_table.get(eng, {}).get(vie, 0)
532
+
533
+ # Create heatmap
534
+ if not os.path.exists(output_dir):
535
+ os.makedirs(output_dir)
536
+
537
+ plt.figure(figsize=(12, 8))
538
+ plt.imshow(matrix, cmap='Blues', interpolation='nearest')
539
+ plt.title('Phrase Table Translation Probabilities')
540
+ plt.xlabel('English Phrases')
541
+ plt.ylabel('Vietnamese Phrases')
542
+ plt.xticks(range(len(eng_phrases)), eng_phrases, rotation=45, ha='right')
543
+ plt.yticks(range(len(vie_phrases)), vie_phrases)
544
+ plt.colorbar(label='Translation Probability')
545
+ plt.tight_layout()
546
+ plt.savefig(os.path.join(output_dir, 'phrase_table.png'))
547
+ plt.close()
548
+
549
+ # Print sample phrase pairs
550
+ print("\nSample Phrase Table Entries (Top 5 English phrases):")
551
+ for eng in eng_phrases[:5]:
552
+ print(f" English: {eng}")
553
+ for vie, prob in sorted(self.phrase_table[eng].items(), key=lambda x: x[1], reverse=True)[:3]:
554
+ print(f" -> Vietnamese: {vie}, Probability: {prob:.4f}")
555
+
556
+ print(f"Phrase table visualization saved to {output_dir}/phrase_table.png")
557
+
558
+ ############################################# 3. Decoder Algorithm #############################################
559
+
560
+ class Decoder:
561
+ """Memory-optimized decoder"""
562
+ def __init__(self, phrase_table, language_model, beam_size=BEAM_SIZE):
563
+ self.phrase_table = phrase_table
564
+ self.lm = language_model
565
+ self.beam_size = beam_size
566
+ def translate(self, sentence):
567
+ """Translate sentence with memory optimization"""
568
+ tokens = sentence.lower().split()
569
+ if not tokens:
570
+ return ""
571
+ return self._greedy_translate(tokens)
572
+
573
+ def _greedy_translate(self, tokens):
574
+ """Greedy translation to save memory"""
575
+ translation = []
576
+ i = 0
577
+
578
+ while i < len(tokens):
579
+ best_phrase_len = 1
580
+ best_translation = tokens[i] # fallback
581
+
582
+ # Try phrases of different lengths
583
+ for phrase_len in range(min(3, len(tokens) - i), 0, -1): # Max length 3
584
+ eng_phrase = ' '.join(tokens[i:i+phrase_len])
585
+
586
+ if eng_phrase in self.phrase_table:
587
+ # Get best translation
588
+ vie_translations = self.phrase_table[eng_phrase]
589
+ if vie_translations:
590
+ best_vie_phrase = max(vie_translations.items(), key=lambda x: x[1])
591
+ best_translation = best_vie_phrase[0]
592
+ best_phrase_len = phrase_len
593
+ break
594
+
595
+ translation.append(best_translation)
596
+ i += best_phrase_len
597
+
598
+ return ' '.join(translation)
599
+
600
+ class Hypothesis:
601
+ """Lightweight hypothesis class"""
602
+ def __init__(self, translation, coverage, score, last_phrase_end):
603
+ self.translation = translation
604
+ self.coverage = coverage
605
+ self.score = score
606
+ self.last_phrase_end = last_phrase_end
607
+
608
+ ################################################# 4. Combine all SMT System #############################################
609
+ class SMT:
610
+ """Memory-optimized SMT system"""
611
+ def __init__(self):
612
+ self.lm = LanguageModel(order=LM_ORDER)
613
+ self.tm = TranslationModel(max_phrase_length=MAX_PHRASE_LENGTH)
614
+ self.decoder = None
615
+
616
+ def post_process(self, text):
617
+ """Replaces underscores with spaces in the translated text."""
618
+ return text.replace("_", " ")
619
+
620
+ def train(self):
621
+ bilingual_path = "/kaggle/input/general-data/bilingual_cleaned_dataset.csv"
622
+ vie_path = "/kaggle/input/general-data/vie_cleaned_dataset.csv"
623
+
624
+ if not os.path.exists(bilingual_path):
625
+ bilingual_path = os.path.join("datatest", BILINGUAL_DATA_PATH)
626
+ vie_path = os.path.join("datatest", VIE_DATA_PATH)
627
+ print("=== Training Translation Model ===")
628
+ aligned_sentences = self.tm.train_ibm_model_incremental(bilingual_path)
629
+ phrase_table = self.tm.extract_phrases_memory_efficient(aligned_sentences)
630
+
631
+ del aligned_sentences
632
+ gc.collect()
633
+
634
+ # Train language model
635
+ print("\n=== Training Language Model ===")
636
+ vie_df = pd.read_csv(vie_path)
637
+ vietnamese_sentences = vie_df['vi'].tolist()
638
+ del vie_df # Free memory
639
+ gc.collect()
640
+
641
+ lm_stats = self.lm.train(vietnamese_sentences, max_sentences=50000) # Limit LM training data
642
+ del vietnamese_sentences # Free memory
643
+ gc.collect()
644
+
645
+ # Initialize decoder
646
+ self.decoder = Decoder(phrase_table, self.lm)
647
+
648
+ # Save model immediately
649
+ self.save_model()
650
+
651
+ return {
652
+ "phrase_pairs": len(phrase_table),
653
+ "lm_stats": lm_stats
654
+ }
655
+
656
+ def translate_sentence(self, sentence):
657
+ """Translate a single sentence"""
658
+ if self.decoder is None:
659
+ raise ValueError("Model not trained or loaded.")
660
+ translated_text_with_underscores = self.decoder.translate(sentence)
661
+ return self.post_process(translated_text_with_underscores)
662
+
663
+ def save_model(self):
664
+ """Save the trained model"""
665
+ if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
666
+ # Running on Kaggle
667
+ model_dir = "/kaggle/working/checkpoints"
668
+ else:
669
+ # Running locally
670
+ model_dir = "checkpoints"
671
+
672
+ os.makedirs(model_dir, exist_ok=True)
673
+
674
+ # Save with compression
675
+ with open(os.path.join(model_dir, "phrase_table.pkl"), 'wb') as f:
676
+ pickle.dump(self.tm.phrase_table, f, protocol=pickle.HIGHEST_PROTOCOL)
677
+ with open(os.path.join(model_dir, "lm_object.pkl"), 'wb') as f:
678
+ pickle.dump(self.lm, f, protocol=pickle.HIGHEST_PROTOCOL)
679
+
680
+ print(f"Model saved to {model_dir}")
681
+
682
+ def load_model(self, model_dir='checkpoints'):
683
+ """Load a pre-trained model"""
684
+ with open(os.path.join(model_dir, "phrase_table.pkl"), 'rb') as f:
685
+ phrase_table = pickle.load(f)
686
+ with open(os.path.join(model_dir, "lm_object.pkl"), 'rb') as f:
687
+ self.lm = pickle.load(f)
688
+
689
+ self.decoder = Decoder(phrase_table, self.lm, BEAM_SIZE)
690
+ self.tm.phrase_table = phrase_table
691
+
692
+ print(f"Model loaded from {model_dir}")
693
+
694
+ def evaluate(self, test_file='/kaggle/input/general-data/test_cleaned_dataset.csv', sample_size=5):
695
+ """Evaluate model on test set"""
696
+ try:
697
+ df = pd.read_csv(test_file)
698
+ except FileNotFoundError:
699
+ test_file = 'datatest/test_cleaned_dataset.csv'
700
+ df = pd.read_csv(test_file)
701
+ sample_size = min(sample_size, len(df))
702
+ sample_indices = random.sample(range(len(df)), sample_size)
703
+
704
+ results = []
705
+ for idx in sample_indices:
706
+ try:
707
+ source = df.iloc[idx]['en']
708
+ reference = df.iloc[idx]['vi']
709
+ translation = self.translate_sentence(source)
710
+
711
+ results.append({
712
+ "source": source,
713
+ "reference": reference,
714
+ "translation": translation
715
+ })
716
+ except Exception as e:
717
+ print(f"Error translating sentence {idx}: {e}")
718
+ results.append({
719
+ "source": df.iloc[idx]['en'],
720
+ "reference": df.iloc[idx]['vi'],
721
+ "translation": "Translation failed"
722
+ })
723
+
724
+ return results
725
+
726
+ def save_predictions_batch(self, test_file="/kaggle/input/general-data/test_cleaned_dataset.csv", output_file="/kaggle/working/predicted.csv", batch_size=1000):
727
+ """Save predictions in batches to avoid memory issues"""
728
+ # Check if test_file exists, if not update to default path
729
+ if not os.path.exists(test_file):
730
+ test_file = "datatest/test_cleaned_dataset.csv"
731
+ output_file = "datatest/predicted1.csv"
732
+ print(f"Output file will be saved to: {output_file}")
733
+
734
+ df_info = pd.read_csv(test_file, nrows=0) # Just get column info
735
+ total_rows = len(pd.read_csv(test_file))
736
+
737
+ print(f"Processing {total_rows} sentences in batches of {batch_size}")
738
+
739
+ # Process in batches and write incrementally
740
+ first_batch = True
741
+ for start_idx in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
742
+ end_idx = min(start_idx + batch_size, total_rows)
743
+
744
+ # Read batch
745
+ batch_df = pd.read_csv(test_file, skiprows=range(1, start_idx+1), nrows=batch_size)
746
+
747
+ # Process batch
748
+ batch_predictions = []
749
+ for _, row in batch_df.iterrows():
750
+ try:
751
+ source = row['en']
752
+ reference = row['vi']
753
+ translation = self.translate_sentence(source)
754
+
755
+ batch_predictions.append({
756
+ "en": source,
757
+ "vi": reference,
758
+ "pre": translation
759
+ })
760
+ except Exception as e:
761
+ batch_predictions.append({
762
+ "en": row['en'],
763
+ "vi": row['vi'],
764
+ "pre": "Translation failed"
765
+ })
766
+
767
+ # Save batch
768
+ batch_pred_df = pd.DataFrame(batch_predictions)
769
+
770
+ if first_batch:
771
+ batch_pred_df.to_csv(output_file, index=False)
772
+ first_batch = False
773
+ else:
774
+ batch_pred_df.to_csv(output_file, mode='a', header=False, index=False)
775
+
776
+ # Clean up
777
+ del batch_df, batch_predictions, batch_pred_df
778
+ gc.collect()
779
+
780
+ print(f"Predictions saved to {output_file}")
781
+ return output_file
782
+
783
+ def main():
784
+ print("Starting Memory-Optimized SMT System...")
785
+ smt = SMT()
786
+ model_dir = "checkpoints"
787
+ if os.path.exists(model_dir) and os.path.isfile(os.path.join(model_dir, "phrase_table.pkl")):
788
+ print("Loading existing model...")
789
+ smt.load_model()
790
+ else:
791
+ print("Training new model...")
792
+ stats = smt.train()
793
+ print(f"Training complete: {stats}")
794
+
795
+ # Evaluate model
796
+ print("\nEvaluating model...")
797
+ results = smt.evaluate(sample_size=1)
798
+ print("\nExample translations:")
799
+ for i, result in enumerate(results):
800
+ print(f"\nExample {i+1}:")
801
+ print(f"English: {result['source']}")
802
+ print(f"Reference: {result['reference']}")
803
+ print(f"Translation: {result['translation']}")
804
+
805
+ # Save predictions in batches
806
+ print("\nSaving predictions in batches...")
807
+ output_file = smt.save_predictions_batch(batch_size=500) # Smaller batch size
808
+ print(f"All predictions saved to: {output_file}")
809
+
810
+ # Final memory cleanup
811
+ gc.collect()
812
+ print("Processing complete!")
813
+
814
+ class SMTExtended(SMT):
815
+ def infer(self, sentence):
816
+ """Translate a single arbitrary English sentence into Vietnamese using beam search"""
817
+ if self.decoder is None:
818
+ raise ValueError("Model not trained or loaded.")
819
+
820
+ # Preprocess input sentence
821
+ tokens = self.tm.preprocess(sentence, 'eng')
822
+ if not tokens:
823
+ return ""
824
+
825
+ # Initialize beam: (score, translation_tokens, last_pos, covered_positions)
826
+ beam = [(0.0, [], 0, set())] # Score, translation tokens, last position, covered positions
827
+ best_score = float('-inf')
828
+ best_translation = []
829
+
830
+ # Beam search
831
+ while beam:
832
+ new_beam = []
833
+ for score, trans_tokens, last_pos, covered in beam:
834
+ # Check if all positions are covered
835
+ if len(covered) == len(tokens):
836
+ if score > best_score:
837
+ best_score = score
838
+ best_translation = trans_tokens
839
+ continue
840
+
841
+ # Find next uncovered position
842
+ next_pos = last_pos
843
+ while next_pos in covered and next_pos < len(tokens):
844
+ next_pos += 1
845
+
846
+ if next_pos >= len(tokens):
847
+ if score > best_score:
848
+ best_score = score
849
+ best_translation = trans_tokens
850
+ continue
851
+
852
+ # Try phrases starting at next_pos
853
+ for phrase_len in range(1, min(self.tm.max_phrase_length + 1, len(tokens) - next_pos + 1)):
854
+ eng_phrase = ' '.join(tokens[next_pos:next_pos + phrase_len])
855
+
856
+ # Get possible translations from phrase table
857
+ vie_translations = self.tm.phrase_table.get(eng_phrase, {})
858
+ if not vie_translations and phrase_len == 1:
859
+ # Fallback for single unknown word
860
+ vie_translations = {tokens[next_pos]: 1.0}
861
+
862
+ for vie_phrase, trans_prob in vie_translations.items():
863
+ # Split Vietnamese phrase into tokens for LM scoring
864
+ vie_tokens = vie_phrase.split()
865
+ # Calculate new score: combine translation prob and LM prob
866
+ log_trans_prob = math.log(trans_prob) if trans_prob > 0 else math.log(1e-10)
867
+ lm_score = self.lm.get_probability(trans_tokens + vie_tokens)
868
+ new_score = ALPHA * log_trans_prob + BETA * lm_score
869
+
870
+ # Update covered positions
871
+ new_covered = covered | set(range(next_pos, next_pos + phrase_len))
872
+ # Add to new beam
873
+ new_beam.append((score + new_score, trans_tokens + vie_tokens, next_pos + phrase_len, new_covered))
874
+
875
+ # Keep top BEAM_SIZE hypotheses
876
+ new_beam.sort(key=lambda x: x[0], reverse=True)
877
+ beam = new_beam[:self.decoder.beam_size]
878
+
879
+ # Return best translation
880
+ return ' '.join(best_translation) if best_translation else "Translation failed"
881
+
882
+ if __name__ == "__main__":
883
+ main()
884
+
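A hypothetical usage sketch for the SMT system above, assuming a previously trained checkpoint already exists in ./checkpoints (phrase_table.pkl and lm_object.pkl, as written by save_model). translate_sentence uses the greedy decoder, while SMTExtended.infer runs the beam search that scores each hypothesis extension as ALPHA * log(translation probability) + BETA * (language-model log probability) and keeps the top BEAM_SIZE candidates.

# Hypothetical usage, assuming ./checkpoints holds a trained model.
smt = SMTExtended()
smt.load_model("checkpoints")

sentence = "I read books"
print(smt.translate_sentence(sentence))        # greedy phrase-table decoding
print(smt.post_process(smt.infer(sentence)))   # beam-search decoding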