# Standard library
import json
import re
import string
from collections import defaultdict

# Third-party
import contractions
import nltk
import pandas as pd
import spacy
from deep_translator import GoogleTranslator
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm
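
# word_tokenize needs NLTK's 'punkt' tokenizer models; download them once if
# missing (newer NLTK releases may require 'punkt_tab' instead).
nltk.download('punkt', quiet=True)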

# deep_translator's GoogleTranslator calls the free Google Translate web endpoint,
# so each lookup is a network request and may be rate-limited on large vocabularies.
translator = GoogleTranslator(source='en', target='vi')

def translate_word(word: str) -> str:
    """Translate a single English word to Vietnamese; return "N/A" on any failure."""
    try:
        return translator.translate(word).lower()
    except Exception:
        return "N/A"

# spaCy's small English model, used for named-entity recognition in preprocessing().
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv('data/train_cleaned_dataset.csv')
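# The cleaned dataset is expected to contain an 'en' column of English source sentences.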

def preprocessing(sentence: str) -> list[str]:
    """
    Preprocess the input sentence: remove named entities and digits, lowercase,
    expand contractions, strip punctuation, and tokenize into a list of words.
    
    Args:
        sentence (str): The input sentence to preprocess.
    
    Returns:
        list[str]: A list of preprocessed tokens.
    """
    if not sentence or not sentence.strip():
        return []

    # Step 1: Remove named entities
    doc = nlp(sentence)
    entities = sorted(doc.ents, key=lambda ent: len(ent.text), reverse=True)  # Sort by length to handle nested entities
    for ent in entities:
        sentence = sentence.replace(ent.text, "")  # Remove the entity entirely

    # Step 2: Clean up extra spaces after entity removal
    sentence = " ".join(sentence.split()).strip()

    # Step 3: Remove all numbers using regex
    sentence = re.sub(r'\d+', '', sentence)

    # Step 4: Lowercase the sentence
    sentence = sentence.lower()

    # Step 5: Expand contractions (e.g., "don't" -> "do not")
    sentence = contractions.fix(sentence)

    # Step 6: Replace punctuation with spaces (use a separate name so the
    # module-level `translator` is not shadowed)
    punct_table = str.maketrans({p: ' ' for p in string.punctuation})
    sentence = sentence.translate(punct_table)
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    # Step 7: Tokenize into words
    words = word_tokenize(sentence)

    return words
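
# Illustrative example (exact output depends on the spaCy model's NER decisions):
#   preprocessing("Apple hired 3 engineers, don't panic!")
#   -> entity text removed, digits stripped, contraction expanded, punctuation gone:
#   ['hired', 'engineers', 'do', 'not', 'panic']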

english_sentences = df['en'].to_list()


# Extract the English vocabulary (word -> corpus frequency):

eng_vocabs = defaultdict(int)
for eng_sent in tqdm(english_sentences):
    words = preprocessing(eng_sent)
    for word in words:
        eng_vocabs[word] += 1

# Translate each unique vocabulary word; this is one network request per word,
# so show progress with tqdm. (The frequency counts are not used below.)
dictionary = {}
for word in tqdm(eng_vocabs):
    dictionary[word] = translate_word(word)

# Drop failed translations, then sort entries alphabetically by the English word.
dictionary = {k: v for k, v in dictionary.items() if v != "N/A"}
final_dict = dict(sorted(dictionary.items()))

output_file = "data/en_vi_dictionary.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(final_dict, f, ensure_ascii=False, indent=4)

print(f"English-Vietnamese dictionary saved to {output_file}")