fisherman611's picture
Create extract_dictionaries.py
60c2ac9 verified
raw
history blame
2.65 kB
import json
import re
import string
from collections import Counter, defaultdict

import contractions
import nltk
import pandas as pd
import spacy
from deep_translator import GoogleTranslator
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm
# Shared service client: English -> Vietnamese.
translator = GoogleTranslator(source='en', target='vi')

def translate_word(word: str) -> str:
    """Translate a single English word to Vietnamese.

    Args:
        word (str): The English word to translate.

    Returns:
        str: The lowercased Vietnamese translation, or the sentinel
        "N/A" when the request fails or the service returns nothing.
    """
    try:
        translation = translator.translate(word)
        # The service can return None or an empty string for
        # untranslatable input; treat that as a failure so the
        # downstream "N/A" filter drops the entry.
        if not translation:
            return "N/A"
        return translation.lower()
    except Exception:
        # Best-effort: network / rate-limit errors yield the sentinel
        # instead of aborting the whole extraction run.
        return "N/A"
# Small English spaCy pipeline, used below for named-entity removal.
nlp = spacy.load("en_core_web_sm")
# Cleaned training dataset; the 'en' column supplies the English sentences
# processed below.
df = pd.read_csv('data/train_cleaned_dataset.csv')
def preprocessing(sentence: str) -> list[str]:
    """
    Preprocess the input sentence: remove named entities, lowercase, expand contractions,
    and tokenize into a list of words.

    Args:
        sentence (str): The input sentence to preprocess.

    Returns:
        list[str]: A list of preprocessed tokens (empty for blank input).
    """
    if not sentence or not sentence.strip():
        return []

    # Step 1: Remove named entities. Longest entities first so a nested
    # entity does not leave fragments of a longer one behind.
    doc = nlp(sentence)
    entities = sorted(doc.ents, key=lambda ent: len(ent.text), reverse=True)
    for ent in entities:
        # NOTE(review): str.replace drops every occurrence of the entity
        # text, including incidental matches elsewhere in the sentence.
        sentence = sentence.replace(ent.text, "")

    # Step 2: Collapse the extra whitespace left by entity removal.
    sentence = " ".join(sentence.split()).strip()

    # Step 3: Strip all digit runs.
    sentence = re.sub(r'\d+', '', sentence)

    # Step 4: Lowercase the sentence.
    sentence = sentence.lower()

    # Step 5: Expand contractions (e.g., "don't" -> "do not").
    sentence = contractions.fix(sentence)

    # Step 6: Replace punctuation with spaces, then re-collapse whitespace.
    # Named `punct_table` so it does not shadow the module-level
    # GoogleTranslator instance called `translator`.
    punct_table = str.maketrans({p: ' ' for p in string.punctuation})
    sentence = sentence.translate(punct_table)
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    # Step 7: Tokenize into words.
    return word_tokenize(sentence)
# English sentences to harvest the vocabulary from.
english_sentences = df['en'].to_list()

# Extract English vocabs: token -> occurrence count across all sentences.
# Counter replaces the manual defaultdict(int) increment loop.
eng_vocabs = Counter()
for eng_sent in tqdm(english_sentences):
    eng_vocabs.update(preprocessing(eng_sent))
# Translate every vocabulary word in one pass (no need to copy the keys
# into a list first), then keep only successful translations, sorted
# alphabetically by the English word.
dictionary = {word: translate_word(word) for word in eng_vocabs}
final_dict = {
    word: translation
    for word, translation in sorted(dictionary.items())
    if translation != "N/A"
}

# Persist the EN -> VI dictionary as pretty-printed UTF-8 JSON.
output_file = "data/en_vi_dictionary.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(final_dict, f, ensure_ascii=False, indent=4)
print(f"English vocabulary saved to {output_file}")