Swa-bhasha Resource Hub: Romanized Sinhala to Sinhala Transliteration Systems and Data Resources
Paper
•
2507.09245
•
Published
This tokenizer is specifically trained for Romanized Sinhala text (Sinhala written in the Latin alphabet).
from transformers import PreTrainedTokenizerFast
from transformers import PreTrainedTokenizerFast
# Example: load the Romanized Sinhala tokenizer from the Hugging Face Hub,
# tokenize one sample sentence, and print the result.
import os

# Read the access token from the environment rather than hard-coding it in
# source. An unset or empty HF_TOKEN becomes None; per the transformers
# documentation, token=None lets the library use locally stored credentials
# (e.g. from `huggingface-cli login`) if any exist.
hf_token = os.environ.get("HF_TOKEN") or None

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "deshanksuman/romanized-sinhala-tokenizer",
    token=hf_token,
)

# Tokenize the sample text and request PyTorch tensors ("pt") as output.
encoded = tokenizer("api ada mkda krnne", return_tensors="pt")
print(encoded)

# Convert the ids of the first sequence in the batch back to token strings
# so the segmentation can be inspected in text form.
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"][0]))
@article{sumanathilaka2025swa,
title={Swa-bhasha Resource Hub: Romanized Sinhala to Sinhala Transliteration Systems and Data Resources},
author={Sumanathilaka, Deshan and Perera, Sameera and Dharmasiri, Sachithya and Athukorala, Maneesha and Herath, Anuja Dilrukshi and Dias, Rukshan and Gamage, Pasindu and Weerasinghe, Ruvan and Priyadarshana, YHPP},
journal={arXiv preprint arXiv:2507.09245},
year={2025}
}