from idiomify.fetchers import fetch_tokenizer


def main():
    tokenizer = fetch_tokenizer("t-1-1")
    # print all the special tokens registered with the tokenizer
    print(tokenizer.bos_token)
    print(tokenizer.cls_token)
    print(tokenizer.eos_token)
    print(tokenizer.sep_token)
    print(tokenizer.mask_token)
    print(tokenizer.pad_token)
    print(tokenizer.unk_token)
    print(tokenizer.additional_special_tokens)  # the idiom special tokens should have been added here
    # the size of the vocab (includes the added special tokens)
    print(len(tokenizer))
| """ | |
| <s> | |
| <s> | |
| </s> | |
| </s> | |
| <mask> | |
| <pad> | |
| <unk> | |
| ['<idiom>', '</idiom>'] | |
| 50267 | |
| """ | |


if __name__ == '__main__':
    main()