# Source code for srtools.transliteration

# -*- coding: utf-8 -*-
# srtools
# Copyright (C) 2019-2021 Andrej Radović <r.andrej@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import re

from .character_dictionaries import CYR_TO_LAT_TTABLE
from .character_dictionaries import LAT_TO_CYR_DIGRAPHS_DICT
from .character_dictionaries import LAT_TO_CYR_TTABLE

# Pattern source: an alternation of every key of LAT_TO_CYR_DIGRAPHS_DICT,
# each escaped so it is matched literally, wrapped in one capturing group.
_LAT_TO_CYR_DIGRAPH_RX = "({})".format(
    "|".join(re.escape(digraph) for digraph in LAT_TO_CYR_DIGRAPHS_DICT)
)
# Compiled once at import time so every substitution reuses the same
# pattern object.
_LAT_TO_CYR_DIGRAPH_COMP_RX = re.compile(
    _LAT_TO_CYR_DIGRAPH_RX, flags=re.UNICODE | re.MULTILINE
)


def _cyr_sub_string_from_lat_match(match: re.Match) -> str:
    """Return the replacement for a matched Latin digraph.

    Intended as the ``repl`` callback of ``re.sub``: the full matched text
    is looked up in ``LAT_TO_CYR_DIGRAPHS_DICT``.

    Args:
        match: regex match whose full text is a key of
            ``LAT_TO_CYR_DIGRAPHS_DICT``.

    Returns:
        str: The dictionary value stored for the matched text.

    Raises:
        KeyError: if the matched text is not a key of the dictionary.
    """
    return LAT_TO_CYR_DIGRAPHS_DICT[match.group()]


def latin_to_cyrillic(text: str) -> str:
    """Transliterate Serbian Latin string to Cyrillic.

    You may use a special separator ``!`` to split digraphs `lj`, `nj`, `dž`
    to prevent their conversion to single Cyrillic letters like so: `l!j`.

    Args:
        text: input Latin string to be transliterated.

    Returns:
        str: Input string transliterated to Cyrillic.

    Examples:
        >>> from srtools import latin_to_cyrillic
        >>> in_str = "Đače, uštedu plaćaj žaljenjem zbog džinovskih cifara."
        >>> latin_to_cyrillic(in_str)
        'Ђаче, уштеду плаћај жаљењем због џиновских цифара.'
        >>> latin_to_cyrillic('N!J je skraćenica za Nju Džersi')
        'НЈ је скраћеница за Њу Џерси'
    """
    # Pass 1: replace every digraph matched by the precompiled pattern.
    # This runs before the per-character pass so a digraph is handled as a
    # unit rather than as two independent letters.
    text_digraphs_substituted = _LAT_TO_CYR_DIGRAPH_COMP_RX.sub(
        _cyr_sub_string_from_lat_match, text
    )
    # Pass 2: map the remaining characters through the translation table.
    return text_digraphs_substituted.translate(LAT_TO_CYR_TTABLE)


def cyrillic_to_latin(text: str) -> str:
    """Transliterate Serbian Cyrillic string to Latin.

    Args:
        text: input Cyrillic string to be transliterated.

    Returns:
        str: Input string transliterated to Latin.

    Examples:
        >>> from srtools import cyrillic_to_latin
        >>> in_str = "Ђаче, уштеду плаћај жаљењем због џиновских цифара."
        >>> cyrillic_to_latin(in_str)
        'Đače, uštedu plaćaj žaljenjem zbog džinovskih cifara.'
    """
    # A single pass through the translation table is the whole conversion;
    # unlike the Latin-to-Cyrillic direction, no regex pre-pass is used.
    return text.translate(CYR_TO_LAT_TTABLE)