Coverage for srtools/character_dictionaries.py : 0%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2# srtools
3# Copyright (C) 2019-2021 Andrej Radović <r.andrej@gmail.com>
4#
5# This program is free software: you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation, either version 3 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program. If not, see <http://www.gnu.org/licenses/>.
from typing import Any
from typing import Dict
from typing import Set
from typing import Tuple
from unicodedata import normalize
# (Cyrillic, Latin) lowercase letter pairs. Both sequences below are written
# in the same alphabet order so that zip() lines each Cyrillic letter up with
# its Latin equivalent; three of the Latin equivalents are two-letter
# digraphs ("lj", "nj", "dž"), hence the whitespace-separated list.
_CYR_LAT_LOWERCASE_LETTER_PAIRS: Set[Tuple[str, str]] = set(
    zip(
        "абвгдђежзијклљмнњ" "опрстћуфхцчџш",
        (
            "a b v g d đ e ž z i j k l lj m n nj "
            "o p r s t ć u f h c č dž š"
        ).split(),
    )
)
# Unicode combining marks that this module treats as Serbian accent marks.
_SERBIAN_ACCENTS: Set[str] = {
    "\N{COMBINING GRAVE ACCENT}",
    "\N{COMBINING ACUTE ACCENT}",
    "\N{COMBINING CIRCUMFLEX ACCENT}",
    "\N{COMBINING MACRON}",
    "\N{COMBINING OVERLINE}",
    "\N{COMBINING BREVE}",
    "\N{COMBINING DOUBLE GRAVE ACCENT}",
    "\N{COMBINING INVERTED BREVE}",
}
# (Cyrillic, Latin) pairs for the five vowels, matched up position by position.
_VOWEL_PAIRS: Set[Tuple[str, str]] = set(zip("аеиоу", "aeiou"))
# Every vowel pair combined with every accent mark. Each member of a pair gets
# the same mark appended and is then NFC-normalized, so a precomposed code
# point is used wherever Unicode defines one.
_ACCENTED_VOWEL_PAIRS = {
    tuple(normalize("NFC", vowel + mark) for vowel in vowel_pair)
    for vowel_pair in _VOWEL_PAIRS
    for mark in _SERBIAN_ACCENTS
}
# NOTE(review): set.union() returns a new set and its result is discarded
# here, so _CYR_LAT_LOWERCASE_LETTER_PAIRS is left unchanged and the accented
# vowel pairs never reach the dictionaries generated from it. An in-place
# merge was presumably intended, but some NFC-normalized accented vowels
# likely remain longer than one character, and str.maketrans() rejects such
# keys -- confirm the intended behavior before switching this to update().
_CYR_LAT_LOWERCASE_LETTER_PAIRS.union(_ACCENTED_VOWEL_PAIRS)
def _cat_dicts(*args: dict) -> Dict[Any, Any]:
    """Merge any number of dicts into one new dict.

    The dicts are applied left to right, so when a key occurs in more than
    one of them the value from the rightmost dict wins. None of the input
    dicts is modified.

    Args:
        *args: The dicts to merge, in order of increasing precedence.

    Returns:
        A new dict holding every key/value pair of the inputs; empty when
        called without arguments.
    """
    merged: Dict[Any, Any] = {}
    for mapping in args:
        merged.update(mapping)
    return merged
def _generate_cyr_to_lat_dictionary() -> Dict[str, str]:
    """Build the Cyrillic → Latin letter mapping for both letter cases.

    Every lowercase pair is taken over as-is. For the uppercase entries the
    Cyrillic letter is uppercased and the Latin string is capitalized, so a
    digraph such as "lj" yields "Lj" rather than "LJ".

    Returns:
        A new dict mapping each lowercase and uppercase Cyrillic letter to
        its Latin equivalent.
    """
    mapping: Dict[str, str] = dict(_CYR_LAT_LOWERCASE_LETTER_PAIRS)
    mapping.update(
        (cyrillic.upper(), latin.capitalize())
        for cyrillic, latin in _CYR_LAT_LOWERCASE_LETTER_PAIRS
    )
    return mapping
# Built once, at import time, from the lowercase letter pairs.
CYR_TO_LAT_DICT: Dict[str, str] = _generate_cyr_to_lat_dictionary()
"""Dict[str, str]: Cyrillic → Latin character translation dict.

Not used internally. Can be used to look up equivalent Latin letter strings.
"""

# str.maketrans() turns the single-character string keys into code points,
# which is the key form str.translate() looks up.
CYR_TO_LAT_TTABLE: Dict[int, str] = str.maketrans(CYR_TO_LAT_DICT)
"""Dict[int, str]: Cyrillic → Latin character translation table.

Used internally with :meth:`str.translate()` to transliterate text.
"""

# Placed between the two letters of a Latin digraph to form its "escaped"
# spelling (e.g. "l!j"); the Latin → Cyrillic generator maps such a spelling
# to the letter-by-letter transliteration instead of the single digraph letter.
DIGRAPH_ESCAPE_CHARACTER = "!"
def _generate_lat_to_cyr_dictionary() -> (
    Tuple[Dict[int, str], Dict[str, str], Dict[str, str]]
):
    """Build the Latin → Cyrillic lookup structures.

    Single-character Latin letters and two-character digraphs are handled
    separately, because only single characters can be keys of a
    :meth:`str.translate` table.

    Returns:
        A 3-tuple ``(letters_ttable, digraphs, everything)`` where

        * ``letters_ttable`` is a :meth:`str.translate` table covering the
          single-character Latin letters in lower and upper case,
        * ``digraphs`` maps every digraph spelling (plain and escaped, in
          lowercase, Capitalized and UPPERCASE form) to its Cyrillic
          replacement,
        * ``everything`` is one dict combining the single-letter and the
          digraph mappings, keyed by Latin strings.
    """
    lowercase_dict = {
        lat_letter: cyr_letter
        for cyr_letter, lat_letter in _CYR_LAT_LOWERCASE_LETTER_PAIRS
    }

    # Single-character letters, in both cases.
    lowercase_letters = {
        lat_letter: cyr_letter
        for lat_letter, cyr_letter in lowercase_dict.items()
        if len(lat_letter) == 1
    }
    uppercase_letters = {
        lat_letter.upper(): cyr_letter.upper()
        for lat_letter, cyr_letter in lowercase_letters.items()
    }

    all_letters_dict = _cat_dicts(lowercase_letters, uppercase_letters)
    all_letters_ttable = str.maketrans(all_letters_dict)

    # Two-character digraphs, each of which maps to one Cyrillic letter.
    lowercase_digraphs = {
        lat_letter: cyr_letter
        for lat_letter, cyr_letter in lowercase_dict.items()
        if len(lat_letter) == 2
    }
    # An escaped digraph (escape character between its two letters) maps to
    # the letter-by-letter transliteration of the digraph instead.
    lowercase_escaped_digraphs = {
        (
            lat_letter[0] + DIGRAPH_ESCAPE_CHARACTER + lat_letter[1]
        ): lat_letter.translate(all_letters_ttable)
        for lat_letter in lowercase_digraphs
    }

    lowercase_all_digraphs = _cat_dicts(
        lowercase_digraphs, lowercase_escaped_digraphs
    )

    # Derive the UPPERCASE and Capitalized spellings of every digraph form.
    uppercase_digraphs = {
        lat_letter.upper(): cyr_letter.upper()
        for lat_letter, cyr_letter in lowercase_all_digraphs.items()
    }
    capitalized_digraphs = {
        lat_letter.capitalize(): cyr_letter.capitalize()
        for lat_letter, cyr_letter in lowercase_all_digraphs.items()
    }

    all_digraphs = _cat_dicts(
        lowercase_all_digraphs, capitalized_digraphs, uppercase_digraphs
    )

    return (
        all_letters_ttable,
        all_digraphs,
        _cat_dicts(
            lowercase_letters,
            uppercase_letters,
            lowercase_digraphs,
            lowercase_escaped_digraphs,
            uppercase_digraphs,
            capitalized_digraphs,
        ),
    )
# The three constants are declared (annotated) first so each can carry its own
# documentation string; they are all assigned together further down.
LAT_TO_CYR_TTABLE: Dict[int, str]
"""Dict[int, str]: Latin → Cyrillic character translation table.

Covers only transitions from single-character Latin letters to equivalent
Cyrillic letters.
Used internally with :meth:`str.translate()` to transliterate text.
"""

# NOTE(review): the keys of this dict are multi-character strings (digraphs
# and their escaped spellings), which a str.translate() table cannot match;
# the str.translate() sentence below looks copied from LAT_TO_CYR_TTABLE --
# confirm how this dict is actually consumed.
LAT_TO_CYR_DIGRAPHS_DICT: Dict[str, str]
"""Dict[str, str]: Latin → Cyrillic digraph translation dict.

Covers only mappings from Latin digraphs to equivalent Cyrillic letters.
Used internally with :meth:`str.translate()` to transliterate text.
"""

LAT_TO_CYR_DICT: Dict[str, str]
"""Dict[str, str]: Latin → Cyrillic character translation dict.

Not used internally. Can be used to look up equivalent Cyrillic letter strings.
"""

# One generator call fills all three constants; the target order here must
# match the order of the tuple returned by _generate_lat_to_cyr_dictionary().
(
    LAT_TO_CYR_TTABLE,
    LAT_TO_CYR_DIGRAPHS_DICT,
    LAT_TO_CYR_DICT,
) = _generate_lat_to_cyr_dictionary()