Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2# srtools 

3# Copyright (C) 2019-2021 Andrej Radović <r.andrej@gmail.com> 

4# 

5# This program is free software: you can redistribute it and/or modify 

6# it under the terms of the GNU General Public License as published by 

7# the Free Software Foundation, either version 3 of the License, or 

8# (at your option) any later version. 

9# 

10# This program is distributed in the hope that it will be useful, 

11# but WITHOUT ANY WARRANTY; without even the implied warranty of 

12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

13# GNU General Public License for more details. 

14# 

15# You should have received a copy of the GNU General Public License 

16# along with this program. If not, see <http://www.gnu.org/licenses/>. 

from typing import Any
from typing import Dict
from typing import Set
from typing import Tuple
from unicodedata import normalize

21 

# Every lowercase Serbian Cyrillic letter paired with its Latin spelling.
# Three Cyrillic letters (љ, њ, џ) are spelled with a two-letter Latin digraph.
_CYR_LAT_LOWERCASE_LETTER_PAIRS: Set[Tuple[str, str]] = {
    ("а", "a"),
    ("б", "b"),
    ("в", "v"),
    ("г", "g"),
    ("д", "d"),
    ("ђ", "đ"),
    ("е", "e"),
    ("ж", "ž"),
    ("з", "z"),
    ("и", "i"),
    ("ј", "j"),
    ("к", "k"),
    ("л", "l"),
    ("љ", "lj"),
    ("м", "m"),
    ("н", "n"),
    ("њ", "nj"),
    ("о", "o"),
    ("п", "p"),
    ("р", "r"),
    ("с", "s"),
    ("т", "t"),
    ("ћ", "ć"),
    ("у", "u"),
    ("ф", "f"),
    ("х", "h"),
    ("ц", "c"),
    ("ч", "č"),
    ("џ", "dž"),
    ("ш", "š"),
}

# Combining marks that are attached to the vowels below.
_SERBIAN_ACCENTS: Set[str] = {
    "\N{COMBINING DOUBLE GRAVE ACCENT}",
    "\N{COMBINING GRAVE ACCENT}",
    "\N{COMBINING INVERTED BREVE}",
    "\N{COMBINING ACUTE ACCENT}",
    "\N{COMBINING MACRON}",
    "\N{COMBINING OVERLINE}",
    "\N{COMBINING CIRCUMFLEX ACCENT}",
    "\N{COMBINING BREVE}",
}

# The five vowels, as (Cyrillic, Latin) pairs.
_VOWEL_PAIRS: Set[Tuple[str, str]] = {
    ("а", "a"),
    ("е", "e"),
    ("и", "i"),
    ("о", "o"),
    ("у", "u"),
}

# Each vowel pair combined with each accent; both members are NFC-normalized,
# so a precomposed character is used wherever Unicode defines one.
_ACCENTED_VOWEL_PAIRS = {
    tuple(normalize("NFC", vowel + accent) for vowel in vowel_pair)
    for accent in _SERBIAN_ACCENTS
    for vowel_pair in _VOWEL_PAIRS
}

# NOTE(review): set.union() returns a new set and leaves its operand untouched,
# so this statement has no effect and the accented pairs never become part of
# _CYR_LAT_LOWERCASE_LETTER_PAIRS. Do not simply switch to update(): some
# accented forms stay two code points long after NFC (e.g. Cyrillic "а" with a
# double grave), and the translation tables built from this set go through
# str.maketrans(), which rejects string keys longer than one character.
_CYR_LAT_LOWERCASE_LETTER_PAIRS.union(_ACCENTED_VOWEL_PAIRS)

78 

79 

def _cat_dicts(*args: dict) -> Dict[Any, Any]:
    """Merge any number of dicts into a single new dict.

    Args:
        *args: Dicts to merge, applied from left to right.

    Returns:
        A new dict holding the keys of all arguments. When a key occurs in
        more than one argument, the value from the rightmost one wins. The
        arguments themselves are left unmodified; with no arguments an empty
        dict is returned.
    """
    result: Dict[Any, Any] = {}
    for dct in args:
        result.update(dct)
    return result

85 

86 

def _generate_cyr_to_lat_dictionary() -> Dict[str, str]:
    """Build the Cyrillic → Latin lookup for lowercase and uppercase letters.

    Returns:
        A dict mapping every lowercase Cyrillic letter found in
        ``_CYR_LAT_LOWERCASE_LETTER_PAIRS``, and its uppercase form, to the
        Latin equivalent. An uppercase Cyrillic letter maps to the
        *capitalized* Latin string, so a digraph gets only its first letter
        uppercased (``"Љ"`` → ``"Lj"``).
    """
    pairs = _CYR_LAT_LOWERCASE_LETTER_PAIRS
    # A collection of (cyr, lat) 2-tuples converts directly into a dict.
    mapping: Dict[str, str] = dict(pairs)
    # Uppercase entries follow all lowercase ones.
    mapping.update((cyr.upper(), lat.capitalize()) for cyr, lat in pairs)
    return mapping

97 

98 

CYR_TO_LAT_DICT: Dict[str, str] = _generate_cyr_to_lat_dictionary()
"""Dict[str, str]: Lookup from a Cyrillic letter to its Latin spelling.

Provided for looking up the Latin equivalent of a letter. Transliteration
itself does not go through this dict; the translation table below is derived
from it.
"""

CYR_TO_LAT_TTABLE: Dict[int, str] = str.maketrans(CYR_TO_LAT_DICT)
"""Dict[int, str]: Translation table from Cyrillic code points to Latin text.

Built from :data:`CYR_TO_LAT_DICT` and used internally with
:meth:`str.translate()` to transliterate text.
"""

# Placed between the two letters of a Latin digraph (e.g. "l!j") to form the
# escaped digraph keys of the Latin → Cyrillic tables, which map to two
# separate Cyrillic letters instead of the single digraph letter.
DIGRAPH_ESCAPE_CHARACTER = "!"

112 

113 

def _generate_lat_to_cyr_dictionary() -> (
    Tuple[Dict[int, str], Dict[str, str], Dict[str, str]]
):
    """Build the Latin → Cyrillic lookup structures.

    Single-character Latin letters and two-character Latin digraphs are
    taken from ``_CYR_LAT_LOWERCASE_LETTER_PAIRS`` and handled separately.
    For every digraph an *escaped* variant is generated as well: the two
    letters separated by ``DIGRAPH_ESCAPE_CHARACTER``, mapped to the result
    of translating each letter on its own (``"l!j"`` → ``"лј"``).

    Returns:
        A 3-tuple ``(letters_ttable, digraphs, everything)``:

        * ``letters_ttable`` -- a :meth:`str.maketrans` table covering the
          single-character Latin letters in lowercase and uppercase.
        * ``digraphs`` -- a dict covering plain and escaped digraphs in
          lowercase, capitalized and uppercase form.
        * ``everything`` -- a dict combining the single-letter mappings
          with all digraph mappings.
    """
    # Single-character Latin letters, both cases.
    lowercase_letters = {
        lat_letter: cyr_letter
        for cyr_letter, lat_letter in _CYR_LAT_LOWERCASE_LETTER_PAIRS
        if len(lat_letter) == 1
    }
    uppercase_letters = {
        lat_letter.upper(): cyr_letter.upper()
        for lat_letter, cyr_letter in lowercase_letters.items()
    }
    all_letters_dict = _cat_dicts(lowercase_letters, uppercase_letters)
    all_letters_ttable = str.maketrans(all_letters_dict)

    # Two-character Latin digraphs that stand for one Cyrillic letter.
    lowercase_digraphs = {
        lat_letter: cyr_letter
        for cyr_letter, lat_letter in _CYR_LAT_LOWERCASE_LETTER_PAIRS
        if len(lat_letter) == 2
    }
    # An escaped digraph translates letter by letter instead of collapsing
    # into the single Cyrillic digraph letter.
    lowercase_escaped_digraphs = {
        (
            lat_letter[0] + DIGRAPH_ESCAPE_CHARACTER + lat_letter[1]
        ): lat_letter.translate(all_letters_ttable)
        for lat_letter in lowercase_digraphs
    }
    lowercase_all_digraphs = _cat_dicts(
        lowercase_digraphs, lowercase_escaped_digraphs
    )

    # Case variants of both plain and escaped digraphs: "LJ" and "Lj".
    uppercase_digraphs = {
        lat_letter.upper(): cyr_letter.upper()
        for lat_letter, cyr_letter in lowercase_all_digraphs.items()
    }
    capitalized_digraphs = {
        lat_letter.capitalize(): cyr_letter.capitalize()
        for lat_letter, cyr_letter in lowercase_all_digraphs.items()
    }
    all_digraphs = _cat_dicts(
        lowercase_all_digraphs, capitalized_digraphs, uppercase_digraphs
    )

    return (
        all_letters_ttable,
        all_digraphs,
        _cat_dicts(
            all_letters_dict,
            lowercase_all_digraphs,
            uppercase_digraphs,
            capitalized_digraphs,
        ),
    )

174 

175 

LAT_TO_CYR_TTABLE: Dict[int, str]
"""Dict[int, str]: Translation table from Latin code points to Cyrillic.

Only single-character Latin letters are covered; digraphs are not part of
this table. Used internally with :meth:`str.translate()` to transliterate
text.
"""

LAT_TO_CYR_DIGRAPHS_DICT: Dict[str, str]
"""Dict[str, str]: Lookup from Latin digraphs to Cyrillic letters.

Only digraph mappings are covered, including their escaped forms; single
letters are not part of this dict. Used internally to transliterate text.
NOTE(review): the keys are multi-character strings, which
:meth:`str.translate()` cannot look up, so this dict is presumably applied
in a separate replacement step -- confirm against the caller.
"""

LAT_TO_CYR_DICT: Dict[str, str]
"""Dict[str, str]: Lookup from a Latin letter or digraph to Cyrillic.

Not used internally. Provided for looking up equivalent Cyrillic letter
strings.
"""

# All three tables come from one generator call, unpacked in return order.
LAT_TO_CYR_TTABLE, LAT_TO_CYR_DIGRAPHS_DICT, LAT_TO_CYR_DICT = (
    _generate_lat_to_cyr_dictionary()
)