| Tor Norbye | 3a2425a | 2013-11-04 10:16:08 -0800 | [diff] [blame] | 1 | from bisect import bisect_left |
| 2 | import operator |
| 3 | import java.lang.Character |
| 4 | |
| 5 | # XXX - this is intended as a stopgap measure until 2.5.1, which will have a Java implementation |
| 6 | # requires java 6 for `normalize` function |
| 7 | # only has one version of the database |
| 8 | # does not normalize ideographs |
| 9 | |
# Per-code-point tables, filled in by the init_* functions:
#   _codepoints: code point -> property tuple
#   _eaw:        code point -> East Asian width string
#   _names:      character name -> character
_codepoints, _eaw, _names = {}, {}, {}

# Range tables; each entry is (start, (start, end), value).
_segments, _eaw_segments = [], []

# Unique sentinel meaning "no default was supplied"; distinct from None so
# that None itself can be used as a default value.
Nonesuch = object()
| 16 | |
def get_int(col):
    """Parse a data-file column as an integer.

    Returns the integer value, or None when `col` is not a valid integer
    literal (for example an empty column).
    """
    try:
        value = int(col)
    except ValueError:
        value = None
    return value
| 22 | |
def get_yn(col):
    """Map a 'Y'/'N' flag column to 1/0; anything other than 'Y' yields 0."""
    return 1 if col == 'Y' else 0
| 26 | |
def get_numeric(col):
    """Parse a numeric-value column as a float.

    Accepts either a plain number ('7', '0.5') or a simple fraction
    written as 'numerator/denominator' ('1/2').

    Returns the float value, or None when the column is empty, malformed,
    or has a zero denominator.
    """
    try:
        return float(col)
    except ValueError:
        pass

    try:
        numerator, denominator = col.split('/')
        return float(numerator) / float(denominator)
    except (ValueError, ZeroDivisionError):
        # ValueError: wrong number of '/'-separated parts, or a part that
        # is not a number.  Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate instead of being swallowed.
        return None
| 36 | |
def init_unicodedata(data):
    """Populate the lookup tables from semicolon-separated data rows.

    `data` is an iterable of rows.  Ordinary rows are stored in
    `_codepoints` (property tuple by code point) and `_names` (character
    by name).  A row whose name contains 'First' opens a range and the
    following row whose name contains 'Last' closes it; the range is
    appended to `_segments` as (start, (start, end), properties).
    """
    for row in data:
        fields = row.split(';')
        codepoint = int(fields[0], 16)
        name = fields[1]

        # Only this one range label is rewritten to a real name prefix;
        # every other range keeps its raw '<..., Last>' label.
        lookup_name = name
        if name == '<CJK Ideograph, Last>':
            lookup_name = 'CJK UNIFIED IDEOGRAPH'

        # Field order must match the *_getter indices used by get().
        record = (
            fields[2],               # category
            get_int(fields[3]),      # combining
            fields[4],               # bidirectional
            fields[5],               # decomposition
            get_int(fields[6]),      # decimal
            get_int(fields[7]),      # digit
            get_numeric(fields[8]),  # numeric
            get_yn(fields[9]),       # mirrored
            lookup_name,             # name, or name prefix for a range
        )

        if 'First' in name:
            range_start = codepoint
        elif 'Last' in name:
            _segments.append((range_start, (range_start, codepoint), record))
        else:
            _names[name] = unichr(codepoint)
            _codepoints[codepoint] = record
| 65 | |
def init_east_asian_width(data):
    """Populate the East Asian width tables from data rows.

    Each useful row has the form 'XXXX;W' or 'XXXX..YYYY;W', optionally
    followed by a '#' comment.  Single code points go into `_eaw`; ranges
    are appended to `_eaw_segments` as (start, (start, end), width).
    Comment lines and rows without a ';' are skipped.
    """
    for row in data:
        if row.startswith('#'):
            continue

        # Drop any trailing comment before splitting into fields.
        fields = row.partition('#')[0].split(';')
        if len(fields) < 2:
            continue

        width = fields[1].rstrip()
        bounds = fields[0].split('..')
        if len(bounds) != 1:
            first = int(bounds[0], 16)
            last = int(bounds[1], 16)
            _eaw_segments.append((first, (first, last), width))
        else:
            _eaw[int(bounds[0], 16)] = width
| 83 | |
| 84 | # xxx - need to normalize the segments, so |
| 85 | # <CJK Ideograph, Last> ==> CJK UNIFIED IDEOGRAPH; |
| 86 | # may need to do some sort of analysis against CPython for the normalization! |
| 87 | |
def name(unichr, default=None):
    """Return the name of the character `unichr`.

    Characters listed individually return their stored name.  Characters
    that fall inside a range return '<range name>-<HEX CODE POINT>'.
    Otherwise `default` is returned.

    NOTE(review): because `default` defaults to None rather than the
    Nonesuch sentinel, ValueError is only raised when a caller passes
    Nonesuch explicitly -- confirm whether that is intended.
    """
    codepoint = get_codepoint(unichr, "name")

    record = _codepoints.get(codepoint, None)
    if record is not None:
        return record[8]

    record = check_segments(codepoint, _segments)
    if record is not None:
        return "%s-%X" % (record[8], codepoint)

    if default is Nonesuch:
        raise ValueError()
    return default
| 101 | |
| 102 | # xxx - also need to add logic here so that if it's CJK UNIFIED |
| 103 | # IDEOGRAPH-8000, we go against the segment to verify the prefix |
| 104 | |
def lookup(name):
    """Return the character stored under `name` in the `_names` table.

    Raises KeyError when `name` is not present in the table.
    """
    character = _names[name]
    return character
| 107 | |
def check_segments(codepoint, segments):
    """Find the range in `segments` that contains `codepoint`.

    `segments` is a list of (start, (start, end), value) tuples sorted by
    `start`, with both bounds inclusive.

    Returns the `value` of the containing range, or None when `codepoint`
    lies outside every range (including when `segments` is empty).
    """
    # Index of the first segment that starts strictly after `codepoint`;
    # the only possible match is therefore the segment just before it.
    # A 1-tuple sorts before any longer tuple with the same first item, so
    # the segment payloads are never compared.
    index = bisect_left(segments, (codepoint + 1,))
    if index == 0:
        # Every segment starts after `codepoint`.  Without this guard,
        # index - 1 would wrap around to the last segment.
        return None

    segment = segments[index - 1]
    first, last = segment[1]
    if first <= codepoint <= last:
        return segment[2]
    return None
| 115 | |
| 116 | |
def get_codepoint(unichr, fn=None):
    """Validate `unichr` and return its integer code point.

    `fn` is the name of the public function being served; it is used only
    to build the error message.

    Raises TypeError if `unichr` is not a unicode object, or if it is not
    exactly one character long.
    """
    if not isinstance(unichr, unicode):
        # Format the message with the type's name; concatenating the type
        # object itself onto a string would raise an unrelated TypeError.
        raise TypeError("%s() argument 1 must be unicode, not %s"
                        % (fn, type(unichr).__name__))
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")
    return ord(unichr)
| 123 | |
def get_eaw(unichr, default, fn):
    """Return the East Asian width recorded for the character `unichr`.

    The per-code-point table is consulted first, then the range table.
    When neither has an entry, `default` is returned, or ValueError is
    raised if `default` is the Nonesuch sentinel.  `fn` is passed on to
    get_codepoint() for use in its error message.
    """
    codepoint = get_codepoint(unichr, fn)

    width = _eaw.get(codepoint, None)
    if width is None:
        width = check_segments(codepoint, _eaw_segments)

    if width is not None:
        return width
    if default is Nonesuch:
        raise ValueError()
    return default
| 135 | |
def get(unichr, default, fn, getter):
    """Return one property of the character `unichr`.

    The property tuple is looked up per code point first, then by range;
    `getter` extracts the wanted field from it.  If the character is
    unknown, or the extracted field is None, `default` is returned, or
    ValueError is raised if `default` is the Nonesuch sentinel.  `fn` is
    passed on to get_codepoint() for use in its error message.
    """
    codepoint = get_codepoint(unichr, fn)

    record = _codepoints.get(codepoint, None)
    if record is None:
        record = check_segments(codepoint, _segments)

    value = None
    if record is not None:
        value = getter(record)

    if value is not None:
        return value
    if default is Nonesuch:
        raise ValueError()
    return default
| 152 | |
# Field extractors for a property tuple, in positional order (indices 0-7).
(category_getter,
 combining_getter,
 bidirectional_getter,
 decomposition_getter,
 decimal_getter,
 digit_getter,
 numeric_getter,
 mirrored_getter) = map(operator.itemgetter, range(8))
| 161 | |
def decimal(unichr, default=Nonesuch):
    """Return the decimal value of `unichr`.

    If no value is recorded, return `default`, or raise ValueError when no
    default was given.
    """
    return get(unichr, default, fn='decimal', getter=decimal_getter)
| 164 | |
def decomposition(unichr, default=''):
    """Return the decomposition field of `unichr`.

    `default` (an empty string unless overridden) is returned for unknown
    characters.
    """
    return get(unichr, default, fn='decomposition',
               getter=decomposition_getter)
| 167 | |
def digit(unichr, default=Nonesuch):
    """Return the digit value of `unichr`.

    If no value is recorded, return `default`, or raise ValueError when no
    default was given.
    """
    return get(unichr, default, fn='digit', getter=digit_getter)
| 170 | |
def numeric(unichr, default=Nonesuch):
    """Return the numeric value of `unichr` as a float.

    If no value is recorded, return `default`, or raise ValueError when no
    default was given.
    """
    return get(unichr, default, fn='numeric', getter=numeric_getter)
| 173 | |
def category(unichr):
    """Return the category field of `unichr`, or 'Cn' if it is unknown."""
    # The third argument names this function in get_codepoint()'s error
    # message, so it must be spelled exactly like the function.
    return get(unichr, 'Cn', 'category', category_getter)
| 176 | |
def bidirectional(unichr):
    """Return the bidirectional field of `unichr`, or '' if it is unknown."""
    return get(unichr, '', fn='bidirectional', getter=bidirectional_getter)
| 179 | |
def combining(unichr):
    """Return the combining field of `unichr` as an integer.

    Returns 0 when the character is unknown or has no value recorded.
    """
    return get(unichr, 0, fn='combining', getter=combining_getter)
| 182 | |
def mirrored(unichr):
    """Return the mirrored flag of `unichr` (1 or 0); 0 if it is unknown."""
    return get(unichr, 0, fn='mirrored', getter=mirrored_getter)
| 185 | |
def east_asian_width(unichr):
    """Return the East Asian width of `unichr`, or 'N' if none is recorded."""
    return get_eaw(unichr, default='N', fn='east_asian_width')
| 188 | |
def jymirrored(unichr):
    """Return java.lang.Character.isMirrored() for the character `unichr`.

    Unlike mirrored(), this asks the Java runtime instead of the tables
    loaded by this module.
    """
    codepoint = get_codepoint(unichr, 'mirrored')
    return java.lang.Character.isMirrored(codepoint)
| 191 | |
# normalize() is only defined when java.text.Normalizer can be imported;
# otherwise the module simply does not provide it.
try:
    from java.text import Normalizer

    # Normalization form name -> java.text.Normalizer.Form constant.
    _forms = {
        'NFC': Normalizer.Form.NFC,
        'NFD': Normalizer.Form.NFD,
        'NFKC': Normalizer.Form.NFKC,
        'NFKD': Normalizer.Form.NFKD,
    }

    def normalize(form, unistr):
        """
        Return the normal form 'form' for the Unicode string unistr.  Valid
        values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.

        Raises ValueError for any other value of form.
        """
        if form not in _forms:
            raise ValueError('invalid normalization form')
        return Normalizer.normalize(unistr, _forms[form])

except ImportError:
    pass
| 216 | |
| 217 | |
def init():
    """Load the data files that sit next to this module.

    Reads 'UnicodeData.txt' and then 'EastAsianWidth.txt' through the
    module's loader and feeds them to init_unicodedata() and
    init_east_asian_width() respectively.
    """
    import pkgutil
    import os.path
    import StringIO

    my_path = os.path.dirname(__file__)
    loader = pkgutil.get_loader('unicodedata')

    def open_data(filename):
        # Fetch the file's contents via the loader and expose them as a
        # file-like object so the init_* functions can iterate over rows.
        contents = loader.get_data(os.path.join(my_path, filename))
        return StringIO.StringIO(contents)

    init_unicodedata(open_data('UnicodeData.txt'))
    init_east_asian_width(open_data('EastAsianWidth.txt'))


# Populate the tables as a side effect of importing the module.
init()