Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 1 | """ Test script for the Unicode implementation. |
| 2 | |
| 3 | Written by Bill Tutt. |
Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 4 | Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 5 | |
| 6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
| 7 | |
| 8 | """#" |
Barry Warsaw | 04f357c | 2002-07-23 19:04:11 +0000 | [diff] [blame] | 9 | from test.test_support import verify, verbose |
Marc-André Lemburg | 3661908 | 2001-01-17 19:11:13 +0000 | [diff] [blame] | 10 | |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 11 | print 'Testing General Unicode Character Name, and case insensitivity...', |
| 12 | |
| 13 | # General and case insensitivity test: |
Fredrik Lundh | f605606 | 2001-01-20 11:15:25 +0000 | [diff] [blame] | 14 | try: |
| 15 | # put all \N escapes inside exec'd raw strings, to make sure this |
| 16 | # script runs even if the compiler chokes on \N escapes |
| 17 | exec r""" |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 18 | s = u"\N{LATIN CAPITAL LETTER T}" \ |
| 19 | u"\N{LATIN SMALL LETTER H}" \ |
| 20 | u"\N{LATIN SMALL LETTER E}" \ |
| 21 | u"\N{SPACE}" \ |
| 22 | u"\N{LATIN SMALL LETTER R}" \ |
| 23 | u"\N{LATIN CAPITAL LETTER E}" \ |
| 24 | u"\N{LATIN SMALL LETTER D}" \ |
| 25 | u"\N{SPACE}" \ |
| 26 | u"\N{LATIN SMALL LETTER f}" \ |
| 27 | u"\N{LATIN CAPITAL LeTtEr o}" \ |
| 28 | u"\N{LATIN SMaLl LETTER x}" \ |
| 29 | u"\N{SPACE}" \ |
| 30 | u"\N{LATIN SMALL LETTER A}" \ |
| 31 | u"\N{LATIN SMALL LETTER T}" \ |
| 32 | u"\N{LATIN SMALL LETTER E}" \ |
| 33 | u"\N{SPACE}" \ |
| 34 | u"\N{LATIN SMALL LETTER T}" \ |
| 35 | u"\N{LATIN SMALL LETTER H}" \ |
| 36 | u"\N{LATIN SMALL LETTER E}" \ |
| 37 | u"\N{SpAcE}" \ |
| 38 | u"\N{LATIN SMALL LETTER S}" \ |
| 39 | u"\N{LATIN SMALL LETTER H}" \ |
| 40 | u"\N{LATIN SMALL LETTER E}" \ |
| 41 | u"\N{LATIN SMALL LETTER E}" \ |
| 42 | u"\N{LATIN SMALL LETTER P}" \ |
| 43 | u"\N{FULL STOP}" |
Marc-André Lemburg | 3661908 | 2001-01-17 19:11:13 +0000 | [diff] [blame] | 44 | verify(s == u"The rEd fOx ate the sheep.", s) |
Fredrik Lundh | f605606 | 2001-01-20 11:15:25 +0000 | [diff] [blame] | 45 | """ |
| 46 | except UnicodeError, v: |
| 47 | print v |
Fredrik Lundh | 2acb54a | 2001-01-19 11:13:46 +0000 | [diff] [blame] | 48 | print "done." |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 49 | |
Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 50 | import unicodedata |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 51 | |
Fredrik Lundh | 2acb54a | 2001-01-19 11:13:46 +0000 | [diff] [blame] | 52 | print "Testing name to code mapping....", |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 53 | for char in "SPAM": |
| 54 | name = "LATIN SMALL LETTER %s" % char |
Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 55 | code = unicodedata.lookup(name) |
| 56 | verify(unicodedata.name(code) == name) |
Fredrik Lundh | 2acb54a | 2001-01-19 11:13:46 +0000 | [diff] [blame] | 57 | print "done." |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 58 | |
Martin v. Löwis | 8579efc | 2002-11-23 17:11:42 +0000 | [diff] [blame] | 59 | print "Testing hangul syllable names....", |
| 60 | exec r""" |
| 61 | verify(u"\N{HANGUL SYLLABLE GA}" == u"\uac00") |
| 62 | verify(u"\N{HANGUL SYLLABLE GGWEOSS}" == u"\uafe8") |
| 63 | verify(u"\N{HANGUL SYLLABLE DOLS}" == u"\ub3d0") |
| 64 | verify(u"\N{HANGUL SYLLABLE RYAN}" == u"\ub7b8") |
| 65 | verify(u"\N{HANGUL SYLLABLE MWIK}" == u"\ubba0") |
| 66 | verify(u"\N{HANGUL SYLLABLE BBWAEM}" == u"\ubf88") |
| 67 | verify(u"\N{HANGUL SYLLABLE SSEOL}" == u"\uc370") |
| 68 | verify(u"\N{HANGUL SYLLABLE YI}" == u"\uc758") |
| 69 | verify(u"\N{HANGUL SYLLABLE JJYOSS}" == u"\ucb40") |
| 70 | verify(u"\N{HANGUL SYLLABLE KYEOLS}" == u"\ucf28") |
| 71 | verify(u"\N{HANGUL SYLLABLE PAN}" == u"\ud310") |
| 72 | verify(u"\N{HANGUL SYLLABLE HWEOK}" == u"\ud6f8") |
| 73 | verify(u"\N{HANGUL SYLLABLE HIH}" == u"\ud7a3") |
| 74 | """ |
| 75 | try: |
| 76 | unicodedata.name(u"\ud7a4") |
| 77 | except ValueError: |
| 78 | pass |
| 79 | else: |
| 80 | raise AssertionError, "Found name for U+D7A4" |
| 81 | print "done." |
| 82 | |
Martin v. Löwis | ef7fe2e | 2002-11-23 18:01:32 +0000 | [diff] [blame] | 83 | print "Testing names of CJK unified ideographs....", |
| 84 | exec r""" |
| 85 | verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400") |
| 86 | verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5") |
| 87 | verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00") |
| 88 | verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5") |
| 89 | verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000") |
| 90 | verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6") |
| 91 | """ |
| 92 | print "done." |
| 93 | |
| 94 | print "Testing code to name mapping for all BMP characters....", |
Fredrik Lundh | 2acb54a | 2001-01-19 11:13:46 +0000 | [diff] [blame] | 95 | count = 0 |
Martin v. Löwis | ef7fe2e | 2002-11-23 18:01:32 +0000 | [diff] [blame] | 96 | for code in range(0x10000): |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 97 | try: |
Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 98 | char = unichr(code) |
| 99 | name = unicodedata.name(char) |
Fredrik Lundh | 06d1268 | 2001-01-24 07:59:11 +0000 | [diff] [blame] | 100 | except (KeyError, ValueError): |
Fredrik Lundh | ee865c6 | 2001-01-19 11:00:42 +0000 | [diff] [blame] | 101 | pass |
Martin v. Löwis | ef7fe2e | 2002-11-23 18:01:32 +0000 | [diff] [blame] | 102 | else: |
| 103 | verify(unicodedata.lookup(name) == char) |
| 104 | count += 1 |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 105 | print "done." |
| 106 | |
Fredrik Lundh | 2acb54a | 2001-01-19 11:13:46 +0000 | [diff] [blame] | 107 | print "Found", count, "characters in the unicode name database" |
| 108 | |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 109 | # misc. symbol testing |
| 110 | print "Testing misc. symbols for unicode character name expansion....", |
Fredrik Lundh | f605606 | 2001-01-20 11:15:25 +0000 | [diff] [blame] | 111 | exec r""" |
Marc-André Lemburg | 3661908 | 2001-01-17 19:11:13 +0000 | [diff] [blame] | 112 | verify(u"\N{PILCROW SIGN}" == u"\u00b6") |
| 113 | verify(u"\N{REPLACEMENT CHARACTER}" == u"\uFFFD") |
| 114 | verify(u"\N{HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK}" == u"\uFF9F") |
| 115 | verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41") |
Fredrik Lundh | f605606 | 2001-01-20 11:15:25 +0000 | [diff] [blame] | 116 | """ |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 117 | print "done." |
| 118 | |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 119 | # strict error testing: |
| 120 | print "Testing unicode character name expansion strict error handling....", |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 121 | try: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 122 | unicode("\N{blah}", 'unicode-escape', 'strict') |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 123 | except UnicodeError: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 124 | pass |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 125 | else: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 126 | raise AssertionError, "failed to raise an exception when given a bogus character name" |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 127 | |
| 128 | try: |
Fredrik Lundh | 0fdb90c | 2001-01-19 09:45:02 +0000 | [diff] [blame] | 129 | unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict') |
| 130 | except UnicodeError: |
| 131 | pass |
| 132 | else: |
| 133 | raise AssertionError, "failed to raise an exception when given a very " \ |
| 134 | "long bogus character name" |
| 135 | |
| 136 | try: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 137 | unicode("\N{SPACE", 'unicode-escape', 'strict') |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 138 | except UnicodeError: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 139 | pass |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 140 | else: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 141 | raise AssertionError, "failed to raise an exception for a missing closing brace." |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 142 | |
| 143 | try: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 144 | unicode("\NSPACE", 'unicode-escape', 'strict') |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 145 | except UnicodeError: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 146 | pass |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 147 | else: |
Fred Drake | 004d5e6 | 2000-10-23 17:22:08 +0000 | [diff] [blame] | 148 | raise AssertionError, "failed to raise an exception for a missing opening brace." |
Marc-André Lemburg | 6cdec2e | 2000-06-30 09:45:20 +0000 | [diff] [blame] | 149 | print "done." |