/* Finding the optimal width of unicode characters in a buffer */

#if STRINGLIB_IS_UNICODE

Antoine Pitrou | dd4e2f0 | 2011-10-13 00:02:27 +0200 | [diff] [blame] | 5 | /* Mask to quickly check whether a C 'long' contains a |
| 6 | non-ASCII, UTF8-encoded char. */ |
| 7 | #if (SIZEOF_LONG == 8) |
Mark Dickinson | 01ac8b6 | 2012-07-07 14:08:48 +0200 | [diff] [blame] | 8 | # define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL |
Antoine Pitrou | dd4e2f0 | 2011-10-13 00:02:27 +0200 | [diff] [blame] | 9 | #elif (SIZEOF_LONG == 4) |
Mark Dickinson | 01ac8b6 | 2012-07-07 14:08:48 +0200 | [diff] [blame] | 10 | # define UCS1_ASCII_CHAR_MASK 0x80808080UL |
Antoine Pitrou | dd4e2f0 | 2011-10-13 00:02:27 +0200 | [diff] [blame] | 11 | #else |
| 12 | # error C 'long' size should be either 4 or 8! |
| 13 | #endif |

#if STRINGLIB_SIZEOF_CHAR == 1

| 17 | Py_LOCAL_INLINE(Py_UCS4) |
| 18 | STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) |
| 19 | { |
| 20 | const unsigned char *p = (const unsigned char *) begin; |
Antoine Pitrou | ca8aa4a | 2012-09-20 20:56:47 +0200 | [diff] [blame] | 21 | const unsigned char *aligned_end = |
| 22 | (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); |
Antoine Pitrou | dd4e2f0 | 2011-10-13 00:02:27 +0200 | [diff] [blame] | 23 | |
| 24 | while (p < end) { |
Antoine Pitrou | ca8aa4a | 2012-09-20 20:56:47 +0200 | [diff] [blame] | 25 | if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { |
Antoine Pitrou | dd4e2f0 | 2011-10-13 00:02:27 +0200 | [diff] [blame] | 26 | /* Help register allocation */ |
| 27 | register const unsigned char *_p = p; |
| 28 | while (_p < aligned_end) { |
| 29 | unsigned long value = *(unsigned long *) _p; |
| 30 | if (value & UCS1_ASCII_CHAR_MASK) |
| 31 | return 255; |
| 32 | _p += SIZEOF_LONG; |
| 33 | } |
| 34 | p = _p; |
| 35 | if (p == end) |
| 36 | break; |
| 37 | } |
| 38 | if (*p++ & 0x80) |
| 39 | return 255; |
| 40 | } |
| 41 | return 127; |
| 42 | } |
| 43 | |
/* Drop the word mask defined at the top of this file so it does not leak
   to includers; it is the only mask macro this file defines before this
   point. */
#undef UCS1_ASCII_CHAR_MASK

#else /* STRINGLIB_SIZEOF_CHAR == 1 */

/* Bit masks used by find_max_char below: a character fits in the named
   range iff none of the mask's bits are set in it. */
#define MASK_ASCII 0xFFFFFF80
#define MASK_UCS1 0xFFFFFF00
#define MASK_UCS2 0xFFFF0000

/* Values find_max_char reports for each range (the range's upper limit). */
#define MAX_CHAR_ASCII 0x7f
#define MAX_CHAR_UCS1 0xff
#define MAX_CHAR_UCS2 0xffff
#define MAX_CHAR_UCS4 0x10ffff

| 57 | Py_LOCAL_INLINE(Py_UCS4) |
| 58 | STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) |
| 59 | { |
| 60 | #if STRINGLIB_SIZEOF_CHAR == 2 |
| 61 | const Py_UCS4 mask_limit = MASK_UCS1; |
| 62 | const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; |
| 63 | #elif STRINGLIB_SIZEOF_CHAR == 4 |
| 64 | const Py_UCS4 mask_limit = MASK_UCS2; |
| 65 | const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; |
| 66 | #else |
| 67 | #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) |
| 68 | #endif |
| 69 | register Py_UCS4 mask; |
| 70 | Py_ssize_t n = end - begin; |
| 71 | const STRINGLIB_CHAR *p = begin; |
Antoine Pitrou | ca8aa4a | 2012-09-20 20:56:47 +0200 | [diff] [blame] | 72 | const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); |
Antoine Pitrou | dd4e2f0 | 2011-10-13 00:02:27 +0200 | [diff] [blame] | 73 | Py_UCS4 max_char; |
| 74 | |
| 75 | max_char = MAX_CHAR_ASCII; |
| 76 | mask = MASK_ASCII; |
| 77 | while (p < unrolled_end) { |
| 78 | STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; |
| 79 | if (bits & mask) { |
| 80 | if (mask == mask_limit) { |
| 81 | /* Limit reached */ |
| 82 | return max_char_limit; |
| 83 | } |
| 84 | if (mask == MASK_ASCII) { |
| 85 | max_char = MAX_CHAR_UCS1; |
| 86 | mask = MASK_UCS1; |
| 87 | } |
| 88 | else { |
| 89 | /* mask can't be MASK_UCS2 because of mask_limit above */ |
| 90 | assert(mask == MASK_UCS1); |
| 91 | max_char = MAX_CHAR_UCS2; |
| 92 | mask = MASK_UCS2; |
| 93 | } |
| 94 | /* We check the new mask on the same chars in the next iteration */ |
| 95 | continue; |
| 96 | } |
| 97 | p += 4; |
| 98 | } |
| 99 | while (p < end) { |
| 100 | if (p[0] & mask) { |
| 101 | if (mask == mask_limit) { |
| 102 | /* Limit reached */ |
| 103 | return max_char_limit; |
| 104 | } |
| 105 | if (mask == MASK_ASCII) { |
| 106 | max_char = MAX_CHAR_UCS1; |
| 107 | mask = MASK_UCS1; |
| 108 | } |
| 109 | else { |
| 110 | /* mask can't be MASK_UCS2 because of mask_limit above */ |
| 111 | assert(mask == MASK_UCS1); |
| 112 | max_char = MAX_CHAR_UCS2; |
| 113 | mask = MASK_UCS2; |
| 114 | } |
| 115 | /* We check the new mask on the same chars in the next iteration */ |
| 116 | continue; |
| 117 | } |
| 118 | p++; |
| 119 | } |
| 120 | return max_char; |
| 121 | } |
| 122 | |
/* Remove the helper macros defined for the 2/4-byte find_max_char above so
   they do not leak to includers. */
#undef MASK_ASCII
#undef MASK_UCS1
#undef MASK_UCS2
#undef MAX_CHAR_ASCII
#undef MAX_CHAR_UCS1
#undef MAX_CHAR_UCS2
#undef MAX_CHAR_UCS4

#endif /* STRINGLIB_SIZEOF_CHAR == 1 */
#endif /* STRINGLIB_IS_UNICODE */
