blob: 9e344a0de95e708e0992c20e7a33124d14cbf420 [file] [log] [blame]
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001/* Finding the optimal width of unicode characters in a buffer */
2
3#if STRINGLIB_IS_UNICODE
4
5/* Mask to check or force alignment of a pointer to C 'long' boundaries */
6#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
7
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII byte, i.e. a byte with its high bit (0x80) set. */
10#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020011# define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitroudd4e2f02011-10-13 00:02:27 +020012#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020013# define UCS1_ASCII_CHAR_MASK 0x80808080UL
Antoine Pitroudd4e2f02011-10-13 00:02:27 +020014#else
15# error C 'long' size should be either 4 or 8!
16#endif
17
18#if STRINGLIB_SIZEOF_CHAR == 1
19
20Py_LOCAL_INLINE(Py_UCS4)
21STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
22{
23 const unsigned char *p = (const unsigned char *) begin;
24 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
25
26 while (p < end) {
27 if (!((size_t) p & LONG_PTR_MASK)) {
28 /* Help register allocation */
29 register const unsigned char *_p = p;
30 while (_p < aligned_end) {
31 unsigned long value = *(unsigned long *) _p;
32 if (value & UCS1_ASCII_CHAR_MASK)
33 return 255;
34 _p += SIZEOF_LONG;
35 }
36 p = _p;
37 if (p == end)
38 break;
39 }
40 if (*p++ & 0x80)
41 return 255;
42 }
43 return 127;
44}
45
/* Drop the helper macros used by the 1-byte scanner.  The word mask is
   defined in this file under the name UCS1_ASCII_CHAR_MASK, so that is the
   name that must be undefined. */
#undef LONG_PTR_MASK
#undef UCS1_ASCII_CHAR_MASK
48
49#else /* STRINGLIB_SIZEOF_CHAR == 1 */
50
51#define MASK_ASCII 0xFFFFFF80
52#define MASK_UCS1 0xFFFFFF00
53#define MASK_UCS2 0xFFFF0000
54
55#define MAX_CHAR_ASCII 0x7f
56#define MAX_CHAR_UCS1 0xff
57#define MAX_CHAR_UCS2 0xffff
58#define MAX_CHAR_UCS4 0x10ffff
59
60Py_LOCAL_INLINE(Py_UCS4)
61STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
62{
63#if STRINGLIB_SIZEOF_CHAR == 2
64 const Py_UCS4 mask_limit = MASK_UCS1;
65 const Py_UCS4 max_char_limit = MAX_CHAR_UCS2;
66#elif STRINGLIB_SIZEOF_CHAR == 4
67 const Py_UCS4 mask_limit = MASK_UCS2;
68 const Py_UCS4 max_char_limit = MAX_CHAR_UCS4;
69#else
70#error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4)
71#endif
72 register Py_UCS4 mask;
73 Py_ssize_t n = end - begin;
74 const STRINGLIB_CHAR *p = begin;
75 const STRINGLIB_CHAR *unrolled_end = begin + (n & ~ (Py_ssize_t) 3);
76 Py_UCS4 max_char;
77
78 max_char = MAX_CHAR_ASCII;
79 mask = MASK_ASCII;
80 while (p < unrolled_end) {
81 STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
82 if (bits & mask) {
83 if (mask == mask_limit) {
84 /* Limit reached */
85 return max_char_limit;
86 }
87 if (mask == MASK_ASCII) {
88 max_char = MAX_CHAR_UCS1;
89 mask = MASK_UCS1;
90 }
91 else {
92 /* mask can't be MASK_UCS2 because of mask_limit above */
93 assert(mask == MASK_UCS1);
94 max_char = MAX_CHAR_UCS2;
95 mask = MASK_UCS2;
96 }
97 /* We check the new mask on the same chars in the next iteration */
98 continue;
99 }
100 p += 4;
101 }
102 while (p < end) {
103 if (p[0] & mask) {
104 if (mask == mask_limit) {
105 /* Limit reached */
106 return max_char_limit;
107 }
108 if (mask == MASK_ASCII) {
109 max_char = MAX_CHAR_UCS1;
110 mask = MASK_UCS1;
111 }
112 else {
113 /* mask can't be MASK_UCS2 because of mask_limit above */
114 assert(mask == MASK_UCS1);
115 max_char = MAX_CHAR_UCS2;
116 mask = MASK_UCS2;
117 }
118 /* We check the new mask on the same chars in the next iteration */
119 continue;
120 }
121 p++;
122 }
123 return max_char;
124}
125
126#undef MASK_ASCII
127#undef MASK_UCS1
128#undef MASK_UCS2
129#undef MAX_CHAR_ASCII
130#undef MAX_CHAR_UCS1
131#undef MAX_CHAR_UCS2
132#undef MAX_CHAR_UCS4
133
134#endif /* STRINGLIB_SIZEOF_CHAR == 1 */
135#endif /* STRINGLIB_IS_UNICODE */
136