blob: 88a97a4cb5f71f1e47a2dfbd52d6bafd7d0b0497 [file] [log] [blame]
Victor Stinner75e46992018-11-26 17:29:38 +01001#ifndef Py_CPYTHON_UNICODEOBJECT_H
2# error "this header file must not be included directly"
3#endif
4
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
Victor Stinner75e46992018-11-26 17:29:38 +010011
12/* --- Internal Unicode Operations ---------------------------------------- */
13
/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

   Code points below 128 are answered from the table; everything else is
   delegated to _PyUnicode_IsWhitespace().  Note that `ch` is evaluated
   more than once, so it must not have side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
22
/* Character classification: each macro forwards to the matching
   _PyUnicode_Is*() helper. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversion: forwards to the matching _PyUnicode_To*case() helper. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric classification. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value extraction. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if `ch` is alphabetic, a decimal digit, a digit or numeric.
   `ch` may be evaluated up to four times (once per test that runs), so it
   must not have side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
48
Inada Naoki2c4928d2020-06-17 20:09:44 +090049Py_DEPRECATED(3.3) static inline void
50Py_UNICODE_COPY(Py_UNICODE *target, const Py_UNICODE *source, Py_ssize_t length) {
51 memcpy(target, source, length * sizeof(Py_UNICODE));
52}
Victor Stinner75e46992018-11-26 17:29:38 +010053
Inada Naoki2c4928d2020-06-17 20:09:44 +090054Py_DEPRECATED(3.3) static inline void
55Py_UNICODE_FILL(Py_UNICODE *target, Py_UNICODE value, Py_ssize_t length) {
Inada Naoki8e34e922020-06-17 23:43:01 +090056 Py_ssize_t i;
57 for (i = 0; i < length; i++) {
Inada Naoki2c4928d2020-06-17 20:09:44 +090058 target[i] = value;
59 }
60}
Victor Stinner75e46992018-11-26 17:29:38 +010061
/* Macros to work with surrogates.
   The IS_* range tests evaluate `ch` twice; avoid side effects in the
   argument. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
74
Victor Stinner75e46992018-11-26 17:29:38 +010075/* --- Unicode Type ------------------------------------------------------- */
76
77/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
78 structure. state.ascii and state.compact are set, and the data
79 immediately follow the structure. utf8_length and wstr_length can be found
80 in the length field; the utf8 pointer is equal to the data pointer. */
81typedef struct {
82 /* There are 4 forms of Unicode strings:
83
84 - compact ascii:
85
86 * structure = PyASCIIObject
87 * test: PyUnicode_IS_COMPACT_ASCII(op)
88 * kind = PyUnicode_1BYTE_KIND
89 * compact = 1
90 * ascii = 1
91 * ready = 1
92 * (length is the length of the utf8 and wstr strings)
93 * (data starts just after the structure)
94 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
95
96 - compact:
97
98 * structure = PyCompactUnicodeObject
99 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
100 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
101 PyUnicode_4BYTE_KIND
102 * compact = 1
103 * ready = 1
104 * ascii = 0
105 * utf8 is not shared with data
106 * utf8_length = 0 if utf8 is NULL
107 * wstr is shared with data and wstr_length=length
108 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
109 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
110 * wstr_length = 0 if wstr is NULL
111 * (data starts just after the structure)
112
113 - legacy string, not ready:
114
115 * structure = PyUnicodeObject
116 * test: kind == PyUnicode_WCHAR_KIND
117 * length = 0 (use wstr_length)
118 * hash = -1
119 * kind = PyUnicode_WCHAR_KIND
120 * compact = 0
121 * ascii = 0
122 * ready = 0
123 * interned = SSTATE_NOT_INTERNED
124 * wstr is not NULL
125 * data.any is NULL
126 * utf8 is NULL
127 * utf8_length = 0
128
129 - legacy string, ready:
130
131 * structure = PyUnicodeObject structure
132 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
133 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
134 PyUnicode_4BYTE_KIND
135 * compact = 0
136 * ready = 1
137 * data.any is not NULL
138 * utf8 is shared and utf8_length = length with data.any if ascii = 1
139 * utf8_length = 0 if utf8 is NULL
140 * wstr is shared with data.any and wstr_length = length
141 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
142 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
143 * wstr_length = 0 if wstr is NULL
144
145 Compact strings use only one memory block (structure + characters),
146 whereas legacy strings use one block for the structure and one block
147 for characters.
148
149 Legacy strings are created by PyUnicode_FromUnicode() and
150 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
151 when PyUnicode_READY() is called.
152
153 See also _PyUnicode_CheckConsistency().
154 */
155 PyObject_HEAD
156 Py_ssize_t length; /* Number of code points in the string */
157 Py_hash_t hash; /* Hash value; -1 if not set */
158 struct {
159 /*
160 SSTATE_NOT_INTERNED (0)
161 SSTATE_INTERNED_MORTAL (1)
162 SSTATE_INTERNED_IMMORTAL (2)
163
164 If interned != SSTATE_NOT_INTERNED, the two references from the
165 dictionary to this object are *not* counted in ob_refcnt.
166 */
167 unsigned int interned:2;
168 /* Character size:
169
170 - PyUnicode_WCHAR_KIND (0):
171
172 * character type = wchar_t (16 or 32 bits, depending on the
173 platform)
174
175 - PyUnicode_1BYTE_KIND (1):
176
177 * character type = Py_UCS1 (8 bits, unsigned)
178 * all characters are in the range U+0000-U+00FF (latin1)
179 * if ascii is set, all characters are in the range U+0000-U+007F
180 (ASCII), otherwise at least one character is in the range
181 U+0080-U+00FF
182
183 - PyUnicode_2BYTE_KIND (2):
184
185 * character type = Py_UCS2 (16 bits, unsigned)
186 * all characters are in the range U+0000-U+FFFF (BMP)
187 * at least one character is in the range U+0100-U+FFFF
188
189 - PyUnicode_4BYTE_KIND (4):
190
191 * character type = Py_UCS4 (32 bits, unsigned)
192 * all characters are in the range U+0000-U+10FFFF
193 * at least one character is in the range U+10000-U+10FFFF
194 */
195 unsigned int kind:3;
196 /* Compact is with respect to the allocation scheme. Compact unicode
197 objects only require one memory block while non-compact objects use
198 one block for the PyUnicodeObject struct and another for its data
199 buffer. */
200 unsigned int compact:1;
201 /* The string only contains characters in the range U+0000-U+007F (ASCII)
202 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
203 set, use the PyASCIIObject structure. */
204 unsigned int ascii:1;
205 /* The ready flag indicates whether the object layout is initialized
206 completely. This means that this is either a compact object, or
207 the data pointer is filled out. The bit is redundant, and helps
208 to minimize the test in PyUnicode_IS_READY(). */
209 unsigned int ready:1;
210 /* Padding to ensure that PyUnicode_DATA() is always aligned to
211 4 bytes (see issue #19537 on m68k). */
212 unsigned int :24;
213 } state;
214 wchar_t *wstr; /* wchar_t representation (null-terminated) */
215} PyASCIIObject;
216
217/* Non-ASCII strings allocated through PyUnicode_New use the
218 PyCompactUnicodeObject structure. state.compact is set, and the data
219 immediately follow the structure. */
220typedef struct {
221 PyASCIIObject _base;
222 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
223 * terminating \0. */
224 char *utf8; /* UTF-8 representation (null-terminated) */
225 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
226 * surrogates count as two code points. */
227} PyCompactUnicodeObject;
228
229/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
230 PyUnicodeObject structure. The actual string data is initially in the wstr
231 block, and copied into the data block using _PyUnicode_Ready. */
232typedef struct {
233 PyCompactUnicodeObject _base;
234 union {
235 void *any;
236 Py_UCS1 *latin1;
237 Py_UCS2 *ucs2;
238 Py_UCS4 *ucs4;
239 } data; /* Canonical, smallest-form Unicode buffer */
240} PyUnicodeObject;
241
Victor Stinner68762572019-10-07 18:42:01 +0200242PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
243 PyObject *op,
244 int check_content);
245
Victor Stinner75e46992018-11-26 17:29:38 +0100246/* Fast access macros */
Victor Stinner75e46992018-11-26 17:29:38 +0100247
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request. Use PyUnicode_GET_LENGTH() for the length in code points.

   `op` is evaluated several times; it must not have side effects. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH(op)))
Victor Stinner75e46992018-11-26 17:29:38 +0100261
/* Size of the deprecated Py_UNICODE representation in bytes:
   PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
Victor Stinner75e46992018-11-26 17:29:38 +0100265
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   When the wstr pointer is already set it is returned directly and
   PyUnicode_AsUnicode() is not called.  `op` is evaluated several times. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode(_PyObject_CAST(op)))
Victor Stinner75e46992018-11-26 17:29:38 +0100276
/* The pointer returned by PyUnicode_AS_UNICODE(op), viewed as
   `const char *`. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
Victor Stinner75e46992018-11-26 17:29:38 +0100280
281
282/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
283
/* Values for PyASCIIObject.state: */

/* Interning state (stored in the 2-bit state.interned field). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
290
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.

   `op` is parenthesized before the cast so that an expression argument
   (for example `base + offset`) is cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
298
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)
303
/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.

   `op` is parenthesized before the cast so that an expression argument is
   cast as a whole.  `op` is evaluated twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
308
/* Storage kind of a Unicode object.  The numeric values are the ones
   documented for the state.kind bit-field of PyASCIIObject. */
enum PyUnicode_Kind {
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
319
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
328
/* Return one of the PyUnicode_*_KIND values defined above.
   Asserts that `op` is a Unicode object and that it is ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
334
/* Return a void pointer to the raw unicode buffer. */

/* Compact strings: the characters start right after the header, which is
   a PyASCIIObject for ASCII strings and a PyCompactUnicodeObject
   otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact strings: the characters live in the separately stored
   data.any pointer, which is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on the compact flag. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
349
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   `value` is converted to the element type selected by `kind`
   (Py_UCS1, Py_UCS2 or Py_UCS4) before being stored. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
        } \
    } while (0)
376
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.

   `kind` may be evaluated twice; `data` and `index` are evaluated once.
   Any kind other than 1BYTE or 2BYTE is read as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
388
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.

   `unicode` is evaluated several times; `index` is evaluated once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
404
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
412
413
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   `op` is parenthesized before the cast so that an expression argument is
   cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
418
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors.

   `op` is evaluated more than once. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                  \
     (PyUnicode_IS_READY(op) ?                     \
      0 : _PyUnicode_Ready(_PyObject_CAST(op))))
427
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   The result is 0x7f for ASCII strings, otherwise the largest value the
   string's kind can hold (0xff, 0xffff or 0x10ffff).  All four constants
   carry the U suffix; the expression already had unsigned type because of
   the other three, so this only makes the spelling consistent. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7fU) :                                                         \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
440
Inada Naoki2c4928d2020-06-17 20:09:44 +0900441Py_DEPRECATED(3.3)
442static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {
443 return PyUnicode_IS_COMPACT_ASCII(op) ?
444 ((PyASCIIObject*)op)->length :
445 ((PyCompactUnicodeObject*)op)->wstr_length;
446}
447#define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)
448
Victor Stinner75e46992018-11-26 17:29:38 +0100449/* === Public API ========================================================= */
450
451/* --- Plain Py_UNICODE --------------------------------------------------- */
452
453/* With PEP 393, this is the recommended way to allocate a new unicode object.
454 This function will allocate the object and its buffer in a single memory
455 block. Objects created using this function are not resizable. */
456PyAPI_FUNC(PyObject*) PyUnicode_New(
457 Py_ssize_t size, /* Number of code points in the new string */
458 Py_UCS4 maxchar /* maximum code point value in the string */
459 );
460
461/* Initializes the canonical string representation from the deprecated
462 wstr/Py_UNICODE representation. This function is used to convert Unicode
463 objects which were created using the old API to the new flexible format
464 introduced with PEP 393.
465
466 Don't call this function directly, use the public PyUnicode_READY() macro
467 instead. */
468PyAPI_FUNC(int) _PyUnicode_Ready(
469 PyObject *unicode /* Unicode object */
470 );
471
472/* Get a copy of a Unicode string. */
473PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
474 PyObject *unicode
475 );
476
477/* Copy character from one unicode object into another, this function performs
478 character conversion when necessary and falls back to memcpy() if possible.
479
480 Fail if to is too small (smaller than *how_many* or smaller than
481 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
482 kind(to), or if *to* has more than 1 reference.
483
484 Return the number of written character, or return -1 and raise an exception
485 on error.
486
487 Pseudo-code:
488
489 how_many = min(how_many, len(from) - from_start)
490 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
491 return how_many
492
493 Note: The function doesn't write a terminating null character.
494 */
495PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
496 PyObject *to,
497 Py_ssize_t to_start,
498 PyObject *from,
499 Py_ssize_t from_start,
500 Py_ssize_t how_many
501 );
502
503/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
504 may crash if parameters are invalid (e.g. if the output string
505 is too short). */
506PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
507 PyObject *to,
508 Py_ssize_t to_start,
509 PyObject *from,
510 Py_ssize_t from_start,
511 Py_ssize_t how_many
512 );
513
514/* Fill a string with a character: write fill_char into
515 unicode[start:start+length].
516
517 Fail if fill_char is bigger than the string maximum character, or if the
518 string has more than 1 reference.
519
520 Return the number of written character, or return -1 and raise an exception
521 on error. */
522PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
523 PyObject *unicode,
524 Py_ssize_t start,
525 Py_ssize_t length,
526 Py_UCS4 fill_char
527 );
528
529/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
530 if parameters are invalid (e.g. if length is longer than the string). */
531PyAPI_FUNC(void) _PyUnicode_FastFill(
532 PyObject *unicode,
533 Py_ssize_t start,
534 Py_ssize_t length,
535 Py_UCS4 fill_char
536 );
537
538/* Create a Unicode Object from the Py_UNICODE buffer u of the given
539 size.
540
541 u may be NULL which causes the contents to be undefined. It is the
542 user's responsibility to fill in the needed data afterwards. Note
543 that modifying the Unicode object contents after construction is
544 only allowed if u was set to NULL.
545
546 The buffer is copied into the new object. */
Inada Naoki2c4928d2020-06-17 20:09:44 +0900547Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100548 const Py_UNICODE *u, /* Unicode buffer */
549 Py_ssize_t size /* size of buffer */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600550 );
Victor Stinner75e46992018-11-26 17:29:38 +0100551
552/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
553 Scan the string to find the maximum character. */
554PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
555 int kind,
556 const void *buffer,
557 Py_ssize_t size);
558
559/* Create a new string from a buffer of ASCII characters.
560 WARNING: Don't check if the string contains any non-ASCII character. */
561PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
562 const char *buffer,
563 Py_ssize_t size);
564
565/* Compute the maximum character of the substring unicode[start:end].
566 Return 127 for an empty string. */
567PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
568 PyObject *unicode,
569 Py_ssize_t start,
570 Py_ssize_t end);
571
572/* Return a read-only pointer to the Unicode object's internal
573 Py_UNICODE buffer.
574 If the wchar_t/Py_UNICODE representation is not yet available, this
575 function will calculate it. */
Inada Naoki2c4928d2020-06-17 20:09:44 +0900576Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100577 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600578 );
Victor Stinner75e46992018-11-26 17:29:38 +0100579
580/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
581 contains null characters. */
Inada Naoki2c4928d2020-06-17 20:09:44 +0900582Py_DEPRECATED(3.3) PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100583 PyObject *unicode /* Unicode object */
584 );
585
586/* Return a read-only pointer to the Unicode object's internal
587 Py_UNICODE buffer and save the length at size.
588 If the wchar_t/Py_UNICODE representation is not yet available, this
589 function will calculate it. */
590
Inada Naoki2c4928d2020-06-17 20:09:44 +0900591Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
Victor Stinner75e46992018-11-26 17:29:38 +0100592 PyObject *unicode, /* Unicode object */
593 Py_ssize_t *size /* location where to save the length */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600594 );
Victor Stinner75e46992018-11-26 17:29:38 +0100595
Victor Stinner75e46992018-11-26 17:29:38 +0100596
597/* --- _PyUnicodeWriter API ----------------------------------------------- */
598
599typedef struct {
600 PyObject *buffer;
601 void *data;
602 enum PyUnicode_Kind kind;
603 Py_UCS4 maxchar;
604 Py_ssize_t size;
605 Py_ssize_t pos;
606
607 /* minimum number of allocated characters (default: 0) */
608 Py_ssize_t min_length;
609
610 /* minimum character (default: 127, ASCII) */
611 Py_UCS4 min_char;
612
613 /* If non-zero, overallocate the buffer (default: 0). */
614 unsigned char overallocate;
615
616 /* If readonly is 1, buffer is a shared string (cannot be modified)
617 and size is set to 0. */
618 unsigned char readonly;
619} _PyUnicodeWriter ;
620
621/* Initialize a Unicode writer.
622 *
623 * By default, the minimum buffer size is 0 character and overallocation is
624 * disabled. Set min_length, min_char and overallocate attributes to control
625 * the allocation of the buffer. */
626PyAPI_FUNC(void)
627_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
628
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Fast path: nothing is done when MAXCHAR already fits the writer's
   maxchar and LENGTH fits in the remaining room (size - pos), or when
   LENGTH is 0.  Otherwise _PyUnicodeWriter_PrepareInternal() is called.
   WRITER, LENGTH and MAXCHAR may each be evaluated more than once.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
640
641/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
642 instead. */
643PyAPI_FUNC(int)
644_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
645 Py_ssize_t length, Py_UCS4 maxchar);
646
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Nothing is done when the writer's kind is already at least KIND;
   otherwise _PyUnicodeWriter_PrepareKindInternal() is called.
   KIND must not be PyUnicode_WCHAR_KIND (asserted).

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     (KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
657
658/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
659 macro instead. */
660PyAPI_FUNC(int)
661_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
662 enum PyUnicode_Kind kind);
663
664/* Append a Unicode character.
665 Return 0 on success, raise an exception and return -1 on error. */
666PyAPI_FUNC(int)
667_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
668 Py_UCS4 ch
669 );
670
671/* Append a Unicode string.
672 Return 0 on success, raise an exception and return -1 on error. */
673PyAPI_FUNC(int)
674_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
675 PyObject *str /* Unicode string */
676 );
677
678/* Append a substring of a Unicode string.
679 Return 0 on success, raise an exception and return -1 on error. */
680PyAPI_FUNC(int)
681_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
682 PyObject *str, /* Unicode string */
683 Py_ssize_t start,
684 Py_ssize_t end
685 );
686
687/* Append an ASCII-encoded byte string.
688 Return 0 on success, raise an exception and return -1 on error. */
689PyAPI_FUNC(int)
690_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
691 const char *str, /* ASCII-encoded byte string */
692 Py_ssize_t len /* number of bytes, or -1 if unknown */
693 );
694
695/* Append a latin1-encoded byte string.
696 Return 0 on success, raise an exception and return -1 on error. */
697PyAPI_FUNC(int)
698_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
699 const char *str, /* latin1-encoded byte string */
700 Py_ssize_t len /* length in bytes */
701 );
702
703/* Get the value of the writer as a Unicode string. Clear the
704 buffer of the writer. Raise an exception and return NULL
705 on error. */
706PyAPI_FUNC(PyObject *)
707_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
708
709/* Deallocate memory of a writer (clear its internal buffer). */
710PyAPI_FUNC(void)
711_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
712
713
714/* Format the object based on the format_spec, as defined in PEP 3101
715 (Advanced String Formatting). */
716PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
717 _PyUnicodeWriter *writer,
718 PyObject *obj,
719 PyObject *format_spec,
720 Py_ssize_t start,
721 Py_ssize_t end);
722
Victor Stinner75e46992018-11-26 17:29:38 +0100723/* --- Manage the default encoding ---------------------------------------- */
724
725/* Returns a pointer to the default encoding (UTF-8) of the
726 Unicode object unicode and the size of the encoded representation
727 in bytes stored in *size.
728
729 In case of an error, no *size is set.
730
731 This function caches the UTF-8 encoded string in the unicodeobject
732 and subsequent calls will return the same string. The memory is released
733 when the unicodeobject is deallocated.
734
735 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
736 support the previous internal function with the same behaviour.
Victor Stinner75e46992018-11-26 17:29:38 +0100737*/
738
739PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
740 PyObject *unicode,
741 Py_ssize_t *size);
742
743#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
744
745/* Returns a pointer to the default encoding (UTF-8) of the
746 Unicode object unicode.
747
748 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
749 in the unicodeobject.
750
751 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
752 support the previous internal function with the same behaviour.
753
754 Use of this API is DEPRECATED since no size information can be
755 extracted from the returned data.
756
757 *** This API is for interpreter INTERNAL USE ONLY and will likely
758 *** be removed or changed for Python 3.1.
759
760 *** If you need to access the Unicode object as UTF-8 bytes string,
761 *** please use PyUnicode_AsUTF8String() instead.
762
763*/
764
765PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
766
767#define _PyUnicode_AsString PyUnicode_AsUTF8
768
769/* --- Generic Codecs ----------------------------------------------------- */
770
771/* Encodes a Py_UNICODE buffer of the given size and returns a
772 Python string object. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600773Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Victor Stinner75e46992018-11-26 17:29:38 +0100774 const Py_UNICODE *s, /* Unicode char buffer */
775 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
776 const char *encoding, /* encoding */
777 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600778 );
Victor Stinner75e46992018-11-26 17:29:38 +0100779
780/* --- UTF-7 Codecs ------------------------------------------------------- */
781
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600782Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Victor Stinner75e46992018-11-26 17:29:38 +0100783 const Py_UNICODE *data, /* Unicode char buffer */
784 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
785 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
786 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
787 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600788 );
Victor Stinner75e46992018-11-26 17:29:38 +0100789
/* Object-based UTF-7 codec entry point: takes a Unicode object rather than a
   Py_UNICODE buffer and length.  The two base64 flags and the errors argument
   have the same names and types as in PyUnicode_EncodeUTF7(). */
790PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
791 PyObject *unicode, /* Unicode object */
792 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
793 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
794 const char *errors /* error handling */
795 );
796
797/* --- UTF-8 Codecs ------------------------------------------------------- */
798
/* Object-based entry point of the UTF-8 codec section: takes a Unicode object
   and an error-handling string and returns a PyObject *.
   NOTE(review): presumably the UTF-8 encoded form of `unicode`; confirm
   against the definition. */
799PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
800 PyObject *unicode,
801 const char *errors);
802
/* UTF-8 codec entry point over a raw Py_UNICODE buffer.  Carries
   Py_DEPRECATED(3.3) because it is built on the legacy Py_UNICODE type. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600803Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Victor Stinner75e46992018-11-26 17:29:38 +0100804 const Py_UNICODE *data, /* Unicode char buffer */
805 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
806 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600807 );
Victor Stinner75e46992018-11-26 17:29:38 +0100808
809/* --- UTF-32 Codecs ------------------------------------------------------ */
810
/* UTF-32 codec entry point over a raw Py_UNICODE buffer.  Carries
   Py_DEPRECATED(3.3); _PyUnicode_EncodeUTF32() below takes a Unicode object
   instead.  byteorder: 0 = BOM + native order, -1 = little endian,
   1 = big endian. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600811Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Victor Stinner75e46992018-11-26 17:29:38 +0100812 const Py_UNICODE *data, /* Unicode char buffer */
813 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
814 const char *errors, /* error handling */
815 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600816 );
Victor Stinner75e46992018-11-26 17:29:38 +0100817
/* Object-based UTF-32 codec entry point: takes a Unicode object rather than a
   Py_UNICODE buffer and length; errors and byteorder are declared as in
   PyUnicode_EncodeUTF32(). */
818PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
819 PyObject *object, /* Unicode object */
820 const char *errors, /* error handling */
821 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
822 );
823
824/* --- UTF-16 Codecs ------------------------------------------------------ */
825
826/* Returns a Python string object holding the UTF-16 encoded value of
827 the Unicode data.
828
829 Output is written according to the byteorder argument, using the
830 following byte order:
831
832 byteorder == -1: little endian
833 byteorder == 0: native byte order (writes a BOM)
834 byteorder == 1: big endian
835
836 If byteorder is 0, the output string will always start with the
837 Unicode BOM (U+FEFF). In the other two modes, no BOM is
838 prepended.
839
840 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
841 UCS-2. This trick makes it possible to add full UTF-16 capabilities
842 at a later point without compromising the APIs.
843
 This declaration carries Py_DEPRECATED(3.3); _PyUnicode_EncodeUTF16()
 below takes a Unicode object instead of a Py_UNICODE buffer.

844*/
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600845Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Victor Stinner75e46992018-11-26 17:29:38 +0100846 const Py_UNICODE *data, /* Unicode char buffer */
847 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
848 const char *errors, /* error handling */
849 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600850 );
Victor Stinner75e46992018-11-26 17:29:38 +0100851
/* Object-based UTF-16 codec entry point: takes a Unicode object rather than a
   Py_UNICODE buffer and length; errors and byteorder are declared as in
   PyUnicode_EncodeUTF16(). */
852PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
853 PyObject* unicode, /* Unicode object */
854 const char *errors, /* error handling */
855 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
856 );
857
858/* --- Unicode-Escape Codecs ---------------------------------------------- */
859
860/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
861 chars.

 first_invalid_escape is an output parameter.
 NOTE(review): the value stored there when the input contains no invalid
 escape is not stated in this header (presumably NULL) -- confirm against
 the definition. */
862PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
863 const char *string, /* Unicode-Escape encoded string */
864 Py_ssize_t length, /* size of string */
865 const char *errors, /* error handling */
866 const char **first_invalid_escape /* on return, points to first
867 invalid escaped char in
868 string. */
869);
870
/* Unicode-Escape codec entry point over a raw Py_UNICODE buffer.  Carries
   Py_DEPRECATED(3.3).  Unlike most encoders in this header it takes no
   errors argument. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600871Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Victor Stinner75e46992018-11-26 17:29:38 +0100872 const Py_UNICODE *data, /* Unicode char buffer */
873 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600874 );
Victor Stinner75e46992018-11-26 17:29:38 +0100875
876/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
877
/* Raw-Unicode-Escape codec entry point over a raw Py_UNICODE buffer.  Carries
   Py_DEPRECATED(3.3) and, like PyUnicode_EncodeUnicodeEscape(), takes no
   errors argument. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600878Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Victor Stinner75e46992018-11-26 17:29:38 +0100879 const Py_UNICODE *data, /* Unicode char buffer */
880 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600881 );
Victor Stinner75e46992018-11-26 17:29:38 +0100882
Victor Stinner75e46992018-11-26 17:29:38 +0100883/* --- Latin-1 Codecs ----------------------------------------------------- */
884
/* Object-based entry point of the Latin-1 codec section: takes a Unicode
   object and an error-handling string and returns a PyObject *.
   NOTE(review): presumably the Latin-1 encoded form of `unicode`; confirm
   against the definition. */
885PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
886 PyObject* unicode,
887 const char* errors);
888
/* Latin-1 codec entry point over a raw Py_UNICODE buffer.  Carries
   Py_DEPRECATED(3.3) because it is built on the legacy Py_UNICODE type. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600889Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Victor Stinner75e46992018-11-26 17:29:38 +0100890 const Py_UNICODE *data, /* Unicode char buffer */
891 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
892 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600893 );
Victor Stinner75e46992018-11-26 17:29:38 +0100894
895/* --- ASCII Codecs ------------------------------------------------------- */
896
/* Object-based entry point of the ASCII codec section: takes a Unicode object
   and an error-handling string and returns a PyObject *.
   NOTE(review): presumably the ASCII encoded form of `unicode`; confirm
   against the definition. */
897PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
898 PyObject* unicode,
899 const char* errors);
900
/* ASCII codec entry point over a raw Py_UNICODE buffer.  Carries
   Py_DEPRECATED(3.3) because it is built on the legacy Py_UNICODE type. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600901Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Victor Stinner75e46992018-11-26 17:29:38 +0100902 const Py_UNICODE *data, /* Unicode char buffer */
903 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
904 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600905 );
Victor Stinner75e46992018-11-26 17:29:38 +0100906
907/* --- Character Map Codecs ----------------------------------------------- */
908
/* Character-map codec entry point over a raw Py_UNICODE buffer; the mapping
   is passed as a Python object.  Carries Py_DEPRECATED(3.3);
   _PyUnicode_EncodeCharmap() below takes a Unicode object instead of a
   buffer and length. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600909Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Victor Stinner75e46992018-11-26 17:29:38 +0100910 const Py_UNICODE *data, /* Unicode char buffer */
911 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
912 PyObject *mapping, /* encoding mapping */
913 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600914 );
Victor Stinner75e46992018-11-26 17:29:38 +0100915
/* Object-based character-map codec entry point: takes a Unicode object rather
   than a Py_UNICODE buffer and length; mapping and errors are declared as in
   PyUnicode_EncodeCharmap(). */
916PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
917 PyObject *unicode, /* Unicode object */
918 PyObject *mapping, /* encoding mapping */
919 const char *errors /* error handling */
920 );
921
922/* Translate a Py_UNICODE buffer of the given length by applying a
923 character mapping table to it and return the resulting Unicode
924 object.
925
926 The mapping table must map Unicode ordinal integers to Unicode strings,
927 Unicode ordinal integers or None (causing deletion of the character).
928
929 Mapping tables may be dictionaries or sequences. Unmapped character
930 ordinals (ones which cause a LookupError) are left untouched and
931 are copied as-is.
932
 This declaration carries Py_DEPRECATED(3.3) because it is built on the
 legacy Py_UNICODE type.

933*/
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600934Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Victor Stinner75e46992018-11-26 17:29:38 +0100935 const Py_UNICODE *data, /* Unicode char buffer */
936 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
937 PyObject *table, /* Translate table */
938 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600939 );
Victor Stinner75e46992018-11-26 17:29:38 +0100940
941/* --- MBCS codecs for Windows -------------------------------------------- */
942
943#ifdef MS_WINDOWS
/* MBCS codec entry point over a raw Py_UNICODE buffer.  Only declared when
   MS_WINDOWS is defined.  Carries Py_DEPRECATED(3.3) because it is built on
   the legacy Py_UNICODE type. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600944Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Victor Stinner75e46992018-11-26 17:29:38 +0100945 const Py_UNICODE *data, /* Unicode char buffer */
946 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
947 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600948 );
Victor Stinner75e46992018-11-26 17:29:38 +0100949#endif
950
951/* --- Decimal Encoder ---------------------------------------------------- */
952
953/* Takes a Unicode string holding a decimal value and writes it into
954 an output buffer using standard ASCII digit codes.
955
956 The output buffer has to provide at least length+1 bytes of storage
957 area. The output string is 0-terminated.
958
959 The encoder converts whitespace to ' ', decimal characters to their
960 corresponding ASCII digit and all other Latin-1 characters except
961 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
962 are treated as errors. This includes embedded NUL characters.
963
964 Error handling is defined by the errors argument:
965
966 NULL or "strict": raise a ValueError
967 "ignore": ignore the wrong characters (these are not copied to the
968 output buffer)
969 "replace": replaces illegal characters with '?'
970
971 Returns 0 on success, -1 on failure.
972
973*/
974
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600975/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Victor Stinner75e46992018-11-26 17:29:38 +0100976 Py_UNICODE *s, /* Unicode buffer */
977 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
978 char *output, /* Output buffer; must have size >= length+1 */
979 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600980 );
Victor Stinner75e46992018-11-26 17:29:38 +0100981
982/* Transforms code points that have decimal digit property to the
983 corresponding ASCII digit code points.
984
985 Returns a new Unicode string on success, NULL on failure.

 The Py_DEPRECATED(3.3) marker below appears only inside a comment, so the
 macro is not applied to this declaration.
986*/
987
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600988/* Py_DEPRECATED(3.3) */
Victor Stinner75e46992018-11-26 17:29:38 +0100989PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
990 Py_UNICODE *s, /* Unicode buffer */
991 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600992 );
Victor Stinner75e46992018-11-26 17:29:38 +0100993
994/* Converts a Unicode object holding a decimal value to an ASCII string
995 for use in int, float and complex parsers.
996 Transforms code points that have decimal digit property to the
997 corresponding ASCII digit code points. Transforms spaces to ASCII.
998 Transforms code points starting from the first non-ASCII code point that
999 is neither a decimal digit nor a space to the end into '?'. */
1000
1001PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1002 PyObject *unicode /* Unicode object */
1003 );
1004
1005/* --- Methods & Slots ---------------------------------------------------- */
1006
/* Join helper that receives its items as a C array of object pointers plus a
   count instead of a Python sequence object.
   NOTE(review): presumably `seqlen` is the number of entries in `items` and
   the result matches separator.join(items) -- confirm against the
   definition. */
1007PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1008 PyObject *separator,
1009 PyObject *const *items,
1010 Py_ssize_t seqlen
1011 );
1012
1013/* Test whether a Unicode object is equal to an ASCII identifier. Return 1
1014 if true, 0 otherwise. The right argument must be an ASCII identifier.
1015 Any error that occurs inside is cleared before returning. */
1016PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
1017 PyObject *left, /* Left string */
1018 _Py_Identifier *right /* Right identifier */
1019 );
1020
1021/* Test whether a Unicode object is equal to an ASCII string. Return 1 if
1022 true, 0 otherwise. The right argument must be an ASCII-encoded string.
1023 Any error that occurs inside is cleared before returning. */
1024PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
1025 PyObject *left,
1026 const char *right /* ASCII-encoded string */
1027 );
1028
1029/* Externally visible for str.strip(unicode).
 NOTE(review): the accepted values of `striptype` and the meaning of
 `sepobj` are not described in this header -- confirm against the
 definition. */
1030PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1031 PyObject *self,
1032 int striptype,
1033 PyObject *sepobj
1034 );
1035
1036/* Using explicit passed-in values, insert the thousands grouping
1037 into the output. For the argument descriptions,
1038 see Objects/stringlib/localeutil.h

 NOTE(review): this comment used to say "the string pointed to by buffer",
 but the signature has no `buffer` parameter; output presumably goes
 through `writer` -- confirm against the definition. */
1039PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1040 _PyUnicodeWriter *writer,
1041 Py_ssize_t n_buffer,
1042 PyObject *digits,
1043 Py_ssize_t d_pos,
1044 Py_ssize_t n_digits,
1045 Py_ssize_t min_width,
1046 const char *grouping,
1047 PyObject *thousands_sep,
1048 Py_UCS4 *maxchar);
1049
1050/* === Characters Type APIs =============================================== */
1051
1052/* Helper array used by Py_UNICODE_ISSPACE(), which indexes it directly
 with characters below 128 and falls back to _PyUnicode_IsWhitespace()
 for everything else. */
1053
1054PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1055
1056/* These should not be used directly. Use the Py_UNICODE_IS* and
1057 Py_UNICODE_TO* macros instead.
1058
1059 These APIs are implemented in Objects/unicodectype.c.
1060
1061*/
1062
/* Case predicates on a single code point.  They are the expansions of the
   Py_UNICODE_ISLOWER(), Py_UNICODE_ISUPPER() and Py_UNICODE_ISTITLE() macros
   defined near the top of this header. */
1063PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1064 Py_UCS4 ch /* Unicode character */
1065 );
1066
1067PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1068 Py_UCS4 ch /* Unicode character */
1069 );
1070
1071PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1072 Py_UCS4 ch /* Unicode character */
1073 );
1074
/* Predicates on a single code point.
   NOTE(review): presumably these test the Unicode XID_Start and XID_Continue
   properties used for identifiers -- confirm against the definitions. */
1075PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1076 Py_UCS4 ch /* Unicode character */
1077 );
1078
1079PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1080 Py_UCS4 ch /* Unicode character */
1081 );
1082
/* Whitespace predicate on a single code point.  Py_UNICODE_ISSPACE() calls it
   for characters that are not below 128; smaller values are answered from the
   _Py_ascii_whitespace table instead.

   The argument is passed by value, so a top-level const qualifier in the
   prototype has no meaning for callers and does not take part in the function
   type; it is omitted to match the sibling declarations. */
1083PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1084 Py_UCS4 ch /* Unicode character */
1085 );
1086
/* Line-break predicate on a single code point; it is the expansion of the
   Py_UNICODE_ISLINEBREAK() macro defined near the top of this header.

   The argument is passed by value, so a top-level const qualifier in the
   prototype has no meaning for callers and does not take part in the function
   type; it is omitted to match the sibling declarations. */
1087PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1088 Py_UCS4 ch /* Unicode character */
1089 );
1090
/* Case mappings that take one code point and return one code point.  They are
   the expansions of the Py_UNICODE_TOLOWER(), Py_UNICODE_TOUPPER() and
   Py_UNICODE_TOTITLE() macros defined near the top of this header.

   Only _PyUnicode_ToTitlecase() has Py_DEPRECATED(3.3) applied; for the other
   two the marker appears only inside a comment, so the macro is not applied
   to their declarations. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001091/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
Victor Stinner75e46992018-11-26 17:29:38 +01001092 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001093 );
Victor Stinner75e46992018-11-26 17:29:38 +01001094
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001095/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
Victor Stinner75e46992018-11-26 17:29:38 +01001096 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001097 );
Victor Stinner75e46992018-11-26 17:29:38 +01001098
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001099Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
Victor Stinner75e46992018-11-26 17:29:38 +01001100 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001101 );
Victor Stinner75e46992018-11-26 17:29:38 +01001102
/* "Full" case-conversion variants: each takes a code point and a
   caller-supplied Py_UCS4 pointer `res`, and returns an int.
   NOTE(review): presumably the mapping, which may be more than one code
   point, is written to `res` and the return value is the number of code
   points written; the required size of `res` is not stated in this header --
   confirm against the definitions. */
1103PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
1104 Py_UCS4 ch, /* Unicode character */
1105 Py_UCS4 *res
1106 );
1107
1108PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
1109 Py_UCS4 ch, /* Unicode character */
1110 Py_UCS4 *res
1111 );
1112
1113PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
1114 Py_UCS4 ch, /* Unicode character */
1115 Py_UCS4 *res
1116 );
1117
1118PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
1119 Py_UCS4 ch, /* Unicode character */
1120 Py_UCS4 *res
1121 );
1122
/* Predicates on a single code point.
   NOTE(review): presumably these test the Unicode Case_Ignorable and Cased
   properties -- confirm against the definitions. */
1123PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
1124 Py_UCS4 ch /* Unicode character */
1125 );
1126
1127PyAPI_FUNC(int) _PyUnicode_IsCased(
1128 Py_UCS4 ch /* Unicode character */
1129 );
1130
/* Numeric-value lookups on a single code point.  They are the expansions of
   the Py_UNICODE_TODECIMAL(), Py_UNICODE_TODIGIT() and Py_UNICODE_TONUMERIC()
   macros defined near the top of this header.  Note that the numeric variant
   returns a double while the other two return an int.
   NOTE(review): the value returned for a character without the respective
   property is not stated in this header -- confirm against the
   definitions. */
1131PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1132 Py_UCS4 ch /* Unicode character */
1133 );
1134
1135PyAPI_FUNC(int) _PyUnicode_ToDigit(
1136 Py_UCS4 ch /* Unicode character */
1137 );
1138
1139PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1140 Py_UCS4 ch /* Unicode character */
1141 );
1142
/* Character-class predicates on a single code point.  They are the expansions
   of the Py_UNICODE_ISDECIMAL(), Py_UNICODE_ISDIGIT(), Py_UNICODE_ISNUMERIC(),
   Py_UNICODE_ISPRINTABLE() and Py_UNICODE_ISALPHA() macros defined near the
   top of this header. */
1143PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1144 Py_UCS4 ch /* Unicode character */
1145 );
1146
1147PyAPI_FUNC(int) _PyUnicode_IsDigit(
1148 Py_UCS4 ch /* Unicode character */
1149 );
1150
1151PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1152 Py_UCS4 ch /* Unicode character */
1153 );
1154
1155PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1156 Py_UCS4 ch /* Unicode character */
1157 );
1158
1159PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1160 Py_UCS4 ch /* Unicode character */
1161 );
1162
/* Takes an object and three ints and returns a PyObject *.  Parameter names
   are omitted in this prototype, so the role of each int must be read from
   the definition. */
Victor Stinner75e46992018-11-26 17:29:38 +01001163PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
1164
Victor Stinner75e46992018-11-26 17:29:38 +01001165/* Return an interned Unicode object for an Identifier; may fail if there is
 no memory.
 NOTE(review): failure is presumably reported by returning NULL --
 confirm against the definition. */
1166PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
Victor Stinner75e46992018-11-26 17:29:38 +01001167
1168/* Fast equality check when the inputs are known to be exact unicode types
1169 and where the hash values are equal (i.e. a very probable match).
 NOTE(review): the int result is presumably nonzero when the two strings
 are equal -- confirm against the definition. */
1170PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
1171
/* Takes an object and returns a Py_ssize_t.
   NOTE(review): presumably the length of the leading part of the string that
   forms a valid identifier, with a negative value on error -- confirm
   against the definition. */
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001172PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);