blob: a9b754ae6d225645155389fe6e5364cae7083f3b [file] [log] [blame]
Victor Stinner75e46992018-11-26 17:29:38 +01001#ifndef Py_CPYTHON_UNICODEOBJECT_H
2# error "this header file must not be included directly"
3#endif
4
5#ifdef __cplusplus
6extern "C" {
7#endif
8
/* Py_UNICODE used to be the native storage unit (one code unit) of the
   Unicode type and stands for a single Unicode element.  Since PEP 393 it
   is deprecated and is nothing more than a typedef to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */
typedef PY_UNICODE_TYPE Py_UNICODE;
Victor Stinner75e46992018-11-26 17:29:38 +010015
16/* --- Internal Unicode Operations ---------------------------------------- */
17
/* Whitespace test.  Splitting on whitespace is an important use case and
   the whitespace met in practice is almost always ASCII, so code points
   below 128 are answered directly from the _Py_ascii_whitespace look-up
   table; everything else is handed to _PyUnicode_IsWhitespace().
   Note: ch is evaluated twice. */
#define Py_UNICODE_ISSPACE(ch)                      \
    (((ch) >= 128U) ? _PyUnicode_IsWhitespace(ch)   \
                    : _Py_ascii_whitespace[(ch)])
26
/* Thin aliases: each macro forwards its single argument to the matching
   _PyUnicode_* helper and uses ch exactly once. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
46
/* True when ch is alphabetic, a decimal digit, a digit or numeric.  The
   four checks run in that order and stop at the first one that succeeds,
   so ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)          \
    (_PyUnicode_IsAlpha(ch)        ||   \
     _PyUnicode_IsDecimalDigit(ch) ||   \
     _PyUnicode_IsDigit(ch)        ||   \
     _PyUnicode_IsNumeric(ch))
52
Inada Naoki610a60c2020-06-18 17:30:53 +090053Py_DEPRECATED(3.3) static inline void
54Py_UNICODE_COPY(Py_UNICODE *target, const Py_UNICODE *source, Py_ssize_t length) {
55 memcpy(target, source, length * sizeof(Py_UNICODE));
56}
Victor Stinner75e46992018-11-26 17:29:38 +010057
Inada Naoki610a60c2020-06-18 17:30:53 +090058Py_DEPRECATED(3.3) static inline void
59Py_UNICODE_FILL(Py_UNICODE *target, Py_UNICODE value, Py_ssize_t length) {
60 Py_ssize_t i;
61 for (i = 0; i < length; i++) {
62 target[i] = value;
63 }
64}
Victor Stinner75e46992018-11-26 17:29:38 +010065
/* Surrogate range tests.  Each macro evaluates ch twice.
     any surrogate  : 0xD800..0xDFFF
     high surrogate : 0xD800..0xDBFF
     low surrogate  : 0xDC00..0xDFFF */
#define Py_UNICODE_IS_SURROGATE(ch)       ((ch) >= 0xD800 && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch)  ((ch) >= 0xD800 && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch)   ((ch) >= 0xDC00 && (ch) <= 0xDFFF)
/* Combine a high and a low surrogate into one Py_UCS4 code point: the low
   10 bits of `high` become bits 10..19, the low 10 bits of `low` become
   bits 0..9, and 0x10000 is added to the result. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)            \
    (0x10000 + ((((Py_UCS4)(high) & 0x03FF) << 10) |     \
                ((Py_UCS4)(low) & 0x03FF)))
/* High surrogate for ch: the top 10 bits (after removing the 0x10000
   offset) added to 0xD800. */
#define Py_UNICODE_HIGH_SURROGATE(ch) \
    (((ch) >> 10) + (0xD800 - (0x10000 >> 10)))
/* Low surrogate for ch: the bottom 10 bits added to 0xDC00. */
#define Py_UNICODE_LOW_SURROGATE(ch) \
    (((ch) & 0x3FF) + 0xDC00)
78
Victor Stinner75e46992018-11-26 17:29:38 +010079/* --- Unicode Type ------------------------------------------------------- */
80
81/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
82 structure. state.ascii and state.compact are set, and the data
83 immediately follow the structure. utf8_length and wstr_length can be found
84 in the length field; the utf8 pointer is equal to the data pointer. */
85typedef struct {
86 /* There are 4 forms of Unicode strings:
87
88 - compact ascii:
89
90 * structure = PyASCIIObject
91 * test: PyUnicode_IS_COMPACT_ASCII(op)
92 * kind = PyUnicode_1BYTE_KIND
93 * compact = 1
94 * ascii = 1
95 * ready = 1
96 * (length is the length of the utf8 and wstr strings)
97 * (data starts just after the structure)
98 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
99
100 - compact:
101
102 * structure = PyCompactUnicodeObject
103 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
104 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
105 PyUnicode_4BYTE_KIND
106 * compact = 1
107 * ready = 1
108 * ascii = 0
109 * utf8 is not shared with data
110 * utf8_length = 0 if utf8 is NULL
111 * wstr is shared with data and wstr_length=length
112 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
113 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
114 * wstr_length = 0 if wstr is NULL
115 * (data starts just after the structure)
116
117 - legacy string, not ready:
118
119 * structure = PyUnicodeObject
120 * test: kind == PyUnicode_WCHAR_KIND
121 * length = 0 (use wstr_length)
122 * hash = -1
123 * kind = PyUnicode_WCHAR_KIND
124 * compact = 0
125 * ascii = 0
126 * ready = 0
127 * interned = SSTATE_NOT_INTERNED
128 * wstr is not NULL
129 * data.any is NULL
130 * utf8 is NULL
131 * utf8_length = 0
132
133 - legacy string, ready:
134
135 * structure = PyUnicodeObject structure
136 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
137 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
138 PyUnicode_4BYTE_KIND
139 * compact = 0
140 * ready = 1
141 * data.any is not NULL
142 * utf8 is shared and utf8_length = length with data.any if ascii = 1
143 * utf8_length = 0 if utf8 is NULL
144 * wstr is shared with data.any and wstr_length = length
145 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
146 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
147 * wstr_length = 0 if wstr is NULL
148
149 Compact strings use only one memory block (structure + characters),
150 whereas legacy strings use one block for the structure and one block
151 for characters.
152
153 Legacy strings are created by PyUnicode_FromUnicode() and
154 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
155 when PyUnicode_READY() is called.
156
157 See also _PyUnicode_CheckConsistency().
158 */
159 PyObject_HEAD
160 Py_ssize_t length; /* Number of code points in the string */
161 Py_hash_t hash; /* Hash value; -1 if not set */
162 struct {
163 /*
164 SSTATE_NOT_INTERNED (0)
165 SSTATE_INTERNED_MORTAL (1)
166 SSTATE_INTERNED_IMMORTAL (2)
167
168 If interned != SSTATE_NOT_INTERNED, the two references from the
169 dictionary to this object are *not* counted in ob_refcnt.
170 */
171 unsigned int interned:2;
172 /* Character size:
173
174 - PyUnicode_WCHAR_KIND (0):
175
176 * character type = wchar_t (16 or 32 bits, depending on the
177 platform)
178
179 - PyUnicode_1BYTE_KIND (1):
180
181 * character type = Py_UCS1 (8 bits, unsigned)
182 * all characters are in the range U+0000-U+00FF (latin1)
183 * if ascii is set, all characters are in the range U+0000-U+007F
184 (ASCII), otherwise at least one character is in the range
185 U+0080-U+00FF
186
187 - PyUnicode_2BYTE_KIND (2):
188
189 * character type = Py_UCS2 (16 bits, unsigned)
190 * all characters are in the range U+0000-U+FFFF (BMP)
191 * at least one character is in the range U+0100-U+FFFF
192
193 - PyUnicode_4BYTE_KIND (4):
194
195 * character type = Py_UCS4 (32 bits, unsigned)
196 * all characters are in the range U+0000-U+10FFFF
197 * at least one character is in the range U+10000-U+10FFFF
198 */
199 unsigned int kind:3;
200 /* Compact is with respect to the allocation scheme. Compact unicode
201 objects only require one memory block while non-compact objects use
202 one block for the PyUnicodeObject struct and another for its data
203 buffer. */
204 unsigned int compact:1;
205 /* The string only contains characters in the range U+0000-U+007F (ASCII)
206 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
207 set, use the PyASCIIObject structure. */
208 unsigned int ascii:1;
209 /* The ready flag indicates whether the object layout is initialized
210 completely. This means that this is either a compact object, or
211 the data pointer is filled out. The bit is redundant, and helps
212 to minimize the test in PyUnicode_IS_READY(). */
213 unsigned int ready:1;
214 /* Padding to ensure that PyUnicode_DATA() is always aligned to
215 4 bytes (see issue #19537 on m68k). */
216 unsigned int :24;
217 } state;
218 wchar_t *wstr; /* wchar_t representation (null-terminated) */
219} PyASCIIObject;
220
221/* Non-ASCII strings allocated through PyUnicode_New use the
222 PyCompactUnicodeObject structure. state.compact is set, and the data
223 immediately follow the structure. */
224typedef struct {
225 PyASCIIObject _base;
226 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
227 * terminating \0. */
228 char *utf8; /* UTF-8 representation (null-terminated) */
229 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
230 * surrogates count as two code points. */
231} PyCompactUnicodeObject;
232
233/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
234 PyUnicodeObject structure. The actual string data is initially in the wstr
235 block, and copied into the data block using _PyUnicode_Ready. */
236typedef struct {
237 PyCompactUnicodeObject _base;
238 union {
239 void *any;
240 Py_UCS1 *latin1;
241 Py_UCS2 *ucs2;
242 Py_UCS4 *ucs4;
243 } data; /* Canonical, smallest-form Unicode buffer */
244} PyUnicodeObject;
245
Victor Stinner68762572019-10-07 18:42:01 +0200246PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
247 PyObject *op,
248 int check_content);
249
Victor Stinner75e46992018-11-26 17:29:38 +0100250/* Fast access macros */
Victor Stinner75e46992018-11-26 17:29:38 +0100251
/* Size of the deprecated Py_UNICODE representation, in code units (a
   surrogate pair counts as 2 units).  When that representation is not
   available yet, it is computed on request through PyUnicode_AsUnicode().
   Use PyUnicode_GET_LENGTH() for the length in code points.
   op is evaluated several times. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_SIZE(op)                              \
    (assert(PyUnicode_Check(op)),                           \
     !(((PyASCIIObject *)(op))->wstr) ?                     \
        ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),     \
         assert(((PyASCIIObject *)(op))->wstr),             \
         PyUnicode_WSTR_LENGTH(op)) :                       \
        PyUnicode_WSTR_LENGTH(op))

/* Same size expressed in bytes: the code-unit count multiplied by
   Py_UNICODE_SIZE. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
Victor Stinner75e46992018-11-26 17:29:38 +0100269
/* Alias for PyUnicode_AsUnicode(): yields the wchar_t/Py_UNICODE
   representation, creating it on demand when wstr is not set yet.  Using
   this macro is very inefficient now; port code to the new
   PyUnicode_*BYTE_DATA() macros or to PyUnicode_WRITE() and
   PyUnicode_READ().  op is evaluated several times. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_UNICODE(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     !(((PyASCIIObject *)(op))->wstr) ?                 \
        PyUnicode_AsUnicode(_PyObject_CAST(op)) :       \
        (((PyASCIIObject *)(op))->wstr))

/* The same buffer viewed as const char *. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_DATA(op) \
    ((const char *)PyUnicode_AS_UNICODE(op))
Victor Stinner75e46992018-11-26 17:29:38 +0100284
285
286/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
287
/* Values stored in PyASCIIObject.state.interned (interning state). */
#define SSTATE_NOT_INTERNED       0
#define SSTATE_INTERNED_MORTAL    1
#define SSTATE_INTERNED_IMMORTAL  2
294
/* Return true if the string contains only ASCII characters, or 0 if not.
   The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must
   be ready.  op is evaluated several times (once per assert and once for
   the result).
   The argument is parenthesized before the cast so that an expression
   argument (pointer arithmetic, a conditional, ...) is cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
302
/* Non-zero when the string is compact, 0 otherwise.  This macro performs
   neither a type check nor a Ready call. */
#define PyUnicode_IS_COMPACT(op) \
    ((*(PyASCIIObject *)(op)).state.compact)
307
/* Return true if the string is a compact ASCII string (it then uses the
   PyASCIIObject structure), or 0 if not.  No type checks or Ready calls
   are performed.  op is evaluated up to twice.
   The argument is parenthesized before the cast so that an expression
   argument is cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
312
/* Character storage kinds of a unicode object. */
enum PyUnicode_Kind {
    /* The string contains only wstr byte characters.  Only possible when
       the string was created with a legacy API and _PyUnicode_Ready() has
       not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    /* The values returned by the PyUnicode_KIND() macro. */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
323
/* Pointer to the canonical representation, typed as Py_UCS1 (unsigned
   char), Py_UCS2 or Py_UCS4 for direct character access.  Nothing is
   checked: look at PyUnicode_KIND() first to pick the macro that matches
   the string. */

#define PyUnicode_1BYTE_DATA(op)  ((Py_UCS1 *)(PyUnicode_DATA(op)))
#define PyUnicode_2BYTE_DATA(op)  ((Py_UCS2 *)(PyUnicode_DATA(op)))
#define PyUnicode_4BYTE_DATA(op)  ((Py_UCS4 *)(PyUnicode_DATA(op)))
332
/* Kind of the string: one of the PyUnicode_*_KIND values.  In builds with
   assertions enabled, op must be a unicode object that is ready.
   op is evaluated several times. */
#define PyUnicode_KIND(op)                      \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     (*(PyASCIIObject *)(op)).state.kind)
338
/* Raw character buffer of a compact string: it starts right after the
   PyASCIIObject structure for ASCII strings and right after the
   PyCompactUnicodeObject structure otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                        \
    (!PyUnicode_IS_ASCII(op) ?                             \
        ((void *)((PyCompactUnicodeObject *)(op) + 1)) :   \
        ((void *)((PyASCIIObject *)(op) + 1)))

/* Raw character buffer of a non-compact string: the data.any pointer,
   asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                     \
    (assert(((PyUnicodeObject*)(op))->data.any),           \
     (((PyUnicodeObject *)(op))->data.any))

/* Return a void pointer to the raw unicode buffer, whichever the
   allocation scheme of the string is.  op is evaluated several times. */
#define PyUnicode_DATA(op)                                 \
    (assert(PyUnicode_Check(op)),                          \
     !PyUnicode_IS_COMPACT(op) ?                           \
        _PyUnicode_NONCOMPACT_DATA(op) :                   \
        _PyUnicode_COMPACT_DATA(op))
353
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so expressions
   with side effects (an index increment, for instance) are safe there. */

/* Store a code point in the canonical representation.  No sanity check is
   done: the macro is meant for loops in which the caller has cached the
   kind and data pointers obtained from other macros.
     index - position in the string (starts at 0)
     value - new code point to store at that position
   Any kind other than PyUnicode_1BYTE_KIND and PyUnicode_2BYTE_KIND is
   handled as PyUnicode_4BYTE_KIND (asserted). */
#define PyUnicode_WRITE(kind, data, index, value)                \
    do {                                                         \
        switch ((kind)) {                                        \
        case PyUnicode_1BYTE_KIND:                               \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value);     \
            break;                                               \
        case PyUnicode_2BYTE_KIND:                               \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value);     \
            break;                                               \
        default:                                                 \
            assert((kind) == PyUnicode_4BYTE_KIND);              \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value);     \
            break;                                               \
        }                                                        \
    } while (0)
380
/* Fetch the code point stored at `index` in the canonical representation
   and return it as Py_UCS4.  No checks or ready calls are performed; any
   kind other than PyUnicode_1BYTE_KIND and PyUnicode_2BYTE_KIND is read
   as Py_UCS4 data. */
#define PyUnicode_READ(kind, data, index)                  \
    ((Py_UCS4)                                             \
     ((kind) != PyUnicode_1BYTE_KIND ?                     \
        ((kind) != PyUnicode_2BYTE_KIND ?                  \
            ((const Py_UCS4 *)(data))[(index)] :           \
            ((const Py_UCS2 *)(data))[(index)]             \
        ) :                                                \
        ((const Py_UCS1 *)(data))[(index)]                 \
     ))
392
/* Read the code point at `index` of the string `unicode`.  Less efficient
   than PyUnicode_READ() because it calls PyUnicode_KIND(), possibly
   twice.  Use PyUnicode_READ_CHAR for single reads; for several
   consecutive reads, cache the kind and use PyUnicode_READ instead.
   `unicode` is evaluated several times. */
#define PyUnicode_READ_CHAR(unicode, index)                             \
    (assert(PyUnicode_Check(unicode)),                                  \
     assert(PyUnicode_IS_READY(unicode)),                               \
     (Py_UCS4)                                                          \
        (PyUnicode_KIND((unicode)) != PyUnicode_1BYTE_KIND ?            \
            (PyUnicode_KIND((unicode)) != PyUnicode_2BYTE_KIND ?        \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) :                                                         \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)]     \
        ))
408
/* Length of the unicode string, read from the length field.  The caller
   must make sure that the string has its canonical representation set
   before using this macro; call PyUnicode_(FAST_)Ready to ensure that.
   op is evaluated several times. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     (*(PyASCIIObject *)(op)).length)
416
417
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   No type check is performed.
   The argument is parenthesized before the cast so that an expression
   argument (pointer arithmetic, a conditional, ...) is cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
422
/* Make sure the canonical representation of op is set.  In the best case
   (the string is already ready) this does less work than
   _PyUnicode_Ready(); otherwise _PyUnicode_Ready() is still called.
   Returns 0 on success and -1 on errors.
   op is evaluated several times. */
#define PyUnicode_READY(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     (!PyUnicode_IS_READY(op) ?                         \
        _PyUnicode_Ready(_PyObject_CAST(op)) : 0))
431
/* Upper bound of the characters of op, suitable for creating another
   string based on op.  Always an approximation, but cheaper than
   iterating over the string:
     ASCII string         -> 0x7f
     PyUnicode_1BYTE_KIND -> 0xff
     PyUnicode_2BYTE_KIND -> 0xffff
     anything else        -> 0x10ffff
   op is evaluated several times. */
#define PyUnicode_MAX_CHAR_VALUE(op)                        \
    (assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_ASCII(op) ? (0x7f) :                     \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU) :   \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU) : \
        (0x10ffffU)))))
444
Inada Naoki610a60c2020-06-18 17:30:53 +0900445Py_DEPRECATED(3.3)
446static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {
447 return PyUnicode_IS_COMPACT_ASCII(op) ?
448 ((PyASCIIObject*)op)->length :
449 ((PyCompactUnicodeObject*)op)->wstr_length;
450}
451#define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)
452
Victor Stinner75e46992018-11-26 17:29:38 +0100453/* === Public API ========================================================= */
454
455/* --- Plain Py_UNICODE --------------------------------------------------- */
456
457/* With PEP 393, this is the recommended way to allocate a new unicode object.
458 This function will allocate the object and its buffer in a single memory
459 block. Objects created using this function are not resizable. */
460PyAPI_FUNC(PyObject*) PyUnicode_New(
461 Py_ssize_t size, /* Number of code points in the new string */
462 Py_UCS4 maxchar /* maximum code point value in the string */
463 );
464
465/* Initializes the canonical string representation from the deprecated
466 wstr/Py_UNICODE representation. This function is used to convert Unicode
467 objects which were created using the old API to the new flexible format
468 introduced with PEP 393.
469
470 Don't call this function directly, use the public PyUnicode_READY() macro
471 instead. */
472PyAPI_FUNC(int) _PyUnicode_Ready(
473 PyObject *unicode /* Unicode object */
474 );
475
476/* Get a copy of a Unicode string. */
477PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
478 PyObject *unicode
479 );
480
481/* Copy character from one unicode object into another, this function performs
482 character conversion when necessary and falls back to memcpy() if possible.
483
484 Fail if to is too small (smaller than *how_many* or smaller than
485 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
486 kind(to), or if *to* has more than 1 reference.
487
488 Return the number of written character, or return -1 and raise an exception
489 on error.
490
491 Pseudo-code:
492
493 how_many = min(how_many, len(from) - from_start)
494 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
495 return how_many
496
497 Note: The function doesn't write a terminating null character.
498 */
499PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
500 PyObject *to,
501 Py_ssize_t to_start,
502 PyObject *from,
503 Py_ssize_t from_start,
504 Py_ssize_t how_many
505 );
506
507/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
508 may crash if parameters are invalid (e.g. if the output string
509 is too short). */
510PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
511 PyObject *to,
512 Py_ssize_t to_start,
513 PyObject *from,
514 Py_ssize_t from_start,
515 Py_ssize_t how_many
516 );
517
518/* Fill a string with a character: write fill_char into
519 unicode[start:start+length].
520
521 Fail if fill_char is bigger than the string maximum character, or if the
522 string has more than 1 reference.
523
524 Return the number of written character, or return -1 and raise an exception
525 on error. */
526PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
527 PyObject *unicode,
528 Py_ssize_t start,
529 Py_ssize_t length,
530 Py_UCS4 fill_char
531 );
532
533/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
534 if parameters are invalid (e.g. if length is longer than the string). */
535PyAPI_FUNC(void) _PyUnicode_FastFill(
536 PyObject *unicode,
537 Py_ssize_t start,
538 Py_ssize_t length,
539 Py_UCS4 fill_char
540 );
541
542/* Create a Unicode Object from the Py_UNICODE buffer u of the given
543 size.
544
545 u may be NULL which causes the contents to be undefined. It is the
546 user's responsibility to fill in the needed data afterwards. Note
547 that modifying the Unicode object contents after construction is
548 only allowed if u was set to NULL.
549
550 The buffer is copied into the new object. */
Inada Naoki610a60c2020-06-18 17:30:53 +0900551Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100552 const Py_UNICODE *u, /* Unicode buffer */
553 Py_ssize_t size /* size of buffer */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600554 );
Victor Stinner75e46992018-11-26 17:29:38 +0100555
556/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
557 Scan the string to find the maximum character. */
558PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
559 int kind,
560 const void *buffer,
561 Py_ssize_t size);
562
563/* Create a new string from a buffer of ASCII characters.
564 WARNING: Don't check if the string contains any non-ASCII character. */
565PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
566 const char *buffer,
567 Py_ssize_t size);
568
569/* Compute the maximum character of the substring unicode[start:end].
570 Return 127 for an empty string. */
571PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
572 PyObject *unicode,
573 Py_ssize_t start,
574 Py_ssize_t end);
575
576/* Return a read-only pointer to the Unicode object's internal
577 Py_UNICODE buffer.
578 If the wchar_t/Py_UNICODE representation is not yet available, this
579 function will calculate it. */
Inada Naoki610a60c2020-06-18 17:30:53 +0900580Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100581 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600582 );
Victor Stinner75e46992018-11-26 17:29:38 +0100583
584/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
585 contains null characters. */
Inada Naoki610a60c2020-06-18 17:30:53 +0900586Py_DEPRECATED(3.3) PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100587 PyObject *unicode /* Unicode object */
588 );
589
590/* Return a read-only pointer to the Unicode object's internal
591 Py_UNICODE buffer and save the length at size.
592 If the wchar_t/Py_UNICODE representation is not yet available, this
593 function will calculate it. */
594
Inada Naoki610a60c2020-06-18 17:30:53 +0900595Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
Victor Stinner75e46992018-11-26 17:29:38 +0100596 PyObject *unicode, /* Unicode object */
597 Py_ssize_t *size /* location where to save the length */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600598 );
Victor Stinner75e46992018-11-26 17:29:38 +0100599
600/* Get the maximum ordinal for a Unicode character. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600601Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Victor Stinner75e46992018-11-26 17:29:38 +0100602
603
604/* --- _PyUnicodeWriter API ----------------------------------------------- */
605
606typedef struct {
607 PyObject *buffer;
608 void *data;
609 enum PyUnicode_Kind kind;
610 Py_UCS4 maxchar;
611 Py_ssize_t size;
612 Py_ssize_t pos;
613
614 /* minimum number of allocated characters (default: 0) */
615 Py_ssize_t min_length;
616
617 /* minimum character (default: 127, ASCII) */
618 Py_UCS4 min_char;
619
620 /* If non-zero, overallocate the buffer (default: 0). */
621 unsigned char overallocate;
622
623 /* If readonly is 1, buffer is a shared string (cannot be modified)
624 and size is set to 0. */
625 unsigned char readonly;
626} _PyUnicodeWriter ;
627
628/* Initialize a Unicode writer.
629 *
630 * By default, the minimum buffer size is 0 character and overallocation is
631 * disabled. Set min_length, min_char and overallocate attributes to control
632 * the allocation of the buffer. */
633PyAPI_FUNC(void)
634_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
635
636/* Prepare the buffer to write 'length' characters
637 with the specified maximum character.
638
639 Return 0 on success, raise an exception and return -1 on error. */
640#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
641 (((MAXCHAR) <= (WRITER)->maxchar \
642 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
643 ? 0 \
644 : (((LENGTH) == 0) \
645 ? 0 \
646 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
647
648/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
649 instead. */
650PyAPI_FUNC(int)
651_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
652 Py_ssize_t length, Py_UCS4 maxchar);
653
654/* Prepare the buffer to have at least the kind KIND.
655 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
656 support characters in range U+000-U+FFFF.
657
658 Return 0 on success, raise an exception and return -1 on error. */
659#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \
660 (assert((KIND) != PyUnicode_WCHAR_KIND), \
661 (KIND) <= (WRITER)->kind \
662 ? 0 \
663 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
664
665/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
666 macro instead. */
667PyAPI_FUNC(int)
668_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
669 enum PyUnicode_Kind kind);
670
671/* Append a Unicode character.
672 Return 0 on success, raise an exception and return -1 on error. */
673PyAPI_FUNC(int)
674_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
675 Py_UCS4 ch
676 );
677
678/* Append a Unicode string.
679 Return 0 on success, raise an exception and return -1 on error. */
680PyAPI_FUNC(int)
681_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
682 PyObject *str /* Unicode string */
683 );
684
685/* Append a substring of a Unicode string.
686 Return 0 on success, raise an exception and return -1 on error. */
687PyAPI_FUNC(int)
688_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
689 PyObject *str, /* Unicode string */
690 Py_ssize_t start,
691 Py_ssize_t end
692 );
693
694/* Append an ASCII-encoded byte string.
695 Return 0 on success, raise an exception and return -1 on error. */
696PyAPI_FUNC(int)
697_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
698 const char *str, /* ASCII-encoded byte string */
699 Py_ssize_t len /* number of bytes, or -1 if unknown */
700 );
701
702/* Append a latin1-encoded byte string.
703 Return 0 on success, raise an exception and return -1 on error. */
704PyAPI_FUNC(int)
705_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
706 const char *str, /* latin1-encoded byte string */
707 Py_ssize_t len /* length in bytes */
708 );
709
710/* Get the value of the writer as a Unicode string. Clear the
711 buffer of the writer. Raise an exception and return NULL
712 on error. */
713PyAPI_FUNC(PyObject *)
714_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
715
716/* Deallocate memory of a writer (clear its internal buffer). */
717PyAPI_FUNC(void)
718_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
719
720
721/* Format the object based on the format_spec, as defined in PEP 3101
722 (Advanced String Formatting). */
723PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
724 _PyUnicodeWriter *writer,
725 PyObject *obj,
726 PyObject *format_spec,
727 Py_ssize_t start,
728 Py_ssize_t end);
729
Victor Stinner75e46992018-11-26 17:29:38 +0100730/* --- Manage the default encoding ---------------------------------------- */
731
732/* Returns a pointer to the default encoding (UTF-8) of the
733 Unicode object unicode and the size of the encoded representation
734 in bytes stored in *size.
735
736 In case of an error, no *size is set.
737
738 This function caches the UTF-8 encoded string in the unicodeobject
739 and subsequent calls will return the same string. The memory is released
740 when the unicodeobject is deallocated.
741
742 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
743 support the previous internal function with the same behaviour.
Victor Stinner75e46992018-11-26 17:29:38 +0100744*/
745
746PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
747 PyObject *unicode,
748 Py_ssize_t *size);
749
750#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
751
752/* Returns a pointer to the default encoding (UTF-8) of the
753 Unicode object unicode.
754
755 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
756 in the unicodeobject.
757
758 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
759 support the previous internal function with the same behaviour.
760
761 Use of this API is DEPRECATED since no size information can be
762 extracted from the returned data.
763
764 *** This API is for interpreter INTERNAL USE ONLY and will likely
765 *** be removed or changed for Python 3.1.
766
767 *** If you need to access the Unicode object as UTF-8 bytes string,
768 *** please use PyUnicode_AsUTF8String() instead.
769
770*/
771
772PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
773
774#define _PyUnicode_AsString PyUnicode_AsUTF8
775
776/* --- Generic Codecs ----------------------------------------------------- */
777
778/* Encodes a Py_UNICODE buffer of the given size and returns a
779 Python string object. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600780Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Victor Stinner75e46992018-11-26 17:29:38 +0100781 const Py_UNICODE *s, /* Unicode char buffer */
782 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
783 const char *encoding, /* encoding */
784 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600785 );
Victor Stinner75e46992018-11-26 17:29:38 +0100786
787/* --- UTF-7 Codecs ------------------------------------------------------- */
788
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600789Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Victor Stinner75e46992018-11-26 17:29:38 +0100790 const Py_UNICODE *data, /* Unicode char buffer */
791 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
792 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
793 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
794 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600795 );
Victor Stinner75e46992018-11-26 17:29:38 +0100796
797PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
798 PyObject *unicode, /* Unicode object */
799 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
800 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
801 const char *errors /* error handling */
802 );
803
804/* --- UTF-8 Codecs ------------------------------------------------------- */
805
806PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
807 PyObject *unicode,
808 const char *errors);
809
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600810Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Victor Stinner75e46992018-11-26 17:29:38 +0100811 const Py_UNICODE *data, /* Unicode char buffer */
812 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
813 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600814 );
Victor Stinner75e46992018-11-26 17:29:38 +0100815
816/* --- UTF-32 Codecs ------------------------------------------------------ */
817
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600818Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Victor Stinner75e46992018-11-26 17:29:38 +0100819 const Py_UNICODE *data, /* Unicode char buffer */
820 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
821 const char *errors, /* error handling */
822 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600823 );
Victor Stinner75e46992018-11-26 17:29:38 +0100824
825PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
826 PyObject *object, /* Unicode object */
827 const char *errors, /* error handling */
828 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
829 );
830
831/* --- UTF-16 Codecs ------------------------------------------------------ */
832
833/* Returns a Python bytes object holding the UTF-16 encoded value of
834 the Unicode data.
835
836 The output is written in the byte order selected by the byteorder
837 argument:
838
839 byteorder == -1: little endian
840 byteorder == 0: native byte order (writes a BOM mark)
841 byteorder == 1: big endian
842
843 If byteorder is 0, the output string will always start with the
844 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
845 prepended.
846
847 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
848 UCS-2. This trick makes it possible to add full UTF-16 capabilities
849 at a later point without compromising the APIs.
850
851*/
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600852Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Victor Stinner75e46992018-11-26 17:29:38 +0100853 const Py_UNICODE *data, /* Unicode char buffer */
854 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
855 const char *errors, /* error handling */
856 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600857 );
Victor Stinner75e46992018-11-26 17:29:38 +0100858
859PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
860 PyObject* unicode, /* Unicode object */
861 const char *errors, /* error handling */
862 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
863 );
864
865/* --- Unicode-Escape Codecs ---------------------------------------------- */
866
867/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
868 chars. */
869PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
870 const char *string, /* Unicode-Escape encoded string */
871 Py_ssize_t length, /* size of string */
872 const char *errors, /* error handling */
873 const char **first_invalid_escape /* on return, points to first
874 invalid escaped char in
875 string. */
876);
877
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600878Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Victor Stinner75e46992018-11-26 17:29:38 +0100879 const Py_UNICODE *data, /* Unicode char buffer */
880 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600881 );
Victor Stinner75e46992018-11-26 17:29:38 +0100882
883/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
884
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600885Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Victor Stinner75e46992018-11-26 17:29:38 +0100886 const Py_UNICODE *data, /* Unicode char buffer */
887 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600888 );
Victor Stinner75e46992018-11-26 17:29:38 +0100889
Victor Stinner75e46992018-11-26 17:29:38 +0100890/* --- Latin-1 Codecs ----------------------------------------------------- */
891
892PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
893 PyObject* unicode,
894 const char* errors);
895
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600896Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Victor Stinner75e46992018-11-26 17:29:38 +0100897 const Py_UNICODE *data, /* Unicode char buffer */
898 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
899 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600900 );
Victor Stinner75e46992018-11-26 17:29:38 +0100901
902/* --- ASCII Codecs ------------------------------------------------------- */
903
904PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
905 PyObject* unicode,
906 const char* errors);
907
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600908Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Victor Stinner75e46992018-11-26 17:29:38 +0100909 const Py_UNICODE *data, /* Unicode char buffer */
910 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
911 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600912 );
Victor Stinner75e46992018-11-26 17:29:38 +0100913
914/* --- Character Map Codecs ----------------------------------------------- */
915
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600916Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Victor Stinner75e46992018-11-26 17:29:38 +0100917 const Py_UNICODE *data, /* Unicode char buffer */
918 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
919 PyObject *mapping, /* encoding mapping */
920 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600921 );
Victor Stinner75e46992018-11-26 17:29:38 +0100922
923PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
924 PyObject *unicode, /* Unicode object */
925 PyObject *mapping, /* encoding mapping */
926 const char *errors /* error handling */
927 );
928
929/* Translate a Py_UNICODE buffer of the given length by applying a
930 character mapping table to it and return the resulting Unicode
931 object.
932
933 The mapping table must map Unicode ordinal integers to Unicode strings,
934 Unicode ordinal integers or None (causing deletion of the character).
935
936 Mapping tables may be dictionaries or sequences. Unmapped character
937 ordinals (ones which cause a LookupError) are left untouched and
938 are copied as-is.
939
940*/
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600941Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Victor Stinner75e46992018-11-26 17:29:38 +0100942 const Py_UNICODE *data, /* Unicode char buffer */
943 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */
944 PyObject *table, /* Translation table */
945 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600946 );
Victor Stinner75e46992018-11-26 17:29:38 +0100947
948/* --- MBCS codecs for Windows -------------------------------------------- */
949
950#ifdef MS_WINDOWS
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600951Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Victor Stinner75e46992018-11-26 17:29:38 +0100952 const Py_UNICODE *data, /* Unicode char buffer */
953 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
954 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600955 );
Victor Stinner75e46992018-11-26 17:29:38 +0100956#endif
957
958/* --- Decimal Encoder ---------------------------------------------------- */
959
960/* Takes a Unicode string holding a decimal value and writes it into
961 an output buffer using standard ASCII digit codes.
962
963 The output buffer has to provide at least length+1 bytes of storage
964 area. The output string is 0-terminated.
965
966 The encoder converts whitespace to ' ', decimal characters to their
967 corresponding ASCII digit and all other Latin-1 characters except
968 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
969 are treated as errors. This includes embedded NUL characters.
970
971 Error handling is defined by the errors argument:
972
973 NULL or "strict": raise a ValueError
974 "ignore": ignore the wrong characters (these are not copied to the
975 output buffer)
976 "replace": replaces illegal characters with '?'
977
978 Returns 0 on success, -1 on failure.
979
980*/
981
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600982/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Victor Stinner75e46992018-11-26 17:29:38 +0100983 Py_UNICODE *s, /* Unicode buffer */
984 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
985 char *output, /* Output buffer; must have size >= length+1 */
986 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600987 );
Victor Stinner75e46992018-11-26 17:29:38 +0100988
989/* Transforms code points that have decimal digit property to the
990 corresponding ASCII digit code points.
991
992 Returns a new Unicode string on success, NULL on failure.
993*/
994
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600995/* Py_DEPRECATED(3.3) */
Victor Stinner75e46992018-11-26 17:29:38 +0100996PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
997 Py_UNICODE *s, /* Unicode buffer */
998 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600999 );
Victor Stinner75e46992018-11-26 17:29:38 +01001000
1001/* Converts a Unicode object holding a decimal value to an ASCII string
1002 for use in int, float and complex parsers.
1003 Transforms code points that have decimal digit property to the
1004 corresponding ASCII digit code points. Transforms spaces to ASCII.
1005 Transforms code points starting from the first non-ASCII code point that
1006 is neither a decimal digit nor a space to the end into '?'. */
1007
1008PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1009 PyObject *unicode /* Unicode object */
1010 );
1011
1012/* --- Methods & Slots ---------------------------------------------------- */
1013
1014PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1015 PyObject *separator,
1016 PyObject *const *items,
1017 Py_ssize_t seqlen
1018 );
1019
1020/* Test whether a unicode object is equal to an ASCII identifier. Return 1
1021 if true, 0 otherwise. The right argument must be an ASCII identifier.
1022 Any error that occurs inside will be cleared before returning. */
1023PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
1024 PyObject *left, /* Left string */
1025 _Py_Identifier *right /* Right identifier */
1026 );
1027
1028/* Test whether a unicode object is equal to an ASCII string. Return 1 if
1029 true, 0 otherwise. The right argument must be an ASCII-encoded string.
1030 Any error that occurs inside will be cleared before returning. */
1031PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
1032 PyObject *left,
1033 const char *right /* ASCII-encoded string */
1034 );
1035
1036/* Externally visible for str.strip(unicode) */
1037PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1038 PyObject *self,
1039 int striptype,
1040 PyObject *sepobj
1041 );
1042
1043/* Using explicit passed-in values, insert the thousands grouping
1044 into the string pointed to by buffer. For the argument descriptions,
1045 see Objects/stringlib/localeutil.h */
1046PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1047 _PyUnicodeWriter *writer,
1048 Py_ssize_t n_buffer,
1049 PyObject *digits,
1050 Py_ssize_t d_pos,
1051 Py_ssize_t n_digits,
1052 Py_ssize_t min_width,
1053 const char *grouping,
1054 PyObject *thousands_sep,
1055 Py_UCS4 *maxchar);
1056
1057/* === Characters Type APIs =============================================== */
1058
1059/* Helper array used by Py_UNICODE_ISSPACE(). */
1060
1061PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1062
1063/* These should not be used directly. Use the Py_UNICODE_IS* and
1064 Py_UNICODE_TO* macros instead.
1065
1066 These APIs are implemented in Objects/unicodectype.c.
1067
1068*/
1069
1070PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1071 Py_UCS4 ch /* Unicode character */
1072 );
1073
1074PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1075 Py_UCS4 ch /* Unicode character */
1076 );
1077
1078PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1079 Py_UCS4 ch /* Unicode character */
1080 );
1081
1082PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1083 Py_UCS4 ch /* Unicode character */
1084 );
1085
1086PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1087 Py_UCS4 ch /* Unicode character */
1088 );
1089
1090PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1091 const Py_UCS4 ch /* Unicode character */
1092 );
1093
1094PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1095 const Py_UCS4 ch /* Unicode character */
1096 );
1097
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001098/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
Victor Stinner75e46992018-11-26 17:29:38 +01001099 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001100 );
Victor Stinner75e46992018-11-26 17:29:38 +01001101
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001102/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
Victor Stinner75e46992018-11-26 17:29:38 +01001103 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001104 );
Victor Stinner75e46992018-11-26 17:29:38 +01001105
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001106Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
Victor Stinner75e46992018-11-26 17:29:38 +01001107 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001108 );
Victor Stinner75e46992018-11-26 17:29:38 +01001109
1110PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
1111 Py_UCS4 ch, /* Unicode character */
1112 Py_UCS4 *res
1113 );
1114
1115PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
1116 Py_UCS4 ch, /* Unicode character */
1117 Py_UCS4 *res
1118 );
1119
1120PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
1121 Py_UCS4 ch, /* Unicode character */
1122 Py_UCS4 *res
1123 );
1124
1125PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
1126 Py_UCS4 ch, /* Unicode character */
1127 Py_UCS4 *res
1128 );
1129
1130PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
1131 Py_UCS4 ch /* Unicode character */
1132 );
1133
1134PyAPI_FUNC(int) _PyUnicode_IsCased(
1135 Py_UCS4 ch /* Unicode character */
1136 );
1137
1138PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1139 Py_UCS4 ch /* Unicode character */
1140 );
1141
1142PyAPI_FUNC(int) _PyUnicode_ToDigit(
1143 Py_UCS4 ch /* Unicode character */
1144 );
1145
1146PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1147 Py_UCS4 ch /* Unicode character */
1148 );
1149
1150PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1151 Py_UCS4 ch /* Unicode character */
1152 );
1153
1154PyAPI_FUNC(int) _PyUnicode_IsDigit(
1155 Py_UCS4 ch /* Unicode character */
1156 );
1157
1158PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1159 Py_UCS4 ch /* Unicode character */
1160 );
1161
1162PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1163 Py_UCS4 ch /* Unicode character */
1164 );
1165
1166PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1167 Py_UCS4 ch /* Unicode character */
1168 );
1169
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001170Py_DEPRECATED(3.3) PyAPI_FUNC(size_t) Py_UNICODE_strlen(
Victor Stinner75e46992018-11-26 17:29:38 +01001171 const Py_UNICODE *u
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001172 );
Victor Stinner75e46992018-11-26 17:29:38 +01001173
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001174Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinner75e46992018-11-26 17:29:38 +01001175 Py_UNICODE *s1,
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001176 const Py_UNICODE *s2);
Victor Stinner75e46992018-11-26 17:29:38 +01001177
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001178Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1179 Py_UNICODE *s1, const Py_UNICODE *s2);
Victor Stinner75e46992018-11-26 17:29:38 +01001180
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001181Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinner75e46992018-11-26 17:29:38 +01001182 Py_UNICODE *s1,
1183 const Py_UNICODE *s2,
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001184 size_t n);
Victor Stinner75e46992018-11-26 17:29:38 +01001185
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001186Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinner75e46992018-11-26 17:29:38 +01001187 const Py_UNICODE *s1,
1188 const Py_UNICODE *s2
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001189 );
Victor Stinner75e46992018-11-26 17:29:38 +01001190
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001191Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strncmp(
Victor Stinner75e46992018-11-26 17:29:38 +01001192 const Py_UNICODE *s1,
1193 const Py_UNICODE *s2,
1194 size_t n
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001195 );
Victor Stinner75e46992018-11-26 17:29:38 +01001196
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001197Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinner75e46992018-11-26 17:29:38 +01001198 const Py_UNICODE *s,
1199 Py_UNICODE c
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001200 );
Victor Stinner75e46992018-11-26 17:29:38 +01001201
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001202Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinner75e46992018-11-26 17:29:38 +01001203 const Py_UNICODE *s,
1204 Py_UNICODE c
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001205 );
Victor Stinner75e46992018-11-26 17:29:38 +01001206
1207PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
1208
1209/* Create a copy of a unicode string ending with a nul character. Return NULL
1210 and raise a MemoryError exception on memory allocation failure, otherwise
1211 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1212
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001213Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner75e46992018-11-26 17:29:38 +01001214 PyObject *unicode
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001215 );
Victor Stinner75e46992018-11-26 17:29:38 +01001216
1217/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
1218PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
Victor Stinner75e46992018-11-26 17:29:38 +01001219
1220/* Fast equality check when the inputs are known to be exact unicode types
1221 and where the hash values are equal (i.e. a very probable match) */
1222PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
1223
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001224PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
1225
Victor Stinner75e46992018-11-26 17:29:38 +01001226#ifdef __cplusplus
1227}
1228#endif