blob: 7e53ccc9e63f07ad2c3c343aa863ecca26632edd [file] [log] [blame]
Victor Stinner75e46992018-11-26 17:29:38 +01001#ifndef Py_CPYTHON_UNICODEOBJECT_H
2# error "this header file must not be included directly"
3#endif
4
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
Victor Stinner75e46992018-11-26 17:29:38 +010011
/* --- Internal Unicode Operations ---------------------------------------- */
13
/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace with an inlined check.  Code points >= 128 are
   handed to _PyUnicode_IsWhitespace().

   Note: `ch` is evaluated more than once, so it must not have side
   effects.
 */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
22
/* Character classification and conversion macros.  Each one simply
   forwards `ch` to the corresponding _PyUnicode_* function. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
42
/* True if `ch` is alphabetic, a decimal, a digit or numeric.
   `ch` may be evaluated up to four times, so it must not have side
   effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
48
Inada Naoki2c4928d2020-06-17 20:09:44 +090049Py_DEPRECATED(3.3) static inline void
50Py_UNICODE_COPY(Py_UNICODE *target, const Py_UNICODE *source, Py_ssize_t length) {
51 memcpy(target, source, length * sizeof(Py_UNICODE));
52}
Victor Stinner75e46992018-11-26 17:29:38 +010053
Inada Naoki2c4928d2020-06-17 20:09:44 +090054Py_DEPRECATED(3.3) static inline void
55Py_UNICODE_FILL(Py_UNICODE *target, Py_UNICODE value, Py_ssize_t length) {
Inada Naoki8e34e922020-06-17 23:43:01 +090056 Py_ssize_t i;
57 for (i = 0; i < length; i++) {
Inada Naoki2c4928d2020-06-17 20:09:44 +090058 target[i] = value;
59 }
60}
Victor Stinner75e46992018-11-26 17:29:38 +010061
/* Macros to work with surrogates.  The range predicates evaluate `ch`
   twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
74
Victor Stinner75e46992018-11-26 17:29:38 +010075/* --- Unicode Type ------------------------------------------------------- */
76
77/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
78 structure. state.ascii and state.compact are set, and the data
79 immediately follow the structure. utf8_length and wstr_length can be found
80 in the length field; the utf8 pointer is equal to the data pointer. */
81typedef struct {
82 /* There are 4 forms of Unicode strings:
83
84 - compact ascii:
85
86 * structure = PyASCIIObject
87 * test: PyUnicode_IS_COMPACT_ASCII(op)
88 * kind = PyUnicode_1BYTE_KIND
89 * compact = 1
90 * ascii = 1
91 * ready = 1
92 * (length is the length of the utf8 and wstr strings)
93 * (data starts just after the structure)
94 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
95
96 - compact:
97
98 * structure = PyCompactUnicodeObject
99 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
100 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
101 PyUnicode_4BYTE_KIND
102 * compact = 1
103 * ready = 1
104 * ascii = 0
105 * utf8 is not shared with data
106 * utf8_length = 0 if utf8 is NULL
107 * wstr is shared with data and wstr_length=length
108 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
109 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
110 * wstr_length = 0 if wstr is NULL
111 * (data starts just after the structure)
112
113 - legacy string, not ready:
114
115 * structure = PyUnicodeObject
116 * test: kind == PyUnicode_WCHAR_KIND
117 * length = 0 (use wstr_length)
118 * hash = -1
119 * kind = PyUnicode_WCHAR_KIND
120 * compact = 0
121 * ascii = 0
122 * ready = 0
123 * interned = SSTATE_NOT_INTERNED
124 * wstr is not NULL
125 * data.any is NULL
126 * utf8 is NULL
127 * utf8_length = 0
128
129 - legacy string, ready:
130
131 * structure = PyUnicodeObject structure
132 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
133 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
134 PyUnicode_4BYTE_KIND
135 * compact = 0
136 * ready = 1
137 * data.any is not NULL
138 * utf8 is shared and utf8_length = length with data.any if ascii = 1
139 * utf8_length = 0 if utf8 is NULL
140 * wstr is shared with data.any and wstr_length = length
141 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
142 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
143 * wstr_length = 0 if wstr is NULL
144
145 Compact strings use only one memory block (structure + characters),
146 whereas legacy strings use one block for the structure and one block
147 for characters.
148
149 Legacy strings are created by PyUnicode_FromUnicode() and
150 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
151 when PyUnicode_READY() is called.
152
153 See also _PyUnicode_CheckConsistency().
154 */
155 PyObject_HEAD
156 Py_ssize_t length; /* Number of code points in the string */
157 Py_hash_t hash; /* Hash value; -1 if not set */
158 struct {
159 /*
160 SSTATE_NOT_INTERNED (0)
161 SSTATE_INTERNED_MORTAL (1)
162 SSTATE_INTERNED_IMMORTAL (2)
163
164 If interned != SSTATE_NOT_INTERNED, the two references from the
165 dictionary to this object are *not* counted in ob_refcnt.
166 */
167 unsigned int interned:2;
168 /* Character size:
169
170 - PyUnicode_WCHAR_KIND (0):
171
172 * character type = wchar_t (16 or 32 bits, depending on the
173 platform)
174
175 - PyUnicode_1BYTE_KIND (1):
176
177 * character type = Py_UCS1 (8 bits, unsigned)
178 * all characters are in the range U+0000-U+00FF (latin1)
179 * if ascii is set, all characters are in the range U+0000-U+007F
180 (ASCII), otherwise at least one character is in the range
181 U+0080-U+00FF
182
183 - PyUnicode_2BYTE_KIND (2):
184
185 * character type = Py_UCS2 (16 bits, unsigned)
186 * all characters are in the range U+0000-U+FFFF (BMP)
187 * at least one character is in the range U+0100-U+FFFF
188
189 - PyUnicode_4BYTE_KIND (4):
190
191 * character type = Py_UCS4 (32 bits, unsigned)
192 * all characters are in the range U+0000-U+10FFFF
193 * at least one character is in the range U+10000-U+10FFFF
194 */
195 unsigned int kind:3;
196 /* Compact is with respect to the allocation scheme. Compact unicode
197 objects only require one memory block while non-compact objects use
198 one block for the PyUnicodeObject struct and another for its data
199 buffer. */
200 unsigned int compact:1;
201 /* The string only contains characters in the range U+0000-U+007F (ASCII)
202 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
203 set, use the PyASCIIObject structure. */
204 unsigned int ascii:1;
205 /* The ready flag indicates whether the object layout is initialized
206 completely. This means that this is either a compact object, or
207 the data pointer is filled out. The bit is redundant, and helps
208 to minimize the test in PyUnicode_IS_READY(). */
209 unsigned int ready:1;
210 /* Padding to ensure that PyUnicode_DATA() is always aligned to
211 4 bytes (see issue #19537 on m68k). */
212 unsigned int :24;
213 } state;
214 wchar_t *wstr; /* wchar_t representation (null-terminated) */
215} PyASCIIObject;
216
217/* Non-ASCII strings allocated through PyUnicode_New use the
218 PyCompactUnicodeObject structure. state.compact is set, and the data
219 immediately follow the structure. */
220typedef struct {
221 PyASCIIObject _base;
222 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
223 * terminating \0. */
224 char *utf8; /* UTF-8 representation (null-terminated) */
225 Py_ssize_t wstr_length; /* Number of code points in wstr, possible
226 * surrogates count as two code points. */
227} PyCompactUnicodeObject;
228
229/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
230 PyUnicodeObject structure. The actual string data is initially in the wstr
231 block, and copied into the data block using _PyUnicode_Ready. */
232typedef struct {
233 PyCompactUnicodeObject _base;
234 union {
235 void *any;
236 Py_UCS1 *latin1;
237 Py_UCS2 *ucs2;
238 Py_UCS4 *ucs4;
239 } data; /* Canonical, smallest-form Unicode buffer */
240} PyUnicodeObject;
241
Victor Stinner68762572019-10-07 18:42:01 +0200242PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
243 PyObject *op,
244 int check_content);
245
/* Fast access macros */

/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request.  Use PyUnicode_GET_LENGTH() for the length in code points.
   `op` is evaluated more than once. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH(op)))
Victor Stinner75e46992018-11-26 17:29:38 +0100261
/* Size of the deprecated Py_UNICODE representation in bytes:
   PyUnicode_GET_SIZE(op) multiplied by Py_UNICODE_SIZE. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
Victor Stinner75e46992018-11-26 17:29:38 +0100265
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode(_PyObject_CAST(op)))
Victor Stinner75e46992018-11-26 17:29:38 +0100276
/* The result of PyUnicode_AS_UNICODE(op), cast to `const char *`. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
Victor Stinner75e46992018-11-26 17:29:38 +0100280
281
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
290
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  `op` is parenthesized before the cast so that expression
   arguments (e.g. pointer arithmetic) are cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject *)(op))->state.ascii)
298
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject *)(op))->state.compact)
303
/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is parenthesized before the cast so that expression arguments are
   cast as a whole; it is evaluated twice. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (((PyASCIIObject *)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
308
/* Storage kinds of a Unicode object (PyASCIIObject.state.kind). */
enum PyUnicode_Kind {
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
319
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly.  */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
328
/* Return one of the PyUnicode_*_KIND values defined above.  Debug builds
   assert that `op` is a Unicode object and is ready. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
334
/* Return a void pointer to the raw unicode buffer of a compact string:
   the data begins right after the PyASCIIObject header for ASCII strings,
   and right after the PyCompactUnicodeObject header otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))
340
/* Return the data.any pointer of a non-compact (legacy) string.  Debug
   builds assert that the pointer is not NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))
344
/* Return a void pointer to the canonical character buffer of `op`,
   selecting the compact or non-compact accessor from state.compact. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
349
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */
353
/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.
   Any kind other than 1BYTE or 2BYTE is treated as PyUnicode_4BYTE_KIND
   (asserted in debug builds). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
376
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  Any kind other than 1BYTE or 2BYTE is
   read as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
388
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   `unicode` is evaluated several times; `index` is evaluated once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
404
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
412
413
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   `op` is parenthesized before the cast so that expression arguments are
   cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject *)(op))->state.ready)
418
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready(_PyObject_CAST(op))))
427
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.  The result is 0x7f for ASCII strings,
   otherwise 0xff, 0xffff or 0x10ffff depending on the kind. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7f) :                                                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
440
Inada Naoki2c4928d2020-06-17 20:09:44 +0900441Py_DEPRECATED(3.3)
442static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {
443 return PyUnicode_IS_COMPACT_ASCII(op) ?
444 ((PyASCIIObject*)op)->length :
445 ((PyCompactUnicodeObject*)op)->wstr_length;
446}
/* `op` is parenthesized before the cast so that expression arguments are
   cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject *)(op))
448
Victor Stinner75e46992018-11-26 17:29:38 +0100449/* === Public API ========================================================= */
450
451/* --- Plain Py_UNICODE --------------------------------------------------- */
452
453/* With PEP 393, this is the recommended way to allocate a new unicode object.
454 This function will allocate the object and its buffer in a single memory
455 block. Objects created using this function are not resizable. */
456PyAPI_FUNC(PyObject*) PyUnicode_New(
457 Py_ssize_t size, /* Number of code points in the new string */
458 Py_UCS4 maxchar /* maximum code point value in the string */
459 );
460
461/* Initializes the canonical string representation from the deprecated
462 wstr/Py_UNICODE representation. This function is used to convert Unicode
463 objects which were created using the old API to the new flexible format
464 introduced with PEP 393.
465
466 Don't call this function directly, use the public PyUnicode_READY() macro
467 instead. */
468PyAPI_FUNC(int) _PyUnicode_Ready(
469 PyObject *unicode /* Unicode object */
470 );
471
472/* Get a copy of a Unicode string. */
473PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
474 PyObject *unicode
475 );
476
477/* Copy character from one unicode object into another, this function performs
478 character conversion when necessary and falls back to memcpy() if possible.
479
480 Fail if to is too small (smaller than *how_many* or smaller than
481 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
482 kind(to), or if *to* has more than 1 reference.
483
484 Return the number of written character, or return -1 and raise an exception
485 on error.
486
487 Pseudo-code:
488
489 how_many = min(how_many, len(from) - from_start)
490 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
491 return how_many
492
493 Note: The function doesn't write a terminating null character.
494 */
495PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
496 PyObject *to,
497 Py_ssize_t to_start,
498 PyObject *from,
499 Py_ssize_t from_start,
500 Py_ssize_t how_many
501 );
502
503/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
504 may crash if parameters are invalid (e.g. if the output string
505 is too short). */
506PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
507 PyObject *to,
508 Py_ssize_t to_start,
509 PyObject *from,
510 Py_ssize_t from_start,
511 Py_ssize_t how_many
512 );
513
514/* Fill a string with a character: write fill_char into
515 unicode[start:start+length].
516
517 Fail if fill_char is bigger than the string maximum character, or if the
518 string has more than 1 reference.
519
520 Return the number of written character, or return -1 and raise an exception
521 on error. */
522PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
523 PyObject *unicode,
524 Py_ssize_t start,
525 Py_ssize_t length,
526 Py_UCS4 fill_char
527 );
528
529/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
530 if parameters are invalid (e.g. if length is longer than the string). */
531PyAPI_FUNC(void) _PyUnicode_FastFill(
532 PyObject *unicode,
533 Py_ssize_t start,
534 Py_ssize_t length,
535 Py_UCS4 fill_char
536 );
537
538/* Create a Unicode Object from the Py_UNICODE buffer u of the given
539 size.
540
541 u may be NULL which causes the contents to be undefined. It is the
542 user's responsibility to fill in the needed data afterwards. Note
543 that modifying the Unicode object contents after construction is
544 only allowed if u was set to NULL.
545
546 The buffer is copied into the new object. */
Inada Naoki2c4928d2020-06-17 20:09:44 +0900547Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100548 const Py_UNICODE *u, /* Unicode buffer */
549 Py_ssize_t size /* size of buffer */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600550 );
Victor Stinner75e46992018-11-26 17:29:38 +0100551
552/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
553 Scan the string to find the maximum character. */
554PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
555 int kind,
556 const void *buffer,
557 Py_ssize_t size);
558
559/* Create a new string from a buffer of ASCII characters.
560 WARNING: Don't check if the string contains any non-ASCII character. */
561PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
562 const char *buffer,
563 Py_ssize_t size);
564
565/* Compute the maximum character of the substring unicode[start:end].
566 Return 127 for an empty string. */
567PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
568 PyObject *unicode,
569 Py_ssize_t start,
570 Py_ssize_t end);
571
572/* Return a read-only pointer to the Unicode object's internal
573 Py_UNICODE buffer.
574 If the wchar_t/Py_UNICODE representation is not yet available, this
575 function will calculate it. */
Inada Naoki2c4928d2020-06-17 20:09:44 +0900576Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100577 PyObject *unicode /* Unicode object */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600578 );
Victor Stinner75e46992018-11-26 17:29:38 +0100579
580/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
581 contains null characters. */
Inada Naoki2c4928d2020-06-17 20:09:44 +0900582Py_DEPRECATED(3.3) PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
Victor Stinner75e46992018-11-26 17:29:38 +0100583 PyObject *unicode /* Unicode object */
584 );
585
586/* Return a read-only pointer to the Unicode object's internal
587 Py_UNICODE buffer and save the length at size.
588 If the wchar_t/Py_UNICODE representation is not yet available, this
589 function will calculate it. */
590
Inada Naoki2c4928d2020-06-17 20:09:44 +0900591Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
Victor Stinner75e46992018-11-26 17:29:38 +0100592 PyObject *unicode, /* Unicode object */
593 Py_ssize_t *size /* location where to save the length */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600594 );
Victor Stinner75e46992018-11-26 17:29:38 +0100595
596/* Get the maximum ordinal for a Unicode character. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600597Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
Victor Stinner75e46992018-11-26 17:29:38 +0100598
599
600/* --- _PyUnicodeWriter API ----------------------------------------------- */
601
602typedef struct {
603 PyObject *buffer;
604 void *data;
605 enum PyUnicode_Kind kind;
606 Py_UCS4 maxchar;
607 Py_ssize_t size;
608 Py_ssize_t pos;
609
610 /* minimum number of allocated characters (default: 0) */
611 Py_ssize_t min_length;
612
613 /* minimum character (default: 127, ASCII) */
614 Py_UCS4 min_char;
615
616 /* If non-zero, overallocate the buffer (default: 0). */
617 unsigned char overallocate;
618
619 /* If readonly is 1, buffer is a shared string (cannot be modified)
620 and size is set to 0. */
621 unsigned char readonly;
622} _PyUnicodeWriter ;
623
624/* Initialize a Unicode writer.
625 *
626 * By default, the minimum buffer size is 0 character and overallocation is
627 * disabled. Set min_length, min_char and overallocate attributes to control
628 * the allocation of the buffer. */
629PyAPI_FUNC(void)
630_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
631
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Nothing is done when the request already fits (MAXCHAR does not exceed
   the writer's maxchar and LENGTH characters fit after pos) or when LENGTH
   is 0; otherwise _PyUnicodeWriter_PrepareInternal() is called.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
643
644/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
645 instead. */
646PyAPI_FUNC(int)
647_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
648 Py_ssize_t length, Py_UCS4 maxchar);
649
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     (KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
660
661/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
662 macro instead. */
663PyAPI_FUNC(int)
664_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
665 enum PyUnicode_Kind kind);
666
667/* Append a Unicode character.
668 Return 0 on success, raise an exception and return -1 on error. */
669PyAPI_FUNC(int)
670_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
671 Py_UCS4 ch
672 );
673
674/* Append a Unicode string.
675 Return 0 on success, raise an exception and return -1 on error. */
676PyAPI_FUNC(int)
677_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
678 PyObject *str /* Unicode string */
679 );
680
681/* Append a substring of a Unicode string.
682 Return 0 on success, raise an exception and return -1 on error. */
683PyAPI_FUNC(int)
684_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
685 PyObject *str, /* Unicode string */
686 Py_ssize_t start,
687 Py_ssize_t end
688 );
689
690/* Append an ASCII-encoded byte string.
691 Return 0 on success, raise an exception and return -1 on error. */
692PyAPI_FUNC(int)
693_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
694 const char *str, /* ASCII-encoded byte string */
695 Py_ssize_t len /* number of bytes, or -1 if unknown */
696 );
697
698/* Append a latin1-encoded byte string.
699 Return 0 on success, raise an exception and return -1 on error. */
700PyAPI_FUNC(int)
701_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
702 const char *str, /* latin1-encoded byte string */
703 Py_ssize_t len /* length in bytes */
704 );
705
706/* Get the value of the writer as a Unicode string. Clear the
707 buffer of the writer. Raise an exception and return NULL
708 on error. */
709PyAPI_FUNC(PyObject *)
710_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
711
712/* Deallocate memory of a writer (clear its internal buffer). */
713PyAPI_FUNC(void)
714_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
715
716
717/* Format the object based on the format_spec, as defined in PEP 3101
718 (Advanced String Formatting). */
719PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
720 _PyUnicodeWriter *writer,
721 PyObject *obj,
722 PyObject *format_spec,
723 Py_ssize_t start,
724 Py_ssize_t end);
725
Victor Stinner75e46992018-11-26 17:29:38 +0100726/* --- Manage the default encoding ---------------------------------------- */
727
728/* Returns a pointer to the default encoding (UTF-8) of the
729 Unicode object unicode and the size of the encoded representation
730 in bytes stored in *size.
731
732 In case of an error, no *size is set.
733
734 This function caches the UTF-8 encoded string in the unicodeobject
735 and subsequent calls will return the same string. The memory is released
736 when the unicodeobject is deallocated.
737
738 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
739 support the previous internal function with the same behaviour.
Victor Stinner75e46992018-11-26 17:29:38 +0100740*/
741
742PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
743 PyObject *unicode,
744 Py_ssize_t *size);
745
746#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
747
748/* Returns a pointer to the default encoding (UTF-8) of the
749 Unicode object unicode.
750
751 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
752 in the unicodeobject.
753
754 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
755 support the previous internal function with the same behaviour.
756
757 Use of this API is DEPRECATED since no size information can be
758 extracted from the returned data.
759
760 *** This API is for interpreter INTERNAL USE ONLY and will likely
761 *** be removed or changed for Python 3.1.
762
763 *** If you need to access the Unicode object as UTF-8 bytes string,
764 *** please use PyUnicode_AsUTF8String() instead.
765
766*/
767
768PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
769
770#define _PyUnicode_AsString PyUnicode_AsUTF8
771
772/* --- Generic Codecs ----------------------------------------------------- */
773
774/* Encodes a Py_UNICODE buffer of the given size and returns a
775 Python string object. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600776Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_Encode(
Victor Stinner75e46992018-11-26 17:29:38 +0100777 const Py_UNICODE *s, /* Unicode char buffer */
778 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
779 const char *encoding, /* encoding */
780 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600781 );
Victor Stinner75e46992018-11-26 17:29:38 +0100782
783/* --- UTF-7 Codecs ------------------------------------------------------- */
784
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600785Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
Victor Stinner75e46992018-11-26 17:29:38 +0100786 const Py_UNICODE *data, /* Unicode char buffer */
787 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
788 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
789 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
790 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600791 );
Victor Stinner75e46992018-11-26 17:29:38 +0100792
793PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
794 PyObject *unicode, /* Unicode object */
795 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
796 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
797 const char *errors /* error handling */
798 );
799
800/* --- UTF-8 Codecs ------------------------------------------------------- */
801
802PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
803 PyObject *unicode,
804 const char *errors);
805
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600806Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
Victor Stinner75e46992018-11-26 17:29:38 +0100807 const Py_UNICODE *data, /* Unicode char buffer */
808 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
809 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600810 );
Victor Stinner75e46992018-11-26 17:29:38 +0100811
812/* --- UTF-32 Codecs ------------------------------------------------------ */
813
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600814Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
Victor Stinner75e46992018-11-26 17:29:38 +0100815 const Py_UNICODE *data, /* Unicode char buffer */
816 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
817 const char *errors, /* error handling */
818 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600819 );
Victor Stinner75e46992018-11-26 17:29:38 +0100820
821PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
822 PyObject *object, /* Unicode object */
823 const char *errors, /* error handling */
824 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
825 );
826
827/* --- UTF-16 Codecs ------------------------------------------------------ */
828
829/* Returns a Python bytes object holding the UTF-16 encoded value of
830 the Unicode data.
831
832 The output is written according to the byteorder argument, using the
833 following byte order:
834
835 byteorder == -1: little endian
836 byteorder == 0: native byte order (writes a BOM)
837 byteorder == 1: big endian
838
839 If byteorder is 0, the output will always start with the
840 Unicode BOM (U+FEFF). In the other two modes, no BOM is
841 prepended.
842
843 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
844 UCS-2. This trick makes it possible to add full UTF-16 capabilities
845 at a later point without compromising the APIs.
846
847*/
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600848Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
Victor Stinner75e46992018-11-26 17:29:38 +0100849 const Py_UNICODE *data, /* Unicode char buffer */
850 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
851 const char *errors, /* error handling */
852 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600853 );
Victor Stinner75e46992018-11-26 17:29:38 +0100854
855PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
856 PyObject* unicode, /* Unicode object */
857 const char *errors, /* error handling */
858 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
859 );
860
861/* --- Unicode-Escape Codecs ---------------------------------------------- */
862
863/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
864 chars. */
865PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
866 const char *string, /* Unicode-Escape encoded string */
867 Py_ssize_t length, /* size of string */
868 const char *errors, /* error handling */
869 const char **first_invalid_escape /* on return, points to first
870 invalid escaped char in
871 string. */
872);
873
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600874Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
Victor Stinner75e46992018-11-26 17:29:38 +0100875 const Py_UNICODE *data, /* Unicode char buffer */
876 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600877 );
Victor Stinner75e46992018-11-26 17:29:38 +0100878
879/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
880
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600881Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
Victor Stinner75e46992018-11-26 17:29:38 +0100882 const Py_UNICODE *data, /* Unicode char buffer */
883 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600884 );
Victor Stinner75e46992018-11-26 17:29:38 +0100885
Victor Stinner75e46992018-11-26 17:29:38 +0100886/* --- Latin-1 Codecs ----------------------------------------------------- */
887
888PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
889 PyObject* unicode,
890 const char* errors);
891
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600892Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
Victor Stinner75e46992018-11-26 17:29:38 +0100893 const Py_UNICODE *data, /* Unicode char buffer */
894 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
895 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600896 );
Victor Stinner75e46992018-11-26 17:29:38 +0100897
898/* --- ASCII Codecs ------------------------------------------------------- */
899
900PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
901 PyObject* unicode,
902 const char* errors);
903
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600904Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
Victor Stinner75e46992018-11-26 17:29:38 +0100905 const Py_UNICODE *data, /* Unicode char buffer */
906 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
907 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600908 );
Victor Stinner75e46992018-11-26 17:29:38 +0100909
910/* --- Character Map Codecs ----------------------------------------------- */
911
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600912Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
Victor Stinner75e46992018-11-26 17:29:38 +0100913 const Py_UNICODE *data, /* Unicode char buffer */
914 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
915 PyObject *mapping, /* encoding mapping */
916 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600917 );
Victor Stinner75e46992018-11-26 17:29:38 +0100918
919PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
920 PyObject *unicode, /* Unicode object */
921 PyObject *mapping, /* encoding mapping */
922 const char *errors /* error handling */
923 );
924
925/* Translate a Py_UNICODE buffer of the given length by applying a
926 character mapping table to it and return the resulting Unicode
927 object.
928
929 The mapping table must map Unicode ordinal integers to Unicode strings,
930 Unicode ordinal integers or None (causing deletion of the character).
931
932 Mapping tables may be dictionaries or sequences. Unmapped character
933 ordinals (ones which cause a LookupError) are left untouched and
934 are copied as-is.
935
936*/
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600937Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
Victor Stinner75e46992018-11-26 17:29:38 +0100938 const Py_UNICODE *data, /* Unicode char buffer */
939 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
940 PyObject *table, /* Translate table */
941 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600942 );
Victor Stinner75e46992018-11-26 17:29:38 +0100943
944/* --- MBCS codecs for Windows -------------------------------------------- */
945
946#ifdef MS_WINDOWS
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600947Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
Victor Stinner75e46992018-11-26 17:29:38 +0100948 const Py_UNICODE *data, /* Unicode char buffer */
949 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
950 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600951 );
Victor Stinner75e46992018-11-26 17:29:38 +0100952#endif
953
954/* --- Decimal Encoder ---------------------------------------------------- */
955
956/* Takes a Unicode string holding a decimal value and writes it into
957 an output buffer using standard ASCII digit codes.
958
959 The output buffer has to provide at least length+1 bytes of storage
960 area (length characters plus the terminator). The output string is
 0-terminated.
961
962 The encoder converts whitespace to ' ', decimal characters to their
963 corresponding ASCII digit and all other Latin-1 characters except
964 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
965 are treated as errors. This includes embedded null characters.
966
967 Error handling is defined by the errors argument:
968
969 NULL or "strict": raise a ValueError
970 "ignore": ignore the wrong characters (these are not copied to the
971 output buffer)
972 "replace": replaces illegal characters with '?'
973
974 Returns 0 on success, -1 on failure.
975
976*/
977
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600978/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Victor Stinner75e46992018-11-26 17:29:38 +0100979 Py_UNICODE *s, /* Unicode buffer */
980 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
981 char *output, /* Output buffer; must have size >= length + 1 */
982 const char *errors /* error handling */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600983 );
Victor Stinner75e46992018-11-26 17:29:38 +0100984
985/* Transforms code points that have decimal digit property to the
986 corresponding ASCII digit code points.
987
988 Returns a new Unicode string on success, NULL on failure.
989*/
990
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600991/* Py_DEPRECATED(3.3) */
Victor Stinner75e46992018-11-26 17:29:38 +0100992PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
993 Py_UNICODE *s, /* Unicode buffer */
994 Py_ssize_t length /* Number of Py_UNICODE chars to transform */
Zackery Spytz3c8724f2019-05-28 09:16:33 -0600995 );
Victor Stinner75e46992018-11-26 17:29:38 +0100996
997/* Converts a Unicode object holding a decimal value to an ASCII string
998 for use in the int, float and complex parsers.
999 Transforms code points that have the decimal digit property to the
1000 corresponding ASCII digit code points. Transforms spaces to ASCII.
1001 Transforms code points starting from the first non-ASCII code point that
1002 is neither a decimal digit nor a space to the end into '?'. */
1003
1004PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1005 PyObject *unicode /* Unicode object */
1006 );
1007
1008/* --- Methods & Slots ---------------------------------------------------- */
1009
1010PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1011 PyObject *separator,
1012 PyObject *const *items,
1013 Py_ssize_t seqlen
1014 );
1015
1016/* Test whether a Unicode object is equal to an ASCII identifier. Return 1
1017 if true, 0 otherwise. The right argument must be an ASCII identifier.
1018 Any error that occurs inside will be cleared before returning. */
1019PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
1020 PyObject *left, /* Left string */
1021 _Py_Identifier *right /* Right identifier */
1022 );
1023
1024/* Test whether a Unicode object is equal to an ASCII string. Return 1
1025 if true, 0 otherwise. The right argument must be an ASCII-encoded string.
1026 Any error that occurs inside will be cleared before returning. */
1027PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
1028 PyObject *left,
1029 const char *right /* ASCII-encoded string */
1030 );
1031
1032/* Externally visible for str.strip(unicode) */
1033PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1034 PyObject *self,
1035 int striptype,
1036 PyObject *sepobj
1037 );
1038
1039/* Using explicit passed-in values, insert the thousands grouping
1040 into the string pointed to by buffer. For the argument descriptions,
1041 see Objects/stringlib/localeutil.h */
1042PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1043 _PyUnicodeWriter *writer,
1044 Py_ssize_t n_buffer,
1045 PyObject *digits,
1046 Py_ssize_t d_pos,
1047 Py_ssize_t n_digits,
1048 Py_ssize_t min_width,
1049 const char *grouping,
1050 PyObject *thousands_sep,
1051 Py_UCS4 *maxchar);
1052
1053/* === Characters Type APIs =============================================== */
1054
1055/* Helper array used by Py_UNICODE_ISSPACE(). The macro indexes this
 table directly with the character when ch < 128 and only calls
 _PyUnicode_IsWhitespace() for larger values. */
1056
1057PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1058
1059/* These should not be used directly. Use the Py_UNICODE_IS* and
1060 Py_UNICODE_TO* macros instead.
1061
1062 These APIs are implemented in Objects/unicodectype.c.
1063
1064*/
1065
/* Target of the Py_UNICODE_ISLOWER(ch) macro. */
1066PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1067 Py_UCS4 ch /* Unicode character */
1068 );
1069
/* Target of the Py_UNICODE_ISUPPER(ch) macro. */
1070PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1071 Py_UCS4 ch /* Unicode character */
1072 );
1073
/* Target of the Py_UNICODE_ISTITLE(ch) macro. */
1074PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1075 Py_UCS4 ch /* Unicode character */
1076 );
1077
1078PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1079 Py_UCS4 ch /* Unicode character */
1080 );
1081
1082PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1083 Py_UCS4 ch /* Unicode character */
1084 );
1085
/* Called by the Py_UNICODE_ISSPACE(ch) macro for characters that are not
   below 128; smaller values are answered from _Py_ascii_whitespace. */
1086PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1087 const Py_UCS4 ch /* Unicode character */
1088 );
1089
/* Target of the Py_UNICODE_ISLINEBREAK(ch) macro. */
1090PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1091 const Py_UCS4 ch /* Unicode character */
1092 );
1093
/* Target of the Py_UNICODE_TOLOWER(ch) macro. The Py_DEPRECATED(3.3)
   marker on this declaration is commented out, so it is not active. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001094/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
Victor Stinner75e46992018-11-26 17:29:38 +01001095 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001096 );
Victor Stinner75e46992018-11-26 17:29:38 +01001097
/* Target of the Py_UNICODE_TOUPPER(ch) macro. The Py_DEPRECATED(3.3)
   marker on this declaration is commented out, so it is not active. */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001098/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
Victor Stinner75e46992018-11-26 17:29:38 +01001099 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001100 );
Victor Stinner75e46992018-11-26 17:29:38 +01001101
/* Target of the Py_UNICODE_TOTITLE(ch) macro. Unlike the lower/upper
   variants above, this declaration carries an active Py_DEPRECATED(3.3). */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001102Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
Victor Stinner75e46992018-11-26 17:29:38 +01001103 Py_UCS4 ch /* Unicode character */
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001104 );
Victor Stinner75e46992018-11-26 17:29:38 +01001105
1106PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
1107 Py_UCS4 ch, /* Unicode character */
1108 Py_UCS4 *res
1109 );
1110
1111PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
1112 Py_UCS4 ch, /* Unicode character */
1113 Py_UCS4 *res
1114 );
1115
1116PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
1117 Py_UCS4 ch, /* Unicode character */
1118 Py_UCS4 *res
1119 );
1120
1121PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
1122 Py_UCS4 ch, /* Unicode character */
1123 Py_UCS4 *res
1124 );
1125
1126PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
1127 Py_UCS4 ch /* Unicode character */
1128 );
1129
1130PyAPI_FUNC(int) _PyUnicode_IsCased(
1131 Py_UCS4 ch /* Unicode character */
1132 );
1133
/* Target of the Py_UNICODE_TODECIMAL(ch) macro. */
1134PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1135 Py_UCS4 ch /* Unicode character */
1136 );
1137
/* Target of the Py_UNICODE_TODIGIT(ch) macro. */
1138PyAPI_FUNC(int) _PyUnicode_ToDigit(
1139 Py_UCS4 ch /* Unicode character */
1140 );
1141
/* Target of the Py_UNICODE_TONUMERIC(ch) macro; note the double result. */
1142PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1143 Py_UCS4 ch /* Unicode character */
1144 );
1145
/* Target of the Py_UNICODE_ISDECIMAL(ch) macro. */
1146PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1147 Py_UCS4 ch /* Unicode character */
1148 );
1149
/* Target of the Py_UNICODE_ISDIGIT(ch) macro. */
1150PyAPI_FUNC(int) _PyUnicode_IsDigit(
1151 Py_UCS4 ch /* Unicode character */
1152 );
1153
/* Target of the Py_UNICODE_ISNUMERIC(ch) macro. */
1154PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1155 Py_UCS4 ch /* Unicode character */
1156 );
1157
/* Target of the Py_UNICODE_ISPRINTABLE(ch) macro. */
1158PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1159 Py_UCS4 ch /* Unicode character */
1160 );
1161
/* Target of the Py_UNICODE_ISALPHA(ch) macro. */
1162PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1163 Py_UCS4 ch /* Unicode character */
1164 );
1165
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001166Py_DEPRECATED(3.3) PyAPI_FUNC(size_t) Py_UNICODE_strlen(
Victor Stinner75e46992018-11-26 17:29:38 +01001167 const Py_UNICODE *u
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001168 );
Victor Stinner75e46992018-11-26 17:29:38 +01001169
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001170Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
Victor Stinner75e46992018-11-26 17:29:38 +01001171 Py_UNICODE *s1,
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001172 const Py_UNICODE *s2);
Victor Stinner75e46992018-11-26 17:29:38 +01001173
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001174Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1175 Py_UNICODE *s1, const Py_UNICODE *s2);
Victor Stinner75e46992018-11-26 17:29:38 +01001176
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
Victor Stinner75e46992018-11-26 17:29:38 +01001178 Py_UNICODE *s1,
1179 const Py_UNICODE *s2,
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001180 size_t n);
Victor Stinner75e46992018-11-26 17:29:38 +01001181
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001182Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strcmp(
Victor Stinner75e46992018-11-26 17:29:38 +01001183 const Py_UNICODE *s1,
1184 const Py_UNICODE *s2
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001185 );
Victor Stinner75e46992018-11-26 17:29:38 +01001186
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001187Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strncmp(
Victor Stinner75e46992018-11-26 17:29:38 +01001188 const Py_UNICODE *s1,
1189 const Py_UNICODE *s2,
1190 size_t n
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001191 );
Victor Stinner75e46992018-11-26 17:29:38 +01001192
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001193Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
Victor Stinner75e46992018-11-26 17:29:38 +01001194 const Py_UNICODE *s,
1195 Py_UNICODE c
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001196 );
Victor Stinner75e46992018-11-26 17:29:38 +01001197
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001198Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Victor Stinner75e46992018-11-26 17:29:38 +01001199 const Py_UNICODE *s,
1200 Py_UNICODE c
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001201 );
Victor Stinner75e46992018-11-26 17:29:38 +01001202
1203PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
1204
1205/* Create a copy of a Unicode string ending with a null character. Return NULL
1206 and raise a MemoryError exception on memory allocation failure, otherwise
1207 return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1208
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001209Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
Victor Stinner75e46992018-11-26 17:29:38 +01001210 PyObject *unicode
Zackery Spytz3c8724f2019-05-28 09:16:33 -06001211 );
Victor Stinner75e46992018-11-26 17:29:38 +01001212
1213/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
1214PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
Victor Stinner75e46992018-11-26 17:29:38 +01001215
1216/* Fast equality check when the inputs are known to be exact unicode types
1217 and where the hash values are equal (i.e. a very probable match) */
1218PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
1219
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +03001220PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);