/*
   string_format.h -- implementation of string.format().

   It uses the Objects/stringlib conventions, so that it can be
   compiled for both unicode and string objects.

   Provenance (from the repository viewer this text was copied from):
   blob 58032165d34ec0a6e70ece31e77525cc51e2d00e; first line annotated
   as Eric Smith, 8c66326, 2007-08-25 02:26:07 +0000.
*/
7
8
/* Tuning knobs for growing the output string buffer. */

/* Starting value of OutputString.size_increment; also the extra room
   added on top of the format string length for the first allocation. */
#define INITIAL_SIZE_INCREMENT 100

/* Factor applied to size_increment after each successful extension. */
#define SIZE_MULTIPLIER 2

/* size_increment stops growing once it is no longer below this value. */
#define MAX_SIZE_INCREMENT 3200
13
14
15/************************************************************************/
16/*********** Global data structures and forward declarations *********/
17/************************************************************************/
18
19/*
20 A SubString consists of the characters between two string or
21 unicode pointers.
22*/
23typedef struct {
24 STRINGLIB_CHAR *ptr;
25 STRINGLIB_CHAR *end;
26} SubString;
27
28
29/* forward declaration for recursion */
30static PyObject *
31build_string(SubString *input, PyObject *args, PyObject *kwargs,
32 int *recursion_level);
33
34
35
36/************************************************************************/
37/************************** Utility functions ************************/
38/************************************************************************/
39
40/* fill in a SubString from a pointer and length */
41Py_LOCAL_INLINE(void)
42SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len)
43{
44 str->ptr = p;
45 if (p == NULL)
46 str->end = NULL;
47 else
48 str->end = str->ptr + len;
49}
50
51Py_LOCAL_INLINE(PyObject *)
52SubString_new_object(SubString *str)
53{
54 return STRINGLIB_NEW(str->ptr, str->end - str->ptr);
55}
56
57/************************************************************************/
58/*********** Error handling and exception generation **************/
59/************************************************************************/
60
61/*
62 Most of our errors are value errors, because to Python, the
63 format string is a "value". Also, it's convenient to return
64 a NULL when we are erroring out.
65
66 XXX: need better error handling, per PEP 3101.
67*/
68static void *
69SetError(const char *s)
70{
71 /* PyErr_Format always returns NULL */
72 return PyErr_Format(PyExc_ValueError, "%s in format string", s);
73}
74
75/*
76 check_input returns True if we still have characters
77 left in the input string.
78
79 XXX: make this function go away when better error handling is
80 implemented.
81*/
82Py_LOCAL_INLINE(int)
83check_input(SubString *input)
84{
85 if (input->ptr < input->end)
86 return 1;
87 PyErr_SetString(PyExc_ValueError,
88 "unterminated replacement field");
89 return 0;
90}
91
92/************************************************************************/
93/*********** Output string management functions ****************/
94/************************************************************************/
95
96typedef struct {
97 STRINGLIB_CHAR *ptr;
98 STRINGLIB_CHAR *end;
99 PyObject *obj;
100 Py_ssize_t size_increment;
101} OutputString;
102
103/* initialize an OutputString object, reserving size characters */
104static int
105output_initialize(OutputString *output, Py_ssize_t size)
106{
107 output->obj = STRINGLIB_NEW(NULL, size);
108 if (output->obj == NULL)
109 return 0;
110
111 output->ptr = STRINGLIB_STR(output->obj);
112 output->end = STRINGLIB_LEN(output->obj) + output->ptr;
113 output->size_increment = INITIAL_SIZE_INCREMENT;
114
115 return 1;
116}
117
118/*
119 output_extend reallocates the output string buffer.
120 It returns a status: 0 for a failed reallocation,
121 1 for success.
122*/
123
124static int
125output_extend(OutputString *output, Py_ssize_t count)
126{
127 STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj);
128 Py_ssize_t curlen = output->ptr - startptr;
129 Py_ssize_t maxlen = curlen + count + output->size_increment;
130
131 if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0)
132 return 0;
133 startptr = STRINGLIB_STR(output->obj);
134 output->ptr = startptr + curlen;
135 output->end = startptr + maxlen;
136 if (output->size_increment < MAX_SIZE_INCREMENT)
137 output->size_increment *= SIZE_MULTIPLIER;
138 return 1;
139}
140
141/*
142 output_data dumps characters into our output string
143 buffer.
144
145 In some cases, it has to reallocate the string.
146
147 It returns a status: 0 for a failed reallocation,
148 1 for success.
149*/
150static int
151output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count)
152{
153 if ((count > output->end - output->ptr) && !output_extend(output, count))
154 return 0;
155 memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR));
156 output->ptr += count;
157 return 1;
158}
159
160/************************************************************************/
161/*********** Format string parsing -- integers and identifiers *********/
162/************************************************************************/
163
164/*
165 end_identifier returns true if a character marks
166 the end of an identifier string.
167
168 Although the PEP specifies that identifiers are
169 numbers or valid Python identifiers, we just let
170 getattr/getitem handle that, so the implementation
171 is more flexible than the PEP would indicate.
172*/
173Py_LOCAL_INLINE(int)
174end_identifier(STRINGLIB_CHAR c)
175{
176 switch (c) {
177 case '.': case '[': case ']':
178 return 1;
179 default:
180 return 0;
181 }
182}
183
184/*
185 get_integer consumes 0 or more decimal digit characters from an
186 input string, updates *result with the corresponding positive
187 integer, and returns the number of digits consumed.
188
189 returns -1 on error.
190*/
191static int
192get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end,
193 Py_ssize_t *result)
194{
195 Py_ssize_t accumulator, digitval, oldaccumulator;
196 int numdigits;
197 accumulator = numdigits = 0;
198 for (;;(*ptr)++, numdigits++) {
199 if (*ptr >= end)
200 break;
201 digitval = STRINGLIB_TODECIMAL(**ptr);
202 if (digitval < 0)
203 break;
204 /*
205 This trick was copied from old Unicode format code. It's cute,
206 but would really suck on an old machine with a slow divide
207 implementation. Fortunately, in the normal case we do not
208 expect too many digits.
209 */
210 oldaccumulator = accumulator;
211 accumulator *= 10;
212 if ((accumulator+10)/10 != oldaccumulator+1) {
213 PyErr_Format(PyExc_ValueError,
214 "Too many decimal digits in format string");
215 return -1;
216 }
217 accumulator += digitval;
218 }
219 *result = accumulator;
220 return numdigits;
221}
222
223/*
224 get_identifier is a bit of a misnomer. It returns a value for use
225 with getattr or getindex. This value will a string/unicode
226 object. The input cannot be zero length. Continues until end of
227 input, or end_identifier() returns true.
228*/
229static PyObject *
230get_identifier(SubString *input)
231{
232 STRINGLIB_CHAR *start;
233
234 for (start = input->ptr;
235 input->ptr < input->end && !end_identifier(*input->ptr);
236 input->ptr++)
237 ;
238
239 return STRINGLIB_NEW(start, input->ptr - start);
240
241 /*
242 We might want to add code here to check for invalid Python
243 identifiers. All identifiers are eventually passed to getattr
244 or getitem, so there is a check when used. However, we might
245 want to remove (or not) the ability to have strings like
246 "a/b" or " ab" or "-1" (which is not parsed as a number).
247 For now, this is left as an exercise for the first disgruntled
248 user...
249
250 if (XXX -- need check function) {
251 Py_DECREF(result);
252 PyErr_SetString(PyExc_ValueError,
253 "Invalid embedded Python identifier");
254 return NULL;
255 }
256 */
257}
258
259/************************************************************************/
260/******** Functions to get field objects and specification strings ******/
261/************************************************************************/
262
263/* get_field_and_spec is the main function in this section. It parses
264 the format string well enough to return a field object to render along
265 with a field specification string.
266*/
267
268/*
269 look up key in our keyword arguments
270*/
271static PyObject *
272key_lookup(PyObject *kwargs, PyObject *key)
273{
274 PyObject *result;
275
276 if (kwargs && (result = PyDict_GetItem(kwargs, key)) != NULL) {
277 Py_INCREF(result);
278 return result;
279 }
280 return NULL;
281}
282
283/*
284 get_field_object returns the object inside {}, before the
285 format_spec. It handles getindex and getattr lookups and consumes
286 the entire input string.
287*/
288static PyObject *
289get_field_object(SubString *input, PyObject *args, PyObject *kwargs)
290{
291 PyObject *myobj, *subobj, *newobj;
292 STRINGLIB_CHAR c;
293 Py_ssize_t index;
294 int isindex, isnumeric, isargument;
295
296 index = isnumeric = 0; /* Just to shut up the compiler warnings */
297
298 myobj = args;
299 Py_INCREF(myobj);
300
301 for (isindex=1, isargument=1;;) {
302 if (!check_input(input))
303 break;
304 if (!isindex) {
305 if ((subobj = get_identifier(input)) == NULL)
306 break;
307 newobj = PyObject_GetAttr(myobj, subobj);
308 Py_DECREF(subobj);
309 } else {
310 isnumeric = (STRINGLIB_ISDECIMAL(*input->ptr));
311 if (isnumeric)
312 /* XXX: add error checking */
313 get_integer(&input->ptr, input->end, &index);
314
315 if (isnumeric && PySequence_Check(myobj))
316 newobj = PySequence_GetItem(myobj, index);
317 else {
318 /* XXX -- do we need PyLong_FromLongLong?
319 Using ssizet, not int... */
320 subobj = isnumeric ?
321 PyInt_FromLong(index) :
322 get_identifier(input);
323 if (subobj == NULL)
324 break;
325 if (isargument) {
326 newobj = key_lookup(kwargs, subobj);
327 } else {
328 newobj = PyObject_GetItem(myobj, subobj);
329 }
330 Py_DECREF(subobj);
331 }
332 }
333 Py_DECREF(myobj);
334 myobj = newobj;
335 if (myobj == NULL)
336 break;
337 if (!isargument && isindex)
338 if ((!check_input(input)) || (*(input->ptr++) != ']')) {
339 SetError("Expected ]");
340 break;
341 }
342
343 /* if at the end of input, return with myobj */
344 if (input->ptr >= input->end)
345 return myobj;
346
347 c = *input->ptr;
348 input->ptr++;
349 isargument = 0;
350 isindex = (c == '[');
351 if (!isindex && (c != '.')) {
352 SetError("Expected ., [, :, !, or }");
353 break;
354 }
355 }
356 if ((myobj == NULL) && isargument) {
357 /* XXX: include more useful error information, like which
358 * keyword not found or which index missing */
359 PyErr_Clear();
360 return SetError(isnumeric
361 ? "Not enough positional arguments"
362 : "Keyword argument not found");
363 }
364 Py_XDECREF(myobj);
365 return NULL;
366}
367
368/************************************************************************/
369/***************** Field rendering functions **************************/
370/************************************************************************/
371
372/*
373 render_field() is the main function in this section. It takes the
374 field object and field specification string generated by
375 get_field_and_spec, and renders the field into the output string.
376
377 format() does the actual calling of the objects __format__ method.
378*/
379
380
381/* returns fieldobj.__format__(format_spec) */
382static PyObject *
383format(PyObject *fieldobj, SubString *format_spec)
384{
385 static PyObject *format_str = NULL;
386 PyObject *meth;
387 PyObject *spec = NULL;
388 PyObject *result = NULL;
389
390 /* Initialize cached value */
391 if (format_str == NULL) {
392 /* Initialize static variable needed by _PyType_Lookup */
393 format_str = PyUnicode_FromString("__format__");
394 if (format_str == NULL)
395 return NULL;
396 }
397
398 /* Make sure the type is initialized. float gets initialized late */
399 if (Py_Type(fieldobj)->tp_dict == NULL)
400 if (PyType_Ready(Py_Type(fieldobj)) < 0)
401 return NULL;
402
403 /* we need to create an object out of the pointers we have */
404 spec = SubString_new_object(format_spec);
405 if (spec == NULL)
406 goto done;
407
408 /* Find the (unbound!) __format__ method (a borrowed reference) */
409 meth = _PyType_Lookup(Py_Type(fieldobj), format_str);
410 if (meth == NULL) {
411 PyErr_Format(PyExc_TypeError,
412 "Type %.100s doesn't define __format__",
413 Py_Type(fieldobj)->tp_name);
414 goto done;
415 }
416
417 /* And call it, binding it to the value */
418 result = PyObject_CallFunctionObjArgs(meth, fieldobj, spec, NULL);
419 if (result == NULL)
420 goto done;
421
422 if (!STRINGLIB_CHECK(result)) {
423 PyErr_SetString(PyExc_TypeError,
424 "__format__ method did not return "
425 STRINGLIB_TYPE_NAME);
426 Py_DECREF(result);
427 result = NULL;
428 goto done;
429 }
430
431done:
432 Py_XDECREF(spec);
433 return result;
434}
435
436/*
437 render_field calls fieldobj.__format__(format_spec) method, and
438 appends to the output.
439*/
440static int
441render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output)
442{
443 int ok = 0;
444 PyObject *result = format(fieldobj, format_spec);
445
446 if (result == NULL)
447 goto done;
448
449 ok = output_data(output,
450 STRINGLIB_STR(result), STRINGLIB_LEN(result));
451done:
452 Py_XDECREF(result);
453 return ok;
454}
455
456static int
457parse_field(SubString *str, SubString *field_name, SubString *format_spec,
458 STRINGLIB_CHAR *conversion)
459{
460 STRINGLIB_CHAR c = 0;
461
462 /* initialize these, as they may be empty */
463 *conversion = '\0';
464 SubString_init(format_spec, NULL, 0);
465
466 /* search for the field name. it's terminated by the end of the
467 string, or a ':' or '!' */
468 field_name->ptr = str->ptr;
469 while (str->ptr < str->end) {
470 switch (c = *(str->ptr++)) {
471 case ':':
472 case '!':
473 break;
474 default:
475 continue;
476 }
477 break;
478 }
479
480 if (c == '!' || c == ':') {
481 /* we have a format specifier and/or a conversion */
482 /* don't include the last character */
483 field_name->end = str->ptr-1;
484
485 /* the format specifier is the rest of the string */
486 format_spec->ptr = str->ptr;
487 format_spec->end = str->end;
488
489 /* see if there's a conversion specifier */
490 if (c == '!') {
491 /* there must be another character present */
492 if (format_spec->ptr >= format_spec->end) {
493 PyErr_SetString(PyExc_ValueError,
494 "end of format while looking for conversion "
495 "specifier");
496 return 0;
497 }
498 *conversion = *(format_spec->ptr++);
499
500 /* if there is another character, it must be a colon */
501 if (format_spec->ptr < format_spec->end) {
502 c = *(format_spec->ptr++);
503 if (c != ':') {
504 PyErr_SetString(PyExc_ValueError,
505 "expected ':' after format specifier");
506 return 0;
507 }
508 }
509 }
510
511 return 1;
512
513 } else {
514 /* end of string, there's no format_spec or conversion */
515 field_name->end = str->ptr;
516 return 1;
517 }
518}
519
520/************************************************************************/
521/******* Output string allocation and escape-to-markup processing ******/
522/************************************************************************/
523
524/* MarkupIterator breaks the string into pieces of either literal
525 text, or things inside {} that need to be marked up. it is
526 designed to make it easy to wrap a Python iterator around it, for
527 use with the Formatter class */
528
529typedef struct {
530 SubString str;
531 int in_markup;
532} MarkupIterator;
533
534static int
535MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len)
536{
537 SubString_init(&self->str, ptr, len);
538 self->in_markup = 0;
539 return 1;
540}
541
542/* returns 0 on error, 1 on non-error termination, and 2 if it got a
543 string (or something to be expanded) */
544static int
545MarkupIterator_next(MarkupIterator *self, int *is_markup, SubString *literal,
546 SubString *field_name, SubString *format_spec,
547 STRINGLIB_CHAR *conversion,
548 int *format_spec_needs_expanding)
549{
550 int at_end;
551 STRINGLIB_CHAR c = 0;
552 STRINGLIB_CHAR *start;
553 int count;
554 Py_ssize_t len;
555
556 *format_spec_needs_expanding = 0;
557
558 /* no more input, end of iterator */
559 if (self->str.ptr >= self->str.end)
560 return 1;
561
562 *is_markup = self->in_markup;
563 start = self->str.ptr;
564
565 if (self->in_markup) {
566
567 /* prepare for next iteration */
568 self->in_markup = 0;
569
570 /* this is markup, find the end of the string by counting nested
571 braces. note that this prohibits escaped braces, so that
572 format_specs cannot have braces in them. */
573 count = 1;
574
575 /* we know we can't have a zero length string, so don't worry
576 about that case */
577 while (self->str.ptr < self->str.end) {
578 switch (c = *(self->str.ptr++)) {
579 case '{':
580 /* the format spec needs to be recursively expanded.
581 this is an optimization, and not strictly needed */
582 *format_spec_needs_expanding = 1;
583 count++;
584 break;
585 case '}':
586 count--;
587 if (count <= 0) {
588 /* we're done. parse and get out */
589 literal->ptr = start;
590 literal->end = self->str.ptr-1;
591
592 if (parse_field(literal, field_name, format_spec,
593 conversion) == 0)
594 return 0;
595
596 /* success */
597 return 2;
598 }
599 break;
600 }
601 }
602 /* end of string while searching for matching '}' */
603 PyErr_SetString(PyExc_ValueError, "unmatched '{' in format");
604 return 0;
605
606 } else {
607 /* literal text, read until the end of string, an escaped { or },
608 or an unescaped { */
609 while (self->str.ptr < self->str.end) {
610 switch (c = *(self->str.ptr++)) {
611 case '{':
612 case '}':
613 self->in_markup = 1;
614 break;
615 default:
616 continue;
617 }
618 break;
619 }
620
621 at_end = self->str.ptr >= self->str.end;
622 len = self->str.ptr - start;
623
624 if ((c == '}') && (at_end || (c != *self->str.ptr)))
625 return (int)SetError("Single } encountered");
626 if (at_end && c == '{')
627 return (int)SetError("Single { encountered");
628 if (!at_end) {
629 if (c == *self->str.ptr) {
630 /* escaped } or {, skip it in the input */
631 self->str.ptr++;
632 self->in_markup = 0;
633 } else
634 len--;
635 }
636
637 /* this is just plain text, return it */
638 literal->ptr = start;
639 literal->end = start + len;
640 return 2;
641 }
642}
643
644
645/* do the !r or !s conversion on obj */
646static PyObject *
647do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
648{
649 /* XXX in pre-3.0, do we need to convert this to unicode, since it
650 might have returned a string? */
651 switch (conversion) {
652 case 'r':
653 return PyObject_Repr(obj);
654 case 's':
655 return PyObject_Unicode(obj);
656 default:
657 PyErr_Format(PyExc_ValueError,
658 "Unknown converion specifier %c",
659 conversion);
660 return NULL;
661 }
662}
663
664/* given:
665
666 {field_name!conversion:format_spec}
667
668 compute the result and write it to output.
669 format_spec_needs_expanding is an optimization. if it's false,
670 just output the string directly, otherwise recursively expand the
671 format_spec string. */
672
673static int
674output_markup(SubString *field_name, SubString *format_spec,
675 int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
676 OutputString *output, PyObject *args, PyObject *kwargs,
677 int *recursion_level)
678{
679 PyObject *tmp = NULL;
680 PyObject *fieldobj = NULL;
681 SubString expanded_format_spec;
682 SubString *actual_format_spec;
683 int result = 0;
684
685 /* convert field_name to an object */
686 fieldobj = get_field_object(field_name, args, kwargs);
687 if (fieldobj == NULL)
688 goto done;
689
690 if (conversion != '\0') {
691 tmp = do_conversion(fieldobj, conversion);
692 if (tmp == NULL)
693 goto done;
694
695 /* do the assignment, transferring ownership: fieldobj = tmp */
696 Py_DECREF(fieldobj);
697 fieldobj = tmp;
698 tmp = NULL;
699 }
700
701 /* if needed, recurively compute the format_spec */
702 if (format_spec_needs_expanding) {
703 tmp = build_string(format_spec, args, kwargs, recursion_level);
704 if (tmp == NULL)
705 goto done;
706
707 /* note that in the case we're expanding the format string,
708 tmp must be kept around until after the call to
709 render_field. */
710 SubString_init(&expanded_format_spec,
711 STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp));
712 actual_format_spec = &expanded_format_spec;
713 } else
714 actual_format_spec = format_spec;
715
716 if (render_field(fieldobj, actual_format_spec, output) == 0)
717 goto done;
718
719 result = 1;
720
721done:
722 Py_XDECREF(fieldobj);
723 Py_XDECREF(tmp);
724
725 return result;
726}
727
728/*
729 do_markup is the top-level loop for the format() function. It
730 searches through the format string for escapes to markup codes, and
731 calls other functions to move non-markup text to the output,
732 and to perform the markup to the output.
733*/
734static int
735do_markup(SubString *input, PyObject *args, PyObject *kwargs,
736 OutputString *output, int *recursion_level)
737{
738 MarkupIterator iter;
739 int is_markup;
740 int format_spec_needs_expanding;
741 int result;
742 SubString str;
743 SubString field_name;
744 SubString format_spec;
745 STRINGLIB_CHAR conversion;
746
747 MarkupIterator_init(&iter, input->ptr, input->end - input->ptr);
748 while ((result = MarkupIterator_next(&iter, &is_markup, &str, &field_name,
749 &format_spec, &conversion,
750 &format_spec_needs_expanding)) == 2) {
751 if (is_markup) {
752 if (!output_markup(&field_name, &format_spec,
753 format_spec_needs_expanding, conversion, output,
754 args, kwargs, recursion_level))
755 return 0;
756 } else {
757 if (!output_data(output, str.ptr, str.end-str.ptr))
758 return 0;
759 }
760 }
761 return result;
762}
763
764
765/*
766 build_string allocates the output string and then
767 calls do_markup to do the heavy lifting.
768*/
769static PyObject *
770build_string(SubString *input, PyObject *args, PyObject *kwargs,
771 int *recursion_level)
772{
773 OutputString output;
774 PyObject *result = NULL;
775 Py_ssize_t count;
776
777 output.obj = NULL; /* needed so cleanup code always works */
778
779 /* check the recursion level */
780 (*recursion_level)--;
781 if (*recursion_level < 0) {
782 PyErr_SetString(PyExc_ValueError,
783 "Max string recursion exceeded");
784 goto done;
785 }
786
787 /* initial size is the length of the format string, plus the size
788 increment. seems like a reasonable default */
789 if (!output_initialize(&output,
790 input->end - input->ptr +
791 INITIAL_SIZE_INCREMENT))
792 goto done;
793
794 if (!do_markup(input, args, kwargs, &output, recursion_level)) {
795 goto done;
796 }
797
798 count = output.ptr - STRINGLIB_STR(output.obj);
799 if (STRINGLIB_RESIZE(&output.obj, count) < 0) {
800 goto done;
801 }
802
803 /* transfer ownership to result */
804 result = output.obj;
805 output.obj = NULL;
806
807done:
808 (*recursion_level)++;
809 Py_XDECREF(output.obj);
810 return result;
811}
812
813/************************************************************************/
814/*********** main routine ***********************************************/
815/************************************************************************/
816
817/* this is the main entry point */
818static PyObject *
819do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
820{
821 SubString input;
822
823 /* PEP 3101 says only 2 levels, so that
824 "{0:{1}}".format('abc', 's') # works
825 "{0:{1:{2}}}".format('abc', 's', '') # fails
826 */
827 int recursion_level = 2;
828
829 SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self));
830 return build_string(&input, args, kwargs, &recursion_level);
831}