blob: e95741eb8c31924359b1e3bb43f0a99908fc7b1a [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{string} ---
Fred Drakeffbe6871999-04-22 21:23:22 +00002 Common string operations}
Fred Drakeb91e9341998-07-23 17:59:49 +00003
Fred Drakeffbe6871999-04-22 21:23:22 +00004\declaremodule{standard}{string}
Fred Drakeb91e9341998-07-23 17:59:49 +00005\modulesynopsis{Common string operations.}
6
Guido van Rossum5fdeeea1994-01-02 01:22:07 +00007
8This module defines some constants useful for checking character
Fred Drake6d2bdb61997-12-16 04:04:25 +00009classes and some useful string functions. See the module
Fred Drakeffbe6871999-04-22 21:23:22 +000010\refmodule{re}\refstmodindex{re} for string functions based on regular
Fred Drakecce10901998-03-17 06:33:25 +000011expressions.
Guido van Rossum0bf4d891995-03-02 12:37:30 +000012
Andrew M. Kuchlingbe063022000-12-26 16:14:32 +000013The constants defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000014
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000015\begin{datadesc}{digits}
16 The string \code{'0123456789'}.
17\end{datadesc}
18
19\begin{datadesc}{hexdigits}
20 The string \code{'0123456789abcdefABCDEF'}.
21\end{datadesc}
22
23\begin{datadesc}{letters}
Fred Drake0682be42000-04-10 18:35:49 +000024 The concatenation of the strings \constant{lowercase} and
25 \constant{uppercase} described below.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000026\end{datadesc}
27
28\begin{datadesc}{lowercase}
29 A string containing all the characters that are considered lowercase
30 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000031 \code{'abcdefghijklmnopqrstuvwxyz'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000032 the effect on the routines \function{upper()} and
33 \function{swapcase()} is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000034\end{datadesc}
35
36\begin{datadesc}{octdigits}
37 The string \code{'01234567'}.
38\end{datadesc}
39
Fred Drake480abc22000-09-18 16:48:13 +000040\begin{datadesc}{punctuation}
41 String of \ASCII{} characters which are considered punctuation
42 characters in the \samp{C} locale.
43\end{datadesc}
44
45\begin{datadesc}{printable}
46 String of characters which are considered printable. This is a
47 combination of \constant{digits}, \constant{letters},
48 \constant{punctuation}, and \constant{whitespace}.
49\end{datadesc}
50
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000051\begin{datadesc}{uppercase}
52 A string containing all the characters that are considered uppercase
53 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000054 \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000055 the effect on the routines \function{lower()} and
56 \function{swapcase()} is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000057\end{datadesc}
58
59\begin{datadesc}{whitespace}
60 A string containing all characters that are considered whitespace.
61 On most systems this includes the characters space, tab, linefeed,
Guido van Rossum86751151995-02-28 17:14:32 +000062 return, formfeed, and vertical tab. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000063 the effect on the routines \function{strip()} and \function{split()}
64 is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000065\end{datadesc}
66
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000067
Fred Drake1b194f922000-09-09 05:34:06 +000068Many of the functions provided by this module are also defined as
69methods of string and Unicode objects; see ``String Methods''
70(section~\ref{string-methods}) for more information on those.
71The functions defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000072
73\begin{funcdesc}{atof}{s}
Fred Drake15f06662000-10-04 13:59:52 +000074 \deprecated{2.0}{Use the \function{float()} built-in function.}
Fred Drakee8489761998-12-21 18:56:13 +000075 Convert a string to a floating point number. The string must have
76 the standard syntax for a floating point literal in Python,
Fred Drake70a66c91999-02-18 16:08:36 +000077 optionally preceded by a sign (\samp{+} or \samp{-}). Note that
78 this behaves identically to the built-in function
79 \function{float()}\bifuncindex{float} when passed a string.
80
81 \strong{Note:} When passing in a string, values for NaN\index{NaN}
82 and Infinity\index{Infinity} may be returned, depending on the
83 underlying C library. The specific set of strings accepted which
84 cause these values to be returned depends entirely on the C library
85 and is known to vary.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000086\end{funcdesc}
87
Fred Drakecce10901998-03-17 06:33:25 +000088\begin{funcdesc}{atoi}{s\optional{, base}}
Fred Drake15f06662000-10-04 13:59:52 +000089 \deprecated{2.0}{Use the \function{int()} built-in function.}
Fred Drakee8489761998-12-21 18:56:13 +000090 Convert string \var{s} to an integer in the given \var{base}. The
91 string must consist of one or more digits, optionally preceded by a
92 sign (\samp{+} or \samp{-}). The \var{base} defaults to 10. If it
93 is 0, a default base is chosen depending on the leading characters
94 of the string (after stripping the sign): \samp{0x} or \samp{0X}
95 means 16, \samp{0} means 8, anything else means 10. If \var{base}
Fred Drakefffe5db2000-09-21 05:25:30 +000096 is 16, a leading \samp{0x} or \samp{0X} is always accepted, though
97 not required. This behaves identically to the built-in function
98 \function{int()} when passed a string. (Also note: for a more
99 flexible interpretation of numeric literals, use the built-in
100 function \function{eval()}\bifuncindex{eval}.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000101\end{funcdesc}
102
Fred Drakecce10901998-03-17 06:33:25 +0000103\begin{funcdesc}{atol}{s\optional{, base}}
Fred Drake15f06662000-10-04 13:59:52 +0000104 \deprecated{2.0}{Use the \function{long()} built-in function.}
Fred Drakee8489761998-12-21 18:56:13 +0000105 Convert string \var{s} to a long integer in the given \var{base}.
106 The string must consist of one or more digits, optionally preceded
107 by a sign (\samp{+} or \samp{-}). The \var{base} argument has the
108 same meaning as for \function{atoi()}. A trailing \samp{l} or
109 \samp{L} is not allowed, except if the base is 0. Note that when
110 invoked without \var{base} or with \var{base} set to 10, this
111 behaves identically to the built-in function
112 \function{long()}\bifuncindex{long} when passed a string.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000113\end{funcdesc}
114
Guido van Rossume5e55d71996-08-09 21:44:51 +0000115\begin{funcdesc}{capitalize}{word}
Fred Drakee8489761998-12-21 18:56:13 +0000116 Capitalize the first character of the argument.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000117\end{funcdesc}
118
119\begin{funcdesc}{capwords}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000120 Split the argument into words using \function{split()}, capitalize
121 each word using \function{capitalize()}, and join the capitalized
122 words using \function{join()}. Note that this replaces runs of
123 whitespace characters by a single space, and removes leading and
124 trailing whitespace.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000125\end{funcdesc}
126
Fred Drake15f06662000-10-04 13:59:52 +0000127\begin{funcdesc}{expandtabs}{s\optional{, tabsize}}
Fred Drakee8489761998-12-21 18:56:13 +0000128 Expand tabs in a string, i.e.\ replace them by one or more spaces,
129 depending on the current column and the given tab size. The column
130 number is reset to zero after each newline occurring in the string.
131 This doesn't understand other non-printing characters or escape
Guido van Rossum9700e9b1999-01-25 22:31:53 +0000132 sequences. The tab size defaults to 8.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000133\end{funcdesc}
134
Fred Drakecce10901998-03-17 06:33:25 +0000135\begin{funcdesc}{find}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000136 Return the lowest index in \var{s} where the substring \var{sub} is
137 found such that \var{sub} is wholly contained in
138 \code{\var{s}[\var{start}:\var{end}]}. Return \code{-1} on failure.
139 Defaults for \var{start} and \var{end} and interpretation of
140 negative values are the same as for slices.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000141\end{funcdesc}
142
Fred Drakecce10901998-03-17 06:33:25 +0000143\begin{funcdesc}{rfind}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000144 Like \function{find()} but find the highest index.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000145\end{funcdesc}
146
Fred Drakecce10901998-03-17 06:33:25 +0000147\begin{funcdesc}{index}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000148 Like \function{find()} but raise \exception{ValueError} when the
149 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000150\end{funcdesc}
151
Fred Drakecce10901998-03-17 06:33:25 +0000152\begin{funcdesc}{rindex}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000153 Like \function{rfind()} but raise \exception{ValueError} when the
154 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000155\end{funcdesc}
156
Fred Drakecce10901998-03-17 06:33:25 +0000157\begin{funcdesc}{count}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000158 Return the number of (non-overlapping) occurrences of substring
159 \var{sub} in string \code{\var{s}[\var{start}:\var{end}]}.
160 Defaults for \var{start} and \var{end} and interpretation of
Andrew M. Kuchlinga4ca07c2000-06-21 01:48:46 +0000161 negative values are the same as for slices.
Guido van Rossumab3a2501994-08-01 12:18:36 +0000162\end{funcdesc}
163
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000164\begin{funcdesc}{lower}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000165 Return a copy of \var{s}, but with upper case letters converted to
166 lower case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000167\end{funcdesc}
168
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000169\begin{funcdesc}{maketrans}{from, to}
Fred Drakee8489761998-12-21 18:56:13 +0000170 Return a translation table suitable for passing to
171 \function{translate()} or \function{regex.compile()}, that will map
172 each character in \var{from} into the character at the same position
173 in \var{to}; \var{from} and \var{to} must have the same length.
Guido van Rossuma3eebe61998-06-11 16:03:30 +0000174
Fred Drake0682be42000-04-10 18:35:49 +0000175 \strong{Warning:} don't use strings derived from \constant{lowercase}
176 and \constant{uppercase} as arguments; in some locales, these don't have
Fred Drakee8489761998-12-21 18:56:13 +0000177 the same length. For case conversions, always use
178 \function{lower()} and \function{upper()}.
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000179\end{funcdesc}
180
Fred Drakecce10901998-03-17 06:33:25 +0000181\begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000182 Return a list of the words of the string \var{s}. If the optional
183 second argument \var{sep} is absent or \code{None}, the words are
184 separated by arbitrary strings of whitespace characters (space, tab,
185 newline, return, formfeed). If the second argument \var{sep} is
186 present and not \code{None}, it specifies a string to be used as the
Fred Drakea7ce52b01999-05-27 17:18:08 +0000187 word separator. The returned list will then have one more item
Fred Drakee8489761998-12-21 18:56:13 +0000188 than the number of non-overlapping occurrences of the separator in
189 the string. The optional third argument \var{maxsplit} defaults to
190 0. If it is nonzero, at most \var{maxsplit} splits occur,
191 and the remainder of the string is returned as the final element of
192 the list (thus, the list will have at most \code{\var{maxsplit}+1}
193 elements).
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000194\end{funcdesc}
195
Fred Drakecce10901998-03-17 06:33:25 +0000196\begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000197 This function behaves identically to \function{split()}. (In the
198 past, \function{split()} was only used with one argument, while
199 \function{splitfields()} was only used with two arguments.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000200\end{funcdesc}
201
Fred Drakecce10901998-03-17 06:33:25 +0000202\begin{funcdesc}{join}{words\optional{, sep}}
Fred Drakee8489761998-12-21 18:56:13 +0000203 Concatenate a list or tuple of words with intervening occurrences of
204 \var{sep}. The default value for \var{sep} is a single space
205 character. It is always true that
206 \samp{string.join(string.split(\var{s}, \var{sep}), \var{sep})}
207 equals \var{s}.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000208\end{funcdesc}
209
Fred Drakecce10901998-03-17 06:33:25 +0000210\begin{funcdesc}{joinfields}{words\optional{, sep}}
Fred Drakee8489761998-12-21 18:56:13 +0000211 This function behaves identically to \function{join()}. (In the past,
212 \function{join()} was only used with one argument, while
213 \function{joinfields()} was only used with two arguments.)
Guido van Rossume5e55d71996-08-09 21:44:51 +0000214\end{funcdesc}
215
216\begin{funcdesc}{lstrip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000217 Return a copy of \var{s} but without leading whitespace characters.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000218\end{funcdesc}
219
220\begin{funcdesc}{rstrip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000221 Return a copy of \var{s} but without trailing whitespace
222 characters.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000223\end{funcdesc}
224
225\begin{funcdesc}{strip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000226 Return a copy of \var{s} without leading or trailing whitespace.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000227\end{funcdesc}
228
229\begin{funcdesc}{swapcase}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000230 Return a copy of \var{s}, but with lower case letters
231 converted to upper case and vice versa.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000232\end{funcdesc}
233
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000234\begin{funcdesc}{translate}{s, table\optional{, deletechars}}
Fred Drakee8489761998-12-21 18:56:13 +0000235 Delete all characters from \var{s} that are in \var{deletechars} (if
236 present), and then translate the characters using \var{table}, which
237 must be a 256-character string giving the translation for each
238 character value, indexed by its ordinal.
Guido van Rossumf65f2781995-09-13 17:37:21 +0000239\end{funcdesc}
240
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000241\begin{funcdesc}{upper}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000242 Return a copy of \var{s}, but with lower case letters converted to
243 upper case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000244\end{funcdesc}
245
Fred Drakecce10901998-03-17 06:33:25 +0000246\begin{funcdesc}{ljust}{s, width}
247\funcline{rjust}{s, width}
248\funcline{center}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000249 These functions respectively left-justify, right-justify and center
250 a string in a field of given width. They return a string that is at
251 least \var{width} characters wide, created by padding the string
252 \var{s} with spaces until the given width on the right, left or both
253 sides. The string is never truncated.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000254\end{funcdesc}
255
Fred Drakecce10901998-03-17 06:33:25 +0000256\begin{funcdesc}{zfill}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000257 Pad a numeric string on the left with zero digits until the given
258 width is reached. Strings starting with a sign are handled
259 correctly.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000260\end{funcdesc}
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000261
Guido van Rossum740eb821997-04-02 05:56:16 +0000262\begin{funcdesc}{replace}{str, old, new\optional{, maxsplit}}
Fred Drakee8489761998-12-21 18:56:13 +0000263 Return a copy of string \var{str} with all occurrences of substring
264 \var{old} replaced by \var{new}. If the optional argument
265 \var{maxsplit} is given, the first \var{maxsplit} occurrences are
266 replaced.
Guido van Rossumc8a80cd1997-03-25 16:41:31 +0000267\end{funcdesc}
268
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000269This module is implemented in Python. Much of its functionality has
Fred Drakecce10901998-03-17 06:33:25 +0000270been reimplemented in the built-in module
271\module{strop}\refbimodindex{strop}. However, you
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000272should \emph{never} import the latter module directly. When
Fred Drakecce10901998-03-17 06:33:25 +0000273\module{string} discovers that \module{strop} exists, it transparently
274replaces parts of itself with the implementation from \module{strop}.
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000275After initialization, there is \emph{no} overhead in using
Fred Drakecce10901998-03-17 06:33:25 +0000276\module{string} instead of \module{strop}.