blob: 5f71ec8102935f9b6b3059357c54a8a5071c87b2 [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{string} ---
Fred Drakeffbe6871999-04-22 21:23:22 +00002 Common string operations}
Fred Drakeb91e9341998-07-23 17:59:49 +00003
Fred Drakeffbe6871999-04-22 21:23:22 +00004\declaremodule{standard}{string}
Fred Drakeb91e9341998-07-23 17:59:49 +00005\modulesynopsis{Common string operations.}
6
Guido van Rossum5fdeeea1994-01-02 01:22:07 +00007
8This module defines some constants useful for checking character
Fred Drake6d2bdb61997-12-16 04:04:25 +00009classes and some useful string functions. See the module
Fred Drakeffbe6871999-04-22 21:23:22 +000010\refmodule{re}\refstmodindex{re} for string functions based on regular
Fred Drakecce10901998-03-17 06:33:25 +000011expressions.
Guido van Rossum0bf4d891995-03-02 12:37:30 +000012
Andrew M. Kuchlingbe063022000-12-26 16:14:32 +000013The constants defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000014
Fred Drake960fdf92001-07-20 18:38:26 +000015\begin{datadesc}{ascii_letters}
16 The concatenation of the \constant{ascii_lowercase} and
17 \constant{ascii_uppercase} constants described below. This value is
18 not locale-dependent.
19\end{datadesc}
20
21\begin{datadesc}{ascii_lowercase}
22 The lowercase letters \code{'abcdefghijklmnopqrstuvwxyz'}. This
23 value is not locale-dependent and will not change.
24\end{datadesc}
25
26\begin{datadesc}{ascii_uppercase}
27 The uppercase letters \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. This
28 value is not locale-dependent and will not change.
29\end{datadesc}
30
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000031\begin{datadesc}{digits}
32 The string \code{'0123456789'}.
33\end{datadesc}
34
35\begin{datadesc}{hexdigits}
36 The string \code{'0123456789abcdefABCDEF'}.
37\end{datadesc}
38
39\begin{datadesc}{letters}
Fred Drake0682be42000-04-10 18:35:49 +000040 The concatenation of the strings \constant{lowercase} and
Fred Drake960fdf92001-07-20 18:38:26 +000041 \constant{uppercase} described below. The specific value is
42 locale-dependent, and will be updated when
43 \function{locale.setlocale()} is called.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000044\end{datadesc}
45
46\begin{datadesc}{lowercase}
47 A string containing all the characters that are considered lowercase
48 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000049 \code{'abcdefghijklmnopqrstuvwxyz'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000050 the effect on the routines \function{upper()} and
Fred Drake960fdf92001-07-20 18:38:26 +000051 \function{swapcase()} is undefined. The specific value is
52 locale-dependent, and will be updated when
53 \function{locale.setlocale()} is called.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000054\end{datadesc}
55
56\begin{datadesc}{octdigits}
57 The string \code{'01234567'}.
58\end{datadesc}
59
Fred Drake480abc22000-09-18 16:48:13 +000060\begin{datadesc}{punctuation}
61 String of \ASCII{} characters which are considered punctuation
62 characters in the \samp{C} locale.
63\end{datadesc}
64
65\begin{datadesc}{printable}
66 String of characters which are considered printable. This is a
67 combination of \constant{digits}, \constant{letters},
68 \constant{punctuation}, and \constant{whitespace}.
69\end{datadesc}
70
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000071\begin{datadesc}{uppercase}
72 A string containing all the characters that are considered uppercase
73 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000074 \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000075 the effect on the routines \function{lower()} and
Fred Drake960fdf92001-07-20 18:38:26 +000076 \function{swapcase()} is undefined. The specific value is
77 locale-dependent, and will be updated when
78 \function{locale.setlocale()} is called.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000079\end{datadesc}
80
81\begin{datadesc}{whitespace}
82 A string containing all characters that are considered whitespace.
83 On most systems this includes the characters space, tab, linefeed,
Guido van Rossum86751151995-02-28 17:14:32 +000084 return, formfeed, and vertical tab. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000085 the effect on the routines \function{strip()} and \function{split()}
86 is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000087\end{datadesc}
88
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000089
Many of the functions provided by this module are also defined as
methods of string and Unicode objects; see ``String Methods''
(section~\ref{string-methods}) for more information on those.
The functions defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000094
95\begin{funcdesc}{atof}{s}
Fred Drake15f06662000-10-04 13:59:52 +000096 \deprecated{2.0}{Use the \function{float()} built-in function.}
  Convert a string to a floating point number. The string must have
  the standard syntax for a floating point literal in Python,
  optionally preceded by a sign (\samp{+} or \samp{-}). Note that
  this behaves identically to the built-in function
  \function{float()}\bifuncindex{float} when passed a string.
102
Fred Drake0aa811c2001-10-20 04:24:09 +0000103 \note{When passing in a string, values for NaN\index{NaN}
Fred Drake70a66c91999-02-18 16:08:36 +0000104 and Infinity\index{Infinity} may be returned, depending on the
105 underlying C library. The specific set of strings accepted which
106 cause these values to be returned depends entirely on the C library
Fred Drake0aa811c2001-10-20 04:24:09 +0000107 and is known to vary.}
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000108\end{funcdesc}
109
Fred Drakecce10901998-03-17 06:33:25 +0000110\begin{funcdesc}{atoi}{s\optional{, base}}
Fred Drake15f06662000-10-04 13:59:52 +0000111 \deprecated{2.0}{Use the \function{int()} built-in function.}
Fred Drakee8489761998-12-21 18:56:13 +0000112 Convert string \var{s} to an integer in the given \var{base}. The
113 string must consist of one or more digits, optionally preceded by a
114 sign (\samp{+} or \samp{-}). The \var{base} defaults to 10. If it
115 is 0, a default base is chosen depending on the leading characters
116 of the string (after stripping the sign): \samp{0x} or \samp{0X}
117 means 16, \samp{0} means 8, anything else means 10. If \var{base}
Fred Drakefffe5db2000-09-21 05:25:30 +0000118 is 16, a leading \samp{0x} or \samp{0X} is always accepted, though
119 not required. This behaves identically to the built-in function
120 \function{int()} when passed a string. (Also note: for a more
121 flexible interpretation of numeric literals, use the built-in
122 function \function{eval()}\bifuncindex{eval}.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000123\end{funcdesc}
124
Fred Drakecce10901998-03-17 06:33:25 +0000125\begin{funcdesc}{atol}{s\optional{, base}}
Fred Drake15f06662000-10-04 13:59:52 +0000126 \deprecated{2.0}{Use the \function{long()} built-in function.}
  Convert string \var{s} to a long integer in the given \var{base}.
  The string must consist of one or more digits, optionally preceded
  by a sign (\samp{+} or \samp{-}). The \var{base} argument has the
  same meaning as for \function{atoi()}. A trailing \samp{l} or
  \samp{L} is not allowed, except if the base is 0. Note that when
  invoked without \var{base} or with \var{base} set to 10, this
  behaves identically to the built-in function
  \function{long()}\bifuncindex{long} when passed a string.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000135\end{funcdesc}
136
Guido van Rossume5e55d71996-08-09 21:44:51 +0000137\begin{funcdesc}{capitalize}{word}
Fred Drakee8489761998-12-21 18:56:13 +0000138 Capitalize the first character of the argument.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000139\end{funcdesc}
140
141\begin{funcdesc}{capwords}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000142 Split the argument into words using \function{split()}, capitalize
143 each word using \function{capitalize()}, and join the capitalized
144 words using \function{join()}. Note that this replaces runs of
145 whitespace characters by a single space, and removes leading and
146 trailing whitespace.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000147\end{funcdesc}
148
Fred Drake15f06662000-10-04 13:59:52 +0000149\begin{funcdesc}{expandtabs}{s\optional{, tabsize}}
Fred Drakee8489761998-12-21 18:56:13 +0000150 Expand tabs in a string, i.e.\ replace them by one or more spaces,
151 depending on the current column and the given tab size. The column
152 number is reset to zero after each newline occurring in the string.
153 This doesn't understand other non-printing characters or escape
Guido van Rossum9700e9b1999-01-25 22:31:53 +0000154 sequences. The tab size defaults to 8.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000155\end{funcdesc}
156
\begin{funcdesc}{find}{s, sub\optional{, start\optional{, end}}}
  Return the lowest index in \var{s} where the substring \var{sub} is
  found such that \var{sub} is wholly contained in
  \code{\var{s}[\var{start}:\var{end}]}. Return \code{-1} on failure.
  Defaults for \var{start} and \var{end} and interpretation of
  negative values are the same as for slices.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000163\end{funcdesc}
164
Fred Drakecce10901998-03-17 06:33:25 +0000165\begin{funcdesc}{rfind}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000166 Like \function{find()} but find the highest index.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000167\end{funcdesc}
168
Fred Drakecce10901998-03-17 06:33:25 +0000169\begin{funcdesc}{index}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000170 Like \function{find()} but raise \exception{ValueError} when the
171 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000172\end{funcdesc}
173
Fred Drakecce10901998-03-17 06:33:25 +0000174\begin{funcdesc}{rindex}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000175 Like \function{rfind()} but raise \exception{ValueError} when the
176 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000177\end{funcdesc}
178
Fred Drakecce10901998-03-17 06:33:25 +0000179\begin{funcdesc}{count}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000180 Return the number of (non-overlapping) occurrences of substring
181 \var{sub} in string \code{\var{s}[\var{start}:\var{end}]}.
182 Defaults for \var{start} and \var{end} and interpretation of
Andrew M. Kuchlinga4ca07c2000-06-21 01:48:46 +0000183 negative values are the same as for slices.
Guido van Rossumab3a2501994-08-01 12:18:36 +0000184\end{funcdesc}
185
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000186\begin{funcdesc}{lower}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000187 Return a copy of \var{s}, but with upper case letters converted to
188 lower case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000189\end{funcdesc}
190
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000191\begin{funcdesc}{maketrans}{from, to}
Fred Drakee8489761998-12-21 18:56:13 +0000192 Return a translation table suitable for passing to
193 \function{translate()} or \function{regex.compile()}, that will map
194 each character in \var{from} into the character at the same position
195 in \var{to}; \var{from} and \var{to} must have the same length.
Guido van Rossuma3eebe61998-06-11 16:03:30 +0000196
Fred Drake0aa811c2001-10-20 04:24:09 +0000197 \warning{Don't use strings derived from \constant{lowercase}
Fred Drake0682be42000-04-10 18:35:49 +0000198 and \constant{uppercase} as arguments; in some locales, these don't have
Fred Drakee8489761998-12-21 18:56:13 +0000199 the same length. For case conversions, always use
Fred Drake0aa811c2001-10-20 04:24:09 +0000200 \function{lower()} and \function{upper()}.}
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000201\end{funcdesc}
202
Fred Drakecce10901998-03-17 06:33:25 +0000203\begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000204 Return a list of the words of the string \var{s}. If the optional
205 second argument \var{sep} is absent or \code{None}, the words are
206 separated by arbitrary strings of whitespace characters (space, tab,
207 newline, return, formfeed). If the second argument \var{sep} is
208 present and not \code{None}, it specifies a string to be used as the
Fred Drakea7ce52b01999-05-27 17:18:08 +0000209 word separator. The returned list will then have one more item
Fred Drakee8489761998-12-21 18:56:13 +0000210 than the number of non-overlapping occurrences of the separator in
  the string. The optional third argument \var{maxsplit} defaults to
  0. If it is nonzero, at most \var{maxsplit} splits occur,
  and the remainder of the string is returned as the final element of
  the list (thus, the list will have at most \code{\var{maxsplit}+1}
  elements).
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000216\end{funcdesc}
217
Fred Drakecce10901998-03-17 06:33:25 +0000218\begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000219 This function behaves identically to \function{split()}. (In the
220 past, \function{split()} was only used with one argument, while
221 \function{splitfields()} was only used with two arguments.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000222\end{funcdesc}
223
Fred Drakecce10901998-03-17 06:33:25 +0000224\begin{funcdesc}{join}{words\optional{, sep}}
Fred Drakee8489761998-12-21 18:56:13 +0000225 Concatenate a list or tuple of words with intervening occurrences of
226 \var{sep}. The default value for \var{sep} is a single space
227 character. It is always true that
228 \samp{string.join(string.split(\var{s}, \var{sep}), \var{sep})}
229 equals \var{s}.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000230\end{funcdesc}
231
Fred Drakecce10901998-03-17 06:33:25 +0000232\begin{funcdesc}{joinfields}{words\optional{, sep}}
  This function behaves identically to \function{join()}. (In the past,
  \function{join()} was only used with one argument, while
  \function{joinfields()} was only used with two arguments.)
Guido van Rossume5e55d71996-08-09 21:44:51 +0000236\end{funcdesc}
237
238\begin{funcdesc}{lstrip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000239 Return a copy of \var{s} but without leading whitespace characters.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000240\end{funcdesc}
241
242\begin{funcdesc}{rstrip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000243 Return a copy of \var{s} but without trailing whitespace
244 characters.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000245\end{funcdesc}
246
247\begin{funcdesc}{strip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000248 Return a copy of \var{s} without leading or trailing whitespace.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000249\end{funcdesc}
250
251\begin{funcdesc}{swapcase}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000252 Return a copy of \var{s}, but with lower case letters
253 converted to upper case and vice versa.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000254\end{funcdesc}
255
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000256\begin{funcdesc}{translate}{s, table\optional{, deletechars}}
Fred Drakee8489761998-12-21 18:56:13 +0000257 Delete all characters from \var{s} that are in \var{deletechars} (if
258 present), and then translate the characters using \var{table}, which
259 must be a 256-character string giving the translation for each
260 character value, indexed by its ordinal.
Guido van Rossumf65f2781995-09-13 17:37:21 +0000261\end{funcdesc}
262
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000263\begin{funcdesc}{upper}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000264 Return a copy of \var{s}, but with lower case letters converted to
265 upper case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000266\end{funcdesc}
267
Fred Drakecce10901998-03-17 06:33:25 +0000268\begin{funcdesc}{ljust}{s, width}
269\funcline{rjust}{s, width}
270\funcline{center}{s, width}
  These functions respectively left-justify, right-justify and center
  a string in a field of given width. They return a string that is at
  least \var{width} characters wide, created by padding the string
  \var{s} with spaces on the right, left or both sides until the given
  width is reached. The string is never truncated.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000276\end{funcdesc}
277
Fred Drakecce10901998-03-17 06:33:25 +0000278\begin{funcdesc}{zfill}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000279 Pad a numeric string on the left with zero digits until the given
280 width is reached. Strings starting with a sign are handled
281 correctly.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000282\end{funcdesc}
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000283
Guido van Rossum740eb821997-04-02 05:56:16 +0000284\begin{funcdesc}{replace}{str, old, new\optional{, maxsplit}}
Fred Drakee8489761998-12-21 18:56:13 +0000285 Return a copy of string \var{str} with all occurrences of substring
286 \var{old} replaced by \var{new}. If the optional argument
287 \var{maxsplit} is given, the first \var{maxsplit} occurrences are
288 replaced.
Guido van Rossumc8a80cd1997-03-25 16:41:31 +0000289\end{funcdesc}