blob: 950291c579c88363bd09e50cd4e402fe9b1e2047 [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{string} ---
Fred Drakeffbe6871999-04-22 21:23:22 +00002 Common string operations}
Fred Drakeb91e9341998-07-23 17:59:49 +00003
Fred Drakeffbe6871999-04-22 21:23:22 +00004\declaremodule{standard}{string}
Fred Drakeb91e9341998-07-23 17:59:49 +00005\modulesynopsis{Common string operations.}
6
Guido van Rossum5fdeeea1994-01-02 01:22:07 +00007
8This module defines some constants useful for checking character
Fred Drake6d2bdb61997-12-16 04:04:25 +00009classes and some useful string functions. See the module
Fred Drakeffbe6871999-04-22 21:23:22 +000010\refmodule{re}\refstmodindex{re} for string functions based on regular
Fred Drakecce10901998-03-17 06:33:25 +000011expressions.
Guido van Rossum0bf4d891995-03-02 12:37:30 +000012
Andrew M. Kuchlingbe063022000-12-26 16:14:32 +000013The constants defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000014
Fred Drake960fdf92001-07-20 18:38:26 +000015\begin{datadesc}{ascii_letters}
16 The concatenation of the \constant{ascii_lowercase} and
17 \constant{ascii_uppercase} constants described below. This value is
18 not locale-dependent.
19\end{datadesc}
20
21\begin{datadesc}{ascii_lowercase}
22 The lowercase letters \code{'abcdefghijklmnopqrstuvwxyz'}. This
23 value is not locale-dependent and will not change.
24\end{datadesc}
25
26\begin{datadesc}{ascii_uppercase}
27 The uppercase letters \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. This
28 value is not locale-dependent and will not change.
29\end{datadesc}
30
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000031\begin{datadesc}{digits}
32 The string \code{'0123456789'}.
33\end{datadesc}
34
35\begin{datadesc}{hexdigits}
36 The string \code{'0123456789abcdefABCDEF'}.
37\end{datadesc}
38
39\begin{datadesc}{letters}
Fred Drake0682be42000-04-10 18:35:49 +000040 The concatenation of the strings \constant{lowercase} and
Fred Drake960fdf92001-07-20 18:38:26 +000041 \constant{uppercase} described below. The specific value is
42 locale-dependent, and will be updated when
43 \function{locale.setlocale()} is called.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000044\end{datadesc}
45
46\begin{datadesc}{lowercase}
47 A string containing all the characters that are considered lowercase
48 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000049 \code{'abcdefghijklmnopqrstuvwxyz'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000050 the effect on the routines \function{upper()} and
Fred Drake960fdf92001-07-20 18:38:26 +000051 \function{swapcase()} is undefined. The specific value is
52 locale-dependent, and will be updated when
53 \function{locale.setlocale()} is called.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000054\end{datadesc}
55
56\begin{datadesc}{octdigits}
57 The string \code{'01234567'}.
58\end{datadesc}
59
Fred Drake480abc22000-09-18 16:48:13 +000060\begin{datadesc}{punctuation}
61 String of \ASCII{} characters which are considered punctuation
62 characters in the \samp{C} locale.
63\end{datadesc}
64
65\begin{datadesc}{printable}
66 String of characters which are considered printable. This is a
67 combination of \constant{digits}, \constant{letters},
68 \constant{punctuation}, and \constant{whitespace}.
69\end{datadesc}
70
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000071\begin{datadesc}{uppercase}
72 A string containing all the characters that are considered uppercase
73 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000074 \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000075 the effect on the routines \function{lower()} and
Fred Drake960fdf92001-07-20 18:38:26 +000076 \function{swapcase()} is undefined. The specific value is
77 locale-dependent, and will be updated when
78 \function{locale.setlocale()} is called.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000079\end{datadesc}
80
81\begin{datadesc}{whitespace}
82 A string containing all characters that are considered whitespace.
83 On most systems this includes the characters space, tab, linefeed,
Guido van Rossum86751151995-02-28 17:14:32 +000084 return, formfeed, and vertical tab. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000085 the effect on the routines \function{strip()} and \function{split()}
86 is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000087\end{datadesc}
88
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000089
Many of the functions provided by this module are also defined as
methods of string and Unicode objects; see ``String Methods''
(section~\ref{string-methods}) for more information on those.
The functions defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000094
\begin{funcdesc}{atof}{s}
  \deprecated{2.0}{Use the \function{float()} built-in function.}
  Convert a string to a floating point number. The string must have
  the standard syntax for a floating point literal in Python,
  optionally preceded by a sign (\samp{+} or \samp{-}). Note that
  this behaves identically to the built-in function
  \function{float()}\bifuncindex{float} when passed a string.

  \note{When passing in a string, values for NaN\index{NaN}
  and Infinity\index{Infinity} may be returned, depending on the
  underlying C library. The specific set of strings accepted which
  cause these values to be returned depends entirely on the C library
  and is known to vary.}
\end{funcdesc}
109
Fred Drakecce10901998-03-17 06:33:25 +0000110\begin{funcdesc}{atoi}{s\optional{, base}}
Fred Drake15f06662000-10-04 13:59:52 +0000111 \deprecated{2.0}{Use the \function{int()} built-in function.}
Fred Drakee8489761998-12-21 18:56:13 +0000112 Convert string \var{s} to an integer in the given \var{base}. The
113 string must consist of one or more digits, optionally preceded by a
114 sign (\samp{+} or \samp{-}). The \var{base} defaults to 10. If it
115 is 0, a default base is chosen depending on the leading characters
116 of the string (after stripping the sign): \samp{0x} or \samp{0X}
117 means 16, \samp{0} means 8, anything else means 10. If \var{base}
Fred Drakefffe5db2000-09-21 05:25:30 +0000118 is 16, a leading \samp{0x} or \samp{0X} is always accepted, though
119 not required. This behaves identically to the built-in function
120 \function{int()} when passed a string. (Also note: for a more
121 flexible interpretation of numeric literals, use the built-in
122 function \function{eval()}\bifuncindex{eval}.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000123\end{funcdesc}
124
\begin{funcdesc}{atol}{s\optional{, base}}
  \deprecated{2.0}{Use the \function{long()} built-in function.}
  Convert string \var{s} to a long integer in the given \var{base}.
  The string must consist of one or more digits, optionally preceded
  by a sign (\samp{+} or \samp{-}). The \var{base} argument has the
  same meaning as for \function{atoi()}. A trailing \samp{l} or
  \samp{L} is not allowed, except if the base is 0. Note that when
  invoked without \var{base} or with \var{base} set to 10, this
  behaves identically to the built-in function
  \function{long()}\bifuncindex{long} when passed a string.
\end{funcdesc}
136
Guido van Rossume5e55d71996-08-09 21:44:51 +0000137\begin{funcdesc}{capitalize}{word}
Fred Drake473f46a2002-06-20 21:18:46 +0000138 Return a copy of \var{word} with only its first character capitalized.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000139\end{funcdesc}
140
141\begin{funcdesc}{capwords}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000142 Split the argument into words using \function{split()}, capitalize
143 each word using \function{capitalize()}, and join the capitalized
144 words using \function{join()}. Note that this replaces runs of
145 whitespace characters by a single space, and removes leading and
146 trailing whitespace.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000147\end{funcdesc}
148
Fred Drake15f06662000-10-04 13:59:52 +0000149\begin{funcdesc}{expandtabs}{s\optional{, tabsize}}
Fred Drakee8489761998-12-21 18:56:13 +0000150 Expand tabs in a string, i.e.\ replace them by one or more spaces,
151 depending on the current column and the given tab size. The column
152 number is reset to zero after each newline occurring in the string.
153 This doesn't understand other non-printing characters or escape
Guido van Rossum9700e9b1999-01-25 22:31:53 +0000154 sequences. The tab size defaults to 8.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000155\end{funcdesc}
156
\begin{funcdesc}{find}{s, sub\optional{, start\optional{, end}}}
  Return the lowest index in \var{s} where the substring \var{sub} is
  found such that \var{sub} is wholly contained in
  \code{\var{s}[\var{start}:\var{end}]}. Return \code{-1} on failure.
  Defaults for \var{start} and \var{end} and interpretation of
  negative values are the same as for slices.
\end{funcdesc}
164
Fred Drakecce10901998-03-17 06:33:25 +0000165\begin{funcdesc}{rfind}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000166 Like \function{find()} but find the highest index.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000167\end{funcdesc}
168
Fred Drakecce10901998-03-17 06:33:25 +0000169\begin{funcdesc}{index}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000170 Like \function{find()} but raise \exception{ValueError} when the
171 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000172\end{funcdesc}
173
Fred Drakecce10901998-03-17 06:33:25 +0000174\begin{funcdesc}{rindex}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000175 Like \function{rfind()} but raise \exception{ValueError} when the
176 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000177\end{funcdesc}
178
Fred Drakecce10901998-03-17 06:33:25 +0000179\begin{funcdesc}{count}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000180 Return the number of (non-overlapping) occurrences of substring
181 \var{sub} in string \code{\var{s}[\var{start}:\var{end}]}.
182 Defaults for \var{start} and \var{end} and interpretation of
Andrew M. Kuchlinga4ca07c2000-06-21 01:48:46 +0000183 negative values are the same as for slices.
Guido van Rossumab3a2501994-08-01 12:18:36 +0000184\end{funcdesc}
185
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000186\begin{funcdesc}{lower}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000187 Return a copy of \var{s}, but with upper case letters converted to
188 lower case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000189\end{funcdesc}
190
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000191\begin{funcdesc}{maketrans}{from, to}
Fred Drakee8489761998-12-21 18:56:13 +0000192 Return a translation table suitable for passing to
193 \function{translate()} or \function{regex.compile()}, that will map
194 each character in \var{from} into the character at the same position
195 in \var{to}; \var{from} and \var{to} must have the same length.
Guido van Rossuma3eebe61998-06-11 16:03:30 +0000196
Fred Drake0aa811c2001-10-20 04:24:09 +0000197 \warning{Don't use strings derived from \constant{lowercase}
Fred Drake0682be42000-04-10 18:35:49 +0000198 and \constant{uppercase} as arguments; in some locales, these don't have
Fred Drakee8489761998-12-21 18:56:13 +0000199 the same length. For case conversions, always use
Fred Drake0aa811c2001-10-20 04:24:09 +0000200 \function{lower()} and \function{upper()}.}
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000201\end{funcdesc}
202
Fred Drakecce10901998-03-17 06:33:25 +0000203\begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000204 Return a list of the words of the string \var{s}. If the optional
205 second argument \var{sep} is absent or \code{None}, the words are
206 separated by arbitrary strings of whitespace characters (space, tab,
207 newline, return, formfeed). If the second argument \var{sep} is
208 present and not \code{None}, it specifies a string to be used as the
Fred Drakea7ce52b01999-05-27 17:18:08 +0000209 word separator. The returned list will then have one more item
Fred Drakee8489761998-12-21 18:56:13 +0000210 than the number of non-overlapping occurrences of the separator in
211 the string. The optional third argument \var{maxsplit} defaults to
212 0. If it is nonzero, at most \var{maxsplit} number of splits occur,
213 and the remainder of the string is returned as the final element of
214 the list (thus, the list will have at most \code{\var{maxsplit}+1}
215 elements).
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000216\end{funcdesc}
217
\begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
  Return a list of the words of the string \var{s}, scanning \var{s}
  from the end. To all intents and purposes, the resulting list of
  words is the same as returned by \function{split()}, except when the
  optional third argument \var{maxsplit} is explicitly specified and
  nonzero. When \var{maxsplit} is nonzero, at most \var{maxsplit}
  number of splits --- the \emph{rightmost} ones --- occur, and the
  remainder of the string is returned as the first element of the list
  (thus, the list will have at most \code{\var{maxsplit}+1} elements).
  \versionadded{2.4}
\end{funcdesc}
229
Fred Drakecce10901998-03-17 06:33:25 +0000230\begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000231 This function behaves identically to \function{split()}. (In the
232 past, \function{split()} was only used with one argument, while
233 \function{splitfields()} was only used with two arguments.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000234\end{funcdesc}
235
Fred Drakecce10901998-03-17 06:33:25 +0000236\begin{funcdesc}{join}{words\optional{, sep}}
Fred Drakee8489761998-12-21 18:56:13 +0000237 Concatenate a list or tuple of words with intervening occurrences of
238 \var{sep}. The default value for \var{sep} is a single space
239 character. It is always true that
240 \samp{string.join(string.split(\var{s}, \var{sep}), \var{sep})}
241 equals \var{s}.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000242\end{funcdesc}
243
Fred Drakecce10901998-03-17 06:33:25 +0000244\begin{funcdesc}{joinfields}{words\optional{, sep}}
Fred Drakeb7c18952002-09-12 14:16:07 +0000245 This function behaves identically to \function{join()}. (In the past,
Fred Drakee8489761998-12-21 18:56:13 +0000246 \function{join()} was only used with one argument, while
247 \function{joinfields()} was only used with two arguments.)
Fred Drakeb7c18952002-09-12 14:16:07 +0000248 Note that there is no \method{joinfields()} method on string
249 objects; use the \method{join()} method instead.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000250\end{funcdesc}
251
\begin{funcdesc}{lstrip}{s\optional{, chars}}
Return a copy of the string with leading characters removed. If
\var{chars} is omitted or \code{None}, whitespace characters are
removed. If given and not \code{None}, \var{chars} must be a string;
the characters in \var{chars} will be stripped from the beginning of
\var{s}.
\versionchanged[The \var{chars} parameter was added. The \var{chars}
parameter cannot be passed in earlier 2.2 versions]{2.2.3}
\end{funcdesc}
261
\begin{funcdesc}{rstrip}{s\optional{, chars}}
Return a copy of the string with trailing characters removed. If
\var{chars} is omitted or \code{None}, whitespace characters are
removed. If given and not \code{None}, \var{chars} must be a string;
the characters in \var{chars} will be stripped from the end of
\var{s}.
\versionchanged[The \var{chars} parameter was added. The \var{chars}
parameter cannot be passed in earlier 2.2 versions]{2.2.3}
\end{funcdesc}
271
\begin{funcdesc}{strip}{s\optional{, chars}}
Return a copy of the string with leading and trailing characters
removed. If \var{chars} is omitted or \code{None}, whitespace
characters are removed. If given and not \code{None}, \var{chars}
must be a string; the characters in \var{chars} will be stripped from
both ends of \var{s}.
\versionchanged[The \var{chars} parameter was added. The \var{chars}
parameter cannot be passed in earlier 2.2 versions]{2.2.3}
\end{funcdesc}
281
282\begin{funcdesc}{swapcase}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000283 Return a copy of \var{s}, but with lower case letters
284 converted to upper case and vice versa.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000285\end{funcdesc}
286
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000287\begin{funcdesc}{translate}{s, table\optional{, deletechars}}
Fred Drakee8489761998-12-21 18:56:13 +0000288 Delete all characters from \var{s} that are in \var{deletechars} (if
289 present), and then translate the characters using \var{table}, which
290 must be a 256-character string giving the translation for each
Raymond Hettinger5c5fca92003-07-13 02:06:47 +0000291 character value, indexed by its ordinal.
Guido van Rossumf65f2781995-09-13 17:37:21 +0000292\end{funcdesc}
293
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000294\begin{funcdesc}{upper}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000295 Return a copy of \var{s}, but with lower case letters converted to
296 upper case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000297\end{funcdesc}
298
Fred Drakecce10901998-03-17 06:33:25 +0000299\begin{funcdesc}{ljust}{s, width}
300\funcline{rjust}{s, width}
301\funcline{center}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000302 These functions respectively left-justify, right-justify and center
303 a string in a field of given width. They return a string that is at
304 least \var{width} characters wide, created by padding the string
305 \var{s} with spaces until the given width on the right, left or both
306 sides. The string is never truncated.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000307\end{funcdesc}
308
Fred Drakecce10901998-03-17 06:33:25 +0000309\begin{funcdesc}{zfill}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000310 Pad a numeric string on the left with zero digits until the given
311 width is reached. Strings starting with a sign are handled
312 correctly.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000313\end{funcdesc}
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000314
Martin v. Löwis8bafb2a2003-11-18 19:48:57 +0000315\begin{funcdesc}{replace}{str, old, new\optional{, maxreplace}}
Fred Drakee8489761998-12-21 18:56:13 +0000316 Return a copy of string \var{str} with all occurrences of substring
317 \var{old} replaced by \var{new}. If the optional argument
Martin v. Löwis8bafb2a2003-11-18 19:48:57 +0000318 \var{maxreplace} is given, the first \var{maxreplace} occurrences are
Fred Drakee8489761998-12-21 18:56:13 +0000319 replaced.
Guido van Rossumc8a80cd1997-03-25 16:41:31 +0000320\end{funcdesc}