blob: 82b96a12da741cc59f8f4b5c5f274a2fb4ca9147 [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{string} ---
Fred Drakeffbe6871999-04-22 21:23:22 +00002 Common string operations}
Fred Drakeb91e9341998-07-23 17:59:49 +00003
Fred Drakeffbe6871999-04-22 21:23:22 +00004\declaremodule{standard}{string}
Fred Drakeb91e9341998-07-23 17:59:49 +00005\modulesynopsis{Common string operations.}
6
Guido van Rossum5fdeeea1994-01-02 01:22:07 +00007
8This module defines some constants useful for checking character
Fred Drake6d2bdb61997-12-16 04:04:25 +00009classes and some useful string functions. See the module
Fred Drakeffbe6871999-04-22 21:23:22 +000010\refmodule{re}\refstmodindex{re} for string functions based on regular
Fred Drakecce10901998-03-17 06:33:25 +000011expressions.
Guido van Rossum0bf4d891995-03-02 12:37:30 +000012
The constants defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000014
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000015\begin{datadesc}{digits}
16 The string \code{'0123456789'}.
17\end{datadesc}
18
19\begin{datadesc}{hexdigits}
20 The string \code{'0123456789abcdefABCDEF'}.
21\end{datadesc}
22
23\begin{datadesc}{letters}
Fred Drake0682be42000-04-10 18:35:49 +000024 The concatenation of the strings \constant{lowercase} and
25 \constant{uppercase} described below.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000026\end{datadesc}
27
28\begin{datadesc}{lowercase}
29 A string containing all the characters that are considered lowercase
30 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000031 \code{'abcdefghijklmnopqrstuvwxyz'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000032 the effect on the routines \function{upper()} and
33 \function{swapcase()} is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000034\end{datadesc}
35
36\begin{datadesc}{octdigits}
37 The string \code{'01234567'}.
38\end{datadesc}
39
Fred Drake480abc22000-09-18 16:48:13 +000040\begin{datadesc}{punctuation}
41 String of \ASCII{} characters which are considered punctuation
42 characters in the \samp{C} locale.
43\end{datadesc}
44
45\begin{datadesc}{printable}
46 String of characters which are considered printable. This is a
47 combination of \constant{digits}, \constant{letters},
48 \constant{punctuation}, and \constant{whitespace}.
49\end{datadesc}
50
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000051\begin{datadesc}{uppercase}
52 A string containing all the characters that are considered uppercase
53 letters. On most systems this is the string
Guido van Rossum86751151995-02-28 17:14:32 +000054 \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000055 the effect on the routines \function{lower()} and
56 \function{swapcase()} is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000057\end{datadesc}
58
59\begin{datadesc}{whitespace}
60 A string containing all characters that are considered whitespace.
61 On most systems this includes the characters space, tab, linefeed,
Guido van Rossum86751151995-02-28 17:14:32 +000062 return, formfeed, and vertical tab. Do not change its definition ---
Fred Drakecce10901998-03-17 06:33:25 +000063 the effect on the routines \function{strip()} and \function{split()}
64 is undefined.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000065\end{datadesc}
66
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000067
Fred Drake1b194f922000-09-09 05:34:06 +000068Many of the functions provided by this module are also defined as
methods of string and Unicode objects; see ``String Methods''
(section~\ref{string-methods}) for more information on those.
71The functions defined in this module are:
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000072
73\begin{funcdesc}{atof}{s}
Fred Drakee8489761998-12-21 18:56:13 +000074 Convert a string to a floating point number. The string must have
75 the standard syntax for a floating point literal in Python,
Fred Drake70a66c91999-02-18 16:08:36 +000076 optionally preceded by a sign (\samp{+} or \samp{-}). Note that
  this behaves identically to the built-in function
78 \function{float()}\bifuncindex{float} when passed a string.
79
80 \strong{Note:} When passing in a string, values for NaN\index{NaN}
81 and Infinity\index{Infinity} may be returned, depending on the
82 underlying C library. The specific set of strings accepted which
83 cause these values to be returned depends entirely on the C library
84 and is known to vary.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +000085\end{funcdesc}
86
Fred Drakecce10901998-03-17 06:33:25 +000087\begin{funcdesc}{atoi}{s\optional{, base}}
Fred Drakee8489761998-12-21 18:56:13 +000088 Convert string \var{s} to an integer in the given \var{base}. The
89 string must consist of one or more digits, optionally preceded by a
90 sign (\samp{+} or \samp{-}). The \var{base} defaults to 10. If it
91 is 0, a default base is chosen depending on the leading characters
92 of the string (after stripping the sign): \samp{0x} or \samp{0X}
93 means 16, \samp{0} means 8, anything else means 10. If \var{base}
94 is 16, a leading \samp{0x} or \samp{0X} is always accepted. Note
95 that when invoked without \var{base} or with \var{base} set to 10,
  this behaves identically to the built-in function \function{int()}
97 when passed a string. (Also note: for a more flexible
98 interpretation of numeric literals, use the built-in function
99 \function{eval()}\bifuncindex{eval}.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000100\end{funcdesc}
101
Fred Drakecce10901998-03-17 06:33:25 +0000102\begin{funcdesc}{atol}{s\optional{, base}}
Fred Drakee8489761998-12-21 18:56:13 +0000103 Convert string \var{s} to a long integer in the given \var{base}.
104 The string must consist of one or more digits, optionally preceded
105 by a sign (\samp{+} or \samp{-}). The \var{base} argument has the
106 same meaning as for \function{atoi()}. A trailing \samp{l} or
107 \samp{L} is not allowed, except if the base is 0. Note that when
108 invoked without \var{base} or with \var{base} set to 10, this
  behaves identically to the built-in function
110 \function{long()}\bifuncindex{long} when passed a string.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000111\end{funcdesc}
112
Guido van Rossume5e55d71996-08-09 21:44:51 +0000113\begin{funcdesc}{capitalize}{word}
Fred Drakee8489761998-12-21 18:56:13 +0000114 Capitalize the first character of the argument.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000115\end{funcdesc}
116
117\begin{funcdesc}{capwords}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000118 Split the argument into words using \function{split()}, capitalize
119 each word using \function{capitalize()}, and join the capitalized
120 words using \function{join()}. Note that this replaces runs of
121 whitespace characters by a single space, and removes leading and
122 trailing whitespace.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000123\end{funcdesc}
124
\begin{funcdesc}{expandtabs}{s\optional{, tabsize}}
Fred Drakee8489761998-12-21 18:56:13 +0000126 Expand tabs in a string, i.e.\ replace them by one or more spaces,
127 depending on the current column and the given tab size. The column
128 number is reset to zero after each newline occurring in the string.
129 This doesn't understand other non-printing characters or escape
Guido van Rossum9700e9b1999-01-25 22:31:53 +0000130 sequences. The tab size defaults to 8.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000131\end{funcdesc}
132
\begin{funcdesc}{find}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000134 Return the lowest index in \var{s} where the substring \var{sub} is
135 found such that \var{sub} is wholly contained in
136 \code{\var{s}[\var{start}:\var{end}]}. Return \code{-1} on failure.
137 Defaults for \var{start} and \var{end} and interpretation of
  negative values are the same as for slices.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000139\end{funcdesc}
140
Fred Drakecce10901998-03-17 06:33:25 +0000141\begin{funcdesc}{rfind}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000142 Like \function{find()} but find the highest index.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000143\end{funcdesc}
144
Fred Drakecce10901998-03-17 06:33:25 +0000145\begin{funcdesc}{index}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000146 Like \function{find()} but raise \exception{ValueError} when the
147 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000148\end{funcdesc}
149
Fred Drakecce10901998-03-17 06:33:25 +0000150\begin{funcdesc}{rindex}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000151 Like \function{rfind()} but raise \exception{ValueError} when the
152 substring is not found.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000153\end{funcdesc}
154
Fred Drakecce10901998-03-17 06:33:25 +0000155\begin{funcdesc}{count}{s, sub\optional{, start\optional{, end}}}
Fred Drakee8489761998-12-21 18:56:13 +0000156 Return the number of (non-overlapping) occurrences of substring
157 \var{sub} in string \code{\var{s}[\var{start}:\var{end}]}.
158 Defaults for \var{start} and \var{end} and interpretation of
Andrew M. Kuchlinga4ca07c2000-06-21 01:48:46 +0000159 negative values are the same as for slices.
Guido van Rossumab3a2501994-08-01 12:18:36 +0000160\end{funcdesc}
161
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000162\begin{funcdesc}{lower}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000163 Return a copy of \var{s}, but with upper case letters converted to
164 lower case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000165\end{funcdesc}
166
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000167\begin{funcdesc}{maketrans}{from, to}
Fred Drakee8489761998-12-21 18:56:13 +0000168 Return a translation table suitable for passing to
169 \function{translate()} or \function{regex.compile()}, that will map
170 each character in \var{from} into the character at the same position
171 in \var{to}; \var{from} and \var{to} must have the same length.
Guido van Rossuma3eebe61998-06-11 16:03:30 +0000172
Fred Drake0682be42000-04-10 18:35:49 +0000173 \strong{Warning:} don't use strings derived from \constant{lowercase}
174 and \constant{uppercase} as arguments; in some locales, these don't have
Fred Drakee8489761998-12-21 18:56:13 +0000175 the same length. For case conversions, always use
176 \function{lower()} and \function{upper()}.
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000177\end{funcdesc}
178
Fred Drakecce10901998-03-17 06:33:25 +0000179\begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000180 Return a list of the words of the string \var{s}. If the optional
181 second argument \var{sep} is absent or \code{None}, the words are
182 separated by arbitrary strings of whitespace characters (space, tab,
183 newline, return, formfeed). If the second argument \var{sep} is
184 present and not \code{None}, it specifies a string to be used as the
Fred Drakea7ce52b01999-05-27 17:18:08 +0000185 word separator. The returned list will then have one more item
Fred Drakee8489761998-12-21 18:56:13 +0000186 than the number of non-overlapping occurrences of the separator in
187 the string. The optional third argument \var{maxsplit} defaults to
  0.  If it is nonzero, at most \var{maxsplit} splits occur,
189 and the remainder of the string is returned as the final element of
190 the list (thus, the list will have at most \code{\var{maxsplit}+1}
191 elements).
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000192\end{funcdesc}
193
Fred Drakecce10901998-03-17 06:33:25 +0000194\begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
Fred Drakee8489761998-12-21 18:56:13 +0000195 This function behaves identically to \function{split()}. (In the
196 past, \function{split()} was only used with one argument, while
197 \function{splitfields()} was only used with two arguments.)
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000198\end{funcdesc}
199
Fred Drakecce10901998-03-17 06:33:25 +0000200\begin{funcdesc}{join}{words\optional{, sep}}
Fred Drakee8489761998-12-21 18:56:13 +0000201 Concatenate a list or tuple of words with intervening occurrences of
202 \var{sep}. The default value for \var{sep} is a single space
203 character. It is always true that
204 \samp{string.join(string.split(\var{s}, \var{sep}), \var{sep})}
205 equals \var{s}.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000206\end{funcdesc}
207
Fred Drakecce10901998-03-17 06:33:25 +0000208\begin{funcdesc}{joinfields}{words\optional{, sep}}
  This function behaves identically to \function{join()}.  (In the past,
210 \function{join()} was only used with one argument, while
211 \function{joinfields()} was only used with two arguments.)
Guido van Rossume5e55d71996-08-09 21:44:51 +0000212\end{funcdesc}
213
214\begin{funcdesc}{lstrip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000215 Return a copy of \var{s} but without leading whitespace characters.
Guido van Rossume5e55d71996-08-09 21:44:51 +0000216\end{funcdesc}
217
218\begin{funcdesc}{rstrip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000219 Return a copy of \var{s} but without trailing whitespace
220 characters.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000221\end{funcdesc}
222
223\begin{funcdesc}{strip}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000224 Return a copy of \var{s} without leading or trailing whitespace.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000225\end{funcdesc}
226
227\begin{funcdesc}{swapcase}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000228 Return a copy of \var{s}, but with lower case letters
229 converted to upper case and vice versa.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000230\end{funcdesc}
231
Guido van Rossumf4d0d571996-07-30 18:23:05 +0000232\begin{funcdesc}{translate}{s, table\optional{, deletechars}}
Fred Drakee8489761998-12-21 18:56:13 +0000233 Delete all characters from \var{s} that are in \var{deletechars} (if
234 present), and then translate the characters using \var{table}, which
235 must be a 256-character string giving the translation for each
236 character value, indexed by its ordinal.
Guido van Rossumf65f2781995-09-13 17:37:21 +0000237\end{funcdesc}
238
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000239\begin{funcdesc}{upper}{s}
Fred Drakee8489761998-12-21 18:56:13 +0000240 Return a copy of \var{s}, but with lower case letters converted to
241 upper case.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000242\end{funcdesc}
243
Fred Drakecce10901998-03-17 06:33:25 +0000244\begin{funcdesc}{ljust}{s, width}
245\funcline{rjust}{s, width}
246\funcline{center}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000247 These functions respectively left-justify, right-justify and center
248 a string in a field of given width. They return a string that is at
249 least \var{width} characters wide, created by padding the string
250 \var{s} with spaces until the given width on the right, left or both
251 sides. The string is never truncated.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000252\end{funcdesc}
253
Fred Drakecce10901998-03-17 06:33:25 +0000254\begin{funcdesc}{zfill}{s, width}
Fred Drakee8489761998-12-21 18:56:13 +0000255 Pad a numeric string on the left with zero digits until the given
256 width is reached. Strings starting with a sign are handled
257 correctly.
Guido van Rossum5fdeeea1994-01-02 01:22:07 +0000258\end{funcdesc}
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000259
Guido van Rossum740eb821997-04-02 05:56:16 +0000260\begin{funcdesc}{replace}{str, old, new\optional{, maxsplit}}
Fred Drakee8489761998-12-21 18:56:13 +0000261 Return a copy of string \var{str} with all occurrences of substring
262 \var{old} replaced by \var{new}. If the optional argument
  \var{maxsplit} is given, only the first \var{maxsplit} occurrences are
264 replaced.
Guido van Rossumc8a80cd1997-03-25 16:41:31 +0000265\end{funcdesc}
266
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000267This module is implemented in Python. Much of its functionality has
Fred Drakecce10901998-03-17 06:33:25 +0000268been reimplemented in the built-in module
269\module{strop}\refbimodindex{strop}. However, you
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000270should \emph{never} import the latter module directly. When
Fred Drakecce10901998-03-17 06:33:25 +0000271\module{string} discovers that \module{strop} exists, it transparently
272replaces parts of itself with the implementation from \module{strop}.
Guido van Rossum0bf4d891995-03-02 12:37:30 +0000273After initialization, there is \emph{no} overhead in using
Fred Drakecce10901998-03-17 06:33:25 +0000274\module{string} instead of \module{strop}.