Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attrib' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

__all__ = [

# public symbols

"Comment",

"dump",

"Element", "ElementTree",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

78

"fromstring", "fromstringlist",

Stefan Behnel

b5d3cee

2019-08-23 16:44:25 +0200

[diff] [blame^]

79

"indent", "iselement", "iterparse",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

80

"parse", "ParseError",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

81

"PI", "ProcessingInstruction",

82

"QName",

83

"SubElement",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

84

"tostring", "tostringlist",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

85

"TreeBuilder",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

86

"VERSION",

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

87

"XML", "XMLID",

Martin Panter

dcfebb3

2016-04-01 06:55:55 +0000

[diff] [blame]

88

"XMLParser", "XMLPullParser",

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

89

"register_namespace",

Stefan Behnel

e1d5dd6

2019-05-01 22:34:13 +0200

[diff] [blame]

90

"canonicalize", "C14NWriterTarget",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

91

]

92

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

93

VERSION = "1.3.0"

94

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

95

import sys

96

import re

97

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

98

import io

Serhiy Storchaka

2015-12-07 02:31:11 +0200

[diff] [blame]

99

import collections

Serhiy Storchaka

2e576f5

2017-04-24 09:05:00 +0300

[diff] [blame]

100

import collections.abc

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

101

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

103

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

104

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

105

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

106

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry
    two additional attributes:

        'code'     - the specific exception code
        'position' - the line and column of the error

    """

116

117

# --------------------------------------------------------------------

118

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

119

120

def iselement(element):
    """Return True if *element* looks like an Element.

    The check is purely duck-typed: anything exposing a 'tag' attribute
    (whatever its value) qualifies.
    """
    missing = object()
    return getattr(element, 'tag', missing) is not missing

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

123

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

124

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

125

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        # The shared default dict is safe: it is only read, never mutated,
        # because the attributes are copied into a fresh dict below.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0  # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the child (or slice of children) at *index*.

        For a slice, *element* may be any iterable of elements, including
        a one-shot iterator.  TypeError is raised, and nothing is changed,
        if any item is not an Element.

        """
        if isinstance(index, slice):
            # Materialise first: validating a generator in place would
            # exhaust it and the slice would then be assigned nothing.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements; one-shot
        iterators such as generators are supported.  TypeError is raised,
        and nothing is appended, if any item is not an Element.

        """
        # Take a snapshot so the iterable is walked exactly once; this also
        # makes elem.extend(elem) well defined.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned list is the
        element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is a dictionary view of the
        names, not an independent list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is a dictionary view of
        (name, value) tuples, not an independent list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements)

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            # "*" is a wildcard and behaves exactly like no filter at all.
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return a list of the elements produced by iter(tag)."""
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for instance the
        # factory-function tags set by Comment and ProcessingInstruction)
        # contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

439

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

440

441

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, and *extra*
    are additional attributes given as keyword arguments (these win over
    equally named entries in *attrib*).

    The child is built through parent.makeelement(), so it has whatever type
    that method returns.  The new child is returned.

    """
    # Merge into a fresh dict so neither the caller's mapping nor the shared
    # default argument is ever modified.
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child

457

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

458

459

def Comment(text=None):
    """Comment element factory.

    Build a special element that the standard serializer writes out as an
    XML comment.  The element's tag is this factory function itself and its
    text is the comment body.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

472

473

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element that the standard serializer writes out as an
    XML processing instruction.  The element's tag is this factory function
    itself; its text is *target*, followed by a space and *text* when *text*
    is non-empty.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    """
    node = Element(ProcessingInstruction)
    node.text = (target + " " + text) if text else target
    return node


PI = ProcessingInstruction

490

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

491

492

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances compare, order and hash by their text, and may be compared
    directly against plain values as well as other QName objects.

    """

    def __init__(self, text_or_uri, tag=None):
        # A truthy *tag* means the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    @staticmethod
    def _operand(other):
        # Comparisons look through another QName to its text; any other
        # object is compared as-is.
        return other.text if isinstance(other, QName) else other

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= self._operand(other)

    def __lt__(self, other):
        return self.text < self._operand(other)

    def __ge__(self, other):
        return self.text >= self._operand(other)

    def __gt__(self, other):
        return self.text > self._operand(other)

    def __eq__(self, other):
        return self.text == self._operand(other)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

536

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

537

# --------------------------------------------------------------------

538

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

539

540

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

541

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

542

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

543

This class also provides support for serialization to and from

544

standard XML.

545

546

*element* is an optional root element node,

547

*file* is an optional file handle or file name of an XML file whose

548

contents will be used to initialize the tree with.

549

550

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

551

def __init__(self, element=None, file=None):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

552

# assert element is None or iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

553

self._root = element # first node

if file:

self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

557

def getroot(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

558

"""Return root element of this tree."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

559

return self._root

560

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

561

def _setroot(self, element):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

562

"""Replace root element of this tree.

563

564

This will discard the current contents of the tree and replace it

565

with the given element. Use with care!

566

567

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

568

# assert iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

569

self._root = element

570

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

571

def parse(self, source, parser=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

572

"""Load external XML document into element tree.

573

574

*source* is a file name or file object, *parser* is an optional parser

575

instance that defaults to XMLParser.

576

577

ParseError is raised if the parser fails to parse the document.

578

579

Returns the root element of the given source document.

580

581

"""

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

582

close_source = False

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

583

if not hasattr(source, "read"):

584

source = open(source, "rb")

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

585

close_source = True

586

try:

Eli Bendersky

a369923

2013-05-19 18:47:23 -0700

[diff] [blame]

587

if parser is None:

588

# If no parser was specified, create a default XMLParser

589

parser = XMLParser()

590

if hasattr(parser, '_parse_whole'):

591

# The default XMLParser, when it comes from an accelerator,

592

# can define an internal _parse_whole API for efficiency.

593

# It can be used to parse the whole source without feeding

594

# it with chunks.

595

self._root = parser._parse_whole(source)

596

return self._root

597

while True:

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

598

data = source.read(65536)

if not data:

break

parser.feed(data)

self._root = parser.close()

return self._root

finally:

if close_source:

source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

607

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

608

def iter(self, tag=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

609

"""Create and return tree iterator for the root element.

610

611

The iterator loops over all elements in this tree, in document order.

612

613

*tag* is a string with the tag name to iterate over

614

(default is to return all elements).

615

616

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

617

# assert self._root is not None

618

return self._root.iter(tag)

619

620

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

621

def getiterator(self, tag=None):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

622

warnings.warn(

623

"This method will be removed in future versions. "

624

"Use 'tree.iter()' or 'list(tree.iter())' instead.",

Serhiy Storchaka

2018-07-24 12:03:34 +0300

[diff] [blame]

625

DeprecationWarning, stacklevel=2

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

626

)

627

return list(self.iter(tag))

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

628

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

629

def find(self, path, namespaces=None):
    """Find the first matching element by tag name or path.

    Delegates to the root element, i.e. getroot().find(path).

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever the root element's find() returns for the path.

    """
    # assert self._root is not None
    if path[:1] != "/":
        return self._root.find(path, namespaces)
    # A path with a leading slash is rewritten to be relative to the root;
    # the caller is told which spelling to use instead.
    path = "." + path
    warnings.warn(
        "This search is broken in 1.3 and earlier, and will be "
        "fixed in a future version. If you rely on the current "
        "behaviour, change it to %r" % path,
        FutureWarning, stacklevel=2
    )
    return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

650

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

651

def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element by tag name or path.

    Delegates to the root element, i.e. getroot().findtext(path).

    *path* is a string holding either an element tag or an XPath,
    *default* is passed through to the root element's findtext(),
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever the root element's findtext() returns for the path.

    """
    # assert self._root is not None
    if path[:1] != "/":
        return self._root.findtext(path, default, namespaces)
    # A path with a leading slash is rewritten to be relative to the root;
    # the caller is told which spelling to use instead.
    path = "." + path
    warnings.warn(
        "This search is broken in 1.3 and earlier, and will be "
        "fixed in a future version. If you rely on the current "
        "behaviour, change it to %r" % path,
        FutureWarning, stacklevel=2
    )
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

672

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

673

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the root element, i.e. getroot().findall(path).

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever the root element's findall() returns for the path.

    """
    # assert self._root is not None
    if path[:1] != "/":
        return self._root.findall(path, namespaces)
    # A path with a leading slash is rewritten to be relative to the root;
    # the caller is told which spelling to use instead.
    path = "." + path
    warnings.warn(
        "This search is broken in 1.3 and earlier, and will be "
        "fixed in a future version. If you rely on the current "
        "behaviour, change it to %r" % path,
        FutureWarning, stacklevel=2
    )
    return self._root.findall(path, namespaces)

694

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

695

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path, lazily.

    Delegates to the root element, i.e. getroot().iterfind(path).

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever the root element's iterfind() returns for the path.

    """
    # assert self._root is not None
    if path[:1] != "/":
        return self._root.iterfind(path, namespaces)
    # A path with a leading slash is rewritten to be relative to the root;
    # the caller is told which spelling to use instead.
    path = "." + path
    warnings.warn(
        "This search is broken in 1.3 and earlier, and will be "
        "fixed in a future version. If you rely on the current "
        "behaviour, change it to %r" % path,
        FutureWarning, stacklevel=2
    )
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

716

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

717

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write the element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding; when omitted, "utf-8" is used
                    for the "c14n" method and "us-ascii" otherwise

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, a declaration is
                           added only when the encoding is none of
                           US-ASCII, UTF-8 or Unicode. A declaration is
                           only ever written for the "xml" method.

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as write:
        if method == "xml":
            if xml_declaration is None:
                # Unspecified: declare only encodings a reader could not
                # otherwise assume.
                emit_declaration = enc_lower not in (
                    "utf-8", "us-ascii", "unicode")
            else:
                emit_declaration = xml_declaration
            if emit_declaration:
                declared_encoding = encoding
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
        root = self._root
        if method == "text":
            _serialize_text(write, root)
        else:
            qnames, namespaces = _namespaces(root, default_namespace)
            serialize = _serialize[method]
            serialize(write, root, qnames, namespaces,
                      short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

774

775

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; calling write() with
    method="c14n" is the preferred spelling.

    """
    result = self.write(file, method="c14n")
    return result

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

778

779

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

780

# serialization support

781

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

782

@contextlib.contextmanager

783

def _get_writer(file_or_filename, encoding):

Ezio Melotti

b5bc353

2013-08-17 16:11:40 +0300

[diff] [blame]

784

# returns text write method and release all resources after using

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

785

try:

786

write = file_or_filename.write

787

except AttributeError:

788

# file_or_filename is a file name

789

if encoding == "unicode":

790

file = open(file_or_filename, "w")

791

else:

792

file = open(file_or_filename, "w", encoding=encoding,

793

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

798

# encoding determines if it is a text or binary writer

799

if encoding == "unicode":

800

# use a text writer as is

801

yield write

802

else:

803

# wrap a binary writer with TextIOWrapper

804

with contextlib.ExitStack() as stack:

805

if isinstance(file_or_filename, io.BufferedIOBase):

806

file = file_or_filename

807

elif isinstance(file_or_filename, io.RawIOBase):

808

file = io.BufferedWriter(file_or_filename)

809

# Keep the original file open when the BufferedWriter is

810

# destroyed

811

stack.callback(file.detach)

812

else:

813

# This is to handle passed objects that aren't in the

814

# IOBase hierarchy, but just have a write method

815

file = io.BufferedIOBase()

816

file.writable = lambda: True

817

file.write = write

818

try:

819

# TextIOWrapper uses this methods to determine

820

# if BOM (for UTF-16, etc) should be added

821

file.seekable = file_or_filename.seekable

822

file.tell = file_or_filename.tell

823

except AttributeError:

824

pass

825

file = io.TextIOWrapper(file,

826

encoding=encoding,

827

errors="xmlcharrefreplace",

828

newline="\n")

829

# Keep the original file open when the TextIOWrapper is

830

# destroyed

831

stack.callback(file.detach)

832

yield file.write

833

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

834

def _namespaces(elem, default_namespace=None):

835

# identify namespaces used in this tree

836

837

# maps qnames to *encoded* prefix:local names

838

qnames = {None: None}

839

840

# maps uri:s to prefixes

841

namespaces = {}

842

if default_namespace:

843

namespaces[default_namespace] = ""

844

845

def add_qname(qname):

846

# calculate serialized qname representation

847

try:

848

if qname[:1] == "{":

849

uri, tag = qname[1:].rsplit("}", 1)

850

prefix = namespaces.get(uri)

851

if prefix is None:

852

prefix = _namespace_map.get(uri)

853

if prefix is None:

854

prefix = "ns%d" % len(namespaces)

855

if prefix != "xml":

856

namespaces[uri] = prefix

857

if prefix:

858

qnames[qname] = "%s:%s" % (prefix, tag)

859

else:

860

qnames[qname] = tag # default element

861

else:

862

if default_namespace:

863

# FIXME: can this be handled in XML 1.0?

864

raise ValueError(

865

"cannot use non-qualified names with "

866

"default_namespace option"

867

)

868

qnames[qname] = qname

869

except TypeError:

870

_raise_serialization_error(qname)

871

872

# populate qname and namespaces table

Eli Bendersky

64d11e6

2012-06-15 07:42:50 +0300

[diff] [blame]

873

for elem in elem.iter():

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

874

tag = elem.tag

Senthil Kumaran

ec30b3d

2010-11-09 02:36:59 +0000

[diff] [blame]

875

if isinstance(tag, QName):

876

if tag.text not in qnames:

877

add_qname(tag.text)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

878

elif isinstance(tag, str):

879

if tag not in qnames:

880

add_qname(tag)

881

elif tag is not None and tag is not Comment and tag is not PI:

882

_raise_serialization_error(tag)

883

for key, value in elem.items():

884

if isinstance(key, QName):

885

key = key.text

886

if key not in qnames:

887

add_qname(key)

888

if isinstance(value, QName) and value.text not in qnames:

889

add_qname(value.text)

890

text = elem.text

891

if isinstance(text, QName) and text.text not in qnames:

892

add_qname(text.text)

893

return qnames, namespaces

894

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

895

def _serialize_xml(write, elem, qnames, namespaces,

896

short_empty_elements, **kwargs):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

tag = elem.tag

text = elem.text

if tag is Comment:

write("" % text)

901

elif tag is ProcessingInstruction:

902

write("<?%s?>" % text)

else:

tag = qnames[tag]

if tag is None:

if text:

write(_escape_cdata(text))

908

for e in elem:

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

909

_serialize_xml(write, e, qnames, None,

910

short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

911

else:

912

write("<" + tag)

913

items = list(elem.items())

914

if items or namespaces:

915

if namespaces:

916

for v, k in sorted(namespaces.items(),

917

key=lambda x: x[1]): # sort on prefix

918

if k:

919

k = ":" + k

920

write(" xmlns%s=\"%s\"" % (

921

k,

922

_escape_attrib(v)

923

))

Raymond Hettinger

e3685fd

2018-10-28 11:18:22 -0700

[diff] [blame]

924

for k, v in items:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

925

if isinstance(k, QName):

926

k = k.text

927

if isinstance(v, QName):

928

v = qnames[v.text]

929

else:

930

v = _escape_attrib(v)

931

write(" %s=\"%s\"" % (qnames[k], v))

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

932

if text or len(elem) or not short_empty_elements:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

933

write(">")

934

if text:

935

write(_escape_cdata(text))

936

for e in elem:

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

937

_serialize_xml(write, e, qnames, None,

938

short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

939

write("</" + tag + ">")

else:

write(" />")

if elem.tail:

write(_escape_cdata(elem.tail))

944

945

# Lower-case tag names for which the HTML serializer writes no end tag.
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

953

def _serialize_html(write, elem, qnames, namespaces, **kwargs):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

tag = elem.tag

text = elem.text

if tag is Comment:

write("" % _escape_cdata(text))

958

elif tag is ProcessingInstruction:

959

write("<?%s?>" % _escape_cdata(text))

else:

tag = qnames[tag]

if tag is None:

if text:

write(_escape_cdata(text))

965

for e in elem:

966

_serialize_html(write, e, qnames, None)

967

else:

968

write("<" + tag)

969

items = list(elem.items())

970

if items or namespaces:

971

if namespaces:

972

for v, k in sorted(namespaces.items(),

973

key=lambda x: x[1]): # sort on prefix

974

if k:

975

k = ":" + k

976

write(" xmlns%s=\"%s\"" % (

977

k,

978

_escape_attrib(v)

979

))

Serhiy Storchaka

3b05ad7

2018-10-29 19:31:04 +0200

[diff] [blame]

980

for k, v in items:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

981

if isinstance(k, QName):

982

k = k.text

983

if isinstance(v, QName):

984

v = qnames[v.text]

985

else:

986

v = _escape_attrib_html(v)

987

# FIXME: handle boolean attributes

988

write(" %s=\"%s\"" % (qnames[k], v))

989

write(">")

Christian Heimes

54ad7e3

2013-07-05 01:39:49 +0200

[diff] [blame]

990

ltag = tag.lower()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

991

if text:

Christian Heimes

54ad7e3

2013-07-05 01:39:49 +0200

[diff] [blame]

992

if ltag == "script" or ltag == "style":

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

993

write(text)

994

else:

995

write(_escape_cdata(text))

996

for e in elem:

997

_serialize_html(write, e, qnames, None)

Christian Heimes

54ad7e3

2013-07-05 01:39:49 +0200

[diff] [blame]

998

if ltag not in HTML_EMPTY:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

999

write("</" + tag + ">")

1000

if elem.tail:

1001

write(_escape_cdata(elem.tail))

1002

1003

def _serialize_text(write, elem):

1004

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

_serialize = {

"xml": _serialize_xml,

1011

"html": _serialize_html,

1012

"text": _serialize_text,

1013

# this optional method is imported at the end of the module

1014

# "c14n": _serialize_c14n,

1015

}

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1016

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1017

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1018

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global; any existing entry for either the given
    prefix or the given namespace URI is removed before the new mapping
    is stored.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if *prefix* has the "ns<digits>" form, which is
    reserved for internal use.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting entries first so the registry is not
    # modified while it is being iterated.
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in stale_uris:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

1036

1037

# Global registry mapping namespace URIs to the prefix used for them
# when serializing.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# Expose the registry on the function, for tests and troubleshooting
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1051

1052

def _raise_serialization_error(text):

1053

raise TypeError(

1054

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1055

)

1056

1057

def _escape_cdata(text):

1058

# escape character data

1059

try:

1060

# it's worth avoiding do-nothing calls for strings that are

Mike

53f7a7c

2017-12-14 14:04:53 +0300

[diff] [blame]

1061

# shorter than 500 characters, or so. assume that's, by far,

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1062

# the most common case in most applications.

1063

if "&" in text:

1064

text = text.replace("&", "&")

1065

if "<" in text:

1066

text = text.replace("<", "<")

1067

if ">" in text:

1068

text = text.replace(">", ">")

1069

return text

1070

except (TypeError, AttributeError):

1071

_raise_serialization_error(text)

1072

1073

def _escape_attrib(text):

1074

# escape attribute value

1075

try:

1076

if "&" in text:

1077

text = text.replace("&", "&")

1078

if "<" in text:

1079

text = text.replace("<", "<")

1080

if ">" in text:

1081

text = text.replace(">", ">")

1082

if "\"" in text:

1083

text = text.replace("\"", """)

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1084

# The following business with carriage returns is to satisfy

Raymond Hettinger

11fa3ff

2016-09-11 23:23:24 -0700

[diff] [blame]

1085

# Section 2.11 of the XML specification, stating that

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1086

# CR or CR LN should be replaced with just LN

1087

# http://www.w3.org/TR/REC-xml/#sec-line-ends

1088

if "\r\n" in text:

1089

text = text.replace("\r\n", "\n")

1090

if "\r" in text:

1091

text = text.replace("\r", "\n")

1092

#The following four lines are issue 17582

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1093

if "\n" in text:

1094

text = text.replace("\n", "
")

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1095

if "\t" in text:

1096

text = text.replace("\t", "	")

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1097

return text

1098

except (TypeError, AttributeError):

1099

_raise_serialization_error(text)

1100

1101

def _escape_attrib_html(text):

1102

# escape attribute value

1103

try:

1104

if "&" in text:

1105

text = text.replace("&", "&")

1106

if ">" in text:

1107

text = text.replace(">", ">")

1108

if "\"" in text:

1109

text = text.replace("\"", """)

1110

return text

1111

except (TypeError, AttributeError):

1112

_raise_serialization_error(text)

1113

1114

# --------------------------------------------------------------------

1115

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1116

def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate a string representation of an XML element.

    The element is wrapped in an ElementTree and written to an in-memory
    stream, so all subelements are included. If *encoding* is "unicode"
    a str is returned, otherwise a bytestring.

    *element* is an Element instance. *encoding*, *method*,
    *xml_declaration*, *default_namespace* and *short_empty_elements*
    are handed on unchanged to ElementTree.write().

    Returns the contents of the stream after writing.

    """
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    write_options = {
        "xml_declaration": xml_declaration,
        "default_namespace": default_namespace,
        "method": method,
        "short_empty_elements": short_empty_elements,
    }
    ElementTree(element).write(stream, encoding, **write_options)
    return stream.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1139

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1140

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1141

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1142

def __init__(self, lst):

1143

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1144

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1157

def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate a representation of an XML element as a list of chunks.

    The element is wrapped in an ElementTree and written to a
    _ListDataStream, which collects each piece written into a list.

    *element* is an Element instance. *encoding*, *method*,
    *xml_declaration*, *default_namespace* and *short_empty_elements*
    are handed on unchanged to ElementTree.write().

    Returns the list of collected pieces.

    """
    pieces = []
    write_options = {
        "xml_declaration": xml_declaration,
        "default_namespace": default_namespace,
        "method": method,
        "short_empty_elements": short_empty_elements,
    }
    ElementTree(element).write(_ListDataStream(pieces), encoding,
                               **write_options)
    return pieces

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1168

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1169

1170

def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree or a single Element; a single
    Element is wrapped in an ElementTree first. The exact output format
    is implementation dependent. In this version, it's written as an
    ordinary XML file, followed by a newline unless the root element's
    tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")

1187

Stefan Behnel

b5d3cee

2019-08-23 16:44:25 +0200

[diff] [blame^]

1188

1189

def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The text and tail of
    elements in the subtree are overwritten where they are empty or
    whitespace-only; the tail of the (root) element itself is not touched.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        # Nothing to indent without children.
        return

    # Reduce the memory consumption by reusing indentation strings.
    # Index 0 holds the indentation of the starting level; deeper levels
    # are appended on demand.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    # Levels passed to the helper are indexes into *indentations*, so the
    # walk starts at 0 whatever the initial *level* is.
    _indent_children(tree, 0)

1237

1238

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1239

# --------------------------------------------------------------------

1240

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1241

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1242

1243

def parse(source, parser=None):
    """Parse an XML document into an element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance; both are handed to
    ElementTree.parse() on a freshly created ElementTree.

    Return the ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1255

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1256

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1257

def iterparse(source, events=None, parser=None):
    """Incrementally parse an XML document into an ElementTree.

    The returned iterator reports what is going on to the user based on
    the *events* requested. The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to
    get detailed namespace information). If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs. Its *root*
    attribute is None until the input has been consumed completely and
    is then set to the root returned by the parser.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    # Anything without a read attribute is taken to be a file name; a file
    # opened here is closed again once the generator below finishes.
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    def iterator():
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                chunk = source.read(16 * 1024)
                if not chunk:
                    break
                pullparser.feed(chunk)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # Bound to a single generator instance, so every object of this
        # class advances the same underlying parse.
        __next__ = iterator().__next__

    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    return it

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1303

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1304

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1305

class XMLPullParser:
    """Parser that is fed data in chunks and queues events to be pulled.

    Data is pushed in with feed(); the events collected so far are
    retrieved with read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so that it is raised from read_events(),
            # after the events that were queued before it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # A parser of None marks the end of the stream for feed().
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator. A queued exception is raised when
        it is reached.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1356

1357

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1358

def XML(text, parser=None):
    """Parse an XML document held in a string and return its root Element.

    Handy for embedding "XML literals" directly in Python code.

    *text* is the string of XML data; *parser* is an optional parser
    instance.  When no parser is supplied, a standard XMLParser feeding
    a TreeBuilder is used.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()

1373

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1374

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1375

def XMLID(text, parser=None):
    """Parse an XML document from a string and index its elements by id.

    *text* is the string of XML data; *parser* is an optional parser
    instance.  When no parser is supplied, a standard XMLParser feeding
    a TreeBuilder is used.

    Returns an (Element, dict) tuple in which the dict maps each
    non-empty "id" attribute value to the element carrying it.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    id_map = {}
    for node in root.iter():
        ident = node.get("id")
        if not ident:
            # Elements without an id (or with an empty one) are not indexed.
            continue
        id_map[ident] = node
    return root, id_map

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1396

# Parse XML document from string constant. Alias for XML().

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1397

fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1398

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1399

def fromstringlist(sequence, parser=None):
    """Parse an XML document supplied as a sequence of string fragments.

    *sequence* is a list or other sequence of fragments, each of which
    is fed to the parser in order; *parser* is an optional parser
    instance.  When no parser is supplied, a standard XMLParser feeding
    a TreeBuilder is used.

    Returns an Element instance.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()

1413

1414

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1415

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1416

1417

class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._root = None  # root element
        self._tail = None  # true if we're after an end tag
        if comment_factory is None:
            comment_factory = Comment
        self._comment_factory = comment_factory
        self.insert_comments = insert_comments
        if pi_factory is None:
            pi_factory = ProcessingInstruction
        self._pi_factory = pi_factory
        self.insert_pis = insert_pis
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element.

        Raises AssertionError if elements are still open or if no
        toplevel element was ever started.
        """
        # Raised explicitly (rather than via ``assert``) so that a malformed
        # event stream is still rejected when Python runs with -O.
        if len(self._elem) != 0:
            raise AssertionError("missing end tags")
        if self._root is None:
            raise AssertionError("missing toplevel element")
        return self._root

    def _flush(self):
        # Attach the collected character data to the last element, as its
        # tail when we are past that element's end tag and as its text
        # otherwise.  Data seen before any element exists is discarded.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    # Internal invariant, not input validation.
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        elif self._root is None:
            # The first element opened at the top level becomes the root.
            self._root = elem
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.  Raises AssertionError if it does not
        match the tag of the element being closed.

        """
        self._flush()
        self._last = self._elem.pop()
        # Explicit raise so the mismatch is still detected under -O.
        if not (self._last.tag == tag):
            raise AssertionError(
                "end tag mismatch (expected %s, got %s)" % (
                    self._last.tag, tag))
        self._tail = 1
        return self._last

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node (comment or PI) with *factory*.  It is only
        # linked into the tree, and only affects text/tail bookkeeping, when
        # *insert* is true.
        elem = factory(*args)
        if insert:
            self._flush()
            self._last = elem
            if self._elem:
                self._elem[-1].append(elem)
            self._tail = 1
        return elem

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1537

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1538

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1539

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator; _fixname() later prepends "{".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Exception class of whichever expat module was actually imported.
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks (only installed for methods the target provides)
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds its collaborators as default arguments so that
        # the value current at definition time is captured.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, copying its code and position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Handler for data expat has no dedicated callback for: entity
        # references not resolved by expat and the pieces of a DOCTYPE.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                # Use the error class recorded by __init__ instead of
                # re-importing xml.parsers.expat: it is the class feed() and
                # close() catch, and it honours the pyexpat fallback.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, _, pubid, system = self._doctype
                    if pubid:
                        # drop the surrounding quote characters
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and n == 3:
                    name, _, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored. "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1764

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1765

Stefan Behnel

e1d5dd6

2019-05-01 22:34:13 +0200

[diff] [blame]

1766

# --------------------------------------------------------------------

1767

# C14N 2.0

1768

1769

def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    Input comes from exactly one source: *xml_data* (an XML string) or,
    when *xml_data* is None, *from_file* (a file path or file-like
    object).  ValueError is raised when neither is given.

    Output goes to *out* if provided; it must be a file or file-like
    object whose ``.write()`` method receives the serialised canonical
    XML as text (not bytes).  To write to a file, open it in text mode
    with encoding "utf-8".  In that case the function returns None.
    Without *out*, the output is returned as a text string.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # When the caller gives no sink, collect the output in memory.
    capture = None
    if out is None:
        capture = io.StringIO()
        out = capture

    writer = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # The guard above guarantees from_file is set on this path.
        parse(from_file, parser=parser)

    if capture is None:
        return None
    return capture.getvalue()

1797

1798

1799

# Bound ``match`` of a pre-compiled pattern: returns a match object for text
# shaped like "prefix:name" (word characters, one colon, word characters) and
# None otherwise.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match

1800

1801

1802

class C14NWriterTarget:

1803

"""

1804

Canonicalization writer target for the XMLParser.

1805

1806

Serialises parse events to XML C14N 2.0.

1807

1808

The *write* function is used for writing out the resulting data stream

1809

as text (not bytes). To write to a file, open it in text mode with encoding

1810

"utf-8" and pass its ``.write`` method.

1811

1812

Configuration options:

1813

1814

- *with_comments*: set to true to include comments

1815

- *strip_text*: set to true to strip whitespace before and after text content

1816

- *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"

1817

- *qname_aware_tags*: a set of qname aware tag names in which prefixes

1818

should be replaced in text content

1819

- *qname_aware_attrs*: a set of qname aware attribute names in which prefixes

1820

should be replaced in text content

1821

- *exclude_attrs*: a set of attribute names that should not be serialised

1822

- *exclude_tags*: a set of tag names that should not be serialised

1823

"""

1824

def __init__(self, write, *,

1825

with_comments=False, strip_text=False, rewrite_prefixes=False,

1826

qname_aware_tags=None, qname_aware_attrs=None,

1827

exclude_attrs=None, exclude_tags=None):

1828

self._write = write

1829

self._data = []

1830

self._with_comments = with_comments

1831

self._strip_text = strip_text

1832

self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None

1833

self._exclude_tags = set(exclude_tags) if exclude_tags else None

1834

1835

self._rewrite_prefixes = rewrite_prefixes

1836

if qname_aware_tags:

1837

self._qname_aware_tags = set(qname_aware_tags)

1838

else:

1839

self._qname_aware_tags = None

1840

if qname_aware_attrs:

1841

self._find_qname_aware_attrs = set(qname_aware_attrs).intersection

1842

else:

1843

self._find_qname_aware_attrs = None

1844

1845

# Stack with globally and newly declared namespaces as (uri, prefix) pairs.

1846

self._declared_ns_stack = [[

1847

("http://www.w3.org/XML/1998/namespace", "xml"),

1848

]]

1849

# Stack with user declared namespace prefixes as (uri, prefix) pairs.

1850

self._ns_stack = []

1851

if not rewrite_prefixes:

1852

self._ns_stack.append(list(_namespace_map.items()))

1853

self._ns_stack.append([])

1854

self._prefix_map = {}

1855

self._preserve_space = [False]

1856

self._pending_start = None

1857

self._root_seen = False

1858

self._root_done = False

1859

self._ignored_depth = 0

1860

1861

def _iter_namespaces(self, ns_stack, _reversed=reversed):

1862

for namespaces in _reversed(ns_stack):

1863

if namespaces: # almost no element declares new namespaces

1864

yield from namespaces

1865

1866

def _resolve_prefix_name(self, prefixed_name):

1867

prefix, name = prefixed_name.split(':', 1)

1868

for uri, p in self._iter_namespaces(self._ns_stack):

1869

if p == prefix:

1870

return f'{{{uri}}}{name}'

1871

raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

1872

1873

def _qname(self, qname, uri=None):

1874

if uri is None:

1875

uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)

else:

tag = qname

prefixes_seen = set()

1880

for u, prefix in self._iter_namespaces(self._declared_ns_stack):

1881

if u == uri and prefix not in prefixes_seen:

1882

return f'{prefix}:{tag}' if prefix else tag, tag, uri

1883

prefixes_seen.add(prefix)

1884

1885

# Not declared yet => add new declaration.

1886

if self._rewrite_prefixes:

1887

if uri in self._prefix_map:

1888

prefix = self._prefix_map[uri]

1889

else:

1890

prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'

1891

self._declared_ns_stack[-1].append((uri, prefix))

1892

return f'{prefix}:{tag}', tag, uri

1893

1894

if not uri and '' not in prefixes_seen:

1895

# No default namespace declared => no prefix needed.

1896

return tag, tag, uri

1897

1898

for u, prefix in self._iter_namespaces(self._ns_stack):

1899

if u == uri:

1900

self._declared_ns_stack[-1].append((uri, prefix))

1901

return f'{prefix}:{tag}' if prefix else tag, tag, uri

1902

1903

raise ValueError(f'Namespace "{uri}" is not declared in scope')

1904

1905

def data(self, data):

1906

if not self._ignored_depth:

1907

self._data.append(data)

1908

1909

def _flush(self, _join_text=''.join):

1910

data = _join_text(self._data)

1911

del self._data[:]

1912

if self._strip_text and not self._preserve_space[-1]:

1913

data = data.strip()

1914

if self._pending_start is not None:

1915

args, self._pending_start = self._pending_start, None

1916

qname_text = data if data and _looks_like_prefix_name(data) else None

1917

self._start(*args, qname_text)

1918

if qname_text is not None:

1919

return

1920

if data and self._root_seen:

1921

self._write(_escape_cdata_c14n(data))

1922

1923

def start_ns(self, prefix, uri):

1924

if self._ignored_depth:

1925

return

1926

# we may have to resolve qnames in text content

1927

if self._data:

1928

self._flush()

1929

self._ns_stack[-1].append((uri, prefix))

1930

1931

def start(self, tag, attrs):

1932

if self._exclude_tags is not None and (

1933

self._ignored_depth or tag in self._exclude_tags):

1934

self._ignored_depth += 1

return

if self._data:

self._flush()

new_namespaces = []

self._declared_ns_stack.append(new_namespaces)

1941

1942

if self._qname_aware_tags is not None and tag in self._qname_aware_tags:

1943

# Need to parse text first to see if it requires a prefix declaration.

1944

self._pending_start = (tag, attrs, new_namespaces)

1945

return

1946

self._start(tag, attrs, new_namespaces)

1947

1948

def _start(self, tag, attrs, new_namespaces, qname_text=None):

1949

if self._exclude_attrs is not None and attrs:

1950

attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

1951

1952

qnames = {tag, *attrs}

1953

resolved_names = {}

1954

1955

# Resolve prefixes in attribute and tag text.

1956

if qname_text is not None:

1957

qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)

1958

qnames.add(qname)

1959

if self._find_qname_aware_attrs is not None and attrs:

1960

qattrs = self._find_qname_aware_attrs(attrs)

1961

if qattrs:

1962

for attr_name in qattrs:

1963

value = attrs[attr_name]

1964

if _looks_like_prefix_name(value):

1965

qname = resolved_names[value] = self._resolve_prefix_name(value)

qnames.add(qname)

else:

qattrs = None

else:

qattrs = None

# Assign prefixes in lexicographical order of used URIs.

1973

parse_qname = self._qname

1974

parsed_qnames = {n: parse_qname(n) for n in sorted(

1975

qnames, key=lambda n: n.split('}', 1))}

1976

1977

# Write namespace declarations in prefix order ...

1978

if new_namespaces:

1979

attr_list = [

1980

('xmlns:' + prefix if prefix else 'xmlns', uri)

1981

for uri, prefix in new_namespaces

]

attr_list.sort()

else:

# almost always empty

1986

attr_list = []

1987

1988

# ... followed by attributes in URI+name order

1989

if attrs:

1990

for k, v in sorted(attrs.items()):

1991

if qattrs is not None and k in qattrs and v in resolved_names:

1992

v = parsed_qnames[resolved_names[v]][0]

1993

attr_qname, attr_name, uri = parsed_qnames[k]

1994

# No prefix for attributes in default ('') namespace.

1995

attr_list.append((attr_qname if uri else attr_name, v))

1996

1997

# Honour xml:space attributes.

1998

space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')

1999

self._preserve_space.append(

2000

space_behaviour == 'preserve' if space_behaviour

2001

else self._preserve_space[-1])

# Write the tag.

write = self._write

write('<' + parsed_qnames[tag][0])

2006

if attr_list:

2007

write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))

2008

write('>')

2009

2010

# Write the resolved qname text content.

2011

if qname_text is not None:

2012

write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

2013

2014

self._root_seen = True

2015

self._ns_stack.append([])

2016

2017

def end(self, tag):
    """Handle an element end event.

    Inside an ignored subtree only the ignore depth is decremented.
    Otherwise buffered text is flushed, the end tag is written, and the
    per-element entries are popped from ``self._preserve_space``,
    ``self._declared_ns_stack`` and ``self._ns_stack``.
    ``self._root_done`` becomes true once only one entry is left in
    ``self._preserve_space``.
    """
    if self._ignored_depth:
        self._ignored_depth -= 1
        return

    if self._data:
        self._flush()

    qualified_name = self._qname(tag)[0]
    self._write(f'</{qualified_name}>')

    space_stack = self._preserve_space
    space_stack.pop()
    self._root_done = len(space_stack) == 1

    self._declared_ns_stack.pop()
    self._ns_stack.pop()

2028

2029

def comment(self, text):
    """Write an XML comment, if comments are enabled.

    Nothing is written when ``self._with_comments`` is false or while
    inside an ignored subtree.  A comment after the root element is
    preceded by a newline, and one before the root element is followed by
    a newline.  Inside the root element, pending text is flushed first so
    the output order is preserved.

    The comment is serialized as ``<!--`` + escaped *text* + ``-->``.
    """
    if not self._with_comments:
        return
    if self._ignored_depth:
        return
    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        self._flush()
    # The comment delimiters and text must actually be emitted; writing an
    # empty string here would silently drop every comment.
    self._write(f'<!--{_escape_cdata_c14n(text)}-->')
    if not self._root_seen:
        self._write('\n')

2041

2042

def pi(self, target, data):
    """Write a processing instruction.

    Nothing is written while inside an ignored subtree.  A PI after the
    root element is preceded by a newline, and one before the root element
    is followed by a newline.  Inside the root element, pending text is
    flushed first.  When *data* is empty or None only the target is
    written; otherwise the escaped data follows the target after a single
    space.
    """
    if self._ignored_depth:
        return

    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        self._flush()

    if data:
        markup = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        markup = f'<?{target}?>'
    self._write(markup)

    if not self._root_seen:
        self._write('\n')

def _escape_cdata_c14n(text):

2056

# escape character data

2057

try:

2058

# it's worth avoiding do-nothing calls for strings that are

2059

# shorter than 500 character, or so. assume that's, by far,

2060

# the most common case in most applications.

2061

if '&' in text:

2062

text = text.replace('&', '&')

2063

if '<' in text:

2064

text = text.replace('<', '<')

2065

if '>' in text:

2066

text = text.replace('>', '>')

2067

if '\r' in text:

2068

text = text.replace('\r', '')

2069

return text

2070

except (TypeError, AttributeError):

2071

_raise_serialization_error(text)

2072

2073

2074

def _escape_attrib_c14n(text):

2075

# escape attribute value

2076

try:

2077

if '&' in text:

2078

text = text.replace('&', '&')

2079

if '<' in text:

2080

text = text.replace('<', '<')

2081

if '"' in text:

2082

text = text.replace('"', '"')

2083

if '\t' in text:

2084

text = text.replace('\t', '	')

2085

if '\n' in text:

2086

text = text.replace('\n', '
')

2087

if '\r' in text:

2088

text = text.replace('\r', '')

2089

return text

2090

except (TypeError, AttributeError):

2091

_raise_serialization_error(text)

2092

2093

2094

# --------------------------------------------------------------------

2095

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

2096

# Import the C accelerators

2097

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

2098

# Element is going to be shadowed by the C implementation. We need to keep

2099

# the Python version of it accessible for some "creative" uses by external code

2100

# (see tests)

2101

_Element_Py = Element

2102

Stefan Behnel

2019-05-01 21:20:38 +0200

[diff] [blame]

2103

# Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

2104

from _elementtree import *

Stefan Behnel

2019-05-01 21:20:38 +0200

[diff] [blame]

2105

from _elementtree import _set_factories

Eli Bendersky

c4e98a6

2013-05-19 09:24:43 -0700

[diff] [blame]

2106

except ImportError:

2107

pass

Stefan Behnel