Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attrib' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

# Public API of this module, one exported name per line.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "indent",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string of the ElementTree toolkit implemented here.
VERSION = "1.3.0"

94

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

95

import sys

96

import re

97

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

98

import io

Serhiy Storchaka

2015-12-07 02:31:11 +0200

[diff] [blame]

99

import collections

Serhiy Storchaka

2e576f5

2017-04-24 09:05:00 +0300

[diff] [blame]

100

import collections.abc

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

101

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

103

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

104

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

105

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

106

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances carry two extra
    attributes (set by whoever raises the error):
        'code' - the specific exception code
        'position' - the line and column of the error

    """

116

117

# --------------------------------------------------------------------

118

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

119

120

def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is purely structural: any object exposing a 'tag'
    attribute qualifies.
    """
    # Unique sentinel so that a 'tag' attribute set to None still counts.
    missing = object()
    return getattr(element, 'tag', missing) is not missing

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

123

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

124

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

125

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements. That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name. *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag. This is either a string or the value None. Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        # The shared ``{}`` default is safe: it is only read, never mutated,
        # because the attributes are copied into a fresh dict below.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        # Keyword attributes override same-named entries of *attrib*.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an integer or a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For a slice *index*, *element* is an iterable of elements; otherwise
        it is a single element. TypeError is raised if anything that is not
        an Element would be stored.

        """
        if isinstance(index, slice):
            # Materialize first: validating a one-shot iterator (e.g. a
            # generator) would exhaust it and leave nothing to assign.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index* (an integer or a slice)."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements. All items are
        validated before any is added, so a TypeError leaves this element
        unchanged.

        """
        # Snapshot the iterable so that validation does not consume a
        # one-shot iterator before its items can be appended.
        new_children = list(elements)
        for element in new_children:
            self._assert_is_element(element)
        self._children.extend(new_children)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents. To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        # assert iselement(element)
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found. Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently. *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently. *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is a view of the attribute
        dictionary's keys, not a list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is a view of the attribute
        dictionary's items, not a list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included. To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (e.g. the factory
        # functions used as tags for special nodes) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

417

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

418

419

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments (they take precedence
    over same-named entries in *attrib*).

    Return the newly created element.

    """
    # Merging builds a fresh dict, so neither *attrib* nor the shared
    # default is ever mutated.
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child

435

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

436

437

def Comment(text=None):
    """Comment element factory.

    Create a special element, tagged with this factory function itself,
    that is meant to be serialized as an XML comment.

    *text* is a string containing the comment string.

    """
    comment = Element(Comment)
    comment.text = text
    return comment

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

450

451

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Create a special element, tagged with this factory function itself,
    that is meant to be serialized as an XML processing instruction.

    *target* is a string containing the processing instruction target,
    *text* is a string containing the processing instruction contents,
    if any. The element's text is *target*, followed by a space and *text*
    when *text* is non-empty.

    """
    contents = target
    if text:
        contents = contents + " " + text
    pi = Element(ProcessingInstruction)
    pi.text = contents
    return pi

# Short alias for the factory above.
PI = ProcessingInstruction

468

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

469

470

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing are based on the wrapped text, so a QName
    compares equal to another QName or plain value with the same text.

    """
    def __init__(self, text_or_uri, tag=None):
        # A falsy *tag* (None or "") leaves the first argument untouched.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps *other* when it is a QName and otherwise
    # compares the wrapped text against *other* directly.

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

514

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

515

# --------------------------------------------------------------------

516

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

517

518

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

519

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

520

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

521

This class also provides support for serialization to and from

522

standard XML.

523

524

*element* is an optional root element node,

525

*file* is an optional file handle or file name of an XML file whose

526

contents will be used to initialize the tree with.

527

528

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

529

def __init__(self, element=None, file=None):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

530

# assert element is None or iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

531

self._root = element # first node

if file:

self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

535

def getroot(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

536

"""Return root element of this tree."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

537

return self._root

538

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

539

def _setroot(self, element):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

540

"""Replace root element of this tree.

541

542

This will discard the current contents of the tree and replace it

543

with the given element. Use with care!

544

545

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

546

# assert iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

547

self._root = element

548

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

549

def parse(self, source, parser=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

550

"""Load external XML document into element tree.

551

552

*source* is a file name or file object, *parser* is an optional parser

553

instance that defaults to XMLParser.

554

555

ParseError is raised if the parser fails to parse the document.

556

557

Returns the root element of the given source document.

558

559

"""

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

560

close_source = False

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

561

if not hasattr(source, "read"):

562

source = open(source, "rb")

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

563

close_source = True

564

try:

Eli Bendersky

a369923

2013-05-19 18:47:23 -0700

[diff] [blame]

565

if parser is None:

566

# If no parser was specified, create a default XMLParser

567

parser = XMLParser()

568

if hasattr(parser, '_parse_whole'):

569

# The default XMLParser, when it comes from an accelerator,

570

# can define an internal _parse_whole API for efficiency.

571

# It can be used to parse the whole source without feeding

572

# it with chunks.

573

self._root = parser._parse_whole(source)

574

return self._root

575

while True:

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

576

data = source.read(65536)

if not data:

break

parser.feed(data)

self._root = parser.close()

return self._root

finally:

if close_source:

source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

585

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

586

def iter(self, tag=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

587

"""Create and return tree iterator for the root element.

588

589

The iterator loops over all elements in this tree, in document order.

590

591

*tag* is a string with the tag name to iterate over

592

(default is to return all elements).

593

594

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

595

# assert self._root is not None

596

return self._root.iter(tag)

597

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

598

def find(self, path, namespaces=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

599

"""Find first matching element by tag name or path.

600

601

Same as getroot().find(path), which is Element.find()

602

603

*path* is a string having either an element tag or an XPath,

604

*namespaces* is an optional mapping from namespace prefix to full name.

605

606

Return the first matching element, or None if no element was found.

607

608

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

609

# assert self._root is not None

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

610

if path[:1] == "/":

611

path = "." + path

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

612

warnings.warn(

613

"This search is broken in 1.3 and earlier, and will be "

614

"fixed in a future version. If you rely on the current "

615

"behaviour, change it to %r" % path,

616

FutureWarning, stacklevel=2

617

)

618

return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

619

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

620

def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element by tag name or path.

    Delegates to the root element: same as getroot().findtext(path),
    which is Element.findtext().

    *path* is a string holding either an element tag or an XPath,
    *default* is handed to the root element's findtext() as the fallback
    value, *namespaces* is an optional mapping from namespace prefix to
    full name.

    Return whatever the root element's findtext() returns.

    """
    # self._root is expected to be set; no check is made here.
    if path[:1] == "/":
        # A path with a leading slash is searched relative to the root
        # element instead, and the caller is told which path was used.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

641

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

642

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the root element: same as getroot().findall(path),
    which is Element.findall().

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return a list containing all matching elements in document order.

    """
    # self._root is expected to be set; no check is made here.
    if path[:1] == "/":
        # A path with a leading slash is searched relative to the root
        # element instead, and the caller is told which path was used.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findall(path, namespaces)

663

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

664

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path, lazily.

    Delegates to the root element: same as getroot().iterfind(path),
    which is Element.iterfind().

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return an iterable yielding all matching elements in document order.

    """
    # self._root is expected to be set; no check is made here.
    if path[:1] == "/":
        # A path with a leading slash is searched relative to the root
        # element instead, and the caller is told which path was used.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

685

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

686

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write the element tree to a file.

    Arguments:
      *file_or_filename* -- file name, or a file object opened for writing

      *encoding* -- the output encoding.  When empty, "utf-8" is used
                    for the "c14n" method and "us-ascii" otherwise

      *xml_declaration* -- bool indicating whether an XML declaration
                           should be written (only done for the "xml"
                           method).  If None, a declaration is written
                           when the encoding is NOT one of US-ASCII,
                           UTF-8 or "unicode"

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content.  If True
                                (default) they are emitted as a single
                                self-closed tag, otherwise as a pair of
                                start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as emit:
        if method == "xml":
            if xml_declaration is None:
                # Not requested either way: declare only encodings that
                # are not one of these three.
                declare = enc_lower not in ("utf-8", "us-ascii", "unicode")
            else:
                declare = bool(xml_declaration)
            if declare:
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                else:
                    declared_encoding = encoding
                emit("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
        if method == "text":
            # The text serializer takes neither qnames nor namespaces.
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serializer = _serialize[method]
            serializer(emit, self._root, qnames, namespaces,
                       short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

743

744

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; prefer write(file, method="c14n").
    """
    canonical_method = "c14n"
    return self.write(file, method=canonical_method)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

747

748

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

749

# serialization support

750

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

751

@contextlib.contextmanager

752

def _get_writer(file_or_filename, encoding):

Ezio Melotti

b5bc353

2013-08-17 16:11:40 +0300

[diff] [blame]

753

# returns text write method and release all resources after using

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

754

try:

755

write = file_or_filename.write

756

except AttributeError:

757

# file_or_filename is a file name

758

if encoding == "unicode":

759

file = open(file_or_filename, "w")

760

else:

761

file = open(file_or_filename, "w", encoding=encoding,

762

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

767

# encoding determines if it is a text or binary writer

768

if encoding == "unicode":

769

# use a text writer as is

770

yield write

771

else:

772

# wrap a binary writer with TextIOWrapper

773

with contextlib.ExitStack() as stack:

774

if isinstance(file_or_filename, io.BufferedIOBase):

775

file = file_or_filename

776

elif isinstance(file_or_filename, io.RawIOBase):

777

file = io.BufferedWriter(file_or_filename)

778

# Keep the original file open when the BufferedWriter is

779

# destroyed

780

stack.callback(file.detach)

781

else:

782

# This is to handle passed objects that aren't in the

783

# IOBase hierarchy, but just have a write method

784

file = io.BufferedIOBase()

785

file.writable = lambda: True

786

file.write = write

787

try:

788

# TextIOWrapper uses this methods to determine

789

# if BOM (for UTF-16, etc) should be added

790

file.seekable = file_or_filename.seekable

791

file.tell = file_or_filename.tell

792

except AttributeError:

793

pass

794

file = io.TextIOWrapper(file,

795

encoding=encoding,

796

errors="xmlcharrefreplace",

797

newline="\n")

798

# Keep the original file open when the TextIOWrapper is

799

# destroyed

800

stack.callback(file.detach)

801

yield file.write

802

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

803

def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks *elem* and everything below it, looking at tags, attribute
    names, and attribute values / text given as QName objects.

    Returns a (qnames, namespaces) pair: *qnames* maps each name to its
    serialized "prefix:local" form (or the bare name), *namespaces* maps
    namespace URIs to prefixes.

    Raises ValueError when *default_namespace* is set and a
    non-qualified name is encountered.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, tag = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    # Invent a prefix for an unregistered namespace.
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, tag)
            else:
                qnames[qname] = tag  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def note(name):
        # Register *name* unless it has been seen before.
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            note(tag.text)
        elif isinstance(tag, str):
            note(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            note(key)
            if isinstance(value, QName):
                note(value.text)
        text = node.text
        if isinstance(text, QName):
            note(text.text)
    return qnames, namespaces

863

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

864

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is called with each text fragment.  *qnames* maps tags and
    attribute names to their serialized form.  *namespaces* maps URIs
    to prefixes; the xmlns declarations are emitted on this element
    only, and None is passed down to children.  *short_empty_elements*
    selects "<tag />" over "<tag></tag>" for elements without text or
    children.  The element's tail text is written after it.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # The comment delimiters must be part of the format string;
        # with an empty format, "%" raises TypeError for any text.
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: only the content is written.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

913

914

# Lower-case tag names for which the "html" serializer writes no end tag.
# Built directly as a set: the builtin always exists, so the previous
# tuple-then-convert step with its NameError fallback was dead code.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

922

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is called with each text fragment.  *qnames* maps tags and
    attribute names to their serialized form.  *namespaces* maps URIs
    to prefixes; the xmlns declarations are emitted on this element
    only, and None is passed down to children.  Text inside "script"
    and "style" elements is written unescaped, and tags listed in
    HTML_EMPTY get no end tag.  The element's tail text is written
    after it.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # The comment delimiters must be part of the format string;
        # with an empty format, "%" raises TypeError for any text.
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: only the content is written.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

971

972

def _serialize_text(write, elem):

973

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

# Output method name -> serializer function.
# "c14n" is optional: that method is imported at the end of the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

985

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

986

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

987

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global: any existing mapping for either the given
    prefix or the namespace URI is removed before the new one is added.

    *prefix* is the namespace prefix, *uri* is a namespace uri.  Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting entries first so the registry is not
    # modified while it is being iterated.
    stale = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

1005

1006

_namespace_map = {

1007

# "well-known" namespace prefixes

1008

"http://www.w3.org/XML/1998/namespace": "xml",

1009

"http://www.w3.org/1999/xhtml": "html",

1010

"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",

1011

"http://schemas.xmlsoap.org/wsdl/": "wsdl",

1012

# xml schema

1013

"http://www.w3.org/2001/XMLSchema": "xs",

1014

"http://www.w3.org/2001/XMLSchema-instance": "xsi",

1015

# dublin core

1016

"http://purl.org/dc/elements/1.1/": "dc",

1017

}

Florent Xicluna

1639505

2012-02-16 23:28:35 +0100

[diff] [blame]

1018

# For tests and troubleshooting: expose the registry on the function.
setattr(register_namespace, "_namespace_map", _namespace_map)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1020

1021

def _raise_serialization_error(text):

1022

raise TypeError(

1023

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1024

)

1025

1026

def _escape_cdata(text):

1027

# escape character data

1028

try:

1029

# it's worth avoiding do-nothing calls for strings that are

Mike

53f7a7c

2017-12-14 14:04:53 +0300

[diff] [blame]

1030

# shorter than 500 characters, or so. assume that's, by far,

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1031

# the most common case in most applications.

1032

if "&" in text:

1033

text = text.replace("&", "&")

1034

if "<" in text:

1035

text = text.replace("<", "<")

1036

if ">" in text:

1037

text = text.replace(">", ">")

1038

return text

1039

except (TypeError, AttributeError):

1040

_raise_serialization_error(text)

1041

1042

def _escape_attrib(text):

1043

# escape attribute value

1044

try:

1045

if "&" in text:

1046

text = text.replace("&", "&")

1047

if "<" in text:

1048

text = text.replace("<", "<")

1049

if ">" in text:

1050

text = text.replace(">", ">")

1051

if "\"" in text:

1052

text = text.replace("\"", """)

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1053

# The following business with carriage returns is to satisfy

Raymond Hettinger

11fa3ff

2016-09-11 23:23:24 -0700

[diff] [blame]

1054

# Section 2.11 of the XML specification, stating that

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1055

# CR or CR LN should be replaced with just LN

1056

# http://www.w3.org/TR/REC-xml/#sec-line-ends

1057

if "\r\n" in text:

1058

text = text.replace("\r\n", "\n")

1059

if "\r" in text:

1060

text = text.replace("\r", "\n")

1061

#The following four lines are issue 17582

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1062

if "\n" in text:

1063

text = text.replace("\n", "
")

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1064

if "\t" in text:

1065

text = text.replace("\t", "	")

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1066

return text

1067

except (TypeError, AttributeError):

1068

_raise_serialization_error(text)

1069

1070

def _escape_attrib_html(text):

1071

# escape attribute value

1072

try:

1073

if "&" in text:

1074

text = text.replace("&", "&")

1075

if ">" in text:

1076

text = text.replace(">", ">")

1077

if "\"" in text:

1078

text = text.replace("\"", """)

1079

return text

1080

except (TypeError, AttributeError):

1081

_raise_serialization_error(text)

1082

1083

# --------------------------------------------------------------------

1084

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1085

def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate a string representation of an XML element.

    All subelements are included.  If *encoding* is "unicode", a string
    is returned; otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output
    method which can be one of "xml" (default), "html", "text" or
    "c14n", *default_namespace* sets the default XML namespace (for
    "xmlns").  All options are forwarded to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # The kind of in-memory buffer decides the type of the result.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding,
               xml_declaration=xml_declaration,
               default_namespace=default_namespace,
               method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1108

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1109

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1110

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1111

def __init__(self, lst):

1112

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1113

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1126

def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate the serialized form of an XML element as a list of chunks.

    Takes the same arguments as tostring() and forwards them to
    ElementTree.write(); the output is collected through a
    _ListDataStream, and the list of written chunks is returned.
    """
    chunks = []
    sink = _ListDataStream(chunks)
    tree = ElementTree(element)
    tree.write(sink, encoding,
               xml_declaration=xml_declaration,
               default_namespace=default_namespace,
               method=method,
               short_empty_elements=short_empty_elements)
    return chunks

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1137

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1138

1139

def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact
    output format is implementation dependent.  In this version, it's
    written as an ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    # Finish the output with a newline unless the root's tail already
    # ends with one.
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")

1156

Stefan Behnel

b5d3cee

2019-08-23 16:44:25 +0200

[diff] [blame]

1157

1158

def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.  Text and tail strings
    that contain non-whitespace characters are left untouched.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        # Nothing to indent without children.
        return

    # Reduce the memory consumption by reusing indentation strings.
    # Entry i is the indentation for depth i below the starting element;
    # entry 0 already includes the initial *level*.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)

1206

1207

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1208

# --------------------------------------------------------------------

1209

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1210

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1211

1212

def parse(source, parser=None):
    """Parse an XML document into an element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.
    Both are handed to ElementTree.parse() on a fresh ElementTree.

    Return the ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1224

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1225

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1226

def iterparse(source, events=None, parser=None):
    """Incrementally parse an XML document, reporting events on the way.

    The events reported depend on *events*.  The supported events are
    the strings "start", "end", "start-ns" and "end-ns" (the "ns" events
    are used to get detailed namespace information).  If *events* is
    omitted, only "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its "root"
    attribute is None until the input is exhausted and is then set to
    the root returned by the parser.  A file opened here from a file
    name is closed when the iteration finishes.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def pump():
        # Nothing here runs before the first next() call, so the names
        # source, close_source and it assigned further down are bound
        # by the time they are read.
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                chunk = source.read(16 * 1024)
                if not chunk:
                    break
                pullparser.feed(chunk)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = pump().__next__

    it = IterParseIterator()
    it.root = None
    del pump, IterParseIterator

    # Anything without a read attribute is taken to be a file name;
    # that file is opened here and therefore closed here as well.
    close_source = not hasattr(source, "read")
    if close_source:
        source = open(source, "rb")

    return it

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1272

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1273

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1274

class XMLPullParser:
    """Parser that is fed data in pieces and queues the resulting events.

    Events are collected in an internal queue and handed out through
    read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        if not _parser:
            _parser = XMLParser(target=TreeBuilder())
        self._parser = _parser
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            # Queue the error; read_events() raises it when it is reached.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # Marks the end of the stream for later feed() calls.
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        it is reached.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1325

1326

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1327

def XML(text, parser=None):
    """Parse an XML document given as a string constant.

    Handy for embedding "XML literals" in Python code.

    *text* is a string holding the XML data; *parser* is an optional parser
    instance, with the standard XMLParser used when none is supplied.

    Returns whatever the parser's close() returns (an Element instance for
    the standard parser).

    """
    # A falsy *parser* is replaced by a default XMLParser over a TreeBuilder.
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()

1342

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1343

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1344

def XMLID(text, parser=None):
    """Parse an XML document from a string constant and collect its IDs.

    *text* is a string holding the XML data; *parser* is an optional parser
    instance, with the standard XMLParser used when none is supplied.

    Returns an (Element, dict) tuple in which the dict maps each non-empty
    "id" attribute value to the element carrying it.  When several elements
    share an id, the one visited last wins.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    id_map = {}
    for node in root.iter():
        node_id = node.get("id")
        if not node_id:
            # Missing or empty id attributes are not indexed.
            continue
        id_map[node_id] = node
    return root, id_map

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1365

# Parse XML document from string constant. Alias for XML().

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1366

# Bound to the very same function object as XML, under a second public name.
fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1367

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1368

def fromstringlist(sequence, parser=None):
    """Parse an XML document from a sequence of string fragments.

    *sequence* is a list or other sequence of fragments that are fed to the
    parser one after another; *parser* is an optional parser instance, with
    the standard XMLParser used when none is supplied.

    Returns whatever the parser's close() returns (an Element instance for
    the standard parser).

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()

1382

1383

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1384

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1385

1386

class TreeBuilder:
    """Generic builder that assembles an element structure from events.

    A sequence of start(), data() and end() calls is turned into a
    well-formed element structure.  The class can be driven by a custom XML
    parser, or by a parser for some other XML-like format.

    *element_factory* is an optional factory that is called as
    ``factory(tag, attrs)`` to create each new element.

    *comment_factory* is a factory used to create comments instead of the
    standard one.  Comments are only inserted into the tree when
    *insert_comments* is true (it is false by default).

    *pi_factory* is a factory used to create processing instructions instead
    of the standard one.  Processing instructions are only inserted into the
    tree when *insert_pis* is true (it is false by default).
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []    # text chunks not yet attached to an element
        self._elem = []    # stack of currently open elements
        self._last = None  # element that receives the next text/tail
        self._root = None  # first top-level element that was opened
        self._tail = None  # true once the last event closed an element
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = (
            Element if element_factory is None else element_factory)

    def close(self):
        """Check that the document is complete and return its root element."""
        assert not self._elem, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach buffered text to the last element, as its tail when an
        # end tag (or inserted comment/PI) came last, otherwise as its text.
        if not self._data:
            return
        receiver = self._last
        if receiver is not None:
            joined = "".join(self._data)
            if self._tail:
                assert receiver.tail is None, "internal error (tail)"
                receiver.tail = joined
            else:
                assert receiver.text is None, "internal error (text)"
                receiver.text = joined
        # Text seen before any element exists is dropped here.
        self._data = []

    def data(self, data):
        """Buffer *data* as text belonging to the current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open a new element and return it.

        *tag* is the element name, *attrs* is a dict containing the element
        attributes; both are passed on to the element factory.

        """
        self._flush()
        opened = self._factory(tag, attrs)
        self._last = opened
        open_stack = self._elem
        if open_stack:
            open_stack[-1].append(opened)
        elif self._root is None:
            # Only the first top-level element becomes the root.
            self._root = opened
        open_stack.append(opened)
        self._tail = 0
        return opened

    def end(self, tag):
        """Close the current element and return it.

        *tag* is the element name and must match the element being closed.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag, \
            "end tag mismatch (expected %s, got %s)" % (closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment through the comment factory and return it.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction through the pi factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a stand-alone node; it only becomes part of the tree (and
        # the receiver of following tail text) when *insert* is true.
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        self._tail = 1
        return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1506

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1507

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1508

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Only the callbacks that the target actually defines (start, end,
    start_ns, end_ns, data, comment, pi) are registered with expat.

    """

    def __init__(self, *, target=None, encoding=None):
        # Prefer the packaged expat module, fall back to the raw pyexpat
        # extension, and give up with ImportError if neither is available.
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is handed to expat as second argument; _fixname() relies on
        # that character showing up in namespaced names.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        # _default() deals with undefined entities and doctype declarations.
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        # data, comment and pi events go to the target without a wrapper.
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        # None outside of a doctype declaration, otherwise the list of
        # doctype tokens collected so far (see _default()).
        self._doctype = None
        # Replacement text for otherwise undefined entities, keyed by name.
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each requested event replaces the matching expat handler with a
        # closure that appends an (event, value) pair to the queue.  The
        # default arguments bind the per-iteration values at definition time.
        # An unrecognised event name raises ValueError.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    # Without target support the raw pair is reported, with
                    # None replaced by ''.
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its code and
        # exposing (lineno, offset) as the ``position`` attribute.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        # Results are memoised in self._names.  A name containing "}" gets a
        # leading "{" so that it reads "{uri}local".
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # Forward to the target, substituting '' for a missing prefix or uri.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        # Forward to the target, substituting '' for a missing prefix.
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            # Even indexes hold names, odd indexes the matching values.
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # Handler for expat's EndElementHandler: normalise the name and
        # forward to the target.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Default handler: resolves entity references through self.entity
        # and tracks doctype declarations token by token.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                # A target without data() ignores entity references.
                return
            try:
                # text looks like "&name;" - strip the delimiters.
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                # End of the declaration before a usable form was seen.
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # Tokens so far: name, "PUBLIC"/"SYSTEM", then the ids.
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        # Drops the first and last character (presumably
                        # the surrounding quotes - confirm).
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    # Not complete yet (or not a recognised form): wait for
                    # further tokens.
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    # A doctype() defined on the parser itself (e.g. by a
                    # subclass) is not called; only a warning is issued.
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored. "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser.

        An expat error is re-raised as ParseError via _raiseerror().
        """
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns the result of the target's close() method, or None when the
        target does not define one.  An expat error is re-raised as
        ParseError via _raiseerror().
        """
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            # (runs whether or not the target has a close() method; the
            # parser and target attributes no longer exist afterwards)
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1733

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1734

Stefan Behnel

e1d5dd6

2019-05-01 22:34:13 +0200

[diff] [blame]

1735

# --------------------------------------------------------------------

1736

# C14N 2.0

1737

1738

def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    The input is taken from *xml_data* (an XML string) or, when that is not
    given, from *from_file* (a file path or file-like object).  At least one
    of the two must be provided, otherwise ValueError is raised.

    When *out* is provided, it must be a file or file-like object whose
    ``.write()`` method receives the serialised canonical XML as text (not
    bytes); to write to a file, open it in text mode with encoding "utf-8".
    In that case the function returns None.  Without *out*, the output is
    returned as a text string.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller supplied no sink.
    buffer = None
    if out is None:
        buffer = out = io.StringIO()

    writer_target = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=writer_target)

    if xml_data is None:
        # The guard above guarantees that from_file is set here.
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()

1766

1767

1768

# Matcher for text shaped like "prefix:name": word characters on both sides
# of a single colon.  Returns a match object or None.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match

1769

1770

1771

class C14NWriterTarget:

1772

"""

1773

Canonicalization writer target for the XMLParser.

1774

1775

Serialises parse events to XML C14N 2.0.

1776

1777

The *write* function is used for writing out the resulting data stream

1778

as text (not bytes). To write to a file, open it in text mode with encoding

1779

"utf-8" and pass its ``.write`` method.

1780

1781

Configuration options:

1782

1783

- *with_comments*: set to true to include comments

1784

- *strip_text*: set to true to strip whitespace before and after text content

1785

- *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"

1786

- *qname_aware_tags*: a set of qname aware tag names in which prefixes

1787

should be replaced in text content

1788

- *qname_aware_attrs*: a set of qname aware attribute names in which prefixes

1789

should be replaced in text content

1790

- *exclude_attrs*: a set of attribute names that should not be serialised

1791

- *exclude_tags*: a set of tag names that should not be serialised

1792

"""

1793

def __init__(self, write, *,

1794

with_comments=False, strip_text=False, rewrite_prefixes=False,

1795

qname_aware_tags=None, qname_aware_attrs=None,

1796

exclude_attrs=None, exclude_tags=None):

1797

self._write = write

1798

self._data = []

1799

self._with_comments = with_comments

1800

self._strip_text = strip_text

1801

self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None

1802

self._exclude_tags = set(exclude_tags) if exclude_tags else None

1803

1804

self._rewrite_prefixes = rewrite_prefixes

1805

if qname_aware_tags:

1806

self._qname_aware_tags = set(qname_aware_tags)

1807

else:

1808

self._qname_aware_tags = None

1809

if qname_aware_attrs:

1810

self._find_qname_aware_attrs = set(qname_aware_attrs).intersection

1811

else:

1812

self._find_qname_aware_attrs = None

1813

1814

# Stack with globally and newly declared namespaces as (uri, prefix) pairs.

1815

self._declared_ns_stack = [[

1816

("http://www.w3.org/XML/1998/namespace", "xml"),

1817

]]

1818

# Stack with user declared namespace prefixes as (uri, prefix) pairs.

1819

self._ns_stack = []

1820

if not rewrite_prefixes:

1821

self._ns_stack.append(list(_namespace_map.items()))

1822

self._ns_stack.append([])

1823

self._prefix_map = {}

1824

self._preserve_space = [False]

1825

self._pending_start = None

1826

self._root_seen = False

1827

self._root_done = False

1828

self._ignored_depth = 0

1829

1830

def _iter_namespaces(self, ns_stack, _reversed=reversed):

1831

for namespaces in _reversed(ns_stack):

1832

if namespaces: # almost no element declares new namespaces

1833

yield from namespaces

1834

1835

def _resolve_prefix_name(self, prefixed_name):

1836

prefix, name = prefixed_name.split(':', 1)

1837

for uri, p in self._iter_namespaces(self._ns_stack):

1838

if p == prefix:

1839

return f'{{{uri}}}{name}'

1840

raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

1841

1842

def _qname(self, qname, uri=None):

1843

if uri is None:

1844

uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)

else:

tag = qname

prefixes_seen = set()

1849

for u, prefix in self._iter_namespaces(self._declared_ns_stack):

1850

if u == uri and prefix not in prefixes_seen:

1851

return f'{prefix}:{tag}' if prefix else tag, tag, uri

1852

prefixes_seen.add(prefix)

1853

1854

# Not declared yet => add new declaration.

1855

if self._rewrite_prefixes:

1856

if uri in self._prefix_map:

1857

prefix = self._prefix_map[uri]

1858

else:

1859

prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'

1860

self._declared_ns_stack[-1].append((uri, prefix))

1861

return f'{prefix}:{tag}', tag, uri

1862

1863

if not uri and '' not in prefixes_seen:

1864

# No default namespace declared => no prefix needed.

1865

return tag, tag, uri

1866

1867

for u, prefix in self._iter_namespaces(self._ns_stack):

1868

if u == uri:

1869

self._declared_ns_stack[-1].append((uri, prefix))

1870

return f'{prefix}:{tag}' if prefix else tag, tag, uri

1871

1872

raise ValueError(f'Namespace "{uri}" is not declared in scope')

1873

1874

def data(self, data):

1875

if not self._ignored_depth:

1876

self._data.append(data)

1877

1878

def _flush(self, _join_text=''.join):

1879

data = _join_text(self._data)

1880

del self._data[:]

1881

if self._strip_text and not self._preserve_space[-1]:

1882

data = data.strip()

1883

if self._pending_start is not None:

1884

args, self._pending_start = self._pending_start, None

1885

qname_text = data if data and _looks_like_prefix_name(data) else None

1886

self._start(*args, qname_text)

1887

if qname_text is not None:

1888

return

1889

if data and self._root_seen:

1890

self._write(_escape_cdata_c14n(data))

1891

1892

def start_ns(self, prefix, uri):

1893

if self._ignored_depth:

1894

return

1895

# we may have to resolve qnames in text content

1896

if self._data:

1897

self._flush()

1898

self._ns_stack[-1].append((uri, prefix))

1899

1900

def start(self, tag, attrs):

1901

if self._exclude_tags is not None and (

1902

self._ignored_depth or tag in self._exclude_tags):

1903

self._ignored_depth += 1

return

if self._data:

self._flush()

new_namespaces = []

self._declared_ns_stack.append(new_namespaces)

1910

1911

if self._qname_aware_tags is not None and tag in self._qname_aware_tags:

1912

# Need to parse text first to see if it requires a prefix declaration.

1913

self._pending_start = (tag, attrs, new_namespaces)

1914

return

1915

self._start(tag, attrs, new_namespaces)

1916

1917

def _start(self, tag, attrs, new_namespaces, qname_text=None):

1918

if self._exclude_attrs is not None and attrs:

1919

attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

1920

1921

qnames = {tag, *attrs}

1922

resolved_names = {}

1923

1924

# Resolve prefixes in attribute and tag text.

1925

if qname_text is not None:

1926

qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)

1927

qnames.add(qname)

1928

if self._find_qname_aware_attrs is not None and attrs:

1929

qattrs = self._find_qname_aware_attrs(attrs)

1930

if qattrs:

1931

for attr_name in qattrs:

1932

value = attrs[attr_name]

1933

if _looks_like_prefix_name(value):

1934

qname = resolved_names[value] = self._resolve_prefix_name(value)

qnames.add(qname)

else:

qattrs = None

else:

qattrs = None

# Assign prefixes in lexicographical order of used URIs.

1942

parse_qname = self._qname

1943

parsed_qnames = {n: parse_qname(n) for n in sorted(

1944

qnames, key=lambda n: n.split('}', 1))}

1945

1946

# Write namespace declarations in prefix order ...

1947

if new_namespaces:

1948

attr_list = [

1949

('xmlns:' + prefix if prefix else 'xmlns', uri)

1950

for uri, prefix in new_namespaces

]

attr_list.sort()

else:

# almost always empty

1955

attr_list = []

1956

1957

# ... followed by attributes in URI+name order

1958

if attrs:

1959

for k, v in sorted(attrs.items()):

1960

if qattrs is not None and k in qattrs and v in resolved_names:

1961

v = parsed_qnames[resolved_names[v]][0]

1962

attr_qname, attr_name, uri = parsed_qnames[k]

1963

# No prefix for attributes in default ('') namespace.

1964

attr_list.append((attr_qname if uri else attr_name, v))

1965

1966

# Honour xml:space attributes.

1967

space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')

1968

self._preserve_space.append(

1969

space_behaviour == 'preserve' if space_behaviour

1970

else self._preserve_space[-1])

# Write the tag.

write = self._write

write('<' + parsed_qnames[tag][0])

1975

if attr_list:

1976

write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))

1977

write('>')

1978

1979

# Write the resolved qname text content.

1980

if qname_text is not None:

1981

write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

1982

1983

self._root_seen = True

1984

self._ns_stack.append([])

1985

1986

def end(self, tag):

1987

if self._ignored_depth:

1988

self._ignored_depth -= 1

return

if self._data:

self._flush()

self._write(f'</{self._qname(tag)[0]}>')

1993

self._preserve_space.pop()

1994

self._root_done = len(self._preserve_space) == 1

1995

self._declared_ns_stack.pop()

1996

self._ns_stack.pop()

1997

1998

def comment(self, text):
    """Write an XML comment containing *text*.

    Nothing is written when ``self._with_comments`` is false or while
    ``self._ignored_depth`` is non-zero.  Otherwise the comment is
    emitted as ``<!--text-->`` with *text* passed through
    ``_escape_cdata_c14n``.

    A newline is written before the comment when ``self._root_done`` is
    set, and after it when ``self._root_seen`` is not yet set.  When
    neither applies and ``self._data`` holds pending data, that data is
    flushed first.
    """
    if not self._with_comments or self._ignored_depth:
        return
    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        self._flush()
    # The comment delimiters and the text itself must be emitted; writing
    # an empty string here would silently drop every comment.
    self._write(f'<!--{_escape_cdata_c14n(text)}-->')
    if not self._root_seen:
        self._write('\n')

2010

2011

def pi(self, target, data):
    """Write a processing instruction ``<?target data?>``.

    Nothing is written while ``self._ignored_depth`` is non-zero.  When
    *data* is falsy the instruction is written as ``<?target?>``;
    otherwise *data* is passed through ``_escape_cdata_c14n`` first.

    A newline is written before the instruction when ``self._root_done``
    is set, and after it when ``self._root_seen`` is not yet set.  When
    neither applies and ``self._data`` holds pending data, that data is
    flushed first.
    """
    if self._ignored_depth:
        return

    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        self._flush()

    if data:
        markup = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        markup = f'<?{target}?>'
    self._write(markup)

    if not self._root_seen:
        self._write('\n')

def _escape_cdata_c14n(text):

2025

# escape character data

2026

try:

2027

# it's worth avoiding do-nothing calls for strings that are

2028

# shorter than 500 character, or so. assume that's, by far,

2029

# the most common case in most applications.

2030

if '&' in text:

2031

text = text.replace('&', '&')

2032

if '<' in text:

2033

text = text.replace('<', '<')

2034

if '>' in text:

2035

text = text.replace('>', '>')

2036

if '\r' in text:

2037

text = text.replace('\r', '')

2038

return text

2039

except (TypeError, AttributeError):

2040

_raise_serialization_error(text)

2041

2042

2043

def _escape_attrib_c14n(text):

2044

# escape attribute value

2045

try:

2046

if '&' in text:

2047

text = text.replace('&', '&')

2048

if '<' in text:

2049

text = text.replace('<', '<')

2050

if '"' in text:

2051

text = text.replace('"', '"')

2052

if '\t' in text:

2053

text = text.replace('\t', '	')

2054

if '\n' in text:

2055

text = text.replace('\n', '
')

2056

if '\r' in text:

2057

text = text.replace('\r', '')

2058

return text

2059

except (TypeError, AttributeError):

2060

_raise_serialization_error(text)

2061

2062

2063

# --------------------------------------------------------------------

2064

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

2065

# Import the C accelerators

2066

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

2067

# Element is going to be shadowed by the C implementation. We need to keep

2068

# the Python version of it accessible for some "creative" use by external code

2069

# (see tests)

2070

_Element_Py = Element

2071

Stefan Behnel

2019-05-01 21:20:38 +0200

[diff] [blame]

2072

# Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

2073

from _elementtree import *

Stefan Behnel

2019-05-01 21:20:38 +0200

[diff] [blame]

2074

from _elementtree import _set_factories

Eli Bendersky

c4e98a6

2013-05-19 09:24:43 -0700

[diff] [blame]

2075

except ImportError:

2076

pass

Stefan Behnel