Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attrib' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "indent",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string of the ElementTree API implemented by this module.
VERSION = "1.3.0"

94

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

95

import sys

96

import re

97

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

98

import io

Serhiy Storchaka

2015-12-07 02:31:11 +0200

[diff] [blame]

99

import collections

Serhiy Storchaka

2e576f5

2017-04-24 09:05:00 +0300

[diff] [blame]

100

import collections.abc

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

101

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

103

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

104

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

105

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

106

class ParseError(SyntaxError):
    """Error raised when an XML document cannot be parsed.

    Besides the usual exception value, a ParseError carries two
    additional attributes:

        'code' - the specific exception code
        'position' - the line and column of the error

    The class body itself defines nothing; it only gives parse failures
    their own exception type derived from SyntaxError.

    """

116

117

# --------------------------------------------------------------------

118

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

119

120

def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is purely structural: any object exposing a 'tag'
    attribute is accepted.

    """
    missing = object()  # unique marker, cannot collide with a real tag
    return getattr(element, 'tag', missing) is not missing

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

123

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

124

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

125

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); *extra* keyword
        arguments are merged on top of it.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        # Always build a fresh dict, so neither the caller's mapping nor the
        # shared default {} is ever stored or mutated by this element.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        Deprecated: emits a DeprecationWarning; use copy.copy(elem) instead.

        """
        warnings.warn(
            "elem.copy() is deprecated. Use copy.copy(elem) instead.",
            DeprecationWarning,
            # Attribute the warning to the caller's line, not to this module.
            stacklevel=2
            )
        return self.__copy__()

    def __copy__(self):
        """Return a shallow copy: same tag/text/tail, copied attribute
        dictionary, and the very same child objects."""
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (with a FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements for a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For a slice, *element* is an iterable of elements; otherwise it is a
        single element.  TypeError is raised if anything is not an Element.

        """
        if isinstance(index, slice):
            # Materialise first: validating a one-shot iterable (such as a
            # generator) would otherwise exhaust it before the assignment.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Remove the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  It is
        traversed exactly once, so generators work.  If any item is not an
        Element, TypeError is raised and nothing is appended.

        """
        validated = []
        for element in elements:
            self._assert_is_element(element)
            validated.append(element)
        self._children.extend(validated)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was not
        found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is the dictionary's keys
        view, not a list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is the dictionary's items
        view, not a list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory functions used as tags for comments and processing
        # instructions) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

424

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

425

426

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The element is built with parent.makeelement(), so it has whatever type
    that method produces.  The new element is returned.

    """
    # Merge into a new dict; *attrib* itself is left untouched.
    merged = {**attrib, **extra}
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

442

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

443

444

def Comment(text=None):
    """Comment element factory.

    Build a special element that the standard serializer writes out as an
    XML comment.  The element's tag is this factory function itself, which
    is how comment nodes are told apart from ordinary elements.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

457

458

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    This function creates a special element which the standard serializer
    serializes as an XML processing instruction.  The element's tag is this
    factory function itself.

    *target* is a string containing the processing instruction target, *text*
    is a string containing the processing instruction contents, if any.

    The resulting element's text is *target*, followed by a space and *text*
    when *text* is given and non-empty.

    """
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction

475

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

476

477

class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which, if given, makes the first argument
    (text_or_uri) be interpreted as a URI and this argument (tag) as the
    local name.

    Comparison and hashing are delegated to the wrapped text, so a QName
    compares equal to (and hashes like) the equivalent plain string.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # For every comparison, another QName is unwrapped to its text first;
    # any other operand is compared against the text as-is.

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

521

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

522

# --------------------------------------------------------------------

523

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

524

525

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

526

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

527

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

528

This class also provides support for serialization to and from

529

standard XML.

530

531

*element* is an optional root element node,

532

*file* is an optional file handle or file name of an XML file whose

533

contents will be used to initialize the tree with.

534

535

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

536

def __init__(self, element=None, file=None):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

537

# assert element is None or iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

538

self._root = element # first node

if file:

self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

542

def getroot(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

543

"""Return root element of this tree."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

544

return self._root

545

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

546

def _setroot(self, element):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

547

"""Replace root element of this tree.

548

549

This will discard the current contents of the tree and replace it

550

with the given element. Use with care!

551

552

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

553

# assert iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

554

self._root = element

555

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

556

def parse(self, source, parser=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

557

"""Load external XML document into element tree.

558

559

*source* is a file name or file object, *parser* is an optional parser

560

instance that defaults to XMLParser.

561

562

ParseError is raised if the parser fails to parse the document.

563

564

Returns the root element of the given source document.

565

566

"""

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

567

close_source = False

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

568

if not hasattr(source, "read"):

569

source = open(source, "rb")

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

570

close_source = True

571

try:

Eli Bendersky

a369923

2013-05-19 18:47:23 -0700

[diff] [blame]

572

if parser is None:

573

# If no parser was specified, create a default XMLParser

574

parser = XMLParser()

575

if hasattr(parser, '_parse_whole'):

576

# The default XMLParser, when it comes from an accelerator,

577

# can define an internal _parse_whole API for efficiency.

578

# It can be used to parse the whole source without feeding

579

# it with chunks.

580

self._root = parser._parse_whole(source)

581

return self._root

582

while True:

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

583

data = source.read(65536)

if not data:

break

parser.feed(data)

self._root = parser.close()

return self._root

finally:

if close_source:

source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

592

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

593

def iter(self, tag=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

594

"""Create and return tree iterator for the root element.

595

596

The iterator loops over all elements in this tree, in document order.

597

598

*tag* is a string with the tag name to iterate over

599

(default is to return all elements).

600

601

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

602

# assert self._root is not None

603

return self._root.iter(tag)

604

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

605

def find(self, path, namespaces=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

606

"""Find first matching element by tag name or path.

607

608

Same as getroot().find(path), which is Element.find()

609

610

*path* is a string having either an element tag or an XPath,

611

*namespaces* is an optional mapping from namespace prefix to full name.

612

613

Return the first matching element, or None if no element was found.

614

615

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

616

# assert self._root is not None

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

617

if path[:1] == "/":

618

path = "." + path

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

619

warnings.warn(

620

"This search is broken in 1.3 and earlier, and will be "

621

"fixed in a future version. If you rely on the current "

622

"behaviour, change it to %r" % path,

623

FutureWarning, stacklevel=2

624

)

625

return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

626

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

627

def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element by tag name or path.

    Equivalent to getroot().findtext(path), i.e. Element.findtext() on
    the root.

    *path* is a string holding either an element tag or an XPath,
    *default* is forwarded to the root's findtext() together with
    *namespaces*, an optional mapping from namespace prefix to full name.

    Return whatever the root element's findtext() returns.

    """
    # assert self._root is not None
    is_absolute = path[:1] == "/"
    if is_absolute:
        # Absolute paths are rewritten relative to the root element; the
        # warning reports the rewritten path so callers can adopt it.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

648

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

649

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Equivalent to getroot().findall(path), i.e. Element.findall() on the
    root.

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return a list containing all matching elements in document order.

    """
    # assert self._root is not None
    is_absolute = path[:1] == "/"
    if is_absolute:
        # Absolute paths are rewritten relative to the root element; the
        # warning reports the rewritten path so callers can adopt it.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findall(path, namespaces)

670

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

671

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Equivalent to getroot().iterfind(path), i.e. Element.iterfind() on
    the root.

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    is_absolute = path[:1] == "/"
    if is_absolute:
        # Absolute paths are rewritten relative to the root element; the
        # warning reports the rewritten path so callers can adopt it.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

692

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

693

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a registered serializer name.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    codec = encoding.lower()
    with _get_writer(file_or_filename, codec) as emit:
        if xml_declaration is None:
            # Only encodings a parser cannot assume get a declaration.
            want_declaration = codec not in ("utf-8", "us-ascii", "unicode")
        else:
            want_declaration = xml_declaration
        if method == "xml" and want_declaration:
            declared_encoding = encoding
            if codec == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            emit("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
            return
        qnames, namespaces = _namespaces(self._root, default_namespace)
        serializer = _serialize[method]
        serializer(emit, self._root, qnames, namespaces,
                   short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

750

751

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; new code should call write()
    with method="c14n" instead.
    """
    c14n_method = "c14n"
    return self.write(file, method=c14n_method)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

754

755

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

756

# serialization support

757

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

758

@contextlib.contextmanager

759

def _get_writer(file_or_filename, encoding):

Ezio Melotti

b5bc353

2013-08-17 16:11:40 +0300

[diff] [blame]

760

# returns text write method and release all resources after using

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

761

try:

762

write = file_or_filename.write

763

except AttributeError:

764

# file_or_filename is a file name

765

if encoding == "unicode":

766

file = open(file_or_filename, "w")

767

else:

768

file = open(file_or_filename, "w", encoding=encoding,

769

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

774

# encoding determines if it is a text or binary writer

775

if encoding == "unicode":

776

# use a text writer as is

777

yield write

778

else:

779

# wrap a binary writer with TextIOWrapper

780

with contextlib.ExitStack() as stack:

781

if isinstance(file_or_filename, io.BufferedIOBase):

782

file = file_or_filename

783

elif isinstance(file_or_filename, io.RawIOBase):

784

file = io.BufferedWriter(file_or_filename)

785

# Keep the original file open when the BufferedWriter is

786

# destroyed

787

stack.callback(file.detach)

788

else:

789

# This is to handle passed objects that aren't in the

790

# IOBase hierarchy, but just have a write method

791

file = io.BufferedIOBase()

792

file.writable = lambda: True

793

file.write = write

794

try:

795

# TextIOWrapper uses this methods to determine

796

# if BOM (for UTF-16, etc) should be added

797

file.seekable = file_or_filename.seekable

798

file.tell = file_or_filename.tell

799

except AttributeError:

800

pass

801

file = io.TextIOWrapper(file,

802

encoding=encoding,

803

errors="xmlcharrefreplace",

804

newline="\n")

805

# Keep the original file open when the TextIOWrapper is

806

# destroyed

807

stack.callback(file.detach)

808

yield file.write

809

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

810

def _namespaces(elem, default_namespace=None):
    """Identify the namespaces used in the tree rooted at *elem*.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each qualified
    name found in tags, attribute names, and QName attribute values or
    text to its serialized ``prefix:local`` form, and *namespaces* maps
    each namespace uri to the prefix chosen for it.

    Raises ValueError if *default_namespace* is given and a non-qualified
    name is encountered.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    # No registered prefix: generate one.
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # An empty prefix denotes the default namespace.
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            attr_name = key.text if isinstance(key, QName) else key
            if attr_name not in qnames:
                add_qname(attr_name)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        node_text = node.text
        if isinstance(node_text, QName) and node_text.text not in qnames:
            add_qname(node_text.text)
    return qnames, namespaces

870

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

871

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is a callable accepting text, *qnames* maps qualified names
    to their serialized form, and *namespaces* maps namespace uris to
    prefixes (declared on this element only; children are called with
    None).  *short_empty_elements* selects ``<tag />`` over
    ``<tag></tag>`` for elements without text or children.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # The comment delimiters must be part of the template; an empty
        # template makes the % operator raise TypeError.
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

920

921

# Lower-case tag names for which the HTML serializer writes no end tag.
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

929

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is a callable accepting text, *qnames* maps qualified names
    to their serialized form, and *namespaces* maps namespace uris to
    prefixes (declared on this element only; children are called with
    None).  Text inside ``script`` and ``style`` is written unescaped,
    and tags listed in HTML_EMPTY get no end tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # The comment delimiters must be part of the template; an empty
        # template makes the % operator raise TypeError.
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

978

979

def _serialize_text(write, elem):

980

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

# Dispatch table from output method name to its serializer function.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
    # The optional "c14n" serializer is not listed here; it is
    # imported at the end of the module.
    #   "c14n": _serialize_c14n,
}

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

992

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

993

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

994

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect conflicting entries first so the registry is not mutated
    # while it is being iterated.
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in stale_uris:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

1012

1013

# Global registry mapping namespace uris to their serialization prefixes.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# Expose the registry on the function object, for tests and
# troubleshooting.
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1027

1028

def _raise_serialization_error(text):

1029

raise TypeError(

1030

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1031

)

1032

1033

def _escape_cdata(text):

1034

# escape character data

1035

try:

1036

# it's worth avoiding do-nothing calls for strings that are

Mike

53f7a7c

2017-12-14 14:04:53 +0300

[diff] [blame]

1037

# shorter than 500 characters, or so. assume that's, by far,

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1038

# the most common case in most applications.

1039

if "&" in text:

1040

text = text.replace("&", "&")

1041

if "<" in text:

1042

text = text.replace("<", "<")

1043

if ">" in text:

1044

text = text.replace(">", ">")

1045

return text

1046

except (TypeError, AttributeError):

1047

_raise_serialization_error(text)

1048

1049

def _escape_attrib(text):

1050

# escape attribute value

1051

try:

1052

if "&" in text:

1053

text = text.replace("&", "&")

1054

if "<" in text:

1055

text = text.replace("<", "<")

1056

if ">" in text:

1057

text = text.replace(">", ">")

1058

if "\"" in text:

1059

text = text.replace("\"", """)

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1060

# The following business with carriage returns is to satisfy

Raymond Hettinger

11fa3ff

2016-09-11 23:23:24 -0700

[diff] [blame]

1061

# Section 2.11 of the XML specification, stating that

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1062

# CR or CR LN should be replaced with just LN

1063

# http://www.w3.org/TR/REC-xml/#sec-line-ends

1064

if "\r\n" in text:

1065

text = text.replace("\r\n", "\n")

1066

if "\r" in text:

1067

text = text.replace("\r", "\n")

1068

#The following four lines are issue 17582

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1069

if "\n" in text:

1070

text = text.replace("\n", "
")

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1071

if "\t" in text:

1072

text = text.replace("\t", "	")

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1073

return text

1074

except (TypeError, AttributeError):

1075

_raise_serialization_error(text)

1076

1077

def _escape_attrib_html(text):

1078

# escape attribute value

1079

try:

1080

if "&" in text:

1081

text = text.replace("&", "&")

1082

if ">" in text:

1083

text = text.replace(">", ">")

1084

if "\"" in text:

1085

text = text.replace("\"", """)

1086

return text

1087

except (TypeError, AttributeError):

1088

_raise_serialization_error(text)

1089

1090

# --------------------------------------------------------------------

1091

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1092

def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration* and
    *short_empty_elements* are passed through to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # A text buffer is only appropriate when no encoding step happens.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(buffer, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return buffer.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1115

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1116

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1117

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1118

def __init__(self, lst):

1119

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1120

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1133

def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    Takes the same arguments as tostring(); the tree is written through
    a _ListDataStream, and the list it filled is returned.
    """
    chunks = []
    sink = _ListDataStream(chunks)
    document = ElementTree(element)
    document.write(sink, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return chunks

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1144

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1145

1146

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    # Finish the output with a newline unless the root's tail already
    # ends with one.
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")

1163

Stefan Behnel

b5d3cee

2019-08-23 16:44:25 +0200

[diff] [blame]

1164

1165

def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level. Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        return

    # Reduce the memory consumption by reusing indentation strings.
    # Entry 0 already contains the initial *level*, so the helper below
    # works with depths relative to the root passed in.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        # Only whitespace-only (or missing) text is replaced.
        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)

1213

1214

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1215

# --------------------------------------------------------------------

1216

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1217

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1218

1219

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    # ElementTree.parse() fills the tree in place; its return value is
    # not needed here.
    document.parse(source, parser)
    return document

1231

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1232

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1233

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    The returned iterator reports what is going on to the user based on
    the *events* it is initialized with.  The supported events are the
    strings "start", "end", "start-ns" and "end-ns" (the "ns" events are
    used to get detailed namespace information).  If *events* is omitted,
    only "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the source has been consumed and is then set
    to the value returned by the pull parser.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pull = XMLPullParser(events=events, _parser=parser)

    def event_stream():
        try:
            while True:
                yield from pull.read_events()
                # load event buffer
                chunk = source.read(16 * 1024)
                if not chunk:
                    break
                pull.feed(chunk)
            root = pull._close_and_return_root()
            yield from pull.read_events()
            result.root = root
        finally:
            # Only a file opened by this function is closed here.
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # Binding the generator's __next__ at class level makes the
        # instance a plain iterator over the event stream.
        __next__ = event_stream().__next__

    result = IterParseIterator()
    result.root = None
    del event_stream, IterParseIterator

    # These names are read by the generator above only once iteration
    # starts, which is after they have been bound here.
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return result

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1279

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1280

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1281

class XMLPullParser:
    """Parser that is fed data and queues events for later retrieval.

    Data is pushed in with feed(); the events produced by the underlying
    parser accumulate in an internal queue and are drained with
    read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # The error is queued so that it surfaces from read_events()
            # in order with the events that preceded it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        it is reached.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1332

1333

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1334

def XML(text, parser=None):
    """Build an element tree from XML held in a string constant.

    Handy for embedding "XML Literals" directly in Python code.

    *text* is a string of XML data.  *parser* is an optional parser
    instance; the standard XMLParser is used when it is omitted.

    Returns an Element instance.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()

1349

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1350

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1351

def XMLID(text, parser=None):
    """Build an element tree from a string constant and index it by id.

    *text* is a string of XML data.  *parser* is an optional parser
    instance; the standard XMLParser is used when it is omitted.

    Returns an (Element, dict) tuple.  The dict maps each non-empty "id"
    attribute value to its element; if an id occurs more than once, the
    element visited last wins.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    by_id = {}
    for node in tree.iter():
        node_id = node.get("id")
        if node_id:
            by_id[node_id] = node
    return tree, by_id

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1372

# Parse XML document from string constant.  Alias for XML().
# Both names are bound to the same function object.
fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1374

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1375

def fromstringlist(sequence, parser=None):
    """Build an element tree from a sequence of string fragments.

    *sequence* is a list or other sequence of XML data fragments, which
    are fed to the parser one after another.  *parser* is an optional
    parser instance; the standard XMLParser is used when it is omitted.

    Returns an Element instance.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        parser.feed(fragment)
    return parser.close()

1389

1390

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1391

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1392

1393

class TreeBuilder:
    """Assemble an element structure from parser-style method calls.

    The builder turns a sequence of start(), data() and end() calls into a
    well-formed element structure.  The calls may come from the XML parser
    in this module, from a custom XML parser, or from a parser for some
    other XML-like format.

    *element_factory* is an optional factory, called as
    ``factory(tag, attrs)``, that creates each new element.  The standard
    Element is used when it is omitted.

    *comment_factory* creates comments in place of the standard Comment
    factory.  Comments are only inserted into the tree when
    *insert_comments* is true (the default is false).

    *pi_factory* creates processing instructions in place of the standard
    ProcessingInstruction factory.  They are only inserted into the tree
    when *insert_pis* is true (the default is false).

    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        # Building state.
        self._data = []     # text chunks not yet attached to a node
        self._elem = []     # stack of currently open elements
        self._last = None   # node that was opened or closed most recently
        self._root = None   # first toplevel element
        self._tail = None   # true once we are behind an end tag
        # Node factories and insertion switches.
        self._factory = (
            Element if element_factory is None else element_factory)
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_comments = insert_comments
        self.insert_pis = insert_pis

    def close(self):
        """Return the toplevel document Element once building is done."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected text to the last node: as its tail when the
        # node is already closed, as its text otherwise.
        if not self._data:
            return
        node = self._last
        if node is not None:
            text = "".join(self._data)
            if self._tail:
                assert node.tail is None, "internal error (tail)"
                node.tail = text
            else:
                assert node.text is None, "internal error (text)"
                node.text = text
        self._data = []

    def data(self, data):
        """Collect *data* as text for the current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open a new element and return it.

        *tag* is the element name, *attrs* is a dict holding the element
        attributes.

        """
        self._flush()
        node = self._factory(tag, attrs)
        self._last = node
        open_elements = self._elem
        if open_elements:
            open_elements[-1].append(node)
        elif self._root is None:
            self._root = node
        open_elements.append(node)
        self._tail = 0
        return node

    def end(self, tag):
        """Close the current Element and return it.

        *tag* is the element name; it has to match the open element.

        """
        self._flush()
        closed = self._last = self._elem.pop()
        assert closed.tag == tag, (
            "end tag mismatch (expected %s, got %s)" % (closed.tag, tag))
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment through the comment factory and return it.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction through the pi factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Create a childless node; link it into the tree only on request.
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        # Text that follows belongs to the tail of the new node.
        self._tail = 1
        return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1513

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1514

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1515

class XMLParser:
    """Expat-based parser that forwards XML source data to a target object.

    *target* is the optional object receiving the parse callbacks; an
    instance of the standard TreeBuilder class is used when it is omitted.
    *encoding* is an optional encoding string which, if given, overrides
    the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # The underscored names exist for compatibility only.
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # memo cache for converted names

        # Only those callbacks are installed that the target implements.
        parser.DefaultHandlerExpand = self._default
        # Main callbacks that go through a wrapper method of this class.
        wrapped_callbacks = (
            ('start', 'StartElementHandler', self._start),
            ('end', 'EndElementHandler', self._end),
            ('start_ns', 'StartNamespaceDeclHandler', self._start_ns),
            ('end_ns', 'EndNamespaceDeclHandler', self._end_ns),
        )
        for method_name, slot, callback in wrapped_callbacks:
            if hasattr(target, method_name):
                setattr(parser, slot, callback)
        # Text and miscellaneous callbacks are wired to the target directly.
        direct_callbacks = (
            ('data', 'CharacterDataHandler'),
            ('comment', 'CommentHandler'),
            ('pi', 'ProcessingInstructionHandler'),
        )
        for method_name, slot in direct_callbacks:
            if hasattr(target, method_name):
                setattr(parser, slot, getattr(target, method_name))

        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser.
        #
        # events_to_report: the names of the events to report during
        # parsing (same as the *events* of XMLPullParser's constructor).
        # events_queue: the queue that the installed handlers populate
        # with (event, value) pairs while the underlying parser runs.
        #
        # Each handler binds its event name through a default argument, so
        # the value stays fixed while the loop moves on.
        parser = self._parser
        push = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def on_start(tag, attrib_in, event=event_name, push=push,
                             start=self._start):
                    push((event, start(tag, attrib_in)))
                parser.StartElementHandler = on_start
            elif event_name == "end":
                def on_end(tag, event=event_name, push=push,
                           end=self._end):
                    push((event, end(tag)))
                parser.EndElementHandler = on_end
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def on_start_ns(prefix, uri, event=event_name, push=push,
                                    start_ns=self._start_ns):
                        push((event, start_ns(prefix, uri)))
                else:
                    def on_start_ns(prefix, uri, event=event_name, push=push):
                        push((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = on_start_ns
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def on_end_ns(prefix, event=event_name, push=push,
                                  end_ns=self._end_ns):
                        push((event, end_ns(prefix)))
                else:
                    def on_end_ns(prefix, event=event_name, push=push):
                        push((event, None))
                parser.EndNamespaceDeclHandler = on_end_ns
            elif event_name == 'comment':
                def on_comment(text, event=event_name, push=push, self=self):
                    push((event, self.target.comment(text)))
                parser.CommentHandler = on_comment
            elif event_name == 'pi':
                def on_pi(pi_target, data, event=event_name, push=push,
                          self=self):
                    push((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = on_pi
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, keeping code and position.
        parse_error = ParseError(value)
        parse_error.code = value.code
        parse_error.position = (value.lineno, value.offset)
        raise parse_error

    def _fixname(self, key):
        # Expand a qualified name: names that contain the "}" separator
        # get the opening "{" prepended.  Results are memoized.
        fixed = self._names.get(key)
        if fixed is None:
            fixed = "{" + key if "}" in key else key
            self._names[key] = fixed
        return fixed

    def _start_ns(self, prefix, uri):
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler.  Since ordered_attributes
        # is set, the attributes are reported as a flat list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            attrib = {
                fixname(attr_list[pos]): attr_list[pos + 1]
                for pos in range(0, len(attr_list), 2)
            }
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Handler for everything that has no dedicated expat callback:
        # undefined entity references and the pieces of a doctype.
        lead = text[:1]
        if lead == "&":
            # deal with undefined entities
            try:
                emit_data = self.target.data
            except AttributeError:
                return
            try:
                emit_data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                line = self.parser.ErrorLineNumber
                column = self.parser.ErrorColumnNumber
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, line, column)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = line
                err.offset = column
                raise err
        elif lead == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if lead == ">":
                self._doctype = None
                return
            token = text.strip()
            if not token:
                return
            parts = self._doctype
            parts.append(token)
            count = len(parts)
            if count <= 2:
                return
            kind = parts[1]
            if kind == "PUBLIC" and count == 4:
                name, _, pubid, system = parts
                if pubid:
                    pubid = pubid[1:-1]
            elif kind == "SYSTEM" and count == 3:
                name, _, system = parts
                pubid = None
            else:
                return
            if hasattr(self.target, "doctype"):
                self.target.doctype(name, pubid, system[1:-1])
            elif hasattr(self, "doctype"):
                warnings.warn(
                    "The doctype() method of XMLParser is ignored. "
                    "Define doctype() method on the TreeBuilder target.",
                    RuntimeWarning)

            self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, False)
        except self._error as exc:
            self._raiseerror(exc)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse(b"", True)  # end of data
        except self._error as exc:
            self._raiseerror(exc)
        try:
            finish = self.target.close
        except AttributeError:
            pass
        else:
            return finish()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1740

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1741

Stefan Behnel

e1d5dd6

2019-05-01 22:34:13 +0200

[diff] [blame]

1742

# --------------------------------------------------------------------

1743

# C14N 2.0

1744

1745

def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Serialise XML input in its C14N 2.0 form.

    *out*, if provided, must be a file or file-like object; the canonical
    XML output is handed to its ``.write()`` method as text, not bytes.
    To write to a file, open it in text mode with encoding "utf-8".
    Without *out*, the output is returned as a text string.

    The input is given either as *xml_data* (an XML string) or as
    *from_file* (a file path or file-like object).  When both are given,
    *xml_data* is used.  ValueError is raised when both are missing.

    All further keyword options are passed on to ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only if the caller gave no sink.
    buffer = io.StringIO() if out is None else None
    sink = buffer if out is None else out

    writer = C14NWriterTarget(sink.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is None:
        # The check above guarantees that from_file is set here.
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()

1773

1774

1775

# Matcher for strings of the form "prefix:name": two runs of word
# characters joined by a single colon, spanning the entire string.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match

1776

1777

1778

class C14NWriterTarget:

1779

"""

1780

Canonicalization writer target for the XMLParser.

1781

1782

Serialises parse events to XML C14N 2.0.

1783

1784

The *write* function is used for writing out the resulting data stream

1785

as text (not bytes). To write to a file, open it in text mode with encoding

1786

"utf-8" and pass its ``.write`` method.

1787

1788

Configuration options:

1789

1790

- *with_comments*: set to true to include comments

1791

- *strip_text*: set to true to strip whitespace before and after text content

1792

- *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"

1793

- *qname_aware_tags*: a set of qname aware tag names in which prefixes

1794

should be replaced in text content

1795

- *qname_aware_attrs*: a set of qname aware attribute names in which prefixes

1796

should be replaced in text content

1797

- *exclude_attrs*: a set of attribute names that should not be serialised

1798

- *exclude_tags*: a set of tag names that should not be serialised

1799

"""

1800

def __init__(self, write, *,

1801

with_comments=False, strip_text=False, rewrite_prefixes=False,

1802

qname_aware_tags=None, qname_aware_attrs=None,

1803

exclude_attrs=None, exclude_tags=None):

1804

self._write = write

1805

self._data = []

1806

self._with_comments = with_comments

1807

self._strip_text = strip_text

1808

self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None

1809

self._exclude_tags = set(exclude_tags) if exclude_tags else None

1810

1811

self._rewrite_prefixes = rewrite_prefixes

1812

if qname_aware_tags:

1813

self._qname_aware_tags = set(qname_aware_tags)

1814

else:

1815

self._qname_aware_tags = None

1816

if qname_aware_attrs:

1817

self._find_qname_aware_attrs = set(qname_aware_attrs).intersection

1818

else:

1819

self._find_qname_aware_attrs = None

1820

1821

# Stack with globally and newly declared namespaces as (uri, prefix) pairs.

1822

self._declared_ns_stack = [[

1823

("http://www.w3.org/XML/1998/namespace", "xml"),

1824

]]

1825

# Stack with user declared namespace prefixes as (uri, prefix) pairs.

1826

self._ns_stack = []

1827

if not rewrite_prefixes:

1828

self._ns_stack.append(list(_namespace_map.items()))

1829

self._ns_stack.append([])

1830

self._prefix_map = {}

1831

self._preserve_space = [False]

1832

self._pending_start = None

1833

self._root_seen = False

1834

self._root_done = False

1835

self._ignored_depth = 0

1836

1837

def _iter_namespaces(self, ns_stack, _reversed=reversed):

1838

for namespaces in _reversed(ns_stack):

1839

if namespaces: # almost no element declares new namespaces

1840

yield from namespaces

1841

1842

def _resolve_prefix_name(self, prefixed_name):

1843

prefix, name = prefixed_name.split(':', 1)

1844

for uri, p in self._iter_namespaces(self._ns_stack):

1845

if p == prefix:

1846

return f'{{{uri}}}{name}'

1847

raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

1848

1849

def _qname(self, qname, uri=None):

1850

if uri is None:

1851

uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)

else:

tag = qname

prefixes_seen = set()

1856

for u, prefix in self._iter_namespaces(self._declared_ns_stack):

1857

if u == uri and prefix not in prefixes_seen:

1858

return f'{prefix}:{tag}' if prefix else tag, tag, uri

1859

prefixes_seen.add(prefix)

1860

1861

# Not declared yet => add new declaration.

1862

if self._rewrite_prefixes:

1863

if uri in self._prefix_map:

1864

prefix = self._prefix_map[uri]

1865

else:

1866

prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'

1867

self._declared_ns_stack[-1].append((uri, prefix))

1868

return f'{prefix}:{tag}', tag, uri

1869

1870

if not uri and '' not in prefixes_seen:

1871

# No default namespace declared => no prefix needed.

1872

return tag, tag, uri

1873

1874

for u, prefix in self._iter_namespaces(self._ns_stack):

1875

if u == uri:

1876

self._declared_ns_stack[-1].append((uri, prefix))

1877

return f'{prefix}:{tag}' if prefix else tag, tag, uri

1878

1879

raise ValueError(f'Namespace "{uri}" is not declared in scope')

1880

1881

def data(self, data):

1882

if not self._ignored_depth:

1883

self._data.append(data)

1884

1885

def _flush(self, _join_text=''.join):

1886

data = _join_text(self._data)

1887

del self._data[:]

1888

if self._strip_text and not self._preserve_space[-1]:

1889

data = data.strip()

1890

if self._pending_start is not None:

1891

args, self._pending_start = self._pending_start, None

1892

qname_text = data if data and _looks_like_prefix_name(data) else None

1893

self._start(*args, qname_text)

1894

if qname_text is not None:

1895

return

1896

if data and self._root_seen:

1897

self._write(_escape_cdata_c14n(data))

1898

1899

def start_ns(self, prefix, uri):

1900

if self._ignored_depth:

1901

return

1902

# we may have to resolve qnames in text content

1903

if self._data:

1904

self._flush()

1905

self._ns_stack[-1].append((uri, prefix))

1906

1907

def start(self, tag, attrs):

1908

if self._exclude_tags is not None and (

1909

self._ignored_depth or tag in self._exclude_tags):

1910

self._ignored_depth += 1

return

if self._data:

self._flush()

new_namespaces = []

self._declared_ns_stack.append(new_namespaces)

1917

1918

if self._qname_aware_tags is not None and tag in self._qname_aware_tags:

1919

# Need to parse text first to see if it requires a prefix declaration.

1920

self._pending_start = (tag, attrs, new_namespaces)

1921

return

1922

self._start(tag, attrs, new_namespaces)

1923

1924

def _start(self, tag, attrs, new_namespaces, qname_text=None):

1925

if self._exclude_attrs is not None and attrs:

1926

attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

1927

1928

qnames = {tag, *attrs}

1929

resolved_names = {}

1930

1931

# Resolve prefixes in attribute and tag text.

1932

if qname_text is not None:

1933

qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)

1934

qnames.add(qname)

1935

if self._find_qname_aware_attrs is not None and attrs:

1936

qattrs = self._find_qname_aware_attrs(attrs)

1937

if qattrs:

1938

for attr_name in qattrs:

1939

value = attrs[attr_name]

1940

if _looks_like_prefix_name(value):

1941

qname = resolved_names[value] = self._resolve_prefix_name(value)

qnames.add(qname)

else:

qattrs = None

else:

qattrs = None

# Assign prefixes in lexicographical order of used URIs.

1949

parse_qname = self._qname

1950

parsed_qnames = {n: parse_qname(n) for n in sorted(

1951

qnames, key=lambda n: n.split('}', 1))}

1952

1953

# Write namespace declarations in prefix order ...

1954

if new_namespaces:

1955

attr_list = [

1956

('xmlns:' + prefix if prefix else 'xmlns', uri)

1957

for uri, prefix in new_namespaces

]

attr_list.sort()

else:

# almost always empty

1962

attr_list = []

1963

1964

# ... followed by attributes in URI+name order

1965

if attrs:

1966

for k, v in sorted(attrs.items()):

1967

if qattrs is not None and k in qattrs and v in resolved_names:

1968

v = parsed_qnames[resolved_names[v]][0]

1969

attr_qname, attr_name, uri = parsed_qnames[k]

1970

# No prefix for attributes in default ('') namespace.

1971

attr_list.append((attr_qname if uri else attr_name, v))

1972

1973

# Honour xml:space attributes.

1974

space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')

1975

self._preserve_space.append(

1976

space_behaviour == 'preserve' if space_behaviour

1977

else self._preserve_space[-1])

# Write the tag.

write = self._write

write('<' + parsed_qnames[tag][0])

1982

if attr_list:

1983

write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))

1984

write('>')

1985

1986

# Write the resolved qname text content.

1987

if qname_text is not None:

1988

write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

1989

1990

self._root_seen = True

1991

self._ns_stack.append([])

1992

1993

def end(self, tag):
    """Write the closing tag for *tag* and pop the per-element state.

    While ``self._ignored_depth`` is non-zero the call only decrements
    that counter and writes nothing.  Otherwise any buffered data is
    flushed, ``</name>`` is written (the name is the first item returned
    by ``self._qname(tag)``), and one entry is popped from each of the
    ``_preserve_space``, ``_declared_ns_stack`` and ``_ns_stack`` stacks.
    """
    if self._ignored_depth:
        self._ignored_depth -= 1
        return

    # Pending text belongs before the closing tag.
    if self._data:
        self._flush()

    closing_name = self._qname(tag)[0]
    self._write(f'</{closing_name}>')

    space_stack = self._preserve_space
    space_stack.pop()
    # Only the initial entry is left once the outermost element is closed.
    self._root_done = len(space_stack) == 1

    self._declared_ns_stack.pop()
    self._ns_stack.pop()

2004

2005

def comment(self, text):
    """Write *text* as an XML comment, ``<!--...-->``.

    Nothing is written when ``self._with_comments`` is false or while
    ``self._ignored_depth`` is non-zero.  The comment text is passed
    through ``_escape_cdata_c14n`` before being written.

    A newline separates the comment from the root element: it is written
    before the comment once the root has been closed (``_root_done``) and
    after the comment when no root has been started yet (``_root_seen``
    is false).
    """
    if not self._with_comments:
        return
    if self._ignored_depth:
        return

    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        # Pending text must be written out ahead of the comment.
        self._flush()

    self._write(f'<!--{_escape_cdata_c14n(text)}-->')

    if not self._root_seen:
        self._write('\n')

2017

2018

def pi(self, target, data):
    """Write a processing instruction.

    Nothing is written while ``self._ignored_depth`` is non-zero.  With a
    truthy *data* the output is ``<?target data?>``, where *data* has
    been passed through ``_escape_cdata_c14n``; otherwise it is
    ``<?target?>``.

    A newline is written before the instruction once the root element
    has been closed (``_root_done``) and after it when no root element
    has been started yet (``_root_seen`` is false).
    """
    if self._ignored_depth:
        return

    if self._root_done:
        self._write('\n')
    elif self._root_seen and self._data:
        # Pending text must be written out ahead of the instruction.
        self._flush()

    if data:
        instruction = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        instruction = f'<?{target}?>'
    self._write(instruction)

    if not self._root_seen:
        self._write('\n')

def _escape_cdata_c14n(text):

2032

# escape character data

2033

try:

2034

# it's worth avoiding do-nothing calls for strings that are

2035

# shorter than 500 character, or so. assume that's, by far,

2036

# the most common case in most applications.

2037

if '&' in text:

2038

text = text.replace('&', '&')

2039

if '<' in text:

2040

text = text.replace('<', '<')

2041

if '>' in text:

2042

text = text.replace('>', '>')

2043

if '\r' in text:

2044

text = text.replace('\r', '')

2045

return text

2046

except (TypeError, AttributeError):

2047

_raise_serialization_error(text)

2048

2049

2050

def _escape_attrib_c14n(text):

2051

# escape attribute value

2052

try:

2053

if '&' in text:

2054

text = text.replace('&', '&')

2055

if '<' in text:

2056

text = text.replace('<', '<')

2057

if '"' in text:

2058

text = text.replace('"', '"')

2059

if '\t' in text:

2060

text = text.replace('\t', '	')

2061

if '\n' in text:

2062

text = text.replace('\n', '
')

2063

if '\r' in text:

2064

text = text.replace('\r', '')

2065

return text

2066

except (TypeError, AttributeError):

2067

_raise_serialization_error(text)

2068

2069

2070

# --------------------------------------------------------------------

2071

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

2072

# Import the C accelerators

2073

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

2074

# Element is going to be shadowed by the C implementation. We need to keep

2075

# the Python version of it accessible for some "creative" uses by external code

2076

# (see tests)

2077

_Element_Py = Element

2078

Stefan Behnel

2019-05-01 21:20:38 +0200

[diff] [blame]

2079

# Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

2080

from _elementtree import *

Stefan Behnel

2019-05-01 21:20:38 +0200

[diff] [blame]

2081

from _elementtree import _set_factories

Eli Bendersky

c4e98a6

2013-05-19 09:24:43 -0700

[diff] [blame]

2082

except ImportError:

2083

pass

Stefan Behnel