Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attrib' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
]

# Version string of the ElementTree toolkit implemented by this module.
VERSION = "1.3.0"

93

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

94

import sys

95

import re

96

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

97

import io

Serhiy Storchaka

2015-12-07 02:31:11 +0200

[diff] [blame]

98

import collections

Serhiy Storchaka

2e576f5

2017-04-24 09:05:00 +0300

[diff] [blame^]

99

import collections.abc

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

100

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

101

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

102

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

103

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

104

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

105

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, code that raises a ParseError
    attaches two extra attributes to the instance:

        'code' - the specific exception code
        'position' - the line and column of the error

    The class itself adds nothing to SyntaxError.

    """

115

116

# --------------------------------------------------------------------

117

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

118

119

def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object exposing a 'tag' attribute
    qualifies.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

122

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

123

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

124

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before the first subelement.  This is either a string or the
    # value None.  Note that if there is no text, this attribute may be
    # either None or the empty string, depending on the parser.
    text = None

    # Text after this element's end tag, but before the next sibling
    # element's start tag.  This is either a string or the value None.
    # Note that if there was no text, this attribute may be either None or
    # an empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Work on a copy so neither the caller's dict nor the shared
        # default argument is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0  # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # No type validation is performed here; *index* may be an int or
        # a slice and is handed straight to the underlying list.
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  It is
        traversed exactly once, so one-shot iterables such as generators
        are supported.

        Raises TypeError if any item is not an Element; in that case no
        item is added.

        """
        # Validate and collect in a single pass: iterating *elements* a
        # second time would find an exhausted iterator and silently add
        # nothing.  Extending only after validation keeps the operation
        # all-or-nothing.
        checked = []
        for element in elements:
            self._assert_is_element(element)
            checked.append(element)
        self._children.extend(checked)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned object is
        the element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is a view of the attribute
        dictionary's keys, not an independent list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is a view of the attribute
        dictionary's items, not an independent list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        the value "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Empty or None text/tail values
        are skipped.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory functions used as tags by Comment and
        # ProcessingInstruction) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

441

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

442

443

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and attach it as the last child of *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name,
    *attrib* is an optional dictionary containing element attributes,
    *extra* are additional attributes given as keyword arguments.

    The new element is built with parent.makeelement() and returned.

    """
    # Merge into a copy so the caller's dictionary (and the shared default)
    # stays untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

460

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

461

462

def Comment(text=None):
    """Comment element factory.

    Build a special element, tagged with this factory function itself,
    which the standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

475

476

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element, tagged with this factory function itself,
    for the standard serializer to write out as an XML processing
    instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is given and non-empty, the element's text is the target, a
    single space, and the contents; otherwise it is just the target.

    """
    node = Element(ProcessingInstruction)
    node.text = (target + " " + text) if text else target
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction

493

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

494

495

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their text, both against other QName
    objects and against arbitrary values.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    @staticmethod
    def _comparable(value):
        # Unwrap another QName to its text; leave any other value as is.
        if isinstance(value, QName):
            return value.text
        return value

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= self._comparable(other)

    def __lt__(self, other):
        return self.text < self._comparable(other)

    def __ge__(self, other):
        return self.text >= self._comparable(other)

    def __gt__(self, other):
        return self.text > self._comparable(other)

    def __eq__(self, other):
        return self.text == self._comparable(other)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

539

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

540

# --------------------------------------------------------------------

541

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

542

543

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

544

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

545

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

546

This class also provides support for serialization to and from

547

standard XML.

548

549

*element* is an optional root element node,

550

*file* is an optional file handle or file name of an XML file whose

551

contents will be used to initialize the tree with.

552

553

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

554

def __init__(self, element=None, file=None):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

555

# assert element is None or iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

556

self._root = element # first node

if file:

self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

560

def getroot(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

561

"""Return root element of this tree."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

562

return self._root

563

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

564

def _setroot(self, element):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

565

"""Replace root element of this tree.

566

567

This will discard the current contents of the tree and replace it

568

with the given element. Use with care!

569

570

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

571

# assert iselement(element)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

572

self._root = element

573

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

574

def parse(self, source, parser=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

575

"""Load external XML document into element tree.

576

577

*source* is a file name or file object, *parser* is an optional parser

578

instance that defaults to XMLParser.

579

580

ParseError is raised if the parser fails to parse the document.

581

582

Returns the root element of the given source document.

583

584

"""

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

585

close_source = False

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

586

if not hasattr(source, "read"):

587

source = open(source, "rb")

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

588

close_source = True

589

try:

Eli Bendersky

a369923

2013-05-19 18:47:23 -0700

[diff] [blame]

590

if parser is None:

591

# If no parser was specified, create a default XMLParser

592

parser = XMLParser()

593

if hasattr(parser, '_parse_whole'):

594

# The default XMLParser, when it comes from an accelerator,

595

# can define an internal _parse_whole API for efficiency.

596

# It can be used to parse the whole source without feeding

597

# it with chunks.

598

self._root = parser._parse_whole(source)

599

return self._root

600

while True:

Antoine Pitrou

2010-10-29 10:38:18 +0000

[diff] [blame]

601

data = source.read(65536)

if not data:

break

parser.feed(data)

self._root = parser.close()

return self._root

finally:

if close_source:

source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

610

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

611

def iter(self, tag=None):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

612

"""Create and return tree iterator for the root element.

613

614

The iterator loops over all elements in this tree, in document order.

615

616

*tag* is a string with the tag name to iterate over

617

(default is to return all elements).

618

619

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

620

# assert self._root is not None

621

return self._root.iter(tag)

622

623

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

624

def getiterator(self, tag=None):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

625

# Change for a DeprecationWarning in 1.4

626

warnings.warn(

627

"This method will be removed in future versions. "

628

"Use 'tree.iter()' or 'list(tree.iter())' instead.",

629

PendingDeprecationWarning, stacklevel=2

630

)

631

return list(self.iter(tag))

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

632

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

633

def find(self, path, namespaces=None):
    """Return the first element matching *path*, searched from the root.

    Thin wrapper around the root element's find(), i.e. Element.find().

    *path* is a string holding either an element tag or an XPath;
    *namespaces* is an optional mapping from namespace prefix to full
    name and is forwarded unchanged.

    A *path* that starts with "/" is rewritten to the relative form
    "./..." and a FutureWarning is issued before delegating.

    Returns the first matching element, or None if no element was found.

    """
    # assert self._root is not None
    if path[:1] != "/":
        return self._root.find(path, namespaces)
    # Leading "/" is treated as relative to the root; warn that this
    # interpretation may change.
    path = "." + path
    template = (
        "This search is broken in 1.3 and earlier, and will be "
        "fixed in a future version. If you rely on the current "
        "behaviour, change it to %r"
    )
    warnings.warn(template % path, FutureWarning, stacklevel=2)
    return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

654

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

655

def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first element matching *path*.

    Same as getroot().findtext(path), which is Element.findtext().

    *path* is a string holding either an element tag or an XPath;
    *default* and *namespaces* (an optional mapping from namespace
    prefix to full name) are forwarded unchanged.

    A *path* that starts with "/" is rewritten to the relative form
    "./..." and a FutureWarning is issued before delegating.

    Returns whatever the root element's findtext() returns; per the
    Element.findtext() documentation that is the text content of the
    first match, or *default* when nothing matched.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r"
        ) % path
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

676

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

677

def findall(self, path, namespaces=None):
    """Return all subelements matching *path*, searched from the root.

    Same as getroot().findall(path), which is Element.findall().

    *path* is a string holding either an element tag or an XPath;
    *namespaces* is an optional mapping from namespace prefix to full
    name and is forwarded unchanged.

    A *path* that starts with "/" is rewritten to the relative form
    "./..." and a FutureWarning is issued before delegating.

    Returns a list containing all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        relative = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % relative,
            FutureWarning,
            stacklevel=2,
        )
        path = relative
    return self._root.findall(path, namespaces)

698

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

699

def iterfind(self, path, namespaces=None):
    """Iterate over all subelements matching *path*, from the root.

    Same as getroot().iterfind(path), which is Element.iterfind().

    *path* is a string holding either an element tag or an XPath;
    *namespaces* is an optional mapping from namespace prefix to full
    name and is forwarded unchanged.

    A *path* that starts with "/" is rewritten to the relative form
    "./..." and a FutureWarning is issued before delegating.

    Returns an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] != "/":
        return self._root.iterfind(path, namespaces)
    path = "." + path
    text = (
        "This search is broken in 1.3 and earlier, and will be "
        "fixed in a future version. If you rely on the current "
        "behaviour, change it to %r"
    )
    warnings.warn(text % path, FutureWarning, stacklevel=2)
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

720

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

721

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode. A declaration is
                           only ever written for the "xml" method.

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n";
                  any other value raises ValueError

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as emit:
        if method == "xml":
            # An explicit True/False wins; None means "decide from the
            # encoding".
            if xml_declaration is None:
                declare = enc_lower not in ("utf-8", "us-ascii", "unicode")
            else:
                declare = xml_declaration
            if declare:
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                else:
                    declared_encoding = encoding
                emit("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serializer = _serialize[method]
            serializer(emit, self._root, qnames, namespaces,
                       short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

778

779

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Returns whatever write() returns.

    """
    # lxml.etree compatibility. use output method instead
    canonical_method = "c14n"
    return self.write(file, method=canonical_method)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

782

783

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

784

# serialization support

785

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

786

@contextlib.contextmanager

787

def _get_writer(file_or_filename, encoding):

Ezio Melotti

b5bc353

2013-08-17 16:11:40 +0300

[diff] [blame]

788

# returns text write method and release all resources after using

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

789

try:

790

write = file_or_filename.write

791

except AttributeError:

792

# file_or_filename is a file name

793

if encoding == "unicode":

794

file = open(file_or_filename, "w")

795

else:

796

file = open(file_or_filename, "w", encoding=encoding,

797

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

802

# encoding determines if it is a text or binary writer

803

if encoding == "unicode":

804

# use a text writer as is

805

yield write

806

else:

807

# wrap a binary writer with TextIOWrapper

808

with contextlib.ExitStack() as stack:

809

if isinstance(file_or_filename, io.BufferedIOBase):

810

file = file_or_filename

811

elif isinstance(file_or_filename, io.RawIOBase):

812

file = io.BufferedWriter(file_or_filename)

813

# Keep the original file open when the BufferedWriter is

814

# destroyed

815

stack.callback(file.detach)

816

else:

817

# This is to handle passed objects that aren't in the

818

# IOBase hierarchy, but just have a write method

819

file = io.BufferedIOBase()

820

file.writable = lambda: True

821

file.write = write

822

try:

823

# TextIOWrapper uses this methods to determine

824

# if BOM (for UTF-16, etc) should be added

825

file.seekable = file_or_filename.seekable

826

file.tell = file_or_filename.tell

827

except AttributeError:

828

pass

829

file = io.TextIOWrapper(file,

830

encoding=encoding,

831

errors="xmlcharrefreplace",

832

newline="\n")

833

# Keep the original file open when the TextIOWrapper is

834

# destroyed

835

stack.callback(file.detach)

836

yield file.write

837

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

838

def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walks every element reachable through ``elem.iter()`` and records
    its tag, attribute keys, and any QName attribute values or QName
    text.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each name to
    its serialized "prefix:local" (or plain) form, *namespaces* maps
    namespace URIs to prefixes.  When *default_namespace* is given it is
    mapped to the empty prefix, and any non-qualified name then raises
    ValueError.  Names of unsupported types are reported through
    _raise_serialization_error().
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def note(name):
        # register a name the first time it is seen
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            note(tag.text)
        elif isinstance(tag, str):
            note(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            note(key)
            if isinstance(value, QName):
                note(value.text)
        text = node.text
        if isinstance(text, QName):
            note(text.text)
    return qnames, namespaces

898

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

899

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is a callable taking one string.  *qnames* maps tags and
    attribute names to their serialized form (a tag mapped to None
    emits only its text and children, with no surrounding markup).
    *namespaces* maps namespace URIs to prefixes; the xmlns declarations
    are written on this element only, and the recursive calls pass None.
    *short_empty_elements* selects " />" over an explicit start/end tag
    pair for elements without text or children.  Extra keyword
    arguments are accepted and ignored.

    The element's tail text, if any, is written after the element.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # Comment text is written verbatim between the comment markers.
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                        ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

948

949

# Lower-case names of the HTML elements that _serialize_html() writes
# without an end tag.  Kept as a plain (mutable) set.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

957

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is a callable taking one string.  *qnames* maps tags and
    attribute names to their serialized form (a tag mapped to None
    emits only its text and children).  *namespaces* maps namespace
    URIs to prefixes; the xmlns declarations are written on this
    element only, and the recursive calls pass None.  Extra keyword
    arguments are accepted and ignored.

    Differences from the XML serializer visible here: the text of
    "script" and "style" elements is written unescaped, a start tag is
    always closed with ">", and no end tag is written for elements
    whose lower-cased tag is in HTML_EMPTY.

    The element's tail text, if any, is written after the element.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                        ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

1006

1007

def _serialize_text(write, elem):

1008

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

# Dispatch table: output method name -> serializer function.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
    # this optional method is imported at the end of the module
    # "c14n": _serialize_c14n,
)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1020

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1021

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1022

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix has the reserved form "ns" followed
    by digits.

    """
    if re.match(r"ns\d+$", prefix) is not None:
        raise ValueError("Prefix format reserved for internal use")
    # Collect the stale entries first so the dict is not mutated while
    # it is being iterated.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix


# Global registry mapping namespace URI -> preferred prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1055

1056

def _raise_serialization_error(text):

1057

raise TypeError(

1058

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1059

)

1060

1061

def _escape_cdata(text):

1062

# escape character data

1063

try:

1064

# it's worth avoiding do-nothing calls for strings that are

1065

# shorter than 500 character, or so. assume that's, by far,

1066

# the most common case in most applications.

1067

if "&" in text:

1068

text = text.replace("&", "&")

1069

if "<" in text:

1070

text = text.replace("<", "<")

1071

if ">" in text:

1072

text = text.replace(">", ">")

1073

return text

1074

except (TypeError, AttributeError):

1075

_raise_serialization_error(text)

1076

1077

def _escape_attrib(text):

1078

# escape attribute value

1079

try:

1080

if "&" in text:

1081

text = text.replace("&", "&")

1082

if "<" in text:

1083

text = text.replace("<", "<")

1084

if ">" in text:

1085

text = text.replace(">", ">")

1086

if "\"" in text:

1087

text = text.replace("\"", """)

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1088

# The following business with carriage returns is to satisfy

Raymond Hettinger

11fa3ff

2016-09-11 23:23:24 -0700

[diff] [blame]

1089

# Section 2.11 of the XML specification, stating that

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1090

# CR or CR LN should be replaced with just LN

1091

# http://www.w3.org/TR/REC-xml/#sec-line-ends

1092

if "\r\n" in text:

1093

text = text.replace("\r\n", "\n")

1094

if "\r" in text:

1095

text = text.replace("\r", "\n")

1096

#The following four lines are issue 17582

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1097

if "\n" in text:

1098

text = text.replace("\n", "
")

Raymond Hettinger

076366c

2016-09-11 23:18:03 -0700

[diff] [blame]

1099

if "\t" in text:

1100

text = text.replace("\t", "	")

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1101

return text

1102

except (TypeError, AttributeError):

1103

_raise_serialization_error(text)

1104

1105

def _escape_attrib_html(text):

1106

# escape attribute value

1107

try:

1108

if "&" in text:

1109

text = text.replace("&", "&")

1110

if ">" in text:

1111

text = text.replace(">", ">")

1112

if "\"" in text:

1113

text = text.replace("\"", """)

1114

return text

1115

except (TypeError, AttributeError):

1116

_raise_serialization_error(text)

1117

1118

# --------------------------------------------------------------------

1119

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1120

def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n".
    *short_empty_elements* is forwarded to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1138

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1139

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1140

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1141

def __init__(self, lst):

1142

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1143

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1156

def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    Works like tostring(), but the tree is written to a
    _ListDataStream, so each write becomes one list item.  *encoding*,
    *method* and *short_empty_elements* are forwarded to
    ElementTree.write().
    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1163

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1164

1165

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.  A newline is appended unless the root element's
    tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if tail and tail[-1] == "\n":
        return
    sys.stdout.write("\n")

1182

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1183

# --------------------------------------------------------------------

1184

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1185

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1186

1187

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1199

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1200

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1201

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the input has been fully consumed and is
    then set to the value returned by closing the parser.  A file
    opened here from a filename is closed when the iteration finishes.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        # ``result``, ``source`` and ``close_source`` are looked up when
        # the generator runs, i.e. after the assignments further down.
        try:
            yield from pullparser.read_events()
            # load event buffer
            data = source.read(16 * 1024)
            while data:
                pullparser.feed(data)
                yield from pullparser.read_events()
                data = source.read(16 * 1024)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            result.root = root
        finally:
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = iterator().__next__

    result = IterParseIterator()
    result.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return result

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1247

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1248

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1249

class XMLPullParser:
    """Parser that is fed data in pieces and polled for events.

    Events produced by the underlying parser are collected in an
    internal queue and handed out by read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        queue = collections.deque()
        self._events_queue = queue
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        wanted = ("end",) if events is None else events
        self._parser._setevents(queue, wanted)

    def feed(self, data):
        """Feed encoded data to parser.

        Empty data is ignored.  A SyntaxError raised by the underlying
        parser is not propagated here; it is queued and raised later by
        read_events().  Raises ValueError once the parser was closed.
        """
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        it is reached.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1300

1301

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1302

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser
    (any falsy *parser* selects the default).

    Returns an Element instance.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()

1317

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1318

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1319

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser
    (any falsy *parser* selects the default).

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements whose "id" attribute
    is missing or empty are left out; when an id occurs more than once
    the last element in iteration order wins.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    ids = {}
    for node in tree.iter():
        ident = node.get("id")
        if not ident:
            continue
        ids[ident] = node
    return tree, ids

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1340

# Parse XML document from string constant. Alias for XML().

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1341

fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1342

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1343

def fromstringlist(sequence, parser=None):
    """Parse an XML document supplied as a sequence of string fragments.

    *sequence* is a list or other iterable of fragments; each one is fed to
    the parser in order.  *parser* is an optional parser instance; when it
    is omitted (or falsy) a standard XMLParser feeding a fresh TreeBuilder
    is created.

    Returns whatever the parser's close() returns, which is an Element
    instance for the standard parser.

    """
    xml_parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        xml_parser.feed(fragment)
    root = xml_parser.close()
    return root

1357

1358

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1359

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1360

1361

class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    with (tag, attrs) to create new Element instances, as necessary.
    It defaults to Element.

    """

    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Return the toplevel document Element.

        Raises AssertionError if elements are still open or if no element
        was ever started.  Character data received after the last start()
        or end() call is not written to any element by this method.

        """
        # Raised explicitly (instead of using an assert statement) so that
        # unbalanced input is still reported when Python runs with -O.
        if self._elem:
            raise AssertionError("missing end tags")
        if self._last is None:
            raise AssertionError("missing toplevel element")
        return self._last

    def _flush(self):
        # Attach buffered character data to the most recent element: as its
        # tail if that element has already been closed, otherwise as its
        # text.  Data seen before any element exists is discarded.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # Nested element: attach it to the innermost open element.
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.  Raises AssertionError if it does not
        match the tag of the element being closed, and IndexError if no
        element is open.

        """
        self._flush()
        self._last = self._elem.pop()
        # Explicit raise rather than an assert statement so that mismatched
        # end tags are still detected when Python runs with -O.
        if self._last.tag != tag:
            raise AssertionError(
                "end tag mismatch (expected %s, got %s)" % (
                    self._last.tag, tag))
        self._tail = 1
        return self._last

Serhiy Storchaka

2017-03-30 18:12:06 +0300

[diff] [blame]

1434

# Unique marker used as the default of XMLParser's *html* argument so that
# any explicitly passed value (including None) can be detected.
_sentinel = ['sentinel']

# also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (deprecated and not supported),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, html=_sentinel, target=None, encoding=None):
        if html is not _sentinel:
            warnings.warn(
                "The html argument of XMLParser() is deprecated",
                DeprecationWarning, stacklevel=2)
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat then reports qualified
        # names as "uri}local", which _fixname turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds the values it needs as default arguments so it
        # keeps the event name of the iteration in which it was defined.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                def handler(prefix, uri, event=event_name, append=append):
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                def handler(prefix, event=event_name, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, copying over its error
        # code and (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            # Pair every even-indexed name with the value that follows it.
            for name, value in zip(attr_list[::2], attr_list[1::2]):
                attrib[fixname(name)] = value
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # Handler for expat's EndElementHandler.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Handler for expat's DefaultHandlerExpand.  Resolves entity
        # references through self.entity and collects the pieces of a
        # doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                # Build the error from the class stored by __init__ rather
                # than importing xml.parsers.expat again, so this also works
                # when __init__ had to fall back to pyexpat.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # strip the surrounding quotes
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated) Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        """
        warnings.warn(
            "This method of XMLParser is deprecated. Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns the result of the target's close() method, or None when the
        target has no such method.  The parser and target references are
        deleted afterwards, so the instance cannot be fed again.

        """
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1650

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1651

1652

# Import the C accelerators

1653

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

1654

# Element is going to be shadowed by the C implementation. We need to keep

1655

# the Python version of it accessible for some "creative" use by external code

1656

# (see tests)

1657

_Element_Py = Element

1658

Florent Xicluna