Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attrib' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
]

# Version string of the ElementTree toolkit this module implements.
VERSION = "1.3.0"

93

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

94

import sys

95

import re

96

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

97

import io

Serhiy Storchaka

2015-12-07 02:31:11 +0200

[diff] [blame]

98

import collections

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

99

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

100

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

101

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

103

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

104

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry two
    extra attributes (assigned by whoever raises the error, not by this
    class):
        'code' - the specific exception code
        'position' - the line and column of the error

    """

114

115

# --------------------------------------------------------------------

116

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

117

118

def iselement(element):
    """Return True if *element* looks like an Element.

    The check is duck-typed: any object exposing a ``tag`` attribute
    qualifies, whatever its class.
    """
    absent = object()  # unique marker so that a tag of None still counts
    return getattr(element, 'tag', absent) is not absent

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

121

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

122

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

123

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        # The mutable default is safe here: *attrib* is never mutated, only
        # copied, so the shared default dict always stays empty.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)  # keyword attributes override *attrib* entries
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0  # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an integer or a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index* (an integer or a slice).

        No type validation is performed here, unlike append(), insert()
        and extend().

        """
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index* (an integer or a slice)."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable yielding zero or more elements; one-shot
        iterators such as generators are supported.

        Raises TypeError if any item is not an Element; in that case no
        item is appended.

        """
        # Materialize first: *elements* is walked twice (validate, then
        # append), and a one-shot iterator would be exhausted by the
        # validation pass, leaving nothing to append.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.  (_Element_Py is a module-level name
        # bound outside this class.)
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned list is the
        element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is whatever the attribute
        dictionary's keys() returns (a dict view, not a list).

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is whatever the attribute
        dictionary's items() returns (a dict view, not a list).

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements)

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None  # "*" is a wildcard: match every element
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Pending deprecation) Return list(self.iter(tag))."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None yield no text;
        # the Comment and ProcessingInstruction factories in this module
        # create such elements (their tag is the factory function).
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

440

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

441

442

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built through parent.makeelement() and is returned
    after being appended.

    """
    # Work on a copy so that neither the caller's dict nor the shared
    # default is modified; keyword attributes win over *attrib* entries.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

459

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

460

461

def Comment(text=None):
    """Comment element factory.

    Build a special element whose tag is the Comment function itself and
    whose text is *text*; the standard serializer writes such elements out
    as XML comments.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

474

475

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element whose tag is the ProcessingInstruction function
    itself; the standard serializer writes such elements out as XML
    processing instructions.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    The element's text is *target*, followed by a space and *text* when
    *text* is non-empty.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

PI = ProcessingInstruction

492

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

493

494

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing operate on the wrapped text, so a QName compares
    equal to another QName or to a plain string with the same text.

    """
    def __init__(self, text_or_uri, tag=None):
        # With a (truthy) tag the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps *other* when it is a QName and otherwise
    # compares the wrapped text against *other* directly.
    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

538

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

539

# --------------------------------------------------------------------

540

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

541

542

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

543

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

544

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

545

This class also provides support for serialization to and from

546

standard XML.

547

548

*element* is an optional root element node,

549

*file* is an optional file handle or file name of an XML file whose

550

contents will be used to initialize the tree with.

551

552

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

553

def __init__(self, element=None, file=None):
    """Store *element* as the root, then load *file* if one is given.

    When *file* is truthy it is passed to parse(), which replaces the
    root set from *element*.
    """
    self._root = element  # first node
    if not file:
        return
    self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

559

def getroot(self):
    """Return the root element currently held by this tree."""
    root = self._root
    return root

562

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

563

def _setroot(self, element):
    """Replace root element of this tree.

    The tree's reference to its previous root is dropped and *element*
    takes its place; *element* is not validated.  Use with care!

    """
    setattr(self, "_root", element)

572

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

573

def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object (anything with a read()
    method is treated as a file object), *parser* is an optional parser
    instance that defaults to XMLParser.

    Errors raised by the parser (ParseError, per the module documentation)
    propagate to the caller.

    Returns the root element of the given source document, which also
    becomes the root of this tree.

    """
    opened_here = not hasattr(source, "read")
    if opened_here:
        # *source* is a file name: open it ourselves, close it when done.
        source = open(source, "rb")
    try:
        if parser is None:
            # If no parser was specified, create a default XMLParser
            parser = XMLParser()
            if hasattr(parser, '_parse_whole'):
                # The default XMLParser, when it comes from an accelerator,
                # can define an internal _parse_whole API for efficiency.
                # It can be used to parse the whole source without feeding
                # it with chunks.
                self._root = parser._parse_whole(source)
                return self._root
        # Feed the parser in 64 KiB chunks until the source is exhausted.
        chunk = source.read(65536)
        while chunk:
            parser.feed(chunk)
            chunk = source.read(65536)
        self._root = parser.close()
        return self._root
    finally:
        # Only close what this method opened; caller-owned files stay open.
        if opened_here:
            source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

609

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

610

def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    Delegates to the root element's iter(), which loops over all elements
    in this tree in document order.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    # The tree must already have a root; with no root set this fails
    # with AttributeError on None.
    root = self._root
    return root.iter(tag)

621

622

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

623

def getiterator(self, tag=None):
    """(Pending deprecation) Return list(self.iter(tag)).

    Kept for compatibility; emits PendingDeprecationWarning on every call.
    """
    # Change for a DeprecationWarning in 1.4
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    matches = self.iter(tag)
    return list(matches)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

631

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

632

def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Same as getroot().find(path), which is Element.find()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return the first matching element, or None if no element was found.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root
        # element, and the caller is warned about the rewrite.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
            )
    return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

653

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

654

def findtext(self, path, default=None, namespaces=None):
    """Find text of the first matching element by tag name or path.

    Same as getroot().findtext(path), which is Element.findtext()

    *path* is a string having either an element tag or an XPath,
    *default* is the value handed to Element.findtext() as the fallback,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever Element.findtext() returns for the root element: the
    text of the first matching element, or *default* if no element was
    found.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root
        # element, and the caller is warned about the rewrite.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
            )
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

675

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

676

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().findall(path), which is Element.findall().

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return list containing all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root
        # element, and the caller is warned about the rewrite.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
            )
    return self._root.findall(path, namespaces)

697

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

698

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().iterfind(path), which is element.iterfind()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root
        # element, and the caller is warned about the rewrite.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
            )
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

719

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

720

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII)

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a key of the serializer table.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        if method == "c14n":
            encoding = "utf-8"
        else:
            encoding = "us-ascii"
    # The lower-cased name drives every comparison; the caller's original
    # spelling is what gets written into the XML declaration.
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as write:
        if method == "xml" and (xml_declaration or
                (xml_declaration is None and
                 enc_lower not in ("utf-8", "us-ascii", "unicode"))):
            declared_encoding = encoding
            if enc_lower == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            write("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(write, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serialize = _serialize[method]
            serialize(write, self._root, qnames, namespaces,
                      short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

777

778

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Returns whatever self.write() returns.

    """
    # lxml.etree compatibility.  use output method instead
    return self.write(file, method="c14n")

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

781

782

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

783

# serialization support

784

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

785

@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object exposing a ``write`` attribute
    or a value passed straight to ``open()``.  *encoding* is compared with
    the literal "unicode": in that case text is written as is, otherwise
    the text is encoded with *encoding* using the "xmlcharrefreplace"
    error handler.

    Files opened here are closed on exit; caller-supplied file objects are
    left open (the temporary wrappers are detached instead of closed).

    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses this methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write

836

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

837

def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in the tree at *elem*.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each tag,
    attribute key and QName value/text found in the tree to its serialized
    ``prefix:local`` form, *namespaces* maps namespace URIs to prefixes.

    Raises ValueError if *default_namespace* is given and a non-qualified
    name is encountered; non-serializable tags are reported through
    _raise_serialization_error().

    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces

897

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

898

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is called with each text fragment.  *qnames* maps tags and
    attribute names to their serialized form; *namespaces* maps URIs to
    prefixes and is emitted as ``xmlns`` attributes on this element only
    (children are serialized with ``None``).  If *short_empty_elements*
    is true, an element with no text and no children is written as a
    self-closed tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No serialized name: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

947

948

# Lower-cased tag names for which _serialize_html() writes no end tag.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

956

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is called with each text fragment.  *qnames* maps tags and
    attribute names to their serialized form; *namespaces* maps URIs to
    prefixes and is emitted as ``xmlns`` attributes on this element only.
    Text inside ``script`` and ``style`` elements is written unescaped,
    and elements whose lower-cased tag is in HTML_EMPTY get no end tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No serialized name: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

1005

1006

def _serialize_text(write, elem):
    """Write every fragment of elem.itertext(), then elem.tail if set."""
    for part in elem.itertext():
        write(part)
    if elem.tail:
        write(elem.tail)

# Dispatch table from output method name to its serializer function.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1019

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1020

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1021

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted during the scan.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

1039

1040

# Global registry mapping namespace URIs to their preferred prefixes.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1054

1055

def _raise_serialization_error(text):
    """Raise TypeError naming the repr and type of the offending *text*."""
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )

1059

1060

def _escape_cdata(text):
    """Return *text* with ``&``, ``<`` and ``>`` replaced by XML entities.

    A value that does not support the string operations used here is
    reported through _raise_serialization_error() (TypeError).

    """
    # escape character data
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1075

1076

def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    ``&``, ``<``, ``>`` and ``"`` become entities and a newline becomes
    the character reference ``&#10;``.  A value that does not support the
    string operations used here is reported through
    _raise_serialization_error() (TypeError).

    """
    # escape attribute value
    try:
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1092

1093

def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    ``&``, ``>`` and ``"`` become entities; ``<`` is left untouched.  A
    value that does not support the string operations used here is
    reported through _raise_serialization_error() (TypeError).

    """
    # escape attribute value
    try:
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1105

1106

# --------------------------------------------------------------------

1107

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1108

def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n".

    Returns an (optionally) encoded string containing the XML data.

    """
    # The buffer type has to match what the writer will emit: text for
    # "unicode", bytes for every other encoding.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1126

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1127

class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference."""

    def __init__(self, lst):
        # The caller keeps its own reference to *lst* and reads the
        # written chunks from it.
        self.lst = lst

    def writable(self):
        return True

    def seekable(self):
        return True

    def write(self, b):
        self.lst.append(b)

    def tell(self):
        # Reports the number of chunks written so far, not a byte offset.
        return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1144

def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Serialize *element* and return the written chunks as a list.

    Takes the same arguments as tostring(); the chunks are whatever
    ElementTree.write() hands to the list-backed stream, in write order.

    """
    lst = []
    stream = _ListDataStream(lst)
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return lst

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1151

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1152

1153

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout, encoding="unicode")
    # Terminate the output with a newline unless the root's tail already
    # ends with one.
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")

1170

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1171

# --------------------------------------------------------------------

1172

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1173

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1174

1175

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    tree = ElementTree()
    tree.parse(source, parser)
    return tree

1187

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1188

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1189

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This class also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the source has been fully consumed, and is
    then set to the value returned by closing the parser.

    """
    # The ABC lives in collections.abc; the alias in the top-level
    # collections namespace no longer exists on Python 3.10+.
    from collections.abc import Iterator

    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        # Nothing here runs until the first next() call, so ``source``,
        # ``close_source`` and ``it`` (all bound further down) are set by
        # the time they are read.
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            if close_source:
                source.close()

    # A generator object cannot carry the extra ``root`` attribute, so its
    # __next__ is exposed through a small Iterator subclass instead.
    class IterParseIterator(Iterator):
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return it

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1235

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1236

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1237

class XMLPullParser:
    """Parser that queues parse events for the caller to pull.

    Data is pushed in with feed(); the events produced so far are
    consumed through read_events().  *events* is the sequence of event
    names to report; if omitted, only "end" events are reported.

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                # Queue the error so read_events() raises it at this
                # point in the event stream.
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # Dropping the parser is what makes later feed() calls fail.
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        events = self._events_queue
        while events:
            event = events.popleft()
            if isinstance(event, Exception):
                raise event
            else:
                yield event

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1288

1289

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1290

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()

1305

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1306

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1307

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        # Elements without an "id" attribute, or with an empty one, are
        # left out; a later duplicate id replaces an earlier one.
        if elem_id:
            ids[elem_id] = elem
    return tree, ids

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1328

# Parse XML document from string constant. Alias for XML().

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1329

fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1330

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1331

def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence, *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for text in sequence:
        parser.feed(text)
    return parser.close()

1345

1346

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1347

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1348

1349

class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    """

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        """Attach collected character data to the last element seen.

        The joined data becomes the *tail* of the last element when the
        most recent event was an end tag, otherwise its *text*.  Data
        collected before any element exists is discarded.

        """
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # Nest the new element under the currently open one.
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1422

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1423

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1424

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (deprecated and not supported),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat reports qualified names as
        # "uri}local", which _fixname() turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Raises ValueError for an event name other than "start", "end",
        # "start-ns" or "end-ns".
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            # Each handler binds its collaborators as default arguments so
            # that it captures the values of this loop iteration.
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                def handler(prefix, uri, event=event_name, append=append):
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                def handler(prefix, event=event_name, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its error
        # code and (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # Handler for expat's EndElementHandler.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Handler for expat's DefaultHandlerExpand.  Resolves entity
        # references through self.entity and collects the pieces of a
        # doctype declaration so it can be reported to the target.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                # Use the error class stored by __init__ rather than
                # importing xml.parsers.expat again: that import is not
                # available when __init__ fell back to pyexpat.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # The identifiers arrive quoted; [1:-1] strips the quotes.
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated) Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        """
        warnings.warn(
            "This method of XMLParser is deprecated. Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1633

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1634

1635

# Import the C accelerators

1636

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

1637

# Element is going to be shadowed by the C implementation. We need to keep

1638

# the Python version of it accessible for some "creative" use by external code

1639

# (see tests)

1640

_Element_Py = Element

1641

Florent Xicluna