Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attrib' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

# Names exported by ``from <this module> import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "register_namespace",
]

# Version string of this module.
VERSION = "1.3.0"

93

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

94

import sys

95

import re

96

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

97

import io

98

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

99

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

100

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

101

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

103

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, a ParseError is documented to carry
    two extra attributes (this class does not set them itself; the code
    raising the error is expected to attach them):
        'code' - the specific exception code
        'position' - the line and column of the error

    """

113

114

# --------------------------------------------------------------------

115

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

116

117

def iselement(element):
    """Report whether *element* looks like an Element.

    The check is duck-typed: any object exposing a ``tag`` attribute
    qualifies, whatever its class.
    """
    has_tag = hasattr(element, 'tag')
    return has_tag

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

120

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

121

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

122

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements. That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name. *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag. This is either a string or the value None. Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag* with a private attribute dict.

        Raises TypeError if *attrib* is not a dict.
        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Copy before merging so neither the caller's dict nor the shared
        # default ``{}`` is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        # Slice assignment copies the child references, not the children.
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an int or a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*; no type check is performed."""
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements. It is consumed
        exactly once, so one-shot iterators and generators are supported.

        Raises TypeError if any item is not an Element; in that case no item
        is appended.

        """
        # Materialise first: validating and then extending from the same
        # one-shot iterator would exhaust it during validation and silently
        # append nothing.
        new_children = list(elements)
        for element in new_children:
            self._assert_is_element(element)
        self._children.extend(new_children)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.
        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of ``_Element_Py``."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        # NOTE(review): _Element_Py is not defined in this class; presumably
        # it is bound at module level to this pure-Python class - confirm.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents. To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order. The returned list is the
        element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found. Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently. *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if attribute was not
        found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently. *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is whatever that call
        returns for the underlying attribute dictionary.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is whatever that call
        returns for the underlying attribute dictionary.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included. To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Pending deprecation) Return list(self.iter(tag))."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
            )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # Comment / ProcessingInstruction factories) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

437

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

438

439

def SubElement(parent, tag, attrib={}, **extra):
    """Create a subelement and attach it to *parent*.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, and *extra* are
    additional attributes given as keyword arguments.

    The new element is built with ``parent.makeelement`` and added with
    ``parent.append``; it is also returned to the caller. *attrib* itself is
    left untouched because a copy is merged with *extra*.

    """
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

456

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

457

458

def Comment(text=None):
    """Comment element factory.

    Build a special element whose tag is this factory function itself and
    whose text is *text*, the comment string. The original documentation
    states that the standard serializer writes such an element as an XML
    comment.

    """
    node = Element(Comment)
    node.text = text
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

471

472

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element whose tag is this factory function itself.
    NOTE(review): the serializer is not visible here; presumably it writes
    such an element as an XML processing instruction - confirm.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any. When
    *text* is non-empty the element's text is "<target> <text>", otherwise
    it is just *target*.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

# Short alias for the factory above.
PI = ProcessingInstruction

489

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

490

491

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their ``text``; the other operand of a
    comparison may be another QName or a plain value.

    """

    def __init__(self, text_or_uri, tag=None):
        # With a tag, combine both parts into the "{uri}local" form.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps the right-hand side when it is a QName and
    # otherwise compares this instance's text with the operand as given.

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

535

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

536

# --------------------------------------------------------------------

537

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

538

539

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

540

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

541

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

542

This class also provides support for serialization to and from

543

standard XML.

544

545

*element* is an optional root element node,

546

*file* is an optional file handle or file name of an XML file whose

547

contents will be used to initialize the tree with.

548

549

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

550

def __init__(self, element=None, file=None):
    """Store *element* as the root and, if *file* is truthy, parse it.

    *file* is handed unchanged to self.parse(), which then replaces the
    root set from *element*.
    """
    self._root = element # first node
    if not file:
        return
    self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

556

def getroot(self):
    """Return the root element currently held by this tree."""
    root = self._root
    return root

559

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

560

def _setroot(self, element):
    """Replace root element of this tree.

    The reference to the previous root is dropped and *element* takes its
    place; no validation of *element* is performed. Use with care!

    """
    self._root = element

569

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

570

def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object (anything with a ``read``
    attribute is used as-is; anything else is opened in binary mode and
    closed again before returning). *parser* is an optional parser
    instance that defaults to XMLParser.

    The original documentation states that ParseError is raised if the
    parser fails to parse the document.

    Returns the root element of the given source document and stores it as
    this tree's root.

    """
    opened_here = not hasattr(source, "read")
    if opened_here:
        source = open(source, "rb")
    try:
        if parser is None:
            # If no parser was specified, create a default XMLParser
            parser = XMLParser()
            if hasattr(parser, '_parse_whole'):
                # The default XMLParser, when it comes from an accelerator,
                # can define an internal _parse_whole API for efficiency.
                # It can be used to parse the whole source without feeding
                # it with chunks.
                self._root = parser._parse_whole(source)
                return self._root
        # Feed the parser in 64 KiB chunks until read() returns an empty
        # result.
        chunk = source.read(65536)
        while chunk:
            parser.feed(chunk)
            chunk = source.read(65536)
        self._root = parser.close()
        return self._root
    finally:
        # Only close what this method opened; caller-owned file objects
        # stay open.
        if opened_here:
            source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

606

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

607

def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    This simply delegates to the root element's iter(), so the tree must
    already have a root.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    root = self._root
    return root.iter(tag)

618

619

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

620

def getiterator(self, tag=None):
    """(Pending deprecation) Return list(self.iter(tag)).

    Emits a PendingDeprecationWarning attributed to the caller.
    """
    # Change for a DeprecationWarning in 1.4
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    matches = self.iter(tag)
    return list(matches)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

628

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

629

def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Delegates to the root element's find() with the same arguments.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is emitted.

    Return whatever the root element's find() returns (documented as the
    first matching element, or None if no element was found).

    """
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    root = self._root
    return root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

650

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

651

def findtext(self, path, default=None, namespaces=None):
    """Find text of first matching element by tag name or path.

    Same as getroot().findtext(path), which is Element.findtext()

    *path* is a string having either an element tag or an XPath,
    *default* is passed through to Element.findtext(),
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever the root element's findtext() returns for these
    arguments.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root,
        # and the caller is warned that this behaviour may change.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
        )
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

672

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

673

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().findall(path), which is Element.findall().

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return list containing all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root,
        # and the caller is warned that this behaviour may change.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
        )
    return self._root.findall(path, namespaces)

694

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

695

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().iterfind(path), which is element.iterfind()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root,
        # and the caller is warned that this behaviour may change.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
        )
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

716

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

717

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a registered serializer.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        if method == "c14n":
            encoding = "utf-8"
        else:
            encoding = "us-ascii"
    # Comparisons are case-insensitive, but the declaration below keeps
    # the spelling the caller supplied.
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as write:
        if method == "xml" and (xml_declaration or
                (xml_declaration is None and
                 enc_lower not in ("utf-8", "us-ascii", "unicode"))):
            declared_encoding = encoding
            if enc_lower == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            write("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(write, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serialize = _serialize[method]
            serialize(write, self._root, qnames, namespaces,
                      short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

774

775

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Equivalent to ``self.write(file, method="c14n")``.

    """
    # lxml.etree compatibility.  use output method instead
    return self.write(file, method="c14n")

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

778

779

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

780

# serialization support

781

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

782

@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by open(). *encoding* is compared against the
    literal "unicode": in that case text is written as is, otherwise the
    output is encoded with *encoding*, using "xmlcharrefreplace" for
    unencodable characters.

    Files opened here are closed on exit. Wrappers created around a
    caller-supplied object are detached on exit, so the caller's object
    stays open.

    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses this methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write

833

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

834

def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Walks ``elem.iter()`` and inspects every tag, attribute key, attribute
    value and text that is a QName (or, for tags and keys, a string).

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each name to
    its serialized ``prefix:local`` form, *namespaces* maps each
    namespace URI to its prefix.

    Raises ValueError if *default_namespace* is given and a
    non-qualified name is encountered. Unsupported tag types are
    reported through _raise_serialization_error().

    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        # generate a prefix for an unregistered URI
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for elem in elem.iter():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces

894

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

895

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is called with each output fragment. *qnames* maps names to
    their serialized form. *namespaces* maps URIs to prefixes and is only
    passed for the outermost call; recursive calls pass None so the
    xmlns declarations are emitted once. *short_empty_elements* selects
    ``<tag />`` over ``<tag></tag>`` for elements with no text and no
    children.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag to emit: write only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

944

945

# Lower-case tag names for which the HTML serializer writes no end tag.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

953

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    Differs from the XML serializer in that the text of ``script`` and
    ``style`` elements is written unescaped, attribute values are escaped
    with _escape_attrib_html(), a start tag is always closed with ``>``,
    and no end tag is written for tags listed in HTML_EMPTY.

    *namespaces* is only passed for the outermost call; recursive calls
    pass None.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag to emit: write only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

1002

1003

def _serialize_text(write, elem):
    """Write every part of ``elem.itertext()``, then ``elem.tail`` if set."""
    for part in elem.itertext():
        write(part)
    if elem.tail:
        write(elem.tail)

# Maps an output method name to the function that implements it.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1016

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1017

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1018

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # Prefixes of the form "ns<digits>" are generated by the serializer.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted in the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1051

1052

def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* (and its type) cannot be serialized."""
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )

1056

1057

def _escape_cdata(text):
    """Return *text* with ``&``, ``<`` and ``>`` replaced by entities.

    A value that does not support the string operations used here is
    reported through _raise_serialization_error().

    """
    # escape character data
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1072

1073

def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    Replaces ``&``, ``<``, ``>`` and ``"`` with entities and a newline
    with the character reference ``&#10;``. A value that does not
    support the string operations used here is reported through
    _raise_serialization_error().

    """
    # escape attribute value
    try:
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1089

1090

def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    Replaces ``&``, ``>`` and ``"`` with entities; ``<`` is left as is.
    A value that does not support the string operations used here is
    reported through _raise_serialization_error().

    """
    # escape attribute value
    try:
        # "&" must be handled first so the entities added below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

1102

1103

# --------------------------------------------------------------------

1104

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1105

def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n".
    *short_empty_elements* is passed through to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # The in-memory stream type has to match what write() will emit:
    # text for "unicode", bytes for everything else.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1123

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1124

class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference."""

    def __init__(self, lst):
        # The caller keeps its own reference to *lst* and reads the
        # written chunks from it.
        self.lst = lst

    def writable(self):
        return True

    def seekable(self):
        return True

    def write(self, b):
        self.lst.append(b)

    def tell(self):
        # The position is reported as the number of chunks written.
        return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1141

def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate the serialized form of *element* as a list of chunks.

    Takes the same arguments as tostring(), but the output written by
    ElementTree.write() is collected chunk by chunk into a list instead
    of being joined.

    Returns the list of written chunks.

    """
    lst = []
    stream = _ListDataStream(lst)
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return lst

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1148

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1149

1150

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout, encoding="unicode")
    # Make sure the output ends with a newline.
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")

1167

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1168

# --------------------------------------------------------------------

1169

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1170

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1171

1172

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    tree = ElementTree()
    tree.parse(source, parser)
    return tree

1184

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1185

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1186

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This class also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Do not leak a file that was opened here if the iterator cannot
        # be constructed.
        if close_source:
            source.close()
        raise

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1206

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1207

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1208

class XMLPullParser:
    """Parser that is fed data by the caller and queues parse events.

    Data is pushed in with feed(); the events produced so far are pulled
    out with read_events(). *events* is the sequence of event names to
    report, defaulting to ``("end",)``.

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        self._index = 0
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                # Queue the error so read_events() raises it after the
                # events that were queued before it.
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        events = self._events_queue
        while True:
            index = self._index
            try:
                event = events[self._index]
                # Avoid retaining references to past events
                events[self._index] = None
            except IndexError:
                break
            index += 1
            # Compact the list in a O(1) amortized fashion
            # As noted above, _elementree.c needs a list, not a deque
            if index * 2 >= len(events):
                events[:index] = []
                self._index = 0
            else:
                self._index = index
            if isinstance(event, Exception):
                raise event
            else:
                yield event

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1275

1276

Antoine Pitrou

0acbcb5

2013-08-23 23:04:30 +0200

[diff] [blame]

1277

class _IterParseIterator:
    """Iterator behind iterparse(): reads *source* in chunks and yields events.

    *source* must provide ``read()``. When *close_source* is true the
    iterator owns the file and closes it when iteration ends, whether
    normally or because of an error. The ``root`` attribute is None until
    iteration has finished, then holds the value returned when the parser
    was closed.

    """

    def __init__(self, source, events, parser, close_source=False):
        # Use the internal, undocumented _parser argument for now; When the
        # parser argument of iterparse is removed, this can be killed.
        self._parser = XMLPullParser(events=events, _parser=parser)
        self._file = source
        self._close_file = close_source
        self.root = self._root = None

    def __next__(self):
        try:
            while True:
                # Hand out one queued event per call, if there is one.
                for event in self._parser.read_events():
                    return event
                if self._parser._parser is None:
                    # The parser was closed and the queue is drained.
                    break
                # load event buffer
                data = self._file.read(16 * 1024)
                if data:
                    self._parser.feed(data)
                else:
                    self._root = self._parser._close_and_return_root()
            self.root = self._root
        except BaseException:
            # Do not leak an owned file when reading or parsing fails.
            if self._close_file:
                self._file.close()
            raise
        if self._close_file:
            self._file.close()
        raise StopIteration

    def __iter__(self):
        return self

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1305

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1306

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1307

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()

1322

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1323

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1324

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        # Elements without an "id" attribute, or with an empty one, are
        # skipped; for duplicate ids the last element visited wins.
        if elem_id:
            ids[elem_id] = elem
    return tree, ids

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1345

# Parse XML document from string constant.  Alias for XML().
fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1347

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1348

def fromstringlist(sequence, parser=None):
    """Build an element structure from a sequence of XML string fragments.

    *sequence* is a list or other iterable of fragments, fed to the parser
    one at a time in iteration order.  *parser* is an optional parser
    instance; when it is omitted (or falsy) a standard XMLParser writing
    into a fresh TreeBuilder is created.

    Returns the result of the parser's close() method, which is an
    Element instance for the standard parser.

    """
    active = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()

1362

1363

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1364

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1365

1366

class TreeBuilder:
    """Generic builder that assembles an element structure from events.

    A stream of start(), data() and end() calls is turned into a
    well-formed tree of elements.  The class can sit behind a custom XML
    parser, or behind a parser for some other XML-like format.

    *element_factory* is an optional callable invoked as
    ``element_factory(tag, attrs)`` whenever a new element is needed;
    it defaults to Element.

    """

    def __init__(self, element_factory=None):
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory
        self._data = []  # pending text chunks, joined lazily by _flush()
        self._elem = []  # stack of currently open elements
        self._last = None  # most recently opened or closed element
        self._tail = None  # true once an end tag has been seen

    def close(self):
        """Return the toplevel element once every element has been closed."""
        assert len(self._elem) == 0, "missing end tags"
        root = self._last
        assert root is not None, "missing toplevel element"
        return root

    def _flush(self):
        # Attach buffered text to the last element: as its tail when we are
        # past its end tag, as its text otherwise.  Text collected before any
        # element exists is simply discarded.
        chunks = self._data
        if not chunks:
            return
        owner = self._last
        if owner is not None:
            joined = "".join(chunks)
            if self._tail:
                assert owner.tail is None, "internal error (tail)"
                owner.tail = joined
            else:
                assert owner.text is None, "internal error (text)"
                owner.text = joined
        self._data = []

    def data(self, data):
        """Buffer a chunk of text for the current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open a new element and return it.

        *tag* is the element name and *attrs* is a dict of the element's
        attributes; both are handed to the element factory unchanged.

        """
        self._flush()
        node = self._factory(tag, attrs)
        self._last = node
        open_stack = self._elem
        if open_stack:
            # Nest the new element under the innermost open element.
            open_stack[-1].append(node)
        open_stack.append(node)
        self._tail = 0
        return node

    def end(self, tag):
        """Close the current element and return it.

        *tag* is the element name; it must match the tag of the element
        being closed.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1439

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1440

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1441

class XMLParser:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1442

"""Element structure builder for XML source data based on the expat parser.

1443

1444

*html* are predefined HTML entities (not supported currently),

1445

*target* is an optional target object which defaults to an instance of the

1446

standard TreeBuilder class, *encoding* is an optional encoding string

1447

which if given, overrides the encoding specified in the XML file:

1448

http://www.iana.org/assignments/character-sets

1449

1450

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1451

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1452

def __init__(self, html=0, target=None, encoding=None):

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1453

try:

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1454

from xml.parsers import expat

Brett Cannon

cd171c8

2013-07-04 17:43:24 -0400

[diff] [blame]

1455

except ImportError:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1456

try:

1457

import pyexpat as expat

Brett Cannon

cd171c8

2013-07-04 17:43:24 -0400

[diff] [blame]

1458

except ImportError:

1459

raise ImportError(

1460

"No module named expat; use SimpleXMLTreeBuilder instead"

1461

)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1462

parser = expat.ParserCreate(encoding, "}")

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1463

if target is None:

1464

target = TreeBuilder()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1465

# underscored names are provided for compatibility only

1466

self.parser = self._parser = parser

1467

self.target = self._target = target

1468

self._error = expat.error

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1469

self._names = {} # name memo cache

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1470

# main callbacks

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1471

parser.DefaultHandlerExpand = self._default

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1472

if hasattr(target, 'start'):

1473

parser.StartElementHandler = self._start

1474

if hasattr(target, 'end'):

1475

parser.EndElementHandler = self._end

1476

if hasattr(target, 'data'):

1477

parser.CharacterDataHandler = target.data

1478

# miscellaneous callbacks

1479

if hasattr(target, 'comment'):

1480

parser.CommentHandler = target.comment

1481

if hasattr(target, 'pi'):

1482

parser.ProcessingInstructionHandler = target.pi

Eli Bendersky

2013-08-25 18:58:18 -0700

[diff] [blame]

1483

# Configure pyexpat: buffering, new-style attribute handling.

1484

parser.buffer_text = 1

1485

parser.ordered_attributes = 1

1486

parser.specified_attributes = 1

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1487

self._doctype = None

1488

self.entity = {}

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1489

try:

1490

self.version = "Expat %d.%d.%d" % expat.version_info

1491

except AttributeError:

1492

pass # unknown

1493

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1494

def _setevents(self, events_queue, events_to_report):

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1495

# Internal API for XMLPullParser

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1496

# events_to_report: a list of events to report during parsing (same as

Eli Bendersky

2013-08-30 05:51:20 -0700

[diff] [blame]

1497

# the *events* of XMLPullParser's constructor.

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1498

# events_queue: a list of actual parsing events that will be populated

1499

# by the underlying parser.

1500

#

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1501

parser = self._parser

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1502

append = events_queue.append

1503

for event_name in events_to_report:

1504

if event_name == "start":

Eli Bendersky

c9f5ca2

2013-04-20 09:11:37 -0700

[diff] [blame]

1505

parser.ordered_attributes = 1

1506

parser.specified_attributes = 1

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1507

def handler(tag, attrib_in, event=event_name, append=append,

Eli Bendersky

2013-08-25 18:58:18 -0700

[diff] [blame]

1508

start=self._start):

Eli Bendersky

c9f5ca2

2013-04-20 09:11:37 -0700

[diff] [blame]

1509

append((event, start(tag, attrib_in)))

1510

parser.StartElementHandler = handler

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1511

elif event_name == "end":

1512

def handler(tag, event=event_name, append=append,

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1513

end=self._end):

1514

append((event, end(tag)))

1515

parser.EndElementHandler = handler

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1516

elif event_name == "start-ns":

1517

def handler(prefix, uri, event=event_name, append=append):

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1518

append((event, (prefix or "", uri or "")))

1519

parser.StartNamespaceDeclHandler = handler

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1520

elif event_name == "end-ns":

1521

def handler(prefix, event=event_name, append=append):

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1522

append((event, None))

1523

parser.EndNamespaceDeclHandler = handler

1524

else:

Eli Bendersky

2013-05-19 09:01:49 -0700

[diff] [blame]

1525

raise ValueError("unknown event %r" % event_name)

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1526

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1527

def _raiseerror(self, value):

1528

err = ParseError(value)

1529

err.code = value.code

1530

err.position = value.lineno, value.offset

1531

raise err

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1532

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1533

def _fixname(self, key):

1534

# expand qname, and convert name string to ascii, if possible

1535

try:

1536

name = self._names[key]

except KeyError:

name = key

if "}" in name:

name = "{" + name

Martin v. Löwis

f30bb0e

2007-07-28 11:40:46 +0000

[diff] [blame]

1541

self._names[key] = name

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1542

return name

1543

Eli Bendersky

2013-08-25 18:58:18 -0700

[diff] [blame]

1544

def _start(self, tag, attr_list):

1545

# Handler for expat's StartElementHandler. Since ordered_attributes

1546

# is set, the attributes are reported as a list of alternating

1547

# attribute name,value.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1548

fixname = self._fixname

1549

tag = fixname(tag)

1550

attrib = {}

Eli Bendersky

2013-08-25 18:58:18 -0700

[diff] [blame]

1551

if attr_list:

1552

for i in range(0, len(attr_list), 2):

1553

attrib[fixname(attr_list[i])] = attr_list[i+1]

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1554

return self.target.start(tag, attrib)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1555

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1556

def _end(self, tag):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1557

return self.target.end(self._fixname(tag))

1558

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1559

def _default(self, text):

1560

prefix = text[:1]

1561

if prefix == "&":

1562

# deal with undefined entities

1563

try:

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1564

data_handler = self.target.data

1565

except AttributeError:

1566

return

1567

try:

1568

data_handler(self.entity[text[1:-1]])

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1569

except KeyError:

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1570

from xml.parsers import expat

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1571

err = expat.error(

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1572

"undefined entity %s: line %d, column %d" %

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1573

(text, self.parser.ErrorLineNumber,

1574

self.parser.ErrorColumnNumber)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1575

)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1576

err.code = 11 # XML_ERROR_UNDEFINED_ENTITY

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1577

err.lineno = self.parser.ErrorLineNumber

1578

err.offset = self.parser.ErrorColumnNumber

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1579

raise err

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1580

elif prefix == "<" and text[:9] == "<!DOCTYPE":

1581

self._doctype = [] # inside a doctype declaration

1582

elif self._doctype is not None:

1583

# parse doctype contents

1584

if prefix == ">":

1585

self._doctype = None

1586

return

Neal Norwitz

9d72bb4

2007-04-17 08:48:32 +0000

[diff] [blame]

1587

text = text.strip()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1588

if not text:

1589

return

1590

self._doctype.append(text)

1591

n = len(self._doctype)

1592

if n > 2:

1593

type = self._doctype[1]

1594

if type == "PUBLIC" and n == 4:

1595

name, type, pubid, system = self._doctype

Florent Xicluna

a1c974a

2012-07-07 13:16:44 +0200

[diff] [blame]

1596

if pubid:

1597

pubid = pubid[1:-1]

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1598

elif type == "SYSTEM" and n == 3:

1599

name, type, system = self._doctype

1600

pubid = None

1601

else:

1602

return

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1603

if hasattr(self.target, "doctype"):

1604

self.target.doctype(name, pubid, system[1:-1])

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1605

elif self.doctype != self._XMLParser__doctype:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1606

# warn about deprecated call

1607

self._XMLParser__doctype(name, pubid, system[1:-1])

1608

self.doctype(name, pubid, system[1:-1])

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1609

self._doctype = None

1610

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1611

def doctype(self, name, pubid, system):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1612

"""(Deprecated) Handle doctype declaration

1613

1614

*name* is the Doctype name, *pubid* is the public identifier,

1615

and *system* is the system identifier.

1616

1617

"""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1618

warnings.warn(

1619

"This method of XMLParser is deprecated. Define doctype() "

1620

"method on the TreeBuilder target.",

DeprecationWarning,

)

# sentinel, if doctype is redefined in a subclass

1625

__doctype = doctype

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1626

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1627

def feed(self, data):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1628

"""Feed encoded data to parser."""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1629

try:

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1630

self.parser.Parse(data, 0)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1631

except self._error as v:

1632

self._raiseerror(v)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1633

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1634

def close(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1635

"""Finish feeding data to parser and return element structure."""

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1636

try:

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1637

self.parser.Parse("", 1) # end of data

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1638

except self._error as v:

1639

self._raiseerror(v)

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1640

try:

Florent Xicluna

fb06746

2012-03-05 11:42:49 +0100

[diff] [blame]

1641

close_handler = self.target.close

1642

except AttributeError:

1643

pass

1644

else:

1645

return close_handler()

Florent Xicluna

2012-03-05 10:42:19 +0100

[diff] [blame]

1646

finally:

1647

# get rid of circular references

1648

del self.parser, self._parser

1649

del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1650

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1651

1652

# Import the C accelerators

1653

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

1654

# Element is going to be shadowed by the C implementation. We need to keep

1655

# the Python version of it accessible for some "creative" by external code

1656

# (see tests)

1657

_Element_Py = Element

1658

Florent Xicluna