Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attributes' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

36

#

37

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

38

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# fredrik@pythonware.com

41

# http://www.pythonware.com

42

#

43

# --------------------------------------------------------------------

44

# The ElementTree toolkit is

45

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

46

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

47

#

48

# By obtaining, using, and/or copying this software and/or its

49

# associated documentation, you agree that you have read, understood,

50

# and will comply with the following terms and conditions:

51

#

52

# Permission to use, copy, modify, and distribute this software and

53

# its associated documentation for any purpose and without fee is

54

# hereby granted, provided that the above copyright notice appears in

55

# all copies, and that both that copyright notice and this permission

56

# notice appear in supporting documentation, and that the name of

57

# Secret Labs AB or the author not be used in advertising or publicity

58

# pertaining to distribution of the software without specific, written

59

# prior permission.

60

#

61

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

62

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

63

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

64

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

65

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

66

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

67

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

68

# OF THIS SOFTWARE.

69

# --------------------------------------------------------------------

70

Fredrik Lundh

63168a5

2005-12-14 22:29:34 +0000

[diff] [blame]

71

# Licensed to PSF under a Contributor Agreement.

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

72

# See http://www.python.org/psf/license for licensing details.

Fredrik Lundh

63168a5

2005-12-14 22:29:34 +0000

[diff] [blame]

73

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLTreeBuilder",
    "register_namespace",
    ]

92

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

93

VERSION = "1.3.0"

94

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

95

import sys

96

import re

97

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

98

import io

99

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

100

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

101

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

103

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

104

class ParseError(SyntaxError):
    """An error when parsing an XML document.

    In addition to its exception value, a ParseError contains
    two extra attributes:
        'code'     - the specific exception code
        'position' - the line and column of the error

    Neither attribute is assigned by this class itself.

    """

114

115

# --------------------------------------------------------------------

116

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

117

118

def iselement(element):
    """Return True if *element* appears to be an Element.

    This is a duck-typing check: any object exposing a ``tag`` attribute
    qualifies, whatever its actual class.

    """
    return hasattr(element, 'tag')

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

121

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

122

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

123

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag* with a private attribute dict.

        Raises TypeError if *attrib* is not a dict.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Copy before merging *extra*: neither the caller's dict nor the
        # shared default ``{}`` is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        # Slice assignment fills the new child list with the same child
        # objects (no recursion into them).
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an int or a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index* (an int or a slice).

        No type check is performed on the assigned value(s) here.

        """
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index* (an int or a slice)."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  It may be a
        one-shot iterator or generator.

        Raises TypeError if any item is not an Element; in that case no
        item is appended.

        """
        # Materialise once: validating by iterating *elements* and then
        # handing the same object to list.extend() would append nothing
        # for an iterator, because validation has already exhausted it.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of this Python class."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned object is
        the element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is whatever that call
        returns for the underlying dict.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is whatever that call
        returns for the underlying dict.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Pending deprecation) Return list(self.iter(tag))."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
            )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory-function tags set by Comment and ProcessingInstruction)
        # contribute no text at all.
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail

# compatibility
_Element = _ElementInterface = Element

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

454

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

455

456

def SubElement(parent, tag, attrib={}, **extra):
    """Subelement factory which creates an element instance, and appends it
    to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    Return the newly created element.

    """
    # Work on a copy so the caller's dict (and the shared default) is
    # never modified when *extra* is merged in.
    attrib = attrib.copy()
    attrib.update(extra)
    # Let the parent build the child so it gets the parent's element type.
    element = parent.makeelement(tag, attrib)
    parent.append(element)
    return element

473

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

474

475

def Comment(text=None):
    """Comment element factory.

    This function creates a special element which the standard serializer
    serializes as an XML comment.

    *text* is a string containing the comment string.

    Return the new element.

    """
    # The factory function itself is used as the tag; that is how such
    # elements are told apart from ordinary string-tagged elements.
    element = Element(Comment)
    element.text = text
    return element

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

488

489

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    This function creates a special element whose tag is this factory
    function, intended to be serialized as an XML processing instruction
    (the serializer is not part of this function).

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    Return the new element.

    """
    element = Element(ProcessingInstruction)
    element.text = target
    # Target and contents are stored together in .text, space-separated;
    # an empty or None *text* leaves just the target.
    if text:
        element.text = element.text + " " + text
    return element

PI = ProcessingInstruction

506

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

507

508

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their text, both against other QName
    instances and against plain values.

    """
    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri
    def __str__(self):
        return self.text
    def __repr__(self):
        return '<QName %r>' % (self.text,)
    def __hash__(self):
        # Same hash as the wrapped text, consistent with __eq__ below.
        return hash(self.text)
    # Each comparison unwraps another QName to its text; any other operand
    # is compared with the text directly.
    def __le__(self, other):
        if isinstance(other, QName):
            return self.text <= other.text
        return self.text <= other
    def __lt__(self, other):
        if isinstance(other, QName):
            return self.text < other.text
        return self.text < other
    def __ge__(self, other):
        if isinstance(other, QName):
            return self.text >= other.text
        return self.text >= other
    def __gt__(self, other):
        if isinstance(other, QName):
            return self.text > other.text
        return self.text > other
    def __eq__(self, other):
        if isinstance(other, QName):
            return self.text == other.text
        return self.text == other
    def __ne__(self, other):
        if isinstance(other, QName):
            return self.text != other.text
        return self.text != other

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

556

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

557

# --------------------------------------------------------------------

558

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

559

560

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

561

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

562

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

563

This class also provides support for serialization to and from

564

standard XML.

565

566

*element* is an optional root element node,

567

*file* is an optional file handle or file name of an XML file whose

568

contents will be used to initialize the tree with.

569

570

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

571

def __init__(self, element=None, file=None):
    """Initialise the tree.

    *element* is stored as the root as-is (no type check is performed).
    If *file* is given and truthy, it is passed to self.parse(), which
    replaces the root with the parsed document's root.

    """
    self._root = element # first node
    if file:
        self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

577

def getroot(self):
    """Return root element of this tree (None if no root has been set)."""
    return self._root

580

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

581

def _setroot(self, element):
    """Replace root element of this tree.

    This will discard the current contents of the tree and replace it
    with the given element.  Use with care!  No type check is performed
    on *element*.

    """
    self._root = element

590

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

591

def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object, *parser* is an optional parser
    instance that defaults to XMLParser.

    ParseError is raised if the parser fails to parse the document.

    Returns the root element of the given source document.

    """
    close_source = False
    # Anything without a read() method is treated as a file name; only a
    # file opened here is closed here.
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        # Test identity, not truthiness: a caller-supplied parser object
        # that happens to be falsy (e.g. defines __len__) must still be used.
        if parser is None:
            parser = XMLParser(target=TreeBuilder())
        # Feed the parser in 64 KiB chunks until read() returns an empty
        # result.
        while True:
            data = source.read(65536)
            if not data:
                break
            parser.feed(data)
        self._root = parser.close()
        return self._root
    finally:
        if close_source:
            source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

619

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

620

def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    The iterator loops over all elements in this tree, in document order.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    The call is delegated to the root element, so a root must be set.

    """
    return self._root.iter(tag)

631

632

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

633

def getiterator(self, tag=None):
    """(Pending deprecation) Return list(self.iter(tag)).

    Emits a PendingDeprecationWarning on every call.

    """
    # Change for a DeprecationWarning in 1.4
    warnings.warn(
        "This method will be removed in future versions.  "
        "Use 'tree.iter()' or 'list(tree.iter())' instead.",
        PendingDeprecationWarning, stacklevel=2
        )
    return list(self.iter(tag))

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

641

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

642

def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Same as getroot().find(path), which is Element.find()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return the first matching element, or None if no element was found.

    """
    # A leading "/" is rewritten to "./..." (search relative to the root
    # element) and a FutureWarning announces that this will change.
    if path[:1] == "/":
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version.  If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
            )
    return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

663

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

664

def findtext(self, path, default=None, namespaces=None):
    """Find text of the first matching element by tag name or path.

    Delegates to self._root.findtext(path, default, namespaces).

    *path* is a string having either an element tag or an XPath,
    *default* is forwarded to the root element's findtext(),
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to "./..." and a FutureWarning
    is issued before the search is delegated.

    Return whatever the root element's findtext() returns.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

685

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

686

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to self._root.findall(path, namespaces).

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to "./..." and a FutureWarning
    is issued before the search is delegated.

    Return whatever the root element's findall() returns.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.findall(path, namespaces)

707

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

708

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to self._root.iterfind(path, namespaces).

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to "./..." and a FutureWarning
    is issued before the search is delegated.

    Return whatever the root element's iterfind() returns.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

729

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

730

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding; when omitted, "utf-8" is used for
                    the "c14n" method and "us-ascii" otherwise

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode. A declaration is
                           only ever written for the "xml" method.

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- forwarded to the serializer; controls the
                                formatting of elements that contain no
                                content. If True (default) they are emitted
                                as a single self-closed tag, otherwise they
                                are emitted as a pair of start/end tags

    Raises ValueError if *method* is not a key of the serializer table.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)

    if encoding:
        encoding = encoding.lower()
    else:
        encoding = "utf-8" if method == "c14n" else "us-ascii"

    # An explicit xml_declaration wins; None means "decide from encoding".
    if xml_declaration is None:
        wants_declaration = encoding not in ("utf-8", "us-ascii", "unicode")
    else:
        wants_declaration = bool(xml_declaration)

    root = self._root
    with _get_writer(file_or_filename, encoding) as emit:
        if method == "xml" and wants_declaration:
            declared_encoding = encoding
            if encoding == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            emit("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(emit, root)
            return
        qnames, namespaces = _namespaces(root, default_namespace)
        _serialize[method](emit, root, qnames, namespaces,
                           short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

788

789

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; calling write() with
    method="c14n" is the preferred spelling.

    """
    # lxml.etree compatibility. use output method instead
    result = self.write(file, method="c14n")
    return result

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

792

793

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

794

# serialization support

795

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

796

@contextlib.contextmanager

797

def _get_writer(file_or_filename, encoding):

798

# returns text write method and release all resourses after using

799

try:

800

write = file_or_filename.write

801

except AttributeError:

802

# file_or_filename is a file name

803

if encoding == "unicode":

804

file = open(file_or_filename, "w")

805

else:

806

file = open(file_or_filename, "w", encoding=encoding,

807

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

812

# encoding determines if it is a text or binary writer

813

if encoding == "unicode":

814

# use a text writer as is

815

yield write

816

else:

817

# wrap a binary writer with TextIOWrapper

818

with contextlib.ExitStack() as stack:

819

if isinstance(file_or_filename, io.BufferedIOBase):

820

file = file_or_filename

821

elif isinstance(file_or_filename, io.RawIOBase):

822

file = io.BufferedWriter(file_or_filename)

823

# Keep the original file open when the BufferedWriter is

824

# destroyed

825

stack.callback(file.detach)

826

else:

827

# This is to handle passed objects that aren't in the

828

# IOBase hierarchy, but just have a write method

829

file = io.BufferedIOBase()

830

file.writable = lambda: True

831

file.write = write

832

try:

833

# TextIOWrapper uses this methods to determine

834

# if BOM (for UTF-16, etc) should be added

835

file.seekable = file_or_filename.seekable

836

file.tell = file_or_filename.tell

837

except AttributeError:

838

pass

839

file = io.TextIOWrapper(file,

840

encoding=encoding,

841

errors="xmlcharrefreplace",

842

newline="\n")

843

# Keep the original file open when the TextIOWrapper is

844

# destroyed

845

stack.callback(file.detach)

846

yield file.write

847

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

848

def _namespaces(elem, default_namespace=None):

849

# identify namespaces used in this tree

850

851

# maps qnames to *encoded* prefix:local names

852

qnames = {None: None}

853

854

# maps uri:s to prefixes

855

namespaces = {}

856

if default_namespace:

857

namespaces[default_namespace] = ""

858

859

def add_qname(qname):

860

# calculate serialized qname representation

861

try:

862

if qname[:1] == "{":

863

uri, tag = qname[1:].rsplit("}", 1)

864

prefix = namespaces.get(uri)

865

if prefix is None:

866

prefix = _namespace_map.get(uri)

867

if prefix is None:

868

prefix = "ns%d" % len(namespaces)

869

if prefix != "xml":

870

namespaces[uri] = prefix

871

if prefix:

872

qnames[qname] = "%s:%s" % (prefix, tag)

873

else:

874

qnames[qname] = tag # default element

875

else:

876

if default_namespace:

877

# FIXME: can this be handled in XML 1.0?

878

raise ValueError(

879

"cannot use non-qualified names with "

880

"default_namespace option"

881

)

882

qnames[qname] = qname

883

except TypeError:

884

_raise_serialization_error(qname)

885

886

# populate qname and namespaces table

Eli Bendersky

64d11e6

2012-06-15 07:42:50 +0300

[diff] [blame]

887

for elem in elem.iter():

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

888

tag = elem.tag

Senthil Kumaran

ec30b3d

2010-11-09 02:36:59 +0000

[diff] [blame]

889

if isinstance(tag, QName):

890

if tag.text not in qnames:

891

add_qname(tag.text)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

892

elif isinstance(tag, str):

893

if tag not in qnames:

894

add_qname(tag)

895

elif tag is not None and tag is not Comment and tag is not PI:

896

_raise_serialization_error(tag)

897

for key, value in elem.items():

898

if isinstance(key, QName):

899

key = key.text

900

if key not in qnames:

901

add_qname(key)

902

if isinstance(value, QName) and value.text not in qnames:

903

add_qname(value.text)

904

text = elem.text

905

if isinstance(text, QName) and text.text not in qnames:

906

add_qname(text.text)

907

return qnames, namespaces

908

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

909

def _serialize_xml(write, elem, qnames, namespaces,

910

short_empty_elements, **kwargs):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

tag = elem.tag

text = elem.text

if tag is Comment:

write("" % text)

915

elif tag is ProcessingInstruction:

916

write("<?%s?>" % text)

else:

tag = qnames[tag]

if tag is None:

if text:

write(_escape_cdata(text))

922

for e in elem:

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

923

_serialize_xml(write, e, qnames, None,

924

short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

925

else:

926

write("<" + tag)

927

items = list(elem.items())

928

if items or namespaces:

929

if namespaces:

930

for v, k in sorted(namespaces.items(),

931

key=lambda x: x[1]): # sort on prefix

932

if k:

933

k = ":" + k

934

write(" xmlns%s=\"%s\"" % (

k,

_escape_attrib(v)

))

for k, v in sorted(items): # lexical order

939

if isinstance(k, QName):

940

k = k.text

941

if isinstance(v, QName):

942

v = qnames[v.text]

943

else:

944

v = _escape_attrib(v)

945

write(" %s=\"%s\"" % (qnames[k], v))

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

946

if text or len(elem) or not short_empty_elements:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

947

write(">")

948

if text:

949

write(_escape_cdata(text))

950

for e in elem:

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

951

_serialize_xml(write, e, qnames, None,

952

short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

953

write("</" + tag + ">")

else:

write(" />")

if elem.tail:

write(_escape_cdata(elem.tail))

958

959

# Tag names that _serialize_html writes without a closing tag.
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

967

def _serialize_html(write, elem, qnames, namespaces, **kwargs):

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

tag = elem.tag

text = elem.text

if tag is Comment:

write("" % _escape_cdata(text))

972

elif tag is ProcessingInstruction:

973

write("<?%s?>" % _escape_cdata(text))

else:

tag = qnames[tag]

if tag is None:

if text:

write(_escape_cdata(text))

979

for e in elem:

980

_serialize_html(write, e, qnames, None)

981

else:

982

write("<" + tag)

983

items = list(elem.items())

984

if items or namespaces:

985

if namespaces:

986

for v, k in sorted(namespaces.items(),

987

key=lambda x: x[1]): # sort on prefix

988

if k:

989

k = ":" + k

990

write(" xmlns%s=\"%s\"" % (

k,

_escape_attrib(v)

))

for k, v in sorted(items): # lexical order

995

if isinstance(k, QName):

996

k = k.text

997

if isinstance(v, QName):

998

v = qnames[v.text]

999

else:

1000

v = _escape_attrib_html(v)

1001

# FIXME: handle boolean attributes

1002

write(" %s=\"%s\"" % (qnames[k], v))

write(">")

tag = tag.lower()

if text:

if tag == "script" or tag == "style":

1007

write(text)

1008

else:

1009

write(_escape_cdata(text))

1010

for e in elem:

1011

_serialize_html(write, e, qnames, None)

1012

if tag not in HTML_EMPTY:

1013

write("</" + tag + ">")

1014

if elem.tail:

1015

write(_escape_cdata(elem.tail))

1016

1017

def _serialize_text(write, elem):

1018

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

_serialize = {

"xml": _serialize_xml,

1025

"html": _serialize_html,

1026

"text": _serialize_text,

1027

# this optional method is imported at the end of the module

1028

# "c14n": _serialize_c14n,

1029

}

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1030

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1031

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1032

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if *prefix* has the form ``ns<digits>``, which is
    reserved for internally generated prefixes.

    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted during the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

1050

1051

_namespace_map = {

1052

# "well-known" namespace prefixes

1053

"http://www.w3.org/XML/1998/namespace": "xml",

1054

"http://www.w3.org/1999/xhtml": "html",

1055

"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",

1056

"http://schemas.xmlsoap.org/wsdl/": "wsdl",

1057

# xml schema

1058

"http://www.w3.org/2001/XMLSchema": "xs",

1059

"http://www.w3.org/2001/XMLSchema-instance": "xsi",

1060

# dublin core

1061

"http://purl.org/dc/elements/1.1/": "dc",

1062

}

Florent Xicluna

1639505

2012-02-16 23:28:35 +0100

[diff] [blame]

1063

# For tests and troubleshooting

1064

register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1065

1066

def _raise_serialization_error(text):

1067

raise TypeError(

1068

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1069

)

1070

1071

def _escape_cdata(text):

1072

# escape character data

1073

try:

1074

# it's worth avoiding do-nothing calls for strings that are

1075

# shorter than 500 character, or so. assume that's, by far,

1076

# the most common case in most applications.

1077

if "&" in text:

1078

text = text.replace("&", "&")

1079

if "<" in text:

1080

text = text.replace("<", "<")

1081

if ">" in text:

1082

text = text.replace(">", ">")

1083

return text

1084

except (TypeError, AttributeError):

1085

_raise_serialization_error(text)

1086

1087

def _escape_attrib(text):

1088

# escape attribute value

1089

try:

1090

if "&" in text:

1091

text = text.replace("&", "&")

1092

if "<" in text:

1093

text = text.replace("<", "<")

1094

if ">" in text:

1095

text = text.replace(">", ">")

1096

if "\"" in text:

1097

text = text.replace("\"", """)

1098

if "\n" in text:

1099

text = text.replace("\n", "
")

1100

return text

1101

except (TypeError, AttributeError):

1102

_raise_serialization_error(text)

1103

1104

def _escape_attrib_html(text):

1105

# escape attribute value

1106

try:

1107

if "&" in text:

1108

text = text.replace("&", "&")

1109

if ">" in text:

1110

text = text.replace(">", ">")

1111

if "\"" in text:

1112

text = text.replace("\"", """)

1113

return text

1114

except (TypeError, AttributeError):

1115

_raise_serialization_error(text)

1116

1117

# --------------------------------------------------------------------

1118

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1119

def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    *element* is wrapped in an ElementTree and written to an in-memory
    stream: an io.StringIO when *encoding* is "unicode", an io.BytesIO
    otherwise. *encoding*, *method* and *short_empty_elements* are passed
    on to ElementTree.write().

    Returns the stream's contents: a string for "unicode", a bytestring
    otherwise.

    """
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1137

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1138

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1139

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1140

def __init__(self, lst):

1141

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1142

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1155

def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate the serialized form of *element* as a list of chunks.

    *element* is wrapped in an ElementTree and written to a
    _ListDataStream backed by a fresh list. *encoding*, *method* and
    *short_empty_elements* are passed on to ElementTree.write().

    Returns the list of chunks collected by the stream.

    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1162

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1163

1164

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element. The exact output
    format is implementation dependent. In this version, it's written as an
    ordinary XML file, followed by a newline unless the root element's
    tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")

1181

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1182

# --------------------------------------------------------------------

1183

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1184

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1185

1186

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* and *parser* are handed unchanged to the parse() method of
    a newly created ElementTree: *source* is a filename or file object
    containing XML data, *parser* is an optional parser instance.

    Return the ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1198

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1199

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1200

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with. The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information). If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data; anything
    without a ``read`` attribute is opened here in binary mode and the
    iterator is told to close it. *events* is a list of events to report
    back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    # Only a file opened here is ours to close.
    close_source = not hasattr(source, "read")
    if close_source:
        source = open(source, "rb")
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    return _IterParseIterator(source, events, parser, close_source)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1222

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1223

class _IterParseIterator:
    """Iterator feeding a file to a parser and yielding buffered events.

    Handlers installed on the parser's underlying ``_parser`` object append
    ``(event, payload)`` pairs to an internal buffer; __next__() drains
    that buffer and reads another block from the file whenever it runs
    empty.
    """

    def __init__(self, source, events, parser, close_source=False):
        self._file = source
        self._close_file = close_source
        self._events = []
        self._index = 0
        self._error = None
        self.root = self._root = None
        self._parser = parser
        # wire up the parser for event reporting
        backend = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # Handlers bind the loop state through default arguments so each
        # one keeps the event name it was created for.
        for event in events:
            if event == "start":
                try:
                    backend.ordered_attributes = 1
                    backend.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    backend.StartElementHandler = handler
                except AttributeError:
                    # Fall back to the parser's _start callback.
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    backend.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                backend.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    append((event, (prefix or "", uri or "")))
                backend.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                backend.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event)

    def __next__(self):
        while True:
            # Serve buffered events first.
            if self._index < len(self._events):
                item = self._events[self._index]
                self._index += 1
                return item
            # A parse error is raised only after the events buffered
            # before it have been delivered.
            pending = self._error
            if pending:
                self._error = None
                raise pending
            if self._parser is None:
                # Input exhausted: publish the root and finish.
                self.root = self._root
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            del self._events[:]
            self._index = 0
            data = self._file.read(16384)
            if not data:
                self._root = self._parser.close()
                self._parser = None
                continue
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                self._error = exc

    def __iter__(self):
        return self

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1300

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1301

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1302

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance; when it is omitted (or falsy) an
    XMLParser targeting a fresh TreeBuilder is created.

    Returns the result of the parser's close().

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()

1317

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1318

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1319

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance; when it is omitted (or falsy) an
    XMLParser targeting a fresh TreeBuilder is created.

    Returns an (Element, dict) tuple, in which the dict maps each
    non-empty "id" attribute value to its element. When an id occurs
    more than once, the element visited last wins.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    ids = {}
    for node in tree.iter():
        ident = node.get("id")
        if ident:
            ids[ident] = node
    return tree, ids

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1340

# Parse XML document from string constant. Alias for XML().

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1341

fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1342

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1343

def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, each of which is
    fed to the parser in order. *parser* is an optional parser instance;
    when it is omitted (or falsy) an XMLParser targeting a fresh
    TreeBuilder is created.

    Returns the result of the parser's close().

    """
    active = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()

1357

1358

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1359

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1360

1361

class TreeBuilder:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1362

"""Generic element structure builder.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1363

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1364

This builder converts a sequence of start, data, and end method

1365

calls to a well-formed element structure.

1366

1367

You can use this class to build an element structure using a custom XML

1368

parser, or a parser for some other XML-like format.

1369

1370

*element_factory* is an optional element factory which is called

1371

to create new Element instances, as necessary.

1372

1373

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1374

def __init__(self, element_factory=None):

1375

self._data = [] # data collector

1376

self._elem = [] # element stack

1377

self._last = None # last element

1378

self._tail = None # true if we're after an end tag

1379

if element_factory is None:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1380

element_factory = Element

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1381

self._factory = element_factory

1382

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1383

def close(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1384

"""Flush builder buffers and return toplevel document Element."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1385

assert len(self._elem) == 0, "missing end tags"

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1386

assert self._last is not None, "missing toplevel element"

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

return self._last

def _flush(self):

if self._data:

if self._last is not None:

Neal Norwitz

9d72bb4

2007-04-17 08:48:32 +0000

[diff] [blame]

1392

text = "".join(self._data)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1393

if self._tail:

1394

assert self._last.tail is None, "internal error (tail)"

1395

self._last.tail = text

1396

else:

1397

assert self._last.text is None, "internal error (text)"

1398

self._last.text = text

1399

self._data = []

1400

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1401

1402

def data(self, data):
    """Add text to current element.

    The chunk is only buffered here; it is joined with its neighbours
    and attached to an element on the next flush.

    """
    chunks = self._data
    chunks.append(data)

1405

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1406

1407

def start(self, tag, attrs):
    """Open new element and return it.

    *tag* is the element name, *attrs* is a dict containing element
    attributes.

    """
    # Pending text belongs to the previous element; attach it first.
    self._flush()
    elem = self._factory(tag, attrs)
    self._last = elem
    stack = self._elem
    if stack:
        # Nest the new element under the innermost open element.
        stack[-1].append(elem)
    stack.append(elem)
    self._tail = 0  # following text is this element's .text
    return elem

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1422

1423

def end(self, tag):
    """Close and return current Element.

    *tag* is the element name.  Raises AssertionError if *tag* does not
    match the tag of the element being closed.

    """
    # Pending text belongs to the element being closed (or to the tail of
    # its last child); attach it before popping.
    self._flush()
    self._last = self._elem.pop()
    # Raise explicitly rather than using ``assert`` so that mismatched end
    # tags are still detected when Python runs with -O.
    if self._last.tag != tag:
        raise AssertionError(
            "end tag mismatch (expected %s, got %s)" % (
                self._last.tag, tag))
    self._tail = 1  # following text is this element's .tail
    return self._last

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1437

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1438

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1439

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (not supported currently),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat reports qualified names as
        # "uri}local", which _fixname() turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Keep the exception class of whichever expat module was imported,
        # so other methods never have to import it again.
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            if hasattr(target, 'start'):
                parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None  # list of doctype tokens while inside <!DOCTYPE
        self.entity = {}      # user-supplied entity name -> replacement text
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _raiseerror(self, value):
        # Re-raise the expat error *value* as a ParseError carrying the
        # same error code and (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # "uri}local" -> "{uri}local"
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attrib_in):
        # Start-element callback used when expat delivers attributes as a
        # dict.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # Start-element callback used with ordered_attributes, where
        # *attrib_in* is a flat sequence [name1, value1, name2, value2, ...].
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            # Pair every even-indexed name with the value that follows it.
            for key, value in zip(attrib_in[::2], attrib_in[1::2]):
                attrib[fixname(key)] = value
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # End-element callback: forward the expanded name to the target.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Default expat callback.  Handles two things: references to
        # entities expat does not know about, and the token stream of a
        # doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # text is "&name;" -- look the bare name up in self.entity
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                line = self.parser.ErrorLineNumber
                column = self.parser.ErrorColumnNumber
                # Use the error class saved by __init__ rather than
                # importing xml.parsers.expat again, so this path also
                # works when __init__ fell back to pyexpat.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, line, column)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = line
                err.offset = column
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # Tokens so far: name, PUBLIC|SYSTEM, then quoted ids.
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # strip the quotes
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    # declaration not complete yet (or of another form)
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated)  Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        """
        warnings.warn(
            "This method of XMLParser is deprecated.  Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            # the target has no close() method: nothing to return
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

2006-08-11 14:57:12 +0000

[diff] [blame]

1629

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1630

1631

# Import the C accelerators

1632

try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    pass
else:
    # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser

    class ElementTree(ElementTree):
        __doc__ = ElementTree.__doc__

        def parse(self, source, parser=None):
            close_source = False
            if not hasattr(source, 'read'):
                # *source* is a file name: open it here and make sure it
                # is closed again before returning.
                source = open(source, 'rb')
                close_source = True
            try:
                if parser is not None:
                    # Caller-supplied parser: feed it chunk by chunk.
                    while True:
                        data = source.read(65536)
                        if not data:
                            break
                        parser.feed(data)
                    self._root = parser.close()
                else:
                    # Default parser: let it read the source itself.
                    parser = XMLParser()
                    self._root = parser._parse(source)
                return self._root
            finally:
                if close_source:
                    source.close()

        # Reuse the docstring of the method being overridden.  This must
        # be done here in the class body, where the name ElementTree still
        # refers to the base class; an assignment to __doc__ inside the
        # method would only bind an unused local variable.
        parse.__doc__ = ElementTree.parse.__doc__

    class iterparse:
        __doc__ = iterparse.__doc__
        root = None

        def __init__(self, source, events=None, parser=None):
            self._close_file = False
            if not hasattr(source, 'read'):
                # *source* is a file name: we own the file object.
                source = open(source, 'rb')
                self._close_file = True
            self._file = source
            self._events = []   # event buffer filled by the parser
            self._index = 0     # next unread position in self._events
            self._error = None  # parse error held back until buffer drains
            self.root = self._root = None
            try:
                if parser is None:
                    parser = XMLParser(target=TreeBuilder())
                self._parser = parser
                self._parser._setevents(self._events, events)
            except BaseException:
                # Do not leak the file we opened if parser setup fails.
                if self._close_file:
                    source.close()
                raise

        def __next__(self):
            while True:
                # Hand out buffered events first.
                try:
                    item = self._events[self._index]
                    self._index += 1
                    return item
                except IndexError:
                    pass
                # Buffer drained: report a parse error held back earlier.
                if self._error:
                    e = self._error
                    self._error = None
                    raise e
                # The parser was closed on a previous pass: we are done.
                if self._parser is None:
                    self.root = self._root
                    if self._close_file:
                        self._file.close()
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    try:
                        self._parser.feed(data)
                    except SyntaxError as exc:
                        # Deliver the events produced before the error
                        # first; the error is raised on a later pass.
                        self._error = exc
                else:
                    # End of input: finish parsing and remember the root.
                    self._root = self._parser.close()
                    self._parser = None

        def __iter__(self):
            return self

Thomas Wouters

2006-08-11 14:57:12 +0000

[diff] [blame]

1715

# compatibility

Florent Xicluna