Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

2

3

XML is an inherently hierarchical data format, and the most natural way to

4

represent it is with a tree. This module has two classes for this purpose:

5

6

1. ElementTree represents the whole XML document as a tree and

7

8

2. Element represents a single node in this tree.

9

10

Interactions with the whole document (reading and writing to/from files) are

11

usually done on the ElementTree level. Interactions with a single XML element

12

and its sub-elements are done on the Element level.

13

14

Element is a flexible container object designed to store hierarchical data

15

structures in memory. It can be described as a cross between a list and a

16

dictionary. Each Element has a number of properties associated with it:

17

18

'tag' - a string containing the element's name.

19

20

'attributes' - a Python dictionary storing the element's attributes.

21

22

'text' - a string containing the element's text content.

23

24

'tail' - an optional string containing text after the element's end tag.

25

26

And a number of child elements stored in a Python sequence.

27

28

To create an element instance, use the Element constructor,

29

or the SubElement factory function.

30

31

You can also use the ElementTree class to wrap an element structure

32

and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

__all__ = [

# public symbols

"Comment",

"dump",

"Element", "ElementTree",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

78

"fromstring", "fromstringlist",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

79

"iselement", "iterparse",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

80

"parse", "ParseError",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

81

"PI", "ProcessingInstruction",

82

"QName",

83

"SubElement",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

84

"tostring", "tostringlist",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

85

"TreeBuilder",

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

86

"VERSION",

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

87

"XML", "XMLID",

Thomas Wouters

2006-08-11 14:57:12 +0000

[diff] [blame]

88

"XMLParser", "XMLTreeBuilder",

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

89

"register_namespace",

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

90

]

91

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

92

VERSION = "1.3.0"

93

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

94

import sys

95

import re

96

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

97

import io

98

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

99

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

100

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

101

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

103

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are documented to carry
    two extra attributes:

        'code'     - the specific exception code
        'position' - the line and column of the error

    The class body itself defines neither attribute; they are expected to
    be assigned by whatever code raises the error.

    """

113

114

# --------------------------------------------------------------------

115

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

116

117

def iselement(element):
    """Return True if *element* looks like an Element.

    The check is purely duck-typed: any object exposing a ``tag``
    attribute qualifies.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

120

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

121

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

122

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Work on a private copy so neither the caller's dict nor the shared
        # default argument is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # Unlike append()/insert()/extend(), no Element type check is
        # performed here; the value is stored as given.
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is any iterable yielding zero or more elements; one-shot
        iterators such as generators are supported.

        Raises TypeError if any item is not an Element, in which case no
        item at all is appended.

        """
        # Validate into a temporary list: iterating *elements* twice (once to
        # check, once to extend) would silently append nothing when it is a
        # generator, because the first pass exhausts it.
        checked = []
        for element in elements:
            self._assert_is_element(element)
            checked.append(element)
        self._children.extend(checked)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned list is the
        element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); ordering follows the underlying dict.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); ordering follows the underlying dict.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Nothing is yielded for an element
        whose tag is neither a string nor None.

        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail

# compatibility
_Element = _ElementInterface = Element

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

453

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

454

455

def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built with parent.makeelement() and returned after
    it has been appended to *parent*.

    """
    # Merge into a copy so the caller's dict (and the shared default) stay
    # untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

472

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

473

474

def Comment(text=None):
    """Comment element factory.

    Build a special element whose tag is the Comment function itself; the
    standard serializer is documented to write such an element out as an
    XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

487

488

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element whose tag is the ProcessingInstruction function
    itself, which marks it as a processing instruction (not a comment) for
    code that inspects the tag.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  The
    element's text is *target*, followed by a space and *text* when *text*
    is non-empty.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

PI = ProcessingInstruction

505

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

506

507

class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Hashing and all six comparisons operate on the wrapped text; the other
    operand may be another QName or a plain value.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<QName %r>' % (self.text,)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)

    def __ne__(self, other):
        return self.text != (other.text if isinstance(other, QName) else other)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

555

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

556

# --------------------------------------------------------------------

557

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

558

559

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

560

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

561

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

562

This class also provides support for serialization to and from

563

standard XML.

564

565

*element* is an optional root element node,

566

*file* is an optional file handle or file name of an XML file whose

567

contents will be used to initialize the tree with.

568

569

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

570

def __init__(self, element=None, file=None):
    """Initialise the tree.

    *element* is stored as the root node; when *file* is truthy it is then
    handed to self.parse().

    """
    self._root = element  # first node
    if not file:
        return
    self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

576

def getroot(self):
    """Return the element currently stored as this tree's root."""
    root = self._root
    return root

579

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

580

def _setroot(self, element):
    """Replace root element of this tree.

    The previous root is dropped and *element* is stored in its place
    without any validation.  Use with care!

    """
    setattr(self, "_root", element)

589

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

590

def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object: anything without a ``read``
    attribute is opened in binary mode and closed again before returning.
    *parser* is an optional parser instance; when it is None a default
    XMLParser(target=TreeBuilder()) is created.

    The data is fed to the parser in 64 KiB chunks.  Errors raised by the
    parser propagate to the caller.

    Returns the root element produced by the parser, which also becomes
    the root of this tree.

    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        # Compare against None rather than testing truthiness: a parser
        # object that happens to evaluate as false must not be silently
        # replaced by the default one.
        if parser is None:
            parser = XMLParser(target=TreeBuilder())
        while True:
            data = source.read(65536)
            if not data:
                break
            parser.feed(data)
        self._root = parser.close()
        return self._root
    finally:
        # Only close what this method opened itself.
        if close_source:
            source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

618

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

619

def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    Delegates to the root element's iter(), which walks the tree in
    document order.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    root = self._root
    return root.iter(tag)

630

631

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

632

def getiterator(self, tag=None):
    """(Compatibility) Return list(self.iter(tag)).

    Emits a PendingDeprecationWarning attributed to the caller before
    collecting the elements.

    """
    # Change for a DeprecationWarning in 1.4
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    matches = list(self.iter(tag))
    return matches

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

640

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

641

def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Same as getroot().find(path), which is Element.find()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A path starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's find() returns.

    """
    is_absolute = path[:1] == "/"
    if is_absolute:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r"
        ) % path
        warnings.warn(message, FutureWarning, stacklevel=2)
    root = self._root
    return root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

662

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

663

def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element by tag name or path.

    Same as getroot().findtext(path), which is Element.findtext().

    *path* is a string having either an element tag or an XPath,
    *default* is the value handed to the root's findtext() as the
    fallback result,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return whatever the root element's findtext() returns for these
    arguments (not the element itself).

    """
    # assert self._root is not None
    if path[:1] == "/":
        # A leading "/" is rewritten to a search relative to the root,
        # and the caller is told what the rewritten path looks like.
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
            )
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

684

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

685

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the root element's findall(), i.e. Element.findall().

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return a list containing all matching elements in document order.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        # A leading "/" is rewritten to a search relative to the root,
        # and the caller is told what the rewritten path looks like.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findall(path, namespaces)

706

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

707

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the root element's iterfind(), i.e. Element.iterfind().

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    Return an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        # A leading "/" is rewritten to a search relative to the root,
        # and the caller is told what the rewritten path looks like.
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

728

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

729

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a registered serializer.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)

    if encoding:
        encoding = encoding.lower()
    elif method == "c14n":
        encoding = "utf-8"
    else:
        encoding = "us-ascii"

    # An explicit xml_declaration wins; None means "decide from encoding".
    if xml_declaration is None:
        want_declaration = encoding not in ("utf-8", "us-ascii", "unicode")
    else:
        want_declaration = bool(xml_declaration)

    with _get_writer(file_or_filename, encoding) as emit:
        if method == "xml" and want_declaration:
            if encoding == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            else:
                declared_encoding = encoding
            emit("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            _serialize[method](emit, self._root, qnames, namespaces,
                               short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

787

788

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; prefer write(file, method="c14n").
    """
    result = self.write(file, method="c14n")
    return result

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

791

792

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

793

# serialization support

794

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

795

@contextlib.contextmanager

796

def _get_writer(file_or_filename, encoding):

797

# returns text write method and release all resourses after using

798

try:

799

write = file_or_filename.write

800

except AttributeError:

801

# file_or_filename is a file name

802

if encoding == "unicode":

803

file = open(file_or_filename, "w")

804

else:

805

file = open(file_or_filename, "w", encoding=encoding,

806

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

811

# encoding determines if it is a text or binary writer

812

if encoding == "unicode":

813

# use a text writer as is

814

yield write

815

else:

816

# wrap a binary writer with TextIOWrapper

817

with contextlib.ExitStack() as stack:

818

if isinstance(file_or_filename, io.BufferedIOBase):

819

file = file_or_filename

820

elif isinstance(file_or_filename, io.RawIOBase):

821

file = io.BufferedWriter(file_or_filename)

822

# Keep the original file open when the BufferedWriter is

823

# destroyed

824

stack.callback(file.detach)

825

else:

826

# This is to handle passed objects that aren't in the

827

# IOBase hierarchy, but just have a write method

828

file = io.BufferedIOBase()

829

file.writable = lambda: True

830

file.write = write

831

try:

832

# TextIOWrapper uses this methods to determine

833

# if BOM (for UTF-16, etc) should be added

834

file.seekable = file_or_filename.seekable

835

file.tell = file_or_filename.tell

836

except AttributeError:

837

pass

838

file = io.TextIOWrapper(file,

839

encoding=encoding,

840

errors="xmlcharrefreplace",

841

newline="\n")

842

# Keep the original file open when the TextIOWrapper is

843

# destroyed

844

stack.callback(file.detach)

845

yield file.write

846

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

847

def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each
    "{uri}local" name (and plain name) found in tags, attribute keys and
    QName values to its serialized "prefix:local" form; *namespaces* maps
    namespace URIs to the prefixes chosen for them.

    Raises ValueError when a non-qualified name is met while
    *default_namespace* is set.  Names that cannot be processed are
    reported through _raise_serialization_error().
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def register(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                register(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                register(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                register(key)
            if isinstance(value, QName) and value.text not in qnames:
                register(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            register(text.text)
    return qnames, namespaces

907

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

908

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is a callable taking a text fragment, *qnames* maps tag and
    attribute names to their serialized form, *namespaces* maps namespace
    URIs to prefixes (declared on this element only; children are
    serialized with None).  When *short_empty_elements* is true, elements
    with no text and no children are written as a self-closed tag.
    Extra keyword arguments are accepted and ignored.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # The comment delimiters are part of the output format.
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No serialized name: emit only the content.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

957

958

# Lower-case names of the HTML elements that _serialize_html writes
# without an end tag.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

966

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is a callable taking a text fragment, *qnames* maps tag and
    attribute names to their serialized form, *namespaces* maps namespace
    URIs to prefixes (declared on this element only; children are
    serialized with None).  Extra keyword arguments are accepted and
    ignored.

    The text of "script" and "style" elements is written unescaped, and
    elements listed in HTML_EMPTY get no end tag.  Both checks are done on
    the lower-cased tag name, while the tag is written as given.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # The comment delimiters are part of the output format.
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No serialized name: emit only the content.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            # Compare case-insensitively, but keep the end tag spelled
            # exactly like the start tag written above.
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

1015

1016

def _serialize_text(write, elem):

1017

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

# Output method name -> serializer function.
# The optional "c14n" method (_serialize_c14n) is imported and registered
# at the end of the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1029

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1030

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1031

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # "ns<digits>" is the pattern used for auto-generated prefixes.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot: entries are deleted during the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1064

1065

def _raise_serialization_error(text):

1066

raise TypeError(

1067

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1068

)

1069

1070

def _escape_cdata(text):

1071

# escape character data

1072

try:

1073

# it's worth avoiding do-nothing calls for strings that are

1074

# shorter than 500 character, or so. assume that's, by far,

1075

# the most common case in most applications.

1076

if "&" in text:

1077

text = text.replace("&", "&")

1078

if "<" in text:

1079

text = text.replace("<", "<")

1080

if ">" in text:

1081

text = text.replace(">", ">")

1082

return text

1083

except (TypeError, AttributeError):

1084

_raise_serialization_error(text)

1085

1086

def _escape_attrib(text):

1087

# escape attribute value

1088

try:

1089

if "&" in text:

1090

text = text.replace("&", "&")

1091

if "<" in text:

1092

text = text.replace("<", "<")

1093

if ">" in text:

1094

text = text.replace(">", ">")

1095

if "\"" in text:

1096

text = text.replace("\"", """)

1097

if "\n" in text:

1098

text = text.replace("\n", "
")

1099

return text

1100

except (TypeError, AttributeError):

1101

_raise_serialization_error(text)

1102

1103

def _escape_attrib_html(text):

1104

# escape attribute value

1105

try:

1106

if "&" in text:

1107

text = text.replace("&", "&")

1108

if ">" in text:

1109

text = text.replace(">", ">")

1110

if "\"" in text:

1111

text = text.replace("\"", """)

1112

return text

1113

except (TypeError, AttributeError):

1114

_raise_serialization_error(text)

1115

1116

# --------------------------------------------------------------------

1117

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1118

def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included. If *encoding* is "unicode", the output
    is collected in a text buffer and a string is returned; otherwise a
    bytes buffer is used and a bytestring is returned.

    *element* is wrapped in an ElementTree, and *encoding*, *method* and
    *short_empty_elements* are forwarded to its write() method.

    """
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return stream.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1136

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1137

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1138

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1139

def __init__(self, lst):

1140

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1141

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1154

def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    *element* is wrapped in an ElementTree and written to a
    _ListDataStream; *encoding*, *method* and *short_empty_elements* are
    forwarded to ElementTree.write().
    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1161

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1162

1163

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element (which is wrapped
    in an ElementTree first). The output is produced by write() with the
    "unicode" encoding, followed by a newline unless the root's tail
    already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")

1180

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1181

# --------------------------------------------------------------------

1182

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1183

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1184

1185

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* and *parser* are handed unchanged to ElementTree.parse() on
    a freshly created ElementTree.

    Return that ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document

1197

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1198

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1199

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    The returned iterator reports what is going on to the user, based on
    the *events* requested. The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information). If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.
    A *source* without a "read" attribute is opened here in binary mode,
    and the iterator is told to close it.

    Returns an iterator providing (event, elem) pairs.

    """
    close_source = not hasattr(source, "read")
    if close_source:
        source = open(source, "rb")
    return _IterParseIterator(source, events, parser, close_source)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1219

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1220

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1221

class IncrementalParser:
    """Push-style parser: feed data in, pull parse events out.

    Data is supplied with data_received(), the end of input is signalled
    with eof_received(), and pending events are read with events().  The
    *root* attribute stays None until the input has ended and every queued
    event has been consumed.
    """

    def __init__(self, events=None, parser=None):
        """Set up the event queue and attach it to *parser*.

        *events* is the sequence of event names to report, ("end",) when
        omitted; *parser* defaults to XMLParser(target=TreeBuilder()).
        """
        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        self._index = 0  # position of the next unread event in the queue
        self.root = self._root = None
        self._parser = parser if parser else XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def data_received(self, data):
        """Feed *data* to the parser; empty data is ignored.

        A SyntaxError raised by the parser is queued and re-raised later
        by events().  Raises ValueError after eof_received().
        """
        if self._parser is None:
            raise ValueError("data_received() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            self._events_queue.append(exc)

    def eof_received(self):
        """Close the parser and remember the root it returns."""
        self._root = self._parser.close()
        self._parser = None
        # Publish the root right away only if no events are left unread.
        if self._index >= len(self._events_queue):
            self.root = self._root

    def events(self):
        """Yield the queued events, raising queued exceptions in order."""
        queue = self._events_queue
        while True:
            position = self._index
            try:
                event = queue[position]
                # Avoid retaining references to past events
                queue[position] = None
            except IndexError:
                break
            consumed = position + 1
            # Compact the list in a O(1) amortized fashion
            if consumed * 2 >= len(queue):
                del queue[:consumed]
                self._index = 0
            else:
                self._index = consumed
            if isinstance(event, Exception):
                raise event
            yield event
        if self._parser is None:
            self.root = self._root

1274

1275

1276

class _IterParseIterator(IncrementalParser):
    """Iterator over parse events, reading its input from a file object."""

    def __init__(self, source, events, parser, close_source=False):
        """Remember *source* and whether it must be closed when done."""
        IncrementalParser.__init__(self, events, parser)
        self._file = source
        self._close_file = close_source

    def __next__(self):
        """Return the next event, reading more input when none is queued."""
        while True:
            # Hand out the first pending event, if there is one.
            for event in self.events():
                return event
            if self._parser is None:
                # Input has ended and the queue is drained.
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            chunk = self._file.read(16384)
            if not chunk:
                self.eof_received()
            else:
                self.data_received(chunk)

    def __iter__(self):
        return self

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1300

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1301

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1302

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance; XMLParser(target=TreeBuilder()) is used
    when none is given.

    Returns whatever the parser's close() returns (an Element instance
    for the default parser).

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()

1317

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1318

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1319

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance; XMLParser(target=TreeBuilder()) is used
    when none is given.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements. Elements whose "id" attribute is
    missing or empty are left out; a later element replaces an earlier one
    with the same id.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    tree = active_parser.close()
    ids = {}
    for node in tree.iter():
        node_id = node.get("id")
        if node_id:
            ids[node_id] = node
    return tree, ids

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1340

# Parse XML document from string constant. Alias for XML().

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1341

fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1342

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1343

def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, each of which is
    fed to the parser in order; *parser* is an optional parser instance,
    with XMLParser(target=TreeBuilder()) used when none is given.

    Returns whatever the parser's close() returns (an Element instance
    for the default parser).

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()

1357

1358

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1359

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1360

1361

class TreeBuilder:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1362

"""Generic element structure builder.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1363

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1364

This builder converts a sequence of start, data, and end method

1365

calls to a well-formed element structure.

1366

1367

You can use this class to build an element structure using a custom XML

1368

parser, or a parser for some other XML-like format.

1369

1370

*element_factory* is an optional element factory which is called

1371

to create new Element instances, as necessary.

1372

1373

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1374

def __init__(self, element_factory=None):

1375

self._data = [] # data collector

1376

self._elem = [] # element stack

1377

self._last = None # last element

1378

self._tail = None # true if we're after an end tag

1379

if element_factory is None:

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1380

element_factory = Element

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1381

self._factory = element_factory

1382

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1383

def close(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1384

"""Flush builder buffers and return toplevel document Element."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1385

assert len(self._elem) == 0, "missing end tags"

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1386

assert self._last is not None, "missing toplevel element"

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

return self._last

def _flush(self):

if self._data:

if self._last is not None:

Neal Norwitz

9d72bb4

2007-04-17 08:48:32 +0000

[diff] [blame]

1392

text = "".join(self._data)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1393

if self._tail:

1394

assert self._last.tail is None, "internal error (tail)"

1395

self._last.tail = text

1396

else:

1397

assert self._last.text is None, "internal error (text)"

1398

self._last.text = text

1399

self._data = []

1400

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1401

1402

def data(self, data):
    """Buffer a chunk of text; it is attached to an element on flush."""
    pending = self._data
    pending.append(data)

1405

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1406

1407

def start(self, tag, attrs):
    """Create a new element, make it the current one and return it.

    *tag* is the element name and *attrs* the attribute mapping; both
    are passed unchanged to the element factory.

    """
    # Pending text belongs to the previous element, so flush it first.
    self._flush()
    node = self._factory(tag, attrs)
    self._last = node
    open_elems = self._elem
    if open_elems:
        # nest the new element under the innermost open element
        parent = open_elems[-1]
        parent.append(node)
    open_elems.append(node)
    self._tail = 0
    return node

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1422

1423

def end(self, tag):
    """Close and return current Element.

    *tag* is the element name.  It must match the tag of the element on
    top of the open-element stack; otherwise AssertionError is raised.
    Popping from an empty stack raises IndexError.

    """
    # Pending text belongs to the element being closed (or to its last
    # child's tail), so flush before popping.
    self._flush()
    self._last = self._elem.pop()
    # Explicit raise rather than ``assert`` so a mismatched end tag is
    # still detected when Python runs with -O; the exception type and
    # message are unchanged.
    if self._last.tag != tag:
        raise AssertionError(
            "end tag mismatch (expected %s, got %s)" % (
                self._last.tag, tag)
        )
    self._tail = 1
    return self._last

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1437

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1438

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1439

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (not supported currently),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, html=0, target=None, encoding=None):
        """Create the expat parser and wire its callbacks to *target*.

        *html* is accepted but not used here.  Callbacks are only
        installed for the methods that *target* actually provides.
        Raises ImportError if neither xml.parsers.expat nor pyexpat can
        be imported.

        """
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is the namespace separator: expat then reports qualified
        # names as "uri}local", which _fixname turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Remember the error class of whichever expat module was imported;
        # feed(), close() and _default() all rely on it.
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            if hasattr(target, 'start'):
                parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None  # list of doctype tokens while inside <!DOCTYPE
        self.entity = {}  # entity name -> replacement text, used by _default
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, event_list, events):
        """Install handlers that append (event, value) pairs to *event_list*.

        Internal API for IncrementalParser.  *events* may contain
        "start", "end", "start-ns" and "end-ns"; anything else raises
        ValueError.

        """
        parser = self._parser
        append = event_list.append
        for event in events:
            # The default arguments bind the current loop values into each
            # handler; a plain closure would see only the last ``event``.
            if event == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event, append=append,
                            start=self._start_list):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event)

    def _raiseerror(self, value):
        """Re-raise the expat error *value* as a ParseError.

        The new exception carries ``code`` and ``position`` (a
        ``(lineno, offset)`` tuple) copied from *value*.

        """
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        """Return *key* in "{uri}local" form, memoizing the result."""
        # expat reports namespaced names as "uri}local"; prepend the
        # missing "{".  Names without "}" are returned unchanged.
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attrib_in):
        """Start-element callback for attributes given as a mapping."""
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        """Start-element callback for attributes given as a flat list.

        *attrib_in* alternates names and values:
        ``[name0, value0, name1, value1, ...]``.

        """
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = attrib_in[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        """End-element callback; forwards the fixed-up tag to the target."""
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        """Default expat callback: handle entities and doctype tokens.

        Text starting with "&" is looked up in ``self.entity`` and the
        replacement is passed to the target's ``data`` method; an unknown
        entity raises the expat error class with ``code`` 11.  Tokens of
        a ``<!DOCTYPE`` declaration are collected until the declaration
        can be reported through ``doctype``.

        """
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                # Use the error class stored by __init__ instead of
                # re-importing xml.parsers.expat: __init__ may have fallen
                # back to pyexpat, in which case the re-import would fail
                # with ImportError and mask the real problem.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # strip the surrounding quotes
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # doctype() was overridden in a subclass:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated) Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        """
        warnings.warn(
            "This method of XMLParser is deprecated. Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
        )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns whatever the target's ``close`` method returns, or None
        if the target has no ``close`` method.

        """
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

2006-08-11 14:57:12 +0000

[diff] [blame]

1657

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1658

1659

# Import the C accelerators

1660

try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    pass
else:
    # Overwrite 'ElementTree.parse' to use the C XMLParser

    class ElementTree(ElementTree):
        # Inside this class body the name ElementTree still refers to the
        # base class defined earlier in the module, so the documentation
        # can be copied from it.
        __doc__ = ElementTree.__doc__

        def parse(self, source, parser=None):
            close_source = False
            if not hasattr(source, 'read'):
                # *source* is treated as a file name; we opened it, so we
                # are responsible for closing it again.
                source = open(source, 'rb')
                close_source = True
            try:
                if parser is not None:
                    # Caller-supplied parser: feed it in 64 KiB chunks.
                    while True:
                        data = source.read(65536)
                        if not data:
                            break
                        parser.feed(data)
                    self._root = parser.close()
                else:
                    # No parser given: let the default XMLParser read the
                    # whole source via its _parse method.
                    parser = XMLParser()
                    self._root = parser._parse(source)
                return self._root
            finally:
                if close_source:
                    source.close()

        # Copy the base method's docstring onto the override.  Assigning
        # ``__doc__`` inside the function body (as was done before) only
        # creates an unused local variable and leaves the method
        # undocumented.
        parse.__doc__ = ElementTree.parse.__doc__

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1692

Thomas Wouters

2006-08-11 14:57:12 +0000

[diff] [blame]

1693

# compatibility

Florent Xicluna