Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

2013-03-09 07:12:48 -0800

[diff] [blame]

1

"""Lightweight XML support for Python.

XML is an inherently hierarchical data format, and the most natural way to
represent it is with a tree.  This module has two classes for this purpose:

   1. ElementTree represents the whole XML document as a tree and

   2. Element represents a single node in this tree.

Interactions with the whole document (reading and writing to/from files) are
usually done on the ElementTree level.  Interactions with a single XML element
and its sub-elements are done on the Element level.

Element is a flexible container object designed to store hierarchical data
structures in memory. It can be described as a cross between a list and a
dictionary.  Each Element has a number of properties associated with it:

   'tag' - a string containing the element's name.

   'attrib' - a Python dictionary storing the element's attributes.

   'text' - a string containing the element's text content.

   'tail' - an optional string containing text after the element's end tag.

   And a number of child elements stored in a Python sequence.

To create an element instance, use the Element constructor,
or the SubElement factory function.

You can also use the ElementTree class to wrap an element structure
and convert it to and from XML.

"""

Eli Bendersky

2013-04-20 05:44:01 -0700

[diff] [blame]

36

#---------------------------------------------------------------------

37

# Licensed to PSF under a Contributor Agreement.

38

# See http://www.python.org/psf/license for licensing details.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

39

#

40

# ElementTree

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

41

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

42

#

43

# fredrik@pythonware.com

44

# http://www.pythonware.com

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

45

# --------------------------------------------------------------------

46

# The ElementTree toolkit is

47

#

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

48

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

49

#

50

# By obtaining, using, and/or copying this software and/or its

51

# associated documentation, you agree that you have read, understood,

52

# and will comply with the following terms and conditions:

53

#

54

# Permission to use, copy, modify, and distribute this software and

55

# its associated documentation for any purpose and without fee is

56

# hereby granted, provided that the above copyright notice appears in

57

# all copies, and that both that copyright notice and this permission

58

# notice appear in supporting documentation, and that the name of

59

# Secret Labs AB or the author not be used in advertising or publicity

60

# pertaining to distribution of the software without specific, written

61

# prior permission.

62

#

63

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

64

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

65

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

66

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

67

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

68

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

69

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

70

# OF THIS SOFTWARE.

71

# --------------------------------------------------------------------

# Names exported by ``from xml.etree.ElementTree import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "register_namespace",
]

# Version string of the ElementTree toolkit.
VERSION = "1.3.0"

93

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

94

import sys

95

import re

96

import warnings

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

97

import io

98

import contextlib

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

99

Eli Bendersky

27cbb19

2012-06-15 09:03:19 +0300

[diff] [blame]

100

from . import ElementPath

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

101

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

102

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

103

class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, code that raises a ParseError
    attaches two extra attributes to the instance:

        'code' - the specific exception code
        'position' - the line and column of the error

    The class itself adds no behaviour on top of SyntaxError.

    """

113

114

# --------------------------------------------------------------------

115

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

116

117

def iselement(element):
    """Return True if *element* looks like an Element, else False.

    The check is purely structural: any object exposing a 'tag' attribute
    qualifies, whatever its type.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

120

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

121

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

122

class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict; it is copied, so neither the caller's
        dictionary nor the shared default is ever modified.  Keyword
        arguments in *extra* are merged into the copy and win on conflict.

        Raises TypeError if *attrib* is not a dict.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        # Slice assignment copies the child references, not the children.
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an integer or a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index* (an integer or a slice).

        No type checking is performed on the assigned value(s).

        """
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index* (an integer or a slice)."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  Every item is
        validated before anything is added, so a failing call leaves this
        element unchanged.

        Raises TypeError if any item is not an Element.

        """
        # Validation and insertion are two separate passes.  Snapshot the
        # input first: a one-shot iterator (e.g. a generator) would otherwise
        # be exhausted by the validation pass and nothing would be added, and
        # extending an element with itself would never terminate.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        # assert iselement(element)
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  A DeprecationWarning is
        emitted on every call.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if attribute was not
        found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is whatever that call
        returns (a dictionary view for a plain dict).

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is whatever that call
        returns (a dictionary view for a plain dict).

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        the wildcard "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Pending deprecation) Return list(self.iter(tag))."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None yield no text.
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

437

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

438

439

def SubElement(parent, tag, attrib={}, **extra):
    """Create an element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built with parent.makeelement(), added with
    parent.append(), and returned.

    """
    # Work on a copy so the caller's dictionary (and the shared default)
    # is left untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child

456

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

457

458

def Comment(text=None):
    """Comment element factory.

    Build a special element which the standard serializer writes out as an
    XML comment.

    *text* is a string containing the comment string.

    """
    # The factory function itself is used as the tag of the new element.
    comment = Element(Comment)
    comment.text = text
    return comment

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

471

472

def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    This function creates a special element, tagged with this factory
    function, that is meant to be serialized as an XML processing
    instruction.

    *target* is a string containing the processing instruction target,
    *text* is a string containing the processing instruction contents, if
    any.

    The returned element's text is *target*, followed by a space and *text*
    when *text* is non-empty.

    """
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction

489

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

490

491

class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their 'text' attribute; when the other
    operand is not a QName it is compared against 'text' directly.

    """

    def __init__(self, text_or_uri, tag=None):
        # A truthy *tag* means the first argument is only the namespace URI.
        self.text = ("{%s}%s" % (text_or_uri, tag)) if tag else text_or_uri

    @staticmethod
    def _unwrap(value):
        # Comparison operand: the wrapped text of a QName, anything else as is.
        return value.text if isinstance(value, QName) else value

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<QName %r>' % (self.text,)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= self._unwrap(other)

    def __lt__(self, other):
        return self.text < self._unwrap(other)

    def __ge__(self, other):
        return self.text >= self._unwrap(other)

    def __gt__(self, other):
        return self.text > self._unwrap(other)

    def __eq__(self, other):
        return self.text == self._unwrap(other)

    def __ne__(self, other):
        return self.text != self._unwrap(other)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

539

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

540

# --------------------------------------------------------------------

541

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

542

543

class ElementTree:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

544

"""An XML element hierarchy.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

545

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

546

This class also provides support for serialization to and from

547

standard XML.

548

549

*element* is an optional root element node,

550

*file* is an optional file handle or file name of an XML file whose

551

contents will be used to initialize the tree with.

552

553

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

554

def __init__(self, element=None, file=None):
    """Set up the tree around *element*, optionally loading *file*.

    *element* becomes the root node.  When *file* is truthy it is handed
    to self.parse().

    """
    # assert element is None or iselement(element)
    self._root = element  # first node
    if not file:
        return
    self.parse(file)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

560

def getroot(self):
    """Return the root element currently held by this tree."""
    root = self._root
    return root

563

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

564

def _setroot(self, element):
    """Replace root element of this tree.

    The current contents of the tree are discarded and *element* becomes
    the new root.  Use with care!

    """
    # assert iselement(element)
    setattr(self, "_root", element)

573

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

574

def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object, *parser* is an optional parser
    instance that defaults to XMLParser.

    ParseError is raised if the parser fails to parse the document.

    Returns the root element of the given source document.

    """
    # Anything without a read() method is treated as a file name; a file
    # opened here is also closed here.
    opened_here = not hasattr(source, "read")
    if opened_here:
        source = open(source, "rb")
    try:
        if parser is None:
            # If no parser was specified, create a default XMLParser
            parser = XMLParser()
            if hasattr(parser, '_parse_whole'):
                # The default XMLParser, when it comes from an accelerator,
                # can define an internal _parse_whole API for efficiency.
                # It can be used to parse the whole source without feeding
                # it with chunks.
                self._root = parser._parse_whole(source)
                return self._root
        # Feed the parser in 64 KiB chunks until the source is exhausted.
        chunk = source.read(65536)
        while chunk:
            parser.feed(chunk)
            chunk = source.read(65536)
        self._root = parser.close()
        return self._root
    finally:
        if opened_here:
            source.close()

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

610

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

611

def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    The iterator loops over all elements in this tree, in document order.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    # assert self._root is not None
    root = self._root
    return root.iter(tag)

622

623

# compatibility

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

624

def getiterator(self, tag=None):
    """(Pending deprecation) Return a list of everything self.iter(tag) yields."""
    # Change for a DeprecationWarning in 1.4
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    matches = self.iter(tag)
    return list(matches)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

632

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

633

def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Same as getroot().find(path), which is Element.find()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A path starting with "/" is rewritten to start with "./" and a
    FutureWarning is emitted.

    Return the first matching element, or None if no element was found.

    """
    # assert self._root is not None
    absolute = path[:1] == "/"
    if absolute:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.find(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

654

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

655

def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element by tag name or path.

    Same as getroot().findtext(path), which is Element.findtext()

    *path* is a string having either an element tag or an XPath,
    *default* is forwarded to Element.findtext() (the value it returns
    when nothing matches),
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to "./..." (relative to the
    root element) and a FutureWarning is issued.

    Return whatever Element.findtext() returns for the root element.

    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
        )
    return self._root.findtext(path, default, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

676

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

677

def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().findall(path), which is Element.findall().

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to "./..." (relative to the
    root element) and a FutureWarning is issued.

    Return list containing all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
        )
    return self._root.findall(path, namespaces)

698

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

699

def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().iterfind(path), which is element.iterfind()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to "./..." (relative to the
    root element) and a FutureWarning is issued.

    Return an iterable yielding all matching elements in document order.

    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path,
            FutureWarning, stacklevel=2
        )
    return self._root.iterfind(path, namespaces)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

720

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

721

def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n"); the name is lower-cased
                    before use

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a key of the _serialize table.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        if method == "c14n":
            encoding = "utf-8"
        else:
            encoding = "us-ascii"
    else:
        encoding = encoding.lower()
    with _get_writer(file_or_filename, encoding) as write:
        # The declaration is only ever emitted for the "xml" method.
        if method == "xml" and (xml_declaration or
                (xml_declaration is None and
                 encoding not in ("utf-8", "us-ascii", "unicode"))):
            declared_encoding = encoding
            if encoding == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            write("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(write, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serialize = _serialize[method]
            serialize(write, self._root, qnames, namespaces,
                      short_empty_elements=short_empty_elements)

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

779

780

def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; equivalent to
    self.write(file, method="c14n").

    """
    # lxml.etree compatibility. use output method instead
    return self.write(file, method="c14n")

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

783

784

# --------------------------------------------------------------------

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

785

# serialization support

786

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

787

@contextlib.contextmanager

788

def _get_writer(file_or_filename, encoding):

Ezio Melotti

b5bc353

2013-08-17 16:11:40 +0300

[diff] [blame]

789

# returns text write method and release all resources after using

Eli Bendersky

2012-07-15 06:02:22 +0300

[diff] [blame]

790

try:

791

write = file_or_filename.write

792

except AttributeError:

793

# file_or_filename is a file name

794

if encoding == "unicode":

795

file = open(file_or_filename, "w")

796

else:

797

file = open(file_or_filename, "w", encoding=encoding,

798

errors="xmlcharrefreplace")

with file:

yield file.write

else:

# file_or_filename is a file-like object

803

# encoding determines if it is a text or binary writer

804

if encoding == "unicode":

805

# use a text writer as is

806

yield write

807

else:

808

# wrap a binary writer with TextIOWrapper

809

with contextlib.ExitStack() as stack:

810

if isinstance(file_or_filename, io.BufferedIOBase):

811

file = file_or_filename

812

elif isinstance(file_or_filename, io.RawIOBase):

813

file = io.BufferedWriter(file_or_filename)

814

# Keep the original file open when the BufferedWriter is

815

# destroyed

816

stack.callback(file.detach)

817

else:

818

# This is to handle passed objects that aren't in the

819

# IOBase hierarchy, but just have a write method

820

file = io.BufferedIOBase()

821

file.writable = lambda: True

822

file.write = write

823

try:

824

# TextIOWrapper uses this methods to determine

825

# if BOM (for UTF-16, etc) should be added

826

file.seekable = file_or_filename.seekable

827

file.tell = file_or_filename.tell

828

except AttributeError:

829

pass

830

file = io.TextIOWrapper(file,

831

encoding=encoding,

832

errors="xmlcharrefreplace",

833

newline="\n")

834

# Keep the original file open when the TextIOWrapper is

835

# destroyed

836

stack.callback(file.detach)

837

yield file.write

838

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

839

def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each tag,
    attribute key and QName value found in the tree to its serialized
    ``prefix:local`` form, and *namespaces* maps namespace URIs to the
    prefixes chosen for them.

    Raises ValueError when *default_namespace* is given and an
    unqualified name is encountered; unsupported tag types are reported
    through _raise_serialization_error().
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for elem in elem.iter():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces

899

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

900

def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *qnames* maps tags/attribute names to their serialized form,
    *namespaces* maps URIs to prefixes and is only passed for the
    outermost element (recursive calls pass None), so xmlns attributes
    are written once.  *short_empty_elements* selects ``<tag />`` over
    ``<tag></tag>`` for elements without text or children.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                        ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))

949

950

# HTML elements that are written without an end tag by _serialize_html.
# ``set`` is always a builtin, so no NameError fallback is needed.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

958

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    Differs from the XML serializer in that a start tag is always closed
    with ">", the text of <script>/<style> is written unescaped, and
    elements listed in HTML_EMPTY get no end tag.  *namespaces* is only
    passed for the outermost element (recursive calls pass None).
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                        ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

1007

1008

def _serialize_text(write, elem):

1009

for part in elem.itertext():

write(part)

if elem.tail:

write(elem.tail)

# Dispatch table from output method name to serializer function;
# ElementTree.write() validates *method* against its keys.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1021

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1022

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1023

def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted in the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

1041

1042

# Global registry mapping namespace URI -> preferred prefix; consulted by
# _namespaces() and mutated by register_namespace().
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1056

1057

def _raise_serialization_error(text):

1058

raise TypeError(

1059

"cannot serialize %r (type %s)" % (text, type(text).__name__)

1060

)

1061

1062

def _escape_cdata(text):

1063

# escape character data

1064

try:

1065

# it's worth avoiding do-nothing calls for strings that are

1066

# shorter than 500 character, or so. assume that's, by far,

1067

# the most common case in most applications.

1068

if "&" in text:

1069

text = text.replace("&", "&")

1070

if "<" in text:

1071

text = text.replace("<", "<")

1072

if ">" in text:

1073

text = text.replace(">", ">")

1074

return text

1075

except (TypeError, AttributeError):

1076

_raise_serialization_error(text)

1077

1078

def _escape_attrib(text):

1079

# escape attribute value

1080

try:

1081

if "&" in text:

1082

text = text.replace("&", "&")

1083

if "<" in text:

1084

text = text.replace("<", "<")

1085

if ">" in text:

1086

text = text.replace(">", ">")

1087

if "\"" in text:

1088

text = text.replace("\"", """)

1089

if "\n" in text:

1090

text = text.replace("\n", "
")

1091

return text

1092

except (TypeError, AttributeError):

1093

_raise_serialization_error(text)

1094

1095

def _escape_attrib_html(text):

1096

# escape attribute value

1097

try:

1098

if "&" in text:

1099

text = text.replace("&", "&")

1100

if ">" in text:

1101

text = text.replace(">", ">")

1102

if "\"" in text:

1103

text = text.replace("\"", """)

1104

return text

1105

except (TypeError, AttributeError):

1106

_raise_serialization_error(text)

1107

1108

# --------------------------------------------------------------------

1109

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1110

def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n";
    *short_empty_elements* is forwarded to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # The in-memory buffer type must match what write() will produce.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1128

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1129

class _ListDataStream(io.BufferedIOBase):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1130

"""An auxiliary stream accumulating into a list reference."""

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

1131

def __init__(self, lst):

1132

self.lst = lst

Eli Bendersky

f90fc68

2012-07-17 15:09:56 +0300

[diff] [blame]

1133

Eli Bendersky

2012-07-17 15:09:12 +0300

[diff] [blame]

def writable(self):

return True

def seekable(self):

return True

def write(self, b):

self.lst.append(b)

def tell(self):

return len(self.lst)

Eli Bendersky

2013-01-13 06:04:43 -0800

[diff] [blame]

1146

def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate a list of chunks representing XML element.

    Takes the same arguments as tostring() and forwards them to
    ElementTree.write(); every chunk the writer emits is collected, in
    order, into the returned list.

    """
    lst = []
    stream = _ListDataStream(lst)
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return lst

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1153

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1154

1155

def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout, encoding="unicode")
    # Make sure the output ends with a newline.
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")

1172

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1173

# --------------------------------------------------------------------

1174

# parsing

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1175

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1176

1177

def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    tree = ElementTree()
    tree.parse(source, parser)
    return tree

1189

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1190

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1191

def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Don't leak the file we opened if the iterator can't be built.
        if close_source:
            source.close()
        raise

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1211

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1212

Antoine Pitrou

2013-04-18 19:37:06 +0200

[diff] [blame]

1213

class IncrementalParser:
    """Push-style parser: feed data in, pull parse events out.

    *events* is the sequence of event names to report (defaults to
    ("end",)); *parser* is an optional parser instance, defaulting to
    XMLParser(target=TreeBuilder()).  The parser must provide
    _setevents(), feed() and close().

    ``root`` stays None until the stream has ended and all queued
    events have been consumed.
    """

    def __init__(self, events=None, parser=None):
        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        self._index = 0
        self.root = self._root = None
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        self._parser = parser
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def data_received(self, data):
        """Feed *data* to the parser; empty data is ignored.

        A SyntaxError raised by the parser is queued and re-raised
        later from events().  Raises ValueError after eof_received().
        """
        if self._parser is None:
            raise ValueError("data_received() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                self._events_queue.append(exc)

    def eof_received(self):
        """Close the parser and record the root element it returns."""
        self._root = self._parser.close()
        self._parser = None
        # Only publish the root once no events are left to consume.
        if self._index >= len(self._events_queue):
            self.root = self._root

    def events(self):
        """Yield queued events, raising any queued exception in order."""
        events = self._events_queue
        while True:
            index = self._index
            try:
                event = events[self._index]
                # Avoid retaining references to past events
                events[self._index] = None
            except IndexError:
                break
            index += 1
            # Compact the list in a O(1) amortized fashion
            if index * 2 >= len(events):
                events[:index] = []
                self._index = 0
            else:
                self._index = index
            if isinstance(event, Exception):
                raise event
            else:
                yield event
        if self._parser is None:
            self.root = self._root

1266

1267

Antoine Pitrou

2013-08-23 23:04:30 +0200

[diff] [blame^]

1268

class _IterParseIterator:
    """Iterator behind iterparse(): pulls data from a file on demand.

    Reads *source* in 16384-unit chunks, feeds them to an
    IncrementalParser and returns its events one at a time.  ``root``
    is set once the stream is exhausted; the file is closed then only
    if *close_source* is true.
    """

    def __init__(self, source, events, parser, close_source=False):
        self._parser = IncrementalParser(events, parser)
        self._file = source
        self._close_file = close_source
        self.root = None

    def __next__(self):
        while 1:
            # Hand out at most one pending event per call.
            for event in self._parser.events():
                return event
            # The incremental parser drops its parser after EOF.
            if self._parser._parser is None:
                self.root = self._parser.root
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            data = self._file.read(16384)
            if data:
                self._parser.data_received(data)
            else:
                self._parser.eof_received()

    def __iter__(self):
        return self

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1294

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1295

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1296

def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()

1311

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1312

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1313

def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements without a non-empty
    "id" attribute are left out; a later element replaces an earlier
    one with the same id.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids

Victor Stinner

2013-03-26 01:11:54 +0100

[diff] [blame]

1334

# Parse XML document from string constant.  Alias for XML().
fromstring = XML

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1336

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1337

def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, each of which
    is fed to the parser in order; *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for text in sequence:
        parser.feed(text)
    return parser.close()

1351

1352

# --------------------------------------------------------------------

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1353

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1354

1355

class TreeBuilder:

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1356

"""Generic element structure builder.

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1357

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1358

This builder converts a sequence of start, data, and end method

1359

calls to a well-formed element structure.

1360

1361

You can use this class to build an element structure using a custom XML

1362

parser, or a parser for some other XML-like format.

1363

1364

*element_factory* is an optional element factory which is called

1365

to create new Element instances, as necessary.

1366

1367

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1368

def __init__(self, element_factory=None):
    """Set up empty builder state.

    *element_factory* is called as factory(tag, attrs) to create each
    element; it defaults to Element.
    """
    self._data = []  # data collector
    self._elem = []  # element stack
    self._last = None  # last element
    self._tail = None  # true if we're after an end tag
    if element_factory is None:
        element_factory = Element
    self._factory = element_factory

1376

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1377

def close(self):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1378

"""Flush builder buffers and return toplevel document Element."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1379

assert len(self._elem) == 0, "missing end tags"

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1380

assert self._last is not None, "missing toplevel element"

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

return self._last

def _flush(self):

if self._data:

if self._last is not None:

Neal Norwitz

9d72bb4

2007-04-17 08:48:32 +0000

[diff] [blame]

1386

text = "".join(self._data)

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1387

if self._tail:

1388

assert self._last.tail is None, "internal error (tail)"

1389

self._last.tail = text

1390

else:

1391

assert self._last.text is None, "internal error (text)"

1392

self._last.text = text

1393

self._data = []

1394

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1395

def data(self, data):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1396

"""Add text to current element."""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1397

self._data.append(data)

1398

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1399

def start(self, tag, attrs):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1400

"""Open new element and return it.

1401

1402

*tag* is the element name, *attrs* is a dict containing element

1403

attributes.

1404

1405

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1406

self._flush()

1407

self._last = elem = self._factory(tag, attrs)

1408

if self._elem:

1409

self._elem[-1].append(elem)

1410

self._elem.append(elem)

self._tail = 0

return elem

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1414

def end(self, tag):

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1415

"""Close and return current Element.

1416

1417

*tag* is the element name.

1418

1419

"""

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1420

self._flush()

1421

self._last = self._elem.pop()

1422

assert self._last.tag == tag,\

1423

"end tag mismatch (expected %s, got %s)" % (

self._last.tag, tag)

self._tail = 1

return self._last

Armin Rigo

2005-12-14 18:10:45 +0000

[diff] [blame]

1428

Eli Bendersky

2013-03-09 07:12:48 -0800

[diff] [blame]

1429

# also see ElementTree and TreeBuilder

Florent Xicluna

2010-03-13 23:24:31 +0000

[diff] [blame]

1430

class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (not supported currently; the
    argument is accepted and ignored),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Only the callbacks the target actually defines (start, end, data,
    comment, pi, doctype, close) are used.

    """

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: _fixname() relies on it to turn
        # names containing "}" into the "{uri}local" form.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Remember the exception class of whichever expat module was
        # imported, so other methods do not need to import it again.
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            if hasattr(target, 'start'):
                # attributes now arrive as a flat [name, value, ...] list
                parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None    # list of doctype tokens while inside one
        self.entity = {}        # user-supplied entity name -> replacement
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for IncrementalParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of IncrementalParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds its per-event values as default arguments so
        # they are captured at definition time, not looked up later.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start_list):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                def handler(prefix, uri, event=event_name, append=append):
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                def handler(prefix, event=event_name, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its error
        # code and its (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attrib_in):
        # Start-element callback for dict-style attributes.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # Start-element callback for ordered attributes, where *attrib_in*
        # is a flat [name, value, name, value, ...] sequence.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = attrib_in[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Fallback callback: resolves entities through self.entity and
        # collects the tokens of a doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            # Only the lookup is guarded, so a KeyError raised inside the
            # target's data() is not misreported as an undefined entity.
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                # Use the exception class stored by __init__ instead of
                # importing xml.parsers.expat again; __init__ may have
                # fallen back to pyexpat.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
            data_handler(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]     # strip surrounding quotes
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated) Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        This implementation only issues a DeprecationWarning.

        """
        warnings.warn(
            "This method of XMLParser is deprecated. Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser.

        Errors raised by expat are re-raised as ParseError.

        """
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns the result of the target's close() method, or None when the
        target has none.  The parser and target references are deleted
        afterwards, so the instance cannot be fed again.

        """
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target

Thomas Wouters

0e3f591

2006-08-11 14:57:12 +0000

[diff] [blame]

1653

Florent Xicluna

2012-02-13 11:03:30 +0100

[diff] [blame]

1654

1655

# Import the C accelerators

1656

try:

Eli Bendersky

46955b2

2013-05-19 09:20:50 -0700

[diff] [blame]

1657

# Element is going to be shadowed by the C implementation. We need to keep

1658

# the Python version of it accessible for some "creative" use by external code

1659

# (see tests)

1660

_Element_Py = Element

1661

Florent Xicluna