Skip to content

Commit 034bc48

Browse files
gh-149468: Add option to validate ElementTree during serialization
1 parent 8cad740 commit 034bc48

5 files changed

Lines changed: 334 additions & 28 deletions

File tree

Doc/library/xml.etree.elementtree.rst

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -711,14 +711,14 @@ Functions
711711

712712
.. function:: tostring(element, encoding="us-ascii", method="xml", *, \
713713
xml_declaration=None, default_namespace=None, \
714-
short_empty_elements=True)
714+
validate=False, short_empty_elements=True)
715715

716716
Generates a string representation of an XML element, including all
717717
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
718718
the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
719719
generate a Unicode string (otherwise, a bytestring is generated). *method*
720720
is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
721-
*xml_declaration*, *default_namespace* and *short_empty_elements* has the same
721+
*xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same
722722
meaning as in :meth:`ElementTree.write`. Returns an (optionally) encoded string
723723
containing the XML data.
724724

@@ -732,17 +732,20 @@ Functions
732732
The :func:`tostring` function now preserves the attribute order
733733
specified by the user.
734734

735+
.. versionchanged:: next
736+
Added the *validate* parameter.
737+
735738

736739
.. function:: tostringlist(element, encoding="us-ascii", method="xml", *, \
737740
xml_declaration=None, default_namespace=None, \
738-
short_empty_elements=True)
741+
validate=False, short_empty_elements=True)
739742

740743
Generates a string representation of an XML element, including all
741744
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
742745
the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
743746
generate a Unicode string (otherwise, a bytestring is generated). *method*
744747
is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
745-
*xml_declaration*, *default_namespace* and *short_empty_elements* has the same
748+
*xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same
746749
meaning as in :meth:`ElementTree.write`. Returns a list of (optionally) encoded
747750
strings containing the XML data. It does not guarantee any specific sequence,
748751
except that ``b"".join(tostringlist(element)) == tostring(element)``.
@@ -752,13 +755,19 @@ Functions
752755
.. versionchanged:: 3.4
753756
Added the *short_empty_elements* parameter.
754757

758+
.. versionchanged:: next
759+
Added the *validate* parameter.
760+
755761
.. versionchanged:: 3.8
756762
Added the *xml_declaration* and *default_namespace* parameters.
757763

758764
.. versionchanged:: 3.8
759765
The :func:`tostringlist` function now preserves the attribute order
760766
specified by the user.
761767

768+
.. versionchanged:: next
769+
Added the *validate* parameter.
770+
762771

763772
.. function:: XML(text, parser=None)
764773

@@ -1186,7 +1195,7 @@ ElementTree Objects
11861195

11871196
.. method:: write(file, encoding="us-ascii", xml_declaration=None, \
11881197
default_namespace=None, method="xml", *, \
1189-
short_empty_elements=True)
1198+
validate=False, short_empty_elements=True)
11901199
11911200
Writes the element tree to a file, as XML. *file* is a file name, or a
11921201
:term:`file object` opened for writing. *encoding* [1]_ is the output
@@ -1197,6 +1206,14 @@ ElementTree Objects
11971206
*default_namespace* sets the default XML namespace (for "xmlns").
11981207
*method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is
11991208
``"xml"``).
1209+
1210+
If *validate* is true, check that all characters are legal XML or HTML
1211+
characters, depending on *method*, element and attribute names are
1212+
valid, and the content of comments, processing instructions and
1213+
HTML elements like ``<script>`` does not contain illegal sequences,
1214+
and raise :exc:`ValueError` otherwise.
1215+
By default, no validation is performed.
1216+
12001217
The keyword-only *short_empty_elements* parameter controls the formatting
12011218
of elements that contain no content. If ``True`` (the default), they are
12021219
emitted as a single self-closed tag, otherwise they are emitted as a pair
@@ -1216,6 +1233,9 @@ ElementTree Objects
12161233
The :meth:`write` method now preserves the attribute order specified
12171234
by the user.
12181235

1236+
.. versionchanged:: next
1237+
Added the *validate* parameter.
1238+
12191239

12201240
This is the XML file that is going to be manipulated::
12211241

Doc/whatsnew/3.15.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1779,6 +1779,17 @@ xml
17791779
(Contributed by Serhiy Storchaka in :gh:`139489`.)
17801780

17811781

1782+
xml.etree.ElementTree
1783+
---------------------
1784+
1785+
* Add the *validate* option to functions
1786+
:func:`~xml.etree.ElementTree.tostring`,
1787+
:func:`~xml.etree.ElementTree.tostringlist`, and the
1788+
:meth:`ElementTree.write <xml.etree.ElementTree.ElementTree.write>` method,
1789+
which allows validating the element or element tree before serialization.
1790+
(Contributed by Serhiy Storchaka in :gh:`149468`.)
1791+
1792+
17821793
xml.parsers.expat
17831794
-----------------
17841795

Lib/test/test_xml_etree.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,6 +1331,192 @@ def test_attlist_default(self):
13311331
{'{http://www.w3.org/XML/1998/namespace}lang': 'eng'})
13321332

13331333

1334+
class XMLValidationTest(unittest.TestCase):
1335+
1336+
def check(self, elem, expected=None):
1337+
self.assertRaises(ValueError,
1338+
ET.tostring, elem, validate=True)
1339+
ET.tostring(elem) # no exception
1340+
1341+
def test_invalid_comment(self):
1342+
self.check(ET.Comment('a--b'))
1343+
self.check(ET.Comment(' B+, B, or B-'))
1344+
1345+
def test_invalid_processing_instruction(self):
1346+
self.check(ET.PI(''))
1347+
self.check(ET.PI('0'))
1348+
self.check(ET.PI('a/b'))
1349+
self.check(ET.PI('foo\xa0bar'))
1350+
self.check(ET.PI('xml'))
1351+
self.check(ET.PI('xml', 'encoding="UTF-8"'))
1352+
self.check(ET.PI('foo', 'a?>b'))
1353+
self.check(ET.PI('foo', '\x00'))
1354+
self.check(ET.PI('foo', '\ud8ff'))
1355+
self.check(ET.PI('foo', '\ufffe'))
1356+
1357+
def test_invalid_tag(self):
1358+
self.check(ET.Element(''))
1359+
self.check(ET.Element('0'))
1360+
self.check(ET.Element('a/b'))
1361+
self.check(ET.Element(ET.QName('')))
1362+
self.check(ET.Element(ET.QName('0')))
1363+
self.check(ET.Element(ET.QName('a/b')))
1364+
1365+
def test_invalid_attr_name(self):
1366+
self.check(ET.Element('tag', attrib={'': 'value'}))
1367+
self.check(ET.Element('tag', attrib={'0': 'value'}))
1368+
self.check(ET.Element('tag', attrib={'a/b': 'value'}))
1369+
self.check(ET.Element('tag', attrib={ET.QName(''): 'value'}))
1370+
self.check(ET.Element('tag', attrib={ET.QName('0'): 'value'}))
1371+
self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'}))
1372+
1373+
def test_invalid_attr_value(self):
1374+
self.check(ET.Element('tag', attrib={'key': '\x00'}))
1375+
self.check(ET.Element('tag', attrib={'key': '\ud8ff'}))
1376+
self.check(ET.Element('tag', attrib={'key': '\ufffe'}))
1377+
self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')}))
1378+
self.check(ET.Element('tag', attrib={'key': ET.QName('\ud8ff')}))
1379+
self.check(ET.Element('tag', attrib={'key': ET.QName('\ufffe')}))
1380+
1381+
def test_invalid_text(self):
1382+
elem = ET.Element('tag')
1383+
elem.text = '\x00'
1384+
self.check(elem)
1385+
elem.text = '\ud8ff'
1386+
self.check(elem)
1387+
elem.text = '\ufffe'
1388+
self.check(elem)
1389+
1390+
def test_invalid_tail(self):
1391+
elem = ET.Element('tag')
1392+
elem.tail = '\x00'
1393+
self.check(elem)
1394+
elem.tail = '\ud8ff'
1395+
self.check(elem)
1396+
elem.tail = '\ufffe'
1397+
self.check(elem)
1398+
1399+
def test_invalid_text_without_tag(self):
1400+
elem = ET.Element(None)
1401+
elem.text = '\x00'
1402+
self.check(elem)
1403+
elem.text = '\ud8ff'
1404+
self.check(elem)
1405+
elem.text = '\ufffe'
1406+
self.check(elem)
1407+
1408+
def test_invalid_subelements(self):
1409+
elem = ET.Element('tag')
1410+
subelem = ET.SubElement(elem, 'subtag')
1411+
ET.SubElement(subelem, '\x00')
1412+
self.check(elem)
1413+
elem.tag = None
1414+
self.check(elem)
1415+
1416+
def test_invalid_namespace_uri(self):
1417+
self.check(ET.Element('{\x00}tag'))
1418+
self.check(ET.Element('{\ud8ff}tag'))
1419+
self.check(ET.Element('{\ufffe}tag'))
1420+
self.check(ET.Element(ET.QName('\x00', 'tag')))
1421+
self.check(ET.Element(ET.QName('\ud8ff', 'tag')))
1422+
self.check(ET.Element(ET.QName('\ufffe', 'tag')))
1423+
1424+
class HTMLValidationTest(unittest.TestCase):
1425+
1426+
def check(self, elem, expected=None):
1427+
self.assertRaises(ValueError,
1428+
ET.tostring, elem, method='html', validate=True)
1429+
ET.tostring(elem, method='html') # no exception
1430+
1431+
def test_invalid_comment(self):
1432+
self.check(ET.Comment('>'))
1433+
self.check(ET.Comment('->'))
1434+
self.check(ET.Comment('a-->b'))
1435+
self.check(ET.Comment('a--!>b'))
1436+
self.check(ET.Comment('a\x00b'))
1437+
1438+
def test_invalid_processing_instruction(self):
1439+
self.check(ET.PI('a>b'))
1440+
self.check(ET.PI('a\x00b'))
1441+
1442+
def test_invalid_tag(self):
1443+
self.check(ET.Element(''))
1444+
self.check(ET.Element('?'))
1445+
self.check(ET.Element('!'))
1446+
self.check(ET.Element('0'))
1447+
self.check(ET.Element(' a'))
1448+
self.check(ET.Element('a b'))
1449+
self.check(ET.Element('a\nb'))
1450+
self.check(ET.Element('a/b'))
1451+
self.check(ET.Element('a>b'))
1452+
self.check(ET.Element('a\x00b'))
1453+
self.check(ET.Element(ET.QName('')))
1454+
self.check(ET.Element(ET.QName('0')))
1455+
self.check(ET.Element(ET.QName('a/b')))
1456+
1457+
def test_invalid_attr_name(self):
1458+
self.check(ET.Element('tag', attrib={'': 'value'}))
1459+
self.check(ET.Element('tag', attrib={'a/b': 'value'}))
1460+
self.check(ET.Element('tag', attrib={'a=b': 'value'}))
1461+
self.check(ET.Element('tag', attrib={ET.QName(''): 'value'}))
1462+
self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'}))
1463+
1464+
def test_invalid_attr_value(self):
1465+
self.check(ET.Element('tag', attrib={'key': '\x00'}))
1466+
self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')}))
1467+
self.check(ET.Element('tag', attrib={'key': ET.QName('a"b')}))
1468+
self.check(ET.Element('tag', attrib={'key': ET.QName('a&b')}))
1469+
1470+
def test_invalid_text(self):
1471+
elem = ET.Element('tag')
1472+
elem.text = '\x00'
1473+
self.check(elem)
1474+
1475+
def test_invalid_tail(self):
1476+
elem = ET.Element('tag')
1477+
elem.tail = '\x00'
1478+
self.check(elem)
1479+
1480+
def test_invalid_text_without_tag(self):
1481+
elem = ET.Element(None)
1482+
elem.text = '\x00'
1483+
self.check(elem)
1484+
1485+
def test_invalid_subelements(self):
1486+
elem = ET.Element('tag')
1487+
subelem = ET.SubElement(elem, 'subtag')
1488+
ET.SubElement(subelem, '\x00')
1489+
self.check(elem)
1490+
elem.tag = None
1491+
self.check(elem)
1492+
1493+
def test_invalid_namespace_uri(self):
1494+
self.check(ET.Element('{\x00}tag'))
1495+
self.check(ET.Element(ET.QName('\x00', 'tag')))
1496+
1497+
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
1498+
def test_invalid_cdata_content(self, tag):
1499+
elem = ET.Element(tag.upper())
1500+
elem.text = 'a</%s>b' % tag.title()
1501+
self.check(elem)
1502+
elem.text = 'a</%s b' % tag.title()
1503+
self.check(elem)
1504+
elem.text = 'a</%s/b' % tag.title()
1505+
self.check(elem)
1506+
elem.text = 'a\x00b'
1507+
self.check(elem)
1508+
1509+
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
1510+
def test_cdata_subelements(self, tag):
1511+
elem = ET.Element(tag)
1512+
ET.SubElement(elem, 'subtag')
1513+
self.check(elem)
1514+
1515+
def test_invalid_plaintext_content(self):
1516+
elem = ET.Element('plaintext')
1517+
elem.text = 'a\x00b'
1518+
self.check(elem)
1519+
13341520
class IterparseTest(unittest.TestCase):
13351521
# Test iterparse interface.
13361522

0 commit comments

Comments
 (0)