fix CVE-2024-37388
(cherry picked from commit e861ad4dc5e50f1a111f3becbadc6278622dca2b)
This commit is contained in:
parent
a1ab747614
commit
891c7fac3c
78
Fix-test_elementtree-with-Expat-2.6.0.patch
Normal file
78
Fix-test_elementtree-with-Expat-2.6.0.patch
Normal file
@ -0,0 +1,78 @@
|
||||
From e3012a702dea2b03830fe00a5e8f7a429bbc3f42 Mon Sep 17 00:00:00 2001
|
||||
From: Serhiy Storchaka <storchaka@gmail.com>
|
||||
Date: Mon, 22 Apr 2024 16:52:26 +0800
|
||||
Subject: [PATCH] Fix test_elementtree with Expat 2.6.0
|
||||
|
||||
---
|
||||
src/lxml/tests/test_elementtree.py | 48 ++++++++++++++++--------------
|
||||
1 file changed, 25 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/src/lxml/tests/test_elementtree.py b/src/lxml/tests/test_elementtree.py
|
||||
index 96426cb..d9cd47e 100644
|
||||
--- a/src/lxml/tests/test_elementtree.py
|
||||
+++ b/src/lxml/tests/test_elementtree.py
|
||||
@@ -14,6 +14,7 @@ import copy
|
||||
import io
|
||||
import operator
|
||||
import os
|
||||
+import pyexpat
|
||||
import re
|
||||
import sys
|
||||
import textwrap
|
||||
@@ -4396,29 +4397,30 @@ class _XMLPullParserTest(unittest.TestCase):
|
||||
self.assertEqual([(action, elem.tag) for action, elem in events],
|
||||
expected)
|
||||
|
||||
- def test_simple_xml(self):
|
||||
- for chunk_size in (None, 1, 5):
|
||||
- #with self.subTest(chunk_size=chunk_size):
|
||||
- parser = self.etree.XMLPullParser()
|
||||
- self.assert_event_tags(parser, [])
|
||||
- self._feed(parser, "<!-- comment -->\n", chunk_size)
|
||||
- self.assert_event_tags(parser, [])
|
||||
- self._feed(parser,
|
||||
- "<root>\n <element key='value'>text</element",
|
||||
- chunk_size)
|
||||
- self.assert_event_tags(parser, [])
|
||||
- self._feed(parser, ">\n", chunk_size)
|
||||
- self.assert_event_tags(parser, [('end', 'element')])
|
||||
- self._feed(parser, "<element>text</element>tail\n", chunk_size)
|
||||
- self._feed(parser, "<empty-element/>\n", chunk_size)
|
||||
- self.assert_event_tags(parser, [
|
||||
- ('end', 'element'),
|
||||
- ('end', 'empty-element'),
|
||||
- ])
|
||||
- self._feed(parser, "</root>\n", chunk_size)
|
||||
- self.assert_event_tags(parser, [('end', 'root')])
|
||||
- root = self._close_and_return_root(parser)
|
||||
- self.assertEqual(root.tag, 'root')
|
||||
+ def test_simple_xml(self, chunk_size=None):
|
||||
+ parser = self.etree.XMLPullParser()
|
||||
+ self.assert_event_tags(parser, [])
|
||||
+ self._feed(parser, "<!-- comment -->\n", chunk_size)
|
||||
+ self.assert_event_tags(parser, [])
|
||||
+ self._feed(parser,
|
||||
+ "<root>\n <element key='value'>text</element",
|
||||
+ chunk_size)
|
||||
+ self.assert_event_tags(parser, [])
|
||||
+ self._feed(parser, ">\n", chunk_size)
|
||||
+ self.assert_event_tags(parser, [('end', 'element')])
|
||||
+ self._feed(parser, "<element>text</element>tail\n", chunk_size)
|
||||
+ self._feed(parser, "<empty-element/>\n", chunk_size)
|
||||
+ self.assert_event_tags(parser, [
|
||||
+ ('end', 'element'),
|
||||
+ ('end', 'empty-element'),
|
||||
+ ])
|
||||
+ self._feed(parser, "</root>\n", chunk_size)
|
||||
+ self.assert_event_tags(parser, [('end', 'root')])
|
||||
+ root = self._close_and_return_root(parser)
|
||||
+ self.assertEqual(root.tag, 'root')
|
||||
+
|
||||
+ def test_simple_xml_chunk_22(self):
|
||||
+ self.test_simple_xml(chunk_size=22)
|
||||
|
||||
def test_feed_while_iterating(self):
|
||||
parser = self.etree.XMLPullParser()
|
||||
--
|
||||
2.33.0
|
||||
|
||||
372
backport-CVE-2024-37388.patch
Normal file
372
backport-CVE-2024-37388.patch
Normal file
@ -0,0 +1,372 @@
|
||||
From b38cebf2f846e92bd63de4488fd3d1c8b568f397 Mon Sep 17 00:00:00 2001
|
||||
From: scoder <stefan_ml@behnel.de>
|
||||
Date: Fri, 29 Dec 2023 14:21:23 +0100
|
||||
Subject: [PATCH] Disable external entity resolution (XXE) by default (GH-391)
|
||||
|
||||
This prevents security risks that would allow loading arbitrary external files.
|
||||
|
||||
Closes https://bugs.launchpad.net/lxml/+bug/1742885
|
||||
Supersedes https://github.com/lxml/lxml/pull/130
|
||||
---
|
||||
doc/FAQ.txt | 12 +++--
|
||||
src/lxml/includes/xmlparser.pxd | 18 +++++++-
|
||||
src/lxml/parser.pxi | 70 ++++++++++++++++++++++++++--
|
||||
src/lxml/tests/test_etree.py | 81 +++++++++++++++++++++++++++++++++
|
||||
4 files changed, 170 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/doc/FAQ.txt b/doc/FAQ.txt
|
||||
index 48f69a6..7f3a524 100644
|
||||
--- a/doc/FAQ.txt
|
||||
+++ b/doc/FAQ.txt
|
||||
@@ -1107,9 +1107,9 @@ useless for the data commonly sent through web services and
|
||||
can simply be disabled, which rules out several types of
|
||||
denial of service attacks at once. This also involves an attack
|
||||
that reads local files from the server, as XML entities can be
|
||||
-defined to expand into their content. Consequently, version
|
||||
-1.2 of the SOAP standard explicitly disallows entity references
|
||||
-in the XML stream.
|
||||
+defined to expand into the content of external resources.
|
||||
+Consequently, version 1.2 of the SOAP standard explicitly
|
||||
+disallows entity references in the XML stream.
|
||||
|
||||
To disable entity expansion, use an XML parser that is configured
|
||||
with the option ``resolve_entities=False``. Then, after (or
|
||||
@@ -1117,7 +1117,11 @@ while) parsing the document, use ``root.iter(etree.Entity)`` to
|
||||
recursively search for entity references. If it contains any,
|
||||
reject the entire input document with a suitable error response.
|
||||
In lxml 3.x, you can also use the new DTD introspection API to
|
||||
-apply your own restrictions on input documents.
|
||||
+apply your own restrictions on input documents. Since version 5.x,
|
||||
+lxml disables the expansion of external entities (XXE) by default.
|
||||
+If you really want to allow loading external files into XML documents
|
||||
+using this functionality, you have to explicitly set
|
||||
+``resolve_entities=True``.
|
||||
|
||||
Another attack to consider is compression bombs. If you allow
|
||||
compressed input into your web service, attackers can try to send
|
||||
diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd
|
||||
index 45acfc8..3945495 100644
|
||||
--- a/src/lxml/includes/xmlparser.pxd
|
||||
+++ b/src/lxml/includes/xmlparser.pxd
|
||||
@@ -1,9 +1,9 @@
|
||||
from libc.string cimport const_char
|
||||
|
||||
from lxml.includes.tree cimport (
|
||||
- xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar)
|
||||
+ xmlDoc, xmlNode, xmlEntity, xmlDict, xmlDtd, xmlChar, const_xmlChar)
|
||||
from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
|
||||
-from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc
|
||||
+from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc, xmlErrorLevel
|
||||
|
||||
|
||||
cdef extern from "libxml/parser.h":
|
||||
@@ -47,11 +47,14 @@ cdef extern from "libxml/parser.h":
|
||||
|
||||
ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name)
|
||||
|
||||
+ ctypedef xmlEntity* (*getEntitySAXFunc)(void* ctx, const_xmlChar* name)
|
||||
+
|
||||
cdef int XML_SAX2_MAGIC
|
||||
|
||||
cdef extern from "libxml/tree.h":
|
||||
ctypedef struct xmlParserInput:
|
||||
int line
|
||||
+ int col
|
||||
int length
|
||||
const_xmlChar* base
|
||||
const_xmlChar* cur
|
||||
@@ -76,6 +79,7 @@ cdef extern from "libxml/tree.h":
|
||||
charactersSAXFunc characters
|
||||
cdataBlockSAXFunc cdataBlock
|
||||
referenceSAXFunc reference
|
||||
+ getEntitySAXFunc getEntity
|
||||
commentSAXFunc comment
|
||||
processingInstructionSAXFunc processingInstruction
|
||||
startDocumentSAXFunc startDocument
|
||||
@@ -150,6 +154,8 @@ cdef extern from "libxml/parser.h":
|
||||
int inSubset
|
||||
int charset
|
||||
xmlParserInput* input
|
||||
+ int inputNr
|
||||
+ xmlParserInput** inputTab
|
||||
|
||||
ctypedef enum xmlParserOption:
|
||||
XML_PARSE_RECOVER = 1 # recover on errors
|
||||
@@ -212,6 +218,12 @@ cdef extern from "libxml/parser.h":
|
||||
char* filename, const_char* encoding,
|
||||
int options) nogil
|
||||
|
||||
+ cdef void xmlErrParser(xmlParserCtxt* ctxt, xmlNode* node,
|
||||
+ int domain, int code, xmlErrorLevel level,
|
||||
+ const xmlChar *str1, const xmlChar *str2, const xmlChar *str3,
|
||||
+ int int1, const char *msg, ...)
|
||||
+
|
||||
+
|
||||
# iterparse:
|
||||
|
||||
cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax,
|
||||
@@ -233,6 +245,8 @@ cdef extern from "libxml/parser.h":
|
||||
cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() nogil
|
||||
cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) nogil
|
||||
|
||||
+ cdef xmlEntity* xmlSAX2GetEntity(void* ctxt, const_xmlChar* name) nogil
|
||||
+
|
||||
# DTDs:
|
||||
|
||||
cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
|
||||
diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
|
||||
index 3187a38..2f0ce80 100644
|
||||
--- a/src/lxml/parser.pxi
|
||||
+++ b/src/lxml/parser.pxi
|
||||
@@ -794,6 +794,7 @@ cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
|
||||
c_attr = c_attr.next
|
||||
return 0
|
||||
|
||||
+
|
||||
@cython.internal
|
||||
cdef class _BaseParser:
|
||||
cdef ElementClassLookup _class_lookup
|
||||
@@ -806,6 +807,7 @@ cdef class _BaseParser:
|
||||
cdef bint _remove_pis
|
||||
cdef bint _strip_cdata
|
||||
cdef bint _collect_ids
|
||||
+ cdef bint _resolve_external_entities
|
||||
cdef XMLSchema _schema
|
||||
cdef bytes _filename
|
||||
cdef readonly object target
|
||||
@@ -814,7 +816,7 @@ cdef class _BaseParser:
|
||||
|
||||
def __init__(self, int parse_options, bint for_html, XMLSchema schema,
|
||||
remove_comments, remove_pis, strip_cdata, collect_ids,
|
||||
- target, encoding):
|
||||
+ target, encoding, bint resolve_external_entities=True):
|
||||
cdef tree.xmlCharEncodingHandler* enchandler
|
||||
cdef int c_encoding
|
||||
if not isinstance(self, (XMLParser, HTMLParser)):
|
||||
@@ -827,6 +829,7 @@ cdef class _BaseParser:
|
||||
self._remove_pis = remove_pis
|
||||
self._strip_cdata = strip_cdata
|
||||
self._collect_ids = collect_ids
|
||||
+ self._resolve_external_entities = resolve_external_entities
|
||||
self._schema = schema
|
||||
|
||||
self._resolvers = _ResolverRegistry()
|
||||
@@ -906,6 +909,8 @@ cdef class _BaseParser:
|
||||
if self._strip_cdata:
|
||||
# hard switch-off for CDATA nodes => makes them plain text
|
||||
pctxt.sax.cdataBlock = NULL
|
||||
+ if not self._resolve_external_entities:
|
||||
+ pctxt.sax.getEntity = _getInternalEntityOnly
|
||||
|
||||
cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
|
||||
cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
|
||||
@@ -1206,6 +1211,56 @@ cdef class _BaseParser:
|
||||
finally:
|
||||
context.cleanup()
|
||||
|
||||
+cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name):
|
||||
+ """
|
||||
+ Callback function to intercept the entity resolution when external entity loading is disabled.
|
||||
+ """
|
||||
+ cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
|
||||
+ if not entity:
|
||||
+ return NULL
|
||||
+ if entity.etype not in (
|
||||
+ tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
|
||||
+ tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
|
||||
+ tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
|
||||
+ return entity
|
||||
+
|
||||
+ # Reject all external entities and fail the parsing instead. There is currently
|
||||
+ # no way in libxml2 to just prevent the entity resolution in this case.
|
||||
+ cdef xmlerror.xmlError c_error
|
||||
+ cdef xmlerror.xmlStructuredErrorFunc err_func
|
||||
+ cdef xmlparser.xmlParserInput* parser_input
|
||||
+ cdef void* err_context
|
||||
+
|
||||
+ c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
|
||||
+ err_func = xmlerror.xmlStructuredError
|
||||
+ if err_func:
|
||||
+ parser_input = c_ctxt.input
|
||||
+ # Copied from xmlVErrParser() in libxml2: get current input from stack.
|
||||
+ if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
|
||||
+ parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]
|
||||
+
|
||||
+ c_error = xmlerror.xmlError(
|
||||
+ domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
|
||||
+ code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
|
||||
+ level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
|
||||
+ message=b"External entity resolution is disabled for security reasons "
|
||||
+ b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
|
||||
+ b"if you consider it safe to enable it.",
|
||||
+ file=parser_input.filename,
|
||||
+ node=entity,
|
||||
+ str1=<char*> name,
|
||||
+ str2=NULL,
|
||||
+ str3=NULL,
|
||||
+ line=parser_input.line if parser_input else 0,
|
||||
+ int1=0,
|
||||
+ int2=parser_input.col if parser_input else 0,
|
||||
+ )
|
||||
+ err_context = xmlerror.xmlStructuredErrorContext
|
||||
+ err_func(err_context, &c_error)
|
||||
+
|
||||
+ c_ctxt.wellFormed = 0
|
||||
+ # The entity was looked up and does not need to be freed.
|
||||
+ return NULL
|
||||
|
||||
cdef void _initSaxDocument(void* ctxt) with gil:
|
||||
xmlparser.xmlSAX2StartDocument(ctxt)
|
||||
@@ -1508,12 +1563,14 @@ cdef class XMLParser(_FeedParser):
|
||||
- strip_cdata - replace CDATA sections by normal text content (default: True)
|
||||
- compact - save memory for short text content (default: True)
|
||||
- collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
|
||||
- - resolve_entities - replace entities by their text value (default: True)
|
||||
- huge_tree - disable security restrictions and support very deep trees
|
||||
and very long text content (only affects libxml2 2.7+)
|
||||
|
||||
Other keyword arguments:
|
||||
-
|
||||
+ - resolve_entities - replace entities by their text value: False for keeping the
|
||||
+ entity references, True for resolving them, and 'internal' for resolving
|
||||
+ internal definitions only (no external file/URL access).
|
||||
+ The default used to be True and was changed to 'internal' in lxml 5.0.
|
||||
- encoding - override the document encoding
|
||||
- target - a parser target object that will receive the parse events
|
||||
- schema - an XMLSchema to validate against
|
||||
@@ -1525,10 +1582,11 @@ cdef class XMLParser(_FeedParser):
|
||||
def __init__(self, *, encoding=None, attribute_defaults=False,
|
||||
dtd_validation=False, load_dtd=False, no_network=True,
|
||||
ns_clean=False, recover=False, XMLSchema schema=None,
|
||||
- huge_tree=False, remove_blank_text=False, resolve_entities=True,
|
||||
+ huge_tree=False, remove_blank_text=False, resolve_entities='internal',
|
||||
remove_comments=False, remove_pis=False, strip_cdata=True,
|
||||
collect_ids=True, target=None, compact=True):
|
||||
cdef int parse_options
|
||||
+ cdef bint resolve_external = True
|
||||
parse_options = _XML_DEFAULT_PARSE_OPTIONS
|
||||
if load_dtd:
|
||||
parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
|
||||
@@ -1553,12 +1611,14 @@ cdef class XMLParser(_FeedParser):
|
||||
parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
|
||||
if not resolve_entities:
|
||||
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
|
||||
+ elif resolve_entities == 'internal':
|
||||
+ resolve_external = False
|
||||
if not strip_cdata:
|
||||
parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
|
||||
|
||||
_BaseParser.__init__(self, parse_options, 0, schema,
|
||||
remove_comments, remove_pis, strip_cdata,
|
||||
- collect_ids, target, encoding)
|
||||
+ collect_ids, target, encoding, resolve_external)
|
||||
|
||||
|
||||
cdef class XMLPullParser(XMLParser):
|
||||
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
|
||||
index 14b21f7..bc7548f 100644
|
||||
--- a/src/lxml/tests/test_etree.py
|
||||
+++ b/src/lxml/tests/test_etree.py
|
||||
@@ -12,11 +12,14 @@ from __future__ import absolute_import
|
||||
from collections import OrderedDict
|
||||
import os.path
|
||||
import unittest
|
||||
+import contextlib
|
||||
import copy
|
||||
import sys
|
||||
import re
|
||||
import gc
|
||||
import operator
|
||||
+import shutil
|
||||
+import tempfile
|
||||
import textwrap
|
||||
import zlib
|
||||
import gzip
|
||||
@@ -1675,6 +1678,84 @@ class ETreeOnlyTestCase(HelperTestCase):
|
||||
self.assertEqual(_bytes('<doc>&myentity;</doc>'),
|
||||
tostring(root))
|
||||
|
||||
+ @contextlib.contextmanager
|
||||
+ def _xml_test_file(self, name, content=b'<evil>XML</evil>'):
|
||||
+ temp_dir = tempfile.mkdtemp()
|
||||
+ try:
|
||||
+ xml_file = os.path.join(temp_dir, name)
|
||||
+ with open(xml_file, 'wb') as tmpfile:
|
||||
+ tmpfile.write(content)
|
||||
+ yield xml_file
|
||||
+ finally:
|
||||
+ shutil.rmtree(temp_dir)
|
||||
+
|
||||
+ def test_entity_parse_external(self):
|
||||
+ fromstring = self.etree.fromstring
|
||||
+ tostring = self.etree.tostring
|
||||
+ parser = self.etree.XMLParser(resolve_entities=True)
|
||||
+
|
||||
+ with self._xml_test_file("entity.xml") as entity_file:
|
||||
+ xml = '''
|
||||
+ <!DOCTYPE doc [
|
||||
+ <!ENTITY my_external_entity SYSTEM "%s">
|
||||
+ ]>
|
||||
+ <doc>&my_external_entity;</doc>
|
||||
+ ''' % path2url(entity_file)
|
||||
+ root = fromstring(xml, parser)
|
||||
+
|
||||
+ self.assertEqual(_bytes('<doc><evil>XML</evil></doc>'),
|
||||
+ tostring(root))
|
||||
+ self.assertEqual(root.tag, 'doc')
|
||||
+ self.assertEqual(root[0].tag, 'evil')
|
||||
+ self.assertEqual(root[0].text, 'XML')
|
||||
+ self.assertEqual(root[0].tail, None)
|
||||
+
|
||||
+ def test_entity_parse_external_no_resolve(self):
|
||||
+ fromstring = self.etree.fromstring
|
||||
+ parser = self.etree.XMLParser(resolve_entities=False)
|
||||
+ Entity = self.etree.Entity
|
||||
+
|
||||
+ with self._xml_test_file("entity.xml") as entity_file:
|
||||
+ xml = '''
|
||||
+ <!DOCTYPE doc [
|
||||
+ <!ENTITY my_external_entity SYSTEM "%s">
|
||||
+ ]>
|
||||
+ <doc>&my_external_entity;</doc>
|
||||
+ ''' % path2url(entity_file)
|
||||
+ root = fromstring(xml, parser)
|
||||
+
|
||||
+ self.assertEqual(root[0].tag, Entity)
|
||||
+ self.assertEqual(root[0].text, "&my_external_entity;")
|
||||
+
|
||||
+ def test_entity_parse_no_external_default(self):
|
||||
+ fromstring = self.etree.fromstring
|
||||
+
|
||||
+ with self._xml_test_file("entity.xml") as entity_file:
|
||||
+ xml = '''
|
||||
+ <!DOCTYPE doc [
|
||||
+ <!ENTITY my_failing_external_entity SYSTEM "%s">
|
||||
+ ]>
|
||||
+ <doc>&my_failing_external_entity;</doc>
|
||||
+ ''' % path2url(entity_file)
|
||||
+
|
||||
+ try:
|
||||
+ fromstring(xml)
|
||||
+ except self.etree.XMLSyntaxError as exc:
|
||||
+ exception = exc
|
||||
+ else:
|
||||
+ self.assertTrue(False, "XMLSyntaxError was not raised")
|
||||
+
|
||||
+ self.assertIn("my_failing_external_entity", str(exception))
|
||||
+ self.assertTrue(exception.error_log)
|
||||
+ # Depending on the libxml2 version, we get different errors here,
|
||||
+ # not necessarily the one that lxml produced. But it should fail either way.
|
||||
+ for error in exception.error_log:
|
||||
+ if "my_failing_external_entity" in error.message:
|
||||
+ self.assertEqual(5, error.line)
|
||||
+ break
|
||||
+ else:
|
||||
+ self.assertFalse("entity error not found in parser error log")
|
||||
+
|
||||
def test_entity_restructure(self):
|
||||
xml = _bytes('''<!DOCTYPE root [ <!ENTITY nbsp " "> ]>
|
||||
<root>
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -0,0 +1,220 @@
|
||||
From 72f5a287a4016ecb405f2e8a4a03ae22a5b0b496 Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Behnel <stefan_ml@behnel.de>
|
||||
Date: Wed, 5 Jul 2023 22:10:45 +0200
|
||||
Subject: [PATCH] Change HTML "prefix" handling in ElementPath to let
|
||||
"element.find('part1:part2')" search for "part1:part2" instead of just
|
||||
"part2" with an unknown prefix. Also adapt the HTML "prefix" parsing test to
|
||||
make it work in libxml2 2.10.4 and later, where HTML "prefixes" are kept as
|
||||
part of the tag name by the parser.
|
||||
|
||||
---
|
||||
src/lxml/_elementpath.py | 22 +++++++++++-----------
|
||||
src/lxml/apihelpers.pxi | 7 +++++++
|
||||
src/lxml/etree.pyx | 8 ++++----
|
||||
src/lxml/includes/tree.pxd | 12 ++++++++++++
|
||||
src/lxml/tests/test_etree.py | 20 ++++++++++++++++----
|
||||
5 files changed, 50 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/src/lxml/_elementpath.py b/src/lxml/_elementpath.py
|
||||
index eabd81c..001b345 100644
|
||||
--- a/src/lxml/_elementpath.py
|
||||
+++ b/src/lxml/_elementpath.py
|
||||
@@ -71,14 +71,14 @@ xpath_tokenizer_re = re.compile(
|
||||
r"\s+"
|
||||
)
|
||||
|
||||
-def xpath_tokenizer(pattern, namespaces=None):
|
||||
+def xpath_tokenizer(pattern, namespaces=None, with_prefixes=True):
|
||||
# ElementTree uses '', lxml used None originally.
|
||||
default_namespace = (namespaces.get(None) or namespaces.get('')) if namespaces else None
|
||||
parsing_attribute = False
|
||||
for token in xpath_tokenizer_re.findall(pattern):
|
||||
ttype, tag = token
|
||||
if tag and tag[0] != "{":
|
||||
- if ":" in tag:
|
||||
+ if ":" in tag and with_prefixes:
|
||||
prefix, uri = tag.split(":", 1)
|
||||
try:
|
||||
if not namespaces:
|
||||
@@ -251,7 +251,7 @@ ops = {
|
||||
_cache = {}
|
||||
|
||||
|
||||
-def _build_path_iterator(path, namespaces):
|
||||
+def _build_path_iterator(path, namespaces, with_prefixes=True):
|
||||
"""compile selector pattern"""
|
||||
if path[-1:] == "/":
|
||||
path += "*" # implicit all (FIXME: keep this?)
|
||||
@@ -279,7 +279,7 @@ def _build_path_iterator(path, namespaces):
|
||||
|
||||
if path[:1] == "/":
|
||||
raise SyntaxError("cannot use absolute path on element")
|
||||
- stream = iter(xpath_tokenizer(path, namespaces))
|
||||
+ stream = iter(xpath_tokenizer(path, namespaces, with_prefixes=with_prefixes))
|
||||
try:
|
||||
_next = stream.next
|
||||
except AttributeError:
|
||||
@@ -308,8 +308,8 @@ def _build_path_iterator(path, namespaces):
|
||||
##
|
||||
# Iterate over the matching nodes
|
||||
|
||||
-def iterfind(elem, path, namespaces=None):
|
||||
- selector = _build_path_iterator(path, namespaces)
|
||||
+def iterfind(elem, path, namespaces=None, with_prefixes=True):
|
||||
+ selector = _build_path_iterator(path, namespaces, with_prefixes=with_prefixes)
|
||||
result = iter((elem,))
|
||||
for select in selector:
|
||||
result = select(result)
|
||||
@@ -319,8 +319,8 @@ def iterfind(elem, path, namespaces=None):
|
||||
##
|
||||
# Find first matching object.
|
||||
|
||||
-def find(elem, path, namespaces=None):
|
||||
- it = iterfind(elem, path, namespaces)
|
||||
+def find(elem, path, namespaces=None, with_prefixes=True):
|
||||
+ it = iterfind(elem, path, namespaces, with_prefixes=with_prefixes)
|
||||
try:
|
||||
return next(it)
|
||||
except StopIteration:
|
||||
@@ -330,15 +330,15 @@ def find(elem, path, namespaces=None):
|
||||
##
|
||||
# Find all matching objects.
|
||||
|
||||
-def findall(elem, path, namespaces=None):
|
||||
+def findall(elem, path, namespaces=None, with_prefixes=True):
|
||||
return list(iterfind(elem, path, namespaces))
|
||||
|
||||
|
||||
##
|
||||
# Find text for first matching object.
|
||||
|
||||
-def findtext(elem, path, default=None, namespaces=None):
|
||||
- el = find(elem, path, namespaces)
|
||||
+def findtext(elem, path, default=None, namespaces=None, with_prefixes=True):
|
||||
+ el = find(elem, path, namespaces, with_prefixes=with_prefixes)
|
||||
if el is None:
|
||||
return default
|
||||
else:
|
||||
diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
|
||||
index 88a031d..effd116 100644
|
||||
--- a/src/lxml/apihelpers.pxi
|
||||
+++ b/src/lxml/apihelpers.pxi
|
||||
@@ -15,6 +15,13 @@ cdef void displayNode(xmlNode* c_node, indent):
|
||||
finally:
|
||||
return # swallow any exceptions
|
||||
|
||||
+cdef inline bint _isHtmlDocument(_Element element) except -1:
|
||||
+ cdef xmlNode* c_node = element._c_node
|
||||
+ return (
|
||||
+ c_node is not NULL and c_node.doc is not NULL and
|
||||
+ c_node.doc.properties & tree.XML_DOC_HTML != 0
|
||||
+ )
|
||||
+
|
||||
cdef inline int _assertValidNode(_Element element) except -1:
|
||||
assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element)
|
||||
|
||||
diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx
|
||||
index 689c330..90753fc 100644
|
||||
--- a/src/lxml/etree.pyx
|
||||
+++ b/src/lxml/etree.pyx
|
||||
@@ -1544,7 +1544,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
|
||||
"""
|
||||
if isinstance(path, QName):
|
||||
path = (<QName>path).text
|
||||
- return _elementpath.find(self, path, namespaces)
|
||||
+ return _elementpath.find(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
|
||||
|
||||
def findtext(self, path, default=None, namespaces=None):
|
||||
u"""findtext(self, path, default=None, namespaces=None)
|
||||
@@ -1557,7 +1557,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
|
||||
"""
|
||||
if isinstance(path, QName):
|
||||
path = (<QName>path).text
|
||||
- return _elementpath.findtext(self, path, default, namespaces)
|
||||
+ return _elementpath.findtext(self, path, default, namespaces, with_prefixes=not _isHtmlDocument(self))
|
||||
|
||||
def findall(self, path, namespaces=None):
|
||||
u"""findall(self, path, namespaces=None)
|
||||
@@ -1570,7 +1570,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
|
||||
"""
|
||||
if isinstance(path, QName):
|
||||
path = (<QName>path).text
|
||||
- return _elementpath.findall(self, path, namespaces)
|
||||
+ return _elementpath.findall(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
|
||||
|
||||
def iterfind(self, path, namespaces=None):
|
||||
u"""iterfind(self, path, namespaces=None)
|
||||
@@ -1583,7 +1583,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
|
||||
"""
|
||||
if isinstance(path, QName):
|
||||
path = (<QName>path).text
|
||||
- return _elementpath.iterfind(self, path, namespaces)
|
||||
+ return _elementpath.iterfind(self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
|
||||
|
||||
def xpath(self, _path, *, namespaces=None, extensions=None,
|
||||
smart_strings=True, **_variables):
|
||||
diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd
|
||||
index 010af80..d709313 100644
|
||||
--- a/src/lxml/includes/tree.pxd
|
||||
+++ b/src/lxml/includes/tree.pxd
|
||||
@@ -154,6 +154,17 @@ cdef extern from "libxml/tree.h":
|
||||
XML_EXTERNAL_PARAMETER_ENTITY= 5
|
||||
XML_INTERNAL_PREDEFINED_ENTITY= 6
|
||||
|
||||
+ ctypedef enum xmlDocProperties:
|
||||
+ XML_DOC_WELLFORMED = 1 # /* document is XML well formed */
|
||||
+ XML_DOC_NSVALID = 2 # /* document is Namespace valid */
|
||||
+ XML_DOC_OLD10 = 4 # /* parsed with old XML-1.0 parser */
|
||||
+ XML_DOC_DTDVALID = 8 # /* DTD validation was successful */
|
||||
+ XML_DOC_XINCLUDE = 16 # /* XInclude substitution was done */
|
||||
+ XML_DOC_USERBUILT = 32 # /* Document was built using the API
|
||||
+ # and not by parsing an instance */
|
||||
+ XML_DOC_INTERNAL = 64 # /* built for internal processing */
|
||||
+ XML_DOC_HTML = 128 # /* parsed or built HTML document */
|
||||
+
|
||||
ctypedef struct xmlNs:
|
||||
const_xmlChar* href
|
||||
const_xmlChar* prefix
|
||||
@@ -274,6 +285,7 @@ cdef extern from "libxml/tree.h":
|
||||
void* _private
|
||||
xmlDtd* intSubset
|
||||
xmlDtd* extSubset
|
||||
+ int properties
|
||||
|
||||
ctypedef struct xmlAttr:
|
||||
void* _private
|
||||
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
|
||||
index bde496d..e969f3a 100644
|
||||
--- a/src/lxml/tests/test_etree.py
|
||||
+++ b/src/lxml/tests/test_etree.py
|
||||
@@ -3137,11 +3137,23 @@ class ETreeOnlyTestCase(HelperTestCase):
|
||||
|
||||
def test_html_prefix_nsmap(self):
|
||||
etree = self.etree
|
||||
- el = etree.HTML('<hha:page-description>aa</hha:page-description>').find('.//page-description')
|
||||
- if etree.LIBXML_VERSION < (2, 9, 11):
|
||||
- self.assertEqual({'hha': None}, el.nsmap)
|
||||
+ el = etree.HTML('<hha:page-description>aa</hha:page-description>')
|
||||
+ pd = el[-1]
|
||||
+ while len(pd):
|
||||
+ pd = pd[-1]
|
||||
+
|
||||
+ if etree.LIBXML_VERSION >= (2, 9, 11):
|
||||
+ # "Prefix" is kept as part of the tag name.
|
||||
+ self.assertEqual("hha:page-description", pd.tag)
|
||||
+ self.assertIsNone(el.find('.//page-description'))
|
||||
+ self.assertIsNotNone(el.find('.//hha:page-description')) # no namespaces!
|
||||
+ for e in el.iter():
|
||||
+ self.assertEqual({}, e.nsmap)
|
||||
else:
|
||||
- self.assertEqual({}, el.nsmap)
|
||||
+ # "Prefix" is parsed as XML prefix.
|
||||
+ self.assertEqual("page-description", pd.tag)
|
||||
+ pd = el.find('.//page-description')
|
||||
+ self.assertEqual({'hha': None}, pd.nsmap)
|
||||
|
||||
def test_getchildren(self):
|
||||
Element = self.etree.Element
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -7,7 +7,7 @@ The latest release works with all CPython versions from 2.7 to 3.7.
|
||||
|
||||
Name: python-%{modname}
|
||||
Version: 4.7.1
|
||||
Release: 5
|
||||
Release: 6
|
||||
Summary: XML processing library combining libxml2/libxslt with the ElementTree API
|
||||
License: BSD
|
||||
URL: https://files.pythonhosted.org
|
||||
@ -15,6 +15,9 @@ Source0: https://files.pythonhosted.org/packages/source/l/lxml/lxml-%{ver
|
||||
|
||||
Patch6000: backport-CVE-2022-2309.patch
|
||||
Patch6001: backport-Work-around-libxml2-bug-in-affected-versions.patch
|
||||
Patch6002: Fix-test_elementtree-with-Expat-2.6.0.patch
|
||||
Patch6003: backport-CVE-2024-37388.patch
|
||||
Patch6004: backport-Change-HTML-prefix-handling-in-ElementPath-to-let-el.patch
|
||||
|
||||
BuildRequires: gcc libxml2-devel libxslt-devel
|
||||
|
||||
@ -55,6 +58,12 @@ make test3
|
||||
%doc README.rst src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
|
||||
|
||||
%changelog
|
||||
* Wed Jun 12 2024 zhuofeng <zhuofeng2@huawei.com> - 4.7.1-6
|
||||
- Type:CVE
|
||||
- CVE:CVE-2024-37388
|
||||
- SUG:NA
|
||||
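- DESC:fix CVE-2024-37388 (disable external entity resolution by default); also fix test_elementtree with Expat 2.6.0 and backport the ElementPath HTML prefix handling change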
|
||||
|
||||
* Wed Nov 16 2022 zhuofeng <zhuofeng@huawei.com> - 4.7.1-5
|
||||
- change the Source0
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user