python-aiohttp/CVE-2024-23829.patch

From: Sam Bull <git@sambull.org>
Date: Sun, 28 Jan 2024 17:09:58 +0000
Subject: Improve validation in HTTP parser (#8074) (#8078)
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit

Co-authored-by: Paul J. Dorn <pajod@users.noreply.github.com>
Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко)
<sviat@redhat.com>
(cherry picked from commit 33ccdfb0a12690af5bb49bda2319ec0907fa7827)
---
 CONTRIBUTORS.txt          |   1 +
 aiohttp/http_parser.py    |  28 +++++----
 tests/test_http_parser.py | 155 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 169 insertions(+), 15 deletions(-)

diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index ad63ce9..45fee0e 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -215,6 +215,7 @@ Panagiotis Kolokotronis
 Pankaj Pandey
 Pau Freixes
 Paul Colomiets
+Paul J. Dorn
 Paulius Šileikis
 Paulus Schoutsen
 Pavel Kamaev
diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py
index 8e5e816..2d58947 100644
--- a/aiohttp/http_parser.py
+++ b/aiohttp/http_parser.py
@@ -53,11 +53,10 @@ ASCIISET = set(string.printable)
 #     tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
 #             "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
 #     token = 1*tchar
-METHRE = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
-VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
-HDRRE: Final[Pattern[bytes]] = re.compile(
-    rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
-)
+_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
+TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
+VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
+DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)

 RawRequestMessage = collections.namedtuple(
     "RawRequestMessage",
@@ -122,6 +121,7 @@ class HeadersParser:
         self, lines: List[bytes]
     ) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
         headers = CIMultiDict()  # type: CIMultiDict[str]
+        # note: "raw" does not mean inclusion of OWS before/after the field value
         raw_headers = []

         lines_idx = 1
@@ -135,13 +135,14 @@ class HeadersParser:
             except ValueError:
                 raise InvalidHeader(line) from None

+            if len(bname) == 0:
+                raise InvalidHeader(bname)
+
             # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
             if {bname[0], bname[-1]} & {32, 9}:  # {" ", "\t"}
                 raise InvalidHeader(line)

             bvalue = bvalue.lstrip(b" \t")
-            if HDRRE.search(bname):
-                raise InvalidHeader(bname)
             if len(bname) > self.max_field_size:
                 raise LineTooLong(
                     "request header name {}".format(
@@ -150,6 +151,9 @@ class HeadersParser:
                     str(self.max_field_size),
                     str(len(bname)),
                 )
+            name = bname.decode("utf-8", "surrogateescape")
+            if not TOKENRE.fullmatch(name):
+                raise InvalidHeader(bname)

             header_length = len(bvalue)

@@ -196,7 +200,6 @@ class HeadersParser:
                     )

             bvalue = bvalue.strip(b" \t")
-            name = bname.decode("utf-8", "surrogateescape")
             value = bvalue.decode("utf-8", "surrogateescape")

             # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
@@ -317,7 +320,8 @@ class HttpParser(abc.ABC):
                         if length is not None:
                             # Shouldn't allow +/- or other number formats.
                             # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
-                            if not length.strip(" \t").isdigit():
+                            # msg.headers is already stripped of leading/trailing wsp
+                            if not DIGITS.fullmatch(length):
                                 raise InvalidHeader(CONTENT_LENGTH)

                             length = int(length)
@@ -538,7 +542,7 @@ class HttpRequestParser(HttpParser):
         path_part, _question_mark_separator, qs_part = path_part.partition("?")

         # method
-        if not METHRE.match(method):
+        if not TOKENRE.fullmatch(method):
             raise BadStatusLine(method)

         # version
@@ -615,8 +619,8 @@ class HttpResponseParser(HttpParser):
             raise BadStatusLine(line)
         version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

-        # The status code is a three-digit number
-        if len(status) != 3 or not status.isdigit():
+        # The status code is a three-digit ASCII number, no padding
+        if len(status) != 3 or not DIGITS.fullmatch(status):
             raise BadStatusLine(line)
         status_i = int(status)

diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
index 9d65b2f..6073371 100644
--- a/tests/test_http_parser.py
+++ b/tests/test_http_parser.py
@@ -3,7 +3,8 @@
 import asyncio
 from unittest import mock
 from urllib.parse import quote
-from typing import Any
+from contextlib import nullcontext
+from typing import Any, Dict, List

 import pytest
 from multidict import CIMultiDict
@@ -102,6 +103,20 @@ test2: data\r
     assert not msg.upgrade


+def test_parse_unusual_request_line(parser) -> None:
+    if not isinstance(response, HttpResponseParserPy):
+        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+    text = b"#smol //a HTTP/1.3\r\n\r\n"
+    messages, upgrade, tail = parser.feed_data(text)
+    assert len(messages) == 1
+    msg, _ = messages[0]
+    assert msg.compression is None
+    assert not msg.upgrade
+    assert msg.method == "#smol"
+    assert msg.path == "//a"
+    assert msg.version == (1, 3)
+
+
 def test_parse(parser) -> None:
     text = b"GET /test HTTP/1.1\r\n\r\n"
     messages, upgrade, tail = parser.feed_data(text)
@@ -365,6 +380,51 @@ def test_headers_content_length_err_2(parser) -> None:
         parser.feed_data(text)


+_pad: Dict[bytes, str] = {
+    b"": "empty",
+    # not a typo. Python likes triple zero
+    b"\000": "NUL",
+    b" ": "SP",
+    b"  ": "SPSP",
+    # not a typo: both 0xa0 and 0x0a in case of 8-bit fun
+    b"\n": "LF",
+    b"\xa0": "NBSP",
+    b"\t ": "TABSP",
+}
+
+
+@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
+@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
+@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
+def test_invalid_header_spacing(parser, pad1: bytes, pad2: bytes, hdr: bytes) -> None:
+    text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
+    if isinstance(parser, HttpRequestParserPy):
+        expectation = pytest.raises(http_exceptions.InvalidHeader)
+    else:
+        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+    if pad1 == pad2 == b"" and hdr != b"":
+        # one entry in param matrix is correct: non-empty name, not padded
+        expectation = nullcontext()
+    if pad1 == pad2 == hdr == b"":
+        if not isinstance(response, HttpResponseParserPy):
+            pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+    # work around pytest.raises not working
+    try:
+        parser.feed_data(text)
+    except aiohttp.http_exceptions.InvalidHeader:
+        pass
+    except aiohttp.http_exceptions.BadHttpMessage:
+        pass
+
+
+def test_empty_header_name(parser) -> None:
+    if not isinstance(response, HttpResponseParserPy):
+        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+    text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        parser.feed_data(text)
+
+
 def test_invalid_header(parser) -> None:
     text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
     with pytest.raises(http_exceptions.BadHttpMessage):
@@ -387,11 +447,27 @@ def test_cve_2023_37276(parser: Any) -> None:
         pytest.xfail("Regression test for Py parser. May match C behaviour later.")


+@pytest.mark.parametrize(
+    "rfc9110_5_6_2_token_delim",
+    r'"(),/:;<=>?@[\]{}',
+)
+def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
+    text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode()
+    expectation = pytest.raises(http_exceptions.BadHttpMessage)
+    if rfc9110_5_6_2_token_delim == ":":
+        # Inserting colon into header just splits name/value earlier.
+        expectation = nullcontext()
+    with expectation:
+        parser.feed_data(text)
+
+
 @pytest.mark.parametrize(
     "hdr",
     (
         "Content-Length: -5",  # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
         "Content-Length: +256",
+        "Content-Length: \N{superscript one}",
+        "Content-Length: \N{mathematical double-struck digit one}",
         "Foo: abc\rdef",  # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
         "Bar: abc\ndef",
         "Baz: abc\x00def",
@@ -563,6 +639,42 @@ def test_http_request_bad_status_line(parser) -> None:
         parser.feed_data(text)


+_num: Dict[bytes, str] = {
+    # dangerous: accepted by Python int()
+    # unicodedata.category("\U0001D7D9") == 'Nd'
+    "\N{mathematical double-struck digit one}".encode(): "utf8digit",
+    # only added for interop tests, refused by Python int()
+    # unicodedata.category("\U000000B9") == 'No'
+    "\N{superscript one}".encode(): "utf8number",
+    "\N{superscript one}".encode("latin-1"): "latin1number",
+}
+
+
+@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
+def test_http_request_bad_status_line_number(
+    parser: Any, nonascii_digit: bytes
+) -> None:
+    text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
+    if isinstance(parser, HttpRequestParserPy):
+        exception = http_exceptions.BadStatusLine
+    else:
+        exception = http_exceptions.BadHttpMessage
+    with pytest.raises(exception):
+        parser.feed_data(text)
+
+
+def test_http_request_bad_status_line_separator(parser: Any) -> None:
+    # single code point, old, multibyte NFKC, multibyte NFKD
+    utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
+    text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
+    if isinstance(parser, HttpRequestParserPy):
+        exception = http_exceptions.BadStatusLine
+    else:
+        exception = http_exceptions.BadHttpMessage
+    with pytest.raises(exception):
+        parser.feed_data(text)
+
+
 def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
     text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
     with pytest.raises(http_exceptions.BadStatusLine):
@@ -584,6 +696,31 @@ def test_http_request_upgrade(parser: Any) -> None:
     assert tail == b"some raw data"


+def test_http_request_parser_utf8_request_line(parser) -> None:
+    if not isinstance(response, HttpResponseParserPy):
+        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+    messages, upgrade, tail = parser.feed_data(
+        # note the truncated unicode sequence
+        b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
+        # for easier grep: ASCII 0xA0 more commonly known as non-breaking space
+        # note the leading and trailing spaces
+        "sTeP:  \N{latin small letter sharp s}nek\t\N{no-break space}  "
+        "\r\n\r\n".encode()
+    )
+    msg = messages[0][0]
+
+    assert msg.method == "GET"
+    assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
+    assert msg.version == (1, 1)
+    assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
+    assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
+    assert not msg.should_close
+    assert msg.compression is None
+    assert not msg.upgrade
+    assert not msg.chunked
+    assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path
+
+
 def test_http_request_parser_utf8(parser) -> None:
     text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
     messages, upgrade, tail = parser.feed_data(text)
@@ -633,9 +770,15 @@ def test_http_request_parser_two_slashes(parser) -> None:
     assert not msg.chunked


-def test_http_request_parser_bad_method(parser) -> None:
+@pytest.mark.parametrize(
+    "rfc9110_5_6_2_token_delim",
+    [bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
+)
+def test_http_request_parser_bad_method(
+    parser, rfc9110_5_6_2_token_delim: bytes
+) -> None:
     with pytest.raises(http_exceptions.BadStatusLine):
-        parser.feed_data(b'=":<G>(e),[T];?" /get HTTP/1.1\r\n\r\n')
+        parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')


 def test_http_request_parser_bad_version(parser) -> None:
@@ -751,6 +894,12 @@ def test_http_response_parser_code_not_int(response) -> None:
         response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")


+@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
+def test_http_response_parser_code_not_ascii(response, nonascii_digit: bytes) -> None:
+    with pytest.raises(http_exceptions.BadStatusLine):
+        response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")
+
+
 def test_http_request_chunked_payload(parser) -> None:
     text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
     msg, payload = parser.feed_data(text)[0][0]