python-aiohttp/CVE-2024-23829.patch
starlet-dx 9d926e8cf7 Fix CVE-2023-47627,CVE-2023-49082,CVE-2024-23334,CVE-2024-23829,CVE-2024-27306 and CVE-2024-30251
(cherry picked from commit dabdc40effcfef17ad7e2a967edf709e75859b23)
2025-03-06 10:11:34 +08:00

344 lines
13 KiB
Diff

From: Sam Bull <git@sambull.org>
Date: Sun, 28 Jan 2024 17:09:58 +0000
Subject: Improve validation in HTTP parser (#8074) (#8078)
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit
Co-authored-by: Paul J. Dorn <pajod@users.noreply.github.com>
Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко)
<sviat@redhat.com>
(cherry picked from commit 33ccdfb0a12690af5bb49bda2319ec0907fa7827)
---
CONTRIBUTORS.txt | 1 +
aiohttp/http_parser.py | 28 +++++----
tests/test_http_parser.py | 155 +++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 169 insertions(+), 15 deletions(-)
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index ad63ce9..45fee0e 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -215,6 +215,7 @@ Panagiotis Kolokotronis
Pankaj Pandey
Pau Freixes
Paul Colomiets
+Paul J. Dorn
Paulius Šileikis
Paulus Schoutsen
Pavel Kamaev
diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py
index 8e5e816..2d58947 100644
--- a/aiohttp/http_parser.py
+++ b/aiohttp/http_parser.py
@@ -53,11 +53,10 @@ ASCIISET = set(string.printable)
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
# token = 1*tchar
-METHRE = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
-VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
-HDRRE: Final[Pattern[bytes]] = re.compile(
- rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
-)
+_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
+TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
+VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
+DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
RawRequestMessage = collections.namedtuple(
"RawRequestMessage",
@@ -122,6 +121,7 @@ class HeadersParser:
self, lines: List[bytes]
) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
headers = CIMultiDict() # type: CIMultiDict[str]
+ # note: "raw" does not mean inclusion of OWS before/after the field value
raw_headers = []
lines_idx = 1
@@ -135,13 +135,14 @@ class HeadersParser:
except ValueError:
raise InvalidHeader(line) from None
+ if len(bname) == 0:
+ raise InvalidHeader(bname)
+
# https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"}
raise InvalidHeader(line)
bvalue = bvalue.lstrip(b" \t")
- if HDRRE.search(bname):
- raise InvalidHeader(bname)
if len(bname) > self.max_field_size:
raise LineTooLong(
"request header name {}".format(
@@ -150,6 +151,9 @@ class HeadersParser:
str(self.max_field_size),
str(len(bname)),
)
+ name = bname.decode("utf-8", "surrogateescape")
+ if not TOKENRE.fullmatch(name):
+ raise InvalidHeader(bname)
header_length = len(bvalue)
@@ -196,7 +200,6 @@ class HeadersParser:
)
bvalue = bvalue.strip(b" \t")
- name = bname.decode("utf-8", "surrogateescape")
value = bvalue.decode("utf-8", "surrogateescape")
# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
@@ -317,7 +320,8 @@ class HttpParser(abc.ABC):
if length is not None:
# Shouldn't allow +/- or other number formats.
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
- if not length.strip(" \t").isdigit():
+ # msg.headers is already stripped of leading/trailing wsp
+ if not DIGITS.fullmatch(length):
raise InvalidHeader(CONTENT_LENGTH)
length = int(length)
@@ -538,7 +542,7 @@ class HttpRequestParser(HttpParser):
path_part, _question_mark_separator, qs_part = path_part.partition("?")
# method
- if not METHRE.match(method):
+ if not TOKENRE.fullmatch(method):
raise BadStatusLine(method)
# version
@@ -615,8 +619,8 @@ class HttpResponseParser(HttpParser):
raise BadStatusLine(line)
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
- # The status code is a three-digit number
- if len(status) != 3 or not status.isdigit():
+ # The status code is a three-digit ASCII number, no padding
+ if len(status) != 3 or not DIGITS.fullmatch(status):
raise BadStatusLine(line)
status_i = int(status)
diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
index 9d65b2f..6073371 100644
--- a/tests/test_http_parser.py
+++ b/tests/test_http_parser.py
@@ -3,7 +3,8 @@
import asyncio
from unittest import mock
from urllib.parse import quote
-from typing import Any
+from contextlib import nullcontext
+from typing import Any, Dict, List
import pytest
from multidict import CIMultiDict
@@ -102,6 +103,20 @@ test2: data\r
assert not msg.upgrade
+def test_parse_unusual_request_line(parser) -> None:
+ if not isinstance(response, HttpResponseParserPy):
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+ text = b"#smol //a HTTP/1.3\r\n\r\n"
+ messages, upgrade, tail = parser.feed_data(text)
+ assert len(messages) == 1
+ msg, _ = messages[0]
+ assert msg.compression is None
+ assert not msg.upgrade
+ assert msg.method == "#smol"
+ assert msg.path == "//a"
+ assert msg.version == (1, 3)
+
+
def test_parse(parser) -> None:
text = b"GET /test HTTP/1.1\r\n\r\n"
messages, upgrade, tail = parser.feed_data(text)
@@ -365,6 +380,51 @@ def test_headers_content_length_err_2(parser) -> None:
parser.feed_data(text)
+_pad: Dict[bytes, str] = {
+ b"": "empty",
+ # not a typo. Python likes triple zero
+ b"\000": "NUL",
+ b" ": "SP",
+ b" ": "SPSP",
+ # not a typo: both 0xa0 and 0x0a in case of 8-bit fun
+ b"\n": "LF",
+ b"\xa0": "NBSP",
+ b"\t ": "TABSP",
+}
+
+
+@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
+@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
+@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
+def test_invalid_header_spacing(parser, pad1: bytes, pad2: bytes, hdr: bytes) -> None:
+ text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
+ if isinstance(parser, HttpRequestParserPy):
+ expectation = pytest.raises(http_exceptions.InvalidHeader)
+ else:
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+ if pad1 == pad2 == b"" and hdr != b"":
+ # one entry in param matrix is correct: non-empty name, not padded
+ expectation = nullcontext()
+ if pad1 == pad2 == hdr == b"":
+ if not isinstance(response, HttpResponseParserPy):
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+ # work around pytest.raises not working
+ try:
+ parser.feed_data(text)
+ except aiohttp.http_exceptions.InvalidHeader:
+ pass
+ except aiohttp.http_exceptions.BadHttpMessage:
+ pass
+
+
+def test_empty_header_name(parser) -> None:
+ if not isinstance(response, HttpResponseParserPy):
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+ text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
+ with pytest.raises(http_exceptions.BadHttpMessage):
+ parser.feed_data(text)
+
+
def test_invalid_header(parser) -> None:
text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
with pytest.raises(http_exceptions.BadHttpMessage):
@@ -387,11 +447,27 @@ def test_cve_2023_37276(parser: Any) -> None:
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+@pytest.mark.parametrize(
+ "rfc9110_5_6_2_token_delim",
+ r'"(),/:;<=>?@[\]{}',
+)
+def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
+ text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode()
+ expectation = pytest.raises(http_exceptions.BadHttpMessage)
+ if rfc9110_5_6_2_token_delim == ":":
+ # Inserting colon into header just splits name/value earlier.
+ expectation = nullcontext()
+ with expectation:
+ parser.feed_data(text)
+
+
@pytest.mark.parametrize(
"hdr",
(
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
"Content-Length: +256",
+ "Content-Length: \N{superscript one}",
+ "Content-Length: \N{mathematical double-struck digit one}",
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
"Bar: abc\ndef",
"Baz: abc\x00def",
@@ -563,6 +639,42 @@ def test_http_request_bad_status_line(parser) -> None:
parser.feed_data(text)
+_num: Dict[bytes, str] = {
+ # dangerous: accepted by Python int()
+ # unicodedata.category("\U0001D7D9") == 'Nd'
+ "\N{mathematical double-struck digit one}".encode(): "utf8digit",
+ # only added for interop tests, refused by Python int()
+ # unicodedata.category("\U000000B9") == 'No'
+ "\N{superscript one}".encode(): "utf8number",
+ "\N{superscript one}".encode("latin-1"): "latin1number",
+}
+
+
+@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
+def test_http_request_bad_status_line_number(
+ parser: Any, nonascii_digit: bytes
+) -> None:
+ text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
+ if isinstance(parser, HttpRequestParserPy):
+ exception = http_exceptions.BadStatusLine
+ else:
+ exception = http_exceptions.BadHttpMessage
+ with pytest.raises(exception):
+ parser.feed_data(text)
+
+
+def test_http_request_bad_status_line_separator(parser: Any) -> None:
+ # single code point, old, multibyte NFKC, multibyte NFKD
+ utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
+ text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
+ if isinstance(parser, HttpRequestParserPy):
+ exception = http_exceptions.BadStatusLine
+ else:
+ exception = http_exceptions.BadHttpMessage
+ with pytest.raises(exception):
+ parser.feed_data(text)
+
+
def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
with pytest.raises(http_exceptions.BadStatusLine):
@@ -584,6 +696,31 @@ def test_http_request_upgrade(parser: Any) -> None:
assert tail == b"some raw data"
+def test_http_request_parser_utf8_request_line(parser) -> None:
+ if not isinstance(response, HttpResponseParserPy):
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
+ messages, upgrade, tail = parser.feed_data(
+ # note the truncated unicode sequence
+ b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
+ # for easier grep: ASCII 0xA0 more commonly known as non-breaking space
+ # note the leading and trailing spaces
+ "sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
+ "\r\n\r\n".encode()
+ )
+ msg = messages[0][0]
+
+ assert msg.method == "GET"
+ assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
+ assert msg.version == (1, 1)
+ assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
+ assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
+ assert not msg.should_close
+ assert msg.compression is None
+ assert not msg.upgrade
+ assert not msg.chunked
+ assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path
+
+
def test_http_request_parser_utf8(parser) -> None:
text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
messages, upgrade, tail = parser.feed_data(text)
@@ -633,9 +770,15 @@ def test_http_request_parser_two_slashes(parser) -> None:
assert not msg.chunked
-def test_http_request_parser_bad_method(parser) -> None:
+@pytest.mark.parametrize(
+ "rfc9110_5_6_2_token_delim",
+ [bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
+)
+def test_http_request_parser_bad_method(
+ parser, rfc9110_5_6_2_token_delim: bytes
+) -> None:
with pytest.raises(http_exceptions.BadStatusLine):
- parser.feed_data(b'=":<G>(e),[T];?" /get HTTP/1.1\r\n\r\n')
+ parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')
def test_http_request_parser_bad_version(parser) -> None:
@@ -751,6 +894,12 @@ def test_http_response_parser_code_not_int(response) -> None:
response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")
+@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
+def test_http_response_parser_code_not_ascii(response, nonascii_digit: bytes) -> None:
+ with pytest.raises(http_exceptions.BadStatusLine):
+ response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")
+
+
def test_http_request_chunked_payload(parser) -> None:
text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
msg, payload = parser.feed_data(text)[0][0]