344 lines
13 KiB
Diff
344 lines
13 KiB
Diff
From: Sam Bull <git@sambull.org>
|
|
Date: Sun, 28 Jan 2024 17:09:58 +0000
|
|
Subject: Improve validation in HTTP parser (#8074) (#8078)
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset="utf-8"
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Co-authored-by: Paul J. Dorn <pajod@users.noreply.github.com>
|
|
Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко)
|
|
<sviat@redhat.com>
|
|
(cherry picked from commit 33ccdfb0a12690af5bb49bda2319ec0907fa7827)
|
|
---
|
|
CONTRIBUTORS.txt | 1 +
|
|
aiohttp/http_parser.py | 28 +++++----
|
|
tests/test_http_parser.py | 155 +++++++++++++++++++++++++++++++++++++++++++++-
|
|
3 files changed, 169 insertions(+), 15 deletions(-)
|
|
|
|
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
|
|
index ad63ce9..45fee0e 100644
|
|
--- a/CONTRIBUTORS.txt
|
|
+++ b/CONTRIBUTORS.txt
|
|
@@ -215,6 +215,7 @@ Panagiotis Kolokotronis
|
|
Pankaj Pandey
|
|
Pau Freixes
|
|
Paul Colomiets
|
|
+Paul J. Dorn
|
|
Paulius Šileikis
|
|
Paulus Schoutsen
|
|
Pavel Kamaev
|
|
diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py
|
|
index 8e5e816..2d58947 100644
|
|
--- a/aiohttp/http_parser.py
|
|
+++ b/aiohttp/http_parser.py
|
|
@@ -53,11 +53,10 @@ ASCIISET = set(string.printable)
|
|
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
|
|
# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
|
|
# token = 1*tchar
|
|
-METHRE = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
|
|
-VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
|
|
-HDRRE: Final[Pattern[bytes]] = re.compile(
|
|
- rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
|
|
-)
|
|
+_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
|
|
+TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
|
|
+VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
|
|
+DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
|
|
|
|
RawRequestMessage = collections.namedtuple(
|
|
"RawRequestMessage",
|
|
@@ -122,6 +121,7 @@ class HeadersParser:
|
|
self, lines: List[bytes]
|
|
) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
|
|
headers = CIMultiDict() # type: CIMultiDict[str]
|
|
+ # note: "raw" does not mean inclusion of OWS before/after the field value
|
|
raw_headers = []
|
|
|
|
lines_idx = 1
|
|
@@ -135,13 +135,14 @@ class HeadersParser:
|
|
except ValueError:
|
|
raise InvalidHeader(line) from None
|
|
|
|
+ if len(bname) == 0:
|
|
+ raise InvalidHeader(bname)
|
|
+
|
|
# https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
|
|
if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"}
|
|
raise InvalidHeader(line)
|
|
|
|
bvalue = bvalue.lstrip(b" \t")
|
|
- if HDRRE.search(bname):
|
|
- raise InvalidHeader(bname)
|
|
if len(bname) > self.max_field_size:
|
|
raise LineTooLong(
|
|
"request header name {}".format(
|
|
@@ -150,6 +151,9 @@ class HeadersParser:
|
|
str(self.max_field_size),
|
|
str(len(bname)),
|
|
)
|
|
+ name = bname.decode("utf-8", "surrogateescape")
|
|
+ if not TOKENRE.fullmatch(name):
|
|
+ raise InvalidHeader(bname)
|
|
|
|
header_length = len(bvalue)
|
|
|
|
@@ -196,7 +200,6 @@ class HeadersParser:
|
|
)
|
|
|
|
bvalue = bvalue.strip(b" \t")
|
|
- name = bname.decode("utf-8", "surrogateescape")
|
|
value = bvalue.decode("utf-8", "surrogateescape")
|
|
|
|
# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
|
|
@@ -317,7 +320,8 @@ class HttpParser(abc.ABC):
|
|
if length is not None:
|
|
# Shouldn't allow +/- or other number formats.
|
|
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
|
|
- if not length.strip(" \t").isdigit():
|
|
+ # msg.headers is already stripped of leading/trailing wsp
|
|
+ if not DIGITS.fullmatch(length):
|
|
raise InvalidHeader(CONTENT_LENGTH)
|
|
|
|
length = int(length)
|
|
@@ -538,7 +542,7 @@ class HttpRequestParser(HttpParser):
|
|
path_part, _question_mark_separator, qs_part = path_part.partition("?")
|
|
|
|
# method
|
|
- if not METHRE.match(method):
|
|
+ if not TOKENRE.fullmatch(method):
|
|
raise BadStatusLine(method)
|
|
|
|
# version
|
|
@@ -615,8 +619,8 @@ class HttpResponseParser(HttpParser):
|
|
raise BadStatusLine(line)
|
|
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
|
|
|
|
- # The status code is a three-digit number
|
|
- if len(status) != 3 or not status.isdigit():
|
|
+ # The status code is a three-digit ASCII number, no padding
|
|
+ if len(status) != 3 or not DIGITS.fullmatch(status):
|
|
raise BadStatusLine(line)
|
|
status_i = int(status)
|
|
|
|
diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
|
|
index 9d65b2f..6073371 100644
|
|
--- a/tests/test_http_parser.py
|
|
+++ b/tests/test_http_parser.py
|
|
@@ -3,7 +3,8 @@
|
|
import asyncio
|
|
from unittest import mock
|
|
from urllib.parse import quote
|
|
-from typing import Any
|
|
+from contextlib import nullcontext
|
|
+from typing import Any, Dict, List
|
|
|
|
import pytest
|
|
from multidict import CIMultiDict
|
|
@@ -102,6 +103,20 @@ test2: data\r
|
|
assert not msg.upgrade
|
|
|
|
|
|
+def test_parse_unusual_request_line(parser) -> None:
|
|
+ if not isinstance(response, HttpResponseParserPy):
|
|
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
|
|
+ text = b"#smol //a HTTP/1.3\r\n\r\n"
|
|
+ messages, upgrade, tail = parser.feed_data(text)
|
|
+ assert len(messages) == 1
|
|
+ msg, _ = messages[0]
|
|
+ assert msg.compression is None
|
|
+ assert not msg.upgrade
|
|
+ assert msg.method == "#smol"
|
|
+ assert msg.path == "//a"
|
|
+ assert msg.version == (1, 3)
|
|
+
|
|
+
|
|
def test_parse(parser) -> None:
|
|
text = b"GET /test HTTP/1.1\r\n\r\n"
|
|
messages, upgrade, tail = parser.feed_data(text)
|
|
@@ -365,6 +380,51 @@ def test_headers_content_length_err_2(parser) -> None:
|
|
parser.feed_data(text)
|
|
|
|
|
|
+_pad: Dict[bytes, str] = {
|
|
+ b"": "empty",
|
|
+ # not a typo. Python likes triple zero
|
|
+ b"\000": "NUL",
|
|
+ b" ": "SP",
|
|
+ b" ": "SPSP",
|
|
+ # not a typo: both 0xa0 and 0x0a in case of 8-bit fun
|
|
+ b"\n": "LF",
|
|
+ b"\xa0": "NBSP",
|
|
+ b"\t ": "TABSP",
|
|
+}
|
|
+
|
|
+
|
|
+@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
|
|
+@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
|
|
+@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
|
|
+def test_invalid_header_spacing(parser, pad1: bytes, pad2: bytes, hdr: bytes) -> None:
|
|
+ text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
|
|
+ if isinstance(parser, HttpRequestParserPy):
|
|
+ expectation = pytest.raises(http_exceptions.InvalidHeader)
|
|
+ else:
|
|
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
|
|
+ if pad1 == pad2 == b"" and hdr != b"":
|
|
+ # one entry in param matrix is correct: non-empty name, not padded
|
|
+ expectation = nullcontext()
|
|
+ if pad1 == pad2 == hdr == b"":
|
|
+ if not isinstance(response, HttpResponseParserPy):
|
|
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
|
|
+ # work around pytest.raises not working
|
|
+ try:
|
|
+ parser.feed_data(text)
|
|
+ except aiohttp.http_exceptions.InvalidHeader:
|
|
+ pass
|
|
+ except aiohttp.http_exceptions.BadHttpMessage:
|
|
+ pass
|
|
+
|
|
+
|
|
+def test_empty_header_name(parser) -> None:
|
|
+ if not isinstance(response, HttpResponseParserPy):
|
|
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
|
|
+ text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
|
|
+ with pytest.raises(http_exceptions.BadHttpMessage):
|
|
+ parser.feed_data(text)
|
|
+
|
|
+
|
|
def test_invalid_header(parser) -> None:
|
|
text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
|
|
with pytest.raises(http_exceptions.BadHttpMessage):
|
|
@@ -387,11 +447,27 @@ def test_cve_2023_37276(parser: Any) -> None:
|
|
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
|
|
|
|
|
|
+@pytest.mark.parametrize(
|
|
+ "rfc9110_5_6_2_token_delim",
|
|
+ r'"(),/:;<=>?@[\]{}',
|
|
+)
|
|
+def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
|
|
+ text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode()
|
|
+ expectation = pytest.raises(http_exceptions.BadHttpMessage)
|
|
+ if rfc9110_5_6_2_token_delim == ":":
|
|
+ # Inserting colon into header just splits name/value earlier.
|
|
+ expectation = nullcontext()
|
|
+ with expectation:
|
|
+ parser.feed_data(text)
|
|
+
|
|
+
|
|
@pytest.mark.parametrize(
|
|
"hdr",
|
|
(
|
|
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
|
|
"Content-Length: +256",
|
|
+ "Content-Length: \N{superscript one}",
|
|
+ "Content-Length: \N{mathematical double-struck digit one}",
|
|
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
|
|
"Bar: abc\ndef",
|
|
"Baz: abc\x00def",
|
|
@@ -563,6 +639,42 @@ def test_http_request_bad_status_line(parser) -> None:
|
|
parser.feed_data(text)
|
|
|
|
|
|
+_num: Dict[bytes, str] = {
|
|
+ # dangerous: accepted by Python int()
|
|
+ # unicodedata.category("\U0001D7D9") == 'Nd'
|
|
+ "\N{mathematical double-struck digit one}".encode(): "utf8digit",
|
|
+ # only added for interop tests, refused by Python int()
|
|
+ # unicodedata.category("\U000000B9") == 'No'
|
|
+ "\N{superscript one}".encode(): "utf8number",
|
|
+ "\N{superscript one}".encode("latin-1"): "latin1number",
|
|
+}
|
|
+
|
|
+
|
|
+@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
|
|
+def test_http_request_bad_status_line_number(
|
|
+ parser: Any, nonascii_digit: bytes
|
|
+) -> None:
|
|
+ text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
|
|
+ if isinstance(parser, HttpRequestParserPy):
|
|
+ exception = http_exceptions.BadStatusLine
|
|
+ else:
|
|
+ exception = http_exceptions.BadHttpMessage
|
|
+ with pytest.raises(exception):
|
|
+ parser.feed_data(text)
|
|
+
|
|
+
|
|
+def test_http_request_bad_status_line_separator(parser: Any) -> None:
|
|
+ # single code point, old, multibyte NFKC, multibyte NFKD
|
|
+ utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
|
|
+ text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
|
|
+ if isinstance(parser, HttpRequestParserPy):
|
|
+ exception = http_exceptions.BadStatusLine
|
|
+ else:
|
|
+ exception = http_exceptions.BadHttpMessage
|
|
+ with pytest.raises(exception):
|
|
+ parser.feed_data(text)
|
|
+
|
|
+
|
|
def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
|
|
text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
|
|
with pytest.raises(http_exceptions.BadStatusLine):
|
|
@@ -584,6 +696,31 @@ def test_http_request_upgrade(parser: Any) -> None:
|
|
assert tail == b"some raw data"
|
|
|
|
|
|
+def test_http_request_parser_utf8_request_line(parser) -> None:
|
|
+ if not isinstance(response, HttpResponseParserPy):
|
|
+ pytest.xfail("Regression test for Py parser. May match C behaviour later.")
|
|
+ messages, upgrade, tail = parser.feed_data(
|
|
+ # note the truncated unicode sequence
|
|
+ b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
|
|
+ # for easier grep: ASCII 0xA0 more commonly known as non-breaking space
|
|
+ # note the leading and trailing spaces
|
|
+ "sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
|
|
+ "\r\n\r\n".encode()
|
|
+ )
|
|
+ msg = messages[0][0]
|
|
+
|
|
+ assert msg.method == "GET"
|
|
+ assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
|
|
+ assert msg.version == (1, 1)
|
|
+ assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
|
|
+ assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
|
|
+ assert not msg.should_close
|
|
+ assert msg.compression is None
|
|
+ assert not msg.upgrade
|
|
+ assert not msg.chunked
|
|
+ assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path
|
|
+
|
|
+
|
|
def test_http_request_parser_utf8(parser) -> None:
|
|
text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
|
|
messages, upgrade, tail = parser.feed_data(text)
|
|
@@ -633,9 +770,15 @@ def test_http_request_parser_two_slashes(parser) -> None:
|
|
assert not msg.chunked
|
|
|
|
|
|
-def test_http_request_parser_bad_method(parser) -> None:
|
|
+@pytest.mark.parametrize(
|
|
+ "rfc9110_5_6_2_token_delim",
|
|
+ [bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
|
|
+)
|
|
+def test_http_request_parser_bad_method(
|
|
+ parser, rfc9110_5_6_2_token_delim: bytes
|
|
+) -> None:
|
|
with pytest.raises(http_exceptions.BadStatusLine):
|
|
- parser.feed_data(b'=":<G>(e),[T];?" /get HTTP/1.1\r\n\r\n')
|
|
+ parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')
|
|
|
|
|
|
def test_http_request_parser_bad_version(parser) -> None:
|
|
@@ -751,6 +894,12 @@ def test_http_response_parser_code_not_int(response) -> None:
|
|
response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")
|
|
|
|
|
|
+@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
|
|
+def test_http_response_parser_code_not_ascii(response, nonascii_digit: bytes) -> None:
|
|
+ with pytest.raises(http_exceptions.BadStatusLine):
|
|
+ response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")
|
|
+
|
|
+
|
|
def test_http_request_chunked_payload(parser) -> None:
|
|
text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
|
|
msg, payload = parser.feed_data(text)[0][0]
|