diff --git a/ext/lexbor/lexbor/encoding/decode.c b/ext/lexbor/lexbor/encoding/decode.c index 3e48971e3657..5010e4ed9d04 100644 --- a/ext/lexbor/lexbor/encoding/decode.c +++ b/ext/lexbor/lexbor/encoding/decode.c @@ -2907,42 +2907,51 @@ lxb_encoding_decode_valid_utf_8_single(const lxb_char_t **data, else if ((*p & 0xe0) == 0xc0) { /* 110xxxxx 10xxxxxx */ - if (end - p < 2) { - *data = end; + if (*p < 0xC2 || end - p < 2 || (p[1] & 0xC0) != 0x80) { + (*data) = (end - p < 2) ? end : *data + 1; return LXB_ENCODING_DECODE_ERROR; } - cp = (p[0] ^ (0xC0 & p[0])) << 6; - cp |= (p[1] ^ (0x80 & p[1])); + cp = (*p & 0x1F) << 6; + cp |= (p[1] & 0x3F); (*data) += 2; } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx 10xxxxxx 10xxxxxx */ - if (end - p < 3) { - *data = end; + if (end - p < 3 + || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 + || (*p == 0xE0 && p[1] < 0xA0) + || (*p == 0xED && p[1] > 0x9F)) + { + (*data) = (end - p < 3) ? end : *data + 1; return LXB_ENCODING_DECODE_ERROR; } - cp = (p[0] ^ (0xE0 & p[0])) << 12; - cp |= (p[1] ^ (0x80 & p[1])) << 6; - cp |= (p[2] ^ (0x80 & p[2])); + cp = (*p & 0x0F) << 12; + cp |= (p[1] & 0x3F) << 6; + cp |= (p[2] & 0x3F); (*data) += 3; } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - if (end - p < 4) { - *data = end; + if (*p > 0xF4 || end - p < 4 + || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 + || (p[3] & 0xC0) != 0x80 + || (*p == 0xF0 && p[1] < 0x90) + || (*p == 0xF4 && p[1] > 0x8F)) + { + (*data) = (end - p < 4) ? end : *data + 1; return LXB_ENCODING_DECODE_ERROR; } - cp = (p[0] ^ (0xF0 & p[0])) << 18; - cp |= (p[1] ^ (0x80 & p[1])) << 12; - cp |= (p[2] ^ (0x80 & p[2])) << 6; - cp |= (p[3] ^ (0x80 & p[3])); + cp = (*p & 0x07) << 18; + cp |= (p[1] & 0x3F) << 12; + cp |= (p[2] & 0x3F) << 6; + cp |= (p[3] & 0x3F); (*data) += 4; } diff --git a/ext/lexbor/patches/0007-Validate-UTF-8-in-decode_valid_utf_8_single.patch b/ext/lexbor/patches/0007-Validate-UTF-8-in-decode_valid_utf_8_single.patch new file mode 100644 index 000000000000..a33fd738fd42 --- /dev/null +++ b/ext/lexbor/patches/0007-Validate-UTF-8-in-decode_valid_utf_8_single.patch @@ -0,0 +1,71 @@ +diff --git a/ext/lexbor/lexbor/encoding/decode.c b/ext/lexbor/lexbor/encoding/decode.c +index 3e48971e365..5010e4ed9d0 100644 +--- a/ext/lexbor/lexbor/encoding/decode.c ++++ b/ext/lexbor/lexbor/encoding/decode.c +@@ -2907,42 +2907,51 @@ lxb_encoding_decode_valid_utf_8_single(const lxb_char_t **data, + else if ((*p & 0xe0) == 0xc0) { + /* 110xxxxx 10xxxxxx */ + +- if (end - p < 2) { +- *data = end; ++ if (*p < 0xC2 || end - p < 2 || (p[1] & 0xC0) != 0x80) { ++ (*data) = (end - p < 2) ? end : *data + 1; + return LXB_ENCODING_DECODE_ERROR; + } + +- cp = (p[0] ^ (0xC0 & p[0])) << 6; +- cp |= (p[1] ^ (0x80 & p[1])); ++ cp = (*p & 0x1F) << 6; ++ cp |= (p[1] & 0x3F); + + (*data) += 2; + } + else if ((*p & 0xf0) == 0xe0) { + /* 1110xxxx 10xxxxxx 10xxxxxx */ + +- if (end - p < 3) { +- *data = end; ++ if (end - p < 3 ++ || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 ++ || (*p == 0xE0 && p[1] < 0xA0) ++ || (*p == 0xED && p[1] > 0x9F)) ++ { ++ (*data) = (end - p < 3) ? end : *data + 1; + return LXB_ENCODING_DECODE_ERROR; + } + +- cp = (p[0] ^ (0xE0 & p[0])) << 12; +- cp |= (p[1] ^ (0x80 & p[1])) << 6; +- cp |= (p[2] ^ (0x80 & p[2])); ++ cp = (*p & 0x0F) << 12; ++ cp |= (p[1] & 0x3F) << 6; ++ cp |= (p[2] & 0x3F); + + (*data) += 3; + } + else if ((*p & 0xf8) == 0xf0) { + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + +- if (end - p < 4) { +- *data = end; ++ if (*p > 0xF4 || end - p < 4 ++ || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 ++ || (p[3] & 0xC0) != 0x80 ++ || (*p == 0xF0 && p[1] < 0x90) ++ || (*p == 0xF4 && p[1] > 0x8F)) ++ { ++ (*data) = (end - p < 4) ? end : *data + 1; + return LXB_ENCODING_DECODE_ERROR; + } + +- cp = (p[0] ^ (0xF0 & p[0])) << 18; +- cp |= (p[1] ^ (0x80 & p[1])) << 12; +- cp |= (p[2] ^ (0x80 & p[2])) << 6; +- cp |= (p[3] ^ (0x80 & p[3])); ++ cp = (*p & 0x07) << 18; ++ cp |= (p[1] & 0x3F) << 12; ++ cp |= (p[2] & 0x3F) << 6; ++ cp |= (p[3] & 0x3F); + + (*data) += 4; + } diff --git a/ext/uri/tests/whatwg/parsing/gh21734_overlong_utf8.phpt b/ext/uri/tests/whatwg/parsing/gh21734_overlong_utf8.phpt new file mode 100644 index 000000000000..812292071f8b --- /dev/null +++ b/ext/uri/tests/whatwg/parsing/gh21734_overlong_utf8.phpt @@ -0,0 +1,43 @@ +--TEST-- +GH-21734: WHATWG URL parser rejects overlong UTF-8 and invalid continuation bytes +--FILE-- +getPath()); + +// Invalid continuation byte: \xC0 followed by ASCII 'A' (not 0x80-0xBF) +$url = Uri\WhatWg\Url::parse("http://example.com/\xC0\x41"); +var_dump($url->getPath()); + +// Surrogate \xED\xA0\x80 = U+D800 (must be rejected) +$url = Uri\WhatWg\Url::parse("http://example.com/\xED\xA0\x80"); +var_dump($url->getPath()); + +// Valid UTF-8 still works +$url = Uri\WhatWg\Url::parse("http://example.com/café"); +var_dump($url->getPath()); + +// Valid IDN hostname still works +$url = Uri\WhatWg\Url::parse("http://münchen.de/"); +var_dump($url->getAsciiHost()); + +?> +--EXPECT-- +NULL +NULL +string(9) "/a%C0%AFb" +string(5) "/%C0A" +string(10) "/%ED%A0%80" +string(10) "/caf%C3%A9" +string(17) "xn--mnchen-3ya.de"