Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 24 additions & 15 deletions ext/lexbor/lexbor/encoding/decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2907,42 +2907,51 @@ lxb_encoding_decode_valid_utf_8_single(const lxb_char_t **data,
else if ((*p & 0xe0) == 0xc0) {
/* 110xxxxx 10xxxxxx */

if (end - p < 2) {
*data = end;
if (*p < 0xC2 || end - p < 2 || (p[1] & 0xC0) != 0x80) {
(*data) = (end - p < 2) ? end : *data + 1;
return LXB_ENCODING_DECODE_ERROR;
}

cp = (p[0] ^ (0xC0 & p[0])) << 6;
cp |= (p[1] ^ (0x80 & p[1]));
cp = (*p & 0x1F) << 6;
cp |= (p[1] & 0x3F);

(*data) += 2;
}
else if ((*p & 0xf0) == 0xe0) {
/* 1110xxxx 10xxxxxx 10xxxxxx */

if (end - p < 3) {
*data = end;
if (end - p < 3
|| (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
|| (*p == 0xE0 && p[1] < 0xA0)
|| (*p == 0xED && p[1] > 0x9F))
{
(*data) = (end - p < 3) ? end : *data + 1;
return LXB_ENCODING_DECODE_ERROR;
}

cp = (p[0] ^ (0xE0 & p[0])) << 12;
cp |= (p[1] ^ (0x80 & p[1])) << 6;
cp |= (p[2] ^ (0x80 & p[2]));
cp = (*p & 0x0F) << 12;
cp |= (p[1] & 0x3F) << 6;
cp |= (p[2] & 0x3F);

(*data) += 3;
}
else if ((*p & 0xf8) == 0xf0) {
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

if (end - p < 4) {
*data = end;
if (*p > 0xF4 || end - p < 4
|| (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
|| (p[3] & 0xC0) != 0x80
|| (*p == 0xF0 && p[1] < 0x90)
|| (*p == 0xF4 && p[1] > 0x8F))
{
(*data) = (end - p < 4) ? end : *data + 1;
return LXB_ENCODING_DECODE_ERROR;
}

cp = (p[0] ^ (0xF0 & p[0])) << 18;
cp |= (p[1] ^ (0x80 & p[1])) << 12;
cp |= (p[2] ^ (0x80 & p[2])) << 6;
cp |= (p[3] ^ (0x80 & p[3]));
cp = (*p & 0x07) << 18;
cp |= (p[1] & 0x3F) << 12;
cp |= (p[2] & 0x3F) << 6;
cp |= (p[3] & 0x3F);

(*data) += 4;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
diff --git a/ext/lexbor/lexbor/encoding/decode.c b/ext/lexbor/lexbor/encoding/decode.c
index 3e48971e365..5010e4ed9d0 100644
--- a/ext/lexbor/lexbor/encoding/decode.c
+++ b/ext/lexbor/lexbor/encoding/decode.c
@@ -2907,42 +2907,51 @@ lxb_encoding_decode_valid_utf_8_single(const lxb_char_t **data,
else if ((*p & 0xe0) == 0xc0) {
/* 110xxxxx 10xxxxxx */

- if (end - p < 2) {
- *data = end;
+ if (*p < 0xC2 || end - p < 2 || (p[1] & 0xC0) != 0x80) {
+ (*data) = (end - p < 2) ? end : *data + 1;
return LXB_ENCODING_DECODE_ERROR;
}

- cp = (p[0] ^ (0xC0 & p[0])) << 6;
- cp |= (p[1] ^ (0x80 & p[1]));
+ cp = (*p & 0x1F) << 6;
+ cp |= (p[1] & 0x3F);

(*data) += 2;
}
else if ((*p & 0xf0) == 0xe0) {
/* 1110xxxx 10xxxxxx 10xxxxxx */

- if (end - p < 3) {
- *data = end;
+ if (end - p < 3
+ || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
+ || (*p == 0xE0 && p[1] < 0xA0)
+ || (*p == 0xED && p[1] > 0x9F))
+ {
+ (*data) = (end - p < 3) ? end : *data + 1;
return LXB_ENCODING_DECODE_ERROR;
}

- cp = (p[0] ^ (0xE0 & p[0])) << 12;
- cp |= (p[1] ^ (0x80 & p[1])) << 6;
- cp |= (p[2] ^ (0x80 & p[2]));
+ cp = (*p & 0x0F) << 12;
+ cp |= (p[1] & 0x3F) << 6;
+ cp |= (p[2] & 0x3F);

(*data) += 3;
}
else if ((*p & 0xf8) == 0xf0) {
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

- if (end - p < 4) {
- *data = end;
+ if (*p > 0xF4 || end - p < 4
+ || (p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80
+ || (p[3] & 0xC0) != 0x80
+ || (*p == 0xF0 && p[1] < 0x90)
+ || (*p == 0xF4 && p[1] > 0x8F))
+ {
+ (*data) = (end - p < 4) ? end : *data + 1;
return LXB_ENCODING_DECODE_ERROR;
}

- cp = (p[0] ^ (0xF0 & p[0])) << 18;
- cp |= (p[1] ^ (0x80 & p[1])) << 12;
- cp |= (p[2] ^ (0x80 & p[2])) << 6;
- cp |= (p[3] ^ (0x80 & p[3]));
+ cp = (*p & 0x07) << 18;
+ cp |= (p[1] & 0x3F) << 12;
+ cp |= (p[2] & 0x3F) << 6;
+ cp |= (p[3] & 0x3F);

(*data) += 4;
}
43 changes: 43 additions & 0 deletions ext/uri/tests/whatwg/parsing/gh21734_overlong_utf8.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
--TEST--
GH-21734: WHATWG URL parser rejects overlong UTF-8 and invalid continuation bytes
--FILE--
<?php

// Regression script for GH-21734. Each case parses one URL containing either
// malformed or well-formed UTF-8 and dumps the result; the dumps are compared,
// in order, against the --EXPECT-- section of this test.

// Overlong hostname: %C1%A5 = overlong 'e', %C1%B6 = overlong 'v', etc.
// A decoder that accepted 2-byte overlong forms would turn the four
// percent-escaped pairs into 'e', 'v', 'i', 'l' (0x65, 0x76, 0x69, 0x6C).
// Must fail to parse (browsers reject at the UTF-8 decode step)
// Expected dump: NULL.
$url = Uri\WhatWg\Url::parse("http://%C1%A5%C1%B6%C1%A9%C1%AC.com/");
var_dump($url);

// Raw overlong 'a' in hostname
// Same idea as above, but with raw bytes instead of percent-escapes:
// \xC1\xA1 would decode to 0x61 if overlong forms were accepted.
// Expected dump: NULL.
$url = Uri\WhatWg\Url::parse("http://\xC1\xA1.com/");
var_dump($url);

// Overlong '/' in path: raw bytes \xC0\xAF should not produce a path separator
// (\xC0\xAF would decode to 0x2F if overlong forms were accepted). The
// expected path keeps both bytes, percent-encoded, inside a single segment.
$url = Uri\WhatWg\Url::parse("http://example.com/a\xC0\xAFb");
var_dump($url->getPath());

// Invalid continuation byte: \xC0 followed by ASCII 'A' (not 0x80-0xBF)
// The expected path percent-encodes \xC0 and leaves the 'A' as a literal.
$url = Uri\WhatWg\Url::parse("http://example.com/\xC0\x41");
var_dump($url->getPath());

// Surrogate \xED\xA0\x80 = U+D800 (must be rejected)
// Rejected as a code point, not as a URL: the expected path carries the
// three raw bytes, each percent-encoded.
$url = Uri\WhatWg\Url::parse("http://example.com/\xED\xA0\x80");
var_dump($url->getPath());

// Valid UTF-8 still works
// 'é' is the well-formed 2-byte sequence C3 A9 and is expected to come back
// percent-encoded as %C3%A9.
$url = Uri\WhatWg\Url::parse("http://example.com/café");
var_dump($url->getPath());

// Valid IDN hostname still works
// The non-ASCII host is expected to come back in its ASCII (xn--) form.
$url = Uri\WhatWg\Url::parse("http://münchen.de/");
var_dump($url->getAsciiHost());

?>
--EXPECT--
NULL
NULL
string(9) "/a%C0%AFb"
string(5) "/%C0A"
string(10) "/%ED%A0%80"
string(10) "/caf%C3%A9"
string(17) "xn--mnchen-3ya.de"
Loading