From 60e776456a8951f14cbb2ac9924261008b699c47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 21 Apr 2026 17:00:16 +0200 Subject: [PATCH 1/5] Add REGEXP_LIKE() UDF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements MySQL REGEXP_LIKE(expr, pattern [, match_type]) via a new variadic UDF. Introduces a shared regexp_compile() helper that translates MySQL match_type flags to PCRE modifiers and always uses UTF-8 mode, plus regexp_run() (suppresses preg_* warnings) and regexp_fail() (translates preg failures into MySQL-style messages). regexp_compile() rejects empty patterns to match MySQL ERROR 3685 and documents two known limitations of the emulation: collation-blind case-sensitivity defaulting and the always-on /u modifier diverging from the legacy REGEXP operator on binary data. The match_type loop accepts MySQL's c/i/m/n/u flags (last of the case flags wins; "u" — Unix-only line endings — is a no-op since PCRE's default already matches that semantics). Unknown flags raise "Invalid match_type flag: X.". Tests cover: data-driven match cases, NULL propagation, invalid patterns, multi-flag combinations, UTF-8 input errors via the PREG_BAD_UTF8_ERROR branch, the backtrack-limit branch, and that the legacy REGEXP operator still works alongside REGEXP_LIKE. 
--- ...s-wp-sqlite-pdo-user-defined-functions.php | 153 ++++++++++++++++++ .../tests/WP_SQLite_Driver_Tests.php | 113 +++++++++++++ 2 files changed, 266 insertions(+) diff --git a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php index b72d787f..574fe9e5 100644 --- a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php +++ b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php @@ -71,6 +71,7 @@ public static function register_for( $pdo ): self { 'isnull' => 'isnull', 'if' => '_if', 'regexp' => 'regexp', + 'regexp_like' => 'regexp_like', 'field' => 'field', 'log' => 'log', 'least' => 'least', @@ -536,6 +537,32 @@ public function regexp( $pattern, $field ) { return preg_match( $pattern, $field ); } + /** + * Method to emulate MySQL REGEXP_LIKE() function. + * + * @param string|null $expr The subject string. + * @param string|null $pattern The regex pattern. + * @param string|null $match_type Optional MySQL match_type flags. + * + * @throws Exception If the pattern is not a valid regular expression. + * @return int|null 1 on match, 0 on no match, NULL if any argument is NULL. + */ + public function regexp_like( $expr, $pattern, $match_type = '' ) { + if ( null === $expr || null === $pattern || null === $match_type ) { + return null; + } + $compiled = $this->regexp_compile( $pattern, $match_type ); + $result = $this->regexp_run( + function () use ( $compiled, $expr ) { + return preg_match( $compiled, $expr ); + } + ); + if ( false === $result ) { + $this->regexp_fail( $pattern ); + } + return $result; + } + /** * Method to emulate MySQL FIELD() function. * @@ -896,4 +923,130 @@ public function _helper_like_to_glob_pattern( $pattern ) { return $pattern; } + + /** + * Compile a MySQL-style regex into a PCRE pattern string. 
+ * + * Translates MySQL match_type flags (c/i/m/n/u) to PCRE modifiers and always + * appends the u (UTF-8) modifier. Case-insensitive is the default, matching + * the existing REGEXP operator. + * + * MySQL's native engine is ICU; we use PHP's PCRE. The two diverge in a + * few corners: + * + * - Some Unicode property shorthands and POSIX class spellings differ. + * - PCRE accepts both `(?<name>...)` and `(?P<name>...)`; MySQL accepts + * only the former and errors on the latter. + * - MySQL's `u` match_type flag ("Unix-only line endings") narrows the + * meaning of `^`, `$`, and `.` to just "\n". PCRE's default line + * handling already behaves this way, so the flag is accepted but has + * no effect; it is MySQL's default mode (without `u`) that is broader + * and cannot be fully emulated through the `m` modifier alone. + * + * Known limitations of this emulation: + * + * - The default (case-insensitive) is correct for the usual + * `utf8mb4_0900_ai_ci` collation; callers that rely on a `_bin` or + * `_cs` collation must pass an explicit `c` match_type because this + * helper has no access to the session collation. + * - The `u` (UTF-8) PCRE modifier is always applied. Binary data with + * invalid UTF-8 bytes that matches fine under the legacy `REGEXP` + * operator raises "Invalid UTF-8 data in regular expression input." + * when routed through REGEXP_LIKE / _REPLACE / _SUBSTR / _INSTR. + * + * @param string $pattern The MySQL regex pattern. + * @param string $match_type MySQL match_type flag string. + * + * @throws Exception If the pattern is empty or the match_type string + * contains an unrecognized flag. + * @return string PCRE-ready pattern with delimiter and modifiers. + */ + private function regexp_compile( $pattern, $match_type ) { + if ( '' === (string) $pattern ) { + throw new Exception( 'Illegal argument to a regular expression.' 
); + } + $match_type = (string) $match_type; + $case_sensitive = false; + $multiline = false; + $dotall = false; + $len = strlen( $match_type ); + for ( $i = 0; $i < $len; $i++ ) { + $flag = $match_type[ $i ]; + if ( 'c' === $flag ) { + $case_sensitive = true; + } elseif ( 'i' === $flag ) { + $case_sensitive = false; + } elseif ( 'm' === $flag ) { + $multiline = true; + } elseif ( 'n' === $flag ) { + $dotall = true; + } elseif ( 'u' === $flag ) { + // Unix-only line endings. PCRE's default matches this already; no-op. + continue; + } else { + throw new Exception( "Invalid match_type flag: $flag." ); + } + } + + $modifiers = 'u'; + if ( ! $case_sensitive ) { + $modifiers .= 'i'; + } + if ( $multiline ) { + $modifiers .= 'm'; + } + if ( $dotall ) { + $modifiers .= 's'; + } + + return '/' . str_replace( '/', '\\/', $pattern ) . '/' . $modifiers; + } + + /** + * Run a preg_* callable with PHP warnings suppressed. + * + * PHPUnit's strict error handler turns preg_* warnings into ErrorExceptions + * before we can translate them into a MySQL-style error. This wrapper + * suppresses those warnings so the caller can check the result sentinel + * (false for preg_match, null for preg_replace / preg_replace_callback) + * and throw a clean exception. + * + * @param callable $op Preg operation. Must be self-contained. + * + * @return mixed Return value of the callable. + */ + private function regexp_run( $op ) { + set_error_handler( static function () {} ); + try { + return $op(); + } finally { + restore_error_handler(); + } + } + + /** + * Translate a preg_* failure into a caller-friendly exception message. + * + * Uses preg_last_error() to distinguish invalid patterns from runtime + * limit failures and invalid-UTF-8 input. + * + * @param string $pattern The original MySQL regex pattern. + * + * @throws Exception Always. 
+ * @return void + */ + private function regexp_fail( $pattern ) { + $err = preg_last_error(); + if ( + PREG_BACKTRACK_LIMIT_ERROR === $err + || PREG_RECURSION_LIMIT_ERROR === $err + || ( defined( 'PREG_JIT_STACKLIMIT_ERROR' ) && PREG_JIT_STACKLIMIT_ERROR === $err ) + ) { + throw new Exception( 'Regular expression evaluation exceeded internal limits.' ); + } + if ( PREG_BAD_UTF8_ERROR === $err ) { + throw new Exception( 'Invalid UTF-8 data in regular expression input.' ); + } + throw new Exception( 'Invalid regular expression: ' . $pattern . '.' ); + } } diff --git a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php index 4daf7ca7..7159eb91 100644 --- a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php +++ b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php @@ -104,6 +104,119 @@ public static function regexpOperators() { ); } + /** + * @dataProvider regexpLikeCases + */ + public function testRegexpLike( $expr, $pattern, $match_type, $expected ) { + $expr_sql = null === $expr ? 'NULL' : "'" . addslashes( $expr ) . "'"; + $pattern_sql = null === $pattern ? 'NULL' : "'" . addslashes( $pattern ) . "'"; + $args = $expr_sql . ', ' . $pattern_sql; + if ( null !== $match_type ) { + $args .= ", '" . addslashes( $match_type ) . "'"; + } + $this->assertQuery( "SELECT REGEXP_LIKE($args) AS r" ); + $this->assertSame( $expected, $this->engine->get_query_results()[0]->r ); + } + + public static function regexpLikeCases() { + return array( + // Basic matching. + 'match' => array( 'abc', 'abc', null, '1' ), + 'no match' => array( 'xbc', 'abc', null, '0' ), + 'quantifier match' => array( 'abbbbc', 'ab*bc', null, '1' ), + + // Default is case-insensitive (matches existing REGEXP operator behavior). + 'default i' => array( 'ABC', 'abc', null, '1' ), + + // Explicit flags. + 'explicit c' => array( 'ABC', 'abc', 'c', '0' ), + 'explicit i' => array( 'ABC', 'abc', 'i', '1' ), + + // Later flag wins. 
+ 'ci -> c' => array( 'ABC', 'abc', 'ci', '1' ), + 'ic -> i' => array( 'ABC', 'abc', 'ic', '0' ), + + // Multiline. + 'm off: ^ anchored' => array( "abc\ndef", '^def', null, '0' ), + 'm on: ^ per line' => array( "abc\ndef", '^def', 'm', '1' ), + + // Dot matches newline. + "n off: . no \\n" => array( "a\nb", 'a.b', null, '0' ), + "n on: . matches \\n" => array( "a\nb", 'a.b', 'n', '1' ), + + // NULL propagation. + 'null expr' => array( null, 'abc', null, null ), + 'null pattern' => array( 'abc', null, null, null ), + ); + } + + public function testRegexpLikeNullMatchType() { + $this->assertQuery( "SELECT REGEXP_LIKE('abc', 'abc', NULL) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpLikeInvalidFlag() { + $this->assertQueryError( + "SELECT REGEXP_LIKE('abc', 'a', 'x')", + 'Invalid match_type flag: x.' + ); + } + + public function testRegexpLikeInvalidPattern() { + $this->assertQueryError( + "SELECT REGEXP_LIKE('abc', '(abc')", + 'Invalid regular expression: (abc.' + ); + } + + public function testRegexpMatchTypeMultipleFlags() { + // Later-wins across a four-character match_type. 'cimn' ends in 'n', + // so case-insensitive (last of c/i) + multiline + dotall apply. + $this->assertQuery( "SELECT REGEXP_LIKE('ABC', 'abc', 'cimn') AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpMatchTypeUnixFlagNoOp() { + // The 'u' flag is accepted for source compatibility but has no effect + // (PCRE's default already matches MySQL's 'u' semantics). + $this->assertQuery( "SELECT REGEXP_LIKE('abc', 'abc', 'u') AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpMatchTypeEmpty() { + // Empty match_type behaves like the default (case-insensitive). 
+ $this->assertQuery( "SELECT REGEXP_LIKE('ABC', 'abc', '') AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpInvalidUtf8() { + // Raw 0xFF is never valid UTF-8; /u rejects it, which regexp_fail + // translates to a dedicated error. + $this->assertQueryError( + "SELECT REGEXP_LIKE(CAST(X'FF' AS CHAR), 'a')", + 'Invalid UTF-8 data in regular expression input.' + ); + } + + public function testRegexpBacktrackLimit() { + // Classic exponential-backtracking pattern that exceeds PCRE's default + // backtrack limit; exercises the PREG_BACKTRACK_LIMIT_ERROR branch of + // regexp_fail(). + $subject = str_repeat( 'a', 30 ); + $this->assertQueryError( + "SELECT REGEXP_LIKE('$subject', '^(a?){30}a{30}\$')", + 'Regular expression evaluation exceeded internal limits.' + ); + } + + public function testRegexpLegacyOperatorRegression() { + // The legacy REGEXP operator must keep working alongside REGEXP_LIKE. + $this->assertQuery( "SELECT 'abc' REGEXP 'ABC' AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT 'abc' REGEXP 'xyz' AS r" ); + $this->assertSame( '0', $this->engine->get_query_results()[0]->r ); + } + public function testInsertDateNow() { $this->assertQuery( "INSERT INTO _dates (option_name, option_value) VALUES ('first', now());" From 14700b6a3fa7d57c797e81dc66032180cdd2c649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 21 Apr 2026 17:03:35 +0200 Subject: [PATCH 2/5] Add REGEXP_REPLACE() UDF Implements MySQL REGEXP_REPLACE(expr, pattern, replacement [, pos [, occurrence [, match_type]]]) with three new private helpers: - regexp_char_to_byte_offset() converts a 1-based character pos into a byte offset, accepting char_count + 1 for the "start at end" case that MySQL allows for REPLACE / SUBSTR. - regexp_find_matches() walks the subject with preg_match and its offset argument so that lookbehind assertions can see bytes before pos. 
Skips UTF-8 continuation bytes after zero-width matches. - regexp_expand_replacement() implements MySQL/ICU replacement grammar: "$N" backreferences (with "$0" = full match and longest valid digit-prefix wins), "\X" emits X literally, "${N}" is rejected as invalid, and a trailing lone backslash is dropped. Errors mirror MySQL's: "A capture group has an invalid name." (3887) and "Index out of bounds in regular expression search." (3686). REGEXP_REPLACE rebuilds the result by walking collected matches, emitting the in-between bytes verbatim and substituting only the targeted occurrence (or all when occurrence = 0). Negative occurrence is clamped to 1 to match MySQL. Tests cover the data-driven happy path, NULL propagation, every documented backreference form, lookbehind across pos, zero-width matches, the pos = char_count + 1 edge, the negative-occurrence clamp, and the ICU error branches. --- ...s-wp-sqlite-pdo-user-defined-functions.php | 225 ++++++++++++++++++ .../tests/WP_SQLite_Driver_Tests.php | 200 ++++++++++++++++ 2 files changed, 425 insertions(+) diff --git a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php index 574fe9e5..96e5e531 100644 --- a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php +++ b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php @@ -72,6 +72,7 @@ public static function register_for( $pdo ): self { 'if' => '_if', 'regexp' => 'regexp', 'regexp_like' => 'regexp_like', + 'regexp_replace' => 'regexp_replace', 'field' => 'field', 'log' => 'log', 'least' => 'least', @@ -563,6 +564,74 @@ function () use ( $compiled, $expr ) { return $result; } + /** + * Method to emulate MySQL REGEXP_REPLACE() function. + * + * Uses MySQL/ICU replacement grammar: "$N" backreferences ("$0" is the + * full match), "\X" emits X (drops the backslash), "${N}" is rejected. 
+ * Negative `occurrence` is clamped to 1; `pos = char_count + 1` is + * accepted and returns the subject unchanged. + * + * @param string|null $expr Subject string. + * @param string|null $pattern Regex pattern. + * @param string|null $replacement Replacement string (supports $N backreferences). + * @param int|null $pos 1-based character position to start matching. + * @param int|null $occurrence Nth match to replace; 0 = all matches. + * @param string|null $match_type MySQL match_type flags. + * + * @throws Exception If the pattern is not a valid regular expression, or pos is out of range. + * @return string|null The replaced string, or NULL if any argument is NULL. + */ + public function regexp_replace( $expr, $pattern, $replacement, $pos = 1, $occurrence = 0, $match_type = '' ) { + if ( + null === $expr || null === $pattern || null === $replacement + || null === $pos || null === $occurrence || null === $match_type + ) { + return null; + } + + $compiled = $this->regexp_compile( $pattern, $match_type ); + $byte_start = $this->regexp_char_to_byte_offset( $expr, (int) $pos, true ); + $n = (int) $occurrence; + + // 0 means replace all; negative occurrences are clamped to 1 (MySQL behavior). + if ( $n < 0 ) { + $n = 1; + } + + $matches = $this->regexp_find_matches( $compiled, $expr, $byte_start, $n > 0 ? $n : -1 ); + if ( false === $matches ) { + $this->regexp_fail( $pattern ); + } + + // Rebuild: bytes before pos are untouched, then walk the collected + // matches, substituting only the targeted occurrence (or all when N=0). 
+ $out = substr( $expr, 0, $byte_start ); + $cur = $byte_start; + foreach ( $matches as $i => $m ) { + $match_start = $m[0][1]; + $match_length = strlen( $m[0][0] ); + + $out .= substr( $expr, $cur, $match_start - $cur ); + + $replace_this = 0 === $n || ( $i + 1 ) === $n; + if ( $replace_this ) { + $groups = array(); + foreach ( $m as $g ) { + $groups[] = $g[0]; + } + $out .= $this->regexp_expand_replacement( $replacement, $groups ); + } else { + $out .= $m[0][0]; + } + + $cur = $match_start + $match_length; + } + $out .= substr( $expr, $cur ); + + return $out; + } + /** * Method to emulate MySQL FIELD() function. * @@ -1024,6 +1093,162 @@ private function regexp_run( $op ) { } } + /** + * Convert a 1-based character position into a byte offset into the UTF-8 string. + * + * @param string $s UTF-8 string. + * @param int $char_pos 1-based character position. + * @param bool $allow_past_end Whether to accept char_pos == char_count + 1 + * (returns strlen($s)). MySQL allows this for + * REGEXP_REPLACE and REGEXP_SUBSTR but not for + * REGEXP_INSTR. + * + * @throws Exception If $char_pos is out of range. + * @return int Byte offset into $s. + */ + private function regexp_char_to_byte_offset( $s, $char_pos, $allow_past_end = false ) { + if ( $char_pos < 1 ) { + throw new Exception( 'Index out of bounds in regular expression search.' ); + } + if ( 1 === $char_pos ) { + return 0; + } + $byte_len = strlen( $s ); + $chars = 1; + for ( $i = 0; $i < $byte_len; $i++ ) { + // Count every byte that isn't a UTF-8 continuation byte. + if ( ( ord( $s[ $i ] ) & 0xC0 ) !== 0x80 ) { + if ( $chars === $char_pos ) { + return $i; + } + ++$chars; + } + } + if ( $allow_past_end && $chars === $char_pos ) { + return $byte_len; + } + throw new Exception( 'Index out of bounds in regular expression search.' ); + } + + /** + * Expand a MySQL/ICU-style replacement template. 
+ * + * Rules (from ICU, used by MySQL REGEXP_REPLACE): + * - "\X" for any X: emit X, drop the backslash (also applies to "\\" -> "\"). + * - Trailing lone backslash: dropped. + * - "$N" (N is one or more digits): emit the Nth capture group. Consumes + * the longest digit run that forms a valid group index; any trailing + * digits become literal text. + * - "$" not followed by a digit: error (matches MySQL ERROR 3887). + * - "$N" where N is larger than any existing group: error (ERROR 3686). + * - "${N}" is NOT supported and raises the same error as a bare "$". + * + * @param string $replacement The replacement template. + * @param array $groups Capture-group texts, with index 0 = full match. + * + * @throws Exception On an invalid "$..." reference. + * @return string The expanded replacement. + */ + private function regexp_expand_replacement( $replacement, $groups ) { + $max_group = count( $groups ) - 1; + $out = ''; + $len = strlen( $replacement ); + $i = 0; + while ( $i < $len ) { + $c = $replacement[ $i ]; + if ( '\\' === $c ) { + if ( $i + 1 < $len ) { + $out .= $replacement[ $i + 1 ]; + $i += 2; + } else { + ++$i; + } + continue; + } + if ( '$' === $c ) { + if ( $i + 1 >= $len || ! ctype_digit( $replacement[ $i + 1 ] ) ) { + throw new Exception( 'A capture group has an invalid name.' ); + } + $j = $i + 1; + while ( $j < $len && ctype_digit( $replacement[ $j ] ) ) { + ++$j; + } + // Longest digit prefix that refers to an existing group wins; + // remaining digits are emitted literally. + $digits = substr( $replacement, $i + 1, $j - $i - 1 ); + $idx = null; + $consumed = 0; + for ( $k = strlen( $digits ); $k > 0; --$k ) { + $cand = (int) substr( $digits, 0, $k ); + if ( $cand <= $max_group ) { + $idx = $cand; + $consumed = $k; + break; + } + } + if ( null === $idx ) { + throw new Exception( 'Index out of bounds in regular expression search.' 
); + } + $out .= $groups[ $idx ]; + $i += 1 + $consumed; + continue; + } + $out .= $c; + ++$i; + } + return $out; + } + + /** + * Walk the subject applying a compiled pattern starting at a byte offset. + * + * Returns a list of match arrays in PREG_OFFSET_CAPTURE format. Uses + * preg_match with its offset argument rather than slicing the subject so + * lookbehind assertions can see bytes preceding byte_start. + * + * @param string $compiled PCRE-wrapped pattern. + * @param string $subject Full subject string. + * @param int $byte_start Byte offset at which matching begins. + * @param int $limit Max matches to collect; -1 for unlimited. + * + * @return array|false List of match arrays, or false on preg error. + */ + private function regexp_find_matches( $compiled, $subject, $byte_start, $limit ) { + return $this->regexp_run( + function () use ( $compiled, $subject, $byte_start, $limit ) { + $results = array(); + $offset = $byte_start; + $len = strlen( $subject ); + while ( -1 === $limit || count( $results ) < $limit ) { + $r = preg_match( $compiled, $subject, $m, PREG_OFFSET_CAPTURE, $offset ); + if ( false === $r ) { + return false; + } + if ( 0 === $r ) { + break; + } + $results[] = $m; + $match_start = $m[0][1]; + $match_length = strlen( $m[0][0] ); + $next = $match_start + $match_length; + if ( 0 === $match_length ) { + // Advance past a zero-width match to avoid looping on the same offset. + // Skip any UTF-8 continuation bytes so the next match starts on a code point boundary. + ++$next; + while ( $next < $len && ( ord( $subject[ $next ] ) & 0xC0 ) === 0x80 ) { + ++$next; + } + } + if ( $next > $len ) { + break; + } + $offset = $next; + } + return $results; + } + ); + } + /** * Translate a preg_* failure into a caller-friendly exception message. 
* diff --git a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php index 7159eb91..bb80a073 100644 --- a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php +++ b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php @@ -217,6 +217,206 @@ public function testRegexpLegacyOperatorRegression() { $this->assertSame( '0', $this->engine->get_query_results()[0]->r ); } + /** + * @dataProvider regexpReplaceBasicCases + */ + public function testRegexpReplaceBasic( $expr, $pattern, $replacement, $expected ) { + $this->assertQuery( + sprintf( + "SELECT REGEXP_REPLACE('%s', '%s', '%s') AS r", + addslashes( $expr ), + addslashes( $pattern ), + addslashes( $replacement ) + ) + ); + $this->assertSame( $expected, $this->engine->get_query_results()[0]->r ); + } + + public static function regexpReplaceBasicCases() { + return array( + 'simple' => array( 'abcabc', 'b', 'X', 'aXcaXc' ), + 'no match' => array( 'abc', 'z', 'X', 'abc' ), + 'quantifier' => array( 'aabbcc', 'b+', 'B', 'aaBcc' ), + 'groups' => array( 'John Doe', '(\\w+) (\\w+)', '$2 $1', 'Doe John' ), + 'case-insensitive' => array( 'ABC', 'abc', 'x', 'x' ), + ); + } + + public function testRegexpReplaceNullPropagation() { + $this->assertQuery( "SELECT REGEXP_REPLACE(NULL, 'a', 'b') AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', NULL, 'b') AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'a', NULL) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + /** + * @dataProvider regexpReplaceFullCases + */ + public function testRegexpReplaceFull( $sql, $expected ) { + $this->assertQuery( "SELECT $sql AS r" ); + $this->assertSame( $expected, $this->engine->get_query_results()[0]->r ); + } + + public static function regexpReplaceFullCases() { + return array( + // pos: only 
replace from position 3 onward (1-based, character). + 'pos=3' => array( "REGEXP_REPLACE('abcabc', 'b', 'X', 3)", 'abcaXc' ), + // occurrence=1: replace only the first match after pos. + 'occurrence=1 from start' => array( "REGEXP_REPLACE('abcabc', 'b', 'X', 1, 1)", 'aXcabc' ), + // occurrence=2 from start. + 'occurrence=2 from start' => array( "REGEXP_REPLACE('abcabc', 'b', 'X', 1, 2)", 'abcaXc' ), + // occurrence=0 means all matches from pos. + 'occurrence=0 from pos 3' => array( "REGEXP_REPLACE('abcabc', 'b', 'X', 3, 0)", 'abcaXc' ), + // match_type c with default pos/occurrence. + 'match_type c' => array( "REGEXP_REPLACE('ABC', 'abc', 'x', 1, 0, 'c')", 'ABC' ), + // match_type i. + 'match_type i' => array( "REGEXP_REPLACE('ABC', 'abc', 'x', 1, 0, 'i')", 'x' ), + // Multi-byte pos: skip the first character, replace only in the rest. + 'multibyte pos' => array( "REGEXP_REPLACE('éabc', 'a', 'X', 2)", 'éXbc' ), + ); + } + + public function testRegexpReplacePosOutOfRange() { + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', 'a', 'X', 10)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpReplacePosAtEnd() { + // MySQL allows pos = char_count + 1 for REPLACE; returns subject unchanged. + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'a', 'X', 4) AS r" ); + $this->assertSame( 'abc', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplacePosBeyondEnd() { + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', 'a', 'X', 5)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpReplacePosZero() { + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', 'a', 'X', 0)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpReplaceOccurrenceBeyondMatches() { + // MySQL: if occurrence exceeds the number of matches, return subject unchanged. 
+ $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'a', 'X', 1, 5) AS r" ); + $this->assertSame( 'abc', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceNegativeOccurrenceClamped() { + // MySQL clamps negative occurrence to 1; 0 still means "replace all". + $this->assertQuery( "SELECT REGEXP_REPLACE('abcabc', 'b', 'X', 1, -100) AS r" ); + $this->assertSame( 'aXcabc', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceOccurrenceBackreferenceForms() { + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', '(a)(b)(c)', '\$2\$1', 1, 1) AS r" ); + $this->assertSame( 'ba', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceBraceBackrefIsInvalid() { + // MySQL/ICU rejects "${N}"; "$" must be followed by a digit. + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', '(a)(b)(c)', '\${2}\${1}', 1, 1)", + 'A capture group has an invalid name.' + ); + } + + public function testRegexpReplaceBackslashDigitIsLiteral() { + // MySQL strips backslash before any character; "\2\1" becomes "21". + $this->assertQuery( "SELECT REGEXP_REPLACE('xyzabc', '(a)(b)(c)', '\\\\2\\\\1', 1, 1) AS r" ); + $this->assertSame( 'xyz21', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceFullMatchBackref() { + // "$0" is the whole match. + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'b', '[\$0]') AS r" ); + $this->assertSame( 'a[b]c', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceBackslashZeroIsLiteral() { + // "\0" is literal "0", not the full match. + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'b', '[\\\\0]') AS r" ); + $this->assertSame( 'a[0]c', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceDollarDigitGreedyFallback() { + // "$10" with a single capture group is "$1" followed by literal "0". 
+ $this->assertQuery( "SELECT REGEXP_REPLACE('abc', '(b)', '[\$10]') AS r" ); + $this->assertSame( 'a[b0]c', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceDollarWithoutDigitErrors() { + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', 'b', 'x\$y')", + 'A capture group has an invalid name.' + ); + } + + public function testRegexpReplaceDollarOutOfBoundsErrors() { + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', '(b)', '[\$9]')", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpReplaceTrailingBackslashDropped() { + // Trailing lone backslash is dropped (matches MySQL). + $this->assertQuery( "SELECT REGEXP_REPLACE('a', 'a', 'x\\\\') AS r" ); + $this->assertSame( 'x', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceBackslashLetterIsLiteral() { + // "\q" -> "q" (backslash stripped before any char, including letters). + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'b', '[\\\\q]') AS r" ); + $this->assertSame( 'a[q]c', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceOccurrenceLookbehind() { + // Lookbehind depends on pattern context; previously a context-less + // preg_replace on the matched text silently dropped the replacement. + $this->assertQuery( "SELECT REGEXP_REPLACE('abcabc', '(?<=a)b', 'X', 1, 1) AS r" ); + $this->assertSame( 'aXcabc', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_REPLACE('abcabc', '(?<=a)b', 'X', 1, 2) AS r" ); + $this->assertSame( 'abcaXc', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceLookbehindAcrossPos() { + // The lookbehind sees bytes before pos because the full subject is kept. 
+ $this->assertQuery( "SELECT REGEXP_REPLACE('ab', '(?<=a)b', 'X', 2) AS r" ); + $this->assertSame( 'aX', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceOccurrenceZeroWidth() { + // Zero-width match at a word boundary. + $this->assertQuery( "SELECT REGEXP_REPLACE('abc def', '\\\\b', '|', 1, 1) AS r" ); + $this->assertSame( '|abc def', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_REPLACE('abc def', '\\\\b', '|', 1, 2) AS r" ); + $this->assertSame( 'abc| def', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceOccurrenceLiteralEscapes() { + // \\ -> literal backslash in the replacement. SQL string literal tricks: + // PHP source "\\\\\\\\" is 4 backslashes, which SQL parses as 2 backslashes + // received by the function; the replacement grammar expander folds those + // to a single literal backslash. + $this->assertQuery( "SELECT REGEXP_REPLACE('a', 'a', '\\\\\\\\', 1, 1) AS r" ); + $this->assertSame( '\\', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpReplaceEmptyReplacement() { + $this->assertQuery( "SELECT REGEXP_REPLACE('abc', 'b', '') AS r" ); + $this->assertSame( 'ac', $this->engine->get_query_results()[0]->r ); + } + public function testInsertDateNow() { $this->assertQuery( "INSERT INTO _dates (option_name, option_value) VALUES ('first', now());" From 2f619da5cf3cf02c7e2e291f7508dda98e92a530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 21 Apr 2026 17:05:05 +0200 Subject: [PATCH 3/5] Add REGEXP_SUBSTR() UDF Returns the Nth matched substring at or after a given character position, or NULL if no match. Reuses regexp_compile(), regexp_char_to_byte_offset() (with $allow_past_end so pos = char_count + 1 yields NULL), regexp_find_matches(), and regexp_fail() introduced with REGEXP_REPLACE. Negative or zero `occurrence` is clamped to 1, matching MySQL. 
Tests cover the data-driven happy path, NULL propagation, the occurrence clamp, the pos = char_count + 1 / pos > char_count + 1 boundary, multi-byte matches, invalid patterns, invalid flags, and a lookbehind whose context spans pos. --- ...s-wp-sqlite-pdo-user-defined-functions.php | 40 ++++++++ .../tests/WP_SQLite_Driver_Tests.php | 93 +++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php index 96e5e531..a854d1bf 100644 --- a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php +++ b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php @@ -73,6 +73,7 @@ public static function register_for( $pdo ): self { 'regexp' => 'regexp', 'regexp_like' => 'regexp_like', 'regexp_replace' => 'regexp_replace', + 'regexp_substr' => 'regexp_substr', 'field' => 'field', 'log' => 'log', 'least' => 'least', @@ -632,6 +633,45 @@ public function regexp_replace( $expr, $pattern, $replacement, $pos = 1, $occurr return $out; } + /** + * Method to emulate MySQL REGEXP_SUBSTR() function. + * + * Values of `occurrence` less than 1 are clamped to 1, matching MySQL. + * `pos = char_count + 1` is accepted and yields no match (NULL). + * + * @param string|null $expr Subject string. + * @param string|null $pattern Regex pattern. + * @param int|null $pos 1-based character position to start matching. + * @param int|null $occurrence Which match to return (1-based; <= 0 clamps to 1). + * @param string|null $match_type MySQL match_type flags. + * + * @throws Exception If the pattern is not a valid regular expression, or pos is out of range. + * @return string|null The matched substring, NULL if no match or any argument is NULL. 
+ */ + public function regexp_substr( $expr, $pattern, $pos = 1, $occurrence = 1, $match_type = '' ) { + if ( + null === $expr || null === $pattern + || null === $pos || null === $occurrence || null === $match_type + ) { + return null; + } + + // MySQL clamps occurrence <= 0 to 1. + $n = max( 1, (int) $occurrence ); + + $compiled = $this->regexp_compile( $pattern, $match_type ); + $byte_start = $this->regexp_char_to_byte_offset( $expr, (int) $pos, true ); + + $matches = $this->regexp_find_matches( $compiled, $expr, $byte_start, $n ); + if ( false === $matches ) { + $this->regexp_fail( $pattern ); + } + if ( count( $matches ) < $n ) { + return null; + } + return $matches[ $n - 1 ][0][0]; + } + /** * Method to emulate MySQL FIELD() function. * diff --git a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php index bb80a073..5a56c083 100644 --- a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php +++ b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php @@ -417,6 +417,99 @@ public function testRegexpReplaceEmptyReplacement() { $this->assertSame( 'ac', $this->engine->get_query_results()[0]->r ); } + /** + * @dataProvider regexpSubstrCases + */ + public function testRegexpSubstr( $sql, $expected ) { + $this->assertQuery( "SELECT $sql AS r" ); + $this->assertSame( $expected, $this->engine->get_query_results()[0]->r ); + } + + public static function regexpSubstrCases() { + return array( + 'basic match' => array( "REGEXP_SUBSTR('abc123def', '[0-9]+')", '123' ), + 'no match' => array( "REGEXP_SUBSTR('abcdef', '[0-9]+')", null ), + 'pos' => array( "REGEXP_SUBSTR('abc123def456', '[0-9]+', 5)", '23' ), + 'pos with occurrence=2' => array( "REGEXP_SUBSTR('abc123def456', '[0-9]+', 5, 2)", '456' ), + 'occurrence' => array( "REGEXP_SUBSTR('a1 b2 c3', '[a-z][0-9]', 1, 2)", 'b2' ), + 'occurrence too high' => array( "REGEXP_SUBSTR('a1 b2', '[a-z][0-9]', 1, 5)", null ), + 'match_type c' => array( 
"REGEXP_SUBSTR('ABC', 'abc', 1, 1, 'c')", null ), + 'multibyte match' => array( "REGEXP_SUBSTR('café', 'é')", 'é' ), + 'null expr' => array( 'REGEXP_SUBSTR(NULL, \'abc\')', null ), + 'null pattern' => array( "REGEXP_SUBSTR('abc', NULL)", null ), + ); + } + + public function testRegexpSubstrNullPos() { + $this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', NULL) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpSubstrNullOccurrence() { + $this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', 1, NULL) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpSubstrNullMatchType() { + $this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', 1, 1, NULL) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpSubstrOccurrenceClampedToOne() { + // MySQL clamps occurrence <= 0 to 1. + $this->assertQuery( "SELECT REGEXP_SUBSTR('abcabc', 'b', 1, 0) AS r" ); + $this->assertSame( 'b', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_SUBSTR('abcabc', 'b', 1, -5) AS r" ); + $this->assertSame( 'b', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpSubstrPosOutOfRange() { + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', 'a', 10)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpSubstrPosAtEnd() { + // MySQL allows pos = char_count + 1 for SUBSTR; returns NULL. + $this->assertQuery( "SELECT REGEXP_SUBSTR('abc', 'a', 4) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpSubstrPosBeyondEnd() { + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', 'a', 5)", + 'Index out of bounds in regular expression search.' 
+ ); + } + + public function testRegexpSubstrPosZero() { + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', 'a', 0)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpSubstrInvalidPattern() { + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', '(abc')", + 'Invalid regular expression: (abc.' + ); + } + + public function testRegexpSubstrInvalidFlag() { + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', 'a', 1, 1, 'x')", + 'Invalid match_type flag: x.' + ); + } + + public function testRegexpSubstrLookbehindAcrossPos() { + // The lookbehind sees bytes before pos because the full subject is kept. + $this->assertQuery( "SELECT REGEXP_SUBSTR('ab', '(?<=a)b', 2) AS r" ); + $this->assertSame( 'b', $this->engine->get_query_results()[0]->r ); + } + public function testInsertDateNow() { $this->assertQuery( "INSERT INTO _dates (option_name, option_value) VALUES ('first', now());" From beb1aa5cd5ad24d69c547dbc3fc832bda8cf1b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 21 Apr 2026 17:06:47 +0200 Subject: [PATCH 4/5] Add REGEXP_INSTR() UDF Returns the 1-based character position of the Nth match (or 0 if none), with return_option controlling whether to report the match start or the position one past its end. Adds the small regexp_byte_offset_to_char_index() helper that converts a byte offset returned by PCRE into a UTF-8 character index. `pos` greater than char_count is rejected even when SUBSTR / REPLACE allow it, matching MySQL's stricter validation for INSTR. Negative or zero `occurrence` is clamped to 1, also matching MySQL. return_option must be 0 (start) or 1 (one past end); anything else raises "Incorrect arguments to regexp_instr: return_option must be 1 or 0." (matching MySQL's wording). The check runs before the occurrence clamp so the message is consistent. 
Tests cover the data-driven happy path, NULL propagation, the occurrence clamp, the straddling-match boundary, multi-byte return_option=1, the return_option validation (including under otherwise no-op occurrences), and a lookbehind whose context spans pos. --- ...s-wp-sqlite-pdo-user-defined-functions.php | 77 +++++++++++++ .../tests/WP_SQLite_Driver_Tests.php | 102 ++++++++++++++++++ 2 files changed, 179 insertions(+) diff --git a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php index a854d1bf..0cf506e9 100644 --- a/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php +++ b/packages/mysql-on-sqlite/src/sqlite/class-wp-sqlite-pdo-user-defined-functions.php @@ -74,6 +74,7 @@ public static function register_for( $pdo ): self { 'regexp_like' => 'regexp_like', 'regexp_replace' => 'regexp_replace', 'regexp_substr' => 'regexp_substr', + 'regexp_instr' => 'regexp_instr', 'field' => 'field', 'log' => 'log', 'least' => 'least', @@ -672,6 +673,59 @@ public function regexp_substr( $expr, $pattern, $pos = 1, $occurrence = 1, $matc return $matches[ $n - 1 ][0][0]; } + /** + * Method to emulate MySQL REGEXP_INSTR() function. + * + * Values of `occurrence` less than 1 are clamped to 1, matching MySQL. + * `pos` greater than char_count is rejected (unlike SUBSTR and REPLACE). + * + * @param string|null $expr Subject string. + * @param string|null $pattern Regex pattern. + * @param int|null $pos 1-based character position to start matching. + * @param int|null $occurrence Which match to locate (1-based; <= 0 clamps to 1). + * @param int|null $return_option 0 = start of match (default), 1 = one past end. + * @param string|null $match_type MySQL match_type flags. + * + * @throws Exception If the pattern is invalid, pos is out of range, or + * return_option is not 0 or 1. 
+ * @return int|null 1-based character position, 0 if no match, NULL on NULL input. + */ + public function regexp_instr( $expr, $pattern, $pos = 1, $occurrence = 1, $return_option = 0, $match_type = '' ) { + if ( + null === $expr || null === $pattern + || null === $pos || null === $occurrence + || null === $return_option || null === $match_type + ) { + return null; + } + + $ret = (int) $return_option; + if ( 0 !== $ret && 1 !== $ret ) { + throw new Exception( 'Incorrect arguments to regexp_instr: return_option must be 1 or 0.' ); + } + // MySQL clamps occurrence <= 0 to 1. + $n = max( 1, (int) $occurrence ); + + $compiled = $this->regexp_compile( $pattern, $match_type ); + $byte_start = $this->regexp_char_to_byte_offset( $expr, (int) $pos ); + + $matches = $this->regexp_find_matches( $compiled, $expr, $byte_start, $n ); + if ( false === $matches ) { + $this->regexp_fail( $pattern ); + } + if ( count( $matches ) < $n ) { + return 0; + } + + list( $matched_text, $matched_byte_offset ) = $matches[ $n - 1 ][0]; + $target_byte = $matched_byte_offset; + if ( 1 === $ret ) { + $target_byte += strlen( $matched_text ); + } + + return $this->regexp_byte_offset_to_char_index( $expr, $target_byte ) + 1; + } + /** * Method to emulate MySQL FIELD() function. * @@ -1170,6 +1224,29 @@ private function regexp_char_to_byte_offset( $s, $char_pos, $allow_past_end = fa throw new Exception( 'Index out of bounds in regular expression search.' ); } + /** + * Convert a byte offset within a UTF-8 string into the 0-based character index. + * + * The byte offset is expected to fall on a UTF-8 code point boundary, as is + * the case for offsets returned by PCRE. Offsets greater than the string + * length are clamped to the string length as a defensive measure. + * + * @param string $s UTF-8 string. + * @param int $byte_offset Byte offset on a code point boundary. + * + * @return int 0-based character index. 
+ */ + private function regexp_byte_offset_to_char_index( $s, $byte_offset ) { + $byte_offset = min( $byte_offset, strlen( $s ) ); + $chars = 0; + for ( $i = 0; $i < $byte_offset; $i++ ) { + if ( ( ord( $s[ $i ] ) & 0xC0 ) !== 0x80 ) { + ++$chars; + } + } + return $chars; + } + /** * Expand a MySQL/ICU-style replacement template. * diff --git a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php index 5a56c083..494bde65 100644 --- a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php +++ b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php @@ -510,6 +510,108 @@ public function testRegexpSubstrLookbehindAcrossPos() { $this->assertSame( 'b', $this->engine->get_query_results()[0]->r ); } + /** + * @dataProvider regexpInstrCases + */ + public function testRegexpInstr( $sql, $expected ) { + $this->assertQuery( "SELECT $sql AS r" ); + $this->assertSame( $expected, $this->engine->get_query_results()[0]->r ); + } + + public static function regexpInstrCases() { + return array( + 'basic' => array( "REGEXP_INSTR('dog cat dog', 'dog')", '1' ), + 'no match' => array( "REGEXP_INSTR('abc', 'xyz')", '0' ), + 'second match' => array( "REGEXP_INSTR('dog cat dog', 'dog', 1, 2)", '9' ), + 'pos skips first match' => array( "REGEXP_INSTR('dog cat dog', 'dog', 5)", '9' ), + 'return_option=1 (end)' => array( "REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 1)", '4' ), + 'match_type c miss' => array( "REGEXP_INSTR('DOG', 'dog', 1, 1, 0, 'c')", '0' ), + 'multibyte position' => array( "REGEXP_INSTR('café123', '[0-9]+')", '5' ), + ); + } + + public function testRegexpInstrNullExpr() { + $this->assertQuery( "SELECT REGEXP_INSTR(NULL, 'abc') AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpInstrNullPattern() { + $this->assertQuery( "SELECT REGEXP_INSTR('abc', NULL) AS r" ); + $this->assertNull( $this->engine->get_query_results()[0]->r ); + } + + public function 
testRegexpInstrPosZero() { + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', 'a', 0)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpInstrPosOutOfRange() { + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', 'a', 10)", + 'Index out of bounds in regular expression search.' + ); + } + + public function testRegexpInstrInvalidReturnOption() { + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', 'a', 1, 1, 2)", + 'Incorrect arguments to regexp_instr: return_option must be 1 or 0.' + ); + } + + public function testRegexpInstrInvalidReturnOptionWithOccurrenceZero() { + // return_option must be validated before occurrence is clamped, so an + // invalid return_option consistently errors regardless of occurrence. + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', 'a', 1, 0, 99)", + 'Incorrect arguments to regexp_instr: return_option must be 1 or 0.' + ); + } + + public function testRegexpInstrInvalidPattern() { + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', '(abc')", + 'Invalid regular expression: (abc.' + ); + } + + public function testRegexpInstrInvalidFlag() { + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', 'a', 1, 1, 0, 'x')", + 'Invalid match_type flag: x.' + ); + } + + public function testRegexpInstrOccurrenceClampedToOne() { + // MySQL clamps occurrence <= 0 to 1. + $this->assertQuery( "SELECT REGEXP_INSTR('abcabc', 'b', 1, 0) AS r" ); + $this->assertSame( '2', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_INSTR('abcabc', 'b', 1, -5) AS r" ); + $this->assertSame( '2', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpInstrStraddlingMatch() { + // A match that starts before pos is not returned; the next match at or + // after pos is returned instead. 
+ $this->assertQuery( "SELECT REGEXP_INSTR('abc123def', '[0-9]+', 5) AS r" ); + $this->assertSame( '5', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpInstrMultibyteReturnOptionEnd() { + // Multibyte match ('é' is 2 bytes) with return_option=1 (one past end). + // 'aéb' char positions: a=1, é=2, b=3. 'é' matches at char 2, end char position = 3. + $this->assertQuery( "SELECT REGEXP_INSTR('aéb', 'é', 1, 1, 1) AS r" ); + $this->assertSame( '3', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpInstrLookbehindAcrossPos() { + // The lookbehind sees bytes before pos because the full subject is kept. + $this->assertQuery( "SELECT REGEXP_INSTR('ab', '(?<=a)b', 2) AS r" ); + $this->assertSame( '2', $this->engine->get_query_results()[0]->r ); + } + public function testInsertDateNow() { $this->assertQuery( "INSERT INTO _dates (option_name, option_value) VALUES ('first', now());" From 14ad7f8515b862a061559dc2c84d523e384d751e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jake=C5=A1?= Date: Tue, 21 Apr 2026 17:07:52 +0200 Subject: [PATCH 5/5] Pin down REGEXP_* edge cases shared across functions Adds a final layer of tests that exercise behaviors which involve more than one of REGEXP_LIKE / _REPLACE / _SUBSTR / _INSTR at once and only become testable once all four functions are available: - Empty pattern raises "Illegal argument to a regular expression." uniformly (MySQL ERROR 3685). - Empty subject with a zero-width-matching pattern still produces a match (LIKE = 1, SUBSTR = "", INSTR = 1). - Zero-width anchors ^ / $ report sensible 1-based positions and an empty-string match for SUBSTR rather than NULL. - Astral-plane (4-byte UTF-8) characters are counted as one code point by both SUBSTR and INSTR. - Negative pos is rejected consistently across REPLACE / SUBSTR / INSTR. 
--- .../tests/WP_SQLite_Driver_Tests.php | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php index 494bde65..a3012a0d 100644 --- a/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php +++ b/packages/mysql-on-sqlite/tests/WP_SQLite_Driver_Tests.php @@ -612,6 +612,73 @@ public function testRegexpInstrLookbehindAcrossPos() { $this->assertSame( '2', $this->engine->get_query_results()[0]->r ); } + public function testRegexpEmptyPatternRejected() { + // MySQL rejects the empty pattern with ERROR 3685. + $this->assertQueryError( + "SELECT REGEXP_LIKE('abc', '')", + 'Illegal argument to a regular expression.' + ); + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', '', 'x')", + 'Illegal argument to a regular expression.' + ); + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', '')", + 'Illegal argument to a regular expression.' + ); + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', '')", + 'Illegal argument to a regular expression.' + ); + } + + public function testRegexpEmptySubject() { + // A pattern that matches empty string still matches against an empty subject. + $this->assertQuery( "SELECT REGEXP_LIKE('', 'a*') AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_SUBSTR('', 'a*') AS r" ); + $this->assertSame( '', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_INSTR('', 'a*') AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpZeroWidthAnchors() { + // ^ matches at position 1 (length 0). + $this->assertQuery( "SELECT REGEXP_INSTR('abc', '^') AS r" ); + $this->assertSame( '1', $this->engine->get_query_results()[0]->r ); + // $ matches one past the last character. 
+ $this->assertQuery( "SELECT REGEXP_INSTR('abc', '\$') AS r" ); + $this->assertSame( '4', $this->engine->get_query_results()[0]->r ); + // SUBSTR of a zero-width anchor is the empty string, not NULL. + $this->assertQuery( "SELECT REGEXP_SUBSTR('abc', '^') AS r" ); + $this->assertSame( '', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpAstralPlaneCharacter() { + // 4-byte UTF-8 encodes as one code point; char offsets should reflect that. + // "x😀y" has three characters (x at 1, 😀 at 2, y at 3). + $this->assertQuery( "SELECT REGEXP_SUBSTR('x😀y', '.', 2) AS r" ); + $this->assertSame( '😀', $this->engine->get_query_results()[0]->r ); + $this->assertQuery( "SELECT REGEXP_INSTR('😀z', 'z', 1, 1, 1) AS r" ); + $this->assertSame( '3', $this->engine->get_query_results()[0]->r ); + } + + public function testRegexpNegativePosErrors() { + // REGEXP_LIKE has no pos argument. The other three reject negative pos. + $this->assertQueryError( + "SELECT REGEXP_REPLACE('abc', 'a', 'X', -1)", + 'Index out of bounds in regular expression search.' + ); + $this->assertQueryError( + "SELECT REGEXP_SUBSTR('abc', 'a', -1)", + 'Index out of bounds in regular expression search.' + ); + $this->assertQueryError( + "SELECT REGEXP_INSTR('abc', 'a', -1)", + 'Index out of bounds in regular expression search.' + ); + } + public function testInsertDateNow() { $this->assertQuery( "INSERT INTO _dates (option_name, option_value) VALUES ('first', now());"