diff options
author | Tim Starling <tstarling@wikimedia.org> | 2021-12-10 10:35:07 +1100 |
---|---|---|
committer | Tim Starling <tstarling@wikimedia.org> | 2021-12-13 22:13:10 +0000 |
commit | e85d532aa244cce4c9a68b79985464d25e94672d (patch) | |
tree | 320c067fd6dd05274b5dc50e557212bb0ecd82f2 | |
parent | 132cba07f2da7d4ffba24879cc1a61bf2718c5b0 (diff) |
RemoteIcuCollation
Add a collation that gets its data from a remote Shellbox instance. This
is meant as a migration helper to use during an ICU upgrade.
Add a batch method to Collation so that this can be somewhat efficient
when adding multiple categories.
Bug: T263437
Change-Id: I76610d251fb55df90c78acb9f59fd81421f876dd
-rw-r--r-- | autoload.php | 1 |
-rw-r--r-- | includes/collation/Collation.php | 14 |
-rw-r--r-- | includes/collation/CollationFactory.php | 10 |
-rw-r--r-- | includes/collation/RemoteIcuCollation.php | 110 |
-rw-r--r-- | tests/phpunit/includes/collation/RemoteIcuCollationTest.php | 128 |
5 files changed, 263 insertions, 0 deletions
diff --git a/autoload.php b/autoload.php index 21ab1d62dc9c..68e2e8f9d4b8 100644 --- a/autoload.php +++ b/autoload.php @@ -1326,6 +1326,7 @@ $wgAutoloadLocalClasses = [ 'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php', 'RefreshSecondaryDataUpdate' => __DIR__ . '/includes/deferred/RefreshSecondaryDataUpdate.php', 'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php', + 'RemoteIcuCollation' => __DIR__ . '/includes/collation/RemoteIcuCollation.php', 'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php', 'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php', 'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php', diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php index 1a31751c1154..939eb243f005 100644 --- a/includes/collation/Collation.php +++ b/includes/collation/Collation.php @@ -72,6 +72,20 @@ abstract class Collation { abstract public function getSortKey( $string ); /** + * Get multiple sort keys + * + * @param string[] $strings + * @return string[] + */ + public function getSortKeys( $strings ) { + $ret = []; + foreach ( $strings as $key => $s ) { + $ret[$key] = $this->getSortKey( $s ); + } + return $ret; + } + + /** * Given a string, return the logical "first letter" to be used for * grouping on category pages and so on. 
This has to be coordinated * carefully with convertToSortkey(), or else the sorted list might jump diff --git a/includes/collation/CollationFactory.php b/includes/collation/CollationFactory.php index e1d8f62462de..060f810d674f 100644 --- a/includes/collation/CollationFactory.php +++ b/includes/collation/CollationFactory.php @@ -151,6 +151,16 @@ class CollationFactory { $match[1], ] ] ); + } elseif ( preg_match( '/^remote-uca-([A-Za-z@=-]+)$/', $collationName, $match ) ) { + return $this->instantiateCollation( [ + 'class' => \RemoteIcuCollation::class, + 'services' => [ + 'ShellboxClientFactory' + ], + 'args' => [ + $match[1] + ] + ] ); } // Provide a mechanism for extensions to hook in. diff --git a/includes/collation/RemoteIcuCollation.php b/includes/collation/RemoteIcuCollation.php new file mode 100644 index 000000000000..6650d0c7e46b --- /dev/null +++ b/includes/collation/RemoteIcuCollation.php @@ -0,0 +1,110 @@ +<?php + +use MediaWiki\Shell\ShellboxClientFactory; + +/** + * An ICU collation that uses a remote server to compute sort keys. This can be + * used in conjunction with $wgTempCategoryCollations to migrate to a different + * version of ICU. + */ +class RemoteIcuCollation extends Collation { + private $rpcClient; + private $locale; + + /** + * @param ShellboxClientFactory $shellboxClientFactory + * @param string $locale + */ + public function __construct( ShellboxClientFactory $shellboxClientFactory, $locale ) { + $this->rpcClient = $shellboxClientFactory->getRpcClient( + [ 'service' => 'icu-collation' ] ); + $this->locale = $locale; + } + + public function getSortKey( $string ) { + return $this->getSortKeys( [ $string ] )[0]; + } + + /** + * Encode an array of binary strings as a string + * + * @param string[] $strings + * @return string + */ + private static function encode( $strings ) { + $ret = ''; + foreach ( $strings as $s ) { + $ret .= sprintf( "%08x", strlen( $s ) ) . 
$s; + } + return $ret; + } + + /** + * Decode the value returned by encode() + * + * @param string $blob + * @return string[] + */ + private static function decode( $blob ) { + $p = 0; + $ret = []; + while ( $p < strlen( $blob ) ) { + $len = intval( substr( $blob, $p, 8 ), 16 ); + $p += 8; + $ret[] = substr( $blob, $p, $len ); + $p += $len; + } + return $ret; + } + + public function getSortKeys( $strings ) { + if ( !count( $strings ) ) { + return []; + } + $blob = $this->rpcClient->call( + 'icu-collation', + self::class . '::' . 'doGetSortKeys', + [ + $this->locale, + self::encode( array_values( $strings ) ) + ], + [ + 'classes' => [ parent::class, self::class ], + 'binary' => true + ] + ); + return array_combine( + array_keys( $strings ), + self::decode( $blob ) + ); + } + + public function getFirstLetter( $string ) { + // @phan-suppress-previous-line PhanPluginNeverReturnMethod + throw new RuntimeException( __METHOD__ . ': not implemented' ); + } + + /** + * The remote entry point. Get sort keys for an encoded list of inputs. + * + * @param string $locale The ICU locale + * @param string $blob The input array encoded with encode() + * @return string The encoded result + */ + public static function doGetSortKeys( $locale, $blob ) { + $mainCollator = Collator::create( $locale ); + if ( !$mainCollator ) { + throw new RuntimeException( "Invalid ICU locale specified for collation: $locale" ); + } + + // If the special suffix for numeric collation is present, turn on numeric collation. 
+ if ( substr( $locale, -5, 5 ) === '-u-kn' ) { + $mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON ); + } + $ret = []; + foreach ( self::decode( $blob ) as $string ) { + $ret[] = $mainCollator->getSortKey( $string ); + } + return self::encode( $ret ); + } +} diff --git a/tests/phpunit/includes/collation/RemoteIcuCollationTest.php b/tests/phpunit/includes/collation/RemoteIcuCollationTest.php new file mode 100644 index 000000000000..49c79d1ffb08 --- /dev/null +++ b/tests/phpunit/includes/collation/RemoteIcuCollationTest.php @@ -0,0 +1,128 @@ +<?php + +use Wikimedia\TestingAccessWrapper; + +/** + * @covers RemoteIcuCollation + */ +class RemoteIcuCollationTest extends MediaWikiLangTestCase { + public static function provideEncode() { + return [ + [ + [], + '' + ], + [ + [ 'foo' ], + '00000003foo' + ], + [ + [ 'foo', 'a somewhat longer string' ], + '00000003foo00000018a somewhat longer string' + ], + ]; + } + + /** @dataProvider provideEncode */ + public function testEncode( $input, $expected ) { + $coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class ); + $this->assertSame( $expected, $coll->encode( $input ) ); + } + + public static function provideEncodeDecode() { + return [ + [ [ "\000" ] ], + [ [ "a\000b" ] ], + [ [ str_repeat( "\001", 100 ) ] ], + [ [ 'foo' ] ], + [ [ 'foo', 'bar' ] ], + [ [ 'foo', 'bar', str_repeat( 'x', 1000 ) ] ] + ]; + } + + /** @dataProvider provideEncodeDecode */ + public function testEncodeDecode( $input ) { + $coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class ); + $this->assertSame( $input, $coll->decode( $coll->encode( $input ) ) ); + } + + public static function provideGetSortKeys() { + $cases = [ + [], + [ '' ], + [ 'test1' => 'bar', 'test2' => 'foo' ], + [ + 'bar', + 'foo' + ], + [ + 'first', + 'Second' + ], + [ + '', + 'second' + ], + [ + 'Berić', + 'Berisha', + ], + [ + '2', + '10', + ] + ]; + foreach ( $cases as $case ) { + yield [ $case ]; + } + } + + /** @dataProvider 
provideGetSortKeys */ + public function testGetSortKeys( $inputs ) { + if ( !extension_loaded( 'intl' ) ) { + $this->markTestSkipped( 'Need PHP intl' ); + } + $coll = new RemoteIcuCollation( + $this->getServiceContainer()->getShellboxClientFactory(), + 'uca-default-u-kn' + ); + $sortKeys = $coll->getSortKeys( $inputs ); + $prevKey = null; + if ( count( $inputs ) ) { + foreach ( $inputs as $i => $input ) { + $key = $sortKeys[$i]; + $this->assertIsString( $key ); + if ( $prevKey ) { + $this->assertLessThan( 0, strcmp( $prevKey, $key ) ); + } + $prevKey = $key; + } + } else { + $this->assertSame( [], $sortKeys ); + } + } + + /** @dataProvider provideGetSortKeys */ + public function testGetSortKey( $inputs ) { + if ( !count( $inputs ) ) { + // Not risky, it's just handy to reuse the provider + $this->assertTrue( true ); + } + if ( !extension_loaded( 'intl' ) ) { + $this->markTestSkipped( 'Need PHP intl' ); + } + $coll = new RemoteIcuCollation( + $this->getServiceContainer()->getShellboxClientFactory(), + 'uca-default-u-kn' + ); + $prevKey = null; + foreach ( $inputs as $input ) { + $key = $coll->getSortKey( $input ); + $this->assertIsString( $key ); + if ( $prevKey ) { + $this->assertLessThan( 0, strcmp( $prevKey, $key ) ); + } + $prevKey = $key; + } + } +} |