summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Tim Starling <tstarling@wikimedia.org> 2021-12-10 10:35:07 +1100
committer: Tim Starling <tstarling@wikimedia.org> 2021-12-13 22:13:10 +0000
commit: e85d532aa244cce4c9a68b79985464d25e94672d (patch)
tree: 320c067fd6dd05274b5dc50e557212bb0ecd82f2
parent: 132cba07f2da7d4ffba24879cc1a61bf2718c5b0 (diff)
RemoteIcuCollation
Add a collation that gets its data from a remote Shellbox instance. This is meant as a migration helper to use during an ICU upgrade.

Add a batch method to Collation so that this can be somewhat efficient when adding multiple categories.

Bug: T263437
Change-Id: I76610d251fb55df90c78acb9f59fd81421f876dd
-rw-r--r-- autoload.php 1
-rw-r--r-- includes/collation/Collation.php 14
-rw-r--r-- includes/collation/CollationFactory.php 10
-rw-r--r-- includes/collation/RemoteIcuCollation.php 110
-rw-r--r-- tests/phpunit/includes/collation/RemoteIcuCollationTest.php 128
5 files changed, 263 insertions, 0 deletions
diff --git a/autoload.php b/autoload.php
index 21ab1d62dc9c..68e2e8f9d4b8 100644
--- a/autoload.php
+++ b/autoload.php
@@ -1326,6 +1326,7 @@ $wgAutoloadLocalClasses = [
'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
'RefreshSecondaryDataUpdate' => __DIR__ . '/includes/deferred/RefreshSecondaryDataUpdate.php',
'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
+ 'RemoteIcuCollation' => __DIR__ . '/includes/collation/RemoteIcuCollation.php',
'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php
index 1a31751c1154..939eb243f005 100644
--- a/includes/collation/Collation.php
+++ b/includes/collation/Collation.php
@@ -72,6 +72,20 @@ abstract class Collation {
abstract public function getSortKey( $string );
/**
+ * Get multiple sort keys
+ *
+ * @param string[] $strings
+ * @return string[]
+ */
+	public function getSortKeys( $strings ) {
+		// Default batch implementation: derive each sort key with the
+		// single-string getSortKey(). array_map() with exactly one input
+		// array keeps that array's keys, so every result stays associated
+		// with the key of the string it was computed from. Subclasses may
+		// override this when they can compute several keys at once.
+		return array_map(
+			function ( $string ) {
+				return $this->getSortKey( $string );
+			},
+			$strings
+		);
+	}
+
+ /**
* Given a string, return the logical "first letter" to be used for
* grouping on category pages and so on. This has to be coordinated
* carefully with convertToSortkey(), or else the sorted list might jump
diff --git a/includes/collation/CollationFactory.php b/includes/collation/CollationFactory.php
index e1d8f62462de..060f810d674f 100644
--- a/includes/collation/CollationFactory.php
+++ b/includes/collation/CollationFactory.php
@@ -151,6 +151,16 @@ class CollationFactory {
$match[1],
]
] );
+ } elseif ( preg_match( '/^remote-uca-([A-Za-z@=-]+)$/', $collationName, $match ) ) {
+ return $this->instantiateCollation( [
+ 'class' => \RemoteIcuCollation::class,
+ 'services' => [
+ 'ShellboxClientFactory'
+ ],
+ 'args' => [
+ $match[1]
+ ]
+ ] );
}
// Provide a mechanism for extensions to hook in.
diff --git a/includes/collation/RemoteIcuCollation.php b/includes/collation/RemoteIcuCollation.php
new file mode 100644
index 000000000000..6650d0c7e46b
--- /dev/null
+++ b/includes/collation/RemoteIcuCollation.php
@@ -0,0 +1,110 @@
+<?php
+
+use MediaWiki\Shell\ShellboxClientFactory;
+
+/**
+ * An ICU collation that uses a remote server to compute sort keys. This can be
+ * used in conjunction with $wgTempCategoryCollations to migrate to a different
+ * version of ICU.
+ *
+ * Lists of strings are exchanged with the remote side as a single binary
+ * string: every element is written as its byte length in eight hexadecimal
+ * digits, immediately followed by the raw bytes of the element.
+ */
+class RemoteIcuCollation extends Collation {
+	/** @var mixed RPC client returned by ShellboxClientFactory::getRpcClient() */
+	private $rpcClient;
+
+	/** @var string ICU locale handed to Collator::create() on the remote side */
+	private $locale;
+
+	/**
+	 * @param ShellboxClientFactory $shellboxClientFactory
+	 * @param string $locale
+	 */
+	public function __construct( ShellboxClientFactory $shellboxClientFactory, $locale ) {
+		$this->rpcClient = $shellboxClientFactory->getRpcClient(
+			[ 'service' => 'icu-collation' ] );
+		$this->locale = $locale;
+	}
+
+	/**
+	 * Get the sort key of a single string. This is a batch of one, so it
+	 * costs a full remote call; prefer getSortKeys() for several strings.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	public function getSortKey( $string ) {
+		return $this->getSortKeys( [ $string ] )[0];
+	}
+
+	/**
+	 * Encode an array of binary strings as a string
+	 *
+	 * Each element becomes an eight digit, zero padded, lower case
+	 * hexadecimal byte length followed by the element itself. Array keys
+	 * are not encoded; only the order of the elements is kept.
+	 *
+	 * @param string[] $strings
+	 * @return string
+	 */
+	private static function encode( $strings ) {
+		$ret = '';
+		foreach ( $strings as $s ) {
+			$ret .= sprintf( "%08x", strlen( $s ) ) . $s;
+		}
+		return $ret;
+	}
+
+	/**
+	 * Decode the value returned by encode()
+	 *
+	 * @param string $blob
+	 * @return string[] The decoded strings as a list (keys 0..n-1)
+	 * @throws RuntimeException If the blob is truncated or a length prefix
+	 *   is not made of eight hexadecimal digits
+	 */
+	private static function decode( $blob ) {
+		$total = strlen( $blob );
+		$p = 0;
+		$ret = [];
+		while ( $p < $total ) {
+			$prefix = substr( $blob, $p, 8 );
+			// Reject a short or non-hexadecimal prefix instead of letting
+			// intval() quietly turn it into a zero length.
+			if ( !preg_match( '/^[0-9A-Fa-f]{8}$/D', $prefix ) ) {
+				throw new RuntimeException(
+					__METHOD__ . ": malformed length prefix at offset $p" );
+			}
+			$len = intval( $prefix, 16 );
+			$p += 8;
+			// A payload shorter than announced means the blob was cut off;
+			// substr() would otherwise silently return a shorter string.
+			if ( $len > $total - $p ) {
+				throw new RuntimeException(
+					__METHOD__ . ": truncated data at offset $p" );
+			}
+			$ret[] = substr( $blob, $p, $len );
+			$p += $len;
+		}
+		return $ret;
+	}
+
+	/**
+	 * Get sort keys for several strings with a single remote call.
+	 *
+	 * @param string[] $strings
+	 * @return string[] Sort keys, indexed by the keys of $strings
+	 * @throws RuntimeException If the remote result does not contain exactly
+	 *   one sort key per input string
+	 */
+	public function getSortKeys( $strings ) {
+		// Nothing to compute, so avoid the remote round trip.
+		if ( !count( $strings ) ) {
+			return [];
+		}
+		$blob = $this->rpcClient->call(
+			'icu-collation',
+			self::class . '::' . 'doGetSortKeys',
+			[
+				$this->locale,
+				// Keys are dropped for transport and restored below.
+				self::encode( array_values( $strings ) )
+			],
+			[
+				// Presumably the class sources the remote side needs in
+				// order to run doGetSortKeys(); confirm against Shellbox.
+				'classes' => [ parent::class, self::class ],
+				'binary' => true
+			]
+		);
+		$sortKeys = self::decode( $blob );
+		// array_combine() returns false (PHP 7) or throws ValueError (PHP 8)
+		// on a count mismatch; report the broken response explicitly.
+		if ( count( $sortKeys ) !== count( $strings ) ) {
+			throw new RuntimeException(
+				__METHOD__ . ': expected ' . count( $strings ) .
+				' sort keys from the remote service, got ' . count( $sortKeys ) );
+		}
+		return array_combine( array_keys( $strings ), $sortKeys );
+	}
+
+	/**
+	 * Not supported by this collation.
+	 *
+	 * @param string $string
+	 * @return string Never returns
+	 * @throws RuntimeException Always
+	 */
+	public function getFirstLetter( $string ) {
+		// @phan-suppress-previous-line PhanPluginNeverReturnMethod
+		throw new RuntimeException( __METHOD__ . ': not implemented' );
+	}
+
+	/**
+	 * The remote entry point. Get sort keys for an encoded list of inputs.
+	 *
+	 * @param string $locale The ICU locale
+	 * @param string $blob The input array encoded with encode()
+	 * @return string The encoded result
+	 * @throws RuntimeException If no collator can be created for $locale or
+	 *   if $blob is not a valid encode() result
+	 */
+	public static function doGetSortKeys( $locale, $blob ) {
+		$mainCollator = Collator::create( $locale );
+		if ( !$mainCollator ) {
+			throw new RuntimeException( "Invalid ICU locale specified for collation: $locale" );
+		}
+
+		// If the special suffix for numeric collation is present, turn on numeric collation.
+		if ( substr( $locale, -5, 5 ) === '-u-kn' ) {
+			$mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
+		}
+		$ret = [];
+		foreach ( self::decode( $blob ) as $string ) {
+			$key = $mainCollator->getSortKey( $string );
+			// Collator::getSortKey() returns false on failure. Such a key
+			// is encoded as an empty string, stated explicitly here so that
+			// a boolean is never handed to strlen() in encode().
+			$ret[] = $key === false ? '' : $key;
+		}
+		return self::encode( $ret );
+	}
+}
diff --git a/tests/phpunit/includes/collation/RemoteIcuCollationTest.php b/tests/phpunit/includes/collation/RemoteIcuCollationTest.php
new file mode 100644
index 000000000000..49c79d1ffb08
--- /dev/null
+++ b/tests/phpunit/includes/collation/RemoteIcuCollationTest.php
@@ -0,0 +1,128 @@
+<?php
+
+use Wikimedia\TestingAccessWrapper;
+
+/**
+ * Tests for the string list encoding of RemoteIcuCollation and for the sort
+ * keys it returns.
+ *
+ * @covers RemoteIcuCollation
+ */
+class RemoteIcuCollationTest extends MediaWikiLangTestCase {
+	/**
+	 * @return array[] Pairs of [ input strings, expected encoded blob ]
+	 */
+	public static function provideEncode() {
+		return [
+			[
+				[],
+				''
+			],
+			[
+				[ 'foo' ],
+				'00000003foo'
+			],
+			[
+				[ 'foo', 'a somewhat longer string' ],
+				'00000003foo00000018a somewhat longer string'
+			],
+		];
+	}
+
+	/** @dataProvider provideEncode */
+	public function testEncode( $input, $expected ) {
+		$coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class );
+		$this->assertSame( $expected, $coll->encode( $input ) );
+	}
+
+	/**
+	 * @return array[] Lists of strings that must survive an encode/decode round trip
+	 */
+	public static function provideEncodeDecode() {
+		return [
+			[ [ "\000" ] ],
+			[ [ "a\000b" ] ],
+			[ [ str_repeat( "\001", 100 ) ] ],
+			[ [ 'foo' ] ],
+			[ [ 'foo', 'bar' ] ],
+			[ [ 'foo', 'bar', str_repeat( 'x', 1000 ) ] ]
+		];
+	}
+
+	/** @dataProvider provideEncodeDecode */
+	public function testEncodeDecode( $input ) {
+		$coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class );
+		$this->assertSame( $input, $coll->decode( $coll->encode( $input ) ) );
+	}
+
+	/**
+	 * Each case lists its strings in the order their sort keys are expected
+	 * to ascend.
+	 *
+	 * @return Generator
+	 */
+	public static function provideGetSortKeys() {
+		$cases = [
+			[],
+			[ '' ],
+			[ 'test1' => 'bar', 'test2' => 'foo' ],
+			[
+				'bar',
+				'foo'
+			],
+			[
+				'first',
+				'Second'
+			],
+			[
+				'',
+				'second'
+			],
+			[
+				'Berić',
+				'Berisha',
+			],
+			[
+				'2',
+				'10',
+			]
+		];
+		foreach ( $cases as $case ) {
+			yield [ $case ];
+		}
+	}
+
+	/**
+	 * Create the collation under test, skipping the current test when the
+	 * PHP intl extension is not loaded.
+	 *
+	 * @return RemoteIcuCollation
+	 */
+	private function newCollation() {
+		if ( !extension_loaded( 'intl' ) ) {
+			$this->markTestSkipped( 'Need PHP intl' );
+		}
+		return new RemoteIcuCollation(
+			$this->getServiceContainer()->getShellboxClientFactory(),
+			'uca-default-u-kn'
+		);
+	}
+
+	/**
+	 * Assert that every element is a string and that, compared bytewise,
+	 * each one sorts strictly after the one before it.
+	 *
+	 * @param array $keys Sort keys in their expected order
+	 */
+	private function assertStrictlyAscending( array $keys ) {
+		$prevKey = null;
+		foreach ( $keys as $key ) {
+			$this->assertIsString( $key );
+			// Compare against null rather than relying on truthiness: a
+			// falsy key such as '' or '0' must not skip the comparison.
+			if ( $prevKey !== null ) {
+				$this->assertLessThan( 0, strcmp( $prevKey, $key ) );
+			}
+			$prevKey = $key;
+		}
+	}
+
+	/** @dataProvider provideGetSortKeys */
+	public function testGetSortKeys( $inputs ) {
+		$coll = $this->newCollation();
+		$sortKeys = $coll->getSortKeys( $inputs );
+		if ( !count( $inputs ) ) {
+			$this->assertSame( [], $sortKeys );
+			return;
+		}
+		// The batch result must be indexed by the keys of the input.
+		$this->assertSame( array_keys( $inputs ), array_keys( $sortKeys ) );
+		$this->assertStrictlyAscending( $sortKeys );
+	}
+
+	/** @dataProvider provideGetSortKeys */
+	public function testGetSortKey( $inputs ) {
+		if ( !count( $inputs ) ) {
+			// Not risky, it's just handy to reuse the provider
+			$this->assertTrue( true );
+		}
+		$coll = $this->newCollation();
+		$sortKeys = [];
+		foreach ( $inputs as $input ) {
+			$sortKeys[] = $coll->getSortKey( $input );
+		}
+		$this->assertStrictlyAscending( $sortKeys );
+	}
+}