summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMiha Zupan <mihazupan.zupan1@gmail.com>2024-03-02 22:39:30 +0100
committerGitHub <noreply@github.com>2024-03-02 13:39:30 -0800
commit8aff565e389f10535d9520182e26d7b45de71b60 (patch)
tree1e448e66530baf56b9b9fcb7dd17bd8990df5cff
parentcd5587ddeddecebf6291f8462b4b6c25db433a71 (diff)
Expand small value sets to all case permutations in SearchValues<string> (#98902)
* Expand small value sets to all case permutations in SearchValues<string> * Use the constant in one more place
-rw-r--r--src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs81
1 files changed, 79 insertions, 2 deletions
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
index 86a13dd04b9b..2bff05214c51 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
@@ -3,6 +3,7 @@
using System.Collections.Generic;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
@@ -14,6 +15,8 @@ namespace System.Buffers
{
internal static class StringSearchValues
{
+ private const int TeddyBucketCount = 8;
+
private static readonly SearchValues<char> s_asciiLetters =
SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
@@ -248,6 +251,18 @@ namespace System.Buffers
Debug.Assert(!(asciiStartLettersOnly && asciiStartUnaffectedByCaseConversion));
+ // If we still have empty buckets we could use and we're ignoring case, we may be able to
+ // generate all possible permutations of the first N characters and switch to case-sensitive searching.
+ // E.g. ["ab", "c!"] => ["ab", "Ab" "aB", "AB", "c!", "C!"].
+ // This won't apply to inputs with many letters (e.g. "abc" => 8 permutations on its own).
+ if (!asciiStartUnaffectedByCaseConversion &&
+ values.Length < TeddyBucketCount &&
+ TryGenerateAllCasePermutationsForPrefixes(values, n, TeddyBucketCount, out string[]? newValues))
+ {
+ asciiStartUnaffectedByCaseConversion = true;
+ values = newValues;
+ }
+
if (asciiStartUnaffectedByCaseConversion)
{
return nonAsciiAffectedByCaseConversion
@@ -278,9 +293,9 @@ namespace System.Buffers
Debug.Assert(values.Length > 1);
Debug.Assert(n is 2 or 3);
- if (values.Length > 8)
+ if (values.Length > TeddyBucketCount)
{
- string[][] buckets = TeddyBucketizer.Bucketize(values, bucketCount: 8, n);
+ string[][] buckets = TeddyBucketizer.Bucketize(values, TeddyBucketCount, n);
// Potential optimization: We don't have to pick the first N characters for the fingerprint.
// Different offset selection can noticeably improve throughput (e.g. 2x).
@@ -297,6 +312,68 @@ namespace System.Buffers
}
}
+ private static bool TryGenerateAllCasePermutationsForPrefixes(ReadOnlySpan<string> values, int n, int maxValues, [NotNullWhen(true)] out string[]? newValues)
+ {
+ Debug.Assert(n is 2 or 3);
+ Debug.Assert(values.Length < maxValues);
+
+ // Count how many possible permutations there are.
+ int newValuesCount = 0;
+
+ foreach (string value in values)
+ {
+ int permutations = 1;
+
+ foreach (char c in value.AsSpan(0, n))
+ {
+ Debug.Assert(char.IsAscii(c));
+
+ if (char.IsAsciiLetter(c))
+ {
+ permutations *= 2;
+ }
+ }
+
+ newValuesCount += permutations;
+ }
+
+ Debug.Assert(newValuesCount > values.Length, "Shouldn't have been called if there were no letters present");
+
+ if (newValuesCount > maxValues)
+ {
+ newValues = null;
+ return false;
+ }
+
+ // Generate the permutations.
+ newValues = new string[newValuesCount];
+ newValuesCount = 0;
+
+ foreach (string value in values)
+ {
+ int start = newValuesCount;
+
+ newValues[newValuesCount++] = value;
+
+ for (int i = 0; i < n; i++)
+ {
+ char c = value[i];
+
+ if (char.IsAsciiLetter(c))
+ {
+ // Copy all the previous permutations of this value but change the casing of the i-th character.
+ foreach (string previous in newValues.AsSpan(start, newValuesCount - start))
+ {
+ newValues[newValuesCount++] = $"{previous.AsSpan(0, i)}{(char)(c ^ 0x20)}{previous.AsSpan(i + 1)}";
+ }
+ }
+ }
+ }
+
+ Debug.Assert(newValuesCount == newValues.Length);
+ return true;
+ }
+
private static SearchValues<string> CreateForSingleValue(
string value,
HashSet<string>? uniqueValues,