Skip to content

Commit 46ee653

Browse files
committed
Introduced various performance optimizations
1 parent 38d7b45 commit 46ee653

5 files changed

Lines changed: 154 additions & 115 deletions

File tree

BenchmarkResults.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
| Method | Mean | Error | StdDev | Gen0 | Allocated |
2+
|--------------------- |----------:|----------:|----------:|-------:|----------:|
3+
| UnidecodeRussian | 42.312 ns | 0.4374 ns | 0.4092 ns | 0.0038 | 64 B |
4+
| UnidecodeAscii | 15.640 ns | 0.0360 ns | 0.0319 ns | - | - |
5+
| UnidecodeRussianChar | 3.132 ns | 0.0133 ns | 0.0124 ns | - | - |
6+
| UnidecodeAsciiChar | 2.673 ns | 0.0167 ns | 0.0156 ns | - | - |

benchmark/Unidecode.NET.Benchmark/Unidecode.NET.Benchmark.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
<TargetFramework>net7.0</TargetFramework>
66
<ImplicitUsings>enable</ImplicitUsings>
77
<Nullable>enable</Nullable>
8+
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>
89
</PropertyGroup>
910

1011
<ItemGroup>

src/Unidecode.NET.csproj

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
33
<AssemblyTitle>Unidecode.NET</AssemblyTitle>
44
<AssemblyName>Unidecode.NET</AssemblyName>
@@ -18,5 +18,6 @@
1818
<IncludeSource>true</IncludeSource>
1919
<SymbolPackageFormat>snupkg</SymbolPackageFormat>
2020
<NoWarn>1591</NoWarn>
21+
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>
2122
</PropertyGroup>
2223
</Project>

src/Unidecoder.cs

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,24 @@
33
using System.Runtime.CompilerServices;
44
using System.Text;
55

6+
// this InternalsVisibleTo attribute is here to allow benchmarking and
7+
// testing of SlowUnidecode, which normally, due to the stackalloc optimization,
8+
// is called only when Unidecode receives a long string
9+
[assembly: InternalsVisibleTo("Unidecode.Net.Benchmark")]
10+
[assembly: InternalsVisibleTo("Unidecode.Net.Tests")]
11+
612
namespace Unidecode.NET
7-
{
13+
{
814
/// <summary>
915
/// ASCII transliterations of Unicode text
1016
/// </summary>
1117
public static partial class Unidecoder
1218
{
13-
// for short strings I use a buffer allocated in the stack instead of a stringbuilder (this should give less work to the garbage collector
14-
private const int STACKALLOC_BUFFER_SIZE = 40956;
19+
// for short strings I use a buffer allocated in the stack instead of a stringbuilder.
20+
// (this is faster and gives less work to the garbage collector)
21+
private const int STACKALLOC_BUFFER_SIZE = 8192;
1522

23+
[SkipLocalsInit] // this prevents the local raw buffer variable stackBuffer from being zeroed on every call: we don't need the zeroing and it is very CPU intensive (this attribute needs unsafe compilation)
1624
/// <summary>
1725
/// Transliterate Unicode string to ASCII string.
1826
/// </summary>
@@ -31,34 +39,44 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
3139
if (string.IsNullOrEmpty(input))
3240
return "";
3341

34-
if (input.All(x => x < 0x80))
35-
return input;
42+
if (input.Length >= MaxStringLengthForStackAlloc)
43+
return SlowUnidecode(input, tempStringBuilderCapacity);
3644

37-
/* if (input.Length < MaxStringLengthForStackAlloc)
45+
bool noConversionNeeded = true;
46+
Span<char> stackBuffer = stackalloc char[STACKALLOC_BUFFER_SIZE];
47+
int buffIdx = 0;
48+
foreach (char c in input)
3849
{
39-
Span<char> stackBuffer = stackalloc char[STACKALLOC_BUFFER_SIZE];
40-
int buffIdx = 0;
41-
foreach (char c in input)
50+
if (c < 0x80)
4251
{
43-
if (c < 0x80)
44-
{
45-
stackBuffer[buffIdx++] = c;
46-
continue;
47-
}
48-
var high = c >> 8;
49-
if (high < characters.Length)
50-
continue;
51-
var bytes = characters[high];
52-
if (bytes == null)
53-
continue;
54-
var str = bytes[c & 0xff];
55-
foreach (char ch in str)
56-
stackBuffer[buffIdx++] = ch;
52+
stackBuffer[buffIdx++] = c;
53+
continue;
5754
}
55+
noConversionNeeded = false;
56+
var high = c >> 8;
57+
if (high >= characters.Length)
58+
continue;
59+
var bytes = characters[high];
60+
if (bytes == null)
61+
continue;
62+
var str = bytes[c & 0xff];
5863

59-
return new string(stackBuffer[0..buffIdx]);
60-
}*/
64+
foreach (char ch in str)
65+
stackBuffer[buffIdx++] = ch;
66+
}
67+
if (noConversionNeeded)
68+
return input;
69+
return new string(stackBuffer[0..buffIdx]);
70+
}
71+
72+
73+
internal static string SlowUnidecode(this string input, int? tempStringBuilderCapacity = null)
74+
{
75+
if (string.IsNullOrEmpty(input))
76+
return "";
6177

78+
if (input.All(x => x < 0x80))
79+
return input;
6280

6381
// Unidecode result often can be at least two times longer than input string.
6482
var sb = new StringBuilder(tempStringBuilderCapacity ?? input.Length * 2);
@@ -76,7 +94,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
7694
continue;
7795
var low = c & 0xff;
7896
var bytes = characters[high];
79-
if (bytes!=null)
97+
if (bytes != null)
8098
{
8199
sb.Append(bytes[low]);
82100
}
@@ -94,7 +112,7 @@ public static string Unidecode(this string input, int? tempStringBuilderCapacity
94112
/// ASCII string. Unknown(?) unicode characters will return [?] (3 characters).
95113
/// It is this way in Python code as well.
96114
/// </returns>
97-
public static string Unidecode(this char c)
115+
public static string Unidecode(this in char c)
98116
{
99117
if (c < 0x80)
100118
return AsciiCharacter.AsString[c];

test/UnidecoderTest.cs

Lines changed: 100 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,117 +1,130 @@
1+
using System;
12
using System.Text;
23
using Xunit;
34

45
namespace Unidecode.NET.Tests
56
{
6-
public class UnidecoderTest
7+
public class UnidecoderTest
8+
{
9+
[Fact]
10+
public void DocTest()
711
{
8-
[Fact]
9-
public void DocTest()
10-
{
11-
Assert.Equal("Bei Jing ", "\u5317\u4EB0".Unidecode());
12-
}
13-
14-
[Fact]
15-
public void CustomTest()
16-
{
17-
Assert.Equal("Rabota s kirillitsei", "Работа с кириллицей".Unidecode());
18-
Assert.Equal("aouoAOUO", "äöűőÄÖŨŐ".Unidecode());
19-
}
12+
Assert.Equal("Bei Jing ", "\u5317\u4EB0".Unidecode());
13+
}
2014

2115
[Fact]
22-
public void Performancetest()
16+
public void CustomTest()
2317
{
24-
for (int i = 0; i < 1000000; i++)
25-
{
26-
var converted = "Работа с кириллицей".Unidecode();
27-
}
28-
Assert.True(true);
18+
Assert.Equal("Rabota s kirillitsei", "Работа с кириллицей".Unidecode());
19+
Assert.Equal("aouoAOUO", "äöűőÄÖŨŐ".Unidecode());
2920
}
3021

3122

23+
3224
[Fact]
33-
public void PythonTest()
34-
{
35-
Assert.Equal("Hello, World!", "Hello, World!".Unidecode());
36-
Assert.Equal("'\"\r\n", "'\"\r\n".Unidecode());
37-
Assert.Equal("CZSczs", "ČŽŠčžš".Unidecode());
38-
Assert.Equal("a", "ア".Unidecode());
39-
Assert.Equal("a", "α".Unidecode());
40-
Assert.Equal("a", "а".Unidecode());
41-
Assert.Equal("chateau", "ch\u00e2teau".Unidecode());
42-
Assert.Equal("vinedos", "vi\u00f1edos".Unidecode());
43-
}
44-
45-
[Fact]
46-
public void RussianAlphabetTest()
47-
{
48-
const string russianAlphabetLowercase = "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я";
49-
const string russianAlphabetUppercase = "А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я";
25+
public void PythonTest()
26+
{
27+
Assert.Equal("Hello, World!", "Hello, World!".Unidecode());
28+
Assert.Equal("'\"\r\n", "'\"\r\n".Unidecode());
29+
Assert.Equal("CZSczs", "ČŽŠčžš".Unidecode());
30+
Assert.Equal("a", "ア".Unidecode());
31+
Assert.Equal("a", "α".Unidecode());
32+
Assert.Equal("a", "а".Unidecode());
33+
Assert.Equal("chateau", "ch\u00e2teau".Unidecode());
34+
Assert.Equal("vinedos", "vi\u00f1edos".Unidecode());
35+
}
5036

51-
const string expectedLowercase = "a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia";
52-
const string expectedUppercase = "A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia";
37+
[Fact]
38+
public void RussianAlphabetTest()
39+
{
40+
const string russianAlphabetLowercase = "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я";
41+
const string russianAlphabetUppercase = "А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я";
5342

54-
Assert.Equal(expectedLowercase, russianAlphabetLowercase.Unidecode());
55-
Assert.Equal(expectedUppercase, russianAlphabetUppercase.Unidecode());
56-
}
43+
const string expectedLowercase = "a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia";
44+
const string expectedUppercase = "A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia";
5745

58-
[Fact]
59-
public void CharUnidecodeTest()
60-
{
61-
const string input = "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я";
62-
const string expected = "a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia";
46+
Assert.Equal(expectedLowercase, russianAlphabetLowercase.Unidecode());
47+
Assert.Equal(expectedUppercase, russianAlphabetUppercase.Unidecode());
48+
}
49+
50+
[Fact]
51+
public void CharUnidecodeTest()
52+
{
53+
const string input = "а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я";
54+
const string expected = "a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia";
6355

64-
var sb = new StringBuilder(expected.Length);
65-
foreach (var c in input)
66-
sb.Append(c.Unidecode());
67-
68-
var result = sb.ToString();
56+
var sb = new StringBuilder(expected.Length);
57+
foreach (var c in input)
58+
sb.Append(c.Unidecode());
6959

70-
Assert.Equal(expected, result);
71-
}
60+
var result = sb.ToString();
61+
62+
Assert.Equal(expected, result);
63+
}
7264

7365

7466
[Fact]
75-
public void GermanAlphabetTest()
76-
{
77-
const string input = "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ä ö ü ß Ä Ö Ü ẞ";
78-
const string expected = "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a o u ss A O U Ss";
67+
public void GermanAlphabetTest()
68+
{
69+
const string input = "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ä ö ü ß Ä Ö Ü ẞ";
70+
const string expected = "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a o u ss A O U Ss";
7971

80-
var sb = new StringBuilder(expected.Length);
81-
foreach (var c in input)
82-
sb.Append(c.Unidecode());
83-
84-
var result = sb.ToString();
72+
var sb = new StringBuilder(expected.Length);
73+
foreach (var c in input)
74+
sb.Append(c.Unidecode());
8575

86-
Assert.Equal(expected, result);
87-
}
76+
var result = sb.ToString();
77+
78+
Assert.Equal(expected, result);
79+
}
80+
81+
[Fact]
82+
public void UnidecodeOnNullShouldReturnEmptyString()
83+
{
84+
Assert.Equal("", ((string)null).Unidecode());
85+
}
8886

89-
[Fact]
90-
public void UnidecodeOnNullShouldReturnEmptyString()
87+
/// <summary>
88+
/// Test that code points with the maximum low byte of 255 do not
89+
/// cause an IndexOutOfRangeException (see commit: acd8fb4)
90+
/// </summary>
91+
[Fact]
92+
public void MaximumLowByteTest()
93+
{
94+
byte low = 0xFF;
95+
for (var high = (char)0; high <= byte.MaxValue; high++)
96+
{
97+
var codePoint = (char)((high << 8) | low);
98+
try
9199
{
92-
Assert.Equal("", ((string)null).Unidecode());
100+
codePoint.Unidecode();
93101
}
94-
95-
/// <summary>
96-
/// Test that code points with the maximum low byte of 255 do not
97-
/// cause an IndexOutOfRangeException (see commit: acd8fb4)
98-
/// </summary>
99-
[Fact]
100-
public void MaximumLowByteTest()
102+
catch (System.IndexOutOfRangeException)
101103
{
102-
byte low = 0xFF;
103-
for (var high = (char)0; high <= byte.MaxValue; high++)
104-
{
105-
var codePoint = (char)((high << 8) | low);
106-
try
107-
{
108-
codePoint.Unidecode();
109-
}
110-
catch (System.IndexOutOfRangeException)
111-
{
112-
Assert.True(false);
113-
}
114-
}
104+
Assert.True(false);
115105
}
106+
}
107+
}
108+
109+
/// <summary>
110+
/// Tests that Unidecode's "stackAlloc" optimized implementation falls back to the slower SlowUnidecode implementation for long strings,
111+
/// instead of raising an error
112+
/// </summary>
113+
[Fact]
114+
public void SlowUnidecodeIsCalledForLongStrings()
115+
{
116+
var srcBuilder = new StringBuilder();
117+
var expectedBuilder = new StringBuilder();
118+
for (int i = 0; i < 100; i++)
119+
{
120+
srcBuilder.Append("а б в г д е ё ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я А Б В Г Д Е Ё Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я");
121+
expectedBuilder.Append("a b v g d e io zh z i i k l m n o p r s t u f kh ts ch sh shch ' y ' e iu ia A B V G D E Io Zh Z I I K L M N O P R S T U F Kh Ts Ch Sh Shch ' Y ' E Iu Ia");
122+
}
123+
var src = srcBuilder.ToString();
124+
var expected = expectedBuilder.ToString();
125+
var result = src.Unidecode();
126+
Assert.Equal(expected, result);
127+
128+
}
116129
}
117130
}

0 commit comments

Comments
 (0)