Skip to content

Commit 3448be8

Browse files
committed
Unidecoder.Character now uses a string[][] instead of a Dictionary<int,string[]>
1 parent 5c6dde5 commit 3448be8

8 files changed

Lines changed: 1211 additions & 1230 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ obj
44
py-codes
55

66
*.user
7+
.vs/
8+
*.ncrunchsolution
9+
*.ncrunchproject

Unidecoder.Characters.cs

Lines changed: 520 additions & 572 deletions
Large diffs are not rendered by default.

py2cs.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,19 @@
2929
*/
3030
3131
using System;
32-
using System.Collections.Generic;
3332
34-
namespace Unidecode.NET
33+
namespace Unidecode.NET
3534
{
3635
public static partial class Unidecoder
3736
{
38-
private static readonly Dictionary<int, string[]> characters;
37+
private static readonly string[][] characters;
38+
private static readonly int MaxDecodedCharLength;
39+
private static readonly int MaxStringLengthForStackAlloc;
3940
4041
static Unidecoder()
4142
{
42-
characters = new Dictionary<int, string[]> {
43+
characters = new string[][]
44+
{
4345
''')
4446

4547

@@ -50,7 +52,8 @@ def formatch(ch, cc):
5052
ch = ch.replace("\n", '"+Environment.NewLine+"')
5153
return ch if cc > 31 else "\\u" + ('%x' % cc).rjust(4, '0')
5254

53-
55+
lastFoundIndex = -1
56+
indent = ' '
5457
for file in [file for file in os.listdir(d) if not file in [".", ".."]]:
5558
m = re.search('x(.{3})\.py$', file)
5659
if m:
@@ -61,18 +64,40 @@ def formatch(ch, cc):
6164
data += (fill,)*missing
6265
assert len(data) == 256
6366
c = 0
64-
num = int(m.group(1), 16) * 256
65-
fp.write(' {%s /*%s %s*/, new[]{\n' % (int(m.group(1), 16), num, m.group(1)))
67+
idx = int(m.group(1), 16)
68+
num = idx * 256
69+
fp.write(indent)
70+
for missingindex in range(lastFoundIndex+1,idx):
71+
fp.write('/* %3s */ null,\n' % (missingindex))
72+
fp.write(indent)
73+
lastFoundIndex = idx
74+
fp.write('/* %3s */ /*%5s %s*/ new[] {' % (idx, num, m.group(1)))
6675
for ch in data:
67-
fp.write('"%s" /*%s*/%s ' % (
68-
formatch(ch, num + c),
69-
("%x" % (num + c)).rjust(4, '0'),
70-
"," if c < 255 else ""))
71-
c = c + 1
72-
fp.write('}},\n\n')
76+
if ch is None:
77+
fp.write('"%s" /*%s*/%s ' % (
78+
'',
79+
("%x" % (num + c)).rjust(4, '0'),
80+
"," if c < 255 else ""))
81+
else:
82+
fp.write('"%s" /*%s*/%s ' % (
83+
formatch(ch, num + c),
84+
("%x" % (num + c)).rjust(4, '0'),
85+
"," if c < 255 else ""))
86+
c = c + 1
87+
fp.write('},\n')
7388

7489
fp.write(
75-
''' };
90+
''' };
91+
MaxDecodedCharLength = 1;
92+
foreach (var block in characters)
93+
{
94+
if (block == null)
95+
continue;
96+
foreach (var str in block)
97+
if (str.Length > MaxDecodedCharLength)
98+
MaxDecodedCharLength = str.Length;
99+
}
100+
MaxStringLengthForStackAlloc = STACKALLOC_BUFFER_SIZE / MaxDecodedCharLength - 1;
76101
}
77102
}
78103
}

src/Unidecode.NET.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<PackageLicenseExpression>MIT</PackageLicenseExpression>
99
<GenerateDocumentationFile>true</GenerateDocumentationFile>
1010
<Description>Unidecode.NET returns transliterated string</Description>
11-
<TargetFrameworks>netstandard1.2;netstandard2.0;netstandard2.1</TargetFrameworks>
11+
<TargetFrameworks>net7.0</TargetFrameworks>
1212
<VersionPrefix>2.1.0</VersionPrefix>
1313
<PackageTags>text;unicode;seo</PackageTags>
1414
<RepositoryType>git</RepositoryType>

src/Unidecoder.Characters.cs

Lines changed: 519 additions & 571 deletions
Large diffs are not rendered by default.

src/Unidecoder.cs

Lines changed: 111 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,80 +1,125 @@
1-
using System.Linq;
1+
using System;
2+
using System.Linq;
3+
using System.Runtime.CompilerServices;
24
using System.Text;
35

46
namespace Unidecode.NET
57
{
8+
/// <summary>
9+
/// ASCII transliterations of Unicode text
10+
/// </summary>
11+
public static partial class Unidecoder
12+
{
13+
// For short strings, use a stack-allocated buffer instead of a StringBuilder (this gives the garbage collector less work).
14+
private const int STACKALLOC_BUFFER_SIZE = 40956;
15+
616
/// <summary>
/// Transliterate Unicode string to ASCII string.
/// </summary>
/// <param name="input">String you want to transliterate into ASCII</param>
/// <param name="tempStringBuilderCapacity">
/// If you know the length of the result,
/// pass the value for StringBuilder capacity.
/// InputString.Length*2 is used by default.
/// Only used for inputs too long for the stack-allocated buffer.
/// </param>
/// <returns>
/// ASCII string (empty for a null or empty input). Characters that have no
/// transliteration block are dropped.
/// There are [?] (3 characters) in places of some unknown(?) unicode characters.
/// It is this way in Python code as well.
/// </returns>
public static string Unidecode(this string input, int? tempStringBuilderCapacity = null)
{
    if (string.IsNullOrEmpty(input))
        return "";

    // Pure ASCII input needs no work and no allocation.
    if (input.All(x => x < 0x80))
        return input;

    if (input.Length < MaxStringLengthForStackAlloc)
    {
        // Each input char expands to at most MaxDecodedCharLength chars, so this is
        // the worst-case output size. The length guard above keeps it below
        // STACKALLOC_BUFFER_SIZE, and sizing it to the input avoids reserving
        // the full buffer for every short string.
        Span<char> stackBuffer = stackalloc char[input.Length * MaxDecodedCharLength];
        int buffIdx = 0;
        foreach (char c in input)
        {
            if (c < 0x80)
            {
                stackBuffer[buffIdx++] = c;
                continue;
            }

            var high = c >> 8;
            // Skip characters whose block lies beyond the table. This check must be
            // '>=': with '<' every mapped character is dropped and unmapped
            // ones index out of range.
            if (high >= characters.Length)
                continue;

            var block = characters[high];
            if (block == null)
                continue;

            foreach (char ch in block[c & 0xff])
                stackBuffer[buffIdx++] = ch;
        }

        return new string(stackBuffer[0..buffIdx]);
    }

    // Unidecode result often can be at least two times longer than input string.
    var sb = new StringBuilder(tempStringBuilderCapacity ?? input.Length * 2);
    foreach (var c in input)
    {
        // Copypaste is bad, but sb.Append(c.Unidecode()); would be a bit slower.
        if (c < 0x80)
        {
            sb.Append(c);
            continue;
        }

        var high = c >> 8;
        if (high >= characters.Length)
            continue;

        var block = characters[high];
        if (block != null)
            sb.Append(block[c & 0xff]);
    }

    return sb.ToString();
}
88+
89+
/// <summary>
/// Transliterate Unicode character to ASCII string.
/// </summary>
/// <param name="c">Character you want to transliterate into ASCII</param>
/// <returns>
/// ASCII string; never null. An empty string is returned when no
/// transliteration block covers the character.
/// Unknown(?) unicode characters will return [?] (3 characters).
/// It is this way in Python code as well.
/// </returns>
public static string Unidecode(this char c)
{
    // ASCII characters come from a prebuilt table, so no string is allocated per call.
    if (c < 0x80)
        return AsciiCharacter.AsString[c];

    var high = c >> 8;
    if (high >= characters.Length)
        return "";

    // A missing block yields "" rather than null, so callers can use the result
    // directly; the string overload likewise contributes nothing for such characters.
    return characters[high]?[c & 0xff] ?? "";
}
111+
112+
/// <summary>
/// Holds a one-character string for every char value below 0x80,
/// indexed by the char value.
/// </summary>
private static class AsciiCharacter
{
    public static readonly string[] AsString;

    static AsciiCharacter()
    {
        var table = new string[0x80];
        for (int code = 0; code < table.Length; code++)
        {
            table[code] = new string((char)code, 1);
        }

        AsString = table;
    }
}
123+
124+
}
80125
}

test/Unidecode.NET.Tests.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFramework>netcoreapp3.1</TargetFramework>
3+
<TargetFramework>net7.0</TargetFramework>
44
</PropertyGroup>
55
<ItemGroup>
66
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.8.0" />

test/UnidecoderTest.cs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using System.Text;
1+
using System.Text;
22
using Xunit;
33

44
namespace Unidecode.NET.Tests
@@ -18,7 +18,18 @@ public void CustomTest()
1818
Assert.Equal("aouoAOUO", "äöűőÄÖŨŐ".Unidecode());
1919
}
2020

21-
[Fact]
21+
[Fact]
public void Performancetest()
{
    const string input = "Работа с кириллицей";

    // Build the expected value through the per-character overload, so the
    // repeated string conversions below are checked rather than discarded.
    var expectedBuilder = new StringBuilder();
    foreach (var c in input)
    {
        expectedBuilder.Append(c.Unidecode());
    }
    var expected = expectedBuilder.ToString();

    string converted = null;
    for (int i = 0; i < 1000000; i++)
    {
        converted = input.Unidecode();
    }

    Assert.Equal(expected, converted);
}
30+
31+
32+
[Fact]
2233
public void PythonTest()
2334
{
2435
Assert.Equal("Hello, World!", "Hello, World!".Unidecode());
@@ -58,8 +69,9 @@ public void CharUnidecodeTest()
5869

5970
Assert.Equal(expected, result);
6071
}
61-
62-
[Fact]
72+
73+
74+
[Fact]
6375
public void GermanAlphabetTest()
6476
{
6577
const string input = "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ä ö ü ß Ä Ö Ü ẞ";

0 commit comments

Comments
 (0)