Skip to content

Commit e119903

Browse files
authored
Merge pull request #16 from csm101/array_instead_of_dictionary
3x performance, resolved memory allocation problem and correct handling of double-character codepoints
2 parents 5c6dde5 + 785d26e commit e119903

16 files changed

Lines changed: 828 additions & 1427 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ obj
44
py-codes
55

66
*.user
7+
.vs/
8+
*.ncrunchsolution
9+
*.ncrunchproject

BenchmarkResults.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
| Method | Mean | Error | StdDev | Gen0 | Allocated |
2+
|--------------------- |----------:|----------:|----------:|-------:|----------:|
3+
| UnidecodeRussian | 42.312 ns | 0.4374 ns | 0.4092 ns | 0.0038 | 64 B |
4+
| UnidecodeAscii | 15.640 ns | 0.0360 ns | 0.0319 ns | - | - |
5+
| UnidecodeRussianChar | 3.132 ns | 0.0133 ns | 0.0124 ns | - | - |
6+
| UnidecodeAsciiChar | 2.673 ns | 0.0167 ns | 0.0156 ns | - | - |

Convert.bat

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
@python py2cs.py
2+
@copy unidecoder-decodemap.txt assets /y

Unidecode.NET.sln

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
3-
# Visual Studio 15
4-
VisualStudioVersion = 15.0.26228.4
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.6.33815.320
55
MinimumVisualStudioVersion = 10.0.40219.1
66
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{604AAA61-6C9C-4421-9DA5-0805968113A8}"
77
EndProject
88
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{C9018F64-4917-4F4F-8F78-3A674896029D}"
99
ProjectSection(SolutionItems) = preProject
10+
.editorconfig = .editorconfig
1011
LICENSE = LICENSE
1112
readme.md = readme.md
12-
.editorconfig = .editorconfig
1313
EndProjectSection
1414
EndProject
1515
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Unidecode.NET", "src\Unidecode.NET.csproj", "{3ED46251-F3F1-43F0-8776-A5055D96BB56}"
@@ -18,6 +18,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{8F8EB995-9
1818
EndProject
1919
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Unidecode.NET.Tests", "test\Unidecode.NET.Tests.csproj", "{C93F3F13-BFB7-4440-BC52-B1BFAC74EB1B}"
2020
EndProject
21+
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "benchmark", "benchmark", "{6B823939-68FD-4909-90D9-A6B4BE177ADA}"
22+
EndProject
23+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Unidecode.NET.Benchmark", "benchmark\Unidecode.NET.Benchmark\Unidecode.NET.Benchmark.csproj", "{BBB60B62-9870-460D-8629-4825D096F9A3}"
24+
EndProject
2125
Global
2226
GlobalSection(SolutionConfigurationPlatforms) = preSolution
2327
Debug|Any CPU = Debug|Any CPU
@@ -32,12 +36,20 @@ Global
3236
{C93F3F13-BFB7-4440-BC52-B1BFAC74EB1B}.Debug|Any CPU.Build.0 = Debug|Any CPU
3337
{C93F3F13-BFB7-4440-BC52-B1BFAC74EB1B}.Release|Any CPU.ActiveCfg = Release|Any CPU
3438
{C93F3F13-BFB7-4440-BC52-B1BFAC74EB1B}.Release|Any CPU.Build.0 = Release|Any CPU
39+
{BBB60B62-9870-460D-8629-4825D096F9A3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
40+
{BBB60B62-9870-460D-8629-4825D096F9A3}.Debug|Any CPU.Build.0 = Debug|Any CPU
41+
{BBB60B62-9870-460D-8629-4825D096F9A3}.Release|Any CPU.ActiveCfg = Release|Any CPU
42+
{BBB60B62-9870-460D-8629-4825D096F9A3}.Release|Any CPU.Build.0 = Release|Any CPU
3543
EndGlobalSection
3644
GlobalSection(SolutionProperties) = preSolution
3745
HideSolutionNode = FALSE
3846
EndGlobalSection
3947
GlobalSection(NestedProjects) = preSolution
4048
{3ED46251-F3F1-43F0-8776-A5055D96BB56} = {604AAA61-6C9C-4421-9DA5-0805968113A8}
4149
{C93F3F13-BFB7-4440-BC52-B1BFAC74EB1B} = {8F8EB995-97FD-41CD-B307-E2F8E987C468}
50+
{BBB60B62-9870-460D-8629-4825D096F9A3} = {6B823939-68FD-4909-90D9-A6B4BE177ADA}
51+
EndGlobalSection
52+
GlobalSection(ExtensibilityGlobals) = postSolution
53+
SolutionGuid = {AFE6DE01-A43D-4110-90C8-75C670655EF1}
4254
EndGlobalSection
4355
EndGlobal

Unidecoder.Characters.cs

Lines changed: 0 additions & 608 deletions
This file was deleted.

assets/unidecoder-decodemap.txt

Lines changed: 190 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
using System.Text;
2+
using BenchmarkDotNet.Attributes;
3+
4+
namespace Unidecode.NET.Benchmark;
5+
/// <summary>
/// BenchmarkDotNet micro-benchmarks for the <c>Unidecode</c> extension methods,
/// covering string, <see cref="char"/> and <see cref="Rune"/> inputs with both
/// Cyrillic and plain-ASCII samples.
/// </summary>
/// <remarks>
/// Every benchmark returns the value it computes instead of storing it in an
/// unused local. BenchmarkDotNet consumes returned values, which keeps the JIT
/// from treating the measured call as dead code and optimising it away.
/// </remarks>
[MemoryDiagnoser]
public class Benchmarks
{
    // Sample inputs shared by the string benchmarks.
    private const string RussianText = "Работа с кириллицей";
    private const string AsciiText = "Hello World!";

    // Runes are built once so that their construction is not part of the measurement.
    private static readonly Rune RussianRune = new('и');
    private static readonly Rune AsciiRune = new('Z');

    /// <summary>Transliterates the Cyrillic sample with <c>UnidecodeAlgorithm.Fast</c>.</summary>
    [Benchmark]
    public string FastUnidecodeRussian() => RussianText.Unidecode(UnidecodeAlgorithm.Fast);

    /// <summary>Transliterates the Cyrillic sample with <c>UnidecodeAlgorithm.Complete</c>.</summary>
    [Benchmark]
    public string CompleteUnidecodeRussian() => RussianText.Unidecode(UnidecodeAlgorithm.Complete);

    /// <summary>Transliterates the ASCII sample with <c>UnidecodeAlgorithm.Fast</c>.</summary>
    [Benchmark]
    public string FastUnidecodeAscii() => AsciiText.Unidecode(UnidecodeAlgorithm.Fast);

    /// <summary>Transliterates the ASCII sample with <c>UnidecodeAlgorithm.Complete</c>.</summary>
    [Benchmark]
    public string CompleteUnidecodeAscii() => AsciiText.Unidecode(UnidecodeAlgorithm.Complete);

    /// <summary>Transliterates a single Cyrillic <see cref="char"/>.</summary>
    [Benchmark]
    public string UnidecodeRussianChar() => 'и'.Unidecode();

    /// <summary>Transliterates a single ASCII <see cref="char"/>.</summary>
    [Benchmark]
    public string UnidecodeAsciiChar() => 'Z'.Unidecode();

    /// <summary>Transliterates a single Cyrillic <see cref="Rune"/>.</summary>
    [Benchmark]
    public string UnidecodeRussianRune() => RussianRune.Unidecode();

    /// <summary>Transliterates a single ASCII <see cref="Rune"/>.</summary>
    [Benchmark]
    public string UnidecodeAsciiRune() => AsciiRune.Unidecode();
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
using BenchmarkDotNet.Running;
2+
3+
namespace Unidecode.NET.Benchmark;
4+
5+
/// <summary>
/// Console entry point of the benchmark project.
/// </summary>
internal class Program
{
    /// <summary>
    /// Runs every benchmark declared in <see cref="Benchmarks"/> through BenchmarkDotNet.
    /// </summary>
    /// <param name="args">Command-line arguments; not read by this method.</param>
    static void Main(string[] args) => BenchmarkRunner.Run<Benchmarks>();
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net7.0</TargetFramework>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<Nullable>enable</Nullable>
8+
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<Compile Include="..\..\src\Unidecoder.cs" Link="Unidecoder.cs" />
13+
</ItemGroup>
14+
15+
<ItemGroup>
16+
<EmbeddedResource Include="..\..\assets\unidecoder-decodemap.txt" Link="unidecoder-decodemap.txt" />
17+
</ItemGroup>
18+
19+
<ItemGroup>
20+
<PackageReference Include="BenchmarkDotNet" Version="0.13.5" />
21+
</ItemGroup>
22+
23+
</Project>

py2cs.py

Lines changed: 22 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,27 @@
11
import os, re
22

3+
# this python script generates the unidecoder-decodemap.txt file
4+
# from the original Python source. To make it run, you need
5+
# to download this directory from the original repository:
6+
# https://github.com/avian2/unidecode/tree/master/unidecode
7+
# and extract it into a folder named "py-codes"
8+
# when run, it will generate a file named unidecoder-decodemap.txt
9+
# that must be copied into the assets folder.
10+
# this file will be included in the assembly and used in the static
11+
# constructor of Unidecoder class.
12+
313
d = "py-codes" # https://github.com/avian2/unidecode/tree/master/unidecode
414
print("working...")
515

6-
fp = open("Unidecoder.Characters.cs", "w")
7-
fp.write('''/*
8-
COPYRIGHT
9-
10-
Character transliteration tables:
11-
12-
Copyright 2001, Sean M. Burke <sburke@cpan.org>, all rights reserved.
13-
14-
Python code:
15-
16-
Copyright 2009, Tomaz Solc <tomaz@zemanta.com>
17-
18-
The programs and documentation in this dist are distributed in the
19-
hope that they will be useful, but without any warranty; without even
20-
the implied warranty of merchantability or fitness for a particular
21-
purpose.
22-
23-
This library is free software; you can redistribute it and/or modify
24-
it under the same terms as Perl.
25-
*/
26-
27-
/*
28-
Don't edit, this code is generated.
29-
*/
30-
31-
using System;
32-
using System.Collections.Generic;
33-
34-
namespace Unidecode.NET
35-
{
36-
public static partial class Unidecoder
37-
{
38-
private static readonly Dictionary<int, string[]> characters;
39-
40-
static Unidecoder()
41-
{
42-
characters = new Dictionary<int, string[]> {
43-
''')
44-
16+
fp = open("unidecoder-decodemap.txt", "w")
4517

4618
def formatch(ch, cc):
    """Return the text written between double quotes for one table entry.

    ch -- replacement string taken from the Python unidecode table.
    cc -- code point of the character the entry belongs to.

    Code points up to 31 are emitted as a four-digit ``\\uXXXX`` escape of the
    code point itself; for every other code point the escaped form of ``ch``
    is returned.
    """
    # Order matters: backslashes are doubled before the quote escape adds new
    # ones, and the newline rule runs last so the quotes it inserts stay bare.
    substitutions = (
        ("\r", ""),
        ("\\", "\\\\"),
        ('"', '\\"'),
        ("\n", '"\\n"'),
    )
    escaped = ch
    for needle, replacement in substitutions:
        escaped = escaped.replace(needle, replacement)

    if cc > 31:
        return escaped
    return "\\u" + format(cc, "x").rjust(4, "0")
5224

53-
5425
for file in [file for file in os.listdir(d) if not file in [".", ".."]]:
5526
m = re.search('x(.{3})\.py$', file)
5627
if m:
@@ -61,21 +32,18 @@ def formatch(ch, cc):
6132
data += (fill,)*missing
6233
assert len(data) == 256
6334
c = 0
64-
num = int(m.group(1), 16) * 256
65-
fp.write(' {%s /*%s %s*/, new[]{\n' % (int(m.group(1), 16), num, m.group(1)))
35+
idx = int(m.group(1), 16)
36+
num = idx * 256
37+
fp.write('%3s\t' % (idx))
6638
for ch in data:
67-
fp.write('"%s" /*%s*/%s ' % (
68-
formatch(ch, num + c),
69-
("%x" % (num + c)).rjust(4, '0'),
70-
"," if c < 255 else ""))
39+
if ch is None:
40+
fp.write('""');
41+
else:
42+
fp.write('"%s"' % (formatch(ch, num + c)))
43+
if c<255:
44+
fp.write('\t')
7145
c = c + 1
72-
fp.write('}},\n\n')
46+
fp.write('\n')
7347

74-
fp.write(
75-
''' };
76-
}
77-
}
78-
}
79-
''')
8048
print("converted!")
8149

0 commit comments

Comments
 (0)