|
1 | | -using System.Linq; |
| 1 | +using System; |
| 2 | +using System.Linq; |
| 3 | +using System.Runtime.CompilerServices; |
2 | 4 | using System.Text; |
3 | 5 |
|
4 | 6 | namespace Unidecode.NET |
5 | 7 | { |
| 8 | + /// <summary> |
| 9 | + /// ASCII transliterations of Unicode text |
| 10 | + /// </summary> |
| 11 | + public static partial class Unidecoder |
| 12 | + { |
| 13 | + // for short strings I use a buffer allocated in the stack instead of a stringbuilder (this should give less work to the garbage collector |
| 14 | + private const int STACKALLOC_BUFFER_SIZE = 40956; |
| 15 | + |
6 | 16 | /// <summary> |
7 | | - /// ASCII transliterations of Unicode text |
| 17 | + /// Transliterate Unicode string to ASCII string. |
8 | 18 | /// </summary> |
9 | | - public static partial class Unidecoder |
| 19 | + /// <param name="input">String you want to transliterate into ASCII</param> |
| 20 | + /// <param name="tempStringBuilderCapacity"> |
| 21 | + /// If you know the length of the result, |
| 22 | + /// pass the value for StringBuilder capacity. |
| 23 | + /// InputString.Length*2 is used by default. |
| 24 | + /// </param> |
| 25 | + /// <returns> |
| 26 | + /// ASCII string. There are [?] (3 characters) in places of some unknown(?) unicode characters. |
| 27 | + /// It is this way in Python code as well. |
| 28 | + /// </returns> |
| 29 | + public static string Unidecode(this string input, int? tempStringBuilderCapacity = null) |
10 | 30 | { |
11 | | - /// <summary> |
12 | | - /// Transliterate Unicode string to ASCII string. |
13 | | - /// </summary> |
14 | | - /// <param name="input">String you want to transliterate into ASCII</param> |
15 | | - /// <param name="tempStringBuilderCapacity"> |
16 | | - /// If you know the length of the result, |
17 | | - /// pass the value for StringBuilder capacity. |
18 | | - /// InputString.Length*2 is used by default. |
19 | | - /// </param> |
20 | | - /// <returns> |
21 | | - /// ASCII string. There are [?] (3 characters) in places of some unknown(?) unicode characters. |
22 | | - /// It is this way in Python code as well. |
23 | | - /// </returns> |
24 | | - public static string Unidecode(this string input, int? tempStringBuilderCapacity = null) |
25 | | - { |
26 | | - if (string.IsNullOrEmpty(input)) |
27 | | - return ""; |
28 | | - |
29 | | - if (input.All(x => x < 0x80)) |
30 | | - return input; |
31 | | - |
32 | | - // Unidecode result often can be at least two times longer than input string. |
33 | | - var sb = new StringBuilder(tempStringBuilderCapacity ?? input.Length * 2); |
34 | | - foreach (var c in input) |
35 | | - { |
36 | | - // Copypaste is bad, but sb.Append(c.Unidecode()); would be a bit slower. |
37 | | - if (c < 0x80) |
38 | | - { |
39 | | - sb.Append(c); |
40 | | - } |
41 | | - else |
42 | | - { |
43 | | - var high = c >> 8; |
44 | | - var low = c & 0xff; |
45 | | - if (characters.TryGetValue(high, out var transliterations)) |
46 | | - { |
47 | | - sb.Append(transliterations[low]); |
48 | | - } |
49 | | - } |
50 | | - } |
| 31 | + if (string.IsNullOrEmpty(input)) |
| 32 | + return ""; |
51 | 33 |
|
52 | | - return sb.ToString(); |
53 | | - } |
| 34 | + if (input.All(x => x < 0x80)) |
| 35 | + return input; |
54 | 36 |
|
55 | | - /// <summary> |
56 | | - /// Transliterate Unicode character to ASCII string. |
57 | | - /// </summary> |
58 | | - /// <param name="c">Character you want to transliterate into ASCII</param> |
59 | | - /// <returns> |
60 | | - /// ASCII string. Unknown(?) unicode characters will return [?] (3 characters). |
61 | | - /// It is this way in Python code as well. |
62 | | - /// </returns> |
63 | | - public static string Unidecode(this char c) |
| 37 | + if (input.Length < MaxStringLengthForStackAlloc) |
| 38 | + { |
| 39 | + Span<char> stackBuffer = stackalloc char[STACKALLOC_BUFFER_SIZE]; |
| 40 | + int buffIdx = 0; |
| 41 | + foreach (char c in input) |
64 | 42 | { |
65 | | - string result; |
66 | | - if (c < 0x80) |
67 | | - { |
68 | | - result = new string(c, 1); |
69 | | - } |
70 | | - else |
71 | | - { |
72 | | - var high = c >> 8; |
73 | | - var low = c & 0xff; |
74 | | - result = characters.TryGetValue(high, out var transliterations) ? transliterations[low] : ""; |
75 | | - } |
| 43 | + if (c < 0x80) |
| 44 | + { |
| 45 | + stackBuffer[buffIdx++] = c; |
| 46 | + continue; |
| 47 | + } |
| 48 | + var high = c >> 8; |
| 49 | + if (high < characters.Length) |
| 50 | + continue; |
| 51 | + var bytes = characters[high]; |
| 52 | + if (bytes == null) |
| 53 | + continue; |
| 54 | + var str = bytes[c & 0xff]; |
| 55 | + foreach (char ch in str) |
| 56 | + stackBuffer[buffIdx++] = ch; |
| 57 | + } |
| 58 | + |
| 59 | + return new string(stackBuffer[0..buffIdx]); |
| 60 | + } |
76 | 61 |
|
77 | | - return result; |
| 62 | + |
| 63 | + // Unidecode result often can be at least two times longer than input string. |
| 64 | + var sb = new StringBuilder(tempStringBuilderCapacity ?? input.Length * 2); |
| 65 | + foreach (var c in input) |
| 66 | + { |
| 67 | + // Copypaste is bad, but sb.Append(c.Unidecode()); would be a bit slower. |
| 68 | + if (c < 0x80) |
| 69 | + { |
| 70 | + sb.Append(c); |
78 | 71 | } |
| 72 | + else |
| 73 | + { |
| 74 | + var high = c >> 8; |
| 75 | + if (high >= characters.Length) |
| 76 | + continue; |
| 77 | + var low = c & 0xff; |
| 78 | + var bytes = characters[high]; |
| 79 | + if (bytes!=null) |
| 80 | + { |
| 81 | + sb.Append(bytes[low]); |
| 82 | + } |
| 83 | + } |
| 84 | + } |
| 85 | + |
| 86 | + return sb.ToString(); |
79 | 87 | } |
| 88 | + |
| 89 | + /// <summary> |
| 90 | + /// Transliterate Unicode character to ASCII string. |
| 91 | + /// </summary> |
| 92 | + /// <param name="c">Character you want to transliterate into ASCII</param> |
| 93 | + /// <returns> |
| 94 | + /// ASCII string. Unknown(?) unicode characters will return [?] (3 characters). |
| 95 | + /// It is this way in Python code as well. |
| 96 | + /// </returns> |
| 97 | + public static string Unidecode(this char c) |
| 98 | + { |
| 99 | + if (c < 0x80) |
| 100 | + return AsciiCharacter.AsString[c]; |
| 101 | + |
| 102 | + var high = c >> 8; |
| 103 | + if (high >= characters.Length) |
| 104 | + return null; |
| 105 | + var bytes = characters[high]; |
| 106 | + if (bytes == null) |
| 107 | + return null; |
| 108 | + |
| 109 | + return bytes[c & 0xff]; |
| 110 | + } |
| 111 | + |
| 112 | + private static class AsciiCharacter |
| 113 | + { |
| 114 | + public static readonly string[] AsString; |
| 115 | + static AsciiCharacter() |
| 116 | + { |
| 117 | + AsString = new string[0x80]; |
| 118 | + for (char ch = '\0'; ch < AsString.Length; ch++) |
| 119 | + AsString[ch] = new string(ch,1); |
| 120 | + } |
| 121 | + |
| 122 | + } |
| 123 | + |
| 124 | + } |
80 | 125 | } |
0 commit comments