Skip to content

Commit 7126f22

Browse files
committed
downloaded homoglyph txt file
1 parent dc02162 commit 7126f22

2 files changed

Lines changed: 183 additions & 10 deletions

File tree

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# intentional.txt
2+
# Date: 2024-05-03, 03:26:39 GMT
3+
# © 2024 Unicode®, Inc.
4+
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
5+
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
6+
#
7+
# Unicode Security Mechanisms for UTS #39
8+
# Version: 16.0.0
9+
#
10+
# For documentation and usage, see https://www.unicode.org/reports/tr39
11+
#
12+
0021 ; 01C3 #* ( ! ~ ǃ ) EXCLAMATION MARK ~ LATIN LETTER RETROFLEX CLICK
13+
14+
0041 ; 0391 # ( A ~ Α ) LATIN CAPITAL LETTER A ~ GREEK CAPITAL LETTER ALPHA
15+
16+
0042 ; 0392 # ( B ~ Β ) LATIN CAPITAL LETTER B ~ GREEK CAPITAL LETTER BETA
17+
18+
0043 ; 0421 # ( C ~ С ) LATIN CAPITAL LETTER C ~ CYRILLIC CAPITAL LETTER ES
19+
20+
0045 ; 0395 # ( E ~ Ε ) LATIN CAPITAL LETTER E ~ GREEK CAPITAL LETTER EPSILON
21+
22+
0048 ; 0397 # ( H ~ Η ) LATIN CAPITAL LETTER H ~ GREEK CAPITAL LETTER ETA
23+
24+
0049 ; 0399 # ( I ~ Ι ) LATIN CAPITAL LETTER I ~ GREEK CAPITAL LETTER IOTA
25+
26+
004A ; 0408 # ( J ~ Ј ) LATIN CAPITAL LETTER J ~ CYRILLIC CAPITAL LETTER JE
27+
28+
004B ; 039A # ( K ~ Κ ) LATIN CAPITAL LETTER K ~ GREEK CAPITAL LETTER KAPPA
29+
30+
004D ; 039C # ( M ~ Μ ) LATIN CAPITAL LETTER M ~ GREEK CAPITAL LETTER MU
31+
32+
004E ; 039D # ( N ~ Ν ) LATIN CAPITAL LETTER N ~ GREEK CAPITAL LETTER NU
33+
34+
004F ; 039F # ( O ~ Ο ) LATIN CAPITAL LETTER O ~ GREEK CAPITAL LETTER OMICRON
35+
36+
0050 ; 03A1 # ( P ~ Ρ ) LATIN CAPITAL LETTER P ~ GREEK CAPITAL LETTER RHO
37+
38+
0053 ; 0405 # ( S ~ Ѕ ) LATIN CAPITAL LETTER S ~ CYRILLIC CAPITAL LETTER DZE
39+
40+
0054 ; 03A4 # ( T ~ Τ ) LATIN CAPITAL LETTER T ~ GREEK CAPITAL LETTER TAU
41+
42+
0058 ; 03A7 # ( X ~ Χ ) LATIN CAPITAL LETTER X ~ GREEK CAPITAL LETTER CHI
43+
44+
0059 ; 03A5 # ( Y ~ Υ ) LATIN CAPITAL LETTER Y ~ GREEK CAPITAL LETTER UPSILON
45+
46+
005A ; 0396 # ( Z ~ Ζ ) LATIN CAPITAL LETTER Z ~ GREEK CAPITAL LETTER ZETA
47+
48+
0061 ; 0430 # ( a ~ а ) LATIN SMALL LETTER A ~ CYRILLIC SMALL LETTER A
49+
50+
0063 ; 0441 # ( c ~ с ) LATIN SMALL LETTER C ~ CYRILLIC SMALL LETTER ES
51+
52+
0064 ; 0501 # ( d ~ ԁ ) LATIN SMALL LETTER D ~ CYRILLIC SMALL LETTER KOMI DE
53+
54+
0065 ; 0435 # ( e ~ е ) LATIN SMALL LETTER E ~ CYRILLIC SMALL LETTER IE
55+
56+
0068 ; 04BB # ( h ~ һ ) LATIN SMALL LETTER H ~ CYRILLIC SMALL LETTER SHHA
57+
58+
0069 ; 0456 # ( i ~ і ) LATIN SMALL LETTER I ~ CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
59+
60+
006A ; 03F3 # ( j ~ ϳ ) LATIN SMALL LETTER J ~ GREEK LETTER YOT
61+
62+
006F ; 03BF # ( o ~ ο ) LATIN SMALL LETTER O ~ GREEK SMALL LETTER OMICRON
63+
64+
0070 ; 0440 # ( p ~ р ) LATIN SMALL LETTER P ~ CYRILLIC SMALL LETTER ER
65+
66+
0073 ; 0455 # ( s ~ ѕ ) LATIN SMALL LETTER S ~ CYRILLIC SMALL LETTER DZE
67+
68+
0078 ; 0445 # ( x ~ х ) LATIN SMALL LETTER X ~ CYRILLIC SMALL LETTER HA
69+
70+
0079 ; 0443 # ( y ~ у ) LATIN SMALL LETTER Y ~ CYRILLIC SMALL LETTER U
71+
72+
00C6 ; 04D4 # ( Æ ~ Ӕ ) LATIN CAPITAL LETTER AE ~ CYRILLIC CAPITAL LIGATURE A IE
73+
74+
00D0 ; 0110 # ( Ð ~ Đ ) LATIN CAPITAL LETTER ETH ~ LATIN CAPITAL LETTER D WITH STROKE
75+
76+
00E6 ; 04D5 # ( æ ~ ӕ ) LATIN SMALL LETTER AE ~ CYRILLIC SMALL LIGATURE A IE
77+
78+
0138 ; 043A # ( ĸ ~ к ) LATIN SMALL LETTER KRA ~ CYRILLIC SMALL LETTER KA
79+
80+
0182 ; 0411 # ( Ƃ ~ Б ) LATIN CAPITAL LETTER B WITH TOPBAR ~ CYRILLIC CAPITAL LETTER BE
81+
82+
018F ; 04D8 # ( Ə ~ Ә ) LATIN CAPITAL LETTER SCHWA ~ CYRILLIC CAPITAL LETTER SCHWA
83+
84+
019F ; 04E8 # ( Ɵ ~ Ө ) LATIN CAPITAL LETTER O WITH MIDDLE TILDE ~ CYRILLIC CAPITAL LETTER BARRED O
85+
86+
01A9 ; 03A3 # ( Ʃ ~ Σ ) LATIN CAPITAL LETTER ESH ~ GREEK CAPITAL LETTER SIGMA
87+
88+
01DD ; 0259 # ( ǝ ~ ə ) LATIN SMALL LETTER TURNED E ~ LATIN SMALL LETTER SCHWA
89+
90+
0245 ; 039B # ( Ʌ ~ Λ ) LATIN CAPITAL LETTER TURNED V ~ GREEK CAPITAL LETTER LAMDA
91+
92+
0259 ; 04D9 # ( ə ~ ә ) LATIN SMALL LETTER SCHWA ~ CYRILLIC SMALL LETTER SCHWA
93+
94+
025B ; 03B5 # ( ɛ ~ ε ) LATIN SMALL LETTER OPEN E ~ GREEK SMALL LETTER EPSILON
95+
96+
0269 ; 03B9 # ( ɩ ~ ι ) LATIN SMALL LETTER IOTA ~ GREEK SMALL LETTER IOTA
97+
98+
026A ; 04CF # ( ɪ ~ ӏ ) LATIN LETTER SMALL CAPITAL I ~ CYRILLIC SMALL LETTER PALOCHKA
99+
100+
0275 ; 04E9 # ( ɵ ~ ө ) LATIN SMALL LETTER BARRED O ~ CYRILLIC SMALL LETTER BARRED O
101+
102+
0292 ; 04E1 # ( ʒ ~ ӡ ) LATIN SMALL LETTER EZH ~ CYRILLIC SMALL LETTER ABKHASIAN DZE
103+
104+
0299 ; 0432 # ( ʙ ~ в ) LATIN LETTER SMALL CAPITAL B ~ CYRILLIC SMALL LETTER VE
105+
106+
029C ; 043D # ( ʜ ~ н ) LATIN LETTER SMALL CAPITAL H ~ CYRILLIC SMALL LETTER EN
107+
108+
0393 ; 0413 # ( Γ ~ Г ) GREEK CAPITAL LETTER GAMMA ~ CYRILLIC CAPITAL LETTER GHE
109+
110+
03A0 ; 041F # ( Π ~ П ) GREEK CAPITAL LETTER PI ~ CYRILLIC CAPITAL LETTER PE
111+
112+
03B1 ; 237A # ( α ~ ⍺ ) GREEK SMALL LETTER ALPHA ~ APL FUNCTIONAL SYMBOL ALPHA
113+
114+
03B9 ; 2373 # ( ι ~ ⍳ ) GREEK SMALL LETTER IOTA ~ APL FUNCTIONAL SYMBOL IOTA
115+
116+
03C1 ; 2374 # ( ρ ~ ⍴ ) GREEK SMALL LETTER RHO ~ APL FUNCTIONAL SYMBOL RHO
117+
118+
03C9 ; 2375 # ( ω ~ ⍵ ) GREEK SMALL LETTER OMEGA ~ APL FUNCTIONAL SYMBOL OMEGA
119+
120+
0433 ; 1D26 # ( г ~ ᴦ ) CYRILLIC SMALL LETTER GHE ~ GREEK LETTER SMALL CAPITAL GAMMA
121+
122+
043B ; 1D2B # ( л ~ ᴫ ) CYRILLIC SMALL LETTER EL ~ CYRILLIC LETTER SMALL CAPITAL EL
123+
124+
043F ; 1D28 # ( п ~ ᴨ ) CYRILLIC SMALL LETTER PE ~ GREEK LETTER SMALL CAPITAL PI
125+
126+
101D ; 1040 # ( ဝ ~ ၀ ) MYANMAR LETTER WA ~ MYANMAR DIGIT ZERO
127+
128+
17A2 ; 17A3 # ( អ ~ ឣ ) KHMER LETTER QA ~ KHMER INDEPENDENT VOWEL QAQ
129+
130+
1835 ; 1855 # ( ᠵ ~ ᡕ ) MONGOLIAN LETTER JA ~ MONGOLIAN LETTER TODO YA
131+
132+
199E ; 19D0 # ( ᦞ ~ ᧐ ) NEW TAI LUE LETTER LOW VA ~ NEW TAI LUE DIGIT ZERO
133+
134+
19B1 ; 19D1 # ( ᦱ ~ ᧑ ) NEW TAI LUE VOWEL SIGN AA ~ NEW TAI LUE DIGIT ONE
135+
136+
1A45 ; 1A80 # ( ᩅ ~ ᪀ ) TAI THAM LETTER WA ~ TAI THAM HORA DIGIT ZERO
137+
1A45 ; 1A90 # ( ᩅ ~ ᪐ ) TAI THAM LETTER WA ~ TAI THAM THAM DIGIT ZERO
138+
139+
1B0D ; 1B52 # ( ᬍ ~ ᭒ ) BALINESE LETTER LA LENGA ~ BALINESE DIGIT TWO
140+
141+
1B11 ; 1B53 # ( ᬑ ~ ᭓ ) BALINESE LETTER OKARA ~ BALINESE DIGIT THREE
142+
143+
1B28 ; 1B58 # ( ᬨ ~ ᭘ ) BALINESE LETTER PA KAPAL ~ BALINESE DIGIT EIGHT
144+
145+
1B50 ; 1B5C # ( ᭐ ~ ᭜ ) BALINESE DIGIT ZERO ~ BALINESE WINDU
146+
147+
1D0D ; 043C # ( ᴍ ~ м ) LATIN LETTER SMALL CAPITAL M ~ CYRILLIC SMALL LETTER EM
148+
149+
1D18 ; 1D29 # ( ᴘ ~ ᴩ ) LATIN LETTER SMALL CAPITAL P ~ GREEK LETTER SMALL CAPITAL RHO
150+
151+
1D1B ; 0442 # ( ᴛ ~ т ) LATIN LETTER SMALL CAPITAL T ~ CYRILLIC SMALL LETTER TE
152+
153+
2C67 ; 04A2 # ( Ⱨ ~ Ң ) LATIN CAPITAL LETTER H WITH DESCENDER ~ CYRILLIC CAPITAL LETTER EN WITH DESCENDER
154+
155+
2C69 ; 049A # ( Ⱪ ~ Қ ) LATIN CAPITAL LETTER K WITH DESCENDER ~ CYRILLIC CAPITAL LETTER KA WITH DESCENDER
156+
157+
A9D0 ; A9C6 # ( ꧐ ~ ꧆ ) JAVANESE DIGIT ZERO ~ JAVANESE PADA WINDU
158+
159+
10382 ; 103D1 # ( 𐎂 ~ 𐏑 ) UGARITIC LETTER GAMLA ~ OLD PERSIAN NUMBER ONE
160+
161+
10393 ; 103D3 # ( 𐎓 ~ 𐏓 ) UGARITIC LETTER AIN ~ OLD PERSIAN NUMBER TEN
162+
163+
1039A ; 12038 # ( 𐎚 ~ 𒀸 ) UGARITIC LETTER TO ~ CUNEIFORM SIGN ASH
164+
165+
10486 ; 104A0 # ( 𐒆 ~ 𐒠 ) OSMANYA LETTER DEEL ~ OSMANYA DIGIT ZERO
166+

textattack/transformations/word_swaps/word_swap_homoglyph_swap.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from .word_swap_differential_evolution import WordSwapDifferentialEvolution
77
from typing import List, Tuple
88
from textattack.shared import AttackedText
9-
import requests
9+
import os
1010

1111

1212
class WordSwapHomoglyphSwap(WordSwapDifferentialEvolution):
@@ -65,16 +65,23 @@ def __init__(self, random_one=False, **kwargs):
6565

6666
# Retrieve Unicode Intentional homoglyph characters
6767
self.homos_intentional = dict()
68-
int_resp = requests.get("https://www.unicode.org/Public/security/latest/intentional.txt", stream=True)
69-
for line in int_resp.iter_lines():
70-
if len(line):
71-
line = line.decode('utf-8-sig')
72-
if line[0] != '#':
68+
# int_resp = requests.get("https://www.unicode.org/Public/security/latest/intentional.txt", stream=True)
69+
path = os.path.dirname(os.path.abspath(__file__))
70+
path_list = path.split(os.sep)
71+
path_list = path_list[:-2]
72+
path_list.append("shared/intentional_homoglyphs.txt")
73+
homoglyphs_path = os.sep.join(path_list)
74+
with open(homoglyphs_path, "r", encoding="utf-8-sig") as f:
75+
for line in f:
76+
if line.strip() and not line.startswith("#"):
7377
line = line.replace("#*", "#")
74-
_, line = line.split("#", maxsplit=1)
75-
if line[3] not in self.homos_intentional:
76-
self.homos_intentional[line[3]] = []
77-
self.homos_intentional[line[3]].append(line[7])
78+
try:
79+
_, data = line.split("#", maxsplit=1)
80+
key = data[3]
81+
value = data[7]
82+
self.homos_intentional.setdefault(key, []).append(value)
83+
except IndexError:
84+
continue # skip malformed lines
7885

7986
def _get_precomputed(self, current_text: AttackedText) -> List[List[Tuple[int, str]]]:
8087
return [self._get_glyph_map(current_text)]

0 commit comments

Comments
 (0)