Skip to content

Commit e47b050

Browse files
committed
Add option to use chardet for encoding detection
The tradeoff is that it's much, much, much slower: in my tests, about 10 times slower than without chardet. But it always uses the right encoding. Maybe the right thing to do is to use chardet only as a fallback, since most source code is in ascii/utf-8/iso8859-1. This will be left undecided until 1.2 comes out.
1 parent eddcd98 commit e47b050

2 files changed

Lines changed: 88 additions & 36 deletions

File tree

TODO

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
.eps files -- they are not binary files but codespell shouldn't check them
77
whatsoever.
88

9-
- Use chardet for encoding detection
10-
119
- Add option to ignore big files. The biggest issue is if you try to run
1210
codespell with a file in the source code tree like cscope.out. I don't know
1311
if the best approach is to filter by name or by size

codespell.py

Lines changed: 88 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
misspellings = {}
3232
exclude_lines = set()
3333
options = None
34+
fileopener = None
3435
quiet_level = 0
3536
encodings = [ 'utf-8', 'iso-8859-1' ]
3637

@@ -85,6 +86,82 @@ def __str__(self):
8586

8687
return "\n".join(["{0}{1:{width}}".format(key, self.summary.get(key), width=15 - len(key)) for key in keys])
8788

89+
class FileOpener:
    """Read a text file and report which encoding was used to decode it.

    Two strategies are available: detection through the optional ``chardet``
    package, or trying each entry of the module-level ``encodings`` list in
    order until one decodes the whole file.
    """

    def __init__(self, use_chardet):
        """Store the strategy flag; set up chardet only when it is requested.

        use_chardet -- truthy to detect encodings with chardet, falsy to try
                       the module-level ``encodings`` list instead.
        """
        self.use_chardet = use_chardet
        self.encdetector = None
        # chardet is an optional dependency: importing it unconditionally
        # would break the default (non-chardet) mode when it is not installed.
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        """Import chardet and create the detector reused for every file.

        Raises ImportError when the chardet package is not available.
        """
        from chardet.universaldetector import UniversalDetector

        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return ``(lines, encoding)`` for *filename* using the chosen strategy."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding of *filename* with chardet, then read it.

        Returns ``(lines, encoding)``. Prints an error to stderr and re-raises
        UnicodeDecodeError when the detected encoding cannot decode the file,
        or LookupError when Python has no codec for the detected encoding.
        """
        if self.encdetector is None:
            # Covers instances whose use_chardet flag was switched on after
            # construction, or that call this method directly.
            self.init_chardet()

        self.encdetector.reset()
        with open(filename, 'rb') as raw:
            for chunk in raw:
                self.encdetector.feed(chunk)
                # Stop reading as soon as the detector is confident.
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']

        try:
            with open(filename, encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s'
                  % (filename, encoding), file=sys.stderr)
            raise

        return lines, encoding

    def open_with_internal(self, filename):
        """Read *filename* with the first entry of ``encodings`` that works.

        Returns ``(lines, encoding)``; an empty file yields an empty list
        together with the first encoding. Unless QuietLevels.ENCODING is set
        in ``quiet_level``, a warning is printed to stderr for every encoding
        that fails. Raises Exception('Unknown encoding') once every candidate
        has failed.
        """
        for index, encoding in enumerate(encodings):
            try:
                # ``with`` closes the file on every path and never touches an
                # unbound name when open() itself fails.
                with open(filename, 'r', encoding=encoding) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encoding,
                          file=sys.stderr)
                    # Name the encoding that will really be tried next, and
                    # only when there is one left.
                    if index + 1 < len(encodings):
                        print('WARNING: Trying next encoding: %s'
                              % encodings[index + 1],
                              file=sys.stderr)
            else:
                return lines, encoding

        print('ERROR: Could not detect encoding: %s' % filename,
              file=sys.stderr)
        raise Exception('Unknown encoding')
164+
88165
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
89166

90167
def parse_options(args):
@@ -128,6 +205,13 @@ def parse_options(args):
128205
'8: don\'t print anything for non-automatic '\
129206
'fixes. 16: don\'t print fixed files.')
130207

208+
parser.add_option('-e', '--hard-encoding-detection',
209+
action='store_true', default = False,
210+
help = 'Use chardet to detect the encoding of each'\
211+
'file. This can slow down codespell, but is more'\
212+
'reliable to detect encodings other than utf-8,'\
213+
'iso8859-1 and ascii.')
214+
131215

132216
(o, args) = parser.parse_args()
133217
if (len(args) < 1):
@@ -231,38 +315,6 @@ def ask_for_word_fix(line, wrongword, misspelling, interactivity):
231315

232316
return misspelling.fix, misspelling.data
233317

234-
def get_encoding(filename):
    """Read *filename* with the first entry of ``encodings`` that decodes it.

    Returns ``(lines, encoding)``; an empty file yields an empty list together
    with the first encoding. Unless QuietLevels.ENCODING is set in
    ``quiet_level``, a warning is printed to stderr for every encoding that
    fails. Raises Exception('Unknown encoding') once every candidate has
    failed.
    """
    for index, encoding in enumerate(encodings):
        try:
            # ``with`` closes the file on every path and never touches an
            # unbound name when open() itself fails.
            with open(filename, 'r', encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            if not quiet_level & QuietLevels.ENCODING:
                print('WARNING: Decoding file %s' % filename,
                      file=sys.stderr)
                print('WARNING: using encoding=%s failed. '
                      % encoding,
                      file=sys.stderr)
                # Name the encoding that will really be tried next, and only
                # when there is one left.
                if index + 1 < len(encodings):
                    print('WARNING: Trying next encoding: %s'
                          % encodings[index + 1],
                          file=sys.stderr)
        else:
            return lines, encoding

    print('ERROR: Could not detect encoding: %s' % filename,
          file=sys.stderr)
    raise Exception('Unknown encoding')
264-
265-
266318
def parse_file(filename, colors, summary):
267319
lines = None
268320
changed = False
@@ -283,7 +335,7 @@ def parse_file(filename, colors, summary):
283335
print("WARNING: Binary file: %s " % filename, file=sys.stderr)
284336
return
285337
try:
286-
lines, encoding = get_encoding(filename)
338+
lines, encoding = fileopener.open(filename)
287339
except:
288340
return
289341

@@ -376,10 +428,10 @@ def parse_file(filename, colors, summary):
376428
f.writelines(lines)
377429
f.close()
378430

379-
380431
def main(*args):
381432
global options
382433
global quiet_level
434+
global fileopener
383435

384436
(options, args) = parse_options(args)
385437

@@ -399,6 +451,8 @@ def main(*args):
399451
if options.quiet_level:
400452
quiet_level = options.quiet_level
401453

454+
fileopener = FileOpener(options.hard_encoding_detection)
455+
402456
for filename in args[1:]:
403457
# ignore hidden files
404458
if ishidden(filename):

0 commit comments

Comments
 (0)