# Module-wide state shared by the functions below.

# Lookup table of known misspellings; starts empty.
misspellings = dict()
# Lines that must be excluded from checking; starts empty.
exclude_lines = set()
# Parsed command-line options; assigned in main().
options = None
# FileOpener instance used to read files; assigned in main().
fileopener = None
# Bitmask compared against the QuietLevels flags; 0 silences nothing.
quiet_level = 0
# Encodings tried, in this order, when decoding a file without chardet.
encodings = ['utf-8', 'iso-8859-1']
3637
@@ -85,6 +86,82 @@ def __str__(self):
8586
8687 return "\n " .join (["{0}{1:{width}}" .format (key , self .summary .get (key ), width = 15 - len (key )) for key in keys ])
8788
class FileOpener:
    """Read a text file and report the encoding that decoded it.

    The strategy is chosen once, at construction time: either chardet
    detects the encoding, or every entry of the module-level
    ``encodings`` list is tried in order.
    """

    def __init__(self, use_chardet):
        """Remember the strategy and set up chardet only when requested.

        use_chardet -- truthy to detect encodings with chardet, falsy to
                       try the module-level ``encodings`` list instead.
        """
        self.use_chardet = use_chardet
        # Defined on both strategies so the attribute always exists.
        self.encdetector = None
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        """Create the chardet detector that is reused for every file."""
        # Imported here rather than at module level so that chardet is
        # only required when this strategy is actually selected.
        from chardet.universaldetector import UniversalDetector

        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return ``(lines, encoding)`` for *filename*.

        Dispatches to the strategy selected in the constructor and
        propagates whatever that strategy raises.
        """
        if self.use_chardet:
            return self.open_with_chardet(filename)
        return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then decode the file with it.

        Returns ``(lines, encoding)``. Re-raises UnicodeDecodeError when
        the detected encoding cannot decode the file, and LookupError
        when Python has no codec for it; both are reported on stderr
        first.
        """
        if self.encdetector is None:
            # Direct call on an instance built without chardet.
            self.init_chardet()

        self.encdetector.reset()
        with open(filename, 'rb') as raw:
            for chunk in raw:
                self.encdetector.feed(chunk)
                if self.encdetector.done:
                    # The detector is confident; no need to read further.
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']

        try:
            # The context manager closes the handle on every path and never
            # touches a name that open() failed to bind.
            with open(filename, encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print("ERROR: %s -- Don't know how to handle encoding %s"
                  % (filename, encoding), file=sys.stderr)
            raise

        return lines, encoding

    def open_with_internal(self, filename):
        """Decode the file with the first entry of ``encodings`` that works.

        Returns ``(lines, encoding)``. Raises ``Exception('Unknown
        encoding')`` when no listed encoding decodes the file. A file
        that yields no lines is reported the same way, as it always was.
        """
        lines = None
        last = len(encodings) - 1

        for index, encoding in enumerate(encodings):
            try:
                with open(filename, 'r', encoding=encoding) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encoding,
                          file=sys.stderr)
                    if index < last:
                        # Name the encoding that is tried next, not the
                        # one that has just failed.
                        print('WARNING: Trying next encoding: %s'
                              % encodings[index + 1],
                              file=sys.stderr)
                continue
            # Decoded without error: ``encoding`` is the one that worked.
            break

        if not lines:
            # Reached when every encoding failed (previously an IndexError)
            # or when the file produced no lines at all.
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise Exception('Unknown encoding')

        return lines, encoding
164+
88165# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
89166
90167def parse_options (args ):
@@ -128,6 +205,13 @@ def parse_options(args):
128205 '8: don\' t print anything for non-automatic ' \
129206 'fixes. 16: don\' t print fixed files.' )
130207
# Opt-in chardet detection. Each help fragment ends with a space because
# adjacent string literals are joined with nothing in between.
parser.add_option('-e', '--hard-encoding-detection',
                  action='store_true', default=False,
                  help='Use chardet to detect the encoding of each '
                       'file. This can slow down codespell, but is more '
                       'reliable to detect encodings other than utf-8, '
                       'iso8859-1 and ascii.')
214+
131215
132216 (o , args ) = parser .parse_args ()
133217 if (len (args ) < 1 ):
@@ -231,38 +315,6 @@ def ask_for_word_fix(line, wrongword, misspelling, interactivity):
231315
232316 return misspelling .fix , misspelling .data
233317
def get_encoding(filename):
    """Decode *filename* with the first entry of ``encodings`` that works.

    Returns ``(lines, encoding)``. Raises ``Exception('Unknown encoding')``
    when no listed encoding decodes the file. A file that yields no lines
    is reported the same way, as it always was.
    """
    lines = None
    last = len(encodings) - 1

    for index, encoding in enumerate(encodings):
        try:
            # The context manager closes the handle on every path and never
            # touches a name that open() failed to bind.
            with open(filename, 'r', encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            if not quiet_level & QuietLevels.ENCODING:
                print('WARNING: Decoding file %s' % filename,
                      file=sys.stderr)
                print('WARNING: using encoding=%s failed. '
                      % encoding,
                      file=sys.stderr)
                if index < last:
                    # Name the encoding that is tried next, not the one
                    # that has just failed.
                    print('WARNING: Trying next encoding: %s'
                          % encodings[index + 1],
                          file=sys.stderr)
            continue
        # Decoded without error: ``encoding`` is the one that worked.
        break

    if not lines:
        # Reached when every encoding failed (previously an IndexError)
        # or when the file produced no lines at all.
        print('ERROR: Could not detect encoding: %s' % filename,
              file=sys.stderr)
        raise Exception('Unknown encoding')

    return lines, encoding
264-
265-
266318def parse_file (filename , colors , summary ):
267319 lines = None
268320 changed = False
@@ -283,7 +335,7 @@ def parse_file(filename, colors, summary):
283335 print ("WARNING: Binary file: %s " % filename , file = sys .stderr )
284336 return
285337 try :
286- lines , encoding = get_encoding (filename )
338+ lines , encoding = fileopener . open (filename )
287339 except :
288340 return
289341
@@ -376,10 +428,10 @@ def parse_file(filename, colors, summary):
376428 f .writelines (lines )
377429 f .close ()
378430
379-
380431def main (* args ):
381432 global options
382433 global quiet_level
434+ global fileopener
383435
384436 (options , args ) = parse_options (args )
385437
@@ -399,6 +451,8 @@ def main(*args):
399451 if options .quiet_level :
400452 quiet_level = options .quiet_level
401453
454+ fileopener = FileOpener (options .hard_encoding_detection )
455+
402456 for filename in args [1 :]:
403457 # ignore hidden files
404458 if ishidden (filename ):
0 commit comments