Add option to use chardet for encoding detection

lucasdemarchi · lucasdemarchi · commit e47b050d4df5 · 2011-07-02T16:43:11.000-03:00
The tradeoff is that it's much, much, much slower: in my tests, about 10
times slower than without chardet. But it always uses the right encoding.

Maybe the right thing to do is to use chardet only as a fallback, since
most source code is in ASCII/UTF-8/ISO-8859-1. This will be left undecided
until 1.2 comes out.
diff --git a/TODO b/TODO
@@ -6,8 +6,6 @@
   .eps files -- they are not binary files but codespell shouldn't check them
   whatsoever.
 
-- Use chardet for encoding detection
-
 - Add option to ignore big files. The biggest issue is if you try to run
   codespell with a file in the source code tree like cscope.out. I  don't know
   if the best approach is to filter by name or by size
diff --git a/codespell.py b/codespell.py
@@ -31,6 +31,7 @@
 misspellings = {}
 exclude_lines = set()
 options = None
+fileopener = None
 quiet_level = 0
 encodings = [ 'utf-8', 'iso-8859-1' ]
 
@@ -85,6 +86,82 @@ def __str__(self):
 
         return "\n".join(["{0}{1:{width}}".format(key, self.summary.get(key), width=15 - len(key)) for key in keys])
 
+class FileOpener:
+    def __init__(self, use_chardet):
+        self.use_chardet = use_chardet
+        self.init_chardet()
+
+    def init_chardet(self):
+        from chardet.universaldetector import UniversalDetector
+
+        self.encdetector = UniversalDetector()
+
+    def open(self, filename):
+        if self.use_chardet:
+            return self.open_with_chardet(filename)
+        else:
+            return self.open_with_internal(filename)
+
+    def open_with_chardet(self, filename):
+        self.encdetector.reset()
+        with open(filename, 'rb') as f:
+            for line in f:
+                self.encdetector.feed(line)
+                if self.encdetector.done:
+                    break
+        self.encdetector.close()
+        encoding = self.encdetector.result['encoding']
+
+        try:
+            f = open(filename, encoding=encoding)
+            lines = f.readlines()
+        except UnicodeDecodeError:
+            print('ERROR: Could not detect encoding: %s' % filename,
+                                                        file=sys.stderr)
+            raise
+        except LookupError:
+            print('ERROR: %s -- Don\'t know how to handle encoding %s'
+                                % (filename, encoding), file=sys.stderr)
+            raise
+        finally:
+            f.close()
+
+        return lines, encoding
+
+
+    def open_with_internal(self, filename):
+        curr = 0
+        global encodings
+
+        while True:
+            try:
+                f = open(filename, 'r', encoding=encodings[curr])
+                lines = f.readlines()
+                break
+            except UnicodeDecodeError:
+                if not quiet_level & QuietLevels.ENCODING:
+                    print('WARNING: Decoding file %s' % filename,
+                                                        file=sys.stderr)
+                    print('WARNING: using encoding=%s failed. '
+                                                        % encodings[curr],
+                                                        file=sys.stderr)
+                    print('WARNING: Trying next encoding: %s' % encodings[curr],
+                                                        file=sys.stderr)
+
+                curr += 1
+
+            finally:
+                f.close()
+
+        if not lines:
+            print('ERROR: Could not detect encoding: %s' % filename,
+                                                        file=sys.stderr)
+            raise Exception('Unknown encoding')
+
+        encoding = encodings[curr]
+
+        return lines, encoding
+
 # -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
 
 def parse_options(args):
@@ -128,6 +205,13 @@ def parse_options(args):
                                 '8: don\'t print anything for non-automatic '\
                                 'fixes. 16: don\'t print fixed files.')
 
+    parser.add_option('-e', '--hard-encoding-detection',
+                        action='store_true', default = False,
+                        help = 'Use chardet to detect the encoding of each'\
+                        'file. This can slow down codespell, but is more'\
+                        'reliable to detect encodings other than utf-8,'\
+                        'iso8859-1 and ascii.')
+
 
     (o, args) = parser.parse_args()
     if (len(args) < 1):
@@ -231,38 +315,6 @@ def ask_for_word_fix(line, wrongword, misspelling, interactivity):
 
     return misspelling.fix, misspelling.data
 
-def get_encoding(filename):
-    curr = 0
-    while True:
-        try:
-            f = open(filename, 'r', encoding=encodings[curr])
-            lines = f.readlines()
-            break
-        except UnicodeDecodeError:
-
-            if not quiet_level & QuietLevels.ENCODING:
-                print('WARNING: Decoding file %s' % filename,
-                                                        file=sys.stderr)
-                print('WARNING: using encoding=%s failed. '
-                                                        % encodings[curr],
-                                                        file=sys.stderr)
-                print('WARNING: Trying next encoding: %s' % encodings[curr],
-                                                        file=sys.stderr)
-
-            curr += 1
-
-        finally:
-            f.close()
-
-    if not lines:
-        print('ERROR: Could not detect encoding: %s' % filename,
-                                                        file=sys.stderr)
-        raise Exception('Unknown encoding')
-
-    encoding = encodings[curr]
-    return lines, encoding
-
-
 def parse_file(filename, colors, summary):
     lines = None
     changed = False
@@ -283,7 +335,7 @@ def parse_file(filename, colors, summary):
                 print("WARNING: Binary file: %s " % filename, file=sys.stderr)
             return
         try:
-            lines, encoding = get_encoding(filename)
+            lines, encoding = fileopener.open(filename)
         except:
             return
 
@@ -376,10 +428,10 @@ def parse_file(filename, colors, summary):
             f.writelines(lines)
             f.close()
 
-
 def main(*args):
     global options
     global quiet_level
+    global fileopener
 
     (options, args) = parse_options(args)
 
@@ -399,6 +451,8 @@ def main(*args):
     if options.quiet_level:
         quiet_level = options.quiet_level
 
+    fileopener = FileOpener(options.hard_encoding_detection)
+
     for filename in args[1:]:
         # ignore hidden files
         if ishidden(filename):