Do not hash twice

lucasdemarchi · lucasdemarchi · commit 8ca6ac1d64ac · 2011-05-25T11:24:38.000-03:00
Sets already use the __hash__() method of each object to decide whether the
object is in the set. When we also use sha1 we are therefore hashing twice.

The impact is on performance. The timings below show the performance before
and after this patch when parsing the entire Linux kernel tree with a big
exclude list.

Before:
	real	2m20.959s
	user	2m16.888s
	sys	0m1.386s

After:
	real	1m35.169s
	user	1m28.719s
	sys	0m1.354s
diff --git a/codespell.py b/codespell.py
@@ -22,15 +22,14 @@
 import re
 from optparse import OptionParser
 import os
-import hashlib
 
 USAGE = """
 \t%prog [OPTIONS] dict_filename [file1 file2 ... fileN]
 """
 VERSION = '1.0'
 
 misspellings = {}
-xlines = set()
+exclude_lines = set()
 options = None
 encodings = [ 'utf-8', 'iso-8859-1' ]
 
@@ -121,7 +120,7 @@ def parse_options(args):
 def build_exclude_hashes(filename):
     with open(filename, 'r') as f:
         for line in f:
-            xlines.add(hashlib.sha1(line.encode()).digest())
+            exclude_lines.add(line)
 
 def build_dict(filename):
     with open(filename, 'r') as f:
@@ -252,7 +251,7 @@ def parse_file(filename, colors, summary):
     i = 1
     rx = re.compile(r"[\w']+")
     for line in lines:
-        if hashlib.sha1(line.encode()).digest() in xlines:
+        if line in exclude_lines:
             i += 1
             continue