Skip to content

Commit 8ca6ac1

Browse files
committed
Do not hash twice
Sets already use the __hash__() method of each object to decide whether that object is a member of the set. When we compute the sha1 digest of a line before adding it to (or looking it up in) the set, we are therefore hashing twice, and the impact is on performance. The following timings were measured before and after this patch while parsing the entire Linux kernel tree with a big exclude list. Before: real 2m20.959s user 2m16.888s sys 0m1.386s After: real 1m35.169s user 1m28.719s sys 0m1.354s
1 parent 363d289 commit 8ca6ac1

1 file changed

Lines changed: 3 additions & 4 deletions

File tree

codespell.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,14 @@
2222
import re
2323
from optparse import OptionParser
2424
import os
25-
import hashlib
2625

2726
USAGE = """
2827
\t%prog [OPTIONS] dict_filename [file1 file2 ... fileN]
2928
"""
3029
VERSION = '1.0'
3130

3231
misspellings = {}
33-
xlines = set()
32+
exclude_lines = set()
3433
options = None
3534
encodings = [ 'utf-8', 'iso-8859-1' ]
3635

@@ -121,7 +120,7 @@ def parse_options(args):
121120
def build_exclude_hashes(filename):
122121
with open(filename, 'r') as f:
123122
for line in f:
124-
xlines.add(hashlib.sha1(line.encode()).digest())
123+
exclude_lines.add(line)
125124

126125
def build_dict(filename):
127126
with open(filename, 'r') as f:
@@ -252,7 +251,7 @@ def parse_file(filename, colors, summary):
252251
i = 1
253252
rx = re.compile(r"[\w']+")
254253
for line in lines:
255-
if hashlib.sha1(line.encode()).digest() in xlines:
254+
if line in exclude_lines:
256255
i += 1
257256
continue
258257

0 commit comments

Comments
 (0)