Skip to content

Commit 94c9b4a

Browse files
Joxit authored and missinglink committed
feat(normalizer): Add normalizer for WOF which can be configured to remove accents/hyphen/spaces and do lowercase (#33)
1 parent 7f05b8f commit 94c9b4a

6 files changed

Lines changed: 116 additions & 3 deletions

File tree

classifier/WhosOnFirstClassifier.js

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ const CountryClassification = require('../classification/CountryClassification')
55
const RegionClassification = require('../classification/RegionClassification')
66
const LocalityClassification = require('../classification/LocalityClassification')
77
const whosonfirst = require('../resources/whosonfirst/whosonfirst')
8+
const normalize = require('../tokenization/normalizer')({ lowercase: true, removeHyphen: true, removeAccents: true })
89

910
// databases sourced from the WhosOnFirst project
1011
// see: https://whosonfirst.org
@@ -35,7 +36,8 @@ class WhosOnFirstClassifier extends PhraseClassifier {
3536
Object.keys(placetypes).forEach(placetype => {
3637
this.tokens[placetype] = new Set()
3738
whosonfirst.load(this.tokens[placetype], [placetype], placetypes[placetype].files, {
38-
minlength: 2
39+
minlength: 2,
40+
normalizer: normalize
3941
})
4042

4143
// general blacklist
@@ -102,8 +104,9 @@ class WhosOnFirstClassifier extends PhraseClassifier {
102104
return
103105
}
104106

107+
const normalizedSpan = normalize(span.norm)
105108
Object.keys(placetypes).forEach(placetype => {
106-
if (this.tokens[placetype].has(span.norm)) {
109+
if (this.tokens[placetype].has(normalizedSpan)) {
107110
// do not classify tokens if they already have a 'StopWordClassification'
108111
if (
109112
span.classifications.hasOwnProperty('StopWordClassification') || (

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333
},
3434
"dependencies": {
3535
"cluster": "^0.7.7",
36-
"express": "^4.16.4"
36+
"express": "^4.16.4",
37+
"remove-accents": "^0.4.2"
3738
},
3839
"devDependencies": {
3940
"better-sqlite3": "^5.4.0",

resources/whosonfirst/whosonfirst.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ function _normalize (cell, options) {
4444
if (options && options.lowercase) {
4545
value = value.toLowerCase()
4646
}
47+
if (options && options.normalizer && typeof options.normalizer === 'function') {
48+
value = options.normalizer(value)
49+
}
4750
return value
4851
}
4952

test/address.fra.test.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,22 @@ const testcase = (test, common) => {
6363
assert('Rue Jean Baptiste Clément', [
6464
{ street: 'Rue Jean Baptiste Clément' }
6565
], true)
66+
67+
assert('Mery Sur Oise', [
68+
{ locality: 'Mery Sur Oise' }
69+
], true)
70+
71+
assert('Méry Sur Oise', [
72+
{ locality: 'Méry Sur Oise' }
73+
], true)
74+
75+
assert('Méry-Sur-Oise', [
76+
{ locality: 'Méry-Sur-Oise' }
77+
], true)
78+
79+
assert('Mery-Sur-Oise', [
80+
{ locality: 'Mery-Sur-Oise' }
81+
], true)
6682
}
6783

6884
module.exports.all = (tape, common) => {

tokenization/normalizer.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
const removeAccents = require('remove-accents')
2+
3+
/**
 * Build a string normalizer configured by `options`.
 *
 * The returned function always trims surrounding whitespace and then applies
 * the enabled steps in this fixed order: lowercase, accent removal, hyphen
 * replacement, space removal.
 *
 * @param {Object} [options] - feature switches; every switch is off by default
 * @param {boolean} [options.lowercase] - convert the value to lower case
 * @param {boolean} [options.removeAccents] - pass the value through `removeAccents`
 * @param {boolean} [options.removeHyphen] - turn hyphen-separated parts into space-separated parts
 * @param {boolean} [options.removeSpaces] - delete every space character
 * @returns {function(string): string} normalizer for a single string value
 */
function normalizer (options = {}) {
  // the default parameter only covers `undefined`; tolerate an explicit `null` as well
  const settings = options || {}

  /**
   * @param {string} value - raw text to normalize
   * @returns {string} normalized text
   * @throws {TypeError} when `value` is not a string
   */
  return (value) => {
    if (typeof value !== 'string') {
      throw new TypeError(`normalizer: expected a string value, got ${typeof value}`)
    }

    let normalized = value.trim()

    if (settings.lowercase) {
      normalized = normalized.toLowerCase()
    }
    if (settings.removeAccents) {
      normalized = removeAccents(normalized)
    }
    if (settings.removeHyphen) {
      // Split on hyphens and re-join with a single space. Dropping empty parts
      // and trimming each part means leading/trailing hyphens ('-foo-') and
      // spaced or repeated hyphens ('a - b', 'a--b') cannot reintroduce edge
      // whitespace or runs of spaces after the initial trim.
      normalized = normalized
        .split('-')
        .map(part => part.trim())
        .filter(part => part.length > 0)
        .join(' ')
    }
    if (settings.removeSpaces) {
      normalized = normalized.replace(/ /g, '')
    }

    return normalized
  }
}
21+
22+
module.exports = normalizer

tokenization/normalizer.test.js

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
const normalizer = require('./normalizer')
2+
3+
module.exports.tests = {}

/**
 * Unit tests for the normalizer factory: one test per option, one for a
 * combination of options and one for the default (no options) behaviour.
 *
 * Test names are kept short because the `test` wrapper supplied by
 * `module.exports.all` already prefixes each name with `normalizer: `.
 *
 * @param {Function} test - test registration function `(name, callback)`
 */
module.exports.tests.normalizer = (test) => {
  test('hyphen', (t) => {
    const value = ' Value-With-Some-Hyphen '
    const expected = 'Value With Some Hyphen'
    const normalize = normalizer({ removeHyphen: true })

    t.equal(normalize(value), expected)
    t.end()
  })

  test('accents', (t) => {
    const value = ' Vâlüé-Wìth-Sômê-Accents '
    const expected = 'Value-With-Some-Accents'
    const normalize = normalizer({ removeAccents: true })

    t.equal(normalize(value), expected)
    t.end()
  })

  test('lowercase', (t) => {
    const value = 'Value-With-Some-UpperCases'
    const expected = 'value-with-some-uppercases'
    const normalize = normalizer({ lowercase: true })

    t.equal(normalize(value), expected)
    t.end()
  })

  test('spaces', (t) => {
    const value = 'Value With Some Spaces'
    const expected = 'ValueWithSomeSpaces'
    const normalize = normalizer({ removeSpaces: true })

    t.equal(normalize(value), expected)
    t.end()
  })

  test('option mix', (t) => {
    const value = 'Vâlüé-Mìxèd'
    const expected = 'value mixed'
    const normalize = normalizer({ lowercase: true, removeHyphen: true, removeAccents: true })

    t.equal(normalize(value), expected)
    t.end()
  })

  test('no options', (t) => {
    const value = 'Value-With-Some-Hyphen'
    const normalize = normalizer()

    // without options the (already trimmed) value must come back unchanged
    t.equal(normalize(value), value)
    t.end()
  })
}
59+
60+
/**
 * Register every test group found in `module.exports.tests` with the runner.
 *
 * @param {Function} tape - test runner function, called as `tape(name, callback)`
 * @param {*} common - shared helpers, passed through untouched to each group
 */
module.exports.all = (tape, common) => {
  // wrap the runner so every registered test name carries this file's prefix
  const test = (name, testFunction) => tape(`normalizer: ${name}`, testFunction)

  for (const groupName in module.exports.tests) {
    const registerGroup = module.exports.tests[groupName]
    registerGroup(test, common)
  }
}

0 commit comments

Comments
 (0)