feat(normalizer): Add a normalizer for WOF which can be configured to lowercase and to remove accents/hyphens/spaces (#33)

Joxit · missinglink · commit 94c9b4a6afc6 · 2019-06-05T14:16:25.000+02:00
diff --git a/classifier/WhosOnFirstClassifier.js b/classifier/WhosOnFirstClassifier.js
@@ -5,6 +5,7 @@ const CountryClassification = require('../classification/CountryClassification')
 const RegionClassification = require('../classification/RegionClassification')
 const LocalityClassification = require('../classification/LocalityClassification')
 const whosonfirst = require('../resources/whosonfirst/whosonfirst')
+const normalize = require('../tokenization/normalizer')({ lowercase: true, removeHyphen: true, removeAccents: true })
 
 // databases sourced from the WhosOnFirst project
 // see: https://whosonfirst.org
@@ -35,7 +36,8 @@ class WhosOnFirstClassifier extends PhraseClassifier {
     Object.keys(placetypes).forEach(placetype => {
       this.tokens[placetype] = new Set()
       whosonfirst.load(this.tokens[placetype], [placetype], placetypes[placetype].files, {
-        minlength: 2
+        minlength: 2,
+        normalizer: normalize
       })
 
       // general blacklist
@@ -102,8 +104,9 @@ class WhosOnFirstClassifier extends PhraseClassifier {
       return
     }
 
+    const normalizedSpan = normalize(span.norm)
     Object.keys(placetypes).forEach(placetype => {
-      if (this.tokens[placetype].has(span.norm)) {
+      if (this.tokens[placetype].has(normalizedSpan)) {
         // do not classify tokens if they already have a 'StopWordClassification'
         if (
           span.classifications.hasOwnProperty('StopWordClassification') || (
diff --git a/package.json b/package.json
@@ -33,7 +33,8 @@
   },
   "dependencies": {
     "cluster": "^0.7.7",
-    "express": "^4.16.4"
+    "express": "^4.16.4",
+    "remove-accents": "^0.4.2"
   },
   "devDependencies": {
     "better-sqlite3": "^5.4.0",
diff --git a/resources/whosonfirst/whosonfirst.js b/resources/whosonfirst/whosonfirst.js
@@ -44,6 +44,9 @@ function _normalize (cell, options) {
   if (options && options.lowercase) {
     value = value.toLowerCase()
   }
+  if (options && options.normalizer && typeof options.normalizer === 'function') {
+    value = options.normalizer(value)
+  }
   return value
 }
 
diff --git a/test/address.fra.test.js b/test/address.fra.test.js
@@ -63,6 +63,22 @@ const testcase = (test, common) => {
   assert('Rue Jean Baptiste Clément', [
     { street: 'Rue Jean Baptiste Clément' }
   ], true)
+
+  assert('Mery Sur Oise', [
+    { locality: 'Mery Sur Oise' }
+  ], true)
+
+  assert('Méry Sur Oise', [
+    { locality: 'Méry Sur Oise' }
+  ], true)
+
+  assert('Méry-Sur-Oise', [
+    { locality: 'Méry-Sur-Oise' }
+  ], true)
+
+  assert('Mery-Sur-Oise', [
+    { locality: 'Mery-Sur-Oise' }
+  ], true)
 }
 
 module.exports.all = (tape, common) => {
diff --git a/tokenization/normalizer.js b/tokenization/normalizer.js
@@ -0,0 +1,22 @@
+const removeAccents = require('remove-accents')
+
+function normalizer (options = {}) { // factory: builds a string normalizer from the boolean flags read below (lowercase, removeAccents, removeHyphen, removeSpaces)
+  return (value) => { // `value` must be a string: .trim() is called on it unconditionally
+    value = value.trim() // always applied, even when no option is set
+    if (options.lowercase) {
+      value = value.toLowerCase()
+    }
+    if (options.removeAccents) {
+      value = removeAccents(value) // delegates to the 'remove-accents' package required at the top of this file
+    }
+    if (options.removeHyphen) {
+      value = value.replace(/-/g, ' ') // despite the option name, hyphens are replaced by spaces rather than deleted
+    }
+    if (options.removeSpaces) {
+      value = value.replace(/ /g, '') // runs after removeHyphen, so spaces produced from hyphens are stripped as well
+    }
+    return value
+  }
+}
+
+module.exports = normalizer
diff --git a/tokenization/normalizer.test.js b/tokenization/normalizer.test.js
@@ -0,0 +1,68 @@
+const normalizer = require('./normalizer')
+
+module.exports.tests = {}
+
+module.exports.tests.normalizer = (test) => { // registers one case per normalizer option, plus a mixed-options case and a no-options case
+  test('normalizer: hyphen', (t) => {
+    const value = ' Value-With-Some-Hyphen '
+    const expected = 'Value With Some Hyphen' // hyphens become spaces; surrounding whitespace is trimmed
+    const normalize = normalizer({ removeHyphen: true })
+
+    t.deepEquals(normalize(value), expected)
+    t.end()
+  })
+
+  test('normalizer: accents', (t) => {
+    const value = ' Vâlüé-Wìth-Sômê-Accents '
+    const expected = 'Value-With-Some-Accents' // accents stripped and input trimmed; hyphens and case are kept
+    const normalize = normalizer({ removeAccents: true })
+
+    t.deepEquals(normalize(value), expected)
+    t.end()
+  })
+
+  test('normalizer: lowercase', (t) => {
+    const value = 'Value-With-Some-UpperCases'
+    const expected = 'value-with-some-uppercases' // only the case changes; hyphens are kept
+    const normalize = normalizer({ lowercase: true })
+
+    t.deepEquals(normalize(value), expected)
+    t.end()
+  })
+
+  test('normalizer: spaces', (t) => {
+    const value = 'Value With Some Spaces'
+    const expected = 'ValueWithSomeSpaces' // every space is removed; case is kept
+    const normalize = normalizer({ removeSpaces: true })
+
+    t.deepEquals(normalize(value), expected)
+    t.end()
+  })
+
+  test('normalizer: option mix', (t) => {
+    const value = 'Vâlüé-Mìxèd'
+    const expected = 'value mixed' // lowercase + accent removal + hyphen-to-space applied together
+    const normalize = normalizer({ lowercase: true, removeHyphen: true, removeAccents: true })
+
+    t.deepEquals(normalize(value), expected)
+    t.end()
+  })
+
+  test('normalizer: no options', (t) => {
+    const value = 'Value-With-Some-Hyphen' // no surrounding whitespace, so the unconditional trim leaves it unchanged
+    const normalize = normalizer()
+
+    t.deepEquals(normalize(value), value)
+    t.end()
+  })
+}
+
+module.exports.all = (tape, common) => { // entry point: runs every suite stored on module.exports.tests against the supplied `tape` function
+  function test (name, testFunction) {
+    return tape(`normalizer: ${name}`, testFunction) // every registered test name gets the 'normalizer: ' prefix
+  }
+
+  for (const testCase in module.exports.tests) { // block-scoped binding instead of function-scoped `var`
+    module.exports.tests[testCase](test, common)
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,9 @@ function _normalize (cell, options) {`
`44`	`44`	`if (options && options.lowercase) {`
`45`	`45`	`value = value.toLowerCase()`
`46`	`46`	`}`
	`47`	`+ if (options && options.normalizer && typeof options.normalizer === 'function') {`
	`48`	`+ value = options.normalizer(value)`
	`49`	`+ }`
`47`	`50`	`return value`
`48`	`51`	`}`
`49`	`52`