diff --git a/settings.js b/settings.js index 918fcb01..4e07fb47 100644 --- a/settings.js +++ b/settings.js @@ -48,6 +48,7 @@ function generate(){ "icu_folding", "trim", "word_delimiter", + "custom_admin", "notnull" ] }, @@ -59,6 +60,7 @@ function generate(){ "lowercase", "icu_folding", "trim", + "custom_name", "full_token_address_suffix_expansion", "ampersand", "remove_ordinals", @@ -114,6 +116,7 @@ function generate(){ "lowercase", "icu_folding", "trim", + "custom_name", "ampersand", "street_suffix_contractions", "directionals", @@ -152,6 +155,7 @@ function generate(){ "lowercase", "icu_folding", "remove_duplicate_spaces", + "custom_street", ].concat( synonyms.street_suffix_contractions.map( function( synonym ){ return "keyword_street_suffix_" + synonym.split(' ')[0]; })).concat( synonyms.directionals.map( function( synonym ){ diff --git a/synonyms/custom_admin.txt b/synonyms/custom_admin.txt new file mode 100644 index 00000000..a61474de --- /dev/null +++ b/synonyms/custom_admin.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# This file allows users to add their own custom synonyms below. +# +# 1. Blank lines and lines starting with '#' are comments. +# +# 2. Explicit mappings match any token sequence on the left-hand-side of "=>" and replace with all +# alternatives on the right-hand-side. +# These types of mappings ignore the expand parameter in the constructor. +# Example: +# i-pod, i pod => ipod +# +# 3. Equivalent synonyms may be separated with commas and give no explicit mapping. +# In this case the mapping behavior will be taken from the expand parameter in the constructor. +# This allows the same synonym file to be used in different synonym handling strategies. +# Example: +# ipod, i-pod, i pod +# +# 4. Multiple synonym mapping entries are merged. +# Example: +# foo => foo bar +# foo => baz +# is equivalent to: +# foo => foo bar, baz +# +# ============================================================================= diff --git a/synonyms/custom_name.txt b/synonyms/custom_name.txt new file mode 100644 index 00000000..a61474de --- /dev/null +++ b/synonyms/custom_name.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# This file allows users to add their own custom synonyms below. +# +# 1. Blank lines and lines starting with '#' are comments. +# +# 2. Explicit mappings match any token sequence on the left-hand-side of "=>" and replace with all +# alternatives on the right-hand-side. +# These types of mappings ignore the expand parameter in the constructor. +# Example: +# i-pod, i pod => ipod +# +# 3. Equivalent synonyms may be separated with commas and give no explicit mapping. +# In this case the mapping behavior will be taken from the expand parameter in the constructor. +# This allows the same synonym file to be used in different synonym handling strategies. +# Example: +# ipod, i-pod, i pod +# +# 4. Multiple synonym mapping entries are merged. +# Example: +# foo => foo bar +# foo => baz +# is equivalent to: +# foo => foo bar, baz +# +# ============================================================================= diff --git a/synonyms/custom_street.txt b/synonyms/custom_street.txt new file mode 100644 index 00000000..a61474de --- /dev/null +++ b/synonyms/custom_street.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# This file allows users to add their own custom synonyms below. +# +# 1. Blank lines and lines starting with '#' are comments. +# +# 2. Explicit mappings match any token sequence on the left-hand-side of "=>" and replace with all +# alternatives on the right-hand-side. +# These types of mappings ignore the expand parameter in the constructor. +# Example: +# i-pod, i pod => ipod +# +# 3. Equivalent synonyms may be separated with commas and give no explicit mapping. +# In this case the mapping behavior will be taken from the expand parameter in the constructor. +# This allows the same synonym file to be used in different synonym handling strategies. +# Example: +# ipod, i-pod, i pod +# +# 4. Multiple synonym mapping entries are merged. +# Example: +# foo => foo bar +# foo => baz +# is equivalent to: +# foo => foo bar, baz +# +# ============================================================================= diff --git a/synonyms/parser.js b/synonyms/parser.js index e2a33afd..20aab483 100644 --- a/synonyms/parser.js +++ b/synonyms/parser.js @@ -2,7 +2,7 @@ var fs = require('fs'); // https://www.elastic.co/guide/en/elasticsearch/reference/2.4/analysis-synonym-tokenfilter.html -function parser( filename ){ +function load( filename ){ // path not specified / file does not exist try { @@ -14,17 +14,21 @@ function parser( filename ){ } // parse solr synonyms format - return fs.readFileSync( filename, 'utf8' ) - .split('\n') - .map( line => { - return line.trim() // trim whitespace - .replace( /\s\s+/g, ' ' ) // squash double spaces - .replace(/(^,)|(,$)/g, '') // trim commas - .replace(/(\s*,\s*)/g,',') // trim spaces around commas - .replace(/(\s*=>\s*)/g,' => '); // trim spaces around arrows - }) - .filter( line => line.length > 0 ) // remove empty lines - .filter( line => '#' !== line[0] ); // remove comments + return parse( fs.readFileSync( filename, 'utf8' ) ); } -module.exports = parser; +function parse( contents ){ + return contents.split('\n') + .map( line => { + return line.trim().toLowerCase() // lowercase all tokens + .replace( /\s\s+/g, ' ' ) // squash double spaces + .replace(/(^,)|(,$)/g, '') // trim commas + .replace(/(\s*,\s*)/g,',') // trim spaces around commas + .replace(/(\s*=>\s*)/g,' => '); // trim spaces around arrows + }) + .filter( line => line.length > 0 ) // remove empty lines + .filter( line => '#' !== line[0] ); // remove comments +} + +module.exports = load; +module.exports.parse = parse; diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 04df7b5b..f1f36657 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -24,6 +24,7 @@ "icu_folding", "trim", "word_delimiter", + "custom_admin", "notnull" ] }, @@ -38,6 +39,7 @@ "lowercase", "icu_folding", "trim", + "custom_name", "full_token_address_suffix_expansion", "ampersand", "remove_ordinals", @@ -102,6 +104,7 @@ "lowercase", "icu_folding", "trim", + "custom_name", "ampersand", "street_suffix_contractions", "directionals", @@ -149,6 +152,7 @@ "lowercase", "icu_folding", "remove_duplicate_spaces", + "custom_street", "keyword_street_suffix_alley", "keyword_street_suffix_annex", "keyword_street_suffix_avenue", @@ -283,6 +287,18 @@ } }, "filter": { + "custom_name": { + "type": "synonym", + "synonyms": [""] + }, + "custom_street": { + "type": "synonym", + "synonyms": [""] + }, + "custom_admin": { + "type": "synonym", + "synonyms": [""] + }, "ampersand": { "type": "synonym", "synonyms": [ diff --git a/test/run.js b/test/run.js index 402715cf..b0c1a87e 100644 --- a/test/run.js +++ b/test/run.js @@ -16,7 +16,8 @@ var tests = [ require('./partial-literal.js'), require('./partial-hash.js'), require('./settings.js'), - require('./configValidation.js') + require('./configValidation.js'), + require('./synonyms/parser.js'), ]; tests.map(function(t) { diff --git a/test/settings.js b/test/settings.js index 0d1f716e..89432160 100644 --- a/test/settings.js +++ b/test/settings.js @@ -30,7 +30,7 @@ module.exports.tests.configValidation = function(test, common) { t.end(); }); -} +}; module.exports.tests.compile = function(test, common) { test('valid settings file', function(t) { @@ -81,6 +81,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { "lowercase", "icu_folding", "trim", + "custom_name", "full_token_address_suffix_expansion", "ampersand", "remove_ordinals", @@ -116,6 +117,7 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { "lowercase", "icu_folding", "trim", + "custom_name", "ampersand", "street_suffix_contractions", "directionals", @@ -194,7 +196,7 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { }); test('peliasStreet token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasStreet; - t.equal( analyzer.filter.length, 133, 'lots of filters' ); + t.equal( analyzer.filter.length, 134, 'lots of filters' ); t.end(); }); }; diff --git a/test/synonyms/parser.js b/test/synonyms/parser.js new file mode 100644 index 00000000..6ce2d201 --- /dev/null +++ b/test/synonyms/parser.js @@ -0,0 +1,85 @@ +const parser = require('../../synonyms/parser'); + +module.exports.tests = {}; + +module.exports.tests.load = function(test, common) { + test('load: invalid file', function(t) { + t.throws(() => parser('/invalid/path'), /file not found/, 'invalid file'); + t.throws(() => parser('/tmp'), /file not found/, 'directory'); + t.end(); + }); +}; + +module.exports.tests.parse = function(test, common) { + test('empty file', function(t) { + t.deepEqual( parser.parse(``), [] ); + t.end(); + }); + test('comments and newlines', function(t) { + t.deepEqual( parser.parse(` + +# foo bar + +# baz + +`), [] ); + t.end(); + }); + test('lowercase', function(t) { + t.deepEqual( parser.parse(` +Foo => BaR +Foo,Bar,Baz +`), [ + 'foo => bar', + 'foo,bar,baz' +] ); + t.end(); + }); + test('squash spaces', function(t) { + t.deepEqual( parser.parse(` +foo bar => foo +Foo Bar, Foo +`), [ + 'foo bar => foo', + 'foo bar,foo' +] ); + t.end(); + }); + test('trim commas', function(t) { + t.deepEqual( parser.parse(` +,foo => bar +,foo, bar, +`), [ + 'foo => bar', + 'foo,bar' +] ); + t.end(); + }); + test('trim around commas', function(t) { + t.deepEqual( parser.parse(` + ,foo, bar , baz +`), [ + 'foo,bar,baz' +] ); + t.end(); + }); + test('trim around arrows', function(t) { + t.deepEqual( parser.parse(` + foo => bar +`), [ + 'foo => bar' +] ); + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return tape('synonyms parser: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +};