Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom synonyms #273

Merged
merged 4 commits into from
Mar 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ function generate(){
"icu_folding",
"trim",
"word_delimiter",
"custom_admin",
"notnull"
]
},
Expand All @@ -59,6 +60,7 @@ function generate(){
"lowercase",
"icu_folding",
"trim",
"custom_name",
"full_token_address_suffix_expansion",
"ampersand",
"remove_ordinals",
Expand Down Expand Up @@ -114,6 +116,7 @@ function generate(){
"lowercase",
"icu_folding",
"trim",
"custom_name",
"ampersand",
"street_suffix_contractions",
"directionals",
Expand Down Expand Up @@ -152,6 +155,7 @@ function generate(){
"lowercase",
"icu_folding",
"remove_duplicate_spaces",
"custom_street",
].concat( synonyms.street_suffix_contractions.map( function( synonym ){
return "keyword_street_suffix_" + synonym.split(' ')[0];
})).concat( synonyms.directionals.map( function( synonym ){
Expand Down
25 changes: 25 additions & 0 deletions synonyms/custom_admin.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# =============================================================================
# This file allows users to add their own custom synonyms below.
#
# 1. Blank lines are ignored, and lines starting with '#' are treated as comments.
#
# 2. Explicit mappings match any token sequence on the left-hand-side of "=>" and replace with all
# alternatives on the right-hand-side.
# These types of mappings ignore the expand parameter in the constructor.
# Example:
# i-pod, i pod => ipod
#
# 3. Equivalent synonyms may be separated with commas and give no explicit mapping.
# In this case the mapping behavior will be taken from the expand parameter in the constructor.
# This allows the same synonym file to be used in different synonym handling strategies.
# Example:
# ipod, i-pod, i pod
#
# 4. Multiple synonym mapping entries are merged.
# Example:
# foo => foo bar
# foo => baz
# is equivalent to:
# foo => foo bar, baz
#
# =============================================================================
25 changes: 25 additions & 0 deletions synonyms/custom_name.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# =============================================================================
# This file allows users to add their own custom synonyms below.
#
# 1. Blank lines are ignored, and lines starting with '#' are treated as comments.
#
# 2. Explicit mappings match any token sequence on the left-hand-side of "=>" and replace with all
# alternatives on the right-hand-side.
# These types of mappings ignore the expand parameter in the constructor.
# Example:
# i-pod, i pod => ipod
#
# 3. Equivalent synonyms may be separated with commas and give no explicit mapping.
# In this case the mapping behavior will be taken from the expand parameter in the constructor.
# This allows the same synonym file to be used in different synonym handling strategies.
# Example:
# ipod, i-pod, i pod
#
# 4. Multiple synonym mapping entries are merged.
# Example:
# foo => foo bar
# foo => baz
# is equivalent to:
# foo => foo bar, baz
#
# =============================================================================
25 changes: 25 additions & 0 deletions synonyms/custom_street.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# =============================================================================
# This file allows users to add their own custom synonyms below.
#
# 1. Blank lines are ignored, and lines starting with '#' are treated as comments.
#
# 2. Explicit mappings match any token sequence on the left-hand-side of "=>" and replace with all
# alternatives on the right-hand-side.
# These types of mappings ignore the expand parameter in the constructor.
# Example:
# i-pod, i pod => ipod
#
# 3. Equivalent synonyms may be separated with commas and give no explicit mapping.
# In this case the mapping behavior will be taken from the expand parameter in the constructor.
# This allows the same synonym file to be used in different synonym handling strategies.
# Example:
# ipod, i-pod, i pod
#
# 4. Multiple synonym mapping entries are merged.
# Example:
# foo => foo bar
# foo => baz
# is equivalent to:
# foo => foo bar, baz
#
# =============================================================================
30 changes: 17 additions & 13 deletions synonyms/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ var fs = require('fs');

// https://www.elastic.co/guide/en/elasticsearch/reference/2.4/analysis-synonym-tokenfilter.html

function parser( filename ){
function load( filename ){

// path not specified / file does not exist
try {
Expand All @@ -14,17 +14,21 @@ function parser( filename ){
}

// parse solr synonyms format
return fs.readFileSync( filename, 'utf8' )
.split('\n')
.map( line => {
return line.trim() // trim whitespace
.replace( /\s\s+/g, ' ' ) // squash double spaces
.replace(/(^,)|(,$)/g, '') // trim commas
.replace(/(\s*,\s*)/g,',') // trim spaces around commas
.replace(/(\s*=>\s*)/g,' => '); // trim spaces around arrows
})
.filter( line => line.length > 0 ) // remove empty lines
.filter( line => '#' !== line[0] ); // remove comments
return parse( fs.readFileSync( filename, 'utf8' ) );
}

module.exports = parser;
/**
 * Parse the text of a Solr-format synonyms file into normalised rule lines.
 *
 * Every line is trimmed and lowercased, runs of whitespace are squashed,
 * spacing around commas and "=>" arrows is normalised and leading/trailing
 * commas are removed. Blank lines and comment lines (starting with '#')
 * are dropped.
 *
 * @param {string} contents - raw text of a synonyms file
 * @returns {string[]} one normalised synonym rule per array entry
 */
function parse( contents ){
  return contents.split('\n')
    .map( line => {
      return line.trim().toLowerCase()     // trim outer whitespace, lowercase all tokens
        .replace( /\s\s+/g, ' ' )          // squash runs of two or more whitespace chars
        // normalise around commas BEFORE stripping edge commas, otherwise
        // input such as "foo, bar ," is left with a dangling space
        .replace( /\s*,\s*/g, ',' )        // trim spaces around commas
        .replace( /^,+|,+$/g, '' )         // strip every leading/trailing comma
        .replace( /\s*=>\s*/g, ' => ' );   // exactly one space either side of arrows
    })
    // remove empty lines and comments
    .filter( line => line.length > 0 && '#' !== line[0] );
}

module.exports = load;
module.exports.parse = parse;
16 changes: 16 additions & 0 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"icu_folding",
"trim",
"word_delimiter",
"custom_admin",
"notnull"
]
},
Expand All @@ -38,6 +39,7 @@
"lowercase",
"icu_folding",
"trim",
"custom_name",
"full_token_address_suffix_expansion",
"ampersand",
"remove_ordinals",
Expand Down Expand Up @@ -102,6 +104,7 @@
"lowercase",
"icu_folding",
"trim",
"custom_name",
"ampersand",
"street_suffix_contractions",
"directionals",
Expand Down Expand Up @@ -149,6 +152,7 @@
"lowercase",
"icu_folding",
"remove_duplicate_spaces",
"custom_street",
"keyword_street_suffix_alley",
"keyword_street_suffix_annex",
"keyword_street_suffix_avenue",
Expand Down Expand Up @@ -283,6 +287,18 @@
}
},
"filter": {
"custom_name": {
"type": "synonym",
"synonyms": [""]
},
"custom_street": {
"type": "synonym",
"synonyms": [""]
},
"custom_admin": {
"type": "synonym",
"synonyms": [""]
},
"ampersand": {
"type": "synonym",
"synonyms": [
Expand Down
3 changes: 2 additions & 1 deletion test/run.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ var tests = [
require('./partial-literal.js'),
require('./partial-hash.js'),
require('./settings.js'),
require('./configValidation.js')
require('./configValidation.js'),
require('./synonyms/parser.js'),
];

tests.map(function(t) {
Expand Down
6 changes: 4 additions & 2 deletions test/settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ module.exports.tests.configValidation = function(test, common) {
t.end();

});
}
};

module.exports.tests.compile = function(test, common) {
test('valid settings file', function(t) {
Expand Down Expand Up @@ -81,6 +81,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) {
"lowercase",
"icu_folding",
"trim",
"custom_name",
"full_token_address_suffix_expansion",
"ampersand",
"remove_ordinals",
Expand Down Expand Up @@ -116,6 +117,7 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) {
"lowercase",
"icu_folding",
"trim",
"custom_name",
"ampersand",
"street_suffix_contractions",
"directionals",
Expand Down Expand Up @@ -194,7 +196,7 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) {
});
test('peliasStreet token filters', function(t) {
var analyzer = settings().analysis.analyzer.peliasStreet;
t.equal( analyzer.filter.length, 133, 'lots of filters' );
t.equal( analyzer.filter.length, 134, 'lots of filters' );
t.end();
});
};
Expand Down
85 changes: 85 additions & 0 deletions test/synonyms/parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
const parser = require('../../synonyms/parser');

module.exports.tests = {};

/**
 * Tests for the loader (the module's default export): paths that cannot be
 * read as a file must make it throw an error matching /file not found/.
 */
module.exports.tests.load = function(test, common) {
  test('load: invalid file', function(t) {
    // each entry: [ path handed to the loader, assertion message ]
    const unreadable = [
      [ '/invalid/path', 'invalid file' ],
      [ '/tmp', 'directory' ]
    ];
    unreadable.forEach( ([ path, message ]) => {
      t.throws(() => parser(path), /file not found/, message);
    });
    t.end();
  });
};

// Tests for parser.parse(): each case feeds raw synonym-file text (as a
// template literal, so the whitespace inside the backticks is significant)
// and compares the returned array of normalised rule lines.
module.exports.tests.parse = function(test, common) {
// an empty string yields no rules
test('empty file', function(t) {
t.deepEqual( parser.parse(``), [] );
t.end();
});
// blank lines and '#' comment lines are all discarded
test('comments and newlines', function(t) {
t.deepEqual( parser.parse(`

# foo bar

# baz

`), [] );
t.end();
});
// mixed-case input is expected back in lowercase
test('lowercase', function(t) {
t.deepEqual( parser.parse(`
Foo => BaR
Foo,Bar,Baz
`), [
'foo => bar',
'foo,bar,baz'
] );
t.end();
});
// output is expected with single spaces between words and none after commas
test('squash spaces', function(t) {
t.deepEqual( parser.parse(`
foo bar => foo
Foo Bar, Foo
`), [
'foo bar => foo',
'foo bar,foo'
] );
t.end();
});
// commas at the very start or end of a line are removed
test('trim commas', function(t) {
t.deepEqual( parser.parse(`
,foo => bar
,foo, bar,
`), [
'foo => bar',
'foo,bar'
] );
t.end();
});
// whitespace on either side of a separating comma is removed
test('trim around commas', function(t) {
t.deepEqual( parser.parse(`
,foo, bar , baz
`), [
'foo,bar,baz'
] );
t.end();
});
// arrows are expected with exactly one space on each side
test('trim around arrows', function(t) {
t.deepEqual( parser.parse(`
foo => bar
`), [
'foo => bar'
] );
t.end();
});
};

/**
 * Register every test group found in module.exports.tests with the supplied
 * test harness, prefixing each test name with 'synonyms parser: '.
 *
 * @param {Function} tape - harness function, invoked as tape(name, testFunction)
 * @param {*} common - passed through unchanged to each test group
 */
module.exports.all = function (tape, common) {

  // wrap the harness so every test name carries this suite's prefix
  function test(name, testFunction) {
    return tape('synonyms parser: ' + name, testFunction);
  }

  // Object.keys() visits own properties only; an unguarded for...in would
  // also walk enumerable properties inherited through the prototype chain
  Object.keys( module.exports.tests ).forEach( function( testCase ){
    module.exports.tests[testCase](test, common);
  });
};