Commit 54f4e71b authored by Matija Obreza's avatar Matija Obreza
Browse files

Use Levenshtein and Dice coefficients (fixes #16)

parent ffc3fc36
......@@ -355,6 +355,7 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
// .peek(InMemoryTaxonomyDatabase::print)
// convert each candidate to a Suggestion
.map(speciesRow -> new Suggestion<>(speciesRow, similarityScore(species, speciesRow.getSpeciesName())))
// .peek(InMemoryTaxonomyDatabase::print)
// keep suggestions with score over .5
.filter(scored -> scored.getScore() >= .5)
// if record is not current, reduce score by 20%
......@@ -375,8 +376,10 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
* @param candidate the candidate
* @return the score between 0 and 1.0 where 0 is no similarity and 1.0 is full match
*/
private double similarityScore(String original, String candidate) {
return StringSimilarity.diceCoefficientOptimized(original.toLowerCase(), candidate.toLowerCase());
@Override
public double similarityScore(final String original, final String candidate) {
    // Lower-case each input exactly once, and with a fixed locale: the no-arg
    // toLowerCase() follows the JVM default locale, so the score for the same
    // pair of names could differ between machines (e.g. under a Turkish locale
    // "I" becomes a dotless "ı").
    final String normalizedOriginal = original.toLowerCase(java.util.Locale.ROOT);
    final String normalizedCandidate = candidate.toLowerCase(java.util.Locale.ROOT);

    // Equal-weight average of the Dice and Levenshtein coefficients.
    return (StringSimilarity.diceCoefficientOptimized(normalizedOriginal, normalizedCandidate)
            + StringSimilarity.getLevenshteinCoefficient(normalizedOriginal, normalizedCandidate)) / 2.0;
}
......@@ -384,7 +387,8 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
/*
* (non-Javadoc)
*
* @see org.genesys2.gringlobal.taxonomy.component.TaxonomyDatabase#getSpeciesAuthority(java.lang.String, java.lang.String)
* @see org.genesys2.gringlobal.taxonomy.component.TaxonomyDatabase#getSpeciesAuthority(java.lang.String,
* java.lang.String)
*/
@Override
public String getSpeciesAuthority(String genus, String species) {
......
......@@ -158,4 +158,15 @@ public class TaxonomyChecker {
return database.getSubtaxaAuthority(genus, species, subtaxa);
}
/**
 * Computes the similarity score of two strings by delegating to the underlying {@code database}.
 *
 * @param string1 the first string to compare
 * @param string2 the second string to compare
 * @return the value returned by {@code database.similarityScore(string1, string2)}
 */
public double similarityScore(final String string1, final String string2) {
    final double score = database.similarityScore(string1, string2);
    return score;
}
}
......@@ -137,4 +137,13 @@ public interface TaxonomyDatabase {
*/
String getGenus(long genusId);
/**
 * Get string similarity score.
 *
 * @param string1 the first string to compare
 * @param string2 the second string to compare
 * @return the similarity score of the two strings. NOTE(review): the in-memory implementation documents
 *         a range from 0 (no similarity) to 1.0 (full match) -- confirm this is the contract for every
 *         implementation
 */
double similarityScore(String string1, String string2);
}
......@@ -132,8 +132,7 @@ public class TaxonomyCheckerTest extends DatabaseTest {
assertThat("Juniperus must be best suggestion", CHECKER.suggestGenus("Junepirus", 10), hasSize(greaterThan(0)));
assertThat("Juniperus must be suggested", CHECKER.suggestGenus("Junepirus", 10), hasItem("Juniperus"));
// FIXME Enable after implementing #16
// assertThat("Juniperus must be best suggestion", CHECKER.suggestGenus("Junepirus", 1), contains("Juniperus"));
assertThat("Juniperus must be best suggestion", CHECKER.suggestGenus("Junepirus", 1), contains("Juniperus"));
}
/**
......@@ -229,4 +228,17 @@ public class TaxonomyCheckerTest extends DatabaseTest {
* Zygophyllon suggests too many options: Zygophyllum;Aphyllon;Phyllogonum;Zygophyllidium;Phyllophyton
*/
/**
 * Regression test: the misspelled species name "vuigoris" in genus Phaseolus must yield "vulgaris" among
 * the suggestions. The checks are skipped (the test passes trivially) when {@code CHECKER} is not available.
 */
@Test
public void testPhaseolus_vuigoris() {
    if (CHECKER != null) {
        // Precondition: the correctly spelled species is present in the database
        assertThat("Phaseolus vulgaris is a valid species", DATABASE.containsSpecies("Phaseolus", "vulgaris"), equalTo(true));

        // The misspelling must score above 0.5 against the correct name
        assertThat(CHECKER.similarityScore("vulgaris", "vuigoris"), greaterThan(0.5));

        // At least one suggestion is produced, and "vulgaris" is among them
        assertThat(CHECKER.suggestSpecies("Phaseolus", "vuigoris", 10), hasSize(greaterThan(0)));
        assertThat(CHECKER.suggestSpecies("Phaseolus", "vuigoris", 10), hasItem("vulgaris"));
    }
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment