Commit 945a0569 authored by Matija Obreza
Browse files

similarityScore must be between 0.0 and 1.0

Using Dice's coefficient
parent 878b3f5a
......@@ -22,14 +22,14 @@ package org.genesys2.gringlobal.taxonomy.component;
public class BestScore {
/** The best score. */
private float bestScore = Float.MAX_VALUE;
private double bestScore = Float.MIN_VALUE;
/**
* Gets the best score.
*
* @return the best score
*/
public float getBestScore() {
public double getBestScore() {
return bestScore;
}
......@@ -38,8 +38,8 @@ public class BestScore {
*
* @param score the score
*/
public void update(float score) {
if (score < bestScore) {
public void update(double score) {
if (score > bestScore) {
// System.err.println("Updating best prev=" + bestScore + " now=" + score);
bestScore = score;
}
......
......@@ -24,7 +24,6 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.genesys2.gringlobal.taxonomy.model.SpeciesRow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -39,10 +38,10 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
/** The genus id lookup. */
private Map<String, List<Long>> genusIdLookup = new HashMap<>();
/** The species lookup. */
private Map<Long, List<SpeciesRow>> speciesLookup = new HashMap<>();
/** The species rows. */
private int speciesRows;
......@@ -75,7 +74,9 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
speciesRows++;
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see java.lang.Object#toString()
*/
@Override
......@@ -95,7 +96,9 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
// .keySet().stream().anyMatch(g -> g.equalsIgnoreCase(genus));
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see org.genesys2.gringlobal.taxonomy.component.TaxonomyDatabase#findSimilarGenus(java.lang.String, int)
*/
@Override
......@@ -107,13 +110,13 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
BestScore bestScore = new BestScore();
return genusIdLookup.keySet().parallelStream().map(candidate -> new Suggestion<String>(candidate, similarityScore(genus, candidate, 100)))
return genusIdLookup.keySet().parallelStream().map(candidate -> new Suggestion<String>(candidate, similarityScore(genus, candidate)))
// .peek(InMemoryTaxonomyDatabase::print)
.filter(scored -> scored.getScore() >= 0 && scored.getScore() < 99).sequential()
.filter(scored -> scored.getScore() > .5).sequential()
// .peek(InMemoryTaxonomyDatabase::print)
.peek(scored -> bestScore.update(scored.getScore())).sorted(Comparator.comparing(Suggestion::getScore))
.peek(scored -> bestScore.update(scored.getScore())).sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// .peek(InMemoryTaxonomyDatabase::print)
.filter(scored -> scored.getScore() <= bestScore.getBestScore() * 1.5)
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : bestScore.getBestScore() * 0.8))
// .filter(scored -> scored.getScore() - bestScore.getBestScore() <= 2)
.peek(InMemoryTaxonomyDatabase::print).map(Suggestion::getSuggestion).distinct().limit(maxSize).collect(Collectors.toList());
}
......@@ -158,7 +161,9 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
});
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see org.genesys2.gringlobal.taxonomy.component.TaxonomyDatabase#containsSpecies(java.lang.String, java.lang.String)
*/
@Override
......@@ -172,7 +177,9 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
return getAllGenusSpecies(genus).stream().anyMatch(speciesRow -> species.equals(speciesRow.getSpeciesName()));
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see org.genesys2.gringlobal.taxonomy.component.TaxonomyDatabase#findSimilarSpecies(java.lang.String, java.lang.String, int)
*/
@Override
......@@ -189,30 +196,34 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
BestScore bestScore = new BestScore();
// System.err.println("doo");
return getAllGenusSpecies(genus).stream().map(SpeciesRow::getSpeciesName).distinct()
// .peek(InMemoryTaxonomyDatabase::print)
.map(candidate -> new Suggestion<String>(candidate, similarityScore(species, candidate, 5))).filter(scored -> scored.getScore() >= 0 && scored.getScore() < 4)
.sorted(Comparator.comparing(Suggestion::getScore))
.map(candidate -> new Suggestion<String>(candidate, similarityScore(species, candidate))).filter(scored -> scored.getScore() >= .5)
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder())).peek(scored -> bestScore.update(scored.getScore()))
// .peek(InMemoryTaxonomyDatabase::print)
.peek(scored -> bestScore.update(scored.getScore())).filter(scored -> scored.getScore() <= bestScore.getBestScore() * 1.5)
// .filter(scored -> scored.getScore() >= bestScore.getBestScore() * 0.8)
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : bestScore.getBestScore() * 0.8))
// .filter(scored -> scored.getScore() - bestScore.getBestScore() <= 2)
.peek(InMemoryTaxonomyDatabase::print).map(Suggestion::getSuggestion).distinct().limit(maxSize).collect(Collectors.toList());
}
/**
* StringUtils.getLevenshteinDistance
* similarityScore returns a string similarity value in the range [0, 1.0] (where 1.0 is full match).
*
* @param original the original
* @param candidate the candidate
* @param threshold the threshold
* @return the float
* @return the score between 0 and 1.0 where 0 is no similarity and 1.0 is full match
*/
private float similarityScore(String original, String candidate, int threshold) {
return StringUtils.getLevenshteinDistance(original.toLowerCase(), candidate.toLowerCase(), threshold);
private double similarityScore(String original, String candidate) {
return StringSimilarity.diceCoefficientOptimized(original.toLowerCase(), candidate.toLowerCase());
// StringUtils.getLevenshteinDistance(original.toLowerCase(), candidate.toLowerCase(), 5);
// (float) StringUtils.getJaroWinklerDistance(original, candidate)
}
/* (non-Javadoc)
/*
* (non-Javadoc)
*
* @see org.genesys2.gringlobal.taxonomy.component.TaxonomyDatabase#getSpeciesAuthority(java.lang.String, java.lang.String)
*/
@Override
......
......@@ -27,7 +27,7 @@ public class Suggestion<T> {
private T suggestion;
/** The score. */
private float score;
private double score;
/**
* Instantiates a new suggestion.
......@@ -35,7 +35,7 @@ public class Suggestion<T> {
* @param suggestion the suggestion
* @param score the score
*/
Suggestion(T suggestion, float score) {
Suggestion(T suggestion, double score) {
this.suggestion = suggestion;
this.score = score;
}
......@@ -45,7 +45,7 @@ public class Suggestion<T> {
*
* @return the score
*/
public float getScore() {
public double getScore() {
return score;
}
......
/*
* Copyright 2016 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys2.gringlobal.taxonomy;
import static org.hamcrest.Matchers.*;
import static org.junit.Assert.*;
import static org.junit.Assume.assumeTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import com.opencsv.CSVReader;
import org.genesys2.gringlobal.taxonomy.component.InMemoryTaxonomyDatabase;
import org.genesys2.gringlobal.taxonomy.component.TaxonomyReader;
import org.genesys2.gringlobal.taxonomy.model.GenusRow;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests genus lookups of {@link InMemoryTaxonomyDatabase} against the genus data file
 * {@code ../taxonomy_data/TAXONOMY_GENUS.txt}.
 *
 * <p>
 * The data file is optional. When it is absent the tests are reported as <em>skipped</em> (via JUnit
 * assumptions) instead of silently passing without having checked anything.
 * </p>
 */
public class InMemoryDatabaseTest {

	/** Location of the genus data file, relative to the working directory of the test run. */
	private static final File GENUS_FILE = new File("../taxonomy_data", "TAXONOMY_GENUS.txt");

	/** Database shared by all tests of this class; stays {@code null} unless the data file was loaded. */
	private static InMemoryTaxonomyDatabase database = null;

	/**
	 * Registers every genus found in the data file with a fresh in-memory database.
	 *
	 * @throws UnsupportedEncodingException kept for signature compatibility
	 * @throws FileNotFoundException if the data file disappears between the existence check and opening it
	 * @throws IOException if reading the data file fails
	 * @throws ParseException declared for the {@link TaxonomyReader} calls
	 */
	@BeforeClass
	public static void loadDatabase() throws UnsupportedEncodingException, FileNotFoundException, IOException, ParseException {
		// Skip (rather than pass) when the optional data file is not available
		assumeTrue(GENUS_FILE.exists());

		InMemoryTaxonomyDatabase loaded = new InMemoryTaxonomyDatabase();

		// read TAXONOMY_GENUS.txt
		// NOTE(review): the second argument (1) is presumably the number of header lines to skip -- confirm against TaxonomyReader
		try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(GENUS_FILE), StandardCharsets.UTF_8), 1)) {
			GenusRow genusRow;
			// The loop ends when TaxonomyReader.toGenus(..) yields null
			while ((genusRow = TaxonomyReader.toGenus(reader.readNext())) != null) {
				loaded.registerGenus(genusRow.getGenusId(), genusRow.getGenusName());
			}
		}

		// Publish only a completely loaded database
		database = loaded;
	}

	/**
	 * Releases the shared database after all tests have run.
	 */
	@AfterClass
	public static void unloadDatabase() {
		database = null;
	}

	/**
	 * Genus lookup is case-sensitive, while the similarity search still suggests the correctly
	 * cased genus for an upper-case query.
	 */
	@Test
	public void test1() {
		// Defensive guard: report "skipped" if the database could not be loaded
		assumeTrue(database != null);

		assertThat(database.containsGenus("Vigna"), equalTo(true));
		assertThat(database.containsGenus("VIGNA"), equalTo(false));
		// contains(..) requires the suggestions to be exactly ["Vigna"]
		assertThat(database.findSimilarGenus("VIGNA", 5), contains("Vigna"));
	}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment