Commit 30650fb0 authored by Matija Obreza
Browse files

Give deprecated records (synonyms) 80% of their score (fixes #9)

parent a4512040
......@@ -111,15 +111,33 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
BestScore bestScore = new BestScore();
return genusIdLookup.keySet().parallelStream().map(candidate -> new Suggestion<String>(candidate, similarityScore(genus, candidate)))
return genusIdLookup.keySet().parallelStream()
// convert each genus to Suggestion
.map(candidate -> new Suggestion<String>(candidate, similarityScore(genus, candidate)))
// debug print
// .peek(InMemoryTaxonomyDatabase::print)
.filter(scored -> scored.getScore() > .5).sequential()
// keep decently scored suggestions
.filter(scored -> scored.getScore() > .5)
// convert to sequential stream
.sequential()
// debug print
// .peek(InMemoryTaxonomyDatabase::print)
.peek(scored -> bestScore.update(scored.getScore())).sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// update best score
.peek(scored -> bestScore.update(scored.getScore()))
// sort by score, descending
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// debug print
// .peek(InMemoryTaxonomyDatabase::print)
// Prefer full match
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : bestScore.getBestScore() * 0.8))
// .filter(scored -> scored.getScore() - bestScore.getBestScore() <= 2)
.peek(InMemoryTaxonomyDatabase::print).map(Suggestion::getSuggestion).distinct().limit(maxSize).collect(Collectors.toList());
// debug print
.peek(InMemoryTaxonomyDatabase::print)
// convert to suggested Strings
.map(Suggestion::getSuggestion)
// keep distinct results
.distinct()
// limit to maxSize
.limit(maxSize).collect(Collectors.toList());
}
/**
......@@ -197,15 +215,15 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
BestScore bestScore = new BestScore();
return getAllGenusSpecies(genus).stream().map(SpeciesRow::getSpeciesName)
// keep distinct elements
.distinct()
return getAllGenusSpecies(genus).stream()
// print
// .peek(InMemoryTaxonomyDatabase::print)
// convert each candidate to a Suggestion
.map(candidate -> new Suggestion<String>(candidate, similarityScore(species, candidate)))
.map(speciesRow -> new Suggestion<>(speciesRow, similarityScore(species, speciesRow.getSpeciesName())))
// keep suggestions with score over .5
.filter(scored -> scored.getScore() >= .5)
// if record is not current, reduce score by 20%
.peek(scored -> scored.setScore((scored.getSuggestion().isCurrent() ? 1.0 : 0.8) * scored.getScore()))
// sort suggestions by score, descending
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// generate bestScore
......@@ -217,6 +235,10 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
// .peek(InMemoryTaxonomyDatabase::print)
// convert sorted Suggestion list back to Strings
.map(Suggestion::getSuggestion)
// keep species name
.map(SpeciesRow::getSpeciesName)
// keep distinct elements
.distinct()
// limit to maxSize
.limit(maxSize)
// convert to List<String>
......@@ -271,28 +293,32 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
BestScore bestScore = new BestScore();
return getAllGenusSpecies(genus).stream().filter(speciesRow -> species.equals(speciesRow.getSpeciesName()))
// keep subtaxa
.map(SpeciesRow::getSubtaxa)
// debug print
// .peek(InMemoryTaxonomyDatabase::print)
// keep candidates that are not blank
.filter(candidate -> StringUtils.isNotBlank(candidate))
// keep distinct
.distinct()
.filter(speciesRow -> StringUtils.isNotBlank(speciesRow.getSubtaxa()))
// convert to Suggestions
.map(candidate -> new Suggestion<String>(candidate, similarityScore(subtaxa, candidate)))
.map(speciesRow -> new Suggestion<>(speciesRow, similarityScore(subtaxa, speciesRow.getSubtaxa())))
// keep suggestions with decent score
.filter(scored -> scored.getScore() >= .5)
// .peek(InMemoryTaxonomyDatabase::print)
// if record is not current, reduce score by 20%
.peek(scored -> scored.setScore((scored.getSuggestion().isCurrent() ? 1.0 : 0.8) * scored.getScore()))
// sort by score descending
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// update best score
.peek(scored -> bestScore.update(scored.getScore()))
// .peek(InMemoryTaxonomyDatabase::print)
// keep only nice Suggestions
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : bestScore.getBestScore() * 0.8))
// debug print
// .peek(InMemoryTaxonomyDatabase::print)
// back to Strings
.map(Suggestion::getSuggestion)
// keep subtaxa
.map(SpeciesRow::getSubtaxa)
// keep distinct
.distinct()
// limit results to maxSize
.limit(maxSize).collect(Collectors.toList());
}
......
......@@ -22,10 +22,10 @@ package org.genesys2.gringlobal.taxonomy.component;
* @param <T> the generic type
*/
public class Suggestion<T> {
/** The suggestion. */
private T suggestion;
/** The score. */
private double score;
......@@ -49,6 +49,15 @@ public class Suggestion<T> {
return score;
}
/**
* Overwrites the score of this suggestion.
*
* @param newScore the value to store as this suggestion's score
*/
public void setScore(double newScore) {
this.score = newScore;
}
/**
* Gets the suggestion.
*
......@@ -57,4 +66,5 @@ public class Suggestion<T> {
public T getSuggestion() {
return suggestion;
}
}
......@@ -1044,4 +1044,9 @@ public class SpeciesRow implements Serializable {
public String getSubtaxaAuthority() {
// This accessor only delegates: it returns whatever getNameAuthority() returns.
return getNameAuthority();
}
/**
* Builds a one-line description of this row: species name, species authority, subtaxa and
* subtaxa authority separated by single spaces, followed by {@code " current="} and the
* value of {@code isCurrent()}.
*
* @return the concatenated description
*/
@Override
public String toString() {
return getSpeciesName() + " " + getSpeciesAuthority() + " " + getSubtaxa() + " " + getSubtaxaAuthority() + " current=" + isCurrent();
}
}
......@@ -15,8 +15,9 @@
*/
package org.genesys2.gringlobal.taxonomy;
import static org.hamcrest.Matchers.*;
import static org.junit.Assert.*;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertThat;
import java.io.File;
import java.io.FileInputStream;
......@@ -26,20 +27,22 @@ import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParseException;
import com.opencsv.CSVReader;
import org.genesys2.gringlobal.taxonomy.component.InMemoryTaxonomyDatabase;
import org.genesys2.gringlobal.taxonomy.component.TaxonomyException;
import org.genesys2.gringlobal.taxonomy.component.TaxonomyReader;
import org.genesys2.gringlobal.taxonomy.model.GenusRow;
import org.genesys2.gringlobal.taxonomy.model.SpeciesRow;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import com.opencsv.CSVReader;
public class InMemoryDatabaseTest {
private static InMemoryTaxonomyDatabase DATABASE = null;
@BeforeClass
public static void loadDatabase() throws UnsupportedEncodingException, FileNotFoundException, IOException, ParseException {
public static void loadDatabase() throws UnsupportedEncodingException, FileNotFoundException, IOException, ParseException, TaxonomyException {
File file = new File("../taxonomy_data", "TAXONOMY_GENUS.txt");
if (!file.exists()) {
return;
......@@ -53,6 +56,17 @@ public class InMemoryDatabaseTest {
DATABASE.registerGenus(genusRow.getGenusId(), genusRow.getGenusName());
}
}
file = new File("../taxonomy_data", "TAXONOMY_SPECIES.txt");
if (!file.exists()) {
return;
}
// read TAXONOMY_SPECIES.txt
try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 1)) {
SpeciesRow speciesRow = null;
while ((speciesRow = TaxonomyReader.toSpecies(reader.readNext())) != null) {
DATABASE.registerSpecies(speciesRow);
}
}
}
@AfterClass
......@@ -61,7 +75,7 @@ public class InMemoryDatabaseTest {
}
@Test
public void test1() {
public void testVigna() {
if (DATABASE == null) {
return;
}
......@@ -70,4 +84,13 @@ public class InMemoryDatabaseTest {
assertThat(DATABASE.containsGenus("VIGNA"), equalTo(false));
assertThat(DATABASE.findSimilarGenus("VIGNA", 5), contains("Vigna"));
}
/**
* Asks for up to 5 subtaxa similar to "dekindtiana" within Vigna unguiculata and expects
* exactly "subsp. dekindtiana" followed by "var. dekindtiana".
*/
@Test
public void testSmallerScoreForDeprecatedRecords() {
// NOTE(review): presumably DATABASE stays null when the taxonomy data files are missing;
// in that case this test returns early and passes without asserting anything — confirm.
if (DATABASE == null) {
return;
}
// contains(...) matches only if the result holds exactly these items in this order.
assertThat(DATABASE.findSimilarSubtaxa("Vigna", "unguiculata", "dekindtiana", 5), contains("subsp. dekindtiana", "var. dekindtiana"));
}
}
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment