Commit 6e9c5e56 authored by Matija Obreza
Browse files

Implemented checking for hybrid species (addresses #12)

parent 31be18d3
......@@ -22,14 +22,15 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.genesys2.gringlobal.taxonomy.model.SpeciesRow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// TODO: Auto-generated Javadoc
/**
* {@link List} based in-memory "database". NOT THREAD-SAFE!
*/
......@@ -42,6 +43,15 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
private static final String HYBRID_SPECIES_PREFIX = "x ";
/** Marker placed between the left and right species names when a hybrid name is assembled. */
private static final String HYBRID_MARKER = " x ";
/** Literal hybrid markers (lower- and upper-case "x" between single spaces) searched for when detecting a hybrid name. */
private static final String[] HYBRID_MARKER_ALT = { " x ", " X " };
/** Pattern that splits a hybrid species name into its left and right parts: an "x" or "X" surrounded by whitespace. */
private static final Pattern HYBRID_MARKER_REGEXP = Pattern.compile("\\s+[xX]\\s+");
/** The genus id lookup. */
private Map<Long, String> genusIdLookup = new HashMap<>();
......@@ -209,15 +219,28 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
return false;
}
final boolean isHybrid = StringUtils.startsWith(species, HYBRID_SPECIES_PREFIX);
final boolean isSpecificHybrid = isNameSpecificHybrid(species);
final boolean isHybrid = isNameHybrid(species);
return getAllGenusSpecies(genus).stream().anyMatch(speciesRow -> {
if (isHybrid) {
return true == speciesRow.getSpecificHybrid() && StringUtils.equals(species.substring(HYBRID_SPECIES_PREFIX.length()), speciesRow.getSpeciesName());
} else {
return StringUtils.equals(species, speciesRow.getSpeciesName());
if (isHybrid) {
String[] split = HYBRID_MARKER_REGEXP.split(species);
String speciesLeft = split[0];
String speciesRight = split[1];
if (LOG.isTraceEnabled()) {
LOG.trace("Species {} is a hybrid of {} and {}", species, speciesLeft, speciesRight);
}
});
// Check left and right
return containsSpecies(genus, speciesLeft) && containsSpecies(genus, speciesRight);
} else {
return getAllGenusSpecies(genus).stream().anyMatch(speciesRow -> {
if (isSpecificHybrid) {
return true == speciesRow.getSpecificHybrid() && StringUtils.equals(species.substring(HYBRID_SPECIES_PREFIX.length()), speciesRow.getSpeciesName());
} else {
return StringUtils.equals(species, speciesRow.getSpeciesName());
}
});
}
}
/*
......@@ -237,6 +260,94 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
throw new UnsupportedOperationException("Genus does not exist in database. Genus=" + genus);
}
final boolean isHybrid = isNameHybrid(species);
if (isHybrid) {
String[] split = HYBRID_MARKER_REGEXP.split(species);
String speciesLeft = split[0];
String speciesRight = split[1];
if (LOG.isTraceEnabled()) {
LOG.trace("Species {} is a hybrid of {} and {}", species, speciesLeft, speciesRight);
}
List<Suggestion<String>> lefts = makeSuggestions(genus, speciesLeft).limit(maxSize).map(suggestion -> new Suggestion<>(suggestion.getSuggestion().getSpeciesName(), suggestion.getScore()))
.collect(Collectors.toList());
List<Suggestion<String>> rights = makeSuggestions(genus, speciesRight).limit(maxSize)
.map(suggestion -> new Suggestion<>(suggestion.getSuggestion().getSpeciesName(), suggestion.getScore())).collect(Collectors.toList());
// Inject the provided species name when the best counterpart is not a 100% match
if (lefts.size() == 0 && rights.size() > 0 && rights.get(0).getScore() < 1) {
lefts.add(new Suggestion<String>(speciesLeft, 0.1));
} else if (rights.size() == 0 && lefts.size() > 0 && lefts.get(0).getScore() < 1) {
rights.add(new Suggestion<String>(speciesRight, 0.1));
}
if (LOG.isTraceEnabled()) {
LOG.trace("Left for {} is {}", speciesLeft, lefts);
LOG.trace("Right for {} is {}", speciesRight, rights);
}
return crossJoinSpecies(lefts, rights)
// stream results
.stream()
// order by score
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// convert sorted Suggestion list back to Strings
.map(Suggestion::getSuggestion)
// keep distinct elements
.distinct()
// limit to maxSize
.limit(maxSize)
// convert to List<String>
.collect(Collectors.toList());
}
return makeSuggestions(genus, species)
// print
// .peek(InMemoryTaxonomyDatabase::print)
// convert sorted Suggestion list back to Strings
.map(Suggestion::getSuggestion)
// keep species name
.map(SpeciesRow::getSpeciesName)
// keep distinct elements
.distinct()
// limit to maxSize
.limit(maxSize)
// convert to List<String>
.collect(Collectors.toList());
}
/**
 * Pair every left suggestion with every right suggestion to form hybrid names. The joined name is the left name,
 * the hybrid marker and the right name; the joined score is the product of both scores.
 *
 * @param lefts suggestions for the left part of the hybrid
 * @param rights suggestions for the right part of the hybrid
 * @return list of cross-joined suggestions, ordered left-major following the input lists
 */
private List<Suggestion<String>> crossJoinSpecies(List<Suggestion<String>> lefts, List<Suggestion<String>> rights) {
	return lefts.stream()
		// every left is combined with every right, in encounter order
		.flatMap(left -> rights.stream().map(right -> {
			final String hybridName = left.getSuggestion().concat(HYBRID_MARKER).concat(right.getSuggestion());
			final Suggestion<String> joined = new Suggestion<>(hybridName, left.getScore() * right.getScore());
			if (LOG.isTraceEnabled()) {
				LOG.trace("Cross-join '{}' with '{}' result={}", left, right, joined);
			}
			return joined;
		}))
		// keep returning a mutable ArrayList, as before
		.collect(Collectors.toCollection(ArrayList::new));
}
/**
* Return a stream of Suggestion for genus and species
*
* @param genus genus
* @param species species
* @return stream of best Suggestions
*/
private Stream<Suggestion<SpeciesRow>> makeSuggestions(final String genus, final String species) {
BestScore bestScore = new BestScore();
return getAllGenusSpecies(genus).stream()
......@@ -254,19 +365,7 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
.peek(scored -> bestScore.update(scored.getScore()))
// keep suggestions: when bestScore is 1.0 keep suggestions over 0.95 otherwise keep suggestions within
// 80%
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : (bestScore.getBestScore() * 0.8)))
// print
// .peek(InMemoryTaxonomyDatabase::print)
// convert sorted Suggestion list back to Strings
.map(Suggestion::getSuggestion)
// keep species name
.map(SpeciesRow::getSpeciesName)
// keep distinct elements
.distinct()
// limit to maxSize
.limit(maxSize)
// convert to List<String>
.collect(Collectors.toList());
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : (bestScore.getBestScore() * 0.8)));
}
/**
......@@ -294,12 +393,12 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
return null;
}
final boolean isHybrid = StringUtils.startsWith(species, HYBRID_SPECIES_PREFIX);
final boolean isSpecificHybrid = isNameSpecificHybrid(species);
return getAllGenusSpecies(genus).stream()
// keep rows with matching species
.filter(speciesRow -> {
if (isHybrid) {
if (isSpecificHybrid) {
return true == speciesRow.getSpecificHybrid() && StringUtils.equals(species.substring(HYBRID_SPECIES_PREFIX.length()), speciesRow.getSpeciesName());
} else {
return StringUtils.equals(species, speciesRow.getSpeciesName());
......@@ -313,6 +412,30 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
.map(speciesRow -> speciesRow.getSpeciesAuthority()).orElse(null);
}
/**
 * Tell whether the name carries the specific-hybrid prefix "x ".
 *
 * @param name the name to check, may be null
 * @return true if name starts with the prefix; false otherwise, including for a null name
 */
private boolean isNameSpecificHybrid(String name) {
	if (name == null) {
		return false;
	}
	return name.startsWith(HYBRID_SPECIES_PREFIX);
}
/**
 * Check if name denotes a hybrid of two species: it contains " x " (or " X ") and has a non-empty name on both
 * sides of the marker.
 *
 * Callers split the name with HYBRID_MARKER_REGEXP and read the first two parts, so a name with a dangling marker
 * (e.g. "sesban x ") must not be reported as a hybrid: the split would yield fewer than two usable parts.
 *
 * @param name the name to check, may be null
 * @return true when name denotes a hybrid with both a left and a right part
 */
private boolean isNameHybrid(String name) {
	for (String marker : HYBRID_MARKER_ALT) {
		if (StringUtils.contains(name, marker)) {
			// limit -1 keeps trailing empty strings, so a missing right part shows up as ""
			final String[] parts = HYBRID_MARKER_REGEXP.split(name, -1);
			return parts.length >= 2 && !parts[0].isEmpty() && !parts[1].isEmpty();
		}
	}
	return false;
}
/*
* (non-Javadoc)
*
......@@ -422,12 +545,12 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
@Override
public List<SpeciesRow> findSpeciesRow(String genus, String species, String subtaxa) {
final boolean isHybrid = StringUtils.startsWith(species, HYBRID_SPECIES_PREFIX);
final boolean isSpecificHybrid = isNameSpecificHybrid(species);
return getAllGenusSpecies(genus).stream()
// keep rows with matching species
.filter(speciesRow -> {
if (isHybrid) {
if (isSpecificHybrid) {
return true == speciesRow.getSpecificHybrid() && StringUtils.equals(species.substring(HYBRID_SPECIES_PREFIX.length()), speciesRow.getSpeciesName());
} else {
return StringUtils.equals(species, speciesRow.getSpeciesName());
......
......@@ -67,4 +67,8 @@ public class Suggestion<T> {
return suggestion;
}
@Override
public String toString() {
	// Renders as: score=<score> suggestion=<suggestion>
	final StringBuilder text = new StringBuilder("score=");
	text.append(score);
	text.append(" suggestion=");
	text.append(suggestion);
	return text.toString();
}
}
......@@ -127,7 +127,7 @@ public class TaxonomyCheckerTest extends DatabaseTest {
* Sorghum x almum
*/
@Test
public void testSorghumXalmum() {
public void testSorghum_Xalmum() {
if (CHECKER == null) {
return;
}
......@@ -142,7 +142,7 @@ public class TaxonomyCheckerTest extends DatabaseTest {
* Sorghum x almum Parodi
*/
@Test
public void testSorghumXalmumParodi() {
public void testSorghum_XalmumParodi() {
if (CHECKER == null) {
return;
}
......@@ -155,6 +155,57 @@ public class TaxonomyCheckerTest extends DatabaseTest {
}
/**
 * ETH013: GENUS=Sesbania and SPECIES=sesban x goetzei. Both parents and the hybrid itself must be reported as
 * valid species.
 */
@Test
public void testSesbania_sesbanXgoetzei() {
	// Nothing to verify when no checker is available
	if (CHECKER != null) {
		final String genus = "Sesbania";
		final boolean sesbanValid = DATABASE.containsSpecies(genus, "sesban");
		assertThat("Sesbania sesban is a valid species", sesbanValid, equalTo(true));
		final boolean goetzeiValid = DATABASE.containsSpecies(genus, "goetzei");
		assertThat("Sesbania goetzei is a valid species", goetzeiValid, equalTo(true));
		final boolean hybridValid = DATABASE.containsSpecies(genus, "sesban x goetzei");
		assertThat("Sesbania sesban x goetzei is a valid hybrid", hybridValid, equalTo(true));
	}
}
/**
 * ETH013: GENUS=Sesbania and SPECIES=grandiflora x goetzeii. The suggested hybrid must be
 * "grandiflora x goetzei" for both input spellings.
 */
@Test
public void testSesbania_grandifloraXgoetzeii() {
	// Nothing to verify when no checker is available
	if (CHECKER != null) {
		final String genus = "Sesbania";
		final String expectedHybrid = "grandiflora x goetzei";
		testMisspelledHybrid(genus, "grandiflora", "goetzei", expectedHybrid);
		testMisspelledHybrid(genus, "grandiflor", "goetzeii", expectedHybrid);
	}
}
/**
 * Assert that the hybrid "species1 x species2" yields at least one suggestion and that the single best suggestion
 * is the expected hybrid name.
 *
 * @param genus genus of both parents
 * @param species1 left part of the hybrid as entered
 * @param species2 right part of the hybrid as entered
 * @param correctHybrid the hybrid name that must be suggested
 */
private void testMisspelledHybrid(String genus, String species1, String species2, String correctHybrid) {
	final String hybridName = species1 + " x " + species2;

	// Disabled checks, kept for reference:
	// assertThat(genus + " " + species1 + " is a valid species", DATABASE.containsSpecies(genus, species1), equalTo(true));
	// assertThat(genus + " " + species2 + " is not a valid species", DATABASE.containsSpecies(genus, species2), equalTo(false));
	// assertThat("Must have suggestions for " + species1, CHECKER.suggestSpecies(genus, species1, 10), hasSize(greaterThan(0)));
	// assertThat("Must have suggestions for " + species2, CHECKER.suggestSpecies(genus, species2, 10), hasSize(greaterThan(0)));
	// assertThat(genus + " " + hybridName + " is not a valid hybrid", DATABASE.containsSpecies(genus, hybridName), equalTo(false));

	// Up to ten suggestions requested: there must be at least one
	assertThat("At least one suggestion must be made", CHECKER.suggestSpecies(genus, hybridName, 10), hasSize(greaterThan(0)));
	// Only the best suggestion requested: it must be the correct hybrid
	assertThat("Suggestion must be " + correctHybrid, CHECKER.suggestSpecies(genus, hybridName, 1), contains(correctHybrid));
}
/**
 * Acaena magellanica x tenera: the exact name must produce no suggestions, while misspellings of the left parent
 * must be corrected to "magellanica x tenera".
 */
@Test
public void testAcaena_magellanicaXtenera() {
	// Nothing to verify when no checker is available
	if (CHECKER != null) {
		final String genus = "Acaena";
		final String expectedHybrid = "magellanica x tenera";
		assertThat("No suggestion must be made (no tenera suggestions)", CHECKER.suggestSpecies(genus, "magellanica x tenera", 10), hasSize(0));
		testMisspelledHybrid(genus, "magellanic", "tenera", expectedHybrid);
		testMisspelledHybrid(genus, "magellanicaa", "tenera", expectedHybrid);
	}
}
}
......@@ -295,11 +295,11 @@ public class GGTC {
}
LOG.debug("Adding {}", HEADER_GRINTAX_SPECIESCURRENT);
outputHeaders.add(outputHeaders.indexOf(HEADER_SPECIES) + 1, HEADER_GRINTAX_SPECIESCURRENT);
outputMapping.add(outputHeaders.indexOf(HEADER_SPECIES) + 1, null);
outputHeaders.add(outputHeaders.indexOf(HEADER_SPECIES_CHECK) + 1, HEADER_GRINTAX_SPECIESCURRENT);
outputMapping.add(outputHeaders.indexOf(HEADER_SPECIES_CHECK) + 1, null);
LOG.debug("Adding {}", HEADER_GRINTAX_SPECIESID);
outputHeaders.add(outputHeaders.indexOf(HEADER_SPECIES) + 1, HEADER_GRINTAX_SPECIESID);
outputMapping.add(outputHeaders.indexOf(HEADER_SPECIES) + 1, null);
outputHeaders.add(outputHeaders.indexOf(HEADER_SPECIES_CHECK) + 1, HEADER_GRINTAX_SPECIESID);
outputMapping.add(outputHeaders.indexOf(HEADER_SPECIES_CHECK) + 1, null);
}
LOG.info("Output headers: {}", outputHeaders);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment