Commit a4512040 authored by Matija Obreza
Browse files

SUBTAXA check (fixes #7)

parent 73375935
......@@ -24,6 +24,7 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.genesys2.gringlobal.taxonomy.model.SpeciesRow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -195,17 +196,31 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
}
BestScore bestScore = new BestScore();
// System.err.println("doo");
return getAllGenusSpecies(genus).stream().map(SpeciesRow::getSpeciesName).distinct()
return getAllGenusSpecies(genus).stream().map(SpeciesRow::getSpeciesName)
// keep distinct elements
.distinct()
// print
// .peek(InMemoryTaxonomyDatabase::print)
.map(candidate -> new Suggestion<String>(candidate, similarityScore(species, candidate))).filter(scored -> scored.getScore() >= .5)
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder())).peek(scored -> bestScore.update(scored.getScore()))
// .peek(InMemoryTaxonomyDatabase::print)
// .filter(scored -> scored.getScore() >= bestScore.getBestScore() * 0.8)
// convert each candidate to a Suggestion
.map(candidate -> new Suggestion<String>(candidate, similarityScore(species, candidate)))
// keep suggestions with score over .5
.filter(scored -> scored.getScore() >= .5)
// sort suggestions by score, descending
.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
// generate bestScore
.peek(scored -> bestScore.update(scored.getScore()))
// keep suggestions: when bestScore is 1.0 keep suggestions over 0.95 otherwise keep suggestions within
// 80%
.filter(scored -> scored.getScore() >= (bestScore.getBestScore() == 1.0 ? 0.95 : bestScore.getBestScore() * 0.8))
// .filter(scored -> scored.getScore() - bestScore.getBestScore() <= 2)
.peek(InMemoryTaxonomyDatabase::print).map(Suggestion::getSuggestion).distinct().limit(maxSize).collect(Collectors.toList());
// print
// .peek(InMemoryTaxonomyDatabase::print)
// convert sorted Suggestion list back to Strings
.map(Suggestion::getSuggestion)
// limit to maxSize
.limit(maxSize)
// convert to List<String>
.collect(Collectors.toList());
}
/**
......@@ -233,9 +248,52 @@ public class InMemoryTaxonomyDatabase implements TaxonomyDatabase {
return null;
}
return getAllGenusSpecies(genus).stream().filter(speciesRow -> species.equals(speciesRow.getSpeciesName()))
.peek(speciesRow -> LOG.trace("Species authority {}", speciesRow.getSpeciesAuthority())).findFirst().map(speciesRow -> speciesRow.getSpeciesAuthority())
.orElse(null);
return getAllGenusSpecies(genus).stream()
// keep rows with matching species
.filter(speciesRow -> species.equals(speciesRow.getSpeciesName()))
// debug print
.peek(speciesRow -> LOG.trace("Species authority {}", speciesRow.getSpeciesAuthority()))
// keep first match only
.findFirst()
// to String or null
.map(speciesRow -> speciesRow.getSpeciesAuthority()).orElse(null);
}
/**
 * Finds subtaxa recorded for the given genus and species that resemble the supplied subtaxa.
 *
 * <p>Candidates are the distinct, non-blank subtaxa of the genus rows whose species name equals
 * {@code species}. Every candidate is scored with {@code similarityScore}; candidates scoring below 0.5 are
 * dropped and the rest are ordered by score, highest first. The ordered candidates are then filtered against
 * the score tracked by {@code BestScore}: at least 0.95 when that score is exactly 1.0, otherwise at least 80%
 * of it.
 *
 * @param genus genus to search in
 * @param species species name the rows must match exactly
 * @param subtaxa subtaxa the candidates are compared with
 * @param maxSize maximum number of suggestions to return
 * @return suggested subtaxa, highest score first
 * @throws UnsupportedOperationException when the genus lookup has no entry for {@code genus}
 */
@Override
public List<String> findSimilarSubtaxa(String genus, String species, String subtaxa, int maxSize) {
	LOG.debug("Searching similar subtaxa for genus={} species={} subtaxa={}", genus, species, subtaxa);

	if (genusIdLookup.get(genus) == null) {
		throw new UnsupportedOperationException("Genus does not exist in database. Genus=" + genus);
	}

	// Phase 1: score every distinct, non-blank subtaxa of the matching species and rank them best-first.
	List<Suggestion<String>> ranked = getAllGenusSpecies(genus).stream()
		.filter(row -> species.equals(row.getSpeciesName()))
		.map(SpeciesRow::getSubtaxa)
		.filter(StringUtils::isNotBlank)
		.distinct()
		.map(name -> new Suggestion<String>(name, similarityScore(subtaxa, name)))
		.filter(ranking -> ranking.getScore() >= .5)
		.sorted(Comparator.comparing(Suggestion::getScore, Comparator.reverseOrder()))
		.collect(Collectors.toList());

	// Phase 2: walk the ranking in order; the tracker is updated before each element is tested, so the
	// element order matters here.
	BestScore tracker = new BestScore();
	return ranked.stream()
		.peek(ranking -> tracker.update(ranking.getScore()))
		.filter(ranking -> ranking.getScore() >= (tracker.getBestScore() == 1.0 ? 0.95 : tracker.getBestScore() * 0.8))
		.map(Suggestion::getSuggestion)
		.limit(maxSize)
		.collect(Collectors.toList());
}
}
......@@ -70,7 +70,7 @@ public class TaxonomyChecker {
*
* @param genus the genus
* @param species the species
* @param maxSize TODO
* @param maxSize maximum number of suggestions to return
* @return suggested fixes for genus or empty list if species is fine or when there are no suggestions.
*/
public List<String> suggestSpecies(String genus, String species, int maxSize) {
......@@ -111,4 +111,20 @@ public class TaxonomyChecker {
return database.getSpeciesAuthority(genus, species);
}
/**
 * Find suggestions for SUBTAXA.
 *
 * @param genus must be valid genus in the database
 * @param species must be valid species within genus
 * @param subtaxa current subtaxa; when null or blank no suggestions are produced
 * @param maxSize maximum number of suggestions to return
 * @return suggested fixes for subtaxa, or an empty list when subtaxa is blank, when the database does not
 *         contain the genus/species combination, or when there are no suggestions.
 */
public List<String> suggestSubtaxa(String genus, String species, String subtaxa, int maxSize) {
	// A blank subtaxa gives nothing to compare candidates with.
	if (StringUtils.isBlank(subtaxa)) {
		return Collections.emptyList();
	}
	// Only query for similar subtaxa when the database reports the genus/species pair as known.
	if (!database.containsSpecies(genus, species)) {
		return Collections.emptyList();
	}
	return database.findSimilarSubtaxa(genus, species, subtaxa, maxSize);
}
}
......@@ -54,7 +54,7 @@ public interface TaxonomyDatabase {
*
* @param genus valid genus
* @param species species
* @param maxSize TODO
* @param maxSize maximum number of suggestions to return
* @return list of suggestions, ordered by preference (best first), never null.
*/
List<String> findSimilarSpecies(String genus, String species, int maxSize);
......@@ -68,4 +68,15 @@ public interface TaxonomyDatabase {
*/
String getSpeciesAuthority(String genus, String species);
/**
 * Suggest a list of subtaxa similar to the provided subtaxa.
 *
 * @param genus valid genus
 * @param species valid species
 * @param subtaxa current subtaxa
 * @param maxSize maximum number of suggestions to return
 * @return list of suggestions, ordered by preference (best first), never null
 */
List<String> findSimilarSubtaxa(String genus, String species, String subtaxa, int maxSize);
}
......@@ -156,6 +156,9 @@ public class TaxonomyReader {
speciesRow.setOwnedDate(toDate(row[41]));
speciesRow.setOwnedById(row[42]);
// MCPD SUBTAXA
speciesRow.setSubtaxa(speciesRow.toSubtaxa());
return speciesRow;
}
......
......@@ -20,6 +20,7 @@ import java.util.Date;
import com.opencsv.bean.CsvBind;
// TODO: Auto-generated Javadoc
/**
* "TAXONOMY_SPECIES_ID", "CURRENT_TAXONOMY_SPECIES_ID", "NOMEN_NUMBER", "IS_SPECIFIC_HYBRID", "SPECIES_NAME",
* "SPECIES_AUTHORITY", "IS_SUBSPECIFIC_HYBRID", "SUBSPECIES_NAME", "SUBSPECIES_AUTHORITY", "IS_VARIETAL_HYBRID",
......@@ -40,175 +41,178 @@ public class SpeciesRow implements Serializable {
/** The species id. */
@CsvBind
private Long speciesId;
/** The current species id. */
@CsvBind
private Long currentSpeciesId;
/** The nomen number. */
@CsvBind
private Long nomenNumber;
/** The specific hybrid. */
@CsvBind
private Boolean specificHybrid;
/** The species name. */
@CsvBind
private String speciesName;
/** The species authority. */
@CsvBind
private String speciesAuthority;
/** The subspecific hybrid. */
@CsvBind
private Boolean subspecificHybrid;
/** The subspecies name. */
@CsvBind
private String subspeciesName;
/** The subspecies authority. */
@CsvBind
private String subspeciesAuthority;
/** The varietal hybrid. */
@CsvBind
private Boolean varietalHybrid;
/** The variety name. */
@CsvBind
private String varietyName;
/** The variety authority. */
@CsvBind
private String varietyAuthority;
/** The subvarietal hybrid. */
@CsvBind
private Boolean subvarietalHybrid;
/** The subvariety name. */
@CsvBind
private String subvarietyName;
/** The subvariety authority. */
@CsvBind
private String subvarietyAuthority;
/** The forma hybrid. */
@CsvBind
private Boolean formaHybrid;
/** The forma rank type. */
@CsvBind
private String formaRankType;
/** The forma name. */
@CsvBind
private String formaName;
/** The forma authority. */
@CsvBind
private String formaAuthority;
/** The genus id. */
@CsvBind
private Long genusId;
/** The priority site1. */
@CsvBind
private String prioritySite1;
/** The priority site2. */
@CsvBind
private String prioritySite2;
/** The curator1 id. */
@CsvBind
private Long curator1Id;
/** The curator2 id. */
@CsvBind
private Long curator2Id;
/** The restriction code. */
@CsvBind
private String restrictionCode;
/** The life form code. */
@CsvBind
private String lifeFormCode;
/** The common fertilization code. */
@CsvBind
private String commonFertilizationCode;
/** The name pending. */
@CsvBind
private Boolean namePending;
/** The synonym code. */
@CsvBind
private String synonymCode;
/** The verifier id. */
@CsvBind
private Long verifierId;
/** The name verified date. */
@CsvBind
private Date nameVerifiedDate;
/** The name. */
@CsvBind
private String name;
/** The name authority. */
@CsvBind
private String nameAuthority;
/** The protologue. */
@CsvBind
private String protologue;
/** The note. */
@CsvBind
private String note;
/** The site note. */
@CsvBind
private String siteNote;
/** The alternate name. */
@CsvBind
private String alternateName;
/** The created date. */
@CsvBind
private Date createdDate;
/** The created by id. */
@CsvBind
private String createdById;
/** The modified date. */
@CsvBind
private Date modifiedDate;
/** The modified by id. */
@CsvBind
private String modifiedById;
/** The owned date. */
@CsvBind
private Date ownedDate;
/** The owned by id. */
@CsvBind
private String ownedById;
/** MCPD SUBTAXA is not part of GRIN-Global taxonomy data */
private String subtaxa;
/**
* Gets the species id.
*
......@@ -991,4 +995,53 @@ public class SpeciesRow implements Serializable {
public boolean isCurrent() {
	// The row is current when its own id is also its "current species" id.
	final Long ownId = speciesId;
	return ownId.equals(currentSpeciesId);
}
/**
 * Returns the MCPD SUBTAXA value stored on this row.
 *
 * @return the stored subtaxa, as last assigned through {@link #setSubtaxa(String)}
 */
public String getSubtaxa() {
	return subtaxa;
}
/**
 * Sets the MCPD SUBTAXA. The value is stored as given; it is not derived or validated here.
 *
 * @param subtaxa the new subtaxa
 */
public void setSubtaxa(String subtaxa) {
this.subtaxa = subtaxa;
}
/**
 * Builds the MCPD SUBTAXA from the infraspecific names of this row. The result should be like taking the
 * {@link #name} field and removing Genus and species.
 *
 * <p>Only the first non-null name is used, checked in this order: forma, subvariety, variety, subspecies.
 *
 * @return the subtaxa, or {@code null} when none of the four names is set
 */
public String toSubtaxa() {
	final String generated;
	if (formaName != null) {
		// NOTE(review): unlike the other ranks, forma is returned without a rank prefix (formaRankType is
		// not used here) - confirm this is intended.
		generated = formaName;
	} else if (subvarietyName != null) {
		generated = "subvar. " + subvarietyName;
	} else if (varietyName != null) {
		generated = "var. " + varietyName;
	} else if (subspeciesName != null) {
		generated = "subsp. " + subspeciesName;
	} else {
		generated = null;
	}
	return generated;
}
/**
 * Returns MCPD SUBTAUTHOR. This is an alias: the value is whatever {@link #getNameAuthority()} returns.
 *
 * @return {@link #getNameAuthority()}
 */
public String getSubtaxaAuthority() {
	final String authority = getNameAuthority();
	return authority;
}
}
......@@ -31,6 +31,7 @@ import java.util.Arrays;
import com.opencsv.CSVReader;
import org.apache.commons.lang3.StringUtils;
import org.genesys2.gringlobal.taxonomy.component.TaxonomyReader;
import org.genesys2.gringlobal.taxonomy.model.SpeciesRow;
import org.junit.Test;
......@@ -39,7 +40,7 @@ import org.junit.Test;
* Testing the TAXONOMY_SPECIES.txt
*/
public class SpeciesTest {
/** The Constant TAXONOMY_SPECIES_HEADERS. */
private static final String[] TAXONOMY_SPECIES_HEADERS = { "TAXONOMY_SPECIES_ID", "CURRENT_TAXONOMY_SPECIES_ID", "NOMEN_NUMBER", "IS_SPECIFIC_HYBRID", "SPECIES_NAME",
"SPECIES_AUTHORITY", "IS_SUBSPECIFIC_HYBRID", "SUBSPECIES_NAME", "SUBSPECIES_AUTHORITY", "IS_VARIETAL_HYBRID", "VARIETY_NAME", "VARIETY_AUTHORITY",
......@@ -50,7 +51,7 @@ public class SpeciesTest {
/** The Constant TAXONOMY_SPECIES_CSV. */
private static final String TAXONOMY_SPECIES_CSV = "TAXONOMY_SPECIES.txt";
/** The Constant RESOURCES_PATH. */
private static final String RESOURCES_PATH = "/taxonomy_data_samples/";
......@@ -106,13 +107,12 @@ public class SpeciesTest {
public void read1stSpeciesRow() throws FileNotFoundException, IOException, ParseException {
	// NOTE(review): the second argument is presumably the number of lines to skip (the header) - confirm
	// against open(..).
	try (CSVReader reader = open(TAXONOMY_SPECIES_CSV, 1)) {
		SpeciesRow speciesRow = TaxonomyReader.toSpecies(reader.readNext());

		assertThat("1st data row does not match", speciesRow, notNullValue());
		// Upper-case L suffix: a lower-case 'l' is easily misread as the digit 1.
		assertThat(speciesRow.getGenusId(), equalTo(7085L));
	}
}
/**
* Scan entire CSV. This test only runs if www.ars-grin.gov/~dbmuke/cgi-bin/gringlobal/1.9.6.2/taxonomy_data.cab is unpacked to taxonomy_data folder.
......@@ -124,26 +124,29 @@ public class SpeciesTest {
*/
@Test
public void fullScan() throws UnsupportedEncodingException, FileNotFoundException, IOException, ParseException {
File file = new File("taxonomy_data", "TAXONOMY_SPECIES.txt");
File file = new File("../taxonomy_data", "TAXONOMY_SPECIES.txt");
if (!file.exists()) {
return;
}
try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 0)) {
String[] headerRow = reader.readNext();
// Dumb BOM at the start of a UTF8 file!
headerRow[0]=TAXONOMY_SPECIES_HEADERS[0];
headerRow[0] = TAXONOMY_SPECIES_HEADERS[0];
assertThat("Header row must have more than 1 element", Arrays.asList(headerRow), not(hasSize(1)));
assertThat("Header row does not match", Arrays.asList(headerRow), contains(TAXONOMY_SPECIES_HEADERS));
SpeciesRow speciesRow = null;
while ((speciesRow = TaxonomyReader.toSpecies(reader.readNext())) != null) {
// System.out.println(speciesRow.isCurrent() + " " + speciesRow.getSpeciesAuthority() + " " + speciesRow.getName());
// System.out.println(speciesRow.isCurrent() + " " + speciesRow.getSpeciesAuthority() + " " +
// speciesRow.getName());
assertThat(speciesRow, not(nullValue()));
assertThat(speciesRow.getSpeciesId(), not(nullValue()));
assertThat(speciesRow.getGenusId(), not(nullValue()));
assertThat("SUBTAXA messed up", speciesRow.getName(), endsWith(StringUtils.defaultIfBlank(speciesRow.getSubtaxa(), "")));
// System.err.println(speciesRow.getName() + " << " + speciesRow.toSubtaxa());
}
}
......
......@@ -64,6 +64,12 @@ public class GGTC {
private static final String HEADER_SPAUTHOR = "SPAUTHOR";
private static final String HEADER_SPAUTHOR_CHECK = "SPAUTHOR_check";
private static final String HEADER_SUBTAXA = "SUBTAXA";
private static final String HEADER_SUBTAXA_CHECK = "SUBTAXA_check";
private static final String HEADER_SUBTAUTHOR = "SUBTAUTHOR";
private static final String HEADER_SUBTAUTHOR_CHECK = "SUBTAUTHOR_check";
/** The Constant LOG. */
private final static Logger LOG = LoggerFactory.getLogger(GGTC.class);
......@@ -87,7 +93,7 @@ public class GGTC {
if ("-v".equals(arg)) {
// Increase logging level
org.apache.log4j.Logger logger = LogManager.getLogger("org.genesys2");
logger.setLevel(nextLevel(logger.getLevel().toInt()));
logger.setLevel(nextLevel(logger.getLevel()));
LOG.warn("Increasing loglevel to {}", logger.getLevel());
} else {
// Files
......@@ -142,23 +148,19 @@ public class GGTC {
}
}
private static Level nextLevel(int level) {
if (level == Level.OFF_INT) {
return Level.FATAL;
} else if (level == Level.FATAL_INT) {
private static Level nextLevel(Level level) {
if (level.equals(Level.FATAL)) {
return Level.ERROR;
} else if (level == Level.ERROR_INT) {
} else if (level.equals(Level.ERROR)) {
return Level.WARN;
} else if (level == Level.WARN_INT) {
} else if (level.equals(Level.WARN)) {
return Level.INFO;
} else if (level == Level.INFO_INT) {
} else if (level.equals(Level.INFO)) {
return Level.DEBUG;
} else if (level == Level.DEBUG_INT) {
} else if (level.equals(Level.DEBUG)) {
return Level.TRACE;
} else if (level == Level.TRACE_INT) {
} else
return Level.ALL;
}
return Level.INFO;
}