Commit aa4e6c04 authored by Matija Obreza
Browse files

Updated for new filenames in USDA Taxonomy dump

parent e3654299
......@@ -136,8 +136,8 @@ public class ApplicationConfig {
}
// The two required files
final File genusFile = new File(dataFolder, "TAXONOMY_GENUS.txt");
final File speciesFile = new File(dataFolder, "TAXONOMY_SPECIES.txt");
final File genusFile = new File(dataFolder, "taxonomy_genus.txt");
final File speciesFile = new File(dataFolder, "taxonomy_species.txt");
if (!genusFile.exists() || !speciesFile.exists()) {
LOG.warn("Taxonomy data not provided in {}, starting download", dataFolder.getAbsolutePath());
......
......@@ -95,18 +95,18 @@ public class TaxonomyProcessServiceImpl implements ProcessService {
public void readDatabase(final String path) throws UnsupportedEncodingException, FileNotFoundException, IOException, ParseException, TaxonomyException {
final File rootDir = new File(path);
LOG.info("Loading TAXONOMY_GENUS.txt");
// read TAXONOMY_GENUS.txt
try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(new File(rootDir, "TAXONOMY_GENUS.txt")), "UTF-8"), 1)) {
LOG.info("Loading taxonomy_genus.txt");
// read taxonomy_genus.txt
try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(new File(rootDir, "taxonomy_genus.txt")), "UTF-8"), 1)) {
GenusRow genusRow = null;
while ((genusRow = TaxonomyReader.toGenus(reader.readNext())) != null) {
taxonomyDatabase.registerGenus(genusRow.getGenusId(), genusRow.getGenusName());
}
}
LOG.info("Loading TAXONOMY_SPECIES.txt");
// read TAXONOMY_SPECIES.txt
try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(new File(rootDir, "TAXONOMY_SPECIES.txt")), "UTF-8"), 1)) {
LOG.info("Loading taxonomy_species.txt");
// read taxonomy_species.txt
try (CSVReader reader = TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(new File(rootDir, "taxonomy_species.txt")), "UTF-8"), 1)) {
SpeciesRow speciesRow = null;
while ((speciesRow = TaxonomyReader.toSpecies(reader.readNext())) != null) {
taxonomyDatabase.registerSpecies(speciesRow);
......
package org.genesys.grin;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.taxonomy.checker.web.config.ApplicationConfig;
import org.genesys.taxonomy.download.TaxonomyDownloader;
import org.genesys.taxonomy.gringlobal.component.TaxonomyReader;
import org.genesys.taxonomy.gringlobal.model.AuthorRow;
import org.genesys.taxonomy.gringlobal.model.GenusRow;
import org.genesys.taxonomy.gringlobal.model.SpeciesRow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.opencsv.CSVReader;
/**
 * Ad-hoc consistency checks over the GRIN-Taxonomy text dump stored in
 * {@code data/grintaxa}.
 *
 * <p>
 * Each {@code doit*} method makes sure the files it needs are present (downloading
 * and unpacking the current dump when they are not), then logs a warning for every
 * inconsistency it finds:
 * </p>
 * <ul>
 * <li>{@link #doit()} compares the plain and HTML author names in
 * {@code taxonomy_author.txt};</li>
 * <li>{@link #doitFamilyAuth()}, {@link #doitGenusAuth()} and
 * {@link #doitSpeciesAuth()} report authority strings that mention an author
 * short name not listed in {@code taxonomy_author.txt}.</li>
 * </ul>
 */
public class WhatsWrong {

	private static final Logger LOG = LoggerFactory.getLogger(WhatsWrong.class);

	/** Folder the taxonomy text files are read from and downloaded to. */
	private static final String DATA_FOLDER = "data/grintaxa";

	/** Separators between individual author names within one authority string. */
	private static final Pattern AUTHORITY_SEPARATORS = Pattern.compile(",|\\sex\\s|&|\\)");

	/** Parentheses and annotations that are not part of an author name. */
	private static final Pattern AUTHORITY_NOISE = Pattern.compile("[\\(\\)]|(et al\\.)|(nom\\. inval\\.)|(orth\\. var\\.)|(sensu)");

	/**
	 * Runs one of the checks and logs any failure instead of propagating it.
	 *
	 * @param args optional; {@code args[0]} selects the check to run:
	 *        {@code author}, {@code family}, {@code genus} or {@code species}.
	 *        Without arguments the family authority check runs.
	 */
	public static void main(String[] args) {
		final String check = args != null && args.length > 0 ? args[0] : "family";
		try {
			switch (check) {
			case "author":
				doit();
				break;
			case "family":
				doitFamilyAuth();
				break;
			case "genus":
				doitGenusAuth();
				break;
			case "species":
				doitSpeciesAuth();
				break;
			default:
				LOG.error("Unknown check '{}', expected one of: author, family, genus, species", check);
				break;
			}
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
	}

	/**
	 * Warns about every row of {@code taxonomy_author.txt} whose HTML name (column 3),
	 * once HTML-unescaped, differs from the plain name (column 1).
	 *
	 * @throws IOException if the data cannot be downloaded or read
	 */
	public static void doit() throws IOException {
		final File dataFolder = prepareDataFolder();
		// Lower-case file name, matching the other checks in this class.
		final File taxonomyAuthor = new File(dataFolder, "taxonomy_author.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor);

		try (CSVReader reader = openReader(taxonomyAuthor)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				if (row.length < 4) {
					LOG.warn("Skipping author row with only {} columns", row.length);
					continue;
				}
				final String name = row[1];
				final String htmlName = row[3];
				if (name == null || name.equals(htmlName)) {
					// Nothing to compare, or both forms already agree.
					continue;
				}
				final String unescaped = StringEscapeUtils.unescapeHtml4(htmlName);
				if (unescaped != null && !unescaped.equals(name)) {
					LOG.warn("Author {} = {} should be {} in TAXONOMY_AUTHOR_ID={}", name, htmlName, unescaped, row[0]);
				}
			}
		}
	}

	/**
	 * Checks the authority column (6) of every row in {@code taxonomy_family.txt}
	 * against the known author short names. Each distinct authority string is
	 * checked only once.
	 *
	 * @throws IOException if the data cannot be downloaded or read
	 * @throws ParseException if an author row cannot be parsed
	 */
	public static void doitFamilyAuth() throws IOException, ParseException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, "taxonomy_author.txt");
		final File familyFile = new File(dataFolder, "taxonomy_family.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor, familyFile);

		final Map<String, AuthorRow> authors = loadAuthors(taxonomyAuthor);

		final Set<String> seenAuthorities = new HashSet<>();
		try (CSVReader reader = openReader(familyFile)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				if (row.length < 7) {
					LOG.warn("Skipping family row with only {} columns", row.length);
					continue;
				}
				final String authorName = row[6];
				if (seenAuthorities.contains(authorName)) {
					continue;
				}
				checkAuthority(authors, "FAMILY_AUTHORITY", Long.parseLong(row[0]), row[5], authorName);
				seenAuthorities.add(authorName);
			}
		}
	}

	/**
	 * Checks the genus authority of every row in {@code taxonomy_genus.txt} against
	 * the known author short names. Each distinct authority string is checked only
	 * once.
	 *
	 * @throws IOException if the data cannot be downloaded or read
	 * @throws ParseException if an author or genus row cannot be parsed
	 */
	public static void doitGenusAuth() throws IOException, ParseException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, "taxonomy_author.txt");
		final File genusFile = new File(dataFolder, "taxonomy_genus.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor, genusFile);

		final Map<String, AuthorRow> authors = loadAuthors(taxonomyAuthor);

		final Set<String> seenAuthorities = new HashSet<>();
		try (CSVReader reader = openReader(genusFile)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				final GenusRow genusRow = TaxonomyReader.toGenus(row);
				if (genusRow == null) {
					continue;
				}
				final String authority = genusRow.getGenusAuthority();
				if (seenAuthorities.contains(authority)) {
					continue;
				}
				for (String name : parseAuthority(authority)) {
					// The map is keyed by short name, so a hit always carries exactly this name;
					// only the "unknown author" case needs reporting.
					if (authors.get(name) == null) {
						LOG.warn("No author name\t{}\tin authority\t{}\tfor genus\t{}\tin TAXONOMY_GENUS_ID=\t{}\t{}", name, authority, genusRow.getGenusName(), genusRow.getGenusId(), genusRow.isCurrent());
					}
				}
				seenAuthorities.add(authority);
			}
		}
	}

	/**
	 * Checks the species, subspecies, variety, subvariety, forma and name
	 * authorities of every row in {@code taxonomy_species.txt} against the known
	 * author short names. Rows that fail to parse are logged and skipped.
	 *
	 * @throws IOException if the data cannot be downloaded or read
	 * @throws ParseException if an author row cannot be parsed
	 */
	public static void doitSpeciesAuth() throws IOException, ParseException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, "taxonomy_author.txt");
		final File speciesFile = new File(dataFolder, "taxonomy_species.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor, speciesFile);

		final Map<String, AuthorRow> authors = loadAuthors(taxonomyAuthor);

		try (CSVReader reader = openReader(speciesFile)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				try {
					final SpeciesRow speciesRow = TaxonomyReader.toSpecies(row);
					if (speciesRow == null) {
						continue;
					}
					checkAuthority(authors, "SPECIES_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getSpeciesAuthority());
					checkAuthority(authors, "SUBSPECIES_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getSubspeciesAuthority());
					checkAuthority(authors, "VARIETY_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getVarietyAuthority());
					checkAuthority(authors, "SUBVARIETY_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getSubvarietyAuthority());
					checkAuthority(authors, "FORMA_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getFormaAuthority());
					checkAuthority(authors, "NAME_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());
				} catch (ParseException e) {
					// Dump the offending row, one column per line, and carry on with the next row.
					final StringBuilder columns = new StringBuilder("\n");
					for (String column : row) {
						columns.append(column).append("\n");
					}
					LOG.warn("{} in row:\n{}", e.getMessage(), columns, e);
				}
			}
		}
	}

	/**
	 * Returns the data folder, creating it when it does not exist yet.
	 *
	 * @return the data folder
	 * @throws IOException if the folder is missing and cannot be created
	 */
	private static File prepareDataFolder() throws IOException {
		final File dataFolder = new File(DATA_FOLDER);
		if (!dataFolder.exists()) {
			LOG.warn("Making directory {}", dataFolder.getAbsolutePath());
			if (!dataFolder.mkdirs() && !dataFolder.isDirectory()) {
				throw new IOException("Could not create directory " + dataFolder.getAbsolutePath());
			}
		}
		return dataFolder;
	}

	/**
	 * Downloads and unpacks the current GRIN-Taxonomy dump into {@code dataFolder}
	 * unless every one of the {@code required} files already exists.
	 *
	 * @param dataFolder destination folder for the unpacked files
	 * @param required files that must exist for the download to be skipped
	 * @throws IOException if downloading or unpacking fails
	 */
	private static void downloadIfMissing(final File dataFolder, final File... required) throws IOException {
		boolean allPresent = true;
		for (File file : required) {
			if (!file.exists()) {
				allPresent = false;
				break;
			}
		}
		if (allPresent) {
			return;
		}

		LOG.warn("Taxonomy data not provided in {}, starting download", dataFolder.getAbsolutePath());
		final TaxonomyDownloader dl = new TaxonomyDownloader();
		LOG.warn("Downloading GRIN-Taxonomy database to {}", dataFolder.getAbsolutePath());
		final File downloadedCabFile = File.createTempFile("grin-", ".cab");
		// The archive is only needed for unpacking; do not leave it behind in the temp folder.
		downloadedCabFile.deleteOnExit();
		dl.downloadCurrent(downloadedCabFile);
		TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, dataFolder, false);
	}

	/**
	 * Opens a taxonomy text file as UTF-8 through
	 * {@code TaxonomyReader.openCsvReader}, passing {@code 1} as its second argument
	 * exactly as every reader in this class does.
	 *
	 * @param file the file to read
	 * @return a reader the caller must close
	 * @throws IOException if the file cannot be opened
	 */
	private static CSVReader openReader(final File file) throws IOException {
		return TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8), 1);
	}

	/**
	 * Reads all author rows and indexes them by their short name.
	 *
	 * @param taxonomyAuthor the author file
	 * @return author rows keyed by {@code getShortName()}
	 * @throws IOException if the file cannot be read
	 * @throws ParseException if a row cannot be parsed
	 */
	private static Map<String, AuthorRow> loadAuthors(final File taxonomyAuthor) throws IOException, ParseException {
		final Map<String, AuthorRow> authors = new HashMap<>();
		try (CSVReader reader = openReader(taxonomyAuthor)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				final AuthorRow authorRow = TaxonomyReader.toAuthor(row);
				if (authorRow != null) {
					authors.put(authorRow.getShortName(), authorRow);
				}
			}
		}
		return authors;
	}

	/**
	 * Warns about every author name in {@code authority} that is not a key of
	 * {@code authors}. Blank authorities are ignored.
	 *
	 * @param authors known authors keyed by short name
	 * @param label name of the authority column being checked, used in the message
	 * @param rowId identifier of the row being checked, used in the message
	 * @param rowName name of the taxon being checked, used in the message
	 * @param authority the authority string to check
	 */
	private static void checkAuthority(Map<String, AuthorRow> authors, String label, Long rowId, String rowName, String authority) {
		if (StringUtils.isBlank(authority)) {
			return;
		}
		for (String name : parseAuthority(authority)) {
			if (authors.get(name) == null) {
				LOG.warn("No author name\t{}\tin {}\t{}\tfor species\t{}\tin row_ID=\t{}", name, label, authority, rowName, rowId);
			}
		}
	}

	/**
	 * Splits an authority string into the individual author names it mentions.
	 *
	 * <p>
	 * The string is split on commas, {@code " ex "}, {@code &} and closing
	 * parentheses; parentheses and the annotations {@code et al.},
	 * {@code nom. inval.}, {@code orth. var.} and {@code sensu} are removed, and
	 * blank fragments are dropped.
	 * </p>
	 *
	 * @param genusAuthority the authority string, may be {@code null}
	 * @return the distinct author names; empty when the input is {@code null}
	 */
	private static Set<String> parseAuthority(String genusAuthority) {
		if (genusAuthority == null) {
			return Collections.emptySet();
		}
		final Set<String> authors = new HashSet<>();
		for (String fragment : AUTHORITY_SEPARATORS.split(genusAuthority)) {
			final String name = AUTHORITY_NOISE.matcher(fragment).replaceAll("").trim();
			if (StringUtils.isBlank(name)) {
				continue;
			}
			authors.add(name);
		}
		return authors;
	}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment