Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Genesys PGR
validator.genesys-pgr.org
Commits
aa4e6c04
Commit
aa4e6c04
authored
Nov 09, 2018
by
Matija Obreza
Browse files
Updated for new filenames in USDA Taxonomy dump
parent
e3654299
Changes
3
Hide whitespace changes
Inline
Side-by-side
src/main/java/org/genesys/taxonomy/checker/web/config/ApplicationConfig.java
View file @
aa4e6c04
...
...
@@ -136,8 +136,8 @@ public class ApplicationConfig {
}
// The two required files
final
File
genusFile
=
new
File
(
dataFolder
,
"
TAXONOMY_GENUS
.txt"
);
final
File
speciesFile
=
new
File
(
dataFolder
,
"
TAXONOMY_SPECIES
.txt"
);
final
File
genusFile
=
new
File
(
dataFolder
,
"
taxonomy_genus
.txt"
);
final
File
speciesFile
=
new
File
(
dataFolder
,
"
taxonomy_species
.txt"
);
if
(!
genusFile
.
exists
()
||
!
speciesFile
.
exists
())
{
LOG
.
warn
(
"Taxonomy data not provided in {}, starting download"
,
dataFolder
.
getAbsolutePath
());
...
...
src/main/java/org/genesys/taxonomy/checker/web/service/impl/TaxonomyProcessServiceImpl.java
View file @
aa4e6c04
...
...
@@ -95,18 +95,18 @@ public class TaxonomyProcessServiceImpl implements ProcessService {
public
void
readDatabase
(
final
String
path
)
throws
UnsupportedEncodingException
,
FileNotFoundException
,
IOException
,
ParseException
,
TaxonomyException
{
final
File
rootDir
=
new
File
(
path
);
LOG
.
info
(
"Loading
TAXONOMY_GENUS
.txt"
);
// read
TAXONOMY_GENUS
.txt
try
(
CSVReader
reader
=
TaxonomyReader
.
openCsvReader
(
new
InputStreamReader
(
new
FileInputStream
(
new
File
(
rootDir
,
"
TAXONOMY_GENUS
.txt"
)),
"UTF-8"
),
1
))
{
LOG
.
info
(
"Loading
taxonomy_genus
.txt"
);
// read
taxonomy_genus
.txt
try
(
CSVReader
reader
=
TaxonomyReader
.
openCsvReader
(
new
InputStreamReader
(
new
FileInputStream
(
new
File
(
rootDir
,
"
taxonomy_genus
.txt"
)),
"UTF-8"
),
1
))
{
GenusRow
genusRow
=
null
;
while
((
genusRow
=
TaxonomyReader
.
toGenus
(
reader
.
readNext
()))
!=
null
)
{
taxonomyDatabase
.
registerGenus
(
genusRow
.
getGenusId
(),
genusRow
.
getGenusName
());
}
}
LOG
.
info
(
"Loading
TAXONOMY_SPECIES
.txt"
);
// read
TAXONOMY_SPECIES
.txt
try
(
CSVReader
reader
=
TaxonomyReader
.
openCsvReader
(
new
InputStreamReader
(
new
FileInputStream
(
new
File
(
rootDir
,
"
TAXONOMY_SPECIES
.txt"
)),
"UTF-8"
),
1
))
{
LOG
.
info
(
"Loading
taxonomy_species
.txt"
);
// read
taxonomy_species
.txt
try
(
CSVReader
reader
=
TaxonomyReader
.
openCsvReader
(
new
InputStreamReader
(
new
FileInputStream
(
new
File
(
rootDir
,
"
taxonomy_species
.txt"
)),
"UTF-8"
),
1
))
{
SpeciesRow
speciesRow
=
null
;
while
((
speciesRow
=
TaxonomyReader
.
toSpecies
(
reader
.
readNext
()))
!=
null
)
{
taxonomyDatabase
.
registerSpecies
(
speciesRow
);
...
...
src/test/java/org/genesys/grin/WhatsWrong.java
0 → 100644
View file @
aa4e6c04
package
org.genesys.grin
;
import
java.io.File
;
import
java.io.FileInputStream
;
import
java.io.IOException
;
import
java.io.InputStreamReader
;
import
java.text.ParseException
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.HashSet
;
import
java.util.Map
;
import
java.util.Set
;
import
org.apache.commons.lang3.StringEscapeUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.genesys.taxonomy.checker.web.config.ApplicationConfig
;
import
org.genesys.taxonomy.download.TaxonomyDownloader
;
import
org.genesys.taxonomy.gringlobal.component.TaxonomyReader
;
import
org.genesys.taxonomy.gringlobal.model.AuthorRow
;
import
org.genesys.taxonomy.gringlobal.model.GenusRow
;
import
org.genesys.taxonomy.gringlobal.model.SpeciesRow
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.opencsv.CSVReader
;
/**
 * Ad-hoc diagnostic tool that cross-checks the author names used in the
 * GRIN-Taxonomy dump files against the author list in the same dump and logs
 * every inconsistency it finds.
 *
 * <p>
 * The dump is expected in {@code data/grintaxa}; when a required file is not
 * there, the current dump is downloaded and unpacked into that folder first.
 * </p>
 */
public class WhatsWrong {

	private static final Logger LOG = LoggerFactory.getLogger(WhatsWrong.class);

	/** Folder (relative to the working directory) holding the unpacked dump. */
	private static final String DATA_FOLDER = "data/grintaxa";

	/** Name of the author list file inside {@link #DATA_FOLDER}. */
	private static final String AUTHOR_FILE = "taxonomy_author.txt";

	/**
	 * Runs one of the checks. Any exception is logged rather than propagated.
	 *
	 * @param args optional single argument selecting the check: {@code author},
	 *        {@code family}, {@code genus} or {@code species}. Without arguments
	 *        the family check is run.
	 */
	public static void main(String[] args) {
		final String check = args != null && args.length > 0 ? args[0] : "family";
		try {
			switch (check) {
			case "author":
				doit();
				break;
			case "family":
				doitFamilyAuth();
				break;
			case "genus":
				doitGenusAuth();
				break;
			case "species":
				doitSpeciesAuth();
				break;
			default:
				LOG.error("Unknown check '{}', expected one of: author, family, genus, species", check);
				break;
			}
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
	}

	/**
	 * Compares column 1 of every author row with the HTML-unescaped form of
	 * column 3 and logs the rows where the two differ.
	 * NOTE(review): columns 1 and 3 are presumably the short name and its HTML
	 * form; confirm against the dump layout.
	 *
	 * @throws IOException if the dump cannot be downloaded or read
	 */
	public static void doit() throws IOException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, AUTHOR_FILE);
		downloadIfMissing(dataFolder, taxonomyAuthor);

		try (CSVReader reader = openReader(taxonomyAuthor)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				if (row.length < 4) {
					// Blank or truncated line: nothing to compare
					LOG.warn("Skipping author row with only {} column(s)", row.length);
					continue;
				}
				final String name = row[1];
				final String htmlName = row[3];
				if (name.equals(htmlName)) {
					continue;
				}
				final String unescaped = StringEscapeUtils.unescapeHtml4(htmlName);
				if (unescaped != null && !unescaped.equals(name)) {
					LOG.warn("Author {} = {} should be {} in TAXONOMY_AUTHOR_ID={}", name, htmlName, unescaped, row[0]);
				}
			}
		}
	}

	/**
	 * Checks the authority column of every family row against the author list.
	 * Each distinct authority string is checked only once.
	 * NOTE(review): uses column 0 as the row id, column 5 as the name and
	 * column 6 as the authority; confirm against the dump layout.
	 *
	 * @throws IOException if the dump cannot be downloaded or read
	 * @throws ParseException if an author row cannot be parsed
	 */
	public static void doitFamilyAuth() throws IOException, ParseException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, AUTHOR_FILE);
		final File familyFile = new File(dataFolder, "taxonomy_family.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor, familyFile);

		final Map<String, AuthorRow> authors = loadAuthors(taxonomyAuthor);
		final Set<String> checkedAuthorities = new HashSet<>();

		try (CSVReader reader = openReader(familyFile)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				if (row.length < 7) {
					// Blank or truncated line: no authority column to inspect
					LOG.warn("Skipping family row with only {} column(s)", row.length);
					continue;
				}
				final String authorName = row[6];
				// add() returns false when this authority was already handled
				if (!checkedAuthorities.add(authorName)) {
					continue;
				}
				checkAuthority(authors, "FAMILY_AUTHORITY", Long.parseLong(row[0]), row[5], authorName);
			}
		}
	}

	/**
	 * Checks the authority of every genus row against the author list and logs
	 * author names that are not in the list. Each distinct authority string is
	 * checked only once.
	 *
	 * @throws IOException if the dump cannot be downloaded or read
	 * @throws ParseException if an author or genus row cannot be parsed
	 */
	public static void doitGenusAuth() throws IOException, ParseException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, AUTHOR_FILE);
		final File genusFile = new File(dataFolder, "taxonomy_genus.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor, genusFile);

		final Map<String, AuthorRow> authors = loadAuthors(taxonomyAuthor);
		final Set<String> checkedAuthorities = new HashSet<>();

		try (CSVReader reader = openReader(genusFile)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				final GenusRow genusRow = TaxonomyReader.toGenus(row);
				final String authority = genusRow.getGenusAuthority();
				// add() returns false when this authority was already handled
				if (!checkedAuthorities.add(authority)) {
					continue;
				}
				for (String name : parseAuthority(authority)) {
					// The map is keyed by short name, so a hit always has exactly this name;
					// only missing authors need reporting.
					if (!authors.containsKey(name)) {
						LOG.warn("No author name\t{}\tin authority\t{}\tfor genus\t{}\tin TAXONOMY_GENUS_ID=\t{}\t{}", name, authority,
							genusRow.getGenusName(), genusRow.getGenusId(), genusRow.isCurrent());
					}
				}
			}
		}
	}

	/**
	 * Checks all authority fields of every species row (species, subspecies,
	 * variety, subvariety, forma and name authority) against the author list.
	 * Rows that fail to parse are logged and skipped.
	 *
	 * @throws IOException if the dump cannot be downloaded or read
	 * @throws ParseException if an author row cannot be parsed
	 */
	public static void doitSpeciesAuth() throws IOException, ParseException {
		final File dataFolder = prepareDataFolder();
		final File taxonomyAuthor = new File(dataFolder, AUTHOR_FILE);
		final File speciesFile = new File(dataFolder, "taxonomy_species.txt");
		downloadIfMissing(dataFolder, taxonomyAuthor, speciesFile);

		final Map<String, AuthorRow> authors = loadAuthors(taxonomyAuthor);

		try (CSVReader reader = openReader(speciesFile)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				try {
					final SpeciesRow speciesRow = TaxonomyReader.toSpecies(row);
					checkAuthority(authors, "SPECIES_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getSpeciesAuthority());
					checkAuthority(authors, "SUBSPECIES_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getSubspeciesAuthority());
					checkAuthority(authors, "VARIETY_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getVarietyAuthority());
					checkAuthority(authors, "SUBVARIETY_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getSubvarietyAuthority());
					checkAuthority(authors, "FORMA_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getFormaAuthority());
					checkAuthority(authors, "NAME_AUTHORITY", speciesRow.getSpeciesId(), speciesRow.getName(), speciesRow.getNameAuthority());
				} catch (ParseException e) {
					// Dump the offending row, one column per line, and carry on with the next row
					final StringBuilder columns = new StringBuilder("\n");
					for (String column : row) {
						columns.append(column).append("\n");
					}
					LOG.warn("{} in row:\n{}", e.getMessage(), columns, e);
				}
			}
		}
	}

	/**
	 * Returns the dump folder, creating it when it does not exist yet.
	 *
	 * @return the dump folder
	 * @throws IOException if the folder is missing and cannot be created
	 */
	private static File prepareDataFolder() throws IOException {
		final File dataFolder = new File(DATA_FOLDER);
		if (!dataFolder.exists()) {
			LOG.warn("Making directory {}", dataFolder.getAbsolutePath());
			if (!dataFolder.mkdirs() && !dataFolder.isDirectory()) {
				throw new IOException("Could not create directory " + dataFolder.getAbsolutePath());
			}
		}
		return dataFolder;
	}

	/**
	 * Downloads the current GRIN-Taxonomy dump and unpacks it into
	 * {@code dataFolder} when at least one of the required files is missing.
	 *
	 * @param dataFolder folder to unpack the dump into
	 * @param requiredFiles files that must exist for the download to be skipped
	 * @throws IOException if the download or unpacking fails
	 */
	private static void downloadIfMissing(File dataFolder, File... requiredFiles) throws IOException {
		boolean complete = true;
		for (File required : requiredFiles) {
			if (!required.exists()) {
				complete = false;
				break;
			}
		}
		if (complete) {
			return;
		}

		LOG.warn("Taxonomy data not provided in {}, starting download", dataFolder.getAbsolutePath());
		final TaxonomyDownloader dl = new TaxonomyDownloader();

		LOG.warn("Downloading GRIN-Taxonomy database to {}", dataFolder.getAbsolutePath());
		final File downloadedCabFile = File.createTempFile("grin-", ".cab");
		try {
			dl.downloadCurrent(downloadedCabFile);
			TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, dataFolder, false);
		} finally {
			// The archive is only needed while unpacking; do not leave it in the temp folder
			if (downloadedCabFile.exists() && !downloadedCabFile.delete()) {
				LOG.warn("Could not delete temporary file {}", downloadedCabFile.getAbsolutePath());
			}
		}
	}

	/**
	 * Opens a dump file as UTF-8 through {@code TaxonomyReader.openCsvReader},
	 * passing {@code 1} as its second argument exactly as the rest of the
	 * project does.
	 * NOTE(review): the 1 is presumably the number of header lines to skip; confirm.
	 *
	 * @param file the dump file to read
	 * @return a reader the caller must close
	 * @throws IOException if the file cannot be opened
	 */
	private static CSVReader openReader(File file) throws IOException {
		return TaxonomyReader.openCsvReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 1);
	}

	/**
	 * Reads the author list into a map keyed by the author's short name. When
	 * several rows share a short name, the last one read wins.
	 *
	 * @param taxonomyAuthor the author list file
	 * @return authors by short name
	 * @throws IOException if the file cannot be read
	 * @throws ParseException if a row cannot be parsed
	 */
	private static Map<String, AuthorRow> loadAuthors(File taxonomyAuthor) throws IOException, ParseException {
		final Map<String, AuthorRow> authors = new HashMap<>();
		try (CSVReader reader = openReader(taxonomyAuthor)) {
			String[] row;
			while ((row = reader.readNext()) != null) {
				final AuthorRow authorRow = TaxonomyReader.toAuthor(row);
				authors.put(authorRow.getShortName(), authorRow);
			}
		}
		return authors;
	}

	/**
	 * Logs every author name found in {@code authority} that is not a key of
	 * {@code authors}. Blank authorities are ignored.
	 *
	 * @param authors known authors by short name
	 * @param label name of the authority field being checked, used in the log line
	 * @param rowId id of the row the authority comes from
	 * @param rowName name of the taxon in that row
	 * @param authority the authority string to check, may be {@code null}
	 */
	private static void checkAuthority(Map<String, AuthorRow> authors, String label, Long rowId, String rowName, String authority) {
		if (StringUtils.isBlank(authority)) {
			return;
		}

		for (String name : parseAuthority(authority)) {
			// The map is keyed by short name, so a hit always has exactly this name;
			// only missing authors need reporting.
			if (!authors.containsKey(name)) {
				LOG.warn("No author name\t{}\tin {}\t{}\tfor species\t{}\tin row_ID=\t{}", name, label, authority, rowName, rowId);
			}
		}
	}

	/**
	 * Splits an authority string into the individual author names it mentions.
	 *
	 * <p>
	 * The string is split on commas, on {@code ex} surrounded by whitespace, on
	 * ampersands and on closing parentheses. From each part the remaining
	 * parentheses and the qualifiers {@code et al.}, {@code nom. inval.},
	 * {@code orth. var.} and {@code sensu} are removed; parts that end up blank
	 * are dropped.
	 * </p>
	 *
	 * @param genusAuthority the authority string, may be {@code null}
	 * @return the distinct author names, empty when {@code genusAuthority} is {@code null}
	 */
	private static Set<String> parseAuthority(String genusAuthority) {
		if (genusAuthority == null) {
			return Collections.emptySet();
		}

		final Set<String> authors = new HashSet<>();
		for (String part : genusAuthority.split(",|\\sex\\s|&|\\)")) {
			// Both dots of "nom. inval." are escaped so that only the literal abbreviation is removed
			final String name = part.replaceAll("[\\(\\)]|(et al\\.)|(nom\\. inval\\.)|(orth\\. var\\.)|(sensu)", "").trim();
			if (StringUtils.isBlank(name)) {
				continue;
			}
			authors.add(name);
		}
		return authors;
	}
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment