Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
GGCE
GGCE Server
Commits
732ca5db
Commit
732ca5db
authored
Apr 27, 2022
by
Artem Hrybeniuk
Browse files
Accession similarity search
parent
9d1b9f5f
Changes
5
Hide whitespace changes
Inline
Side-by-side
server/src/main/java/org/gringlobal/api/v1/impl/AccessionController.java
View file @
732ca5db
...
...
@@ -52,6 +52,8 @@ import org.gringlobal.service.filter.AccessionFilter;
import
org.gringlobal.service.glis.impl.GlisDOIRegistrationManager
;
import
org.gringlobal.service.glis.impl.GlisDOIRegistrationManager.GlisDoiResponse
;
import
org.gringlobal.spring.CSVMessageConverter
;
import
org.gringlobal.worker.dupe.AccessionDuplicateFinder
;
import
org.gringlobal.worker.dupe.DuplicateFinder
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.data.domain.Page
;
import
org.springframework.data.domain.Pageable
;
...
...
@@ -92,6 +94,9 @@ public class AccessionController extends FilteredCRUDController<Accession, Acces
@Autowired
private
GlisDOIRegistrationManager
glisDOIRegistrationManager
;
// Optional dependency: because of required = false this field stays null when
// no AccessionDuplicateFinder bean is registered in the application context.
@Autowired
(
required
=
false
)
private
AccessionDuplicateFinder
accessionDuplicateFinder
;
@Override
protected
Class
<
AccessionFilter
>
filterType
()
{
return
AccessionFilter
.
class
;
...
...
@@ -238,4 +243,15 @@ public class AccessionController extends FilteredCRUDController<Accession, Acces
return
glisDOIRegistrationManager
.
updateDoiRegistration
(
filter
);
}
/**
 * Find accessions similar to the provided source accession.
 *
 * @param source source Accession (taken from the request body, which is mandatory)
 * @return the list of similar Accessions
 * @throws UnsupportedOperationException if no {@code AccessionDuplicateFinder} is available
 */
@PostMapping(value = "/similar")
public List<DuplicateFinder.Hit<Accession>> findSimilarForUnsaved(@RequestBody(required = true) final Accession source) {
	// The finder is injected with required = false, so it is null when the bean is absent.
	// Fail with an explicit message instead of an anonymous NullPointerException.
	if (accessionDuplicateFinder == null) {
		throw new UnsupportedOperationException("Accession similarity search is not available");
	}
	return accessionDuplicateFinder.findSimilar(source);
}
}
server/src/main/java/org/gringlobal/model/Inventory.java
View file @
732ca5db
...
...
@@ -774,6 +774,10 @@ public class Inventory extends CooperatorOwnedModel implements Copyable<Inventor
return
preferredName
;
}
/**
 * Gets the names attached to this inventory.
 *
 * @return the {@code names} field itself; no copy is made
 */
public List<AccessionInvName> getNames() {
	return this.names;
}
@Override
public
void
lazyLoad
()
{
super
.
lazyLoad
();
...
...
server/src/main/java/org/gringlobal/worker/dupe/AccessionDuplicateFinder.java
0 → 100644
View file @
732ca5db
/*
* Copyright 2022 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package
org.gringlobal.worker.dupe
;
import
org.apache.commons.collections.CollectionUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.genesys.blocks.model.filters.StringFilter
;
import
org.gringlobal.custom.elasticsearch.SearchException
;
import
org.gringlobal.model.Accession
;
import
org.gringlobal.model.AccessionInvName
;
import
org.gringlobal.model.Inventory
;
import
org.gringlobal.service.AccessionService
;
import
org.gringlobal.service.filter.AccessionFilter
;
import
org.gringlobal.service.filter.SiteFilter
;
import
org.gringlobal.service.filter.TaxonomyGenusFilter
;
import
org.gringlobal.service.filter.TaxonomySpeciesFilter
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.data.domain.Page
;
import
org.springframework.data.domain.PageRequest
;
import
org.springframework.stereotype.Component
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.List
;
import
java.util.Objects
;
import
java.util.Set
;
import
java.util.stream.Collectors
;
@Component
public
class
AccessionDuplicateFinder
extends
DuplicateFinder
<
Accession
>
{
@Autowired
private
AccessionService
accessionService
;
@Override
protected
double
getBestScoreThreshold
()
{
return
1000
d
;
}
@Override
protected
List
<
Accession
>
getCandidates
(
Accession
target
,
Collection
<
Long
>
excludedById
)
{
assert
(
target
!=
null
);
LOG
.
info
(
"Searching for duplicates of {}"
,
target
);
List
<
Accession
>
candidates
=
new
ArrayList
<>(
100
);
// Accession fields for search: doi, faoInstituteNumber, accessionNumber, accessionNumberPart1, genusName, preferredName
var
doi
=
target
.
getDoi
();
var
site
=
target
.
getSite
();
var
faoInstituteNumber
=
site
!=
null
?
site
.
getFaoInstituteNumber
()
:
null
;
var
accessionNumber
=
target
.
getAccessionNumber
();
var
accessionNumberPart1
=
target
.
getAccessionNumberPart1
();
var
taxonomy
=
target
.
getTaxonomySpecies
();
var
genusName
=
taxonomy
!=
null
?
taxonomy
.
getTaxonomyGenus
().
getGenusName
()
:
null
;
var
preferredName
=
target
.
getPreferredName
();
// By doi
if
(
StringUtils
.
isNotBlank
(
doi
))
{
AccessionFilter
filter
=
getCandidatesFilter
(
target
,
excludedById
,
candidates
);
filter
.
doi
().
add
(
doi
);
try
{
LOG
.
info
(
"Filtering for aliases {}"
,
filter
);
Page
<
Accession
>
matches
=
accessionService
.
list
(
filter
,
PageRequest
.
of
(
0
,
20
));
candidates
.
addAll
(
matches
.
getContent
());
}
catch
(
SearchException
e
)
{
LOG
.
warn
(
e
.
getMessage
());
}
}
// By faoInstituteNumber
if
(
StringUtils
.
isNotBlank
(
faoInstituteNumber
))
{
AccessionFilter
filter
=
getCandidatesFilter
(
target
,
excludedById
,
candidates
);
filter
.
site
=
new
SiteFilter
();
filter
.
site
.
faoInstituteNumber
=
new
StringFilter
();
filter
.
site
.
faoInstituteNumber
.
eq
(
faoInstituteNumber
);
filter
.
_text
=
toSafeEsQuery
(
faoInstituteNumber
);
try
{
LOG
.
info
(
"Filtering for aliases {}"
,
filter
);
Page
<
Accession
>
matches
=
accessionService
.
list
(
filter
,
PageRequest
.
of
(
0
,
20
));
candidates
.
addAll
(
matches
.
getContent
());
}
catch
(
SearchException
e
)
{
LOG
.
warn
(
e
.
getMessage
());
}
}
// By accession number
if
(
StringUtils
.
isNotBlank
(
accessionNumber
))
{
AccessionFilter
filter
=
getCandidatesFilter
(
target
,
excludedById
,
candidates
);
filter
.
accessionNumber
=
Set
.
of
(
accessionNumber
);
try
{
LOG
.
info
(
"Filtering for aliases {}"
,
filter
);
Page
<
Accession
>
matches
=
accessionService
.
list
(
filter
,
PageRequest
.
of
(
0
,
20
));
candidates
.
addAll
(
matches
.
getContent
());
}
catch
(
SearchException
e
)
{
LOG
.
warn
(
e
.
getMessage
());
}
}
// By accession number part 1
if
(
StringUtils
.
isNotBlank
(
accessionNumberPart1
))
{
AccessionFilter
filter
=
getCandidatesFilter
(
target
,
excludedById
,
candidates
);
filter
.
accessionNumberPart1
=
new
StringFilter
();
filter
.
accessionNumberPart1
.
eq
(
accessionNumberPart1
);
filter
.
_text
=
toSafeEsQuery
(
accessionNumberPart1
);
try
{
LOG
.
info
(
"Filtering for aliases {}"
,
filter
);
Page
<
Accession
>
matches
=
accessionService
.
list
(
filter
,
PageRequest
.
of
(
0
,
20
));
candidates
.
addAll
(
matches
.
getContent
());
}
catch
(
SearchException
e
)
{
LOG
.
warn
(
e
.
getMessage
());
}
}
// By genus
if
(
StringUtils
.
isNotBlank
(
genusName
))
{
AccessionFilter
filter
=
getCandidatesFilter
(
target
,
excludedById
,
candidates
);
filter
.
taxonomySpecies
=
new
TaxonomySpeciesFilter
();
filter
.
taxonomySpecies
.
taxonomyGenus
=
new
TaxonomyGenusFilter
();
filter
.
taxonomySpecies
.
taxonomyGenus
.
genusName
=
new
StringFilter
();
filter
.
taxonomySpecies
.
taxonomyGenus
.
genusName
.
eq
(
genusName
);
filter
.
_text
=
toSafeEsQuery
(
genusName
);
try
{
LOG
.
info
(
"Filtering for aliases {}"
,
filter
);
Page
<
Accession
>
matches
=
accessionService
.
list
(
filter
,
PageRequest
.
of
(
0
,
20
));
candidates
.
addAll
(
matches
.
getContent
());
}
catch
(
SearchException
e
)
{
LOG
.
warn
(
e
.
getMessage
());
}
}
// By preferred name
if
(
StringUtils
.
isNotBlank
(
preferredName
))
{
AccessionFilter
filter
=
getCandidatesFilter
(
target
,
excludedById
,
candidates
);
filter
.
preferredName
=
new
StringFilter
();
filter
.
preferredName
.
eq
(
preferredName
);
filter
.
_text
=
toSafeEsQuery
(
preferredName
);
try
{
LOG
.
info
(
"Filtering for aliases {}"
,
filter
);
Page
<
Accession
>
matches
=
accessionService
.
list
(
filter
,
PageRequest
.
of
(
0
,
20
));
candidates
.
addAll
(
matches
.
getContent
());
}
catch
(
SearchException
e
)
{
LOG
.
warn
(
e
.
getMessage
());
}
}
return
candidates
;
}
@Override
protected
double
scoreHit
(
Accession
target
,
Hit
<
Accession
>
hit
)
{
double
score
=
hit
.
score
;
var
candidate
=
hit
.
result
;
var
targetAcceNumb
=
StringUtils
.
lowerCase
(
target
.
getAccessionNumber
());
var
candidateAcceNumb
=
StringUtils
.
lowerCase
(
target
.
getAccessionNumber
());
if
(
notNullEquals
(
hit
.
matches
,
candidateAcceNumb
,
targetAcceNumb
))
{
score
+=
500
;
}
else
{
score
+=
stringsAndNumbersCompare
(
hit
.
matches
,
candidateAcceNumb
,
targetAcceNumb
)
*
500
;
}
if
(
notNullEquals
(
hit
.
matches
,
candidate
.
getDoi
(),
target
.
getDoi
()))
{
score
+=
500
;
}
var
targetAcceNumbPart1
=
StringUtils
.
lowerCase
(
target
.
getAccessionNumberPart1
());
var
candidateAcceNumbPart1
=
StringUtils
.
lowerCase
(
candidate
.
getAccessionNumberPart1
());
if
(
notNullEquals
(
hit
.
matches
,
targetAcceNumbPart1
,
candidateAcceNumbPart1
))
{
score
+=
100
;
}
var
targetTaxonomy
=
target
.
getTaxonomySpecies
();
var
candidateTaxonomy
=
candidate
.
getTaxonomySpecies
();
if
(
candidateTaxonomy
!=
null
&&
targetTaxonomy
!=
null
)
{
if
(
notNullEquals
(
hit
.
matches
,
candidateTaxonomy
.
getTaxonomyGenus
().
getGenusName
(),
targetTaxonomy
.
getTaxonomyGenus
().
getGenusName
()))
{
score
+=
100
;
}
if
(
notNullEquals
(
hit
.
matches
,
candidateTaxonomy
.
getSpeciesName
(),
targetTaxonomy
.
getSpeciesName
()))
{
score
+=
200
;
}
}
if
(
notNullEquals
(
hit
.
matches
,
candidate
.
getPreferredName
(),
target
.
getPreferredName
()))
{
score
+=
100
;
}
else
{
score
+=
similarityScore
(
hit
.
matches
,
candidate
.
getPreferredName
(),
target
.
getPreferredName
())
*
100
;
}
var
targetInstitute
=
target
.
getSite
();
var
candidateInstitute
=
candidate
.
getSite
();
if
(
targetInstitute
!=
null
&&
candidateInstitute
!=
null
)
{
if
(
notNullEquals
(
hit
.
matches
,
candidateInstitute
.
getFaoInstituteNumber
(),
targetInstitute
.
getFaoInstituteNumber
()))
{
score
+=
100
;
}
}
var
targetInvNames
=
target
.
getNames
();
if
(
CollectionUtils
.
isNotEmpty
(
targetInvNames
))
{
var
candidateInventories
=
candidate
.
getInventories
();
if
(
CollectionUtils
.
isNotEmpty
(
candidateInventories
))
{
var
candidatePlantNames
=
candidateInventories
.
stream
().
map
(
Inventory:
:
getNames
)
.
filter
(
Objects:
:
nonNull
)
.
flatMap
(
Collection:
:
stream
)
.
map
(
AccessionInvName:
:
getPlantName
)
.
filter
(
Objects:
:
nonNull
)
.
collect
(
Collectors
.
toSet
());
score
+=
targetInvNames
.
stream
()
.
map
(
AccessionInvName:
:
getPlantName
)
.
filter
(
Objects:
:
nonNull
)
.
filter
(
candidatePlantNames:
:
contains
)
.
peek
(
targetPlantName
->
hit
.
matches
.
add
(
targetPlantName
))
.
mapToDouble
(
targetName
->
100
).
sum
();
}
}
hit
.
score
=
score
;
return
score
;
}
private
AccessionFilter
getCandidatesFilter
(
Accession
target
,
Collection
<
Long
>
excludedById
,
List
<
Accession
>
candidates
)
{
AccessionFilter
filter
=
new
AccessionFilter
();
if
(
target
.
getId
()
!=
null
)
{
filter
.
NOT
=
new
AccessionFilter
();
filter
.
NOT
.
id
().
add
(
target
.
getId
());
// Not this
}
if
(!
CollectionUtils
.
isEmpty
(
excludedById
))
{
if
(
filter
.
NOT
==
null
)
filter
.
NOT
=
new
AccessionFilter
();
filter
.
NOT
.
id
().
addAll
(
excludedById
);
}
if
(!
CollectionUtils
.
isEmpty
(
candidates
))
{
if
(
filter
.
NOT
==
null
)
filter
.
NOT
=
new
AccessionFilter
();
filter
.
NOT
.
id
().
addAll
(
candidates
.
stream
().
map
(
Accession:
:
getId
).
collect
(
Collectors
.
toSet
()));
// Not already found
}
return
filter
;
}
}
server/src/test/java/org/gringlobal/test/service/AbstractElasticServicesTest.java
View file @
732ca5db
...
...
@@ -5,6 +5,7 @@ import org.gringlobal.component.elastic.ElasticReindexProcessor;
import
org.gringlobal.component.elastic.FirehoseReindexListener
;
import
org.gringlobal.service.ElasticsearchService
;
import
org.gringlobal.test.config.TestElasticsearchConfig
;
import
org.gringlobal.worker.dupe.AccessionDuplicateFinder
;
import
org.gringlobal.worker.dupe.CooperatorDuplicateFinder
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.context.annotation.Bean
;
...
...
@@ -38,6 +39,11 @@ public abstract class AbstractElasticServicesTest extends AbstractServicesTest {
public
CooperatorDuplicateFinder
cooperatorDuplicateFinder
()
{
return
new
CooperatorDuplicateFinder
();
}
/** Registers a new {@link AccessionDuplicateFinder} bean in this test configuration. */
@Bean
public AccessionDuplicateFinder accessionDuplicateFinder() {
	final AccessionDuplicateFinder finder = new AccessionDuplicateFinder();
	return finder;
}
}
...
...
server/src/test/java/org/gringlobal/test/service/AccessionSearchTest.java
View file @
732ca5db
...
...
@@ -18,6 +18,7 @@ package org.gringlobal.test.service;
import
static
org
.
hamcrest
.
MatcherAssert
.*;
import
static
org
.
hamcrest
.
Matchers
.*;
import
java.util.List
;
import
java.util.Set
;
import
org.gringlobal.custom.elasticsearch.SearchException
;
...
...
@@ -30,6 +31,7 @@ import org.gringlobal.service.filter.AccessionSourceFilter;
import
org.gringlobal.service.filter.CooperatorFilter
;
import
org.gringlobal.service.filter.InventoryFilter
;
import
org.gringlobal.spring.TransactionHelper
;
import
org.gringlobal.worker.dupe.AccessionDuplicateFinder
;
import
org.junit.After
;
import
org.junit.Test
;
import
org.springframework.beans.factory.annotation.Autowired
;
...
...
@@ -47,6 +49,8 @@ public class AccessionSearchTest extends AbstractElasticServicesTest {
private
AccessionService
accessionService
;
@Autowired
private
CooperatorService
cooperatorService
;
@Autowired
private
AccessionDuplicateFinder
accessionDuplicateFinder
;
@After
@Transactional
...
...
@@ -390,4 +394,87 @@ public class AccessionSearchTest extends AbstractElasticServicesTest {
list
=
accessionService
.
list
(
accessionFilter
,
PageRequest
.
of
(
0
,
2
));
assertThat
(
list
.
getContent
().
size
(),
is
(
1
));
}
@Test
public
void
findSimilarForUnsavedTest
()
throws
Exception
{
var
taxonomy
=
addTaxonomySpeciesToDB
();
Accession
a
=
new
Accession
();
a
.
setAccessionNumberPart1
(
"TMe"
);
a
.
setIsBackedUp
(
TRUE
);
a
.
setIsCore
(
TRUE
);
a
.
setIsWebVisible
(
TRUE
);
a
.
setStatusCode
(
ACCESSION_STATUS_CODE
);
a
.
setTaxonomySpecies
(
taxonomy
);
a
.
setSite
(
IITA_SITE
);
a
.
setBackupLocation1Site
(
IITA_SITE
);
a
.
setBackupLocation2Site
(
CIP_SITE
);
a
.
setDoi
(
"10.18730/M3YR2"
);
Accession
b
=
new
Accession
();
b
.
setAccessionNumberPart1
(
"TGm"
);
b
.
setAccessionNumberPart2
(
5L
);
b
.
setIsBackedUp
(
TRUE
);
b
.
setIsCore
(
TRUE
);
b
.
setIsWebVisible
(
TRUE
);
b
.
setStatusCode
(
ACCESSION_STATUS_CODE
);
b
.
setTaxonomySpecies
(
taxonomy
);
b
.
setSite
(
DEFAULT_SITE
);
b
.
setBackupLocation1Site
(
IITA_SITE
);
b
.
setBackupLocation2Site
(
CIP_SITE
);
b
.
setDoi
(
"10.18730/M3YV5"
);
Accession
savedAccessionA
=
accessionService
.
create
(
a
);
assertThat
(
savedAccessionA
,
notNullValue
());
Accession
savedAccessionB
=
accessionService
.
create
(
b
);
assertThat
(
savedAccessionB
,
notNullValue
());
elasticsearchService
.
waitForCount
(
Accession
.
class
,
null
,
2
);
Accession
unsavedAccession
=
new
Accession
();
unsavedAccession
.
setDoi
(
"10.18730/M3YV5"
);
var
similar
=
accessionDuplicateFinder
.
findSimilar
(
unsavedAccession
);
assertThat
(
similar
,
hasSize
(
1
));
assertThat
(
similar
.
get
(
0
).
result
.
getId
(),
is
(
b
.
getId
()));
assertThat
(
similar
.
get
(
0
).
score
,
is
(
500
d
));
unsavedAccession
.
setTaxonomySpecies
(
taxonomy
);
similar
=
accessionDuplicateFinder
.
findSimilar
(
unsavedAccession
);
assertThat
(
similar
,
hasSize
(
2
));
assertThat
(
similar
.
get
(
0
).
score
,
is
(
800
d
));
assertThat
(
similar
.
get
(
1
).
score
,
is
(
300
d
));
unsavedAccession
=
new
Accession
();
var
testSite
=
new
Site
();
testSite
.
setFaoInstituteNumber
(
"NGA039"
);
unsavedAccession
.
setSite
(
testSite
);
similar
=
accessionDuplicateFinder
.
findSimilar
(
unsavedAccession
);
assertThat
(
similar
,
hasSize
(
1
));
assertThat
(
similar
.
get
(
0
).
score
,
is
(
100
d
));
unsavedAccession
=
new
Accession
();
unsavedAccession
.
setAccessionNumberPart1
(
"TGm"
);
similar
=
accessionDuplicateFinder
.
findSimilar
(
unsavedAccession
);
assertThat
(
similar
,
hasSize
(
1
));
assertThat
(
similar
.
get
(
0
).
result
.
getId
(),
is
(
b
.
getId
()));
assertThat
(
similar
.
get
(
0
).
score
,
is
(
100
d
));
Inventory
inventory
=
addInventoryToDB
(
a
,
INVENTORY_NUMBER_PART1_DEFAULT
,
null
,
null
);
AccessionInvName
name
=
new
AccessionInvName
();
name
.
setPlantName
(
"TMe"
);
name
.
setInventory
(
inventory
);
name
.
setCategoryCode
(
"SITE"
);
unsavedAccession
=
new
Accession
();
unsavedAccession
.
setNames
(
List
.
of
(
name
));
unsavedAccession
.
setDoi
(
"10.18730/M3YR2"
);
similar
=
accessionDuplicateFinder
.
findSimilar
(
unsavedAccession
);
assertThat
(
similar
,
hasSize
(
1
));
assertThat
(
similar
.
get
(
0
).
score
,
is
(
600
d
));
}
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment