Commit 732ca5db authored by Artem Hrybeniuk
Browse files

Accession similarity search

parent 9d1b9f5f
......@@ -52,6 +52,8 @@ import org.gringlobal.service.filter.AccessionFilter;
import org.gringlobal.service.glis.impl.GlisDOIRegistrationManager;
import org.gringlobal.service.glis.impl.GlisDOIRegistrationManager.GlisDoiResponse;
import org.gringlobal.spring.CSVMessageConverter;
import org.gringlobal.worker.dupe.AccessionDuplicateFinder;
import org.gringlobal.worker.dupe.DuplicateFinder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
......@@ -92,6 +94,9 @@ public class AccessionController extends FilteredCRUDController<Accession, Acces
@Autowired
private GlisDOIRegistrationManager glisDOIRegistrationManager;
@Autowired(required = false)
private AccessionDuplicateFinder accessionDuplicateFinder;
@Override
protected Class<AccessionFilter> filterType() {
return AccessionFilter.class;
......@@ -238,4 +243,15 @@ public class AccessionController extends FilteredCRUDController<Accession, Acces
return glisDOIRegistrationManager.updateDoiRegistration(filter);
}
/**
 * Find similar accessions by source.
 *
 * <p>The duplicate finder is injected with {@code required = false}, so it may be
 * absent from the application context. In that case no search can be run and an
 * empty list is returned instead of failing with a {@code NullPointerException}.</p>
 *
 * @param source source Accession (does not need to be saved)
 * @return the list of similar Accessions; empty when no duplicate finder is available
 */
@PostMapping(value = "/similar")
public List<DuplicateFinder.Hit<Accession>> findSimilarForUnsaved(@RequestBody(required = true) final Accession source) {
	if (accessionDuplicateFinder == null) {
		// Optional bean is not configured: report "no matches" rather than an HTTP 500 caused by an NPE
		return List.of();
	}
	return accessionDuplicateFinder.findSimilar(source);
}
}
......@@ -774,6 +774,10 @@ public class Inventory extends CooperatorOwnedModel implements Copyable<Inventor
return preferredName;
}
/**
 * Gets the names attached to this inventory.
 *
 * @return the value of the {@code names} field, returned as-is (no copy is made)
 */
public List<AccessionInvName> getNames() {
	return this.names;
}
@Override
public void lazyLoad() {
super.lazyLoad();
......
/*
* Copyright 2022 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.worker.dupe;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.StringFilter;
import org.gringlobal.custom.elasticsearch.SearchException;
import org.gringlobal.model.Accession;
import org.gringlobal.model.AccessionInvName;
import org.gringlobal.model.Inventory;
import org.gringlobal.service.AccessionService;
import org.gringlobal.service.filter.AccessionFilter;
import org.gringlobal.service.filter.SiteFilter;
import org.gringlobal.service.filter.TaxonomyGenusFilter;
import org.gringlobal.service.filter.TaxonomySpeciesFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Finds accessions that look like duplicates of a given accession.
 *
 * <p>Candidates are collected by running one filtered search per identifying field of the
 * target (DOI, FAO institute number of the site, accession number, accession number part 1,
 * genus name and preferred name). Each candidate is then scored in
 * {@link #scoreHit(Accession, Hit)} according to which of those fields agree with the target.</p>
 */
@Component
public class AccessionDuplicateFinder extends DuplicateFinder<Accession> {

	/** Maximum number of matches fetched by each individual candidate query. */
	private static final int MAX_MATCHES_PER_QUERY = 20;

	@Autowired
	private AccessionService accessionService;

	/**
	 * Score threshold reported to the base {@code DuplicateFinder}.
	 *
	 * @return the fixed threshold of {@code 1000}
	 */
	@Override
	protected double getBestScoreThreshold() {
		return 1000d;
	}

	/**
	 * Collects potential duplicates of {@code target}.
	 *
	 * <p>One query is executed for every non-blank search field of the target. Every query
	 * excludes the target itself (when it has an id), the ids in {@code excludedById} and the
	 * candidates already found by the previous queries, so the result contains no repeats.
	 * A failing query is logged and skipped; the remaining queries still run.</p>
	 *
	 * @param target the accession to find duplicates of, must not be {@code null}
	 * @param excludedById ids of accessions that must not be returned, may be {@code null} or empty
	 * @return the candidates found, in the order the queries produced them
	 */
	@Override
	protected List<Accession> getCandidates(Accession target, Collection<Long> excludedById) {
		assert (target != null);
		LOG.info("Searching for duplicates of {}", target);

		List<Accession> candidates = new ArrayList<>(100);

		// Accession fields for search: doi, faoInstituteNumber, accessionNumber, accessionNumberPart1, genusName, preferredName
		var doi = target.getDoi();
		var site = target.getSite();
		var faoInstituteNumber = site != null ? site.getFaoInstituteNumber() : null;
		var accessionNumber = target.getAccessionNumber();
		var accessionNumberPart1 = target.getAccessionNumberPart1();
		var taxonomy = target.getTaxonomySpecies();
		// The target may come straight from a request body, so the species can lack a genus
		var genus = taxonomy != null ? taxonomy.getTaxonomyGenus() : null;
		var genusName = genus != null ? genus.getGenusName() : null;
		var preferredName = target.getPreferredName();

		// By doi
		if (StringUtils.isNotBlank(doi)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.doi().add(doi);
			collectMatches(filter, candidates);
		}

		// By faoInstituteNumber
		if (StringUtils.isNotBlank(faoInstituteNumber)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.site = new SiteFilter();
			filter.site.faoInstituteNumber = new StringFilter();
			filter.site.faoInstituteNumber.eq(faoInstituteNumber);
			filter._text = toSafeEsQuery(faoInstituteNumber);
			collectMatches(filter, candidates);
		}

		// By accession number
		if (StringUtils.isNotBlank(accessionNumber)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.accessionNumber = Set.of(accessionNumber);
			collectMatches(filter, candidates);
		}

		// By accession number part 1
		if (StringUtils.isNotBlank(accessionNumberPart1)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.accessionNumberPart1 = new StringFilter();
			filter.accessionNumberPart1.eq(accessionNumberPart1);
			filter._text = toSafeEsQuery(accessionNumberPart1);
			collectMatches(filter, candidates);
		}

		// By genus
		if (StringUtils.isNotBlank(genusName)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.taxonomySpecies = new TaxonomySpeciesFilter();
			filter.taxonomySpecies.taxonomyGenus = new TaxonomyGenusFilter();
			filter.taxonomySpecies.taxonomyGenus.genusName = new StringFilter();
			filter.taxonomySpecies.taxonomyGenus.genusName.eq(genusName);
			filter._text = toSafeEsQuery(genusName);
			collectMatches(filter, candidates);
		}

		// By preferred name
		if (StringUtils.isNotBlank(preferredName)) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates);
			filter.preferredName = new StringFilter();
			filter.preferredName.eq(preferredName);
			filter._text = toSafeEsQuery(preferredName);
			collectMatches(filter, candidates);
		}

		return candidates;
	}

	/**
	 * Runs one candidate query and appends its first page of results to {@code candidates}.
	 * A {@link SearchException} is logged as a warning and otherwise ignored (best effort).
	 *
	 * @param filter the filter to execute
	 * @param candidates the list the matches are appended to
	 */
	private void collectMatches(AccessionFilter filter, List<Accession> candidates) {
		try {
			LOG.info("Filtering for aliases {}", filter);
			Page<Accession> matches = accessionService.list(filter, PageRequest.of(0, MAX_MATCHES_PER_QUERY));
			candidates.addAll(matches.getContent());
		} catch (SearchException e) {
			LOG.warn(e.getMessage());
		}
	}

	/**
	 * Scores a candidate against the target and stores the result in {@code hit.score}.
	 *
	 * <p>Points added on top of the incoming {@code hit.score}:</p>
	 * <ul>
	 * <li>accession number (case-insensitive): 500 when equal, otherwise a fraction of 500
	 * from {@code stringsAndNumbersCompare}</li>
	 * <li>DOI equal: 500</li>
	 * <li>accession number part 1 equal (case-insensitive): 100</li>
	 * <li>genus name equal: 100; species name equal: 200</li>
	 * <li>preferred name: 100 when equal, otherwise a fraction of 100 from {@code similarityScore}</li>
	 * <li>FAO institute number of the site equal: 100</li>
	 * <li>100 for every plant name of the target found among the candidate's inventory names</li>
	 * </ul>
	 *
	 * @param target the accession duplicates are searched for
	 * @param hit the candidate hit; its {@code score} and {@code matches} are updated
	 * @return the new score of the hit
	 */
	@Override
	protected double scoreHit(Accession target, Hit<Accession> hit) {
		double score = hit.score;
		var candidate = hit.result;

		var targetAcceNumb = StringUtils.lowerCase(target.getAccessionNumber());
		// Read the number from the candidate: comparing the target with itself would award
		// the full 500 points to every candidate whenever the target has an accession number
		var candidateAcceNumb = StringUtils.lowerCase(candidate.getAccessionNumber());
		if (notNullEquals(hit.matches, candidateAcceNumb, targetAcceNumb)) {
			score += 500;
		} else if (candidateAcceNumb != null && targetAcceNumb != null) {
			// Fuzzy comparison only makes sense when both numbers exist
			score += stringsAndNumbersCompare(hit.matches, candidateAcceNumb, targetAcceNumb) * 500;
		}

		if (notNullEquals(hit.matches, candidate.getDoi(), target.getDoi())) {
			score += 500;
		}

		var targetAcceNumbPart1 = StringUtils.lowerCase(target.getAccessionNumberPart1());
		var candidateAcceNumbPart1 = StringUtils.lowerCase(candidate.getAccessionNumberPart1());
		if (notNullEquals(hit.matches, targetAcceNumbPart1, candidateAcceNumbPart1)) {
			score += 100;
		}

		var targetTaxonomy = target.getTaxonomySpecies();
		var candidateTaxonomy = candidate.getTaxonomySpecies();
		if (candidateTaxonomy != null && targetTaxonomy != null) {
			var targetGenus = targetTaxonomy.getTaxonomyGenus();
			var candidateGenus = candidateTaxonomy.getTaxonomyGenus();
			// Either species may have no genus assigned (e.g. an unsaved target)
			if (targetGenus != null && candidateGenus != null
					&& notNullEquals(hit.matches, candidateGenus.getGenusName(), targetGenus.getGenusName())) {
				score += 100;
			}
			if (notNullEquals(hit.matches, candidateTaxonomy.getSpeciesName(), targetTaxonomy.getSpeciesName())) {
				score += 200;
			}
		}

		if (notNullEquals(hit.matches, candidate.getPreferredName(), target.getPreferredName())) {
			score += 100;
		} else {
			score += similarityScore(hit.matches, candidate.getPreferredName(), target.getPreferredName()) * 100;
		}

		var targetInstitute = target.getSite();
		var candidateInstitute = candidate.getSite();
		if (targetInstitute != null && candidateInstitute != null) {
			if (notNullEquals(hit.matches, candidateInstitute.getFaoInstituteNumber(), targetInstitute.getFaoInstituteNumber())) {
				score += 100;
			}
		}

		var targetInvNames = target.getNames();
		if (CollectionUtils.isNotEmpty(targetInvNames)) {
			var candidateInventories = candidate.getInventories();
			if (CollectionUtils.isNotEmpty(candidateInventories)) {
				// All plant names registered on any inventory of the candidate
				var candidatePlantNames = candidateInventories.stream().map(Inventory::getNames)
					.filter(Objects::nonNull)
					.flatMap(Collection::stream)
					.map(AccessionInvName::getPlantName)
					.filter(Objects::nonNull)
					.collect(Collectors.toSet());

				// Plain loop instead of Stream.peek: recording the match is a required side effect
				for (AccessionInvName targetName : targetInvNames) {
					var plantName = targetName.getPlantName();
					if (plantName != null && candidatePlantNames.contains(plantName)) {
						hit.matches.add(plantName);
						score += 100;
					}
				}
			}
		}

		hit.score = score;
		return score;
	}

	/**
	 * Creates the base filter shared by all candidate queries: it excludes the target itself,
	 * the explicitly excluded ids and the candidates that were already found.
	 *
	 * @param target the accession duplicates are searched for
	 * @param excludedById ids that must not be returned, may be {@code null} or empty
	 * @param candidates candidates found so far, may be empty
	 * @return a new filter; its {@code NOT} part is only set when there is something to exclude
	 */
	private AccessionFilter getCandidatesFilter(Accession target, Collection<Long> excludedById, List<Accession> candidates) {
		AccessionFilter filter = new AccessionFilter();
		if (target.getId() != null) {
			filter.NOT = new AccessionFilter();
			filter.NOT.id().add(target.getId()); // Not this
		}
		if (! CollectionUtils.isEmpty(excludedById)) {
			if (filter.NOT == null) {
				filter.NOT = new AccessionFilter();
			}
			filter.NOT.id().addAll(excludedById);
		}
		if (! CollectionUtils.isEmpty(candidates)) {
			if (filter.NOT == null) {
				filter.NOT = new AccessionFilter();
			}
			filter.NOT.id().addAll(candidates.stream().map(Accession::getId).collect(Collectors.toSet())); // Not already found
		}
		return filter;
	}
}
......@@ -5,6 +5,7 @@ import org.gringlobal.component.elastic.ElasticReindexProcessor;
import org.gringlobal.component.elastic.FirehoseReindexListener;
import org.gringlobal.service.ElasticsearchService;
import org.gringlobal.test.config.TestElasticsearchConfig;
import org.gringlobal.worker.dupe.AccessionDuplicateFinder;
import org.gringlobal.worker.dupe.CooperatorDuplicateFinder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
......@@ -38,6 +39,11 @@ public abstract class AbstractElasticServicesTest extends AbstractServicesTest {
public CooperatorDuplicateFinder cooperatorDuplicateFinder() {
return new CooperatorDuplicateFinder();
}
/**
 * Declares the {@link AccessionDuplicateFinder} bean.
 *
 * @return a newly constructed finder instance
 */
@Bean
public AccessionDuplicateFinder accessionDuplicateFinder() {
	final AccessionDuplicateFinder finder = new AccessionDuplicateFinder();
	return finder;
}
}
......
......@@ -18,6 +18,7 @@ package org.gringlobal.test.service;
import static org.hamcrest.MatcherAssert.*;
import static org.hamcrest.Matchers.*;
import java.util.List;
import java.util.Set;
import org.gringlobal.custom.elasticsearch.SearchException;
......@@ -30,6 +31,7 @@ import org.gringlobal.service.filter.AccessionSourceFilter;
import org.gringlobal.service.filter.CooperatorFilter;
import org.gringlobal.service.filter.InventoryFilter;
import org.gringlobal.spring.TransactionHelper;
import org.gringlobal.worker.dupe.AccessionDuplicateFinder;
import org.junit.After;
import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;
......@@ -47,6 +49,8 @@ public class AccessionSearchTest extends AbstractElasticServicesTest {
private AccessionService accessionService;
@Autowired
private CooperatorService cooperatorService;
@Autowired
private AccessionDuplicateFinder accessionDuplicateFinder;
@After
@Transactional
......@@ -390,4 +394,87 @@ public class AccessionSearchTest extends AbstractElasticServicesTest {
list = accessionService.list(accessionFilter, PageRequest.of(0, 2));
assertThat(list.getContent().size(), is(1));
}
/**
 * Creates two accessions, waits for them to be indexed and then calls
 * {@code accessionDuplicateFinder.findSimilar} with several unsaved accessions,
 * checking the number of hits and their scores for each search field.
 */
@Test
public void findSimilarForUnsavedTest() throws Exception {
var taxonomy = addTaxonomySpeciesToDB();
// Accession A: number part 1 "TMe", site IITA_SITE, DOI 10.18730/M3YR2
Accession a = new Accession();
a.setAccessionNumberPart1("TMe");
a.setIsBackedUp(TRUE);
a.setIsCore(TRUE);
a.setIsWebVisible(TRUE);
a.setStatusCode(ACCESSION_STATUS_CODE);
a.setTaxonomySpecies(taxonomy);
a.setSite(IITA_SITE);
a.setBackupLocation1Site(IITA_SITE);
a.setBackupLocation2Site(CIP_SITE);
a.setDoi("10.18730/M3YR2");
// Accession B: number "TGm" + 5, site DEFAULT_SITE, DOI 10.18730/M3YV5, same taxonomy as A
Accession b = new Accession();
b.setAccessionNumberPart1("TGm");
b.setAccessionNumberPart2(5L);
b.setIsBackedUp(TRUE);
b.setIsCore(TRUE);
b.setIsWebVisible(TRUE);
b.setStatusCode(ACCESSION_STATUS_CODE);
b.setTaxonomySpecies(taxonomy);
b.setSite(DEFAULT_SITE);
b.setBackupLocation1Site(IITA_SITE);
b.setBackupLocation2Site(CIP_SITE);
b.setDoi("10.18730/M3YV5");
Accession savedAccessionA = accessionService.create(a);
assertThat(savedAccessionA, notNullValue());
Accession savedAccessionB = accessionService.create(b);
assertThat(savedAccessionB, notNullValue());
// presumably blocks until the index reports 2 Accession documents — TODO confirm
elasticsearchService.waitForCount(Accession.class, null, 2);
// Scenario 1: only the DOI of B is set -> exactly B is found; 500 matches the DOI weight in scoreHit
Accession unsavedAccession = new Accession();
unsavedAccession.setDoi("10.18730/M3YV5");
var similar = accessionDuplicateFinder.findSimilar(unsavedAccession);
assertThat(similar, hasSize(1));
// NOTE(review): relies on create(...) having set the id on the instance "b" itself;
// savedAccessionB.getId() would not depend on that — confirm
assertThat(similar.get(0).result.getId(), is(b.getId()));
assertThat(similar.get(0).score, is(500d));
// Scenario 2: DOI of B plus the shared taxonomy -> both accessions are found.
// 800 and 300 line up with DOI (500) + genus (100) + species (200) and genus + species alone
unsavedAccession.setTaxonomySpecies(taxonomy);
similar = accessionDuplicateFinder.findSimilar(unsavedAccession);
assertThat(similar, hasSize(2));
assertThat(similar.get(0).score, is(800d));
assertThat(similar.get(1).score, is(300d));
// Scenario 3: only a site with FAO institute number "NGA039" -> one hit scored 100.
// Which accession is hit is not asserted; presumably the one whose site carries that number
unsavedAccession = new Accession();
var testSite = new Site();
testSite.setFaoInstituteNumber("NGA039");
unsavedAccession.setSite(testSite);
similar = accessionDuplicateFinder.findSimilar(unsavedAccession);
assertThat(similar, hasSize(1));
assertThat(similar.get(0).score, is(100d));
// Scenario 4: only accession number part 1 "TGm" -> B is found with the part-1 weight of 100
unsavedAccession = new Accession();
unsavedAccession.setAccessionNumberPart1("TGm");
similar = accessionDuplicateFinder.findSimilar(unsavedAccession);
assertThat(similar, hasSize(1));
assertThat(similar.get(0).result.getId(), is(b.getId()));
assertThat(similar.get(0).score, is(100d));
// Scenario 5: DOI of A plus an inventory name "TMe" -> one hit scored 600
// (500 for the DOI, presumably 100 for the plant name match).
// NOTE(review): "name" is only constructed here and not saved through a service in this
// test; how the candidate's inventory ends up carrying it is not visible — confirm
Inventory inventory = addInventoryToDB(a, INVENTORY_NUMBER_PART1_DEFAULT, null, null);
AccessionInvName name = new AccessionInvName();
name.setPlantName("TMe");
name.setInventory(inventory);
name.setCategoryCode("SITE");
unsavedAccession = new Accession();
unsavedAccession.setNames(List.of(name));
unsavedAccession.setDoi("10.18730/M3YR2");
similar = accessionDuplicateFinder.findSimilar(unsavedAccession);
assertThat(similar, hasSize(1));
assertThat(similar.get(0).score, is(600d));
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment