Commit 6f6e7949 authored by Maxym Borodenko's avatar Maxym Borodenko Committed by Viacheslav Pavlov
Browse files

Duplicates: Find similar Cooperators

Brought abstract duplicate finder from Genesys;
Included taxonomychecker to the pom.
parent 2810b7ba
......@@ -959,6 +959,11 @@
<artifactId>grin-taxonomy-reader</artifactId>
<version>3.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.genesys-pgr</groupId>
<artifactId>taxonomychecker</artifactId>
<version>3.0-SNAPSHOT</version>
</dependency>
</dependencies>
<repositories>
......
......@@ -16,6 +16,7 @@
package org.gringlobal.api.v1.impl;
import java.io.IOException;
import java.util.List;
import org.genesys.blocks.auditlog.model.AuditLog;
import org.genesys.blocks.auditlog.model.filters.AuditLogFilter;
......@@ -29,8 +30,11 @@ import org.gringlobal.model.Cooperator;
import org.gringlobal.model.QCooperator;
import org.gringlobal.service.CooperatorService;
import org.gringlobal.service.filter.CooperatorFilter;
import org.gringlobal.service.worker.dupe.CooperatorDuplicateFinder;
import org.gringlobal.service.worker.dupe.DuplicateFinder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.http.MediaType;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
......@@ -56,6 +60,9 @@ public class CooperatorController extends FilteredCRUDController<Cooperator, Coo
@Autowired
private AuditTrailService auditService;
@Autowired
private CooperatorDuplicateFinder duplicateFinder;
@Override
protected OrderSpecifier<?>[] defaultSort() {
return new OrderSpecifier[] { QCooperator.cooperator.id.asc() };
......@@ -82,6 +89,18 @@ public class CooperatorController extends FilteredCRUDController<Cooperator, Coo
return auditService.listAuditLogs(filter, page.toPageRequest(100));
}
/**
 * Look up the cooperator with the given ID and return cooperators that resemble it.
 *
 * @param id the target cooperator ID
 * @return hits for cooperators similar to the target, as produced by the duplicate finder
 */
@GetMapping(value = "/similar/{id:\\d+}", produces = MediaType.APPLICATION_JSON_VALUE)
public List<DuplicateFinder.Hit<Cooperator>> getSimilarCooperatorForID(@PathVariable("id") final long id) {
    // Load the target first, then hand it to the finder
    final Cooperator target = crudService.get(id);
    final List<DuplicateFinder.Hit<Cooperator>> similar = duplicateFinder.findSimilar(target);
    return similar;
}
@Override
public Cooperator create(@RequestBody Cooperator entity) {
return super.create(entity);
......
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.service.worker.dupe;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.gringlobal.custom.elasticsearch.SearchException;
import org.gringlobal.model.Cooperator;
import org.gringlobal.service.CooperatorService;
import org.gringlobal.service.filter.CooperatorFilter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;
/**
 * Cooperator Duplicate Finder.
 *
 * <p>Candidates are fetched through {@link CooperatorService#list} using a text query built from
 * the target's title, first name and last name. Every candidate is then scored field by field
 * against the target in {@link #scoreHit}.</p>
 */
@Component
public class CooperatorDuplicateFinder extends DuplicateFinder<Cooperator> {

    @Autowired
    private CooperatorService cooperatorService;

    /**
     * @return the score that is treated as 100% when hits are rated
     */
    @Override
    protected double getBestScoreThreshold() {
        return 1000d;
    }

    /**
     * Fetch up to 20 cooperators matching the target's title and names.
     *
     * <p>No search is made when both the first and the last name of the target are blank. A failed
     * search is logged and yields an empty list.</p>
     *
     * @param target the cooperator to find duplicates of
     * @param excludedById IDs of cooperators that must not be returned; may be null or empty
     * @return a new list of candidates, possibly empty
     */
    @Override
    protected List<Cooperator> getCandidates(Cooperator target, Collection<Long> excludedById) {
        assert (target != null);
        LOG.info("Searching for duplicates of {}", target.toString());

        List<Cooperator> candidates = new ArrayList<>(20);
        String firstName = target.getFirstName();
        String lastName = target.getLastName();

        if (StringUtils.isNotBlank(firstName) || StringUtils.isNotBlank(lastName)) {
            try {
                CooperatorFilter filter = new CooperatorFilter();
                // exclude target
                filter.NOT = new CooperatorFilter();
                filter.NOT.id().add(target.getId()); // Not this
                if (! CollectionUtils.isEmpty(excludedById)) {
                    filter.NOT.id().addAll(excludedById);
                }
                filter._text = toSafeEsQuery(StringUtils.defaultIfBlank(target.getTitle(), "") + " " + StringUtils.defaultIfBlank(firstName, "") + " " + StringUtils.defaultIfBlank(lastName, ""));
                LOG.info("Filtering for {}", filter.toString());

                Page<Cooperator> matches = cooperatorService.list(filter, PageRequest.of(0, 20));
                candidates.addAll(matches.getContent());
            } catch (SearchException e) {
                // Best effort: report the failure with its stack trace and fall back to no candidates
                LOG.warn("Searching for cooperator candidates failed: {}", e.getMessage(), e);
            }
        }
        return candidates;
    }

    /**
     * Score hit.
     *
     * <p>Starts from the current {@code hit.score} and adds a weight for every field that matches
     * the target. Matching values are recorded in {@code hit.matches}. The final score is stored
     * in {@code hit.score} and returned.</p>
     *
     * @param target the target
     * @param hit the hit
     * @return the updated score of the hit
     */
    @Override
    protected double scoreHit(Cooperator target, Hit<Cooperator> hit) {
        var candidate = hit.result;
        var matches = hit.matches;
        var score = hit.score;

        // could be Max | Maxim | Maxym | Maksim | Maksym
        score += exactOrSimilar(matches, candidate.getFirstName(), target.getFirstName(), 250);
        score += exactOnly(matches, candidate.getLastName(), target.getLastName(), 300);
        score += exactOnly(matches, candidate.getEmail(), target.getEmail(), 300);
        score += exactOnly(matches, candidate.getSecondaryEmail(), target.getSecondaryEmail(), 100);
        score += exactOnly(matches, candidate.getTitle(), target.getTitle(), 10);
        // categoryCode is a codeValue
        score += exactOnly(matches, candidate.getCategoryCode(), target.getCategoryCode(), 20);
        // disciplineCode is a codeValue
        score += exactOnly(matches, candidate.getDisciplineCode(), target.getDisciplineCode(), 10);
        score += exactOrSimilar(matches, candidate.getOrganization(), target.getOrganization(), 50);
        score += exactOnly(matches, candidate.getOrganizationAbbrev(), target.getOrganizationAbbrev(), 10);
        score += exactOnly(matches, candidate.getJob(), target.getJob(), 10);
        // Notes are only ever compared by similarity
        score += similarityScore(matches, candidate.getNote(), target.getNote()) * 10;

        /*
         * Compare address data
         */
        score += exactOrSimilar(matches, candidate.getAddressLine1(), target.getAddressLine1(), 20);
        score += exactOrSimilar(matches, candidate.getAddressLine2(), target.getAddressLine2(), 20);
        score += exactOrSimilar(matches, candidate.getAddressLine3(), target.getAddressLine3(), 20);
        // A similar city must never be worth more than an identical one, so both use weight 20
        score += exactOrSimilar(matches, candidate.getCity(), target.getCity(), 20);
        // organizationRegionCode is a codeValue
        score += exactOnly(matches, candidate.getOrganizationRegionCode(), target.getOrganizationRegionCode(), 10);
        score += exactOnly(matches, candidate.getPostalIndex(), target.getPostalIndex(), 10);

        hit.score = score;
        return score;
    }

    /** Returns {@code weight} when both values are non-empty and equal ignoring case, otherwise 0. */
    private double exactOnly(Collection<String> matches, String candidateValue, String targetValue, double weight) {
        return notNullEquals(matches, candidateValue, targetValue) ? weight : 0;
    }

    /** Returns {@code weight} for an exact match, otherwise the similarity of the two values scaled by {@code weight}. */
    private double exactOrSimilar(Collection<String> matches, String candidateValue, String targetValue, double weight) {
        if (notNullEquals(matches, candidateValue, targetValue)) {
            return weight;
        }
        return similarityScore(matches, candidateValue, targetValue) * weight;
    }
}
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gringlobal.service.worker.dupe;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.DoubleAdder;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.fasterxml.jackson.annotation.JsonUnwrapped;
import com.fasterxml.jackson.annotation.JsonValue;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.EntityId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.transaction.annotation.Transactional;
import org.genesys.taxonomy.checker.StringSimilarity;
/**
* Duplicate Finder base.
*/
public abstract class DuplicateFinder<T extends EntityId> {
protected final Logger LOG = LoggerFactory.getLogger(getClass());
/**
 * Rating of a hit, from {@code POOR} (1) up to {@code BEST} (4). The numeric rating is the JSON
 * value of each constant.
 */
public enum HitRating {
    BEST(4),
    GOOD(3),
    OK(2),
    POOR(1);

    /** Numeric value returned by {@link #getRating()}. */
    private final int rating;

    HitRating(final int rating) {
        this.rating = rating;
    }

    /**
     * @return the numeric rating of this constant
     */
    @JsonValue
    public int getRating() {
        return this.rating;
    }
}
/**
 * Pairs a source entity with the hits found for it.
 *
 * @param <T> the entity type
 */
public static class SimilarityHit<T> {
    /** The entity that similar entries were searched for. */
    public T source;
    /** The hits found for {@link #source}. */
    public List<Hit<T>> results;

    /**
     * @param source the entity that was searched for
     * @param results the hits found for the source
     */
    public SimilarityHit(T source, List<Hit<T>> results) {
        this.results = results;
        this.source = source;
    }
}
/**
 * A candidate entity together with its score, rating and the values that matched.
 *
 * @param <T> the entity type
 */
public static class Hit<T> {
    /** The candidate entity; its properties are unwrapped into the hit when serialized. */
    @JsonUnwrapped
    public T result;
    /** Rating of this hit; not set by the constructor. */
    public HitRating hitRating;
    /** Similarity score, the higher the better. */
    public double score;
    /** Values that were found to match. */
    public List<String> matches = new ArrayList<>();

    /**
     * @param result the candidate entity
     * @param score the initial score; {@code null} is treated as 0
     */
    public Hit(T result, Double score) {
        this.result = result;
        if (score == null) {
            this.score = 0;
        } else {
            this.score = score.doubleValue();
        }
    }
}
/**
 * Find similar entities for each of the targets.
 *
 * <p>NOTE(review): this method is {@code final}; if the bean is wrapped in a class-based proxy
 * the proxy cannot override it, so the annotations below may not be applied -- confirm.</p>
 *
 * @param targets the entities to find duplicates of
 * @return one {@link SimilarityHit} per target, in the order of {@code targets}
 */
@Transactional(readOnly = true)
@PreAuthorize("hasAuthority('GROUP_ADMINS')")
public final List<SimilarityHit<T>> findSimilar(List<T> targets) {
    assert (targets != null);
    // Routine progress message: logged at INFO like the other finder methods, not WARN
    LOG.info("Finding duplicates for {} targets", targets.size());
    return targets.stream().map((target) -> new SimilarityHit<T>(target, findSimilar(target))).collect(Collectors.toList());
}
/**
 * Find entities similar to the target, without excluding any candidate by ID.
 *
 * @param target the target
 * @return the hits found for the target
 */
@Transactional(readOnly = true)
public final List<Hit<T>> findSimilar(T target) {
    final Collection<Long> noExclusions = CollectionUtils.emptyCollection();
    return findSimilar(target, noExclusions);
}
/**
 * Find entities similar to the target, but excluding the ones with IDs listed in excludedById.
 *
 * <p>Candidates from {@link #getCandidates} are stripped of the target itself, of excluded IDs
 * and of repeated IDs. They are then scored with {@link #scoreHit}, sorted by descending score
 * and cut to the 20 best. Each remaining hit is rated by its score relative to
 * {@link #getBestScoreThreshold()}.</p>
 *
 * @param target the target
 * @param excludedById the list of candidate IDs to exclude from matching
 * @return at most 20 rated hits, highest score first
 */
@Transactional(readOnly = true)
public List<Hit<T>> findSimilar(T target, Collection<Long> excludedById) {
    assert (target != null);
    LOG.info("Searching for duplicates of {}", target.toString());

    // Work on a private copy: the list handed back by the subclass may be null, immutable or
    // shared, and it is pruned below
    List<T> provided = getCandidates(target, excludedById);
    List<T> candidates = provided == null ? new ArrayList<>() : new ArrayList<>(provided);

    if (target.getId() != null) {
        candidates.removeIf(candidate -> candidate.getId().equals(target.getId()));
    }
    // Remove excluded candidates by id
    if (CollectionUtils.isNotEmpty(excludedById)) {
        candidates.removeIf(candidate -> excludedById.contains(candidate.getId()));
    }
    LOG.info("Found {} potential hits", candidates.size());

    // Keep the first candidate for every ID and score it
    Set<Long> seenIds = new HashSet<>();
    List<Hit<T>> scoredHits = new ArrayList<>(candidates.size());
    for (T candidate : candidates) {
        if (seenIds.add(candidate.getId())) {
            Hit<T> hit = new Hit<T>(candidate, 0d);
            // The implementation stores the result in hit.score, which is what gets sorted on
            scoreHit(target, hit);
            scoredHits.add(hit);
        }
    }

    // Best first, then keep the top 20
    scoredHits.sort((a, b) -> Double.compare(b.score, a.score));
    List<Hit<T>> uniqueHits = new ArrayList<>(scoredHits.subList(0, Math.min(20, scoredHits.size())));

    // Rate against a fixed threshold supplied by the subclass rather than the best score found
    var bestScoreThreshold = getBestScoreThreshold();
    uniqueHits.forEach((hit) -> {
        var perc = hit.score / bestScoreThreshold;
        hit.hitRating = perc > 0.9 ? HitRating.BEST : perc > 0.7 ? HitRating.GOOD : perc > 0.4 ? HitRating.OK : HitRating.POOR;
    });

    LOG.info("Found {} duplicates of {}", uniqueHits.size(), target.toString());
    return uniqueHits;
}
/**
 * Gets the best score threshold.
 *
 * <p>{@code findSimilar} divides the score of every hit by this value and rates the hit by the
 * resulting ratio: above 0.9 is BEST, above 0.7 is GOOD, above 0.4 is OK and anything else is
 * POOR.</p>
 *
 * @return the best score threshold
 */
protected abstract double getBestScoreThreshold();
/**
 * Find all candidates that are potential matches for target.
 *
 * <p>{@code findSimilar} additionally drops the target itself, candidates with excluded IDs and
 * candidates with repeated IDs before scoring, so the result does not have to be filtered
 * perfectly.</p>
 *
 * @param target the target
 * @param excludedById the IDs of excluded entities
 * @return list of candidates
 */
protected abstract List<T> getCandidates(final T target, final Collection<Long> excludedById);
/** Runs of characters that are neither Unicode letters nor digits. */
private static final Pattern UNSAFE_ES_CHARS = Pattern.compile("[^\\p{L}\\d]+");

/**
 * Convert ES query to a safe ES query by replacing every run of characters that are neither
 * letters nor digits with a single " ". The result has no leading or trailing whitespace.
 *
 * @param rawEsQuery the raw ES search string, must not be null
 * @return the safe search string
 */
protected final String toSafeEsQuery(String rawEsQuery) {
    assert (rawEsQuery != null);
    // Replace first and trim last: trimming before the replacement left a stray space
    // wherever the input started or ended with punctuation
    return UNSAFE_ES_CHARS.matcher(rawEsQuery).replaceAll(" ").trim();
}
/**
 * Score the target against the Hit. Scoring should be transitive.
 *
 * <p>{@code findSimilar} ranks and rates hits by {@code hit.score} and does not use the value
 * returned here, so implementations must also store the score in {@code hit.score}.</p>
 *
 * @param target the target
 * @param hit the potential Match
 * @return similarity score, the higher the better
 */
protected abstract double scoreHit(T target, Hit<T> hit);
/**
 * Test whether two strings are both non-null, non-empty and equal ignoring case. On a match,
 * {@code b} is added to {@code matches}.
 *
 * @param matches collection that receives the matching value
 * @param a the first string
 * @param b the second string
 * @return {@code true} when the strings match, {@code false} otherwise
 */
protected final boolean notNullEquals(final Collection<String> matches, final String a, final String b) {
    final boolean bothPresent = a != null && b != null && !a.isEmpty() && !b.isEmpty();
    if (bothPresent && StringUtils.equalsIgnoreCase(a, b)) {
        matches.add(b);
        return true;
    }
    return false;
}
/**
 * similarityScore returns a string similarity value in the range [0, 1.0]
 * (where 1.0 is full match).
 *
 * <p>Both strings are lower-cased, then the Dice coefficient and the Levenshtein coefficient are
 * averaged. A null or empty argument gives 0.</p>
 *
 * @param original the original
 * @param candidate the candidate
 * @return the score between 0 and 1.0 where 0 is no similarity and 1.0 is full
 *         match
 */
protected final double similarityScore(String original, String candidate) {
    final boolean missingInput = original == null || candidate == null || original.length() == 0 || candidate.length() == 0;
    if (missingInput) {
        return 0;
    }
    final String left = original.toLowerCase();
    final String right = candidate.toLowerCase();
    var dice = StringSimilarity.diceCoefficientOptimized(left, right);
    var levenshtein = StringSimilarity.getLevenshteinCoefficient(left, right);
    var mean = (dice + levenshtein) / 2.0f;
    return mean;
}
/**
 * Compute the similarity of two strings and record the candidate as a match when the
 * similarity is above 0.7.
 *
 * @param matches collection that receives the candidate when it is similar enough
 * @param original the original
 * @param candidate the candidate
 * @return the similarity as returned by {@link #similarityScore(String, String)}
 */
protected final double similarityScore(final Collection<String> matches, final String original, final String candidate) {
    final double similarity = similarityScore(original, candidate);
    final boolean similarEnough = similarity > 0.7;
    if (similarEnough) {
        matches.add(candidate);
    }
    return similarity;
}
/**
 * Compare the letter and digit parts of two strings and record both strings as matches when
 * the result is 0.5 or more.
 *
 * @param matches collection that receives {@code a} and {@code b} on a sufficient match
 * @param a the first string
 * @param b the second string
 * @return the result of {@link #stringsAndNumbersCompare(String, String)}
 */
protected final double stringsAndNumbersCompare(final Collection<String> matches, String a, String b) {
    final double overlap = stringsAndNumbersCompare(a, b);
    final boolean goodEnough = overlap >= 0.5;
    if (goodEnough) {
        matches.add(a);
        matches.add(b);
    }
    return overlap;
}
// Tokenizer: group 1 captures a run of letters; group 2 captures a run of digits with any
// leading zeros left outside the group. Exactly one of the two groups is set per match.
private static final Pattern NUMBERS_AND_STRINGS = Pattern.compile("(\\p{L}+)|0*(\\d+)");
/**
 * Split input strings into sets consisting of parts of only digits and only
 * letters (in lower case). Compare the two sets.
 *
 * @param a the first string
 * @param b the second string
 * @return a value between 0 and 1; 0 when either string is blank
 */
protected final double stringsAndNumbersCompare(String a, String b) {
    final boolean bothPresent = StringUtils.isNotBlank(a) && StringUtils.isNotBlank(b);
    if (!bothPresent) {
        return 0;
    }
    final Set<Object> partsOfA = uniqueStringsAndNumbers(a);
    final Set<Object> partsOfB = uniqueStringsAndNumbers(b);
    return compareStringsAndNumbers(partsOfA, partsOfB);
}
/**
 * Compare text parts, strings separately from numbers.
 *
 * <p>Every element of both sets contributes a weight to the total: 18 for a String, 20 for
 * anything else. Elements that are also contained in the other set contribute the same weight
 * to the shared amount. The result is shared divided by total.</p>
 *
 * @param ma Set of Number | String
 * @param mb Set of Number | String
 * @return value in the range of 0 to 1; 0 when both sets are empty
 */
protected final static double compareStringsAndNumbers(Set<Object> ma, Set<Object> mb) {
    int total = 0;
    int shared = 0;

    for (Object part : ma) {
        final int weight = part instanceof String ? 18 : 20; // Strings are less important than numbers
        total += weight;
        if (mb.contains(part)) {
            shared += weight;
        }
    }
    for (Object part : mb) {
        final int weight = part instanceof String ? 18 : 20; // Strings are less important than numbers
        total += weight;
        if (ma.contains(part)) {
            shared += weight;
        }
    }

    if (total == 0) {
        return 0.0;
    }
    return (double) shared / (double) total;
}
// Per-instance cache of the token sets built by uniqueStringsAndNumbers, keyed by the input
// string. Holds at most 100 entries; an entry expires 10 seconds after it was written.
private final Cache<String, Set<Object>> uniqueStringsAndNumbersCache = CacheBuilder.newBuilder()
// size
.maximumSize(100)
// expiration
.expireAfterWrite(10, TimeUnit.SECONDS).build();
/**
 * Split a string into its unique letter and digit parts. Letter runs are returned as lower-case
 * Strings, digit runs as Long values. Results are cached by input string.
 *
 * @param a the string to split
 * @return the set of parts; an empty set when {@code a} is blank
 */
protected final Set<Object> uniqueStringsAndNumbers(final String a) {
    if (StringUtils.isBlank(a)) {
        return Set.of();
    }
    try {
        return uniqueStringsAndNumbersCache.get(a, () -> {
            final Set<Object> parts = new HashSet<>();
            final var tokens = NUMBERS_AND_STRINGS.matcher(a);
            while (tokens.find()) {
                final String letters = tokens.group(1);
                if (letters != null) {
                    parts.add(letters.toLowerCase());
                } else {
                    // type conversion
                    parts.add(Long.valueOf(Long.parseLong(tokens.group(2))));
                }
            }
            return parts;
        });
    } catch (ExecutionException e) {
        throw new RuntimeException("Something went wrong", e);
    }
}
// Per-instance cache of the token lists built by toStringsAndNumbers, keyed by the input
// string. Holds at most 100 entries; an entry expires 10 seconds after it was written.
private final Cache<String, List<Object>> stringsAndNumbersCache = CacheBuilder.newBuilder()
// size
.maximumSize(100)
// expiration
.expireAfterWrite(10, TimeUnit.SECONDS).build();
protected final List<Object> toStringsAndNumbers(final String a) {
if (StringUtils.isBlank(a)) {
return List.of();
}
try {
return stringsAndNumbersCache.get(a, () -> {
return NUMBERS_AND_STRINGS.matcher(a).results()
// type conversion