Commit 939f841b authored by Matija Obreza
Browse files

SGSV data import

parent ce00839a
......@@ -61,6 +61,7 @@ public class Accession implements java.io.Serializable {
private Boolean availability;
private Boolean mlsStatus;
private String genus;
private String dataSource;
private AllAccnames accessionNames;
private AllAcqBreeding accessionBreeding;
private AllAcqCollect accessionCollection;
......@@ -290,6 +291,15 @@ public class Accession implements java.io.Serializable {
this.svalbardData = svalbardData;
}
/**
 * Returns the value of the {@code dataSource} field, which is mapped to the
 * "dataSource" column (declared length 45).
 */
@Column(name = "dataSource", length = 45)
public String getDataSource() {
return dataSource;
}
/**
 * Stores the given value in the {@code dataSource} field.
 *
 * @param dataSource the new value; no validation is performed here
 */
public void setDataSource(String dataSource) {
this.dataSource = dataSource;
}
@Override
public String toString() {
return MessageFormat.format("Accession id={0,number,#} A={3} inst={1} genus={2}", id, institute.getCode(), genus, accNumbHi);
......
package org.genesys2.rest.common.model.impl;
/**
 * Identifies an accession by three values: holding institute, accession name
 * and genus.
 */
public interface AccessionIdentifier2 {
public interface AccessionIdentifier3 {
// Holding institute of the accession (implementations in this change return the institute code)
String getHoldingInstitute();
// Accession name (implementations in this change return the accession number)
String getAccessionName();
// Genus of the accession
String getGenus();
}
......@@ -84,6 +84,8 @@ public interface AccessionRepository extends JpaRepository<Accession, Long> {
/**
 * Pages through the accessions whose {@code taxonomy} is one of the given
 * taxonomies (JPQL {@code in} clause).
 */
@Query("select a from Accession a where a.taxonomy in ( ?1 )")
Page<Accession> findByTaxonomy(Collection<Taxonomy> taxonomies, Pageable pageable);
Accession findByInstituteCodeAndAccessionName(String holdingInstitute, String accessionName);
// Accession findByInstituteCodeAndAccessionName(String holdingInstitute, String accessionName);
/**
 * Looks up a single accession by institute code, accession name and genus.
 * NOTE(review): no explicit query is declared, so this is presumably resolved
 * from the method name by Spring Data; confirm the property names match.
 */
Accession findByInstituteCodeAndAccessionNameAndGenus(String holdingInstitute, String accessionName, String genus);
}
......@@ -25,4 +25,6 @@ public interface TaxonomyRepository extends JpaRepository<Taxonomy, Long> {
// Single taxonomy for a genus + species pair.
// NOTE(review): presumably a name-derived Spring Data query; confirm.
Taxonomy getByGenusAndSpecies(String genus, String species);
// All taxonomies recorded for a genus.
List<Taxonomy> findByGenus(String genus);
// Single taxonomy looked up by its full taxon name.
Taxonomy getByTaxonName(String fullTaxa);
}
......@@ -31,7 +31,7 @@ import org.genesys2.rest.common.model.genesys.Metadata;
import org.genesys2.rest.common.model.genesys.Method;
import org.genesys2.rest.common.model.genesys.SvalbardData;
import org.genesys2.rest.common.model.genesys.Taxonomy;
import org.genesys2.rest.common.model.impl.AccessionIdentifier2;
import org.genesys2.rest.common.model.impl.AccessionIdentifier3;
import org.genesys2.rest.common.model.impl.Country;
import org.genesys2.rest.common.model.impl.Crop;
import org.genesys2.rest.common.model.impl.FaoInstitute;
......@@ -84,7 +84,7 @@ public interface GenesysService {
Page<Accession> listAccessions(Collection<Long> accessionIds, Pageable pageable);
/**
 * Resolves each identifier to an accession. The implementation shown in this
 * change adds one result per identifier, in input order, and keeps
 * {@code null} entries for identifiers that match no accession.
 */
List<Accession> listAccessions(List<? extends AccessionIdentifier2> accns);
List<Accession> listAccessions(List<? extends AccessionIdentifier3> accns);
Page<Object[]> statisticsGenusByInstitute(FaoInstitute faoInstitute, Pageable pageable);
......
......@@ -23,4 +23,6 @@ public interface TaxonomyService {
// Looks up the taxonomy for a genus + species pair.
Taxonomy get(String genus, String species);
// Returns the taxonomy registered under fullTaxa; the implementation in this
// change creates it when missing and returns null when genus is null.
Taxonomy ensureTaxonomy(String genus, String species, String fullTaxa);
}
......@@ -35,7 +35,7 @@ import org.genesys2.rest.common.model.genesys.Method;
import org.genesys2.rest.common.model.genesys.Parameter;
import org.genesys2.rest.common.model.genesys.SvalbardData;
import org.genesys2.rest.common.model.genesys.Taxonomy;
import org.genesys2.rest.common.model.impl.AccessionIdentifier2;
import org.genesys2.rest.common.model.impl.AccessionIdentifier3;
import org.genesys2.rest.common.model.impl.Country;
import org.genesys2.rest.common.model.impl.Crop;
import org.genesys2.rest.common.model.impl.FaoInstitute;
......@@ -149,17 +149,17 @@ public class GenesysServiceImpl implements GenesysService, TraitService {
}
@Override
public List<Accession> listAccessions(List<? extends AccessionIdentifier2> accns) {
public List<Accession> listAccessions(List<? extends AccessionIdentifier3> accns) {
List<Accession> result = new ArrayList<Accession>(accns.size());
for (AccessionIdentifier2 aid2 : accns) {
Accession accn = accessionRepository.findByInstituteCodeAndAccessionName(aid2.getHoldingInstitute(), aid2.getAccessionName());
for (AccessionIdentifier3 aid3 : accns) {
Accession accn = accessionRepository.findByInstituteCodeAndAccessionNameAndGenus(aid3.getHoldingInstitute(), aid3.getAccessionName(), aid3.getGenus());
// Including null's
result.add(accn);
if (accn == null) {
FaoInstitute inst = instituteRepository.findByCode(aid2.getHoldingInstitute());
FaoInstitute inst = instituteRepository.findByCode(aid3.getHoldingInstitute());
if (inst == null)
// Only log full miss
LOG.debug("No accession " + aid2 + " in " + inst);
LOG.debug("No accession " + aid3 + " in " + inst);
}
}
return result;
......
/**
* Copyright 2013 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.genesys2.rest.common.service.impl;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import javax.annotation.PreDestroy;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.genesys2.rest.common.model.genesys.Accession;
import org.genesys2.rest.common.model.impl.AccessionIdentifier3;
import org.genesys2.rest.common.model.impl.FaoInstitute;
import org.genesys2.rest.common.service.GenesysService;
import org.genesys2.rest.common.service.InstituteService;
import org.genesys2.rest.common.service.TaxonomyService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component;
import au.com.bytecode.opencsv.CSVReader;
@Component
public class SGSVInsertMissing {
// Remote location of the tab-separated SGSV template file used by importMissingSGSV()
private static final String SGSV_DOWNLOAD_URL = "http://www.nordgen.org/sgsv/download.php?file=/scope/sgsv/files/sgsv_templates.tab";
public static final Log LOG = LogFactory.getLog(SGSVInsertMissing.class);
// Number of input rows handed to the worker pool per task
private static final int BATCH_SIZE = 50;
// Worker count; also used as the queue-length threshold at which the reader pauses
private static final int nThreads = Runtime.getRuntime().availableProcessors();
// Fixed-size pool that processes row batches asynchronously
private final ThreadPoolExecutor threadPool = (ThreadPoolExecutor) Executors.newFixedThreadPool(nThreads);
// Only rows whose institute code (column 1) appears in this list are considered for insertion
private static final String[] institutes = { "AUS039", "BDI003", "BDI004", "BDI005", "BRA001", "BRA008", "CAN004", "CHL002", "CIV039", "CRI001", "ECU076",
"GBR072", "GEO028", "GRC035", "IDN179", "ISR003", "ITA411", "KEN015", "KEN023", "KEN045", "KEN053", "KEN055", "KOR043", "MLI002", "MLI003",
"MLI219", "MMR003", "MNG030", "NGA010", "PAK001", "PER002", "PHL129", "PRK013", "SDN034", "THA214", "TJK027", "TWN006", "UGA031", "UKR001",
"UZB006", "ZMB030" };
// Directory containing locally stored SGSV files, injected from the "download.files.dir" property
@Value("${download.files.dir}")
String filesPath;
@Autowired
private GenesysService genesysService;
@Autowired
private InstituteService instituteService;
@Autowired
private TaxonomyService taxonomyService;
// Institute lookups keyed by institute code; accessed only through the synchronized getFromCache()
private Map<String, FaoInstitute> faoInstituteCache = new HashMap<String, FaoInstitute>();
/**
 * Downloads the SGSV template file from {@link #SGSV_DOWNLOAD_URL} and passes
 * the response body to {@link #importSGSVStream(InputStream, String)}.
 * <p>
 * A failed request or a response without a body is logged and the import is
 * skipped. A failure while reading the body aborts the request. The HTTP
 * client's connections are released in all cases.
 */
@PreAuthorize("hasRole('ADMINISTRATOR')")
public void importMissingSGSV() {
    LOG.warn("Importing SGSV data from " + SGSV_DOWNLOAD_URL);
    final HttpClient httpclient = new DefaultHttpClient();
    final HttpGet httpget = new HttpGet(SGSV_DOWNLOAD_URL);
    try {
        final HttpResponse response;
        try {
            response = httpclient.execute(httpget);
        } catch (final IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            // Without a response there is nothing to import.
            LOG.error(e.getMessage(), e);
            return;
        }

        LOG.debug(response.getStatusLine());
        for (final Header header : response.getAllHeaders()) {
            LOG.debug(header);
        }

        // Get hold of the response entity; it is null when the response has no body
        final HttpEntity entity = response.getEntity();
        if (entity == null) {
            LOG.warn("No content received from " + SGSV_DOWNLOAD_URL);
            return;
        }
        LOG.debug(entity.getContentType() + " " + entity.getContentLength());

        try {
            importSGSVStream(entity.getContent(), SGSV_DOWNLOAD_URL);
        } catch (final Throwable e) {
            // Best effort: record why the import stopped, then drop the connection
            LOG.error("SGSV import aborted: " + e.getMessage(), e);
            httpget.abort();
        }
    } finally {
        // The body has been fully read (or abandoned) by now; free pooled connections
        httpclient.getConnectionManager().shutdown();
    }
}
/**
 * Imports SGSV data from a tab file stored under {@code filesPath}.
 * I/O problems (including a missing file) are logged and not propagated.
 */
@PreAuthorize("hasRole('ADMINISTRATOR')")
public void importMissingSGSVFile() {
    // Known snapshots: sgsv_templates_20130610.tab, sgsv_templates_20130502.tab
    final File tabFile = new File(filesPath, "sgsv_templates_20130610.tab");
    FileInputStream input = null;
    try {
        input = new FileInputStream(tabFile);
        LOG.warn("Importing SGSV data from " + tabFile.getAbsolutePath());
        importSGSVStream(input, tabFile.getAbsolutePath());
    } catch (final IOException e) {
        // FileNotFoundException is an IOException and is reported the same way
        LOG.error(e.getMessage(), e);
    } finally {
        IOUtils.closeQuietly(input);
    }
}
/**
 * Reads a tab-separated SGSV file and hands its rows to the worker pool in
 * batches of {@code BATCH_SIZE}.
 * <p>
 * The first row must contain exactly 30 header cells; otherwise (or when the
 * input is empty) a warning is logged and nothing is imported. Each data cell
 * is trimmed and the placeholders "null", "&lt;null&gt;" and blank values
 * become {@code null}. The reader, and with it the stream, is closed before
 * returning.
 * <p>
 * Column indexes noted for this file format: 0 SGSV_ID, 1 INSTCODE,
 * 2 SGSV_BOXNO, 3 COLLNAME, 4 ACCENUMB, 5 FULL_SCINAME, 7 QTY,
 * 8 regeneration_month_and_year, 9 OTHERNUMB, 10 provider_institute_code,
 * 12 ORIGCTY, 16 GENUS, 17 species_epithet / SPECIES, 19 TAXON_NAME,
 * 20 SGSV_DEPOSIT_DATE, 23 SGSV_BOXID, 24 SGSV_TAXONID, 25 taxon_authority,
 * 26 infraspesific_epithet, 27 VERNACULAR_NAME, 29 SGSV_GENUSID.
 *
 * @param str    stream with the tab-separated data
 * @param source description of where the data comes from, used in log messages
 * @throws IOException wrapping any failure raised while reading or dispatching rows
 */
private void importSGSVStream(final InputStream str, final String source) throws IOException {
    int counter = 0;
    CSVReader reader = null;
    try {
        reader = new CSVReader(new BufferedReader(new InputStreamReader(str)), '\t', '"', false);

        final String[] headers = reader.readNext();
        if (headers == null) {
            // Empty input: report it instead of failing with a NullPointerException
            LOG.warn("No header row found in " + source);
            return;
        }
        LOG.debug("Headers: " + headers.length);
        if (headers.length != 30) {
            LOG.warn("Expected 30 headers, got " + headers.length);
            return;
        }

        final List<String[]> bulk = new ArrayList<String[]>(BATCH_SIZE);
        String[] line = null;
        while ((line = reader.readNext()) != null) {
            if (counter % 1000 == 0) {
                LOG.info(counter + ": " + ArrayUtils.toString(line));
            }

            // Clean up
            for (int i = 0; i < line.length; i++) {
                line[i] = cleanCell(line[i]);
            }

            bulk.add(line);
            counter++;

            if (counter % BATCH_SIZE == 0) {
                workIt(bulk);
                bulk.clear();
            }
        }

        // Dispatch the remainder, but do not schedule an empty task
        if (!bulk.isEmpty()) {
            workIt(bulk);
            bulk.clear();
        }
    } catch (final Throwable e) {
        LOG.error(e.getMessage(), e);
        throw new IOException(e);
    } finally {
        IOUtils.closeQuietly(reader);
    }
    LOG.info("Done importing SGSV data. Imported: " + counter);
}

/** Trims a cell and maps "null", "&lt;null&gt;" and blank values to {@code null}. */
private static String cleanCell(final String raw) {
    if (raw == null) {
        return null;
    }
    final String trimmed = raw.trim();
    if (trimmed.equals("null") || trimmed.equals("<null>") || StringUtils.isBlank(trimmed)) {
        return null;
    }
    return trimmed;
}
/**
 * Schedules one batch of rows for asynchronous processing and clears the
 * caller's list.
 * <p>
 * The calling thread pauses while the pool's queue holds more than
 * {@code nThreads} tasks. If it is interrupted while pausing, the interrupt
 * flag is restored and the batch is submitted without further waiting.
 *
 * @param bulk rows to process; copied, then cleared
 */
private void workIt(final List<String[]> bulk) {
    // Need copy! The task runs later, while the caller keeps reusing 'bulk'
    final ArrayList<String[]> bulkCopy = new ArrayList<String[]>(bulk);
    bulk.clear();

    while (threadPool.getQueue().size() > nThreads) {
        LOG.trace("Queue is too large, waiting...");
        try {
            Thread.sleep(100);
        } catch (final InterruptedException e) {
            LOG.warn(e.getMessage());
            // Keep the interrupt visible to callers and stop sleeping
            Thread.currentThread().interrupt();
            break;
        }
    }

    threadPool.execute(new Runnable() {
        @Override
        public void run() {
            try {
                insertMissingAccessions(bulkCopy);
            } catch (final RuntimeException e) {
                // Otherwise the failure would only reach the thread's uncaught-exception handler
                LOG.error("Failed to process SGSV batch: " + e.getMessage(), e);
            }
        }
    });
}

/**
 * Creates accessions for the rows of one batch that belong to a listed
 * institute and are not yet known to {@code genesysService}.
 */
private void insertMissingAccessions(final List<String[]> rows) {
    // SGSVEntry reads columns up to index 17
    final int lastUsedColumn = 17;

    final List<SGSVEntry> accns = new ArrayList<SGSVEntry>(rows.size());
    // Extract INSTCODE and ACCENUMB
    for (final String[] entry : rows) {
        if (entry.length <= lastUsedColumn) {
            LOG.warn("Skipping short row: " + ArrayUtils.toString(entry));
            continue;
        }
        if (ArrayUtils.contains(institutes, entry[1])) {
            accns.add(new SGSVEntry(entry));
        }
    }

    if (LOG.isTraceEnabled()) {
        LOG.trace("Got " + accns.size() + " entries");
    }
    if (accns.isEmpty()) {
        return;
    }

    // Must be blank: drop every entry that already has an accession.
    // Walk backwards so removals do not shift indexes still to be visited.
    final List<Accession> existing = genesysService.listAccessions(accns);
    for (int i = accns.size() - 1; i >= 0; i--) {
        final Accession accn = existing.get(i);
        if (accn != null) {
            LOG.warn("Exists: " + accn);
            accns.remove(i);
        }
    }

    final List<Accession> accessions = new ArrayList<Accession>(accns.size());
    for (int i = accns.size() - 1; i >= 0; i--) {
        final SGSVEntry entry = accns.get(i);

        final Accession accn = new Accession();
        accn.setAccessionName(entry.acceNumb);
        accn.setOrigin(entry.origCty);
        accn.setGenus(entry.genus);
        accn.setTaxonomy(taxonomyService.ensureTaxonomy(entry.genus, entry.species, entry.fullTaxa));
        accn.setInstituteCode(entry.instCode);
        accn.setInstitute(getFromCache(entry.instCode));
        accn.setDataSource("Svalbard");

        if (accn.getInstitute() == null) {
            LOG.warn("No institute: " + accn.getInstituteCode());
            continue;
        }
        if (accn.getTaxonomy() == null) {
            LOG.warn("No taxonomy: " + entry);
            continue;
        }
        accessions.add(accn);
    }

    // Save data
    if (accessions.size() > 0) {
        genesysService.saveAccessions(accessions);
        LOG.info("Added " + accessions.size() + " new entires");
    }
}
/**
 * Returns the institute for the given code, asking {@code instituteService}
 * only when the cache holds no non-null entry for it. The looked-up value is
 * stored in the cache even when it is {@code null}.
 *
 * @param instCode institute code used as cache key
 * @return the cached or freshly looked-up institute; may be {@code null}
 */
protected synchronized FaoInstitute getFromCache(String instCode) {
    final FaoInstitute cached = faoInstituteCache.get(instCode);
    if (cached != null) {
        return cached;
    }
    final FaoInstitute loaded = instituteService.getInstitute(instCode);
    faoInstituteCache.put(instCode, loaded);
    return loaded;
}
/**
 * Stops accepting new batches and blocks until all queued work has finished.
 * If the waiting thread is interrupted, waiting stops and the interrupt flag
 * is restored for the caller.
 */
@PreDestroy
private void shutdownPool() {
    threadPool.shutdown();
    LOG.info("Waiting for all threads to terminate");
    try {
        // awaitTermination already blocks for up to one second per round,
        // so no extra sleep is needed between checks
        while (!threadPool.awaitTermination(1, TimeUnit.SECONDS)) {
            LOG.trace("Workers still running...");
        }
        LOG.info("All workers terminated.");
    } catch (final InterruptedException e) {
        LOG.error(e.getMessage(), e);
        Thread.currentThread().interrupt();
    }
}
private class SGSVEntry implements AccessionIdentifier3 {
String instCode;
String acceNumb;
String origCty;
String genus;
String fullTaxa;
String species;
public SGSVEntry(String[] entry) {
instCode = entry[1];
acceNumb = entry[4];
fullTaxa = entry[5];
origCty = entry[12];
genus = entry[16];
species = entry[17];
}
@Override
public String getHoldingInstitute() {
return instCode;
}
@Override
public String getAccessionName() {
return acceNumb;
}
@Override
public String getGenus() {
return genus;
}
@Override
public String toString() {
return "SGSVEntry " + instCode + " " + acceNumb;
}
}
}
......@@ -45,7 +45,7 @@ import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.genesys2.rest.common.model.genesys.Accession;
import org.genesys2.rest.common.model.genesys.SvalbardData;
import org.genesys2.rest.common.model.impl.AccessionIdentifier2;
import org.genesys2.rest.common.model.impl.AccessionIdentifier3;
import org.genesys2.rest.common.service.GenesysService;
import org.hibernate.exception.ConstraintViolationException;
import org.springframework.beans.factory.annotation.Autowired;
......@@ -260,7 +260,7 @@ public class SGSVUpdate {
if (accn.getOrigin().equalsIgnoreCase(entry.origCty) && accn.getTaxonomy().getGenus().equals(entry.genus)) {
if (accn.getAccessionName().equalsIgnoreCase(entry.acceNumb) && accn.getInstituteCode().equals(entry.instCode)) {
if (true != accn.getInSvalbard()) {
if (accn.getInSvalbard() == null || accn.getInSvalbard() == false) {
LOG.info("Found one in SGSV: " + accn);
}
accn.setInSvalbard(true);
......@@ -321,7 +321,7 @@ public class SGSVUpdate {
}
}
private class SGSVEntry implements AccessionIdentifier2 {
private class SGSVEntry implements AccessionIdentifier3 {
String instCode;
String acceNumb;
String origCty;
......@@ -354,6 +354,10 @@ public class SGSVUpdate {
return acceNumb;
}
/** Returns the genus value held by this entry. */
public String getGenus() {
return genus;
}
@Override
public String toString() {
return "SGSVEntry " + instCode + "+" + acceNumb;
......
......@@ -14,9 +14,10 @@
* limitations under the License.
**/
package org.genesys2.rest.common.service.impl;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.genesys2.rest.common.model.genesys.Taxonomy;
import org.genesys2.rest.common.persistence.domain.TaxonomyRepository;
import org.genesys2.rest.common.service.TaxonomyService;
......@@ -27,6 +28,7 @@ import org.springframework.transaction.annotation.Transactional;
@Service
@Transactional(readOnly = true)
public class TaxonomyServiceImpl implements TaxonomyService {
public static final Log LOG = LogFactory.getLog(TaxonomyServiceImpl.class);
@Autowired
private TaxonomyRepository taxonomyRepository;
......@@ -36,4 +38,39 @@ public class TaxonomyServiceImpl implements TaxonomyService {
return taxonomyRepository.getByGenusAndSpecies(genus, species);
}
/**
 * Returns the taxonomy stored under the given full taxon name, creating it
 * through {@code internalEnsure} when it does not exist yet.
 *
 * @param genus    genus name; {@code null} yields {@code null}
 * @param species  species name, passed on to {@code internalEnsure}
 * @param fullTaxa full taxon name used for the lookup; {@code null} yields {@code null}
 * @return the existing or newly ensured taxonomy, or {@code null} when genus
 *         or full taxon name is missing
 */
@Override
@Transactional(readOnly = false)
public Taxonomy ensureTaxonomy(String genus, String species, String fullTaxa) {
    // Without a full taxon name there is nothing to look up, and
    // internalEnsure would fail on fullTaxa.substring(...)
    if (genus == null || fullTaxa == null) {
        return null;
    }

    final Taxonomy existing = taxonomyRepository.getByTaxonName(fullTaxa);
    if (existing != null) {
        return existing;
    }
    return internalEnsure(genus, species, fullTaxa);
}
private synchronized Taxonomy internalEnsure(String genus, String species, String fullTaxa) {
Taxonomy taxonomy = taxonomyRepository.getByTaxonName(fullTaxa);
if (taxonomy == null) {
taxonomy = new Taxonomy();
LOG.warn("Inserting " + genus + " " + species + " " + fullTaxa);
taxonomy.setGenus(genus);
taxonomy.setSpecies(fullTaxa.substring(genus.length() + 1).trim());
taxonomy.setTaxonName(fullTaxa);
// try {
// Thread.sleep(2000);
// } catch (InterruptedException e) {
// e.printStackTrace();
// }
try {
taxonomyRepository.save(taxonomy);
LOG.warn("Inserted new:" + taxonomy);
} catch (Exception e) {
LOG.warn("Error " + e.getMessage() + " :" + taxonomy);