Commit ad5d1748 authored by Matija Obreza
Browse files

Reduced scope, multi-threaded loading

parent c757d1c7
...@@ -30,6 +30,6 @@ public interface GeonamesService { ...@@ -30,6 +30,6 @@ public interface GeonamesService {
* @param list list of geonames to be updated in the db * @param list list of geonames to be updated in the db
* @throws Exception when list of geonames not saved * @throws Exception when list of geonames not saved
*/ */
void update(List<Geoname> list) throws Exception; void upsert(List<Geoname> list) throws Exception;
} }
...@@ -15,6 +15,9 @@ ...@@ -15,6 +15,9 @@
*/ */
package org.genesys.catalog.service.impl; package org.genesys.catalog.service.impl;
import java.util.List;
import java.util.stream.Collectors;
import org.genesys.catalog.service.GeonamesService; import org.genesys.catalog.service.GeonamesService;
import org.genesys.common.model.Geoname; import org.genesys.common.model.Geoname;
import org.genesys.common.persistence.GeonameRepository; import org.genesys.common.persistence.GeonameRepository;
...@@ -24,8 +27,6 @@ import org.springframework.beans.factory.annotation.Autowired; ...@@ -24,8 +27,6 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/** /**
* The Class GeonamesServiceImpl. * The Class GeonamesServiceImpl.
* *
...@@ -42,7 +43,21 @@ public class GeonamesServiceImpl implements GeonamesService { ...@@ -42,7 +43,21 @@ public class GeonamesServiceImpl implements GeonamesService {
@Override @Override
@Transactional @Transactional
public void update(final List<Geoname> list) throws Exception { public void upsert(final List<Geoname> list) throws Exception {
geonameRepository.bulkSave(list); if (list.isEmpty()) {
return;
}
List<Geoname> existing = geonameRepository.findAll(list.stream().map(geoname -> geoname.getId()).collect(Collectors.toSet()));
for (Geoname geoname : list) {
geoname.setVersion(
// find matching id
existing.stream().filter(e -> e.getId().equals(geoname.getId()))
// get its version
.map(e -> e.getVersion())
// or null
.findFirst().orElse(null));
}
geonameRepository.save(list);
} }
} }
...@@ -24,13 +24,12 @@ import java.text.SimpleDateFormat; ...@@ -24,13 +24,12 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Scanner; import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream; import java.util.zip.ZipInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
...@@ -38,10 +37,10 @@ import org.apache.http.impl.client.CloseableHttpClient; ...@@ -38,10 +37,10 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClientBuilder;
import org.genesys.catalog.service.GeonamesService; import org.genesys.catalog.service.GeonamesService;
import org.genesys.common.model.Geoname; import org.genesys.common.model.Geoname;
import org.genesys.common.persistence.GeonameRepository;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.task.TaskExecutor;
import org.springframework.security.access.prepost.PreAuthorize; import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
...@@ -51,200 +50,256 @@ import org.springframework.stereotype.Component; ...@@ -51,200 +50,256 @@ import org.springframework.stereotype.Component;
@Component @Component
public class GeonamesUpdater { public class GeonamesUpdater {
private static final int BATCH_SIZE = 500; private static final int BATCH_SIZE = 100;
private static final Logger LOG = LoggerFactory.getLogger(GeonamesUpdater.class); private static final Logger LOG = LoggerFactory.getLogger(GeonamesUpdater.class);
public static final String DUMP_FILE_NAME = "allCountries.txt"; public static final String DUMP_FILE_NAME = "allCountries.txt";
// public static final String GEONAMES_DUMP_URL = "http://download.geonames.org/export/dump/UA.zip"; public static final String GEONAMES_DUMP_URL = "http://download.geonames.org/export/dump/allCountries.zip";
public static final String GEONAMES_DUMP_URL = "http://download.geonames.org/export/dump/allCountries.zip"; private static boolean RUNNING = false;
private static final String TEMP_DIR = "temp";
private static boolean RUNNING = false; public final static int GEONAME_ID = 0;
public final static int NAME = 1;
public final static int GEONAME_ID = 0; public final static int ASCII_NAME = 2;
public final static int NAME = 1; public final static int ALTERNATE_NAMES = 3;
public final static int ASCII_NAME = 2; public final static int LATITUDE = 4;
public final static int ALTERNATE_NAMES = 3; public final static int LONGITUDE = 5;
public final static int LATITUDE = 4; public final static int FEATURE_CLASS = 6;
public final static int LONGITUDE = 5; public final static int FEATURE_CODE = 7;
public final static int FEATURE_CLASS = 6; public final static int COUNTRY_CODE = 8;
public final static int FEATURE_CODE = 7; public final static int CC2 = 9;
public final static int COUNTRY_CODE = 8; public final static int ADMIN1_CODE = 10;
public final static int CC2 = 9; public final static int ADMIN2_CODE = 11;
public final static int ADMIN1_CODE = 10; public final static int ADMIN3_CODE = 12;
public final static int ADMIN2_CODE = 11; public final static int ADMIN4_CODE = 13;
public final static int ADMIN3_CODE = 12; public final static int POPULATION = 14;
public final static int ADMIN4_CODE = 13; public final static int ELEVATION = 15;
public final static int POPULATION = 14; public final static int DEM = 16;
public final static int ELEVATION = 15; public final static int TIMEZONE = 17;
public final static int DEM = 16; public final static int MODIFICATION_DATE = 18;
public final static int TIMEZONE = 17;
public final static int MODIFICATION_DATE = 18; // http://www.geonames.org/export/codes.html
private static final Character[] IMPORTED_FEATURE_CLASSES = { 'A', 'H', 'L', 'P', 'T', 'V' };
@Autowired private static final String[] IMPORTED_FEATURE_CODES = { "PCL", "PCLD", "PCLI", "PCLIX", "PCLS", "PCLH", "ADM1", "ADM2", "ADM3", "OCN", "SEA", "LK", "PPL" };
private GeonameRepository geonameRepository;
@Autowired
@Autowired private GeonamesService geonamesService;
private GeonamesService geonamesService;
@Autowired
private ExecutorService executor = Executors.newFixedThreadPool(1); private TaskExecutor executor;
/** /**
* Update local Geonames with data from geonames.org * Update local Geonames with data from geonames.org
* *
*/ */
@PreAuthorize("hasRole('ADMINISTRATOR')") @PreAuthorize("hasRole('ADMINISTRATOR')")
public void updateGeonames() { public void updateGeonames() {
executor.submit(() -> { executor.execute(() -> {
try { try {
if (!isRunning()) { if (!isRunning()) {
downloadUnpackAndImportGeonames(); downloadUnpackAndImportGeonames();
} }
} catch (final IOException e) { } catch (final IOException e) {
LOG.error(e.getMessage(), e); LOG.error(e.getMessage(), e);
} }
}); });
} }
private void downloadUnpackAndImportGeonames() throws IOException { public static synchronized boolean isRunning() {
LOG.warn("Downloading geonames data from {}", GEONAMES_DUMP_URL); return RUNNING;
RUNNING = true; }
final CloseableHttpClient httpclient = HttpClientBuilder.create().build();
final HttpGet httpget = new HttpGet(GEONAMES_DUMP_URL); private void downloadUnpackAndImportGeonames() throws IOException {
final HttpResponse response; LOG.warn("Importing geonames data");
final HttpEntity entity; RUNNING = true;
InputStream instream = null; File dumpFile = new File("data", DUMP_FILE_NAME);
File dumbFile = null;
try { try {
response = httpclient.execute(httpget); if (!dumpFile.exists()) {
dumpFile = downloadGeonames();
// Get hold of the response entity }
entity = response.getEntity();
if (dumpFile != null) {
final byte[] buffer = new byte[1024]; importGeonames(dumpFile);
instream = entity.getContent(); } else {
ZipInputStream zis = new ZipInputStream(instream); LOG.warn("Expected file {} was not found.", DUMP_FILE_NAME);
ZipEntry ze = zis.getNextEntry(); }
while (ze != null) { } catch (Throwable e) {
final String fileName = ze.getName(); // FileUtils.deleteQuietly(dumpFile);
final File newFile = new File(TEMP_DIR + File.separator + fileName); }
LOG.warn("Unpacking {} file to {}", fileName, newFile.getAbsolutePath()); }
if (fileName.equals(DUMP_FILE_NAME)) { private File downloadGeonames() throws IOException {
dumbFile = newFile; LOG.warn("Downloading geonames data from {}", GEONAMES_DUMP_URL);
}
//update directories for sub directories in zip final CloseableHttpClient httpclient = HttpClientBuilder.create().build();
new File(newFile.getParent()).mkdirs(); final HttpGet httpget = new HttpGet(GEONAMES_DUMP_URL);
final FileOutputStream fos = new FileOutputStream(newFile); final HttpResponse response;
int len; final HttpEntity entity;
while ((len = zis.read(buffer)) > 0) { InputStream instream = null;
fos.write(buffer, 0, len);
} try {
response = httpclient.execute(httpget);
LOG.warn("File unpack completed to {}", newFile.getAbsolutePath());
fos.close(); // Get hold of the response entity
zis.closeEntry(); entity = response.getEntity();
ze = zis.getNextEntry();
} final byte[] buffer = new byte[1024];
instream = entity.getContent();
if (dumbFile != null) { ZipInputStream zis = new ZipInputStream(instream);
importGeonames(dumbFile); ZipEntry ze = zis.getNextEntry();
} else {
LOG.warn("Expected file {} was not found.", DUMP_FILE_NAME); while (ze != null) {
throw new IOException("Missing file " + DUMP_FILE_NAME); final String fileName = ze.getName();
}
} catch (final Throwable e) { if (!fileName.equals(DUMP_FILE_NAME)) {
LOG.error("Geonames download and unpack failed to complete.", e); LOG.info("Skipping {}", fileName);
throw new IOException(e); continue;
} finally { }
RUNNING = false;
IOUtils.closeQuietly(httpclient); final File newFile = File.createTempFile("geonames", fileName);
IOUtils.closeQuietly(instream); LOG.warn("Unpacking {} file to {}", fileName, newFile.getAbsolutePath());
FileUtils.deleteQuietly(new File(TEMP_DIR));
} // update directories for sub directories in zip
} new File(newFile.getParent()).mkdirs();
final FileOutputStream fos = new FileOutputStream(newFile);
private void importGeonames(final File unpackedFile) throws IOException { int len;
LOG.warn("Importing geonames data from {} file.", unpackedFile.getName()); while ((len = zis.read(buffer)) > 0) {
fos.write(buffer, 0, len);
FileInputStream inputStream = null; }
final Scanner sc;
LOG.warn("File unpack completed to {}", newFile.getAbsolutePath());
try { fos.close();
inputStream = new FileInputStream(unpackedFile.getAbsolutePath()); zis.closeEntry();
sc = new Scanner(inputStream, "UTF-8"); ze = zis.getNextEntry();
List<Geoname> listToSave = new ArrayList<>(BATCH_SIZE);
long k = 0; return newFile;
while (sc.hasNextLine()) { }
final String line = sc.nextLine();
final String[] values = line.split("\t"); } catch (final Throwable e) {
if (listToSave.size() == BATCH_SIZE) { LOG.error("Geonames download and unpack failed to complete.", e);
processData(listToSave); throw new IOException(e);
} else { } finally {
Geoname geoname = geonameRepository.findOne(Long.valueOf(values[GEONAME_ID].trim())); RUNNING = false;
if (geoname == null) { IOUtils.closeQuietly(httpclient);
geoname = new Geoname(); IOUtils.closeQuietly(instream);
extractParsedLineIntoGeoname(geoname, values); }
} else {
extractParsedLineIntoGeoname(geoname, values); throw new IOException("Could not find " + DUMP_FILE_NAME + " in archive");
} }
listToSave.add(geoname);
} private void importGeonames(final File unpackedFile) throws IOException {
k++; LOG.warn("Importing geonames data from {} file.", unpackedFile.getName());
}
processData(listToSave); FileInputStream inputStream = null;
final Scanner sc;
LOG.info("Done importing geonames database");
IOUtils.closeQuietly(sc); try {
} finally { inputStream = new FileInputStream(unpackedFile.getAbsolutePath());
IOUtils.closeQuietly(inputStream); sc = new Scanner(inputStream, "UTF-8");
} List<String[]> linesToImport = new ArrayList<>(BATCH_SIZE);
} long k = 0;
while (sc.hasNextLine()) {
/** final String line = sc.nextLine();
* Extract data from line into Geoname instance final String[] values = line.split("\t");
*/
private void extractParsedLineIntoGeoname(final Geoname geoname, final String[] values) { if (linesToImport.size() == BATCH_SIZE) {
geoname.setId(Long.valueOf(values[GEONAME_ID].trim())); processData(linesToImport);
geoname.setName(values[NAME]); }
geoname.setAsciiname(values[ASCII_NAME]);
linesToImport.add(values);
//TODO error when saving string with ASCII characters k++;
// SQLException: Incorrect string value: '\xF0\x90\x8C\xB0\xF0\x90...' for column 'alternatenames'
geoname.setAlternatenames(values[ALTERNATE_NAMES].replaceAll("[^\\p{ASCII}]", "")); if (k % 10000 == 1) {
geoname.setLatitude(!values[LATITUDE].isEmpty() ? Double.valueOf(values[LATITUDE]) : null); LOG.info("Read {} lines", k);
geoname.setLongitude(!values[LONGITUDE].isEmpty() ? Double.valueOf(values[LONGITUDE]) : null); }
geoname.setFeatureClass(values[FEATURE_CLASS]); }
geoname.setFeatureCode(values[FEATURE_CODE]); processData(linesToImport);
geoname.setCountryCode(values[COUNTRY_CODE]);
geoname.setCc2(values[CC2]); LOG.info("Done importing geonames database");
geoname.setAdmin1Code(values[ADMIN1_CODE]); IOUtils.closeQuietly(sc);
geoname.setAdmin2Code(values[ADMIN2_CODE]); } finally {
geoname.setAdmin3Code(values[ADMIN3_CODE]); IOUtils.closeQuietly(inputStream);
geoname.setAdmin4Code(values[ADMIN4_CODE]); }
geoname.setPopulation(!values[POPULATION].isEmpty() ? Long.valueOf(values[POPULATION]) : null); }
geoname.setElevation(!values[ELEVATION].isEmpty() ? Integer.valueOf(values[ELEVATION]) : null);
geoname.setDem(!values[DEM].isEmpty() ? Integer.valueOf(values[DEM]) : null); private void processData(final List<String[]> linesToImport) {
geoname.setTimezone(values[TIMEZONE]); if (linesToImport.isEmpty()) {
try { return;
geoname.setModificationDate(!values[MODIFICATION_DATE].isEmpty() ? new SimpleDateFormat("yy-mm-dd").parse(values[MODIFICATION_DATE]) : null); }
} catch (Exception ex) {
geoname.setModificationDate(null); final List<String[]> copy = new ArrayList<>(linesToImport);
} linesToImport.clear();
}
executor.execute(() -> {
private void processData(final List<Geoname> bulk) { List<Geoname> toSave = new ArrayList<>();
final List<Geoname> copy = new ArrayList<>(bulk); for (String[] values : copy) {
bulk.clear(); Geoname parsed = extractParsedLineIntoGeoname(new Geoname(), values);
try { if (ArrayUtils.contains(IMPORTED_FEATURE_CLASSES, parsed.getFeatureClass())
geonamesService.update(copy); //
} catch (final Exception e) { && ArrayUtils.contains(IMPORTED_FEATURE_CODES, parsed.getFeatureCode())) {
LOG.error("Some data bulk was not saved, read logs!", e); if (parsed.getAdmin3Code() == null) {
} toSave.add(parsed);
} }
}
public static synchronized boolean isRunning() { }
return RUNNING; if (toSave.size() == 0) {
} return;
}
try {
LOG.info("Upserting {} geonames", toSave.size());
geonamesService.upsert(toSave);
} catch (final Throwable e) {
LOG.error("Some data bulk was not saved");
for (int i = 0; i < toSave.size(); i++) {
try {
geonamesService.upsert(toSave.subList(i, i + 1));
} catch (final Throwable e1) {
Geoname g = toSave.get(i);
LOG.warn("Data could not be saved geoname_id={}: {}", g.getId(), g.getName(), e1);
}
}
}
});
}
/**
 * Populates a {@link Geoname} from the tab-split columns of one dump line.
 *
 * Columns are addressed through the index constants of this class
 * ({@code GEONAME_ID}, {@code NAME}, ... {@code MODIFICATION_DATE}). Code-like
 * columns are trimmed and stored as {@code null} when blank; the feature
 * class is reduced to its first character. Alternate names, population,
 * elevation, DEM and timezone are currently not imported (see the
 * commented-out assignments below).
 *
 * @param geoname the instance to fill; it is modified in place
 * @param values the column values of a single line, indexed by the column constants
 * @return the same {@code geoname} instance that was passed in
 */
private Geoname extractParsedLineIntoGeoname(final Geoname geoname, final String[] values) {
    geoname.setId(Long.valueOf(values[GEONAME_ID].trim()));
    geoname.setName(values[NAME]);
    geoname.setAsciiName(values[ASCII_NAME]);
    // String alternateNames = values[ALTERNATE_NAMES];
    // // Remove all 4-byte utf8 characters
    // alternateNames = alternateNames.replaceAll("[^\\u0000-\\uFFFF]", "");
    // // geoname.setAlternateNames(StringUtils.trimToNull(alternateNames));
    geoname.setLatitude(!values[LATITUDE].isEmpty() ? Double.valueOf(values[LATITUDE]) : null);
    geoname.setLongitude(!values[LONGITUDE].isEmpty() ? Double.valueOf(values[LONGITUDE]) : null);
    geoname.setFeatureClass(values[FEATURE_CLASS].length() == 0 ? null : values[FEATURE_CLASS].charAt(0));
    geoname.setFeatureCode(StringUtils.trimToNull(values[FEATURE_CODE]));
    geoname.setCountryCode(StringUtils.trimToNull(values[COUNTRY_CODE]));
    geoname.setCc2(StringUtils.trimToNull(values[CC2]));
    geoname.setAdmin1Code(StringUtils.trimToNull(values[ADMIN1_CODE]));
    geoname.setAdmin2Code(StringUtils.trimToNull(values[ADMIN2_CODE]));
    geoname.setAdmin3Code(StringUtils.trimToNull(values[ADMIN3_CODE]));
    geoname.setAdmin4Code(StringUtils.trimToNull(values[ADMIN4_CODE]));
    // geoname.setPopulation(!values[POPULATION].isEmpty() ?
    // Long.valueOf(values[POPULATION]) : null);
    // geoname.setElevation(!values[ELEVATION].isEmpty() ?
    // Integer.valueOf(values[ELEVATION]) : null);
    // geoname.setDem(!values[DEM].isEmpty() ? Integer.valueOf(values[DEM]) : null);
    // geoname.setTimezone(StringUtils.trimToNull(values[TIMEZONE]));
    try {
        // "MM" is month-of-year; the previous "yy-mm-dd" pattern read the month
        // column as minutes, so every parsed date landed in January.
        // A fresh SimpleDateFormat per call is deliberate: the class is not
        // thread-safe and this method runs on executor threads.
        geoname.setModificationDate(!values[MODIFICATION_DATE].isEmpty() ? new SimpleDateFormat("yyyy-MM-dd").parse(values[MODIFICATION_DATE]) : null);
    } catch (Exception ex) {
        // Best effort: an unparseable or missing date must not abort the import of the row.
        geoname.setModificationDate(null);
    }
    return geoname;
}
} }
...@@ -15,15 +15,17 @@ ...@@ -15,15 +15,17 @@
*/ */
package org.genesys.common.model; package org.genesys.common.model;
import org.genesys.blocks.model.AuditedVersionedModelWithoutId; import java.io.Serializable;
import java.util.Date;
import javax.persistence.Cacheable; import javax.persistence.Cacheable;
import javax.persistence.Column; import javax.persistence.Column;
import javax.persistence.Entity; import javax.persistence.Entity;
import javax.persistence.Id; import javax.persistence.Id;
import javax.persistence.Lob;
import javax.persistence.Table; import javax.persistence.Table;
import java.util.Date; import javax.persistence.Version;
import org.genesys.blocks.model.EntityId;