Commit d1848336 authored by Maxym Borodenko, committed by Matija Obreza
Browse files

Controlled Vocabulary - Languages

refactored a field name
parent c8db7873
/*
* Copyright 2018 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.catalog.service.worker;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
/**
 * Downloads the tab-separated language code list published by geonames.org
 * and converts each data row into a {@link LanguageInfo}.
 *
 * @author Maxym Borodenko
 */
@Component
public class GeonamesISOLanguageSource {

    /** Location of the language code list that is downloaded. */
    private static final String GEONAMES_ISO639_URL = "http://download.geonames.org/export/dump/iso-languagecodes.txt";

    /** Prefix of the header row, which holds column names and must not be parsed as data. */
    private static final String HEADER_PREFIX = "ISO 639-3";

    /** Minimum number of tab-separated columns a data row must have (the name is read from index 3). */
    private static final int MIN_COLUMNS = 4;

    /** The Constant LOG. */
    public static final Logger LOG = LoggerFactory.getLogger(GeonamesISOLanguageSource.class);

    /**
     * Retrieve data from geonames.org
     *
     * <p>
     * The response body is decoded as UTF-8 regardless of the platform default
     * charset. Blank lines and the header row are skipped, and identical
     * (code, language) pairs are returned only once, in the order in which
     * they first appear.
     * </p>
     *
     * @return List with LanguageInfo
     * @throws IOException if the request fails, the server answers with a
     *         non-2xx status or without a body, or a data row cannot be parsed
     */
    public List<LanguageInfo> fetchLanguageData() throws IOException {
        final HttpGet httpget = new HttpGet(GEONAMES_ISO639_URL);
        try (CloseableHttpClient httpclient = HttpClientBuilder.create().build()) {
            final HttpResponse response = httpclient.execute(httpget);
            LOG.debug("HTTP Response status: {}", response.getStatusLine());

            // Do not try to parse an error page as language data
            final int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode < 200 || statusCode >= 300) {
                throw new IOException("Unexpected HTTP response " + response.getStatusLine() + " from " + GEONAMES_ISO639_URL);
            }

            // Get hold of the response entity
            final HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("HTTP response from " + GEONAMES_ISO639_URL + " has no body");
            }
            LOG.debug("{} {}", entity.getContentType(), entity.getContentLength());

            // Closing the reader also closes the underlying content stream
            try (InputStream instream = entity.getContent();
                    BufferedReader inreader = new BufferedReader(new InputStreamReader(instream, StandardCharsets.UTF_8))) {
                final List<LanguageInfo> languages = readLanguages(inreader);
                LOG.info("Returning {} languages data from geonames.org", languages.size());
                return languages;
            }
        } catch (final ClientProtocolException e) {
            LOG.error(e.getMessage(), e);
            throw new IOException("Could not execute HTTP request: " + e.getMessage(), e);
        } catch (final RuntimeException ex) {
            LOG.error(ex.getMessage(), ex);
            httpget.abort();
            throw new IOException(ex);
        } finally {
            LOG.info("Done fetching languages info from geonames.org");
        }
    }

    /**
     * Reads all data rows from the reader, skipping blank lines and the header row.
     *
     * @param reader source of tab-separated lines
     * @return distinct parsed rows in order of first appearance
     * @throws IOException if reading from the reader fails
     */
    private static List<LanguageInfo> readLanguages(final BufferedReader reader) throws IOException {
        // A linked set gives constant-time duplicate detection and keeps source order
        final Set<LanguageInfo> unique = new LinkedHashSet<>();
        String line;
        while ((line = reader.readLine()) != null) {
            if (LOG.isTraceEnabled()) {
                LOG.trace(line);
            }
            if (line.trim().isEmpty() || line.startsWith(HEADER_PREFIX)) {
                continue;
            }
            unique.add(parseLine(line));
        }
        return new ArrayList<>(unique);
    }

    /**
     * Parses one tab-separated data row. The code is taken from the first
     * column and the language name from the fourth column.
     *
     * @param line line
     * @return parsed LanguageInfo
     * @throws IllegalArgumentException if the line has fewer than four
     *         tab-separated columns
     */
    public static LanguageInfo parseLine(final String line) {
        final String[] values = line.split("\t");
        if (values.length < MIN_COLUMNS) {
            throw new IllegalArgumentException(
                "Expected at least " + MIN_COLUMNS + " tab-separated columns but found " + values.length + " in line: " + line);
        }
        final String code = values[0];
        final String lang = values[3];
        LOG.trace("Language code={} name={}", code, lang);
        return new LanguageInfo(code, lang);
    }

    /**
     * Immutable pair of a language code and a language name. Two instances are
     * equal when both the code and the name are equal.
     */
    public static class LanguageInfo {
        private final String code;
        private final String language;

        /**
         * Instantiates a new language info.
         *
         * @param code the ISO639-3 code
         * @param lang the language
         */
        public LanguageInfo(final String code, final String lang) {
            this.code = code;
            this.language = lang;
        }

        /**
         * Gets the ISO639-3 code.
         *
         * @return the ISO639-3 code
         */
        public String getCode() {
            return code;
        }

        /**
         * Gets the language.
         *
         * @return the language
         */
        public String getLanguage() {
            return language;
        }

        @Override
        public boolean equals(final Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final LanguageInfo other = (LanguageInfo) obj;
            return Objects.equals(code, other.code) && Objects.equals(language, other.language);
        }

        @Override
        public int hashCode() {
            return Objects.hash(code, language);
        }

        @Override
        public String toString() {
            return "code=" + code + " language=" + language;
        }
    }
}
/*
* Copyright 2018 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.catalog.service.worker;
import java.io.IOException;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.genesys.catalog.model.vocab.ControlledVocabulary;
import org.genesys.catalog.model.vocab.VocabularyTerm;
import org.genesys.catalog.service.worker.GeonamesISOLanguageSource.LanguageInfo;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
/**
 * Builds in-memory {@link ControlledVocabulary} instances from the language
 * list supplied by {@link GeonamesISOLanguageSource}.
 *
 * @author Maxym Borodenko
 */
@Component
public class ISO639VocabularyUpdater {

    /** The Constant LOG. */
    public static final Log LOG = LogFactory.getLog(ISO639VocabularyUpdater.class);

    @Autowired
    private GeonamesISOLanguageSource isoLanguageSource;

    /**
     * Builds the ISO639-3 {@link ControlledVocabulary} from freshly fetched
     * language data. The result is returned to the caller and is not saved
     * by this method.
     *
     * @return vocabulary of ISO639-3 3-letter language codes
     * @throws IOException IOException
     */
    public ControlledVocabulary getISO639Vocabulary() throws IOException {
        return createVocabulary("ISO639-3", LanguageInfo::getCode);
    }

    /**
     * Creates a vocabulary titled {@code title} whose version tag is today's
     * date in {@code yyyy.MM.dd} form. One term is created per fetched
     * language; terms with a null or empty code are dropped and, for each
     * code, only the first term encountered is kept.
     *
     * @param title the title
     * @param toTerm extracts the term code from a language record
     * @return the controlled vocabulary
     * @throws IOException Signals that an I/O exception has occurred.
     */
    protected ControlledVocabulary createVocabulary(final String title, final Function<LanguageInfo, String> toTerm) throws IOException {
        final ControlledVocabulary result = new ControlledVocabulary();
        result.setTitle(title);
        result.setVersionTag(DateTimeFormatter.ofPattern("yyyy.MM.dd").format(LocalDate.now()));

        final Map<String, VocabularyTerm> termsByCode = new HashMap<>();
        for (final LanguageInfo info : isoLanguageSource.fetchLanguageData()) {
            final VocabularyTerm candidate = new VocabularyTerm();
            candidate.setCode(toTerm.apply(info));
            candidate.setTitle(info.getLanguage());

            // Read the code back from the term so the checks see what the term actually stores
            final String code = candidate.getCode();
            if (code == null || code.isEmpty()) {
                // terms without codes are not part of the vocabulary
                continue;
            }
            // first term wins when a code is seen more than once
            termsByCode.putIfAbsent(code, candidate);
        }

        result.setTerms(new ArrayList<>(termsByCode.values()));
        return result;
    }
}
......@@ -29,7 +29,10 @@ import java.util.List;
import org.genesys.catalog.model.vocab.ControlledVocabulary;
import org.genesys.catalog.service.worker.DavrosCountrySource;
import org.genesys.catalog.service.worker.DavrosCountrySource.CountryInfo;
import org.genesys.catalog.service.worker.GeonamesISOLanguageSource;
import org.genesys.catalog.service.worker.GeonamesISOLanguageSource.LanguageInfo;
import org.genesys.catalog.service.worker.ISO3166VocabularyUpdater;
import org.genesys.catalog.service.worker.ISO639VocabularyUpdater;
import org.genesys.catalog.test.ServiceTest;
import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;
......@@ -38,6 +41,7 @@ import org.springframework.beans.factory.annotation.Autowired;
* Tests automatic controlled vocabulary updating
*
* @author Matija Obreza
* @author Maxym Borodenko
*/
public class AutomaticVocabulariesTest extends ServiceTest {
......@@ -47,6 +51,12 @@ public class AutomaticVocabulariesTest extends ServiceTest {
@Autowired
private ISO3166VocabularyUpdater iso3updater;
@Autowired
private GeonamesISOLanguageSource geonames;
@Autowired
private ISO639VocabularyUpdater iso639Updater;
@Test
public void testDavros() throws IOException {
List<CountryInfo> countries = davros.fetchCountryData();
......@@ -83,4 +93,20 @@ public class AutomaticVocabulariesTest extends ServiceTest {
assertThat(vocab.getTerms().get(0).getCode(), both(notNullValue()).and(parsesToInt()));
}
/**
 * Fetches language data through the injected {@link GeonamesISOLanguageSource}
 * and checks that at least one entry is returned.
 */
@Test
public void testGeonames() throws IOException {
    final List<LanguageInfo> languages = geonames.fetchLanguageData();
    assertThat(languages, hasSize(greaterThan(0)));
}
/**
 * Checks the generated vocabulary: title is "ISO639-3", it has at least one
 * term, and the first term's code is three characters long.
 */
@Test
public void testIso639() throws IOException {
    final ControlledVocabulary iso639 = iso639Updater.getISO639Vocabulary();

    assertThat(iso639.getTitle(), is("ISO639-3"));
    assertThat(iso639.getTerms(), hasSize(greaterThan(0)));

    final String firstCode = iso639.getTerms().get(0).getCode();
    assertThat(firstCode.length(), is(3));
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment