Commit b7ebd000 authored by Matija Obreza
Browse files

Merge branch '5-detecting-data-structure' into 'main'

Resolve "Detecting data structure"

Closes #5

See merge request genesys-pgr/amphibian!12
parents 4b3c4a10 ed0f9d66
......@@ -5,3 +5,4 @@ bin/
.project
.classpath
amphibian-server/src/main/resources/amphibian.properties
amphibian-server/src/test/resources/amphibian.properties
......@@ -458,6 +458,29 @@ paths:
'404':
description: Not Found
deprecated: false
'/api/v0/preview/{uuid}/generate-descriptors':
post:
tags:
- preview
summary: Generate descriptors for all PreviewSheets of given Preview
operationId: generateDescriptors
parameters:
- name: uuid
in: path
description: Your reference UUID
required: true
type: string
format: uuid
responses:
'200':
description: OK
'401':
description: Unauthorized
'403':
description: Forbidden
'404':
description: Not Found
deprecated: false
definitions:
Observation:
type: object
......@@ -670,4 +693,27 @@ definitions:
rowCount:
type: integer
format: int32
descriptors:
type: array
items:
$ref: '#/definitions/PreviewDescriptor'
title: PreviewSheet
PreviewDescriptor:
title: PreviewDescriptor
type: object
properties:
columnName:
type: string
dataType:
type: string
enum:
- CODED
- TEXT
- NUMERIC
unique:
type: boolean
nullable: false
examples:
type: array
items:
type: object
......@@ -90,6 +90,7 @@ public class PreviewApi {
@ApiOperation(nickname="getPreview", value = "Get the overview of the parsed dataset", notes = "Use the same reference UUID as provided when ingesting a dataset")
public Preview get(@ApiParam(value = "Your reference UUID", required = true) @PathVariable UUID uuid) {
Preview preview = previewService.getPreview(uuid);
previewService.generateDescriptors(uuid);
if (preview == null) {
throw new NoSuchThingException("No preview for reference UUID=" + uuid);
}
......@@ -128,4 +129,14 @@ public class PreviewApi {
return previewService.getStatisticsData(uuid, sheet, startRow, limit.orElse(100), fields);
}
/**
 * Generates descriptors for all sheets of the preview registered under the given reference UUID.
 *
 * @param uuid the caller's reference UUID of the preview
 * @throws NoSuchThingException when the preview service has no preview for {@code uuid}
 */
@PostMapping(path = "/{uuid}/generate-descriptors")
@ApiOperation(nickname="generateDescriptors", value = "Generate descriptors for all PreviewSheets of given Preview")
public void generateDescriptors(@ApiParam(value = "Your reference UUID", required = true) @PathVariable UUID uuid) {
    // Reject unknown references before asking the service to do any work
    if (previewService.getPreview(uuid) == null) {
        throw new NoSuchThingException("No preview for reference UUID=" + uuid);
    }

    previewService.generateDescriptors(uuid);
}
}
......@@ -21,6 +21,7 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
......@@ -28,6 +29,7 @@ import org.springframework.data.mongodb.MongoDbFactory;
import org.springframework.data.mongodb.MongoTransactionManager;
import org.springframework.data.mongodb.config.AbstractMongoConfiguration;
import org.springframework.data.mongodb.config.EnableMongoAuditing;
import org.springframework.data.mongodb.core.convert.MappingMongoConverter;
import org.springframework.data.mongodb.repository.config.EnableMongoRepositories;
import com.mongodb.MongoClient;
......@@ -79,4 +81,5 @@ public class DatabaseConfig extends AbstractMongoConfiguration {
MongoClient mongoClient = new MongoClient(servers, credential, options);
return mongoClient;
}
}
/*
* Copyright 2019 Global Crop Diversity Trust
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -15,17 +15,39 @@
*/
package org.genesys.amphibian.model;
import java.util.List;
import java.util.Set;
/**
 * Describes the data found in one sheet of a {@link Preview}.
 */
public class PreviewSheet {

    /** Name of the sheet. */
    public String name;

    /** Index of the sheet. */
    public int index;

    /** Number of rows; the two-argument constructor initializes it to {@code -1}. */
    public int rowCount;

    /** Descriptors of the sheet's columns; not assigned by either constructor. */
    public List<Descriptor> descriptors;

    /**
     * Creates an instance with all fields left at their Java defaults.
     */
    public PreviewSheet() {
    }

    /**
     * Creates a sheet entry with the given position and name and a row count of {@code -1}.
     *
     * @param sheetIndex index of the sheet
     * @param sheetName name of the sheet
     */
    public PreviewSheet(int sheetIndex, String sheetName) {
        this.name = sheetName;
        this.index = sheetIndex;
        this.rowCount = -1;
    }

    /**
     * Describes a single column of a sheet.
     */
    public static class Descriptor {

        /**
         * Kinds of data a column can be classified as.
         */
        public enum DataType {
            CODED, TEXT, NUMERIC
        }

        /** Column name. */
        public String columnName;

        /** Descriptor type. */
        public DataType dataType;

        /** Whether the column contains unique values. */
        public boolean unique;

        /** Example values; for CODED columns, up to 100 distinct column values. */
        public Set<Object> examples;
    }
}
......@@ -44,6 +44,8 @@ public interface PreviewService {
Preview registerSheet(String id, PreviewSheet sheet);
void generateDescriptors(UUID referenceUuid);
class StatisticsData implements Serializable {
private static final long serialVersionUID = 3826590456962597612L;
......
......@@ -15,19 +15,22 @@
*/
package org.genesys.amphibian.service.impl;
import static org.springframework.data.mongodb.core.query.Criteria.*;
import static org.springframework.data.mongodb.core.query.Query.*;
import static org.springframework.data.mongodb.core.query.Criteria.where;
import static org.springframework.data.mongodb.core.query.Query.query;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.bson.Document;
import org.genesys.amphibian.model.Preview;
import org.genesys.amphibian.model.Preview.State;
......@@ -193,6 +196,61 @@ public class PreviewServiceImpl implements PreviewService, InitializingBean {
return statistics;
}
/**
 * Generates column descriptors for every sheet of the preview registered under
 * {@code referenceUuid} and saves the updated preview.
 *
 * <p>Does nothing when no preview (or no sheet list) exists for the reference; callers that
 * need to report a missing preview must check for it themselves.</p>
 *
 * @param referenceUuid reference UUID of the preview to describe
 */
@Override
public void generateDescriptors(UUID referenceUuid) {
    Preview preview = previewRepository.findByReferenceUuid(referenceUuid);
    if (preview == null) {
        // Unknown reference: nothing to describe (previously a NullPointerException)
        return;
    }

    List<PreviewSheet> previewSheets = preview.getSheets();
    if (previewSheets == null) {
        return;
    }

    // generate descriptors for all sheets
    for (PreviewSheet previewSheet : previewSheets) {
        previewSheet.descriptors = describeSheet(referenceUuid, previewSheet.index);
    }

    preview.setSheets(previewSheets);
    previewRepository.save(preview);
}

/**
 * Builds the descriptors for the columns of one sheet.
 *
 * <p>The first raw-data row of the sheet (lowest row number) supplies the column names.
 * Each column is classified from the statistics returned by {@code getStatisticsData}:</p>
 * <ul>
 * <li>CODED when fewer than 20% of its values are distinct,</li>
 * <li>NUMERIC when {@code checkIsNumericValues} accepts its distinct values,</li>
 * <li>TEXT otherwise.</li>
 * </ul>
 *
 * @param referenceUuid reference UUID of the dataset
 * @param sheetIndex index of the sheet within the dataset
 * @return the descriptors in column order; empty when the sheet has no raw-data rows
 */
private List<PreviewSheet.Descriptor> describeSheet(UUID referenceUuid, int sheetIndex) {
    // get columns: fetch only the first row, stripped of bookkeeping fields
    Query q = query(where(RAWDATA_DATASET).is(referenceUuid).and(RAWDATA_SHEET).is(sheetIndex).and(RAWDATA_ROW).gte(0)).with(Sort.by(RAWDATA_ROW));
    q.limit(1);
    q.fields().exclude(RAWDATA_DATASET).exclude(RAWDATA_SHEET).exclude(RAWDATA_ROW).exclude(MONGO_ID);

    List<Document> headerRows = mongoTemplate.find(q, Document.class, MONGO_RAWDATA);
    List<PreviewSheet.Descriptor> descriptors = new ArrayList<>();
    if (headerRows.isEmpty()) {
        // Sheet without any stored rows: no columns to describe
        return descriptors;
    }

    Document namesRow = headerRows.get(0);
    Set<String> columns = namesRow.keySet();
    // Header cells are not guaranteed to be strings (e.g. numeric or date cells), so convert instead of casting
    List<String> columnNames = namesRow.values().stream()
        .map(value -> value == null ? null : value.toString())
        .collect(Collectors.toList());

    // get statistics data for columns, starting at row 1 (after the header row) with a limit of 100
    // NOTE(review): assumes one StatisticsData per requested field, in request order -- confirm
    List<StatisticsData> statisticsData = getStatisticsData(referenceUuid, sheetIndex, 1, 100, columns.toArray(String[]::new));

    for (int i = 0; i < columns.size(); i++) {
        StatisticsData statistic = statisticsData.get(i);

        // create descriptor
        PreviewSheet.Descriptor descriptor = new PreviewSheet.Descriptor();
        descriptor.columnName = columnNames.get(i);
        // Column contains unique values
        descriptor.unique = statistic.numberOfDistinct == statistic.totalValues;
        // Sample data
        descriptor.examples = new HashSet<>(statistic.distinctValues);

        // NOTE(review): a column without any values is not CODED and passes the numeric check
        // vacuously, so it ends up NUMERIC -- confirm this is intended
        if (((float) statistic.numberOfDistinct / statistic.totalValues) < 0.2) {
            descriptor.dataType = PreviewSheet.Descriptor.DataType.CODED;
        } else if (checkIsNumericValues(statistic.distinctValues)) {
            descriptor.dataType = PreviewSheet.Descriptor.DataType.NUMERIC;
        } else {
            descriptor.dataType = PreviewSheet.Descriptor.DataType.TEXT;
        }
        descriptors.add(descriptor);
    }
    return descriptors;
}
/**
 * Tells whether the string form of every value is accepted by {@code NumberUtils.isCreatable}.
 *
 * @param values the values to inspect
 * @return {@code false} as soon as one value is rejected; {@code true} otherwise, including for an empty set
 */
private boolean checkIsNumericValues(Set<Object> values) {
    for (Object candidate : values) {
        if (!NumberUtils.isCreatable(String.valueOf(candidate))) {
            return false;
        }
    }
    return true;
}
public void removeMongoRawdata(UUID referenceUuid) {
Preview preview = previewRepository.findByReferenceUuid(referenceUuid);
if (preview != null) {
......
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.amphibian.test.api;
import com.opencsv.CSVWriter;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.*;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*;
import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import org.apache.commons.lang3.RandomUtils;
import org.apache.logging.log4j.core.util.StringBuilderWriter;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.HorizontalAlignment;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.genesys.amphibian.api.v0.PreviewApi;
import org.genesys.amphibian.model.Preview;
import org.genesys.amphibian.model.PreviewSheet;
import org.genesys.amphibian.service.MetadataService;
import org.genesys.amphibian.service.PreviewService;
import org.genesys.amphibian.test.base.AbstractApiTest;
import org.junit.Test;
......@@ -15,24 +49,19 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.MediaType;
import org.springframework.mock.web.MockMultipartFile;
import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import static org.hamcrest.Matchers.*;
import static org.hamcrest.MatcherAssert.*;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.multipart;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.CSVWriter;
public class PreviewApiTest extends AbstractApiTest {
@Autowired
private PreviewService previewService;
@Autowired
private MetadataService metadataService;
/**
* Ingest csv test.
*/
@Test
public void ingestCsvTest() throws Exception {
UUID uuid = UUID.randomUUID();
......@@ -41,7 +70,7 @@ public class PreviewApiTest extends AbstractApiTest {
CSVWriter writer = new CSVWriter(new StringBuilderWriter(content));
//todo insert data
//insert data
String[] row1 = new String[]{"INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT"};
String[] row2 = new String[]{"SYR002", "IG 137552", "Lens", "0.8799999952316284"};
List<String[]> rows = new ArrayList<>();
......@@ -78,40 +107,46 @@ public class PreviewApiTest extends AbstractApiTest {
assertThat(preview.getSheets().get(0).rowCount, is(2));
}
/**
* Ingest Excel test.
*/
@Test
public void ingestExelTest() throws Exception {
UUID uuid = UUID.randomUUID();
StringBuilder content = new StringBuilder();
XSSFWorkbook workbook = new XSSFWorkbook();
XSSFSheet spreadsheet = workbook.createSheet("test");
XSSFRow row;
//todo insert data
Map<String, Object[]> data = new TreeMap<String, Object[]>();
data.put("1", new Object[] {"INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT"});
data.put("2", new Object[] {"SYR002", "IG 137552", "Lens", "0.8799999952316284"});
Set<String> keyid = data.keySet();
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
int rowid = 0;
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
var dateStyle = workbook.createCellStyle();
dateStyle.setDataFormat(workbook.createDataFormat().getFormat("dd-mmm-yyyy"));
dateStyle.setAlignment(HorizontalAlignment.RIGHT);
for (String key : keyid) {
row = spreadsheet.createRow(rowid++);
Object[] objectArr = data.get(key);
int cellid = 0;
for (Object obj : objectArr) {
Cell cell = row.createCell(cellid++);
cell.setCellValue((String)obj);
XSSFSheet spreadsheet = workbook.createSheet("test");
XSSFRow row;
//insert data
Map<String, Object[]> data = new TreeMap<String, Object[]>();
data.put("1", new Object[] {"INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT", "CREATED"});
data.put("2", new Object[] {"SYR002", "IG 137552", "Lens", 0.8799999952316284, new Date()});
Set<String> keyid = data.keySet();
int rowid = 0;
for (String key : keyid) {
row = spreadsheet.createRow(rowid++);
Object[] objectArr = data.get(key);
int cellid = 0;
for (Object obj : objectArr) {
Cell cell = row.createCell(cellid++);
setCellValue(cell, obj, dateStyle);
}
}
workbook.write(outputStream);
}
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
workbook.write(outputStream);
MockMultipartFile file
= new MockMultipartFile(
"file",
......@@ -139,5 +174,115 @@ public class PreviewApiTest extends AbstractApiTest {
assertThat(preview, is(notNullValue()));
assertThat(preview.getSheets(), hasSize(1));
assertThat(preview.getSheets().get(0).rowCount, is(2));
var data = previewService.getData(uuid, 0, 0, 10);
assertThat(data, not(nullValue()));
// data.forEach(d -> {
// System.err.println(d.toJson());
// });
/*@formatter:off*/
mockMvc
.perform(post(PreviewApi.CONTROLLER_URL.concat("/{uuid}/generate-descriptors"), uuid.toString()).characterEncoding("UTF8"))
// .andDo(org.springframework.test.web.servlet.result.MockMvcResultHandlers.print())
.andExpect(status().isOk())
;
/*@formatter:on*/
preview = previewService.getPreview(uuid);
PreviewSheet sheet0 = preview.getSheets().get(0);
assertThat(sheet0.rowCount, is(2));
// System.err.println(new ObjectMapper().writeValueAsString(sheet0));
assertThat(sheet0.descriptors.get(0).columnName, is("INSTCODE"));
}
/**
 * Writes {@code obj} into {@code cell} using a representation that depends on its runtime type:
 * dates are written as dates and styled with {@code dateStyle}, numbers are written as doubles,
 * anything else is written as its string form. A {@code null} value leaves the cell untouched.
 *
 * @param cell the cell to fill
 * @param obj the value to write, may be {@code null}
 * @param dateStyle the style applied to date cells
 */
private void setCellValue(Cell cell, Object obj, CellStyle dateStyle) {
    if (obj == null) {
        return;
    }
    if (obj instanceof Date) {
        cell.setCellValue((Date) obj);
        cell.setCellStyle(dateStyle);
        return;
    }
    if (obj instanceof Number) {
        cell.setCellValue(((Number) obj).doubleValue());
        return;
    }
    cell.setCellValue(Objects.toString(obj));
}
/**
 * Generate descriptors test.
 *
 * <p>Ingests a CSV with a header row, three fixed rows and 100 generated rows, calls the
 * generate-descriptors endpoint and checks the column name, data type and example values
 * recorded for each of the five columns.</p>
 */
@Test
public void generateDescriptorsTest() throws Exception {
    UUID uuid = UUID.randomUUID();

    String[] row1 = new String[] { "INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT", "COLOR" };
    String[] row2 = new String[] { "LBN002", "IG 137552", "Lens", "0.8799999952316284", "A" };
    String[] row3 = new String[] { "LBN002", "IG 137553", "Musa", "0.9799999952316284", "B" };
    String[] row4 = new String[] { "LBN002", "IG 137554", "Lens", "0.5799999952316284", "C" };

    List<String[]> rows = new ArrayList<>();
    rows.add(row1);
    rows.add(row2);
    rows.add(row3);
    rows.add(row4);

    // Generated rows: constant INSTCODE and GENUS, distinct ACCENUMB, random SEEDWGT, random (possibly missing) COLOR
    var colorCodes = new String[] { "A", "B", "C", "D", null };
    for (var i = 0; i < 100; i++) {
        rows.add(new String[] { "LBN002", "IG " + (137555 + i), "Hordeum", "" + RandomUtils.nextFloat(), colorCodes[RandomUtils.nextInt(0, colorCodes.length)] });
    }

    StringBuilder content = new StringBuilder();
    // Close the writer so everything is flushed into the buffer before it is read
    try (CSVWriter writer = new CSVWriter(new StringBuilderWriter(content))) {
        writer.writeAll(rows);
    }

    MockMultipartFile file
        = new MockMultipartFile(
            "file",
            "preview.csv",
            "text/csv",
            // Explicit charset: the no-arg getBytes() depends on the platform default
            content.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8)
        );

    metadataService.previewFromCSV(uuid, null, file, null, null);

    // Waiting for data inserting to complete
    // NOTE(review): a fixed sleep may be too short on a slow machine -- consider polling for completion
    Thread.sleep(500);

    /*@formatter:off*/
    mockMvc
        .perform(post(PreviewApi.CONTROLLER_URL.concat("/{uuid}/generate-descriptors"), uuid.toString()).characterEncoding("UTF8"))
//      .andDo(org.springframework.test.web.servlet.result.MockMvcResultHandlers.print())
        .andExpect(status().isOk())
    ;
    /*@formatter:on*/

    Preview preview = previewService.getPreview(uuid);
    assertThat(preview, is(notNullValue()));
    assertThat(preview.getSheets(), hasSize(1));
    PreviewSheet sheet0 = preview.getSheets().get(0);
    assertThat(sheet0.rowCount, is(rows.size()));

    // INSTCODE: a single repeated value
    assertThat(sheet0.descriptors.get(0).columnName, is("INSTCODE"));
    assertThat(sheet0.descriptors.get(0).dataType, is(PreviewSheet.Descriptor.DataType.CODED));
    assertThat(sheet0.descriptors.get(0).examples, notNullValue());
    assertThat(sheet0.descriptors.get(0).examples, contains("LBN002"));

    // ACCENUMB: every value is different
    assertThat(sheet0.descriptors.get(1).columnName, is("ACCENUMB"));
    assertThat(sheet0.descriptors.get(1).dataType, is(PreviewSheet.Descriptor.DataType.TEXT));
    assertThat(sheet0.descriptors.get(1).unique, is(true));

    // GENUS: three distinct values
    assertThat(sheet0.descriptors.get(2).columnName, is("GENUS"));
    assertThat(sheet0.descriptors.get(2).dataType, is(PreviewSheet.Descriptor.DataType.CODED));
    assertThat(sheet0.descriptors.get(2).examples, notNullValue());

    // SEEDWGT: numeric strings
    assertThat(sheet0.descriptors.get(3).columnName, is("SEEDWGT"));
    assertThat(sheet0.descriptors.get(3).dataType, is(PreviewSheet.Descriptor.DataType.NUMERIC));
    assertThat(sheet0.descriptors.get(3).examples, notNullValue());

    // COLOR: four codes plus missing values
    assertThat(sheet0.descriptors.get(4).columnName, is("COLOR"));
    assertThat(sheet0.descriptors.get(4).dataType, is(PreviewSheet.Descriptor.DataType.CODED));
    assertThat(sheet0.descriptors.get(4).examples, notNullValue());
    assertThat(sheet0.descriptors.get(4).examples, containsInAnyOrder("A", "B", "C", "D"));
}
}
......@@ -23,6 +23,7 @@ import org.junit.After;
import org.junit.Before;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.ContextHierarchy;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
......@@ -37,6 +38,8 @@ import org.springframework.web.context.WebApplicationContext;
@ContextHierarchy(@ContextConfiguration(name = "api", classes = { ApplicationTestConfig.class }))
public abstract class AbstractApiTest {
private static final String MONGO_RAWDATA = "rawdata";
@Autowired
private WebApplicationContext webApplicationContext;
......@@ -51,12 +54,16 @@ public abstract class AbstractApiTest {
@Autowired
private TableRepository tableRepository;
@Autowired
private MongoTemplate mongoTemplate;
/**
 * Cleans up after each test: deletes everything from the preview, dataset-table and table
 * repositories and drops the {@code rawdata} Mongo collection.
 */
@After
@Transactional
public void afterTest() {
previewRepository.deleteAll();
datasetTableRepository.deleteAll();
tableRepository.deleteAll();
// Raw rows are stored outside the repositories, so drop the whole collection
mongoTemplate.dropCollection(MONGO_RAWDATA);
}
@Before
......
......@@ -38,7 +38,7 @@ import java.util.ArrayList;
import java.util.List;
@Configuration
@PropertySources({ @PropertySource("classpath:/junit.properties") })
@PropertySources({ @PropertySource("classpath:/junit.properties"), @PropertySource("classpath:/amphibian.properties") })
@EnableMongoRepositories(basePackages = { "org.genesys.amphibian.repositories.mongo" })
@EnableMongoAuditing(modifyOnCreate = true)
public class DatabaseConfig extends AbstractMongoConfiguration {
......
Supports Markdown
0% or