Commit 5580000d authored by Matija Obreza
Browse files

Autodetecting descriptors: Updated for CODED descriptor type, added unique flag and example values

parent 92389cf6
/*
* Copyright 2019 Global Crop Diversity Trust
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,8 +16,11 @@
package org.genesys.amphibian.model;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* PreviewSheet describes data found in each sheet of the {@link Preview}.
*/
public class PreviewSheet {
public String name;
public int index;
......@@ -26,7 +29,7 @@ public class PreviewSheet {
/**
 * Creates an empty sheet description; no fields are assigned here.
 */
// NOTE(review): presumably kept so frameworks can instantiate the class reflectively — confirm.
public PreviewSheet() {
}
public PreviewSheet(int sheetIndex, String sheetName) {
index = sheetIndex;
name = sheetName;
......@@ -34,9 +37,14 @@ public class PreviewSheet {
}
public static class Descriptor {
/// Column name
public String columnName;
/// Descriptor type
public DataType dataType;
public Map<String, String> codes;
/// Column contains unique values
public boolean unique;
/// For CODED columns, examples contains up to 100 distinct column values
public Set<Object> examples;
public enum DataType {
CODED, TEXT, NUMERIC
......
......@@ -15,13 +15,13 @@
*/
package org.genesys.amphibian.service.impl;
import static org.springframework.data.mongodb.core.query.Criteria.*;
import static org.springframework.data.mongodb.core.query.Query.*;
import static org.springframework.data.mongodb.core.query.Criteria.where;
import static org.springframework.data.mongodb.core.query.Query.query;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
......@@ -223,12 +223,15 @@ public class PreviewServiceImpl implements PreviewService, InitializingBean {
PreviewSheet.Descriptor descriptor = new PreviewSheet.Descriptor();
descriptor.columnName = columnNames.get(i);
if (checkIsNumericValues(statistic.distinctValues)) {
descriptor.dataType = PreviewSheet.Descriptor.DataType.NUMERIC;
} else if (((float) statistic.numberOfDistinct / statistic.totalValues) < 0.2) {
// Column contains unique values
descriptor.unique = statistic.numberOfDistinct == statistic.totalValues;
// Sample data
descriptor.examples = new HashSet<>(statistic.distinctValues);
if (((float) statistic.numberOfDistinct / statistic.totalValues) < 0.2) {
descriptor.dataType = PreviewSheet.Descriptor.DataType.CODED;
descriptor.codes = new HashMap<>();
statistic.distinctValues.forEach(value -> descriptor.codes.put("code", String.valueOf(value)));
} else if (checkIsNumericValues(statistic.distinctValues)) {
descriptor.dataType = PreviewSheet.Descriptor.DataType.NUMERIC;
} else {
descriptor.dataType = PreviewSheet.Descriptor.DataType.TEXT;
}
......
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.amphibian.test.api;
import com.opencsv.CSVWriter;
import org.apache.commons.lang3.RandomUtils;
import org.apache.logging.log4j.core.util.StringBuilderWriter;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.xssf.usermodel.XSSFRow;
......@@ -18,7 +35,6 @@ import org.springframework.http.MediaType;
import org.springframework.mock.web.MockMultipartFile;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
......@@ -35,10 +51,12 @@ public class PreviewApiTest extends AbstractApiTest {
@Autowired
private PreviewService previewService;
@Autowired
private MetadataService metadataService;
/**
* Ingest csv test.
*/
@Test
public void ingestCsvTest() throws Exception {
UUID uuid = UUID.randomUUID();
......@@ -84,40 +102,42 @@ public class PreviewApiTest extends AbstractApiTest {
assertThat(preview.getSheets().get(0).rowCount, is(2));
}
/**
* Ingest Excel test.
*/
@Test
public void ingestExelTest() throws Exception {
UUID uuid = UUID.randomUUID();
StringBuilder content = new StringBuilder();
XSSFWorkbook workbook = new XSSFWorkbook();
XSSFSheet spreadsheet = workbook.createSheet("test");
XSSFRow row;
//insert data
Map<String, Object[]> data = new TreeMap<String, Object[]>();
data.put("1", new Object[] {"INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT"});
data.put("2", new Object[] {"SYR002", "IG 137552", "Lens", "0.8799999952316284"});
Set<String> keyid = data.keySet();
int rowid = 0;
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
for (String key : keyid) {
row = spreadsheet.createRow(rowid++);
Object[] objectArr = data.get(key);
int cellid = 0;
for (Object obj : objectArr) {
Cell cell = row.createCell(cellid++);
cell.setCellValue((String)obj);
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
XSSFSheet spreadsheet = workbook.createSheet("test");
XSSFRow row;
//insert data
Map<String, Object[]> data = new TreeMap<String, Object[]>();
data.put("1", new Object[] {"INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT"});
data.put("2", new Object[] {"SYR002", "IG 137552", "Lens", "0.8799999952316284"});
Set<String> keyid = data.keySet();
int rowid = 0;
for (String key : keyid) {
row = spreadsheet.createRow(rowid++);
Object[] objectArr = data.get(key);
int cellid = 0;
for (Object obj : objectArr) {
Cell cell = row.createCell(cellid++);
cell.setCellValue((String)obj);
}
}
workbook.write(outputStream);
}
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
workbook.write(outputStream);
MockMultipartFile file
= new MockMultipartFile(
"file",
......@@ -147,6 +167,9 @@ public class PreviewApiTest extends AbstractApiTest {
assertThat(preview.getSheets().get(0).rowCount, is(2));
}
/**
* Generate descriptors test.
*/
@Test
public void generateDescriptorsTest() throws Exception {
UUID uuid = UUID.randomUUID();
......@@ -155,15 +178,20 @@ public class PreviewApiTest extends AbstractApiTest {
CSVWriter writer = new CSVWriter(new StringBuilderWriter(content));
String[] row1 = new String[]{"INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT"};
String[] row2 = new String[]{"SYR002", "IG 137552", "Lens", "0.8799999952316284"};
String[] row3 = new String[]{"SYR002", "IG 137553", "Banana", "0.9799999952316284"};
String[] row4 = new String[]{"SYR002", "IG 137554", "Lens", "0.5799999952316284"};
String[] row1 = new String[] { "INSTCODE", "ACCENUMB", "GENUS", "SEEDWGT", "COLOR" };
String[] row2 = new String[] { "LBN002", "IG 137552", "Lens", "0.8799999952316284", "A" };
String[] row3 = new String[] { "LBN002", "IG 137553", "Musa", "0.9799999952316284", "B" };
String[] row4 = new String[] { "LBN002", "IG 137554", "Lens", "0.5799999952316284", "C" };
List<String[]> rows = new ArrayList<>();
rows.add(row1);
rows.add(row2);
rows.add(row3);
rows.add(row4);
var colorCodes = new String[] { "A", "B", "C", "D", null };
for (var i = 0; i < 100; i++) {
rows.add(new String[] { "LBN002", "IG " + (137555 + i), "Hordeum", "" + RandomUtils.nextFloat(), colorCodes[RandomUtils.nextInt(0, colorCodes.length)] });
}
writer.writeAll(rows);
MockMultipartFile file
......@@ -176,7 +204,7 @@ public class PreviewApiTest extends AbstractApiTest {
metadataService.previewFromCSV(uuid, null, file, null, null);
//waiting for data inserting
// Waiting for data inserting to complete
Thread.sleep(500);
/*@formatter:off*/
......@@ -191,15 +219,28 @@ public class PreviewApiTest extends AbstractApiTest {
Preview preview = previewService.getPreview(uuid);
assertThat(preview, is(notNullValue()));
assertThat(preview.getSheets(), hasSize(1));
assertThat(preview.getSheets().get(0).rowCount, is(4));
assertThat(preview.getSheets().get(0).descriptors.get(0).columnName, is("INSTCODE"));
assertThat(preview.getSheets().get(0).descriptors.get(0).dataType, is(PreviewSheet.Descriptor.DataType.TEXT));
assertThat(preview.getSheets().get(0).descriptors.get(1).columnName, is("ACCENUMB"));
assertThat(preview.getSheets().get(0).descriptors.get(1).dataType, is(PreviewSheet.Descriptor.DataType.TEXT));
assertThat(preview.getSheets().get(0).descriptors.get(2).columnName, is("GENUS"));
assertThat(preview.getSheets().get(0).descriptors.get(2).dataType, is(PreviewSheet.Descriptor.DataType.TEXT));
assertThat(preview.getSheets().get(0).descriptors.get(3).columnName, is("SEEDWGT"));
assertThat(preview.getSheets().get(0).descriptors.get(3).dataType, is(PreviewSheet.Descriptor.DataType.NUMERIC));
PreviewSheet sheet0 = preview.getSheets().get(0);
assertThat(sheet0.rowCount, is(rows.size()));
assertThat(sheet0.descriptors.get(0).columnName, is("INSTCODE"));
assertThat(sheet0.descriptors.get(0).dataType, is(PreviewSheet.Descriptor.DataType.CODED));
assertThat(sheet0.descriptors.get(0).examples, notNullValue());
assertThat(sheet0.descriptors.get(0).examples, contains("LBN002"));
assertThat(sheet0.descriptors.get(1).columnName, is("ACCENUMB"));
assertThat(sheet0.descriptors.get(1).dataType, is(PreviewSheet.Descriptor.DataType.TEXT));
assertThat(sheet0.descriptors.get(1).unique, is(true));
assertThat(sheet0.descriptors.get(2).columnName, is("GENUS"));
assertThat(sheet0.descriptors.get(2).dataType, is(PreviewSheet.Descriptor.DataType.CODED));
assertThat(sheet0.descriptors.get(2).examples, notNullValue());
assertThat(sheet0.descriptors.get(3).columnName, is("SEEDWGT"));
assertThat(sheet0.descriptors.get(3).dataType, is(PreviewSheet.Descriptor.DataType.NUMERIC));
assertThat(sheet0.descriptors.get(3).examples, nullValue());
assertThat(sheet0.descriptors.get(4).columnName, is("COLOR"));
assertThat(sheet0.descriptors.get(4).dataType, is(PreviewSheet.Descriptor.DataType.CODED));
assertThat(sheet0.descriptors.get(4).examples, notNullValue());
assertThat(sheet0.descriptors.get(4).examples, containsInAnyOrder("A", "B", "C", "D"));
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment