Commit 55c5d87f authored by Maxym Borodenko
Browse files

Dataset analyzing

parent 0171b497
......@@ -400,6 +400,64 @@ paths:
'404':
description: Not Found
deprecated: false
# Statistics endpoint: returns distinct-value statistics for selected fields of a parsed dataset.
# The dataset is addressed by reference UUID, sheet index and first-row index (all path parameters).
'/api/v0/preview/stats/{uuid}/{sheet}/{startRow}':
get:
tags:
- preview
summary: Get the statistics of the parsed dataset
description: Use the same reference UUID as provided when ingesting a dataset
operationId: getStatisticsData
produces:
- '*/*'
parameters:
# Path parameters identifying which part of the dataset to analyze.
- name: uuid
in: path
description: Your reference UUID
required: true
type: string
format: uuid
- name: sheet
in: path
description: Sheet index
required: true
type: integer
format: int64
- name: startRow
in: path
description: Index of the first row
required: true
type: integer
format: int64
# Optional cap on the number of distinct values returned; 100 is used when omitted.
# NOTE(review): no `format` is declared here, while the controller annotation declares int32 - confirm whether it should be added.
- name: limit
in: query
description: Number of distinct values to return
required: false
type: integer
default: 100
allowEmptyValue: false
# Required, repeatable query parameter (collectionFormat multi: fields=a&fields=b).
- name: fields
in: query
description: Specify fields to analyze
required: true
type: array
items:
type: string
collectionFormat: multi
allowEmptyValue: false
responses:
'200':
description: OK
# Untyped objects; presumably one statistics entry per requested field - confirm against the service.
schema:
type: array
items:
type: object
'401':
description: Unauthorized
'403':
description: Forbidden
'404':
description: Not Found
deprecated: false
definitions:
Observation:
type: object
......
......@@ -112,4 +112,20 @@ public class PreviewApi {
return previewService.getData(uuid, sheet, startRow, limit.orElse(50), fields);
}
/**
 * Returns statistics for the selected fields of a previously ingested dataset.
 *
 * @param uuid     reference UUID of the dataset preview
 * @param sheet    sheet index
 * @param startRow index of the first row to consider
 * @param limit    maximum number of distinct values to return per field; 100 when absent
 * @param fields   fields to analyze
 * @return the statistics produced by the preview service for the requested fields
 * @throws NoSuchThingException when no preview exists for {@code uuid}
 */
@GetMapping(path = { "/stats/{uuid}/{sheet}/{startRow}" })
@ApiOperation(nickname = "getStatisticsData", value = "Get the statistics of the parsed dataset", notes = "Use the same reference UUID as provided when ingesting a dataset")
public List<PreviewService.StatisticsData> getStatisticsData(
    @ApiParam(value = "Your reference UUID", required = true) @PathVariable UUID uuid,
    @ApiParam(value = "Sheet index", required = true) @PathVariable long sheet,
    @ApiParam(value = "Index of the first row", required = true) @PathVariable long startRow,
    @ApiParam(value = "Number of distinct values to return", required = false, format="int32", type="integer") @RequestParam(name = "limit", required = false, defaultValue = "100") Optional<Integer> limit,
    @ApiParam(value = "Specify fields to analyze", required = true) @RequestParam(required = true) String[] fields
) {
    // The preview is looked up only to verify that the reference UUID is known.
    if (previewService.getPreview(uuid) == null) {
        throw new NoSuchThingException("No preview for reference UUID=" + uuid);
    }

    final int maxDistinctValues = limit.orElse(100);
    return previewService.getStatisticsData(uuid, sheet, startRow, maxDistinctValues, fields);
}
}
......@@ -16,7 +16,9 @@
package org.genesys.amphibian.service;
import java.io.Serializable;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import org.bson.Document;
......@@ -34,8 +36,18 @@ public interface PreviewService {
/**
 * Fetches raw data of a parsed dataset as BSON documents.
 * NOTE(review): the query construction is not visible here; presumably rows are filtered by
 * reference UUID, sheet and starting row, capped at {@code limit} and restricted to
 * {@code selectedColumns} - confirm against the implementation.
 */
List<Document> getData(UUID referenceUuid, long sheet, long startRow, int limit, String... selectedColumns);
/**
 * Computes statistics for the selected fields of a parsed dataset.
 *
 * @param referenceUuid  reference UUID of the dataset
 * @param sheet          sheet index
 * @param startRow       index of the first row to include
 * @param limit          maximum number of distinct values to return per field
 * @param selectedFields fields to analyze
 * @return one {@link StatisticsData} per selected field
 */
List<StatisticsData> getStatisticsData(UUID referenceUuid, long sheet, long startRow, int limit, String... selectedFields);
/**
 * NOTE(review): implementation not visible here; presumably sets the state (and an optional
 * error message) of the preview with the given id and returns the updated preview - confirm.
 */
Preview updateState(String id, State reading, String error);
/**
 * NOTE(review): implementation not visible here; presumably attaches the given sheet to the
 * preview with the given id and returns the updated preview - confirm.
 */
Preview registerSheet(String id, PreviewSheet sheet);
/**
 * Statistics gathered for a single analyzed field of a parsed dataset.
 * Fields are public and assigned directly by the service implementation.
 */
class StatisticsData implements Serializable {
private static final long serialVersionUID = 3826590456962597612L;
/** Number of distinct values found for the field, counted before any limit is applied. */
public int numberOfDistinct;
/** Number of matching rows in which the field exists. */
public long totalValues;
/** Distinct values of the field, truncated to the requested limit. */
public Set<Object> distinctValues;
}
}
......@@ -18,13 +18,16 @@ package org.genesys.amphibian.service.impl;
import static org.springframework.data.mongodb.core.query.Criteria.*;
import static org.springframework.data.mongodb.core.query.Query.*;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ArrayUtils;
import org.bson.Document;
import org.genesys.amphibian.model.Preview;
import org.genesys.amphibian.model.Preview.State;
......@@ -163,6 +166,33 @@ public class PreviewServiceImpl implements PreviewService, InitializingBean {
return mongoTemplate.find(q, Document.class, MONGO_RAWDATA);
}
/**
 * Computes distinct-value statistics for each selected field of a parsed dataset.
 *
 * <p>For every field, two queries run against the raw-data collection: one for the
 * distinct values and one counting the rows in which the field exists. Both are
 * restricted to the given dataset, sheet and rows at or after {@code startRow}.
 *
 * @param referenceUuid  reference UUID of the dataset
 * @param sheet          sheet index
 * @param startRow       index of the first row to include
 * @param limit          maximum number of distinct values to return per field;
 *                       negative values are treated as zero
 * @param selectedFields fields to analyze
 * @return one {@link StatisticsData} per selected field, in the order given;
 *         an empty list when no fields are selected
 */
@Override
public List<StatisticsData> getStatisticsData(UUID referenceUuid, long sheet, long startRow, int limit, String... selectedFields) {
    if (ArrayUtils.isEmpty(selectedFields)) {
        // nothing to analyze here
        return List.of();
    }

    // Stream.limit throws IllegalArgumentException for a negative size; clamp so that a
    // negative request value yields no sample values instead of an exception.
    final long maxDistinctValues = Math.max(0, limit);

    List<StatisticsData> statistics = new ArrayList<>(selectedFields.length);

    for (String selectedColumn : selectedFields) {
        final var data = new StatisticsData();

        List<Object> distinctValues = mongoTemplate.findDistinct(
            query(where(RAWDATA_DATASET).is(referenceUuid).and(RAWDATA_SHEET).is(sheet).and(RAWDATA_ROW).gte(startRow)),
            selectedColumn, MONGO_RAWDATA, Object.class);

        // The distinct count reflects all values found, not just the truncated sample below.
        data.numberOfDistinct = distinctValues.size();
        data.distinctValues = distinctValues.stream().limit(maxDistinctValues).collect(Collectors.toSet());

        // Count only the rows in which the field actually exists.
        data.totalValues = mongoTemplate.count(
            query(where(RAWDATA_DATASET).is(referenceUuid).and(RAWDATA_SHEET).is(sheet).and(selectedColumn).exists(true).and(RAWDATA_ROW).gte(startRow)),
            Document.class, MONGO_RAWDATA);

        statistics.add(data);
    }
    return statistics;
}
@Scheduled(initialDelay = 1000 * 60 * 1, fixedDelay = 1000 * 60 * 10)
public void removeExpiredPreviews() {
List<Preview> expiredPreviews = previewRepository.findByExpiresLessThan(new Date());
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment