Commit d2be7b47 authored by Matija Obreza's avatar Matija Obreza
Browse files

Improved import

- Link Descriptors to existing Controlled Vocabularies
- Less verbose, but improved logging
- Extract descriptions from code titles
parent ca9106ae
......@@ -16,3 +16,4 @@ genesys-catalog-core/data
genesys-catalog-server/data
curation/cropdescriptors.json
curation/descriptorslists.json
curation/sources
......@@ -11,23 +11,23 @@ Install csvkit.
```bash
## Read CSV file
# ~/Library/Python/2.7/bin/in2csv "/Users/mobreza/Downloads/Genesys - crop descriptors - crop_descriptors.csv"
# ~/Library/Python/2.7/bin/in2csv "sources/crop_descriptors.csv"
## Print column headers
# ~/Library/Python/2.7/bin/in2csv "/Users/mobreza/Downloads/Genesys - crop descriptors - crop_descriptors.csv" | ~/Library/Python/2.7/bin/csvcut -n
# ~/Library/Python/2.7/bin/in2csv "sources/crop_descriptors.csv" | ~/Library/Python/2.7/bin/csvcut -n
## Convert selected columns to JSON
~/Library/Python/2.7/bin/in2csv "crop_descriptors.csv" \
~/Library/Python/2.7/bin/in2csv "sources/crop_descriptors.csv" \
| ~/Library/Python/2.7/bin/csvcut \
-c category,cropcode,title,dataType,description,integerOnly,maxValue,minValue,code,code_description,published,versionTag,columnName,uom \
| PYTHONIOENCODING=utf8 ~/Library/Python/2.7/bin/csvjson \
-c 'category,crop,versionTag,title,dataType,keyDescriptor,description,integerOnly,minValue,maxValue,vocabularyId,code,code_title,code_description,published,columnName,uom,headingNumber' \
| PYTHONIOENCODING=utf8 ~/Library/Python/2.7/bin/csvjson -i 2 \
> cropdescriptors.json
## Descriptor lists
~/Library/Python/2.7/bin/in2csv "descriptor_lists.csv" \
~/Library/Python/2.7/bin/in2csv "sources/descriptor_lists.csv" \
| ~/Library/Python/2.7/bin/csvcut \
-c descriptorList,url,description,cropcode,versionTag,bibliographicCitation \
| PYTHONIOENCODING=utf8 ~/Library/Python/2.7/bin/csvjson \
-c 'title,url,description,crop,versionTag,publisher,bibliographicCitation' \
| PYTHONIOENCODING=utf8 ~/Library/Python/2.7/bin/csvjson -i 2 \
> descriptorslists.json
```
......@@ -35,5 +35,5 @@ Install csvkit.
# Run import
```bash
API_URL=http://localhost:3000/proxy/api/v0 npm run import -- --import-lists --lists descriptorslists.json --import-descriptors --descriptors cropdescriptors.json --owner-id 9 --token '...`
API_URL=http://localhost:3000/proxy/api/v0 npm run import -- --import-lists --lists descriptorslists.json --import-descriptors --descriptors cropdescriptors.json --owner-id 22 --token '...'
```
......@@ -54,6 +54,14 @@ function authenticatedRequest(token, params) {
}, params));
};
/**
 * Build a vocabulary-listing call bound to an API token.
 *
 * @param {string} token - forwarded unchanged to authenticatedRequest.
 * @returns {function(Object=): *} a function that takes an optional filter
 *   object (defaults to `{}`), POSTs it to the vocabulary list endpoint and
 *   returns whatever authenticatedRequest returns.
 */
export const listVocabularies = (token) => (filters = {}) => {
  const request = {
    // NOTE(review): `l=100` is presumably the page size -- confirm against the API.
    url: '/vocabulary/list?l=100',
    method: 'POST',
    // An empty filter object is sent when the caller supplies none.
    data: filters,
  };
  return authenticatedRequest(token, request);
};
export const createVocabulary = (token) => (vocabulary) => {
// console.log(`Creating new vocabulary with ${API_URL}/vocabulary/create\n`, vocabulary);
......
console.log('Hello World!');
const jsonfile = require('jsonfile');
import * as jsonfile from 'jsonfile';
import sleep from 'sleep';
import * as _ from 'lodash';
import * as catalogapi from './catalogapi.js';
import minimist from 'minimist';
......@@ -11,7 +11,7 @@ const argv = minimist(process.argv.slice(2), {
boolean: [
'--import-lists', '--import-descriptors'
],
string: ['--token', '--descriptors', '--lists' ],
string: ['--token', '--descriptors', '--lists']
});
console.dir(argv);
......@@ -19,7 +19,7 @@ const IMPORT_DESCRIPTORLISTS = argv['import-lists'] || false;
const IMPORT_DESCRIPTORS = argv['import-descriptors'] || false;
const INPUT_DESCRIPTORS_JSON = argv['descriptors'] || './cropdescriptors.json';
const INPUT_DESCRIPTORLISTS_JSON = argv['lists'] || './descriptorlists.json';
const OWNER_ID = +argv['owner-id'] || 17;
const OWNER_ID = + argv['owner-id'] || 22;
const TOKEN = argv['token'] || '';
......@@ -30,13 +30,19 @@ const OWNER = {
}
const handleAxiosError = (error) => {
console.log(error);
if (error.response) {
console.log(`API error ${error.response.status} - ${error.response.statusText}`, error.response.data);
} else {
console.log(error);
}
}
// Map of 'crop-versionTag' to descriptorList
const AVAILABLE_DESCRIPTORLISTS = new Map();
const AVAILABLE_CONTROLLEDVOCABULARIES = new Array();
const CONTROLLEDVOCAB_MAP = new Map();
const createVocabulary = catalogapi.createVocabulary(TOKEN);
const listVocabularies = catalogapi.listVocabularies(TOKEN);
const createDescriptor = catalogapi.createDescriptor(TOKEN);
const createDescriptorList = catalogapi.createDescriptorList(TOKEN);
const addDescriptorsToList = catalogapi.addDescriptorsToList(TOKEN);
......@@ -65,96 +71,127 @@ const rxescape = (str) => {
//
// process.exit(-1);
const fixVocabulary = (codes, descriptions) => {
if (!codes || !descriptions || codes === null || descriptions === null) {
console.log('Missing codes and descriptions');
// OR
return [ { code: 'N/A', title: 'Coding table was not provided.' } ];
const fixVocabulary = (codes, titles, descriptions) => {
if (!codes || !titles || codes === null || titles === null) {
return Promise.reject({message: 'Missing codes and descriptions'});
}
// console.log(`Fixing vocab`, codes, descriptions);
const c = codes.trim().split(/\n/gi);
const d = descriptions.trim().split(/\n/gi);
let t = titles.trim().split(/\n/gi);
let d = descriptions
? descriptions.split(/\n/gi)
: null;
if (d === null) {
// we have no descriptions, try to get them from titles -- whatever is in parentheses
d = new Array();
for (let i = 0; i < t.length; i++) {
const match = t[i].match(/\((.+)\)$/);
if (match) {
console.log(`Cleaning "${t[i]}" -> "${match[1]}"`);
d[i] = capitalize(match[1]);
t[i] = t[i].replace(`(${match[1]})`, '').trim();
console.log(`Cleaned up "${t[i]}" = "${d[i]}"`);
}
}
}
// console.log(`codes[${c.length}] & descriptions[${d.length}]: ${c} => ${d}`);
if (c.length !== d.length) {
return null;
if (c.length !== t.length) {
return Promise.reject({message: `Code lengths don't match descriptions`});
}
const terms = [];
for (var i = 0; i < c.length; i++) {
// Trim codes
d[i] = d[i].replace(new RegExp(`^${rxescape(c[i].trim())}\\s+`), '');
t[i] = t[i].replace(new RegExp(`^${rxescape(c[i].trim())}\\s+`), '');
// capitalize
d[i] = capitalize(d[i].trim());
terms.push({code: c[i].trim(), title: d[i]});
t[i] = capitalize(t[i].trim());
const description = d && d.length > i
? d[i]
: null;
terms.push({code: c[i].trim(), title: t[i], description});
}
return terms;
return Promise.resolve(terms);
}
const importDescriptor = (descriptor) => {
console.log(`Importing "${descriptor.title}" for crop=${descriptor.cropcode}`);
const importDescriptor = async (descriptor) => {
// console.log(`Importing "${descriptor.title}" for crop=${descriptor.crop}`);
descriptor.owner = OWNER;
// descriptor.published = false;
if (descriptor.category === '#N/A') {
console.log(`Descriptor "${descriptor.title}" for crop=${descriptor.cropcode} has no category`, descriptor);
return Promise.reject(new Error({message: `Descriptor "${descriptor.title}" for crop=${descriptor.cropcode} has no category`}));
// console.log(`Descriptor "${descriptor.title}" for crop=${descriptor.crop} has no category`, descriptor);
return Promise.reject({message: `Descriptor "${descriptor.title}" for crop=${descriptor.crop} has no category`});
}
if (descriptor.dataType === 'SCALE') {
// if (descriptor.minValue === null && descriptor.maxValue === null) {
// console.log(`Descriptor "${descriptor.title}" for crop=${descriptor.cropcode} requires min and max values`);
// return Promise.reject(new Error({message: `Descriptor "${descriptor.title}" for crop=${descriptor.cropcode} requires min and max values`}));
// console.log(`Descriptor "${descriptor.title}" for crop=${descriptor.crop} requires min and max values`);
// return Promise.reject(new Error({message: `Descriptor "${descriptor.title}" for crop=${descriptor.crop} requires min and max values`}));
// }
// OR
descriptor.minValue = descriptor.minValue === null ? 0 : descriptor.minValue;
descriptor.maxValue = descriptor.maxValue === null ? 10 : descriptor.maxValue;
descriptor.minValue = descriptor.minValue === null
? 0
: descriptor.minValue;
descriptor.maxValue = descriptor.maxValue === null
? 10
: descriptor.maxValue;
}
if (descriptor.dataType === 'CODED' || descriptor.dataType === 'SCALE') {
const vocab = fixVocabulary(descriptor.code, descriptor.code_description);
if (vocab) {
// console.log('C->D', vocab);
if (descriptor.vocabularyId !== null) {
console.log(`Descriptor ${descriptor.title} is using ${descriptor.vocabularyId}`, CONTROLLEDVOCAB_MAP.get(descriptor.vocabularyId));
delete descriptor.code;
delete descriptor.code_title;
delete descriptor.code_description;
descriptor.vocabulary = CONTROLLEDVOCAB_MAP.get(descriptor.vocabularyId);
descriptor.terms = vocab;
// return createVocabulary({title: `${descriptor.title}`, versionTag: descriptor.versionTag, description: `Custom controlled vocabulary for ${descriptor.crop} descriptor: ${descriptor.title}`, owner: OWNER, terms: vocab}).then(({data}) => {
// // console.log('Created vocabulary', data);
// // remove owner reference
// delete data.owner;
// descriptor.vocabulary = data;
//
// return descriptor;
// }).then((descriptor) => {
// return createDescriptor(descriptor);
// }).catch(handleAxiosError);
} else {
console.log(`Could not import vocabulary for ${descriptor.title}`, descriptor.code, descriptor.code_description);
return Promise.reject(new Error({message: `Could not import vocabulary for ${descriptor.title} with ${descriptor.code} => ${descriptor.code_description}`}));
const vocab = await fixVocabulary(descriptor.code, descriptor.code_title, descriptor.code_description).catch((err) => {
console.log(`Using "N/A" vocabulary for ${descriptor.title} ${err.message}`);
return [
{
code: 'N/A',
title: 'Coding table was not provided.'
}
];
});
if (vocab) {
// console.log('C->D', vocab);
delete descriptor.code;
delete descriptor.code_title;
delete descriptor.code_description;
descriptor.terms = vocab;
} else {
// console.log(`Could not import vocabulary for ${descriptor.title}`, descriptor.code, descriptor.code_description);
return Promise.reject({message: `Could not import vocabulary for ${descriptor.title} with ${descriptor.code} => ${descriptor.code_description}`});
}
}
}
console.log(`Importing "${descriptor.title}" for crop=${descriptor.crop}`, descriptor);
return createDescriptor(descriptor).catch(handleAxiosError);
// console.log(`Importing "${descriptor.title}" for crop=${descriptor.crop}`);
return createDescriptor(descriptor).then((descriptor) => {
console.log(`Successfully imported ${descriptor.uuid}`);
return descriptor;
});
}
async function importDescriptorLists() {
let descriptorLists = jsonfile.readFileSync(INPUT_DESCRIPTORLISTS_JSON);
descriptorLists = descriptorLists.filter((o, index) => index >= 2).map((o) => {
o.crop = o.cropcode;
delete o.cropcode;
o.title = o.descriptorList;
delete o.descriptorList;
o.published = true;
o.owner = OWNER;
// console.log(o);
......@@ -174,6 +211,95 @@ async function importDescriptorLists() {
console.log('Done with descriptor lists');
}
/**
 * Fetch the vocabularies from the API and remember them for the import.
 *
 * Every returned vocabulary is appended to AVAILABLE_CONTROLLEDVOCABULARIES.
 * Vocabularies whose UUID is one of the two known ones are also registered in
 * CONTROLLEDVOCAB_MAP under a fixed key, as an `{id, version, uuid}` reference.
 *
 * @returns {Promise<void>} settles once the listing has been processed and logged.
 */
async function loadControlledVocabs() {
  // The UUIDs are fixed!
  const knownVocabularyKeys = new Map([
    ['39a3d6a2-20e6-4fab-8bfe-acb1f9fe774c', 'ISO 3-letter country code'],
    ['36b4a674-e2eb-4ba1-a05a-71cfc2af862e', 'FAO WIEWS code'],
  ]);

  const { data } = await listVocabularies();

  data.content.forEach((cv) => {
    AVAILABLE_CONTROLLEDVOCABULARIES.push(cv);

    const mapKey = knownVocabularyKeys.get(cv.uuid);
    if (mapKey === undefined) {
      // Not one of the vocabularies this importer links descriptors to.
      return;
    }
    const { id, version, uuid } = cv;
    CONTROLLEDVOCAB_MAP.set(mapKey, { id, version, uuid });
  });

  console.log(`Received ${CONTROLLEDVOCAB_MAP.size} controlled vocabularies we know.`, CONTROLLEDVOCAB_MAP);
}
/**
 * Decide whether a descriptor record can be imported, patching in defaults
 * where that is possible.
 *
 * Rejected (returns false): blank title, blank versionTag, a category or
 * dataType that is set but not one of the recognized values.
 * Defaulted (still returns true): a null category becomes 'EVALUATION'; a null
 * dataType becomes 'NUMERIC' when `uom` is truthy and 'TEXT' otherwise. In both
 * cases `published` is set to false on the record.
 *
 * The record `d` is mutated in place when defaults are applied, and the
 * outcome is logged whenever there is at least one message.
 *
 * @param {Object} d - descriptor record; reads title, versionTag, category, dataType, uom.
 * @returns {boolean} true when the descriptor should be imported.
 */
const isDescriptorComplete = (d) => {
  const knownDataTypes = new Set([
    'CODED',
    'BOOLEAN',
    'SCALE',
    'NUMERIC',
    'DATE',
    'TEXT',
  ]);
  const knownCategories = new Set([
    'PASSPORT',
    'MANAGEMENT',
    'ENVIRONMENT',
    'CHARACTERIZATION',
    'EVALUATION',
    'ABIOTICSTRESS',
    'BIOTICSTRESS',
    'MOLECULAR',
  ]);
  // Blank means null or something that stringifies to the empty string.
  const isBlank = (value) => value === null || `${value}` === '';

  const notes = [];
  let rejected = false;

  if (isBlank(d.title)) {
    notes.push('Title is blank');
    rejected = true;
  }

  if (isBlank(d.versionTag)) {
    notes.push('versionTag is blank');
    rejected = true;
  }

  if (d.category === null) {
    notes.push('category not provided');
    // Fall back to EVALUATION, but keep the record unpublished.
    d.category = 'EVALUATION';
    d.published = false;
  } else if (!knownCategories.has(d.category)) {
    notes.push(`category ${d.category} not recognized`);
    rejected = true;
  }

  if (d.dataType === null) {
    notes.push('dataType not provided');
    // A unit of measure implies a numeric value; otherwise fall back to TEXT.
    if (d.uom) {
      d.dataType = 'NUMERIC';
    } else {
      d.dataType = 'TEXT';
    }
    d.published = false;
  } else if (!knownDataTypes.has(d.dataType)) {
    notes.push(`dataType ${d.dataType} not recognized`);
    rejected = true;
  }

  // Every rejection pushes a note first, so `rejected` implies notes is non-empty.
  if (rejected) {
    console.log('Ignoring incomplete descriptor', d, notes);
    return false;
  }

  if (notes.length > 0) {
    console.log('Applied sensible defaults', d, notes);
  }
  return true;
};
async function importDescriptors() {
let descriptors = jsonfile.readFileSync(INPUT_DESCRIPTORS_JSON);
......@@ -197,33 +323,45 @@ async function importDescriptors() {
? true
: false;
o.crop = o.cropcode;
delete o.cropcode;
o.keyDescriptor = o.keyDescriptor && o.keyDescriptor === 'TRUE'
? true
: false;
return o;
}).filter((d) => {
if (d.title === null || d.dataType === null || d.versionTag === null) {
console.log(`Incomplete descriptor`, d);
return false;
} else {
return true;
}
});
}).filter((d) => isDescriptorComplete(d));
console.log(`Read ${descriptors.length} descriptors for importing`);
for (const d of descriptors) {
await importDescriptor(d)
.then((imported) => {
// console.log('Made new descriptor', data);
console.log(`Importing ${d.title}`);
await importDescriptor(d).then((imported) => {
// console.log('Made new descriptor', imported);
if (imported === undefined || imported === null || imported.uuid === null) {
console.log(`Descriptor "${d.title}" was not imported`);
return Promise.reject({error: `Descriptor "${d.title}" was not persisted`, data: d});
}
const descriptorList = AVAILABLE_DESCRIPTORLISTS.get(`${imported.crop}${imported.versionTag}`);
// process.exit(-1);
if (descriptorList !== undefined && descriptorList.uuid) {
console.log(`Adding ${imported.uuid} to ${descriptorList.title}`);
descriptorList.addlist.push(imported.uuid);
}
console.log(`\tImported ${imported.uuid} ${d.title}`);
return imported;
}).catch(handleAxiosError);
}).catch((err) => {
if (err.request)
handleAxiosError(err);
else
console.log(`Failed to import descriptor "${d.title}": ${err.message}`, err);
}
);
console.log(`\tDone ${d.title}`);
sleep.msleep(10);
}
console.log('Done with descriptors');
......@@ -232,11 +370,19 @@ async function importDescriptors() {
const runme = async () => {
if (IMPORT_DESCRIPTORLISTS) {
await importDescriptorLists();
sleep.sleep(2);
}
if (IMPORT_DESCRIPTORS) {
await loadControlledVocabs();
console.log('Done loading vocabs');
sleep.sleep(2);
await importDescriptors();
sleep.sleep(2);
console.log('Done importing descriptors, have', AVAILABLE_DESCRIPTORLISTS.values());
for (const dl of AVAILABLE_DESCRIPTORLISTS.values()) {
if (dl.addlist.length > 0) {
await addDescriptorsToList(dl, dl.addlist).then((r) => {
......
......@@ -2184,9 +2184,7 @@
"nan": {
"version": "2.8.0",
"resolved": "https://registry.npmjs.org/nan/-/nan-2.8.0.tgz",
"integrity": "sha1-7XFfP+neArV6XmJS2QqWZ14fCFo=",
"dev": true,
"optional": true
"integrity": "sha1-7XFfP+neArV6XmJS2QqWZ14fCFo="
},
"normalize-path": {
"version": "2.1.1",
......@@ -2485,6 +2483,14 @@
"integrity": "sha1-xB8vbDn8FtHNF61LXYlhFK5HDVU=",
"dev": true
},
"sleep": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/sleep/-/sleep-5.1.1.tgz",
"integrity": "sha1-h4+h1E0I7rDyb7IBjvhinrGjq5Q=",
"requires": {
"nan": "2.8.0"
}
},
"source-map": {
"version": "0.5.7",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz",
......
......@@ -13,6 +13,7 @@
"jsonfile": "^4.0.0",
"lodash": "^4.17.4",
"minimist": "^1.2.0",
"sleep": "^5.1.1",
"systemjs": "^0.20.19"
},
"devDependencies": {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment