Commit 5cdb8f2a authored by Matija Obreza
Browse files

Merge branch '11-auto-detect-csv-configuration' into 'master'

Auto-detect CSV configuration

Closes #11

See merge request genesys-pgr/validator!11
parents 3b3db16a 69177b7b
label.title=Passport data validation
label.run-autodetect=Auto-detect CSV settings
label.run-check=Validate Taxonomic data
label.run-check-country=Validate Country of Origin
label.run-check-land=Classify Land or Water
......
......@@ -30,8 +30,7 @@
}
.btn {
padding: 1em 2em;
margin: 2em 0;
padding: 1em 2em;
}
code {
......@@ -85,7 +84,11 @@
</div>
<h2>2. Configure CSV settings</h2>
<div>
<button class="btn" type="button" value="" id="autodetectCsv">
<spring:message code="label.run-autodetect"/>
</button>
<h3><spring:message code="label.separator"/>:</h3>
<input id="separatorTab" name="separator" value="&#9;" type="radio" checked>
<label for="separatorTab"><spring:message code="label.separator.tab"/> <code>&#09;</code></label>
......@@ -97,7 +100,7 @@
<label for="separatorSpace"><spring:message code="label.separator.space"/> <code> </code></label>
<input id="separatorOther" name="separator" value="O" type="radio">
<label for="separatorOther"><spring:message code="label.separator.other"/></label>
<input name="separatorOther" type="text" maxlength="1">
<input id="separatorValue" name="separatorOther" type="text" maxlength="1">
<p class="hint"><spring:message code="hint.separator"/></p>
</div>
......@@ -125,7 +128,7 @@
<label for="toCurrentTaxaNo"><spring:message code="label.no"/></label>
</div>
<div>
<div style="margin: 2em 0;">
<button class="btn" type="submit" value="" name="validateType">
<spring:message code="label.run-check"/>
</button>
......@@ -136,6 +139,254 @@
<spring:message code="label.run-check-land"/>
</button>
</div>
</form>
</form>
<script>
/**
 * Bookkeeping record for one candidate character.
 *
 * @param {string} name - the character (or short string) being tracked
 *
 * Fields:
 *   name    - the tracked character
 *   number  - current count associated with the character (starts at 1)
 *   version - how many times `number` has been changed through setNumber()
 */
var Char = function(name) {
  this.name = name;
  this.number = 1;
  this.version = 0;
};
Char.prototype = {
  /**
   * Stores a new count. The version is bumped only when the count
   * actually differs from the one already stored.
   *
   * @param {number} number - the new count
   */
  setNumber: function(number) {
    if (this.number === number) {
      // Unchanged count: nothing to record.
      return;
    }
    this.number = number;
    this.version++;
  },
  /**
   * Increments the stored count by one.
   */
  addOccurrence: function() {
    this.number++;
  }
};
/**
 * Heuristics that guess the dialect (separator, quote character, escape
 * character, decimal mark) of CSV text and apply the guess to the form.
 *
 * Every detector looks only at the non-empty rows among the first
 * `numberRowsToCheck` lines it is given. Occurrence counting relies on the
 * `Char` helper (name / number / addOccurrence).
 */
var CSVAutodetector = {
  /** Maximum number of leading lines inspected by the detectors. */
  numberRowsToCheck: 10,

  /**
   * Guesses the column separator.
   *
   * A symbol qualifies only when splitting by it yields the same number of
   * columns on every inspected row. Among the qualifying symbols the one
   * producing the most columns wins; on a tie the symbol seen first wins.
   *
   * @param {string[]} lines - lines of the CSV text
   * @returns {string} the detected separator, or a tab when nothing qualifies
   */
  detectSeparator: function(lines) {
    var rows = CSVAutodetector.getRowsToCheck(lines);
    var symbols = CSVAutodetector.getAllNonLetterAndDigitSymbols(lines);
    var bestSymbol = null;
    var bestColumns = 0;
    symbols.forEach(function(symbol) {
      // `symbols` is non-empty only if at least one row exists, so rows[0] is defined here.
      var columns = CSVAutodetector.columnNumberForSeparator(rows[0], symbol);
      var isConsistent = rows.every(function(row) {
        return CSVAutodetector.columnNumberForSeparator(row, symbol) === columns;
      });
      // Strict ">" keeps the first symbol found when several give the same column count.
      if (isConsistent && columns > bestColumns) {
        bestSymbol = symbol;
        bestColumns = columns;
      }
    });
    return bestSymbol === null ? '\t' : bestSymbol;
  },

  /**
   * Guesses the quote character: the non-alphanumeric symbol that most often
   * both starts and ends a column value.
   *
   * @param {string[]} lines - lines of the CSV text
   * @param {string} separator - column separator used to split rows
   * @returns {string|undefined} the quote character, or undefined if none was seen
   */
  detectQuoteChar: function(lines, separator) {
    var columnValues = CSVAutodetector.getColumnValues(lines, separator);
    var symbols = CSVAutodetector.getAllNonLetterAndDigitSymbols(lines);
    var quoteChars = [];
    columnValues.forEach(function(value) {
      // A value needs an opening AND a closing character to be quoted;
      // a lone symbol such as "-" is data, not a quote pair.
      if (value.length < 2) {
        return;
      }
      var firstChar = value.charAt(0);
      var lastChar = value.charAt(value.length - 1);
      if (firstChar === lastChar && symbols.indexOf(firstChar) !== -1) {
        CSVAutodetector.countOccurrence(firstChar, quoteChars);
      }
    });
    return CSVAutodetector.getCharWithMaxOccurrence(quoteChars);
  },

  /**
   * Guesses the escape character: the character that most often directly
   * precedes the first quote character found inside a quoted value.
   *
   * @param {string[]} lines - lines of the CSV text
   * @param {string} separator - column separator used to split rows
   * @param {string|undefined} quote - quote character (e.g. from detectQuoteChar)
   * @returns {string|undefined} the escape character, or undefined if none was seen
   */
  detectEscapeChar: function(lines, separator, quote) {
    if (quote === undefined || quote === "") {
      // Without a quote character there is nothing that could be escaped.
      return undefined;
    }
    var columnValues = CSVAutodetector.getColumnValues(lines, separator);
    var escapeChars = [];
    columnValues.forEach(function(value) {
      if (value.length < 2) {
        return;
      }
      var firstChar = value.charAt(0);
      var lastChar = value.charAt(value.length - 1);
      if (firstChar !== quote || lastChar !== quote) {
        return;
      }
      var word = value.substring(1, value.length - 1);
      var index = word.indexOf(quote);
      // index === 0 means no character precedes the inner quote, so there
      // is no escape character to record (avoids counting an empty string).
      if (index > 0) {
        CSVAutodetector.countOccurrence(word.charAt(index - 1), escapeChars);
      }
    });
    return CSVAutodetector.getCharWithMaxOccurrence(escapeChars);
  },

  /**
   * Guesses the decimal mark. For every column value that contains a digit,
   * letters, digits and +/- signs are stripped; whatever remains is counted
   * as a candidate and the most frequent candidate is returned.
   *
   * @param {string[]} lines - lines of the CSV text
   * @param {string} separator - column separator used to split rows
   * @returns {string|undefined} the most frequent candidate, or undefined if none
   */
  detectDecimalMark: function(lines, separator) {
    var columnValues = CSVAutodetector.getColumnValues(lines, separator);
    // No "g" flag: a global regex keeps lastIndex between test() calls and
    // would skip values depending on where the previous match ended.
    var hasDigit = /\d/;
    var decimalMarks = [];
    columnValues.forEach(function(columnValue) {
      if (!hasDigit.test(columnValue)) {
        return;
      }
      var parsedMark = columnValue.replace(/[A-Za-z0-9]/g, '').replace(/[-+]/g, '');
      if (parsedMark !== "") {
        CSVAutodetector.countOccurrence(parsedMark, decimalMarks);
      }
    });
    return CSVAutodetector.getCharWithMaxOccurrence(decimalMarks);
  },

  /**
   * Returns the name of the entry with the highest `number`.
   * On a tie the earliest entry wins.
   *
   * @param {Array} arr - entries with `name` and `number` properties
   * @returns {string|undefined} the winning name, or undefined for an empty array
   */
  getCharWithMaxOccurrence: function(arr) {
    var detectedChar = arr[0];
    if (detectedChar === undefined) {
      return undefined;
    }
    for (var i = 1; i < arr.length; i++) {
      if (arr[i].number > detectedChar.number) {
        detectedChar = arr[i];
      }
    }
    return detectedChar.name;
  },

  /**
   * Splits every inspected row by the separator and returns all resulting
   * column values in one flat array.
   *
   * @param {string[]} lines - lines of the CSV text
   * @param {string} separator - column separator used to split rows
   * @returns {string[]} column values of the inspected rows, in order
   */
  getColumnValues: function(lines, separator) {
    var columnValues = [];
    CSVAutodetector.getRowsToCheck(lines).forEach(function(row) {
      columnValues = columnValues.concat(String(row).split(separator));
    });
    return columnValues;
  },

  /**
   * Collects the distinct characters outside A-Z, a-z and 0-9 that occur in
   * the inspected rows, in order of first appearance.
   *
   * @param {string[]} lines - lines of the CSV text
   * @returns {string[]} distinct non-alphanumeric characters
   */
  getAllNonLetterAndDigitSymbols: function(lines) {
    var symbols = [];
    CSVAutodetector.getRowsToCheck(lines).forEach(function(row) {
      var found = String(row).match(/[^A-Za-z0-9]/g) || [];
      found.forEach(function(symbol) {
        if (symbols.indexOf(symbol) === -1) {
          symbols.push(symbol);
        }
      });
    });
    return symbols;
  },

  /**
   * Returns how many columns a row has when split by the given separator.
   *
   * @param {string} string - one row of text
   * @param {string} separator - candidate separator
   * @returns {number} number of parts produced by the split
   */
  columnNumberForSeparator: function(string, separator) {
    return String(string).split(separator).length;
  },

  /**
   * Finds the position of the entry whose `name` equals `char`.
   *
   * @param {string} char - name to look for
   * @param {Array} arr - entries with a `name` property
   * @returns {number|null} numeric index of the entry, or null when absent
   */
  getCharIndexIfExistInArr: function(char, arr) {
    // Indexed loop instead of for...in: visits only array elements and
    // yields a numeric index.
    for (var i = 0; i < arr.length; i++) {
      if (arr[i].name === char) {
        return i;
      }
    }
    return null;
  },

  /**
   * Helper: returns the non-empty rows among the first `numberRowsToCheck` lines.
   *
   * @param {string[]} lines - lines of the CSV text
   * @returns {string[]} rows the detectors should inspect
   */
  getRowsToCheck: function(lines) {
    return lines.slice(0, CSVAutodetector.numberRowsToCheck).filter(function(row) {
      return row !== "";
    });
  },

  /**
   * Helper: records one occurrence of `name` in `chars`, adding a new entry
   * the first time a name is seen.
   *
   * @param {string} name - the character (or string) that occurred
   * @param {Array} chars - occurrence entries, mutated in place
   */
  countOccurrence: function(name, chars) {
    var index = CSVAutodetector.getCharIndexIfExistInArr(name, chars);
    if (index === null) {
      chars.push(new Char(name));
    } else {
      chars[index].addOccurrence();
    }
  },

  /**
   * Applies detected settings to the form controls (looked up by element id).
   * Nothing is changed when `separator` is undefined.
   *
   * NOTE(review): `quote` is accepted but not applied to any control here;
   * confirm whether a quote-character control should be set as well.
   *
   * @param {string|undefined} separator - detected column separator
   * @param {string|undefined} quote - detected quote character (currently unused)
   * @param {string|undefined} escape - detected escape character
   * @param {string|undefined} decimalMark - detected decimal mark
   */
  autoSelectOptions: function(separator, quote, escape, decimalMark) {
    if (separator === undefined) {
      return;
    }

    // Separators that have a dedicated radio button; anything else goes to "other".
    var separatorRadioIds = {
      "\t": "separatorTab",
      ",": "separatorComma",
      ";": "separatorSemi",
      " ": "separatorSpace"
    };
    if (Object.prototype.hasOwnProperty.call(separatorRadioIds, separator)) {
      document.getElementById(separatorRadioIds[separator]).checked = true;
    } else {
      document.getElementById("separatorOther").checked = true;
      document.getElementById("separatorValue").value = separator;
    }

    // An undetected escape character clears the field.
    document.getElementById("escapeChar").value = escape !== undefined ? escape : "";

    // Only "." and "," have a matching control; other values leave the selection as is.
    if (decimalMark === ".") {
      document.getElementById("decimalMarkDot").checked = true;
    } else if (decimalMark === ",") {
      document.getElementById("decimalMarkComma").checked = true;
    }
  }
};
// Wire up CSV auto-detection once the DOM is ready: the detection runs when
// the "autodetectCsv" button is clicked and shortly after text is pasted
// into the "csvText" element.
document.addEventListener("DOMContentLoaded", function() {
  /**
   * Reads the text of the "csvText" element, runs all detectors on it and
   * applies the detected settings to the form. Does nothing (apart from
   * logging) when the text is shorter than 10 characters.
   */
  function autodetectSettings() {
    var text = document.getElementById("csvText").value;
    if (text.length < 10) {
      console.log('Not enough data for auto-detection len=', text.length);
      return;
    }

    // Normalise Windows line endings before splitting into lines.
    var lines = text.replace(/\r\n/g, "\n").split("\n");

    // The later detectors depend on the separator (and the escape detector on the quote).
    var separator = CSVAutodetector.detectSeparator(lines);
    var quote = CSVAutodetector.detectQuoteChar(lines, separator);
    var escape = CSVAutodetector.detectEscapeChar(lines, separator, quote);
    var decimalMark = CSVAutodetector.detectDecimalMark(lines, separator);

    console.log('CSV auto-detected separator=', separator, ' quote=', quote, ' escape=', escape, ' decimal=', decimalMark);
    CSVAutodetector.autoSelectOptions(separator, quote, escape, decimalMark);
  }

  var detectButton = document.getElementById("autodetectCsv");
  detectButton.addEventListener("click", autodetectSettings);

  var csvArea = document.getElementById("csvText");
  csvArea.addEventListener("paste", function(e) {
    // Run after a short delay instead of synchronously inside the paste event.
    setTimeout(autodetectSettings, 10);
  });
});
</script>
</body>
</html>
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment