Commit d51cf97e authored by Matija Obreza
Browse files

Most frequent K characters metric

parent 2fd38e7f
/*
* Copyright 2016 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys2.gringlobal.taxonomy.component;
import java.util.Arrays;
/**
 * Most frequent K characters string metric.
 * <p>
 * Based on pseudocode at https://en.wikipedia.org/wiki/Most_frequent_k_characters and
 * http://rosettacode.org/wiki/Most_frequent_k_chars_distance
 * <p>
 * Digits [0-9] in the input are ignored: the hash format (e.g. "a10b8c7") uses decimal digits to
 * encode the counts, so a digit used as a character could not be told apart from a count when the
 * hash is decoded.
 */
public class MostFrequentKChars {

    /**
     * Build the hash of the {@code k} most frequent characters of {@code string}.
     * <p>
     * The hash is a sequence of "character, count" entries ordered by descending count, for example
     * "i3n2" for "significant" with k=2. Characters with equal counts are emitted in the order of
     * their first occurrence in the input. Digits are skipped. If the input has fewer than
     * {@code k} distinct non-digit characters, the hash contains one entry per distinct character.
     *
     * <pre>
     * String function MostFreqKHashing (String inputString, int K)
     *     def string outputString
     *     for each distinct character
     *         count occurrence of each character
     *     for i := 0 to K
     *         char c = next most freq ith character (if two chars have same frequency then get the first occurrence in inputString)
     *         int count = number of occurrence of the character
     *         append to outputString, c and count
     *     end for
     *     return outputString
     * </pre>
     *
     * @param string the input text, must not be null
     * @param k maximum number of entries in the hash; values of 0 or less give an empty hash
     * @return the hash string, possibly empty
     */
    public static String mostFrequentKHashing(String string, int k) {
        char[] input = string.toCharArray();
        // occurrences[i] is the number of times input[i] appears, stored only at the index of the
        // character's first occurrence; every other slot stays 0.
        int[] occurrences = new int[input.length];

        // track the maximum occurrence value
        int maxOcc = 0;
        for (char c : input) {
            // digits are reserved for the counts in the hash format
            if (c >= '0' && c <= '9') {
                continue;
            }
            int charOccurrence = ++occurrences[string.indexOf(c)];
            if (charOccurrence > maxOcc) {
                maxOcc = charOccurrence;
            }
        }

        StringBuilder sb = new StringBuilder();
        int limit = 0;
        // Each pass emits every character whose count equals maxOcc and, at the same time, finds
        // the next lower count to use in the following pass.
        while (maxOcc > 0 && limit < k) {
            int nextOcc = 0;
            for (int i = 0; i < occurrences.length; i++) {
                int occ = occurrences[i];
                if (occ == maxOcc) {
                    sb.append(input[i]).append(occ);
                    if (++limit >= k) {
                        return sb.toString();
                    }
                } else if (occ < maxOcc && occ > nextOcc) {
                    // Only counts strictly below the current level are candidates. Without the
                    // upper bound, characters emitted in an earlier pass would be selected again.
                    nextOcc = occ;
                }
            }
            maxOcc = nextOcc;
        }
        return sb.toString();
    }

    /**
     * Compute the similarity of two hashes produced by {@link #mostFrequentKHashing(String, int)}.
     * <p>
     * For every character of {@code hash1} that is also present in {@code hash2}, the counts from
     * both hashes are added to the result.
     *
     * <pre>
     * int function MostFreqKSimilarity (String inputStr1, String inputStr2, int limit)
     *     def int similarity
     *     for each c = next character from inputStr1
     *         lookup c in inputStr2
     *         if c is null
     *             continue
     *         similarity += frequency of c in inputStr1 + frequency of c in inputStr2
     *     return similarity
     * </pre>
     *
     * @param hash1 first hash, e.g. "i3n2"
     * @param hash2 second hash, e.g. "i3a2"
     * @return sum of the counts of all characters the two hashes have in common
     * @throws NumberFormatException if either hash is malformed (see {@link #decode(String)})
     */
    public static int mostFreqKSimilarity(String hash1, String hash2) {
        int similarity = 0;
        int[] h1 = decode(hash1);
        int[] h2 = decode(hash2);
        // decoded arrays are flat (character, count) pairs
        for (int i = 0; i + 1 < h1.length; i += 2) {
            int freq2 = findFrequency(h2, (char) h1[i]);
            if (freq2 >= 0) {
                similarity += h1[i + 1] + freq2;
            }
        }
        return similarity;
    }

    /**
     * Compute the distance between two strings as {@code maxDistance} minus their similarity.
     *
     * <pre>
     * int function MostFreqKSDF (string inputStr1, string inputStr2, int K, int maxDistance)
     *     return maxDistance - MostFreqKSimilarity(MostFreqKHashing(inputStr1,K), MostFreqKHashing(inputStr2,K))
     * </pre>
     *
     * @param inputStr1 first input text
     * @param inputStr2 second input text
     * @param K number of most frequent characters to hash
     * @param maxDistance value from which the similarity is subtracted
     * @return {@code maxDistance} minus the similarity of the two hashes; the result is not clamped
     */
    public static int mostFreqKSDF(String inputStr1, String inputStr2, int K, int maxDistance) {
        return maxDistance - mostFreqKSimilarity(mostFrequentKHashing(inputStr1, K), mostFrequentKHashing(inputStr2, K));
    }

    /**
     * Find frequency of char c in h2
     *
     * @param h2 decoded hash as flat (character, count) pairs
     * @param c character to look up
     * @return frequency, or -1 if char not found
     */
    private static int findFrequency(int[] h2, char c) {
        for (int i = 0; i + 1 < h2.length; i += 2) {
            if (c == (char) h2[i]) {
                return h2[i + 1];
            }
        }
        return -1;
    }

    /**
     * Convert the hash formatted string "a10b8c7" to int[] of flat (character, count) pairs, e.g.
     * { 'a', 10, 'b', 8, 'c', 7 }.
     *
     * @param hash1 hash in the format produced by {@link #mostFrequentKHashing(String, int)}
     * @return array holding each character code followed by its count
     * @throws NumberFormatException if a character is not followed by a decimal count
     */
    static int[] decode(String hash1) {
        // every entry needs at least two chars, so the hash length is an upper bound
        int[] h = new int[hash1.length()];
        int pos = 0;
        for (int i = 0; i < hash1.length(); i++) {
            h[pos++] = hash1.charAt(i);
            // consume the run of digits that follows the character
            int endIndex = i + 1;
            char c;
            while (endIndex < hash1.length() && ((c = hash1.charAt(endIndex)) >= '0' && c <= '9')) {
                endIndex++;
            }
            h[pos++] = Integer.parseInt(hash1.substring(i + 1, endIndex));
            // continue with the character right after the count
            i = endIndex - 1;
        }
        return Arrays.copyOf(h, pos);
    }
}
/*
* Copyright 2016 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys2.gringlobal.taxonomy.component;
import static org.hamcrest.Matchers.*;
import static org.junit.Assert.*;
import java.util.Arrays;
import org.junit.Test;
/**
 * Unit tests for {@link MostFrequentKChars}: hashing, similarity of hashes, hash decoding and the
 * resulting distance function.
 */
public class MostFrequentKCharsTest {

    /** Long sample sequences used by both the hashing and the distance test. */
    private static final String SEQUENCE_1 = "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV";
    private static final String SEQUENCE_2 = "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG";

    /** Assert that hashing {@code input} with the given {@code k} yields {@code expectedHash}. */
    private static void assertHash(String input, int k, String expectedHash) {
        assertThat(MostFrequentKChars.mostFrequentKHashing(input, k), equalTo(expectedHash));
    }

    /** Assert the similarity score of two already hashed strings. */
    private static void assertSimilarity(String hash1, String hash2, int expectedSimilarity) {
        assertThat(MostFrequentKChars.mostFreqKSimilarity(hash1, hash2), equalTo(expectedSimilarity));
    }

    /** Assert that {@code hash} decodes to the given flat (character, count) sequence. */
    private static void assertDecodes(String hash, int... expectedPairs) {
        assertTrue(Arrays.equals(MostFrequentKChars.decode(hash), expectedPairs));
    }

    /** Hashes of the two most frequent characters for a range of inputs. */
    @Test
    public void test1() {
        assertHash("night", 2, "n1i1");
        assertHash("nacht", 2, "n1a1");
        assertHash("research", 2, "r2e2");
        // the same input is hashed a second time and must give the same hash
        assertHash("research", 2, "r2e2");
        assertHash("aaaaabbbb", 2, "a5b4");
        assertHash("ababababa", 2, "a5b4");
        assertHash("significant", 2, "i3n2");
        assertHash("capabilities", 2, "i3a2");
        assertHash(SEQUENCE_1, 2, "L9T8");
        assertHash(SEQUENCE_2, 2, "F9L8");
    }

    /** Similarity adds up the counts of the characters both hashes share. */
    @Test
    public void testSimilarity() {
        assertSimilarity("n1i1", "n1a1", 2);
        assertSimilarity("m1y1", "a1", 0);
        assertSimilarity("r2e2", "r2e2", 8);
        assertSimilarity("a5b4", "a5b4", 18);
        assertSimilarity("i3n2", "i3a2", 6);
        assertSimilarity("L9T8", "F9L8", 17);
    }

    /** Decoding handles single and multi digit counts and several entries. */
    @Test
    public void testDecode() {
        assertDecodes("a1", 'a', 1);
        assertDecodes("a11", 'a', 11);
        assertDecodes("a11b2", 'a', 11, 'b', 2);
        assertDecodes("a11b22", 'a', 11, 'b', 22);
        assertDecodes("a11b22e8", 'a', 11, 'b', 22, 'e', 8);
        assertDecodes("i3n2", 'i', 3, 'n', 2);
    }

    /** Distance of the two long sequences with k=2 and a maximum distance of 100. */
    @Test
    public void testMostFreqKMostFrequentKChars() {
        int distance = MostFrequentKChars.mostFreqKSDF(SEQUENCE_1, SEQUENCE_2, 2, 100);
        assertThat(distance, equalTo(83));
    }
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment